/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 134 - (hide annotations) (download)
Mon Mar 26 16:00:17 2007 UTC (7 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 81190 byte(s)
Commit after detrailing; set executable on autogen.sh.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 nigel 93 #define NLBLOCK md /* Block containing newline information */
48     #define PSSTART start_subject /* Field containing processed string start */
49     #define PSEND end_subject /* Field containing processed string end */
50    
51 nigel 77 #include "pcre_internal.h"
52    
53    
54     /* For use to indent debugging output */
55    
56     #define SP " "
57    
58    
59    
60     /*************************************************
61     * Code parameters and static tables *
62     *************************************************/
63    
64     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
65 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
66 nigel 77 enough. */
67    
68 nigel 93 #define OP_PROP_EXTRA 100
69     #define OP_EXTUNI_EXTRA 120
70     #define OP_ANYNL_EXTRA 140
71 nigel 77
72    
73     /* This table identifies those opcodes that are followed immediately by a
74     character that is to be tested in some way. This makes it possible to
75     centralize the loading of these characters. In the case of Type * etc, the
76     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
77     small value. */
78    
79     static uschar coptable[] = {
80     0, /* End */
81     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
82     0, 0, /* Any, Anybyte */
83 nigel 93 0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */
84 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
85     1, /* Char */
86     1, /* Charnc */
87     1, /* not */
88     /* Positive single-char repeats */
89     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
90     3, 3, 3, /* upto, minupto, exact */
91 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
92 nigel 77 /* Negative single-char repeats - only for chars < 256 */
93     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
94     3, 3, 3, /* NOT upto, minupto, exact */
95 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
96 nigel 77 /* Positive type repeats */
97     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
98     3, 3, 3, /* Type upto, minupto, exact */
99 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
100 nigel 77 /* Character class & ref repeats */
101     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
102     0, 0, /* CRRANGE, CRMINRANGE */
103     0, /* CLASS */
104     0, /* NCLASS */
105     0, /* XCLASS - variable length */
106     0, /* REF */
107     0, /* RECURSE */
108     0, /* CALLOUT */
109     0, /* Alt */
110     0, /* Ket */
111     0, /* KetRmax */
112     0, /* KetRmin */
113     0, /* Assert */
114     0, /* Assert not */
115     0, /* Assert behind */
116     0, /* Assert behind not */
117     0, /* Reverse */
118 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
119     0, 0, 0, /* SBRA, SCBRA, SCOND */
120 nigel 77 0, /* CREF */
121 nigel 93 0, /* RREF */
122     0, /* DEF */
123     0, 0 /* BRAZERO, BRAMINZERO */
124 nigel 77 };
125    
126     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
127     and \w */
128    
129     static uschar toptable1[] = {
130     0, 0, 0, 0, 0,
131     ctype_digit, ctype_digit,
132     ctype_space, ctype_space,
133     ctype_word, ctype_word,
134     0 /* OP_ANY */
135     };
136    
137     static uschar toptable2[] = {
138     0, 0, 0, 0, 0,
139     ctype_digit, 0,
140     ctype_space, 0,
141     ctype_word, 0,
142     1 /* OP_ANY */
143     };
144    
145    
146     /* Structure for holding data about a particular state, which is in effect the
147     current data for an active path through the match tree. It must consist
148     entirely of ints because the working vector we are passed, and which we put
149     these structures in, is a vector of ints. */
150    
/* One entry in a state list. Every member is an int so that an array of
these can be overlaid on the caller-supplied int workspace vector. */

typedef struct stateblock {
  int offset;     /* Offset of this state's opcode from the start of the code;
                     held negated while the state is being delayed */
  int count;      /* How many repeats have been matched so far */
  int ims;        /* The ims option bits in force for this state */
  int data;       /* Extra per-state value, e.g. the number of characters
                     still to be skipped for a delayed state */
} stateblock;

/* The number of workspace ints taken up by a single stateblock. */

#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
159    
160    
161     #ifdef DEBUG
162     /*************************************************
163     * Print character string *
164     *************************************************/
165    
166     /* Character string printing function for debugging.
167    
168     Arguments:
169     p points to string
170     length number of bytes
171     f where to print
172    
173     Returns: nothing
174     */
175    
/* Debugging aid: write length bytes starting at p to stream f. A byte for
which isprint() is true is written as itself; any other byte is written as a
backslash, 'x', and at least two lower-case hex digits. Nothing is returned
and nothing is written when length is zero or negative. */

static void
pchars(unsigned char *p, int length, FILE *f)
{
int remaining;
for (remaining = length; remaining > 0; remaining--)
  {
  int ch = *p++;
  if (!isprint(ch))
    fprintf(f, "\\x%02x", ch);
  else
    fprintf(f, "%c", ch);
  }
}
188     #endif
189    
190    
191    
192     /*************************************************
193     * Execute a Regular Expression - DFA engine *
194     *************************************************/
195    
196     /* This internal function applies a compiled pattern to a subject string,
197     starting at a given point, using a DFA engine. This function is called from the
198     external one, possibly multiple times if the pattern is not anchored. The
199     function calls itself recursively for some kinds of subpattern.
200    
201     Arguments:
202     md the match_data block with fixed information
203     this_start_code the opening bracket of this subexpression's code
204     current_subject where we currently are in the subject string
205     start_offset start offset in the subject string
206     offsets vector to contain the matching string offsets
207     offsetcount size of same
208     workspace vector of workspace
209     wscount size of same
210     ims the current ims flags
211     rlevel function call recursion level
212     recursing regex recursive call level
213    
214     Returns: > 0 =>
215     = 0 =>
216     -1 => failed to match
217     < -1 => some kind of unexpected problem
218    
219     The following macros are used for adding states to the two state vectors (one
220     for the current character, one for the following character). */
221    
/* Append a state with opcode offset x and repeat count y to the list of
states for the current character, stamping it with the current value of ims.
The data field of the slot is not written. If active_count has already
reached wscount, the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead.
The expansion uses active_count, wscount, next_active_state, ims and rlevel
from the scope where the macro is used. x and y also appear in the DPRINTF
arguments, so they should be free of side effects. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
232    
/* As ADD_ACTIVE, but additionally stores z in the data field of the new
state: append a state with opcode offset x, repeat count y and extra data z
to the list of states for the current character, stamping it with the current
value of ims. If active_count has already reached wscount, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE instead. The expansion uses
active_count, wscount, next_active_state, ims and rlevel from the scope where
the macro is used. x, y and z also appear in the DPRINTF arguments, so they
should be free of side effects. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
244    
/* Append a state with opcode offset x and repeat count y to the list of
states for the following character, stamping it with the current value of
ims. The data field of the slot is not written. If new_count has already
reached wscount, the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead.
The expansion uses new_count, wscount, next_new_state, ims and rlevel from the
scope where the macro is used. x and y also appear in the DPRINTF arguments,
so they should be free of side effects. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
255    
/* As ADD_NEW, but additionally stores z in the data field of the new state:
append a state with opcode offset x, repeat count y and extra data z to the
list of states for the following character, stamping it with the current
value of ims. If new_count has already reached wscount, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE instead. The expansion uses new_count, wscount,
next_new_state, ims and rlevel from the scope where the macro is used. x, y
and z also appear in the DPRINTF arguments, so they should be free of side
effects. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
267    
268     /* And now, here is the code */
269    
270     static int
271     internal_dfa_exec(
272     dfa_match_data *md,
273     const uschar *this_start_code,
274     const uschar *current_subject,
275     int start_offset,
276     int *offsets,
277     int offsetcount,
278     int *workspace,
279     int wscount,
280     int ims,
281     int rlevel,
282     int recursing)
283     {
284     stateblock *active_states, *new_states, *temp_states;
285     stateblock *next_active_state, *next_new_state;
286    
287     const uschar *ctypes, *lcc, *fcc;
288     const uschar *ptr;
289 nigel 93 const uschar *end_code, *first_op;
290 nigel 77
291     int active_count, new_count, match_count;
292    
293     /* Some fields in the md block are frequently referenced, so we load them into
294     independent variables in the hope that this will perform better. */
295    
296     const uschar *start_subject = md->start_subject;
297     const uschar *end_subject = md->end_subject;
298     const uschar *start_code = md->start_code;
299    
300 nigel 87 #ifdef SUPPORT_UTF8
301 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
302 nigel 93 #else
303     BOOL utf8 = FALSE;
304 nigel 87 #endif
305 nigel 77
306     rlevel++;
307     offsetcount &= (-2);
308    
309     wscount -= 2;
310     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
311     (2 * INTS_PER_STATEBLOCK);
312    
313     DPRINTF(("\n%.*s---------------------\n"
314     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
315     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
316    
317     ctypes = md->tables + ctypes_offset;
318     lcc = md->tables + lcc_offset;
319     fcc = md->tables + fcc_offset;
320    
321     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
322    
323     active_states = (stateblock *)(workspace + 2);
324     next_new_state = new_states = active_states + wscount;
325     new_count = 0;
326    
327 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
328     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
329    
330 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
331     the alternative states onto the list, and find out where the end is. This
332     makes is possible to use this function recursively, when we want to stop at a
333     matching internal ket rather than at the end.
334    
335     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
336     a backward assertion. In that case, we have to find out the maximum amount to
337     move back, and set up each alternative appropriately. */
338    
339 nigel 93 if (*first_op == OP_REVERSE)
340 nigel 77 {
341     int max_back = 0;
342     int gone_back;
343    
344     end_code = this_start_code;
345     do
346     {
347     int back = GET(end_code, 2+LINK_SIZE);
348     if (back > max_back) max_back = back;
349     end_code += GET(end_code, 1);
350     }
351     while (*end_code == OP_ALT);
352    
353     /* If we can't go back the amount required for the longest lookbehind
354     pattern, go back as far as we can; some alternatives may still be viable. */
355    
356     #ifdef SUPPORT_UTF8
357     /* In character mode we have to step back character by character */
358    
359     if (utf8)
360     {
361     for (gone_back = 0; gone_back < max_back; gone_back++)
362     {
363     if (current_subject <= start_subject) break;
364     current_subject--;
365     while (current_subject > start_subject &&
366     (*current_subject & 0xc0) == 0x80)
367     current_subject--;
368     }
369     }
370     else
371     #endif
372    
373     /* In byte-mode we can do this quickly. */
374    
375     {
376     gone_back = (current_subject - max_back < start_subject)?
377     current_subject - start_subject : max_back;
378     current_subject -= gone_back;
379     }
380    
381     /* Now we can process the individual branches. */
382    
383     end_code = this_start_code;
384     do
385     {
386     int back = GET(end_code, 2+LINK_SIZE);
387     if (back <= gone_back)
388     {
389     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
390     ADD_NEW_DATA(-bstate, 0, gone_back - back);
391     }
392     end_code += GET(end_code, 1);
393     }
394     while (*end_code == OP_ALT);
395     }
396    
397     /* This is the code for a "normal" subpattern (not a backward assertion). The
398     start of a whole pattern is always one of these. If we are at the top level,
399     we may be asked to restart matching from the same point that we reached for a
400     previous partial match. We still have to scan through the top-level branches to
401     find the end state. */
402    
403     else
404     {
405     end_code = this_start_code;
406    
407     /* Restarting */
408    
409     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
410     {
411     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
412     new_count = workspace[1];
413     if (!workspace[0])
414     memcpy(new_states, active_states, new_count * sizeof(stateblock));
415     }
416    
417     /* Not restarting */
418    
419     else
420     {
421 nigel 93 int length = 1 + LINK_SIZE +
422     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
423 nigel 77 do
424     {
425 nigel 93 ADD_NEW(end_code - start_code + length, 0);
426 nigel 77 end_code += GET(end_code, 1);
427 nigel 93 length = 1 + LINK_SIZE;
428 nigel 77 }
429     while (*end_code == OP_ALT);
430     }
431     }
432    
433     workspace[0] = 0; /* Bit indicating which vector is current */
434    
435     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
436    
437     /* Loop for scanning the subject */
438    
439     ptr = current_subject;
440     for (;;)
441     {
442     int i, j;
443 nigel 91 int clen, dlen;
444     unsigned int c, d;
445 nigel 77
446     /* Make the new state list into the active state list and empty the
447     new state list. */
448    
449     temp_states = active_states;
450     active_states = new_states;
451     new_states = temp_states;
452     active_count = new_count;
453     new_count = 0;
454    
455     workspace[0] ^= 1; /* Remember for the restarting feature */
456     workspace[1] = active_count;
457    
458     #ifdef DEBUG
459     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
460     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
461     printf("\"\n");
462    
463     printf("%.*sActive states: ", rlevel*2-2, SP);
464     for (i = 0; i < active_count; i++)
465     printf("%d/%d ", active_states[i].offset, active_states[i].count);
466     printf("\n");
467     #endif
468    
469     /* Set the pointers for adding new states */
470    
471     next_active_state = active_states + active_count;
472     next_new_state = new_states;
473    
474     /* Load the current character from the subject outside the loop, as many
475     different states may want to look at it, and we assume that at least one
476     will. */
477    
478     if (ptr < end_subject)
479     {
480 nigel 93 clen = 1; /* Number of bytes in the character */
481 nigel 77 #ifdef SUPPORT_UTF8
482     if (utf8) { GETCHARLEN(c, ptr, clen); } else
483     #endif /* SUPPORT_UTF8 */
484     c = *ptr;
485     }
486     else
487     {
488 nigel 93 clen = 0; /* This indicates the end of the subject */
489     c = NOTACHAR; /* This value should never actually be used */
490 nigel 77 }
491    
492     /* Scan up the active states and act on each one. The result of an action
493     may be to add more states to the currently active list (e.g. on hitting a
494     parenthesis) or it may be to put states on the new list, for considering
495     when we move the character pointer on. */
496    
497     for (i = 0; i < active_count; i++)
498     {
499     stateblock *current_state = active_states + i;
500     const uschar *code;
501     int state_offset = current_state->offset;
502     int count, codevalue;
503 nigel 87 int chartype, script;
504 nigel 77
505     #ifdef DEBUG
506     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
507 nigel 93 if (clen == 0) printf("EOL\n");
508 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
509     else printf("0x%02x\n", c);
510     #endif
511    
512     /* This variable is referred to implicity in the ADD_xxx macros. */
513    
514     ims = current_state->ims;
515    
516     /* A negative offset is a special case meaning "hold off going to this
517     (negated) state until the number of characters in the data field have
518     been skipped". */
519    
520     if (state_offset < 0)
521     {
522     if (current_state->data > 0)
523     {
524     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
525     ADD_NEW_DATA(state_offset, current_state->count,
526     current_state->data - 1);
527     continue;
528     }
529     else
530     {
531     current_state->offset = state_offset = -state_offset;
532     }
533     }
534    
535     /* Check for a duplicate state with the same count, and skip if found. */
536    
537     for (j = 0; j < i; j++)
538     {
539     if (active_states[j].offset == state_offset &&
540     active_states[j].count == current_state->count)
541     {
542     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
543     goto NEXT_ACTIVE_STATE;
544     }
545     }
546    
547     /* The state offset is the offset to the opcode */
548    
549     code = start_code + state_offset;
550     codevalue = *code;
551    
552     /* If this opcode is followed by an inline character, load it. It is
553     tempting to test for the presence of a subject character here, but that
554     is wrong, because sometimes zero repetitions of the subject are
555     permitted.
556    
557     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
558     argument that is not a data character - but is always one byte long.
559     Unfortunately, we have to take special action to deal with \P, \p, and
560     \X in this case. To keep the other cases fast, convert these ones to new
561     opcodes. */
562    
563     if (coptable[codevalue] > 0)
564     {
565     dlen = 1;
566     #ifdef SUPPORT_UTF8
567     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
568     #endif /* SUPPORT_UTF8 */
569     d = code[coptable[codevalue]];
570     if (codevalue >= OP_TYPESTAR)
571     {
572 nigel 93 switch(d)
573     {
574     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
575     case OP_NOTPROP:
576     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
577     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
578     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
579     default: break;
580     }
581 nigel 77 }
582     }
583     else
584     {
585     dlen = 0; /* Not strictly necessary, but compilers moan */
586 nigel 93 d = NOTACHAR; /* if these variables are not set. */
587 nigel 77 }
588    
589    
590     /* Now process the individual opcodes */
591    
592     switch (codevalue)
593     {
594    
595     /* ========================================================================== */
596     /* Reached a closing bracket. If not at the end of the pattern, carry
597     on with the next opcode. Otherwise, unless we have an empty string and
598     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
599     matches so we always have the longest first. */
600    
601     case OP_KET:
602     case OP_KETRMIN:
603     case OP_KETRMAX:
604     if (code != end_code)
605     {
606     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
607     if (codevalue != OP_KET)
608     {
609     ADD_ACTIVE(state_offset - GET(code, 1), 0);
610     }
611     }
612     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
613     {
614     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
615     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
616     match_count = 0;
617     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
618     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
619     if (offsetcount >= 2)
620     {
621     offsets[0] = current_subject - start_subject;
622     offsets[1] = ptr - start_subject;
623     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
624     offsets[1] - offsets[0], current_subject));
625     }
626     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
627     {
628     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
629     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
630     match_count, rlevel*2-2, SP));
631     return match_count;
632     }
633     }
634     break;
635    
636     /* ========================================================================== */
637     /* These opcodes add to the current list of states without looking
638     at the current character. */
639    
640     /*-----------------------------------------------------------------*/
641     case OP_ALT:
642     do { code += GET(code, 1); } while (*code == OP_ALT);
643     ADD_ACTIVE(code - start_code, 0);
644     break;
645    
646     /*-----------------------------------------------------------------*/
647     case OP_BRA:
648 nigel 93 case OP_SBRA:
649 nigel 77 do
650     {
651     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
652     code += GET(code, 1);
653     }
654     while (*code == OP_ALT);
655     break;
656    
657     /*-----------------------------------------------------------------*/
658 nigel 93 case OP_CBRA:
659     case OP_SCBRA:
660     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
661     code += GET(code, 1);
662     while (*code == OP_ALT)
663     {
664     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
665     code += GET(code, 1);
666     }
667     break;
668    
669     /*-----------------------------------------------------------------*/
670 nigel 77 case OP_BRAZERO:
671     case OP_BRAMINZERO:
672     ADD_ACTIVE(state_offset + 1, 0);
673     code += 1 + GET(code, 2);
674     while (*code == OP_ALT) code += GET(code, 1);
675     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
676     break;
677    
678     /*-----------------------------------------------------------------*/
679     case OP_CIRC:
680     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
681 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
682     ptr != end_subject &&
683 nigel 93 WAS_NEWLINE(ptr)))
684 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
685     break;
686    
687     /*-----------------------------------------------------------------*/
688     case OP_EOD:
689     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
690     break;
691    
692     /*-----------------------------------------------------------------*/
693     case OP_OPT:
694     ims = code[1];
695     ADD_ACTIVE(state_offset + 2, 0);
696     break;
697    
698     /*-----------------------------------------------------------------*/
699     case OP_SOD:
700     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
701     break;
702    
703     /*-----------------------------------------------------------------*/
704     case OP_SOM:
705     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
706     break;
707    
708    
709     /* ========================================================================== */
710     /* These opcodes inspect the next subject character, and sometimes
711     the previous one as well, but do not have an argument. The variable
712     clen contains the length of the current character and is zero if we are
713     at the end of the subject. */
714    
715     /*-----------------------------------------------------------------*/
716     case OP_ANY:
717 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
718 nigel 77 { ADD_NEW(state_offset + 1, 0); }
719     break;
720    
721     /*-----------------------------------------------------------------*/
722     case OP_EODN:
723 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
724 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
725     break;
726    
727     /*-----------------------------------------------------------------*/
728     case OP_DOLL:
729     if ((md->moptions & PCRE_NOTEOL) == 0)
730     {
731 nigel 91 if (clen == 0 ||
732 nigel 93 (IS_NEWLINE(ptr) &&
733 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
734     ))
735 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
736     }
737 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
738 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
739     break;
740    
741     /*-----------------------------------------------------------------*/
742    
743     case OP_DIGIT:
744     case OP_WHITESPACE:
745     case OP_WORDCHAR:
746     if (clen > 0 && c < 256 &&
747     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
748     { ADD_NEW(state_offset + 1, 0); }
749     break;
750    
751     /*-----------------------------------------------------------------*/
752     case OP_NOT_DIGIT:
753     case OP_NOT_WHITESPACE:
754     case OP_NOT_WORDCHAR:
755     if (clen > 0 && (c >= 256 ||
756     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
757     { ADD_NEW(state_offset + 1, 0); }
758     break;
759    
760     /*-----------------------------------------------------------------*/
761     case OP_WORD_BOUNDARY:
762     case OP_NOT_WORD_BOUNDARY:
763     {
764     int left_word, right_word;
765    
766     if (ptr > start_subject)
767     {
768     const uschar *temp = ptr - 1;
769     #ifdef SUPPORT_UTF8
770     if (utf8) BACKCHAR(temp);
771     #endif
772     GETCHARTEST(d, temp);
773     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
774     }
775     else left_word = 0;
776    
777     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
778     else right_word = 0;
779    
780     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
781     { ADD_ACTIVE(state_offset + 1, 0); }
782     }
783     break;
784    
785    
786     #ifdef SUPPORT_UCP
787    
788     /*-----------------------------------------------------------------*/
789     /* Check the next character by Unicode property. We will get here only
790     if the support is in the binary; otherwise a compile-time error occurs.
791     */
792    
793     case OP_PROP:
794     case OP_NOTPROP:
795     if (clen > 0)
796     {
797 nigel 87 BOOL OK;
798     int category = _pcre_ucp_findprop(c, &chartype, &script);
799     switch(code[1])
800 nigel 77 {
801 nigel 87 case PT_ANY:
802     OK = TRUE;
803     break;
804    
805     case PT_LAMP:
806     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
807     break;
808    
809     case PT_GC:
810     OK = category == code[2];
811     break;
812    
813     case PT_PC:
814     OK = chartype == code[2];
815     break;
816    
817     case PT_SC:
818     OK = script == code[2];
819     break;
820    
821     /* Should never occur, but keep compilers from grumbling. */
822    
823     default:
824     OK = codevalue != OP_PROP;
825     break;
826 nigel 77 }
827 nigel 87
828     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
829 nigel 77 }
830     break;
831     #endif
832    
833    
834    
835     /* ========================================================================== */
836     /* These opcodes likewise inspect the subject character, but have an
837     argument that is not a data character. It is one of these opcodes:
838     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
839     OP_NOT_WORDCHAR. The value is loaded into d. */
840    
841     case OP_TYPEPLUS:
842     case OP_TYPEMINPLUS:
843 nigel 93 case OP_TYPEPOSPLUS:
844 nigel 77 count = current_state->count; /* Already matched */
845     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
846     if (clen > 0)
847     {
848     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
849     (c < 256 &&
850 nigel 91 (d != OP_ANY ||
851     (ims & PCRE_DOTALL) != 0 ||
852     !IS_NEWLINE(ptr)
853     ) &&
854 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
855     {
856 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
857     {
858     active_count--; /* Remove non-match possibility */
859     next_active_state--;
860     }
861 nigel 77 count++;
862     ADD_NEW(state_offset, count);
863     }
864     }
865     break;
866    
867     /*-----------------------------------------------------------------*/
868     case OP_TYPEQUERY:
869     case OP_TYPEMINQUERY:
870 nigel 93 case OP_TYPEPOSQUERY:
871 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
872     if (clen > 0)
873     {
874     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
875     (c < 256 &&
876 nigel 91 (d != OP_ANY ||
877     (ims & PCRE_DOTALL) != 0 ||
878     !IS_NEWLINE(ptr)
879     ) &&
880 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
881     {
882 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
883     {
884     active_count--; /* Remove non-match possibility */
885     next_active_state--;
886     }
887 nigel 77 ADD_NEW(state_offset + 2, 0);
888     }
889     }
890     break;
891    
892     /*-----------------------------------------------------------------*/
893     case OP_TYPESTAR:
894     case OP_TYPEMINSTAR:
895 nigel 93 case OP_TYPEPOSSTAR:
896 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
897     if (clen > 0)
898     {
899     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900     (c < 256 &&
901 nigel 91 (d != OP_ANY ||
902     (ims & PCRE_DOTALL) != 0 ||
903     !IS_NEWLINE(ptr)
904     ) &&
905 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
906     {
907 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
908     {
909     active_count--; /* Remove non-match possibility */
910     next_active_state--;
911     }
912 nigel 77 ADD_NEW(state_offset, 0);
913     }
914     }
915     break;
916    
917     /*-----------------------------------------------------------------*/
918     case OP_TYPEEXACT:
919 nigel 93 count = current_state->count; /* Number already matched */
920     if (clen > 0)
921     {
922     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
923     (c < 256 &&
924     (d != OP_ANY ||
925     (ims & PCRE_DOTALL) != 0 ||
926     !IS_NEWLINE(ptr)
927     ) &&
928     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
929     {
930     if (++count >= GET2(code, 1))
931     { ADD_NEW(state_offset + 4, 0); }
932     else
933     { ADD_NEW(state_offset, count); }
934     }
935     }
936     break;
937    
938     /*-----------------------------------------------------------------*/
939 nigel 77 case OP_TYPEUPTO:
940     case OP_TYPEMINUPTO:
941 nigel 93 case OP_TYPEPOSUPTO:
942     ADD_ACTIVE(state_offset + 4, 0);
943 nigel 77 count = current_state->count; /* Number already matched */
944     if (clen > 0)
945     {
946     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
947     (c < 256 &&
948 nigel 91 (d != OP_ANY ||
949     (ims & PCRE_DOTALL) != 0 ||
950     !IS_NEWLINE(ptr)
951     ) &&
952 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
953     {
954 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
955     {
956     active_count--; /* Remove non-match possibility */
957     next_active_state--;
958     }
959 nigel 77 if (++count >= GET2(code, 1))
960     { ADD_NEW(state_offset + 4, 0); }
961     else
962     { ADD_NEW(state_offset, count); }
963     }
964     }
965     break;
966    
967     /* ========================================================================== */
968     /* These are virtual opcodes that are used when something like
969 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
970     argument. It keeps the code above fast for the other cases. The argument
971     is in the d variable. */
972 nigel 77
973     case OP_PROP_EXTRA + OP_TYPEPLUS:
974     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
975 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
976 nigel 77 count = current_state->count; /* Already matched */
977 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
978 nigel 77 if (clen > 0)
979     {
980 nigel 87 BOOL OK;
981     int category = _pcre_ucp_findprop(c, &chartype, &script);
982     switch(code[2])
983     {
984     case PT_ANY:
985     OK = TRUE;
986     break;
987    
988     case PT_LAMP:
989     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
990     break;
991    
992     case PT_GC:
993     OK = category == code[3];
994     break;
995    
996     case PT_PC:
997     OK = chartype == code[3];
998     break;
999    
1000     case PT_SC:
1001     OK = script == code[3];
1002     break;
1003    
1004     /* Should never occur, but keep compilers from grumbling. */
1005    
1006     default:
1007     OK = codevalue != OP_PROP;
1008     break;
1009     }
1010    
1011 nigel 93 if (OK == (d == OP_PROP))
1012     {
1013     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1014     {
1015     active_count--; /* Remove non-match possibility */
1016     next_active_state--;
1017     }
1018     count++;
1019     ADD_NEW(state_offset, count);
1020     }
1021 nigel 77 }
1022     break;
1023    
1024     /*-----------------------------------------------------------------*/
1025     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1026     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1027 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1028 nigel 77 count = current_state->count; /* Already matched */
1029     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1030 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1031 nigel 77 {
1032     const uschar *nptr = ptr + clen;
1033     int ncount = 0;
1034 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1035     {
1036     active_count--; /* Remove non-match possibility */
1037     next_active_state--;
1038     }
1039 nigel 77 while (nptr < end_subject)
1040     {
1041     int nd;
1042     int ndlen = 1;
1043     GETCHARLEN(nd, nptr, ndlen);
1044 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1045 nigel 77 ncount++;
1046     nptr += ndlen;
1047     }
1048     count++;
1049     ADD_NEW_DATA(-state_offset, count, ncount);
1050     }
1051     break;
1052    
1053     /*-----------------------------------------------------------------*/
1054 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1055     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1056     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1057     count = current_state->count; /* Already matched */
1058     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1059     if (clen > 0)
1060     {
1061     int ncount = 0;
1062     switch (c)
1063     {
1064     case 0x000d:
1065     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1066     /* Fall through */
1067     case 0x000a:
1068     case 0x000b:
1069     case 0x000c:
1070     case 0x0085:
1071     case 0x2028:
1072     case 0x2029:
1073     if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1074     {
1075     active_count--; /* Remove non-match possibility */
1076     next_active_state--;
1077     }
1078     count++;
1079     ADD_NEW_DATA(-state_offset, count, ncount);
1080     break;
1081     default:
1082     break;
1083     }
1084     }
1085     break;
1086    
1087     /*-----------------------------------------------------------------*/
1088 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1089     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1090 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1091 nigel 87 count = 4;
1092 nigel 77 goto QS1;
1093    
1094     case OP_PROP_EXTRA + OP_TYPESTAR:
1095     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1096 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1097 nigel 77 count = 0;
1098    
1099     QS1:
1100    
1101 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1102 nigel 77 if (clen > 0)
1103     {
1104 nigel 87 BOOL OK;
1105     int category = _pcre_ucp_findprop(c, &chartype, &script);
1106     switch(code[2])
1107     {
1108     case PT_ANY:
1109     OK = TRUE;
1110     break;
1111    
1112     case PT_LAMP:
1113     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1114     break;
1115    
1116     case PT_GC:
1117     OK = category == code[3];
1118     break;
1119    
1120     case PT_PC:
1121     OK = chartype == code[3];
1122     break;
1123    
1124     case PT_SC:
1125     OK = script == code[3];
1126     break;
1127    
1128     /* Should never occur, but keep compilers from grumbling. */
1129    
1130     default:
1131     OK = codevalue != OP_PROP;
1132     break;
1133     }
1134    
1135 nigel 93 if (OK == (d == OP_PROP))
1136     {
1137     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1138     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1139     {
1140     active_count--; /* Remove non-match possibility */
1141     next_active_state--;
1142     }
1143     ADD_NEW(state_offset + count, 0);
1144     }
1145 nigel 77 }
1146     break;
1147    
1148     /*-----------------------------------------------------------------*/
1149     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1150     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1151 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1152 nigel 77 count = 2;
1153     goto QS2;
1154    
1155     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1156     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1157 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1158 nigel 77 count = 0;
1159    
1160     QS2:
1161    
1162     ADD_ACTIVE(state_offset + 2, 0);
1163 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1164 nigel 77 {
1165     const uschar *nptr = ptr + clen;
1166     int ncount = 0;
1167 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1168     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1169     {
1170     active_count--; /* Remove non-match possibility */
1171     next_active_state--;
1172     }
1173 nigel 77 while (nptr < end_subject)
1174     {
1175     int nd;
1176     int ndlen = 1;
1177     GETCHARLEN(nd, nptr, ndlen);
1178 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1179 nigel 77 ncount++;
1180     nptr += ndlen;
1181     }
1182     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1183     }
1184     break;
1185    
1186     /*-----------------------------------------------------------------*/
1187 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1188     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1189     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1190     count = 2;
1191     goto QS3;
1192    
1193     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1194     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1195     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1196     count = 0;
1197    
1198     QS3:
1199     ADD_ACTIVE(state_offset + 2, 0);
1200     if (clen > 0)
1201     {
1202     int ncount = 0;
1203     switch (c)
1204     {
1205     case 0x000d:
1206     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1207     /* Fall through */
1208     case 0x000a:
1209     case 0x000b:
1210     case 0x000c:
1211     case 0x0085:
1212     case 0x2028:
1213     case 0x2029:
1214     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1215     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1216     {
1217     active_count--; /* Remove non-match possibility */
1218     next_active_state--;
1219     }
1220     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1221     break;
1222     default:
1223     break;
1224     }
1225     }
1226     break;
1227    
1228     /*-----------------------------------------------------------------*/
1229 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1230     case OP_PROP_EXTRA + OP_TYPEUPTO:
1231     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1232 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1233 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1234 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1235 nigel 77 count = current_state->count; /* Number already matched */
1236     if (clen > 0)
1237     {
1238 nigel 87 BOOL OK;
1239     int category = _pcre_ucp_findprop(c, &chartype, &script);
1240     switch(code[4])
1241 nigel 77 {
1242 nigel 87 case PT_ANY:
1243     OK = TRUE;
1244     break;
1245    
1246     case PT_LAMP:
1247     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1248     break;
1249    
1250     case PT_GC:
1251     OK = category == code[5];
1252     break;
1253    
1254     case PT_PC:
1255     OK = chartype == code[5];
1256     break;
1257    
1258     case PT_SC:
1259     OK = script == code[5];
1260     break;
1261    
1262     /* Should never occur, but keep compilers from grumbling. */
1263    
1264     default:
1265     OK = codevalue != OP_PROP;
1266     break;
1267     }
1268    
1269     if (OK == (d == OP_PROP))
1270     {
1271 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1272     {
1273     active_count--; /* Remove non-match possibility */
1274     next_active_state--;
1275     }
1276 nigel 77 if (++count >= GET2(code, 1))
1277 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1278 nigel 77 else
1279     { ADD_NEW(state_offset, count); }
1280     }
1281     }
1282     break;
1283    
1284     /*-----------------------------------------------------------------*/
1285     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1286     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1287     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1288 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1289 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1290     { ADD_ACTIVE(state_offset + 4, 0); }
1291     count = current_state->count; /* Number already matched */
1292 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1293 nigel 77 {
1294     const uschar *nptr = ptr + clen;
1295     int ncount = 0;
1296 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1297     {
1298     active_count--; /* Remove non-match possibility */
1299     next_active_state--;
1300     }
1301 nigel 77 while (nptr < end_subject)
1302     {
1303     int nd;
1304     int ndlen = 1;
1305     GETCHARLEN(nd, nptr, ndlen);
1306 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1307 nigel 77 ncount++;
1308     nptr += ndlen;
1309     }
1310     if (++count >= GET2(code, 1))
1311     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1312     else
1313     { ADD_NEW_DATA(-state_offset, count, ncount); }
1314     }
1315     break;
1316    
1317 nigel 93 /*-----------------------------------------------------------------*/
1318     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1319     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1320     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1321     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1322     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1323     { ADD_ACTIVE(state_offset + 4, 0); }
1324     count = current_state->count; /* Number already matched */
1325     if (clen > 0)
1326     {
1327     int ncount = 0;
1328     switch (c)
1329     {
1330     case 0x000d:
1331     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1332     /* Fall through */
1333     case 0x000a:
1334     case 0x000b:
1335     case 0x000c:
1336     case 0x0085:
1337     case 0x2028:
1338     case 0x2029:
1339     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1340     {
1341     active_count--; /* Remove non-match possibility */
1342     next_active_state--;
1343     }
1344     if (++count >= GET2(code, 1))
1345     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1346     else
1347     { ADD_NEW_DATA(-state_offset, count, ncount); }
1348     break;
1349     default:
1350     break;
1351     }
1352     }
1353     break;
1354    
1355 nigel 77 /* ========================================================================== */
1356     /* These opcodes are followed by a character that is usually compared
1357     to the current subject character; it is loaded into d. We still get
1358     here even if there is no subject character, because in some cases zero
1359     repetitions are permitted. */
1360    
1361     /*-----------------------------------------------------------------*/
1362     case OP_CHAR:
1363     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1364     break;
1365    
1366     /*-----------------------------------------------------------------*/
1367     case OP_CHARNC:
1368     if (clen == 0) break;
1369    
1370     #ifdef SUPPORT_UTF8
1371     if (utf8)
1372     {
1373     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1374     {
1375 nigel 93 unsigned int othercase;
1376 nigel 77 if (c < 128) othercase = fcc[c]; else
1377    
1378     /* If we have Unicode property support, we can use it to test the
1379 nigel 87 other case of the character. */
1380 nigel 77
1381     #ifdef SUPPORT_UCP
1382 nigel 87 othercase = _pcre_ucp_othercase(c);
1383     #else
1384 nigel 93 othercase = NOTACHAR;
1385 nigel 77 #endif
1386    
1387     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1388     }
1389     }
1390     else
1391     #endif /* SUPPORT_UTF8 */
1392    
1393     /* Non-UTF-8 mode */
1394     {
1395     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1396     }
1397     break;
1398    
1399    
1400     #ifdef SUPPORT_UCP
1401     /*-----------------------------------------------------------------*/
1402     /* This is a tricky one because it can match more than one character.
1403     Find out how many characters to skip, and then set up a negative state
1404     to wait for them to pass before continuing. */
1405    
1406     case OP_EXTUNI:
1407 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1408 nigel 77 {
1409     const uschar *nptr = ptr + clen;
1410     int ncount = 0;
1411     while (nptr < end_subject)
1412     {
1413     int nclen = 1;
1414     GETCHARLEN(c, nptr, nclen);
1415 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1416 nigel 77 ncount++;
1417     nptr += nclen;
1418     }
1419     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1420     }
1421     break;
1422     #endif
1423    
1424     /*-----------------------------------------------------------------*/
1425 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1426     character (when CR is followed by LF). In this case, set up a negative
1427     state to wait for one character to pass before continuing. */
1428    
1429     case OP_ANYNL:
1430     if (clen > 0) switch(c)
1431     {
1432     case 0x000a:
1433     case 0x000b:
1434     case 0x000c:
1435     case 0x0085:
1436     case 0x2028:
1437     case 0x2029:
1438     ADD_NEW(state_offset + 1, 0);
1439     break;
1440     case 0x000d:
1441     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1442     {
1443     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1444     }
1445     else
1446     {
1447     ADD_NEW(state_offset + 1, 0);
1448     }
1449     break;
1450     }
1451     break;
1452    
1453     /*-----------------------------------------------------------------*/
1454 nigel 77 /* Match a negated single character. This is only used for one-byte
1455     characters, that is, we know that d < 256. The character we are
1456     checking (c) can be multibyte. */
1457    
1458     case OP_NOT:
1459     if (clen > 0)
1460     {
1461 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1462 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1463     }
1464     break;
1465    
1466     /*-----------------------------------------------------------------*/
1467     case OP_PLUS:
1468     case OP_MINPLUS:
1469 nigel 93 case OP_POSPLUS:
1470 nigel 77 case OP_NOTPLUS:
1471     case OP_NOTMINPLUS:
1472 nigel 93 case OP_NOTPOSPLUS:
1473 nigel 77 count = current_state->count; /* Already matched */
1474     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1475     if (clen > 0)
1476     {
1477 nigel 93 unsigned int otherd = NOTACHAR;
1478 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1479     {
1480     #ifdef SUPPORT_UTF8
1481 nigel 87 if (utf8 && d >= 128)
1482 nigel 77 {
1483     #ifdef SUPPORT_UCP
1484 nigel 87 otherd = _pcre_ucp_othercase(d);
1485 nigel 77 #endif /* SUPPORT_UCP */
1486     }
1487     else
1488     #endif /* SUPPORT_UTF8 */
1489     otherd = fcc[d];
1490     }
1491     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1492 nigel 93 {
1493     if (count > 0 &&
1494     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1495     {
1496     active_count--; /* Remove non-match possibility */
1497     next_active_state--;
1498     }
1499     count++;
1500     ADD_NEW(state_offset, count);
1501     }
1502 nigel 77 }
1503     break;
1504    
1505     /*-----------------------------------------------------------------*/
1506     case OP_QUERY:
1507     case OP_MINQUERY:
1508 nigel 93 case OP_POSQUERY:
1509 nigel 77 case OP_NOTQUERY:
1510     case OP_NOTMINQUERY:
1511 nigel 93 case OP_NOTPOSQUERY:
1512 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1513     if (clen > 0)
1514     {
1515 nigel 93 unsigned int otherd = NOTACHAR;
1516 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1517 nigel 77 {
1518     #ifdef SUPPORT_UTF8
1519 nigel 87 if (utf8 && d >= 128)
1520 nigel 77 {
1521     #ifdef SUPPORT_UCP
1522 nigel 87 otherd = _pcre_ucp_othercase(d);
1523 nigel 77 #endif /* SUPPORT_UCP */
1524     }
1525     else
1526     #endif /* SUPPORT_UTF8 */
1527     otherd = fcc[d];
1528     }
1529     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1530 nigel 93 {
1531     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1532     {
1533     active_count--; /* Remove non-match possibility */
1534     next_active_state--;
1535     }
1536     ADD_NEW(state_offset + dlen + 1, 0);
1537     }
1538 nigel 77 }
1539     break;
1540    
1541     /*-----------------------------------------------------------------*/
1542     case OP_STAR:
1543     case OP_MINSTAR:
1544 nigel 93 case OP_POSSTAR:
1545 nigel 77 case OP_NOTSTAR:
1546     case OP_NOTMINSTAR:
1547 nigel 93 case OP_NOTPOSSTAR:
1548 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1549     if (clen > 0)
1550     {
1551 nigel 93 unsigned int otherd = NOTACHAR;
1552 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1553 nigel 77 {
1554     #ifdef SUPPORT_UTF8
1555 nigel 87 if (utf8 && d >= 128)
1556 nigel 77 {
1557     #ifdef SUPPORT_UCP
1558 nigel 87 otherd = _pcre_ucp_othercase(d);
1559 nigel 77 #endif /* SUPPORT_UCP */
1560     }
1561     else
1562     #endif /* SUPPORT_UTF8 */
1563     otherd = fcc[d];
1564     }
1565     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1566 nigel 93 {
1567     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1568     {
1569     active_count--; /* Remove non-match possibility */
1570     next_active_state--;
1571     }
1572     ADD_NEW(state_offset, 0);
1573     }
1574 nigel 77 }
1575     break;
1576    
1577     /*-----------------------------------------------------------------*/
1578     case OP_EXACT:
1579 nigel 93 case OP_NOTEXACT:
1580     count = current_state->count; /* Number already matched */
1581     if (clen > 0)
1582     {
1583     unsigned int otherd = NOTACHAR;
1584     if ((ims & PCRE_CASELESS) != 0)
1585     {
1586     #ifdef SUPPORT_UTF8
1587     if (utf8 && d >= 128)
1588     {
1589     #ifdef SUPPORT_UCP
1590     otherd = _pcre_ucp_othercase(d);
1591     #endif /* SUPPORT_UCP */
1592     }
1593     else
1594     #endif /* SUPPORT_UTF8 */
1595     otherd = fcc[d];
1596     }
1597     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1598     {
1599     if (++count >= GET2(code, 1))
1600     { ADD_NEW(state_offset + dlen + 3, 0); }
1601     else
1602     { ADD_NEW(state_offset, count); }
1603     }
1604     }
1605     break;
1606    
1607     /*-----------------------------------------------------------------*/
1608 nigel 77 case OP_UPTO:
1609     case OP_MINUPTO:
1610 nigel 93 case OP_POSUPTO:
1611 nigel 77 case OP_NOTUPTO:
1612     case OP_NOTMINUPTO:
1613 nigel 93 case OP_NOTPOSUPTO:
1614     ADD_ACTIVE(state_offset + dlen + 3, 0);
1615 nigel 77 count = current_state->count; /* Number already matched */
1616     if (clen > 0)
1617     {
1618 nigel 93 unsigned int otherd = NOTACHAR;
1619 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1620     {
1621     #ifdef SUPPORT_UTF8
1622 nigel 87 if (utf8 && d >= 128)
1623 nigel 77 {
1624     #ifdef SUPPORT_UCP
1625 nigel 87 otherd = _pcre_ucp_othercase(d);
1626 nigel 77 #endif /* SUPPORT_UCP */
1627     }
1628     else
1629     #endif /* SUPPORT_UTF8 */
1630     otherd = fcc[d];
1631     }
1632     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1633     {
1634 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1635     {
1636     active_count--; /* Remove non-match possibility */
1637     next_active_state--;
1638     }
1639 nigel 77 if (++count >= GET2(code, 1))
1640     { ADD_NEW(state_offset + dlen + 3, 0); }
1641     else
1642     { ADD_NEW(state_offset, count); }
1643     }
1644     }
1645     break;
1646    
1647    
1648     /* ========================================================================== */
1649     /* These are the class-handling opcodes */
1650    
1651     case OP_CLASS:
1652     case OP_NCLASS:
1653     case OP_XCLASS:
1654     {
1655     BOOL isinclass = FALSE;
1656     int next_state_offset;
1657     const uschar *ecode;
1658    
1659     /* For a simple class, there is always just a 32-byte table, and we
1660     can set isinclass from it. */
1661    
1662     if (codevalue != OP_XCLASS)
1663     {
1664     ecode = code + 33;
1665     if (clen > 0)
1666     {
1667     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1668     ((code[1 + c/8] & (1 << (c&7))) != 0);
1669     }
1670     }
1671    
1672     /* An extended class may have a table or a list of single characters,
1673     ranges, or both, and it may be positive or negative. There's a
1674     function that sorts all this out. */
1675    
1676     else
1677     {
1678     ecode = code + GET(code, 1);
1679     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1680     }
1681    
1682     /* At this point, isinclass is set for all kinds of class, and ecode
1683     points to the byte after the end of the class. If there is a
1684     quantifier, this is where it will be. */
1685    
1686     next_state_offset = ecode - start_code;
1687    
1688     switch (*ecode)
1689     {
1690     case OP_CRSTAR:
1691     case OP_CRMINSTAR:
1692     ADD_ACTIVE(next_state_offset + 1, 0);
1693     if (isinclass) { ADD_NEW(state_offset, 0); }
1694     break;
1695    
1696     case OP_CRPLUS:
1697     case OP_CRMINPLUS:
1698     count = current_state->count; /* Already matched */
1699     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1700     if (isinclass) { count++; ADD_NEW(state_offset, count); }
1701     break;
1702    
1703     case OP_CRQUERY:
1704     case OP_CRMINQUERY:
1705     ADD_ACTIVE(next_state_offset + 1, 0);
1706     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1707     break;
1708    
1709     case OP_CRRANGE:
1710     case OP_CRMINRANGE:
1711     count = current_state->count; /* Already matched */
1712     if (count >= GET2(ecode, 1))
1713     { ADD_ACTIVE(next_state_offset + 5, 0); }
1714     if (isinclass)
1715     {
1716 nigel 91 int max = GET2(ecode, 3);
1717     if (++count >= max && max != 0) /* Max 0 => no limit */
1718 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
1719     else
1720     { ADD_NEW(state_offset, count); }
1721     }
1722     break;
1723    
1724     default:
1725     if (isinclass) { ADD_NEW(next_state_offset, 0); }
1726     break;
1727     }
1728     }
1729     break;
1730    
1731     /* ========================================================================== */
1732     /* These are the opcodes for fancy brackets of various kinds. We have
1733     to use recursion in order to handle them. */
1734    
1735     case OP_ASSERT:
1736     case OP_ASSERT_NOT:
1737     case OP_ASSERTBACK:
1738     case OP_ASSERTBACK_NOT:
1739     {
1740     int rc;
1741     int local_offsets[2];
1742     int local_workspace[1000];
1743     const uschar *endasscode = code + GET(code, 1);
1744    
1745     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1746    
1747     rc = internal_dfa_exec(
1748     md, /* static match data */
1749     code, /* this subexpression's code */
1750     ptr, /* where we currently are */
1751     ptr - start_subject, /* start offset */
1752     local_offsets, /* offset vector */
1753     sizeof(local_offsets)/sizeof(int), /* size of same */
1754     local_workspace, /* workspace vector */
1755     sizeof(local_workspace)/sizeof(int), /* size of same */
1756     ims, /* the current ims flags */
1757     rlevel, /* function recursion level */
1758     recursing); /* pass on regex recursion */
1759    
1760     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1761     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1762     }
1763     break;
1764    
1765     /*-----------------------------------------------------------------*/
1766     case OP_COND:
1767 nigel 93 case OP_SCOND:
1768 nigel 77 {
1769     int local_offsets[1000];
1770     int local_workspace[1000];
1771     int condcode = code[LINK_SIZE+1];
1772    
1773 nigel 93 /* Back reference conditions are not supported */
1774 nigel 77
1775 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1776    
1777     /* The DEFINE condition is always false */
1778    
1779     if (condcode == OP_DEF)
1780 nigel 77 {
1781 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1782     }
1783    
1784     /* The only supported version of OP_RREF is for the value RREF_ANY,
1785     which means "test if in any recursion". We can't test for specifically
1786     recursed groups. */
1787    
1788     else if (condcode == OP_RREF)
1789     {
1790 nigel 77 int value = GET2(code, LINK_SIZE+2);
1791 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1792 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1793     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1794     }
1795    
1796     /* Otherwise, the condition is an assertion */
1797    
1798     else
1799     {
1800     int rc;
1801     const uschar *asscode = code + LINK_SIZE + 1;
1802     const uschar *endasscode = asscode + GET(asscode, 1);
1803    
1804     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1805    
1806     rc = internal_dfa_exec(
1807     md, /* fixed match data */
1808     asscode, /* this subexpression's code */
1809     ptr, /* where we currently are */
1810     ptr - start_subject, /* start offset */
1811     local_offsets, /* offset vector */
1812     sizeof(local_offsets)/sizeof(int), /* size of same */
1813     local_workspace, /* workspace vector */
1814     sizeof(local_workspace)/sizeof(int), /* size of same */
1815     ims, /* the current ims flags */
1816     rlevel, /* function recursion level */
1817     recursing); /* pass on regex recursion */
1818    
1819     if ((rc >= 0) ==
1820     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1821     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1822     else
1823     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1824     }
1825     }
1826     break;
1827    
1828     /*-----------------------------------------------------------------*/
1829     case OP_RECURSE:
1830     {
1831     int local_offsets[1000];
1832     int local_workspace[1000];
1833     int rc;
1834    
1835     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1836     recursing + 1));
1837    
1838     rc = internal_dfa_exec(
1839     md, /* fixed match data */
1840     start_code + GET(code, 1), /* this subexpression's code */
1841     ptr, /* where we currently are */
1842     ptr - start_subject, /* start offset */
1843     local_offsets, /* offset vector */
1844     sizeof(local_offsets)/sizeof(int), /* size of same */
1845     local_workspace, /* workspace vector */
1846     sizeof(local_workspace)/sizeof(int), /* size of same */
1847     ims, /* the current ims flags */
1848     rlevel, /* function recursion level */
1849     recursing + 1); /* regex recurse level */
1850    
1851     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1852     recursing + 1, rc));
1853    
1854     /* Ran out of internal offsets */
1855    
1856     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1857    
1858     /* For each successful matched substring, set up the next state with a
1859     count of characters to skip before trying it. Note that the count is in
1860     characters, not bytes. */
1861    
1862     if (rc > 0)
1863     {
1864     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1865     {
1866     const uschar *p = start_subject + local_offsets[rc];
1867     const uschar *pp = start_subject + local_offsets[rc+1];
1868     int charcount = local_offsets[rc+1] - local_offsets[rc];
1869     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1870     if (charcount > 0)
1871     {
1872     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1873     }
1874     else
1875     {
1876     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1877     }
1878     }
1879     }
1880     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1881     }
1882     break;
1883    
1884     /*-----------------------------------------------------------------*/
1885     case OP_ONCE:
1886     {
1887     int local_offsets[2];
1888     int local_workspace[1000];
1889    
1890     int rc = internal_dfa_exec(
1891     md, /* fixed match data */
1892     code, /* this subexpression's code */
1893     ptr, /* where we currently are */
1894     ptr - start_subject, /* start offset */
1895     local_offsets, /* offset vector */
1896     sizeof(local_offsets)/sizeof(int), /* size of same */
1897     local_workspace, /* workspace vector */
1898     sizeof(local_workspace)/sizeof(int), /* size of same */
1899     ims, /* the current ims flags */
1900     rlevel, /* function recursion level */
1901     recursing); /* pass on regex recursion */
1902    
1903     if (rc >= 0)
1904     {
1905     const uschar *end_subpattern = code;
1906     int charcount = local_offsets[1] - local_offsets[0];
1907     int next_state_offset, repeat_state_offset;
1908    
1909     do { end_subpattern += GET(end_subpattern, 1); }
1910     while (*end_subpattern == OP_ALT);
1911     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1912    
1913     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1914     arrange for the repeat state also to be added to the relevant list.
1915     Calculate the offset, or set -1 for no repeat. */
1916    
1917     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1918     *end_subpattern == OP_KETRMIN)?
1919     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1920    
1921     /* If we have matched an empty string, add the next state at the
1922     current character pointer. This is important so that the duplicate
1923     checking kicks in, which is what breaks infinite loops that match an
1924     empty string. */
1925    
1926     if (charcount == 0)
1927     {
1928     ADD_ACTIVE(next_state_offset, 0);
1929     }
1930    
1931     /* Optimization: if there are no more active states, and there
1932     are no new states yet set up, then skip over the subject string
1933     right here, to save looping. Otherwise, set up the new state to swing
1934     into action when the end of the substring is reached. */
1935    
1936     else if (i + 1 >= active_count && new_count == 0)
1937     {
1938     ptr += charcount;
1939     clen = 0;
1940     ADD_NEW(next_state_offset, 0);
1941    
1942     /* If we are adding a repeat state at the new character position,
1943     we must fudge things so that it is the only current state.
1944     Otherwise, it might be a duplicate of one we processed before, and
1945     that would cause it to be skipped. */
1946    
1947     if (repeat_state_offset >= 0)
1948     {
1949     next_active_state = active_states;
1950     active_count = 0;
1951     i = -1;
1952     ADD_ACTIVE(repeat_state_offset, 0);
1953     }
1954     }
1955     else
1956     {
1957     const uschar *p = start_subject + local_offsets[0];
1958     const uschar *pp = start_subject + local_offsets[1];
1959     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1960     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1961     if (repeat_state_offset >= 0)
1962     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1963     }
1964    
1965     }
1966     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1967     }
1968     break;
1969    
1970    
1971     /* ========================================================================== */
1972     /* Handle callouts */
1973    
1974     case OP_CALLOUT:
1975     if (pcre_callout != NULL)
1976     {
1977     int rrc;
1978     pcre_callout_block cb;
1979     cb.version = 1; /* Version 1 of the callout block */
1980     cb.callout_number = code[1];
1981     cb.offset_vector = offsets;
1982 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
1983 nigel 77 cb.subject_length = end_subject - start_subject;
1984     cb.start_match = current_subject - start_subject;
1985     cb.current_position = ptr - start_subject;
1986     cb.pattern_position = GET(code, 2);
1987     cb.next_item_length = GET(code, 2 + LINK_SIZE);
1988     cb.capture_top = 1;
1989     cb.capture_last = -1;
1990     cb.callout_data = md->callout_data;
1991     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1992     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1993     }
1994     break;
1995    
1996    
1997     /* ========================================================================== */
1998     default: /* Unsupported opcode */
1999     return PCRE_ERROR_DFA_UITEM;
2000     }
2001    
2002     NEXT_ACTIVE_STATE: continue;
2003    
2004     } /* End of loop scanning active states */
2005    
2006     /* We have finished the processing at the current subject character. If no
2007     new states have been set for the next character, we have found all the
2008     matches that we are going to find. If we are at the top level and partial
2009     matching has been requested, check for appropriate conditions. */
2010    
2011     if (new_count <= 0)
2012     {
2013     if (match_count < 0 && /* No matches found */
2014     rlevel == 1 && /* Top level match function */
2015     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2016     ptr >= end_subject && /* Reached end of subject */
2017     ptr > current_subject) /* Matched non-empty string */
2018     {
2019     if (offsetcount >= 2)
2020     {
2021     offsets[0] = current_subject - start_subject;
2022     offsets[1] = end_subject - start_subject;
2023     }
2024     match_count = PCRE_ERROR_PARTIAL;
2025     }
2026    
2027     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2028     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2029     rlevel*2-2, SP));
2030 nigel 91 break; /* In effect, "return", but see the comment below */
2031 nigel 77 }
2032    
2033     /* One or more states are active for the next character. */
2034    
2035     ptr += clen; /* Advance to next subject character */
2036     } /* Loop to move along the subject string */
2037    
2038 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2039     if we use "return" above, we have compiler trouble. Some compilers warn if
2040     there's nothing here because they think the function doesn't return a value. On
2041     the other hand, if we put a dummy statement here, some more clever compilers
2042     complain that it can't be reached. Sigh. */
2043 nigel 77
2044 nigel 91 return match_count;
2045 nigel 77 }
2046    
2047    
2048    
2049    
2050     /*************************************************
2051     * Execute a Regular Expression - DFA engine *
2052     *************************************************/
2053    
2054     /* This external function applies a compiled re to a subject string using a DFA
2055     engine. This function calls the internal function multiple times if the pattern
2056     is not anchored.
2057    
2058     Arguments:
2059     argument_re points to the compiled expression
2060 ph10 97 extra_data points to extra data or is NULL
2061 nigel 77 subject points to the subject string
2062     length length of subject string (may contain binary zeros)
2063     start_offset where to start in the subject string
2064     options option bits
2065     offsets vector of match offsets
2066     offsetcount size of same
2067     workspace workspace vector
2068     wscount size of same
2069    
2070     Returns: > 0 => number of match offset pairs placed in offsets
2071     = 0 => offsets overflowed; longest matches are present
2072     -1 => failed to match
2073     < -1 => some kind of unexpected problem
2074     */
2075    
2076 nigel 87 PCRE_DATA_SCOPE int
2077 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2078     const char *subject, int length, int start_offset, int options, int *offsets,
2079     int offsetcount, int *workspace, int wscount)
2080     {
2081     real_pcre *re = (real_pcre *)argument_re;
2082     dfa_match_data match_block;
2083 nigel 91 dfa_match_data *md = &match_block;
2084 nigel 77 BOOL utf8, anchored, startline, firstline;
2085     const uschar *current_subject, *end_subject, *lcc;
2086    
2087     pcre_study_data internal_study;
2088     const pcre_study_data *study = NULL;
2089     real_pcre internal_re;
2090    
2091     const uschar *req_byte_ptr;
2092     const uschar *start_bits = NULL;
2093     BOOL first_byte_caseless = FALSE;
2094     BOOL req_byte_caseless = FALSE;
2095     int first_byte = -1;
2096     int req_byte = -1;
2097     int req_byte2 = -1;
2098 nigel 91 int newline;
2099 nigel 77
2100     /* Plausibility checks */
2101    
2102     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2103     if (re == NULL || subject == NULL || workspace == NULL ||
2104     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2105     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2106     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2107    
2108     /* We need to find the pointer to any study data before we test for byte
2109     flipping, so we scan the extra_data block first. This may set two fields in the
2110     match block, so we must initialize them beforehand. However, the other fields
2111     in the match block must not be set until after the byte flipping. */
2112    
2113 nigel 91 md->tables = re->tables;
2114     md->callout_data = NULL;
2115 nigel 77
2116     if (extra_data != NULL)
2117     {
2118     unsigned int flags = extra_data->flags;
2119     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2120     study = (const pcre_study_data *)extra_data->study_data;
2121     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2122 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2123     return PCRE_ERROR_DFA_UMLIMIT;
2124 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2125 nigel 91 md->callout_data = extra_data->callout_data;
2126 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2127 nigel 91 md->tables = extra_data->tables;
2128 nigel 77 }
2129    
2130     /* Check that the first field in the block is the magic number. If it is not,
2131     test for a regex that was compiled on a host of opposite endianness. If this is
2132     the case, flipped values are put in internal_re and internal_study if there was
2133     study data too. */
2134    
2135     if (re->magic_number != MAGIC_NUMBER)
2136     {
2137     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2138     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2139     if (study != NULL) study = &internal_study;
2140     }
2141    
2142     /* Set some local values */
2143    
2144     current_subject = (const unsigned char *)subject + start_offset;
2145     end_subject = (const unsigned char *)subject + length;
2146     req_byte_ptr = current_subject - 1;
2147    
2148 nigel 91 #ifdef SUPPORT_UTF8
2149 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2150 nigel 91 #else
2151     utf8 = FALSE;
2152     #endif
2153 nigel 77
2154 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2155     (re->options & PCRE_ANCHORED) != 0;
2156    
2157 nigel 77 /* The remaining fixed data for passing around. */
2158    
2159 nigel 91 md->start_code = (const uschar *)argument_re +
2160 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2161 nigel 91 md->start_subject = (const unsigned char *)subject;
2162     md->end_subject = end_subject;
2163     md->moptions = options;
2164     md->poptions = re->options;
2165 nigel 77
2166 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2167     nothing is set at run time, whatever was used at compile time applies. */
2168 nigel 91
2169 nigel 93 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
2170     PCRE_NEWLINE_BITS)
2171 nigel 91 {
2172 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2173 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2174     case PCRE_NEWLINE_LF: newline = '\n'; break;
2175     case PCRE_NEWLINE_CR+
2176     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2177 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2178     default: return PCRE_ERROR_BADNEWLINE;
2179 nigel 91 }
2180    
2181 nigel 93 if (newline < 0)
2182 nigel 91 {
2183 nigel 93 md->nltype = NLTYPE_ANY;
2184 nigel 91 }
2185     else
2186     {
2187 nigel 93 md->nltype = NLTYPE_FIXED;
2188     if (newline > 255)
2189     {
2190     md->nllen = 2;
2191     md->nl[0] = (newline >> 8) & 255;
2192     md->nl[1] = newline & 255;
2193     }
2194     else
2195     {
2196     md->nllen = 1;
2197     md->nl[0] = newline;
2198     }
2199 nigel 91 }
2200    
2201 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2202     back the character offset. */
2203    
2204     #ifdef SUPPORT_UTF8
2205     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2206     {
2207     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2208     return PCRE_ERROR_BADUTF8;
2209     if (start_offset > 0 && start_offset < length)
2210     {
2211     int tb = ((uschar *)subject)[start_offset];
2212     if (tb > 127)
2213     {
2214     tb &= 0xc0;
2215     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2216     }
2217     }
2218     }
2219     #endif
2220    
2221     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2222     is a feature that makes it possible to save compiled regex and re-use them
2223     in other programs later. */
2224    
2225 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2226 nigel 77
2227     /* The lower casing table and the "must be at the start of a line" flag are
2228     used in a loop when finding where to start. */
2229    
2230 nigel 91 lcc = md->tables + lcc_offset;
2231 nigel 77 startline = (re->options & PCRE_STARTLINE) != 0;
2232     firstline = (re->options & PCRE_FIRSTLINE) != 0;
2233    
2234     /* Set up the first character to match, if available. The first_byte value is
2235     never set for an anchored regular expression, but the anchoring may be forced
2236     at run time, so we have to test for anchoring. The first char may be unset for
2237     an unanchored pattern, of course. If there's no first char and the pattern was
2238     studied, there may be a bitmap of possible first characters. */
2239    
2240     if (!anchored)
2241     {
2242     if ((re->options & PCRE_FIRSTSET) != 0)
2243     {
2244     first_byte = re->first_byte & 255;
2245     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2246     first_byte = lcc[first_byte];
2247     }
2248     else
2249     {
2250     if (startline && study != NULL &&
2251     (study->options & PCRE_STUDY_MAPPED) != 0)
2252     start_bits = study->start_bits;
2253     }
2254     }
2255    
2256     /* For anchored or unanchored matches, there may be a "last known required
2257     character" set. */
2258    
2259     if ((re->options & PCRE_REQCHSET) != 0)
2260     {
2261     req_byte = re->req_byte & 255;
2262     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2263 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2264 nigel 77 }
2265    
2266     /* Call the main matching function, looping for a non-anchored regex after a
2267     failed match. Unless restarting, optimize by moving to the first match
2268     character if possible, when not anchored. Then unless wanting a partial match,
2269     check for a required later character. */
2270    
2271     for (;;)
2272     {
2273     int rc;
2274    
2275     if ((options & PCRE_DFA_RESTART) == 0)
2276     {
2277     const uschar *save_end_subject = end_subject;
2278    
2279     /* Advance to a unique first char if possible. If firstline is TRUE, the
2280     start of the match is constrained to the first line of a multiline string.
2281 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2282     scanning at a newline. If the match fails at the newline, later code breaks
2283     this loop. */
2284 nigel 77
2285     if (firstline)
2286     {
2287     const uschar *t = current_subject;
2288 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2289 nigel 77 end_subject = t;
2290     }
2291    
2292     if (first_byte >= 0)
2293     {
2294     if (first_byte_caseless)
2295     while (current_subject < end_subject &&
2296     lcc[*current_subject] != first_byte)
2297     current_subject++;
2298     else
2299     while (current_subject < end_subject && *current_subject != first_byte)
2300     current_subject++;
2301     }
2302    
2303 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2304 nigel 77
2305     else if (startline)
2306     {
2307 nigel 93 if (current_subject > md->start_subject + start_offset)
2308 nigel 77 {
2309 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2310 nigel 77 current_subject++;
2311 ph10 130
2312     /* If we have just passed a CR and the newline option is ANY, and we
2313     are now at a LF, advance the match position by one more character. */
2314 ph10 134
2315 ph10 130 if (current_subject[-1] == '\r' &&
2316     md->nltype == NLTYPE_ANY &&
2317     current_subject < end_subject &&
2318     *current_subject == '\n')
2319     current_subject++;
2320 nigel 77 }
2321     }
2322    
2323     /* Or to a non-unique first char after study */
2324    
2325     else if (start_bits != NULL)
2326     {
2327     while (current_subject < end_subject)
2328     {
2329     register unsigned int c = *current_subject;
2330     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2331     else break;
2332     }
2333     }
2334    
2335     /* Restore fudged end_subject */
2336    
2337     end_subject = save_end_subject;
2338     }
2339    
2340     /* If req_byte is set, we know that that character must appear in the subject
2341     for the match to succeed. If the first character is set, req_byte must be
2342     later in the subject; otherwise the test starts at the match point. This
2343     optimization can save a huge amount of work in patterns with nested unlimited
2344     repeats that aren't going to match. Writing separate code for cased/caseless
2345     versions makes it go faster, as does using an autoincrement and backing off
2346     on a match.
2347    
2348     HOWEVER: when the subject string is very, very long, searching to its end can
2349     take a long time, and give bad performance on quite ordinary patterns. This
2350     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2351     don't do this when the string is sufficiently long.
2352    
2353     ALSO: this processing is disabled when partial matching is requested.
2354     */
2355    
2356     if (req_byte >= 0 &&
2357     end_subject - current_subject < REQ_BYTE_MAX &&
2358     (options & PCRE_PARTIAL) == 0)
2359     {
2360     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2361    
2362     /* We don't need to repeat the search if we haven't yet reached the
2363     place we found it at last time. */
2364    
2365     if (p > req_byte_ptr)
2366     {
2367     if (req_byte_caseless)
2368     {
2369     while (p < end_subject)
2370     {
2371     register int pp = *p++;
2372     if (pp == req_byte || pp == req_byte2) { p--; break; }
2373     }
2374     }
2375     else
2376     {
2377     while (p < end_subject)
2378     {
2379     if (*p++ == req_byte) { p--; break; }
2380     }
2381     }
2382    
2383     /* If we can't find the required character, break the matching loop,
2384     which will cause a return or PCRE_ERROR_NOMATCH. */
2385    
2386     if (p >= end_subject) break;
2387    
2388     /* If we have found the required character, save the point where we
2389     found it, so that we don't search again next time round the loop if
2390     the start hasn't passed this character yet. */
2391    
2392     req_byte_ptr = p;
2393     }
2394     }
2395    
2396     /* OK, now we can do the business */
2397    
2398     rc = internal_dfa_exec(
2399 nigel 91 md, /* fixed match data */
2400     md->start_code, /* this subexpression's code */
2401     current_subject, /* where we currently are */
2402     start_offset, /* start offset in subject */
2403     offsets, /* offset vector */
2404     offsetcount, /* size of same */
2405     workspace, /* workspace vector */
2406     wscount, /* size of same */
2407 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2408 nigel 91 0, /* function recurse level */
2409     0); /* regex recurse level */
2410 nigel 77
2411     /* Anything other than "no match" means we are done, always; otherwise, carry
2412     on only if not anchored. */
2413    
2414     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2415    
2416     /* Advance to the next subject character unless we are at the end of a line
2417     and firstline is set. */
2418    
2419 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2420 nigel 77 current_subject++;
2421     if (utf8)
2422     {
2423     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2424     current_subject++;
2425     }
2426     if (current_subject > end_subject) break;
2427    
2428 nigel 93 /* If we have just passed a CR and the newline option is CRLF or ANY, and we
2429     are now at a LF, advance the match position by one more character. */
2430    
2431     if (current_subject[-1] == '\r' &&
2432     (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
2433     current_subject < end_subject &&
2434     *current_subject == '\n')
2435     current_subject++;
2436    
2437     } /* "Bumpalong" loop */
2438    
2439 nigel 77 return PCRE_ERROR_NOMATCH;
2440     }
2441    
2442     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12