/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 152 - (hide annotations) (download)
Tue Apr 17 15:55:53 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 81531 byte(s)
Typos in the docs, missing casts and #ifdefs in the code.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 nigel 93 #define NLBLOCK md /* Block containing newline information */
48     #define PSSTART start_subject /* Field containing processed string start */
49     #define PSEND end_subject /* Field containing processed string end */
50    
51 nigel 77 #include "pcre_internal.h"
52    
53    
/* For use to indent debugging output. SP is printed with "%.*s" and a
precision of rlevel*2-2, so the visible indent grows with the call depth and
saturates at the length of this string. It therefore has to be a run of spaces,
not a single one. */

#define SP "                    "
57    
58    
59    
60     /*************************************************
61     * Code parameters and static tables *
62     *************************************************/
63    
/* These are offsets that are added to the OP_TYPESTAR and friends opcodes to
turn them into private, DFA-only opcode values when the repeated item is one of
\p/\P, \X or \R (see the switch on the item type in the main loop). Each block
of derived values starts at its own offset; a gap of 20 between the blocks
should be enough. */

#define OP_PROP_EXTRA       100
#define OP_EXTUNI_EXTRA     120
#define OP_ANYNL_EXTRA      140
71 nigel 77
72    
73     /* This table identifies those opcodes that are followed immediately by a
74     character that is to be tested in some way. This makes is possible to
75     centralize the loading of these characters. In the case of Type * etc, the
76     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
77     small value. */
78    
79     static uschar coptable[] = {
80     0, /* End */
81     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
82     0, 0, /* Any, Anybyte */
83 nigel 93 0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */
84 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
85     1, /* Char */
86     1, /* Charnc */
87     1, /* not */
88     /* Positive single-char repeats */
89     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
90     3, 3, 3, /* upto, minupto, exact */
91 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
92 nigel 77 /* Negative single-char repeats - only for chars < 256 */
93     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
94     3, 3, 3, /* NOT upto, minupto, exact */
95 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
96 nigel 77 /* Positive type repeats */
97     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
98     3, 3, 3, /* Type upto, minupto, exact */
99 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
100 nigel 77 /* Character class & ref repeats */
101     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
102     0, 0, /* CRRANGE, CRMINRANGE */
103     0, /* CLASS */
104     0, /* NCLASS */
105     0, /* XCLASS - variable length */
106     0, /* REF */
107     0, /* RECURSE */
108     0, /* CALLOUT */
109     0, /* Alt */
110     0, /* Ket */
111     0, /* KetRmax */
112     0, /* KetRmin */
113     0, /* Assert */
114     0, /* Assert not */
115     0, /* Assert behind */
116     0, /* Assert behind not */
117     0, /* Reverse */
118 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
119     0, 0, 0, /* SBRA, SCBRA, SCOND */
120 nigel 77 0, /* CREF */
121 nigel 93 0, /* RREF */
122     0, /* DEF */
123     0, 0 /* BRAZERO, BRAMINZERO */
124 nigel 77 };
125    
126     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
127     and \w */
128    
129     static uschar toptable1[] = {
130     0, 0, 0, 0, 0,
131     ctype_digit, ctype_digit,
132     ctype_space, ctype_space,
133     ctype_word, ctype_word,
134     0 /* OP_ANY */
135     };
136    
137     static uschar toptable2[] = {
138     0, 0, 0, 0, 0,
139     ctype_digit, 0,
140     ctype_space, 0,
141     ctype_word, 0,
142     1 /* OP_ANY */
143     };
144    
145    
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; negated while the   */
                                  /*   state is being held back            */
  int count;                      /* Count for repeats                     */
  int ims;                        /* ims flag bits                         */
  int data;                       /* Some use extra data                   */
} stateblock;

/* Number of workspace ints that one stateblock occupies. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
159    
160    
161     #ifdef DEBUG
/*************************************************
*        Print character string                  *
*************************************************/

/* Character string printing function for debugging. Bytes for which isprint()
is true are written unchanged; every other byte is written as a backslash, an
'x', and two lower-case hexadecimal digits.

Arguments:
  p           points to string (only read, never modified)
  length      number of bytes; nothing is written if this is <= 0
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;   /* unsigned char source, so always a valid isprint() argument */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
188     #endif
189    
190    
191    
192     /*************************************************
193     * Execute a Regular Expression - DFA engine *
194     *************************************************/
195    
196     /* This internal function applies a compiled pattern to a subject string,
197     starting at a given point, using a DFA engine. This function is called from the
198     external one, possibly multiple times if the pattern is not anchored. The
199     function calls itself recursively for some kinds of subpattern.
200    
201     Arguments:
202     md the match_data block with fixed information
203     this_start_code the opening bracket of this subexpression's code
204     current_subject where we currently are in the subject string
205     start_offset start offset in the subject string
206     offsets vector to contain the matching string offsets
207     offsetcount size of same
208     workspace vector of workspace
209     wscount size of same
210     ims the current ims flags
211     rlevel function call recursion level
212     recursing regex recursive call level
213    
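214     Returns: > 0 => number of matches whose offset pairs are in offsets
215     = 0 => matched, but offsets is too small for them all; longest kept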
216     -1 => failed to match
217     < -1 => some kind of unexpected problem
218    
219     The following macros are used for adding states to the two state vectors (one
220     for the current character, one for the following character). */
221    
/* Helpers for appending one state to a state list.

ADD_ACTIVE and ADD_ACTIVE_DATA append to the list for the current character
(active_count / next_active_state); ADD_NEW and ADD_NEW_DATA append to the list
for the following character (new_count / next_new_state). They are not
self-contained: the names ims, wscount and rlevel, plus the relevant counter
and pointer, must be in scope where the macro is used.

  x   opcode offset to store in the new state
  y   repeat count to store
  z   (_DATA forms only) value for the data field; the plain forms leave the
      data field of the slot as it was

If the list already holds wscount states, nothing is stored and the ENCLOSING
FUNCTION returns PCRE_ERROR_DFA_WSSIZE. The arguments are repeated in the
DPRINTF call, so they should be free of side effects. Each macro is wrapped in
do { } while (0) so that it behaves as a single statement; the invocation must
be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
267    
268     /* And now, here is the code */
269    
270     static int
271     internal_dfa_exec(
272     dfa_match_data *md,
273     const uschar *this_start_code,
274     const uschar *current_subject,
275     int start_offset,
276     int *offsets,
277     int offsetcount,
278     int *workspace,
279     int wscount,
280     int ims,
281     int rlevel,
282     int recursing)
283     {
284     stateblock *active_states, *new_states, *temp_states;
285     stateblock *next_active_state, *next_new_state;
286    
287     const uschar *ctypes, *lcc, *fcc;
288     const uschar *ptr;
289 nigel 93 const uschar *end_code, *first_op;
290 nigel 77
291     int active_count, new_count, match_count;
292    
293     /* Some fields in the md block are frequently referenced, so we load them into
294     independent variables in the hope that this will perform better. */
295    
296     const uschar *start_subject = md->start_subject;
297     const uschar *end_subject = md->end_subject;
298     const uschar *start_code = md->start_code;
299    
300 nigel 87 #ifdef SUPPORT_UTF8
301 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
302 nigel 93 #else
303     BOOL utf8 = FALSE;
304 nigel 87 #endif
305 nigel 77
306     rlevel++;
307     offsetcount &= (-2);
308    
309     wscount -= 2;
310     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
311     (2 * INTS_PER_STATEBLOCK);
312    
313     DPRINTF(("\n%.*s---------------------\n"
314     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
315     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
316    
317     ctypes = md->tables + ctypes_offset;
318     lcc = md->tables + lcc_offset;
319     fcc = md->tables + fcc_offset;
320    
321     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
322    
323     active_states = (stateblock *)(workspace + 2);
324     next_new_state = new_states = active_states + wscount;
325     new_count = 0;
326    
327 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
328     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
329    
330 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
331     the alternative states onto the list, and find out where the end is. This
332     makes it possible to use this function recursively, when we want to stop at a
333     matching internal ket rather than at the end.
334    
335     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
336     a backward assertion. In that case, we have to find out the maximum amount to
337     move back, and set up each alternative appropriately. */
338    
339 nigel 93 if (*first_op == OP_REVERSE)
340 nigel 77 {
341     int max_back = 0;
342     int gone_back;
343    
344     end_code = this_start_code;
345     do
346     {
347     int back = GET(end_code, 2+LINK_SIZE);
348     if (back > max_back) max_back = back;
349     end_code += GET(end_code, 1);
350     }
351     while (*end_code == OP_ALT);
352    
353     /* If we can't go back the amount required for the longest lookbehind
354     pattern, go back as far as we can; some alternatives may still be viable. */
355    
356     #ifdef SUPPORT_UTF8
357     /* In character mode we have to step back character by character */
358    
359     if (utf8)
360     {
361     for (gone_back = 0; gone_back < max_back; gone_back++)
362     {
363     if (current_subject <= start_subject) break;
364     current_subject--;
365     while (current_subject > start_subject &&
366     (*current_subject & 0xc0) == 0x80)
367     current_subject--;
368     }
369     }
370     else
371     #endif
372    
373     /* In byte-mode we can do this quickly. */
374    
375     {
376     gone_back = (current_subject - max_back < start_subject)?
377     current_subject - start_subject : max_back;
378     current_subject -= gone_back;
379     }
380    
381     /* Now we can process the individual branches. */
382    
383     end_code = this_start_code;
384     do
385     {
386     int back = GET(end_code, 2+LINK_SIZE);
387     if (back <= gone_back)
388     {
389     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
390     ADD_NEW_DATA(-bstate, 0, gone_back - back);
391     }
392     end_code += GET(end_code, 1);
393     }
394     while (*end_code == OP_ALT);
395     }
396    
397     /* This is the code for a "normal" subpattern (not a backward assertion). The
398     start of a whole pattern is always one of these. If we are at the top level,
399     we may be asked to restart matching from the same point that we reached for a
400     previous partial match. We still have to scan through the top-level branches to
401     find the end state. */
402    
403     else
404     {
405     end_code = this_start_code;
406    
407     /* Restarting */
408    
409     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
410     {
411     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
412     new_count = workspace[1];
413     if (!workspace[0])
414     memcpy(new_states, active_states, new_count * sizeof(stateblock));
415     }
416    
417     /* Not restarting */
418    
419     else
420     {
421 nigel 93 int length = 1 + LINK_SIZE +
422     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
423 nigel 77 do
424     {
425 nigel 93 ADD_NEW(end_code - start_code + length, 0);
426 nigel 77 end_code += GET(end_code, 1);
427 nigel 93 length = 1 + LINK_SIZE;
428 nigel 77 }
429     while (*end_code == OP_ALT);
430     }
431     }
432    
433     workspace[0] = 0; /* Bit indicating which vector is current */
434    
435     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
436    
437     /* Loop for scanning the subject */
438    
439     ptr = current_subject;
440     for (;;)
441     {
442     int i, j;
443 nigel 91 int clen, dlen;
444     unsigned int c, d;
445 nigel 77
446     /* Make the new state list into the active state list and empty the
447     new state list. */
448    
449     temp_states = active_states;
450     active_states = new_states;
451     new_states = temp_states;
452     active_count = new_count;
453     new_count = 0;
454    
455     workspace[0] ^= 1; /* Remember for the restarting feature */
456     workspace[1] = active_count;
457    
458     #ifdef DEBUG
459     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
460     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
461     printf("\"\n");
462    
463     printf("%.*sActive states: ", rlevel*2-2, SP);
464     for (i = 0; i < active_count; i++)
465     printf("%d/%d ", active_states[i].offset, active_states[i].count);
466     printf("\n");
467     #endif
468    
469     /* Set the pointers for adding new states */
470    
471     next_active_state = active_states + active_count;
472     next_new_state = new_states;
473    
474     /* Load the current character from the subject outside the loop, as many
475     different states may want to look at it, and we assume that at least one
476     will. */
477    
478     if (ptr < end_subject)
479     {
480 nigel 93 clen = 1; /* Number of bytes in the character */
481 nigel 77 #ifdef SUPPORT_UTF8
482     if (utf8) { GETCHARLEN(c, ptr, clen); } else
483     #endif /* SUPPORT_UTF8 */
484     c = *ptr;
485     }
486     else
487     {
488 nigel 93 clen = 0; /* This indicates the end of the subject */
489     c = NOTACHAR; /* This value should never actually be used */
490 nigel 77 }
491    
492     /* Scan up the active states and act on each one. The result of an action
493     may be to add more states to the currently active list (e.g. on hitting a
494     parenthesis) or it may be to put states on the new list, for considering
495     when we move the character pointer on. */
496    
497     for (i = 0; i < active_count; i++)
498     {
499     stateblock *current_state = active_states + i;
500     const uschar *code;
501     int state_offset = current_state->offset;
502     int count, codevalue;
503 ph10 152 #ifdef SUPPORT_UCP
504 nigel 87 int chartype, script;
505 ph10 152 #endif
506 nigel 77
507     #ifdef DEBUG
508     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
509 nigel 93 if (clen == 0) printf("EOL\n");
510 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
511     else printf("0x%02x\n", c);
512     #endif
513    
514     /* This variable is referred to implicitly in the ADD_xxx macros. */
515    
516     ims = current_state->ims;
517    
518     /* A negative offset is a special case meaning "hold off going to this
519     (negated) state until the number of characters in the data field have
520     been skipped". */
521    
522     if (state_offset < 0)
523     {
524     if (current_state->data > 0)
525     {
526     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
527     ADD_NEW_DATA(state_offset, current_state->count,
528     current_state->data - 1);
529     continue;
530     }
531     else
532     {
533     current_state->offset = state_offset = -state_offset;
534     }
535     }
536    
537     /* Check for a duplicate state with the same count, and skip if found. */
538    
539     for (j = 0; j < i; j++)
540     {
541     if (active_states[j].offset == state_offset &&
542     active_states[j].count == current_state->count)
543     {
544     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
545     goto NEXT_ACTIVE_STATE;
546     }
547     }
548    
549     /* The state offset is the offset to the opcode */
550    
551     code = start_code + state_offset;
552     codevalue = *code;
553    
554     /* If this opcode is followed by an inline character, load it. It is
555     tempting to test for the presence of a subject character here, but that
556     is wrong, because sometimes zero repetitions of the subject are
557     permitted.
558    
559     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
560     argument that is not a data character - but is always one byte long.
561     Unfortunately, we have to take special action to deal with \P, \p, and
562     \X in this case. To keep the other cases fast, convert these ones to new
563     opcodes. */
564    
565     if (coptable[codevalue] > 0)
566     {
567     dlen = 1;
568     #ifdef SUPPORT_UTF8
569     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
570     #endif /* SUPPORT_UTF8 */
571     d = code[coptable[codevalue]];
572     if (codevalue >= OP_TYPESTAR)
573     {
574 nigel 93 switch(d)
575     {
576     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
577     case OP_NOTPROP:
578     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
579     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
580     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
581     default: break;
582     }
583 nigel 77 }
584     }
585     else
586     {
587     dlen = 0; /* Not strictly necessary, but compilers moan */
588 nigel 93 d = NOTACHAR; /* if these variables are not set. */
589 nigel 77 }
590    
591    
592     /* Now process the individual opcodes */
593    
594     switch (codevalue)
595     {
596    
597     /* ========================================================================== */
598     /* Reached a closing bracket. If not at the end of the pattern, carry
599     on with the next opcode. Otherwise, unless we have an empty string and
600     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
601     matches so we always have the longest first. */
602    
603     case OP_KET:
604     case OP_KETRMIN:
605     case OP_KETRMAX:
606     if (code != end_code)
607     {
608     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
609     if (codevalue != OP_KET)
610     {
611     ADD_ACTIVE(state_offset - GET(code, 1), 0);
612     }
613     }
614     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
615     {
616     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
617     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
618     match_count = 0;
619     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
620     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
621     if (offsetcount >= 2)
622     {
623     offsets[0] = current_subject - start_subject;
624     offsets[1] = ptr - start_subject;
625     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
626     offsets[1] - offsets[0], current_subject));
627     }
628     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
629     {
630     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
631     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
632     match_count, rlevel*2-2, SP));
633     return match_count;
634     }
635     }
636     break;
637    
638     /* ========================================================================== */
639     /* These opcodes add to the current list of states without looking
640     at the current character. */
641    
642     /*-----------------------------------------------------------------*/
643     case OP_ALT:
644     do { code += GET(code, 1); } while (*code == OP_ALT);
645     ADD_ACTIVE(code - start_code, 0);
646     break;
647    
648     /*-----------------------------------------------------------------*/
649     case OP_BRA:
650 nigel 93 case OP_SBRA:
651 nigel 77 do
652     {
653     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
654     code += GET(code, 1);
655     }
656     while (*code == OP_ALT);
657     break;
658    
659     /*-----------------------------------------------------------------*/
660 nigel 93 case OP_CBRA:
661     case OP_SCBRA:
662     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
663     code += GET(code, 1);
664     while (*code == OP_ALT)
665     {
666     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
667     code += GET(code, 1);
668     }
669     break;
670    
671     /*-----------------------------------------------------------------*/
672 nigel 77 case OP_BRAZERO:
673     case OP_BRAMINZERO:
674     ADD_ACTIVE(state_offset + 1, 0);
675     code += 1 + GET(code, 2);
676     while (*code == OP_ALT) code += GET(code, 1);
677     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
678     break;
679    
680     /*-----------------------------------------------------------------*/
681     case OP_CIRC:
682     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
683 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
684     ptr != end_subject &&
685 nigel 93 WAS_NEWLINE(ptr)))
686 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
687     break;
688    
689     /*-----------------------------------------------------------------*/
690     case OP_EOD:
691     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
692     break;
693    
694     /*-----------------------------------------------------------------*/
695     case OP_OPT:
696     ims = code[1];
697     ADD_ACTIVE(state_offset + 2, 0);
698     break;
699    
700     /*-----------------------------------------------------------------*/
701     case OP_SOD:
702     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
703     break;
704    
705     /*-----------------------------------------------------------------*/
706     case OP_SOM:
707     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
708     break;
709    
710    
711     /* ========================================================================== */
712     /* These opcodes inspect the next subject character, and sometimes
713     the previous one as well, but do not have an argument. The variable
714     clen contains the length of the current character and is zero if we are
715     at the end of the subject. */
716    
717     /*-----------------------------------------------------------------*/
718     case OP_ANY:
719 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
720 nigel 77 { ADD_NEW(state_offset + 1, 0); }
721     break;
722    
723     /*-----------------------------------------------------------------*/
724     case OP_EODN:
725 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
726 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
727     break;
728    
729     /*-----------------------------------------------------------------*/
730     case OP_DOLL:
731     if ((md->moptions & PCRE_NOTEOL) == 0)
732     {
733 nigel 91 if (clen == 0 ||
734 nigel 93 (IS_NEWLINE(ptr) &&
735 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
736     ))
737 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
738     }
739 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
740 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
741     break;
742    
743     /*-----------------------------------------------------------------*/
744    
745     case OP_DIGIT:
746     case OP_WHITESPACE:
747     case OP_WORDCHAR:
748     if (clen > 0 && c < 256 &&
749     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
750     { ADD_NEW(state_offset + 1, 0); }
751     break;
752    
753     /*-----------------------------------------------------------------*/
754     case OP_NOT_DIGIT:
755     case OP_NOT_WHITESPACE:
756     case OP_NOT_WORDCHAR:
757     if (clen > 0 && (c >= 256 ||
758     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
759     { ADD_NEW(state_offset + 1, 0); }
760     break;
761    
762     /*-----------------------------------------------------------------*/
763     case OP_WORD_BOUNDARY:
764     case OP_NOT_WORD_BOUNDARY:
765     {
766     int left_word, right_word;
767    
768     if (ptr > start_subject)
769     {
770     const uschar *temp = ptr - 1;
771     #ifdef SUPPORT_UTF8
772     if (utf8) BACKCHAR(temp);
773     #endif
774     GETCHARTEST(d, temp);
775     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
776     }
777     else left_word = 0;
778    
779     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
780     else right_word = 0;
781    
782     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
783     { ADD_ACTIVE(state_offset + 1, 0); }
784     }
785     break;
786    
787    
788     /*-----------------------------------------------------------------*/
789     /* Check the next character by Unicode property. We will get here only
790     if the support is in the binary; otherwise a compile-time error occurs.
791     */
792    
793 ph10 151 #ifdef SUPPORT_UCP
794 nigel 77 case OP_PROP:
795     case OP_NOTPROP:
796     if (clen > 0)
797     {
798 nigel 87 BOOL OK;
799     int category = _pcre_ucp_findprop(c, &chartype, &script);
800     switch(code[1])
801 nigel 77 {
802 nigel 87 case PT_ANY:
803     OK = TRUE;
804     break;
805    
806     case PT_LAMP:
807     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
808     break;
809    
810     case PT_GC:
811     OK = category == code[2];
812     break;
813    
814     case PT_PC:
815     OK = chartype == code[2];
816     break;
817    
818     case PT_SC:
819     OK = script == code[2];
820     break;
821    
822     /* Should never occur, but keep compilers from grumbling. */
823    
824     default:
825     OK = codevalue != OP_PROP;
826     break;
827 nigel 77 }
828 nigel 87
829     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
830 nigel 77 }
831     break;
832     #endif
833    
834    
835    
836     /* ========================================================================== */
837     /* These opcodes likewise inspect the subject character, but have an
838     argument that is not a data character. It is one of these opcodes:
839     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
840     OP_NOT_WORDCHAR. The value is loaded into d. */
841    
842     case OP_TYPEPLUS:
843     case OP_TYPEMINPLUS:
844 nigel 93 case OP_TYPEPOSPLUS:
845 nigel 77 count = current_state->count; /* Already matched */
846     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
847     if (clen > 0)
848     {
849     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
850     (c < 256 &&
851 nigel 91 (d != OP_ANY ||
852     (ims & PCRE_DOTALL) != 0 ||
853     !IS_NEWLINE(ptr)
854     ) &&
855 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
856     {
857 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
858     {
859     active_count--; /* Remove non-match possibility */
860     next_active_state--;
861     }
862 nigel 77 count++;
863     ADD_NEW(state_offset, count);
864     }
865     }
866     break;
867    
868     /*-----------------------------------------------------------------*/
869     case OP_TYPEQUERY:
870     case OP_TYPEMINQUERY:
871 nigel 93 case OP_TYPEPOSQUERY:
872 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
873     if (clen > 0)
874     {
875     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
876     (c < 256 &&
877 nigel 91 (d != OP_ANY ||
878     (ims & PCRE_DOTALL) != 0 ||
879     !IS_NEWLINE(ptr)
880     ) &&
881 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
882     {
883 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
884     {
885     active_count--; /* Remove non-match possibility */
886     next_active_state--;
887     }
888 nigel 77 ADD_NEW(state_offset + 2, 0);
889     }
890     }
891     break;
892    
893     /*-----------------------------------------------------------------*/
894     case OP_TYPESTAR:
895     case OP_TYPEMINSTAR:
896 nigel 93 case OP_TYPEPOSSTAR:
897 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
898     if (clen > 0)
899     {
900     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
901     (c < 256 &&
902 nigel 91 (d != OP_ANY ||
903     (ims & PCRE_DOTALL) != 0 ||
904     !IS_NEWLINE(ptr)
905     ) &&
906 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
907     {
908 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
909     {
910     active_count--; /* Remove non-match possibility */
911     next_active_state--;
912     }
913 nigel 77 ADD_NEW(state_offset, 0);
914     }
915     }
916     break;
917    
918     /*-----------------------------------------------------------------*/
919     case OP_TYPEEXACT:
920 nigel 93 count = current_state->count; /* Number already matched */
921     if (clen > 0)
922     {
923     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
924     (c < 256 &&
925     (d != OP_ANY ||
926     (ims & PCRE_DOTALL) != 0 ||
927     !IS_NEWLINE(ptr)
928     ) &&
929     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
930     {
931     if (++count >= GET2(code, 1))
932     { ADD_NEW(state_offset + 4, 0); }
933     else
934     { ADD_NEW(state_offset, count); }
935     }
936     }
937     break;
938    
939     /*-----------------------------------------------------------------*/
940 nigel 77 case OP_TYPEUPTO:
941     case OP_TYPEMINUPTO:
942 nigel 93 case OP_TYPEPOSUPTO:
943     ADD_ACTIVE(state_offset + 4, 0);
944 nigel 77 count = current_state->count; /* Number already matched */
945     if (clen > 0)
946     {
947     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
948     (c < 256 &&
949 nigel 91 (d != OP_ANY ||
950     (ims & PCRE_DOTALL) != 0 ||
951     !IS_NEWLINE(ptr)
952     ) &&
953 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
954     {
955 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
956     {
957     active_count--; /* Remove non-match possibility */
958     next_active_state--;
959     }
960 nigel 77 if (++count >= GET2(code, 1))
961     { ADD_NEW(state_offset + 4, 0); }
962     else
963     { ADD_NEW(state_offset, count); }
964     }
965     }
966     break;
967    
968     /* ========================================================================== */
969     /* These are virtual opcodes that are used when something like
970 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
971     argument. It keeps the code above fast for the other cases. The argument
972     is in the d variable. */
973 nigel 77
974 ph10 151 #ifdef SUPPORT_UCP
975 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
976     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
977 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
978 nigel 77 count = current_state->count; /* Already matched */
979 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
980 nigel 77 if (clen > 0)
981     {
982 nigel 87 BOOL OK;
983     int category = _pcre_ucp_findprop(c, &chartype, &script);
984     switch(code[2])
985     {
986     case PT_ANY:
987     OK = TRUE;
988     break;
989    
990     case PT_LAMP:
991     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
992     break;
993    
994     case PT_GC:
995     OK = category == code[3];
996     break;
997    
998     case PT_PC:
999     OK = chartype == code[3];
1000     break;
1001    
1002     case PT_SC:
1003     OK = script == code[3];
1004     break;
1005    
1006     /* Should never occur, but keep compilers from grumbling. */
1007    
1008     default:
1009     OK = codevalue != OP_PROP;
1010     break;
1011     }
1012    
1013 nigel 93 if (OK == (d == OP_PROP))
1014     {
1015     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1016     {
1017     active_count--; /* Remove non-match possibility */
1018     next_active_state--;
1019     }
1020     count++;
1021     ADD_NEW(state_offset, count);
1022     }
1023 nigel 77 }
1024     break;
1025    
1026     /*-----------------------------------------------------------------*/
1027     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1028     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1029 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1030 nigel 77 count = current_state->count; /* Already matched */
1031     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1032 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1033 nigel 77 {
1034     const uschar *nptr = ptr + clen;
1035     int ncount = 0;
1036 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1037     {
1038     active_count--; /* Remove non-match possibility */
1039     next_active_state--;
1040     }
1041 nigel 77 while (nptr < end_subject)
1042     {
1043     int nd;
1044     int ndlen = 1;
1045     GETCHARLEN(nd, nptr, ndlen);
1046 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1047 nigel 77 ncount++;
1048     nptr += ndlen;
1049     }
1050     count++;
1051     ADD_NEW_DATA(-state_offset, count, ncount);
1052     }
1053     break;
1054 ph10 151 #endif
1055 nigel 77
1056     /*-----------------------------------------------------------------*/
1057 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1058     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1059     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1060     count = current_state->count; /* Already matched */
1061     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1062     if (clen > 0)
1063     {
1064     int ncount = 0;
1065     switch (c)
1066     {
1067     case 0x000d:
1068     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1069     /* Fall through */
1070     case 0x000a:
1071     case 0x000b:
1072     case 0x000c:
1073     case 0x0085:
1074     case 0x2028:
1075     case 0x2029:
1076     if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1077     {
1078     active_count--; /* Remove non-match possibility */
1079     next_active_state--;
1080     }
1081     count++;
1082     ADD_NEW_DATA(-state_offset, count, ncount);
1083     break;
1084     default:
1085     break;
1086     }
1087     }
1088     break;
1089    
1090     /*-----------------------------------------------------------------*/
1091 ph10 151 #ifdef SUPPORT_UCP
1092 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1093     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1094 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1095 nigel 87 count = 4;
1096 nigel 77 goto QS1;
1097    
1098     case OP_PROP_EXTRA + OP_TYPESTAR:
1099     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1100 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1101 nigel 77 count = 0;
1102    
1103     QS1:
1104    
1105 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1106 nigel 77 if (clen > 0)
1107     {
1108 nigel 87 BOOL OK;
1109     int category = _pcre_ucp_findprop(c, &chartype, &script);
1110     switch(code[2])
1111     {
1112     case PT_ANY:
1113     OK = TRUE;
1114     break;
1115    
1116     case PT_LAMP:
1117     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1118     break;
1119    
1120     case PT_GC:
1121     OK = category == code[3];
1122     break;
1123    
1124     case PT_PC:
1125     OK = chartype == code[3];
1126     break;
1127    
1128     case PT_SC:
1129     OK = script == code[3];
1130     break;
1131    
1132     /* Should never occur, but keep compilers from grumbling. */
1133    
1134     default:
1135     OK = codevalue != OP_PROP;
1136     break;
1137     }
1138    
1139 nigel 93 if (OK == (d == OP_PROP))
1140     {
1141     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1142     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1143     {
1144     active_count--; /* Remove non-match possibility */
1145     next_active_state--;
1146     }
1147     ADD_NEW(state_offset + count, 0);
1148     }
1149 nigel 77 }
1150     break;
1151    
1152     /*-----------------------------------------------------------------*/
1153     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1154     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1155 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1156 nigel 77 count = 2;
1157     goto QS2;
1158    
1159     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1160     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1161 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1162 nigel 77 count = 0;
1163    
1164     QS2:
1165    
1166     ADD_ACTIVE(state_offset + 2, 0);
1167 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1168 nigel 77 {
1169     const uschar *nptr = ptr + clen;
1170     int ncount = 0;
1171 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1172     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1173     {
1174     active_count--; /* Remove non-match possibility */
1175     next_active_state--;
1176     }
1177 nigel 77 while (nptr < end_subject)
1178     {
1179     int nd;
1180     int ndlen = 1;
1181     GETCHARLEN(nd, nptr, ndlen);
1182 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1183 nigel 77 ncount++;
1184     nptr += ndlen;
1185     }
1186     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1187     }
1188     break;
1189 ph10 151 #endif
1190 nigel 77
1191     /*-----------------------------------------------------------------*/
1192 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1193     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1194     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1195     count = 2;
1196     goto QS3;
1197    
1198     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1199     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1200     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1201     count = 0;
1202    
1203     QS3:
1204     ADD_ACTIVE(state_offset + 2, 0);
1205     if (clen > 0)
1206     {
1207     int ncount = 0;
1208     switch (c)
1209     {
1210     case 0x000d:
1211     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1212     /* Fall through */
1213     case 0x000a:
1214     case 0x000b:
1215     case 0x000c:
1216     case 0x0085:
1217     case 0x2028:
1218     case 0x2029:
1219     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1220     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1221     {
1222     active_count--; /* Remove non-match possibility */
1223     next_active_state--;
1224     }
1225     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1226     break;
1227     default:
1228     break;
1229     }
1230     }
1231     break;
1232    
1233     /*-----------------------------------------------------------------*/
1234 ph10 151 #ifdef SUPPORT_UCP
1235 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1236     case OP_PROP_EXTRA + OP_TYPEUPTO:
1237     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1238 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1239 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1240 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1241 nigel 77 count = current_state->count; /* Number already matched */
1242     if (clen > 0)
1243     {
1244 nigel 87 BOOL OK;
1245     int category = _pcre_ucp_findprop(c, &chartype, &script);
1246     switch(code[4])
1247 nigel 77 {
1248 nigel 87 case PT_ANY:
1249     OK = TRUE;
1250     break;
1251    
1252     case PT_LAMP:
1253     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1254     break;
1255    
1256     case PT_GC:
1257     OK = category == code[5];
1258     break;
1259    
1260     case PT_PC:
1261     OK = chartype == code[5];
1262     break;
1263    
1264     case PT_SC:
1265     OK = script == code[5];
1266     break;
1267    
1268     /* Should never occur, but keep compilers from grumbling. */
1269    
1270     default:
1271     OK = codevalue != OP_PROP;
1272     break;
1273     }
1274    
1275     if (OK == (d == OP_PROP))
1276     {
1277 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1278     {
1279     active_count--; /* Remove non-match possibility */
1280     next_active_state--;
1281     }
1282 nigel 77 if (++count >= GET2(code, 1))
1283 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1284 nigel 77 else
1285     { ADD_NEW(state_offset, count); }
1286     }
1287     }
1288     break;
1289    
1290     /*-----------------------------------------------------------------*/
1291     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1292     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1293     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1294 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1295 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1296     { ADD_ACTIVE(state_offset + 4, 0); }
1297     count = current_state->count; /* Number already matched */
1298 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1299 nigel 77 {
1300     const uschar *nptr = ptr + clen;
1301     int ncount = 0;
1302 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1303     {
1304     active_count--; /* Remove non-match possibility */
1305     next_active_state--;
1306     }
1307 nigel 77 while (nptr < end_subject)
1308     {
1309     int nd;
1310     int ndlen = 1;
1311     GETCHARLEN(nd, nptr, ndlen);
1312 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1313 nigel 77 ncount++;
1314     nptr += ndlen;
1315     }
1316     if (++count >= GET2(code, 1))
1317     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1318     else
1319     { ADD_NEW_DATA(-state_offset, count, ncount); }
1320     }
1321     break;
1322 ph10 151 #endif
1323 nigel 77
1324 nigel 93 /*-----------------------------------------------------------------*/
1325     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1326     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1327     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1328     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1329     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1330     { ADD_ACTIVE(state_offset + 4, 0); }
1331     count = current_state->count; /* Number already matched */
1332     if (clen > 0)
1333     {
1334     int ncount = 0;
1335     switch (c)
1336     {
1337     case 0x000d:
1338     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1339     /* Fall through */
1340     case 0x000a:
1341     case 0x000b:
1342     case 0x000c:
1343     case 0x0085:
1344     case 0x2028:
1345     case 0x2029:
1346     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1347     {
1348     active_count--; /* Remove non-match possibility */
1349     next_active_state--;
1350     }
1351     if (++count >= GET2(code, 1))
1352     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1353     else
1354     { ADD_NEW_DATA(-state_offset, count, ncount); }
1355     break;
1356     default:
1357     break;
1358     }
1359     }
1360     break;
1361    
1362 nigel 77 /* ========================================================================== */
1363     /* These opcodes are followed by a character that is usually compared
1364     to the current subject character; it is loaded into d. We still get
1365     here even if there is no subject character, because in some cases zero
1366     repetitions are permitted. */
1367    
1368     /*-----------------------------------------------------------------*/
1369     case OP_CHAR:
1370     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1371     break;
1372    
1373     /*-----------------------------------------------------------------*/
1374     case OP_CHARNC:
1375     if (clen == 0) break;
1376    
1377     #ifdef SUPPORT_UTF8
1378     if (utf8)
1379     {
1380     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1381     {
1382 nigel 93 unsigned int othercase;
1383 nigel 77 if (c < 128) othercase = fcc[c]; else
1384    
1385     /* If we have Unicode property support, we can use it to test the
1386 nigel 87 other case of the character. */
1387 nigel 77
1388     #ifdef SUPPORT_UCP
1389 nigel 87 othercase = _pcre_ucp_othercase(c);
1390     #else
1391 nigel 93 othercase = NOTACHAR;
1392 nigel 77 #endif
1393    
1394     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1395     }
1396     }
1397     else
1398     #endif /* SUPPORT_UTF8 */
1399    
1400     /* Non-UTF-8 mode */
1401     {
1402     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1403     }
1404     break;
1405    
1406    
1407     #ifdef SUPPORT_UCP
1408     /*-----------------------------------------------------------------*/
1409     /* This is a tricky one because it can match more than one character.
1410     Find out how many characters to skip, and then set up a negative state
1411     to wait for them to pass before continuing. */
1412    
1413     case OP_EXTUNI:
1414 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1415 nigel 77 {
1416     const uschar *nptr = ptr + clen;
1417     int ncount = 0;
1418     while (nptr < end_subject)
1419     {
1420     int nclen = 1;
1421     GETCHARLEN(c, nptr, nclen);
1422 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1423 nigel 77 ncount++;
1424     nptr += nclen;
1425     }
1426     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1427     }
1428     break;
1429     #endif
1430    
1431     /*-----------------------------------------------------------------*/
1432 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
1433     character (when CR is followed by LF). In this case, set up a negative
1434     state to wait for one character to pass before continuing. */
1435    
1436     case OP_ANYNL:
1437     if (clen > 0) switch(c)
1438     {
1439     case 0x000a:
1440     case 0x000b:
1441     case 0x000c:
1442     case 0x0085:
1443     case 0x2028:
1444     case 0x2029:
1445     ADD_NEW(state_offset + 1, 0);
1446     break;
1447     case 0x000d:
1448     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1449     {
1450     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1451     }
1452     else
1453     {
1454     ADD_NEW(state_offset + 1, 0);
1455     }
1456     break;
1457     }
1458     break;
1459    
1460     /*-----------------------------------------------------------------*/
1461 nigel 77 /* Match a negated single character. This is only used for one-byte
1462     characters, that is, we know that d < 256. The character we are
1463     checking (c) can be multibyte. */
1464    
1465     case OP_NOT:
1466     if (clen > 0)
1467     {
1468 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1469 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1470     }
1471     break;
1472    
1473     /*-----------------------------------------------------------------*/
1474     case OP_PLUS:
1475     case OP_MINPLUS:
1476 nigel 93 case OP_POSPLUS:
1477 nigel 77 case OP_NOTPLUS:
1478     case OP_NOTMINPLUS:
1479 nigel 93 case OP_NOTPOSPLUS:
1480 nigel 77 count = current_state->count; /* Already matched */
1481     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1482     if (clen > 0)
1483     {
1484 nigel 93 unsigned int otherd = NOTACHAR;
1485 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1486     {
1487     #ifdef SUPPORT_UTF8
1488 nigel 87 if (utf8 && d >= 128)
1489 nigel 77 {
1490     #ifdef SUPPORT_UCP
1491 nigel 87 otherd = _pcre_ucp_othercase(d);
1492 nigel 77 #endif /* SUPPORT_UCP */
1493     }
1494     else
1495     #endif /* SUPPORT_UTF8 */
1496     otherd = fcc[d];
1497     }
1498     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1499 nigel 93 {
1500     if (count > 0 &&
1501     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1502     {
1503     active_count--; /* Remove non-match possibility */
1504     next_active_state--;
1505     }
1506     count++;
1507     ADD_NEW(state_offset, count);
1508     }
1509 nigel 77 }
1510     break;
1511    
1512     /*-----------------------------------------------------------------*/
1513     case OP_QUERY:
1514     case OP_MINQUERY:
1515 nigel 93 case OP_POSQUERY:
1516 nigel 77 case OP_NOTQUERY:
1517     case OP_NOTMINQUERY:
1518 nigel 93 case OP_NOTPOSQUERY:
1519 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1520     if (clen > 0)
1521     {
1522 nigel 93 unsigned int otherd = NOTACHAR;
1523 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1524 nigel 77 {
1525     #ifdef SUPPORT_UTF8
1526 nigel 87 if (utf8 && d >= 128)
1527 nigel 77 {
1528     #ifdef SUPPORT_UCP
1529 nigel 87 otherd = _pcre_ucp_othercase(d);
1530 nigel 77 #endif /* SUPPORT_UCP */
1531     }
1532     else
1533     #endif /* SUPPORT_UTF8 */
1534     otherd = fcc[d];
1535     }
1536     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1537 nigel 93 {
1538     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1539     {
1540     active_count--; /* Remove non-match possibility */
1541     next_active_state--;
1542     }
1543     ADD_NEW(state_offset + dlen + 1, 0);
1544     }
1545 nigel 77 }
1546     break;
1547    
1548     /*-----------------------------------------------------------------*/
1549     case OP_STAR:
1550     case OP_MINSTAR:
1551 nigel 93 case OP_POSSTAR:
1552 nigel 77 case OP_NOTSTAR:
1553     case OP_NOTMINSTAR:
1554 nigel 93 case OP_NOTPOSSTAR:
1555 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1556     if (clen > 0)
1557     {
1558 nigel 93 unsigned int otherd = NOTACHAR;
1559 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1560 nigel 77 {
1561     #ifdef SUPPORT_UTF8
1562 nigel 87 if (utf8 && d >= 128)
1563 nigel 77 {
1564     #ifdef SUPPORT_UCP
1565 nigel 87 otherd = _pcre_ucp_othercase(d);
1566 nigel 77 #endif /* SUPPORT_UCP */
1567     }
1568     else
1569     #endif /* SUPPORT_UTF8 */
1570     otherd = fcc[d];
1571     }
1572     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1573 nigel 93 {
1574     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1575     {
1576     active_count--; /* Remove non-match possibility */
1577     next_active_state--;
1578     }
1579     ADD_NEW(state_offset, 0);
1580     }
1581 nigel 77 }
1582     break;
1583    
1584     /*-----------------------------------------------------------------*/
1585     case OP_EXACT:
1586 nigel 93 case OP_NOTEXACT:
1587     count = current_state->count; /* Number already matched */
1588     if (clen > 0)
1589     {
1590     unsigned int otherd = NOTACHAR;
1591     if ((ims & PCRE_CASELESS) != 0)
1592     {
1593     #ifdef SUPPORT_UTF8
1594     if (utf8 && d >= 128)
1595     {
1596     #ifdef SUPPORT_UCP
1597     otherd = _pcre_ucp_othercase(d);
1598     #endif /* SUPPORT_UCP */
1599     }
1600     else
1601     #endif /* SUPPORT_UTF8 */
1602     otherd = fcc[d];
1603     }
1604     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1605     {
1606     if (++count >= GET2(code, 1))
1607     { ADD_NEW(state_offset + dlen + 3, 0); }
1608     else
1609     { ADD_NEW(state_offset, count); }
1610     }
1611     }
1612     break;
1613    
1614     /*-----------------------------------------------------------------*/
1615 nigel 77 case OP_UPTO:
1616     case OP_MINUPTO:
1617 nigel 93 case OP_POSUPTO:
1618 nigel 77 case OP_NOTUPTO:
1619     case OP_NOTMINUPTO:
1620 nigel 93 case OP_NOTPOSUPTO:
1621     ADD_ACTIVE(state_offset + dlen + 3, 0);
1622 nigel 77 count = current_state->count; /* Number already matched */
1623     if (clen > 0)
1624     {
1625 nigel 93 unsigned int otherd = NOTACHAR;
1626 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1627     {
1628     #ifdef SUPPORT_UTF8
1629 nigel 87 if (utf8 && d >= 128)
1630 nigel 77 {
1631     #ifdef SUPPORT_UCP
1632 nigel 87 otherd = _pcre_ucp_othercase(d);
1633 nigel 77 #endif /* SUPPORT_UCP */
1634     }
1635     else
1636     #endif /* SUPPORT_UTF8 */
1637     otherd = fcc[d];
1638     }
1639     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1640     {
1641 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1642     {
1643     active_count--; /* Remove non-match possibility */
1644     next_active_state--;
1645     }
1646 nigel 77 if (++count >= GET2(code, 1))
1647     { ADD_NEW(state_offset + dlen + 3, 0); }
1648     else
1649     { ADD_NEW(state_offset, count); }
1650     }
1651     }
1652     break;
1653    
1654    
1655     /* ========================================================================== */
1656     /* These are the class-handling opcodes */
1657    
1658     case OP_CLASS:
1659     case OP_NCLASS:
1660     case OP_XCLASS:
1661     {
1662     BOOL isinclass = FALSE;
1663     int next_state_offset;
1664     const uschar *ecode;
1665    
1666     /* For a simple class, there is always just a 32-byte table, and we
1667     can set isinclass from it. */
1668    
1669     if (codevalue != OP_XCLASS)
1670     {
1671     ecode = code + 33;
1672     if (clen > 0)
1673     {
1674     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1675     ((code[1 + c/8] & (1 << (c&7))) != 0);
1676     }
1677     }
1678    
1679     /* An extended class may have a table or a list of single characters,
1680     ranges, or both, and it may be positive or negative. There's a
1681     function that sorts all this out. */
1682    
1683     else
1684     {
1685     ecode = code + GET(code, 1);
1686     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1687     }
1688    
1689     /* At this point, isinclass is set for all kinds of class, and ecode
1690     points to the byte after the end of the class. If there is a
1691     quantifier, this is where it will be. */
1692    
1693     next_state_offset = ecode - start_code;
1694    
1695     switch (*ecode)
1696     {
1697     case OP_CRSTAR:
1698     case OP_CRMINSTAR:
1699     ADD_ACTIVE(next_state_offset + 1, 0);
1700     if (isinclass) { ADD_NEW(state_offset, 0); }
1701     break;
1702    
1703     case OP_CRPLUS:
1704     case OP_CRMINPLUS:
1705     count = current_state->count; /* Already matched */
1706     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1707     if (isinclass) { count++; ADD_NEW(state_offset, count); }
1708     break;
1709    
1710     case OP_CRQUERY:
1711     case OP_CRMINQUERY:
1712     ADD_ACTIVE(next_state_offset + 1, 0);
1713     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1714     break;
1715    
1716     case OP_CRRANGE:
1717     case OP_CRMINRANGE:
1718     count = current_state->count; /* Already matched */
1719     if (count >= GET2(ecode, 1))
1720     { ADD_ACTIVE(next_state_offset + 5, 0); }
1721     if (isinclass)
1722     {
1723 nigel 91 int max = GET2(ecode, 3);
1724     if (++count >= max && max != 0) /* Max 0 => no limit */
1725 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
1726     else
1727     { ADD_NEW(state_offset, count); }
1728     }
1729     break;
1730    
1731     default:
1732     if (isinclass) { ADD_NEW(next_state_offset, 0); }
1733     break;
1734     }
1735     }
1736     break;
1737    
1738     /* ========================================================================== */
1739     /* These are the opcodes for fancy brackets of various kinds. We have
1740     to use recursion in order to handle them. */
1741    
1742     case OP_ASSERT:
1743     case OP_ASSERT_NOT:
1744     case OP_ASSERTBACK:
1745     case OP_ASSERTBACK_NOT:
1746     {
1747     int rc;
1748     int local_offsets[2];
1749     int local_workspace[1000];
1750     const uschar *endasscode = code + GET(code, 1);
1751    
1752     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1753    
1754     rc = internal_dfa_exec(
1755     md, /* static match data */
1756     code, /* this subexpression's code */
1757     ptr, /* where we currently are */
1758     ptr - start_subject, /* start offset */
1759     local_offsets, /* offset vector */
1760     sizeof(local_offsets)/sizeof(int), /* size of same */
1761     local_workspace, /* workspace vector */
1762     sizeof(local_workspace)/sizeof(int), /* size of same */
1763     ims, /* the current ims flags */
1764     rlevel, /* function recursion level */
1765     recursing); /* pass on regex recursion */
1766    
1767     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1768     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1769     }
1770     break;
1771    
1772     /*-----------------------------------------------------------------*/
1773     case OP_COND:
1774 nigel 93 case OP_SCOND:
1775 nigel 77 {
1776     int local_offsets[1000];
1777     int local_workspace[1000];
1778     int condcode = code[LINK_SIZE+1];
1779    
1780 nigel 93 /* Back reference conditions are not supported */
1781 nigel 77
1782 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1783    
1784     /* The DEFINE condition is always false */
1785    
1786     if (condcode == OP_DEF)
1787 nigel 77 {
1788 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1789     }
1790    
1791     /* The only supported version of OP_RREF is for the value RREF_ANY,
1792     which means "test if in any recursion". We can't test for specifically
1793     recursed groups. */
1794    
1795     else if (condcode == OP_RREF)
1796     {
1797 nigel 77 int value = GET2(code, LINK_SIZE+2);
1798 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1799 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1800     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1801     }
1802    
1803     /* Otherwise, the condition is an assertion */
1804    
1805     else
1806     {
1807     int rc;
1808     const uschar *asscode = code + LINK_SIZE + 1;
1809     const uschar *endasscode = asscode + GET(asscode, 1);
1810    
1811     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1812    
1813     rc = internal_dfa_exec(
1814     md, /* fixed match data */
1815     asscode, /* this subexpression's code */
1816     ptr, /* where we currently are */
1817     ptr - start_subject, /* start offset */
1818     local_offsets, /* offset vector */
1819     sizeof(local_offsets)/sizeof(int), /* size of same */
1820     local_workspace, /* workspace vector */
1821     sizeof(local_workspace)/sizeof(int), /* size of same */
1822     ims, /* the current ims flags */
1823     rlevel, /* function recursion level */
1824     recursing); /* pass on regex recursion */
1825    
1826     if ((rc >= 0) ==
1827     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1828     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1829     else
1830     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1831     }
1832     }
1833     break;
1834    
1835     /*-----------------------------------------------------------------*/
1836     case OP_RECURSE:
1837     {
1838     int local_offsets[1000];
1839     int local_workspace[1000];
1840     int rc;
1841    
1842     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1843     recursing + 1));
1844    
1845     rc = internal_dfa_exec(
1846     md, /* fixed match data */
1847     start_code + GET(code, 1), /* this subexpression's code */
1848     ptr, /* where we currently are */
1849     ptr - start_subject, /* start offset */
1850     local_offsets, /* offset vector */
1851     sizeof(local_offsets)/sizeof(int), /* size of same */
1852     local_workspace, /* workspace vector */
1853     sizeof(local_workspace)/sizeof(int), /* size of same */
1854     ims, /* the current ims flags */
1855     rlevel, /* function recursion level */
1856     recursing + 1); /* regex recurse level */
1857    
1858     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1859     recursing + 1, rc));
1860    
1861     /* Ran out of internal offsets */
1862    
1863     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1864    
1865     /* For each successfully matched substring, set up the next state with a
1866     count of characters to skip before trying it. Note that the count is in
1867     characters, not bytes. */
1868    
1869     if (rc > 0)
1870     {
1871     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1872     {
1873     const uschar *p = start_subject + local_offsets[rc];
1874     const uschar *pp = start_subject + local_offsets[rc+1];
1875     int charcount = local_offsets[rc+1] - local_offsets[rc];
1876     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1877     if (charcount > 0)
1878     {
1879     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1880     }
1881     else
1882     {
1883     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1884     }
1885     }
1886     }
1887     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1888     }
1889     break;
1890    
1891     /*-----------------------------------------------------------------*/
1892     case OP_ONCE:
1893     {
1894     int local_offsets[2];
1895     int local_workspace[1000];
1896    
1897     int rc = internal_dfa_exec(
1898     md, /* fixed match data */
1899     code, /* this subexpression's code */
1900     ptr, /* where we currently are */
1901     ptr - start_subject, /* start offset */
1902     local_offsets, /* offset vector */
1903     sizeof(local_offsets)/sizeof(int), /* size of same */
1904     local_workspace, /* workspace vector */
1905     sizeof(local_workspace)/sizeof(int), /* size of same */
1906     ims, /* the current ims flags */
1907     rlevel, /* function recursion level */
1908     recursing); /* pass on regex recursion */
1909    
1910     if (rc >= 0)
1911     {
1912     const uschar *end_subpattern = code;
1913     int charcount = local_offsets[1] - local_offsets[0];
1914     int next_state_offset, repeat_state_offset;
1915    
1916     do { end_subpattern += GET(end_subpattern, 1); }
1917     while (*end_subpattern == OP_ALT);
1918     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1919    
1920     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1921     arrange for the repeat state also to be added to the relevant list.
1922     Calculate the offset, or set -1 for no repeat. */
1923    
1924     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1925     *end_subpattern == OP_KETRMIN)?
1926     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1927    
1928     /* If we have matched an empty string, add the next state at the
1929     current character pointer. This is important so that the duplicate
1930     checking kicks in, which is what breaks infinite loops that match an
1931     empty string. */
1932    
1933     if (charcount == 0)
1934     {
1935     ADD_ACTIVE(next_state_offset, 0);
1936     }
1937    
1938     /* Optimization: if there are no more active states, and there
1939     are no new states yet set up, then skip over the subject string
1940     right here, to save looping. Otherwise, set up the new state to swing
1941     into action when the end of the substring is reached. */
1942    
1943     else if (i + 1 >= active_count && new_count == 0)
1944     {
1945     ptr += charcount;
1946     clen = 0;
1947     ADD_NEW(next_state_offset, 0);
1948    
1949     /* If we are adding a repeat state at the new character position,
1950     we must fudge things so that it is the only current state.
1951     Otherwise, it might be a duplicate of one we processed before, and
1952     that would cause it to be skipped. */
1953    
1954     if (repeat_state_offset >= 0)
1955     {
1956     next_active_state = active_states;
1957     active_count = 0;
1958     i = -1;
1959     ADD_ACTIVE(repeat_state_offset, 0);
1960     }
1961     }
1962     else
1963     {
1964     const uschar *p = start_subject + local_offsets[0];
1965     const uschar *pp = start_subject + local_offsets[1];
1966     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1967     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1968     if (repeat_state_offset >= 0)
1969     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1970     }
1971    
1972     }
1973     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1974     }
1975     break;
1976    
1977    
1978     /* ========================================================================== */
1979     /* Handle callouts */
1980    
1981     case OP_CALLOUT:
1982     if (pcre_callout != NULL)
1983     {
1984     int rrc;
1985     pcre_callout_block cb;
1986     cb.version = 1; /* Version 1 of the callout block */
1987     cb.callout_number = code[1];
1988     cb.offset_vector = offsets;
1989 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
1990 nigel 77 cb.subject_length = end_subject - start_subject;
1991     cb.start_match = current_subject - start_subject;
1992     cb.current_position = ptr - start_subject;
1993     cb.pattern_position = GET(code, 2);
1994     cb.next_item_length = GET(code, 2 + LINK_SIZE);
1995     cb.capture_top = 1;
1996     cb.capture_last = -1;
1997     cb.callout_data = md->callout_data;
1998     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1999     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2000     }
2001     break;
2002    
2003    
2004     /* ========================================================================== */
2005     default: /* Unsupported opcode */
2006     return PCRE_ERROR_DFA_UITEM;
2007     }
2008    
2009     NEXT_ACTIVE_STATE: continue;
2010    
2011     } /* End of loop scanning active states */
2012    
2013     /* We have finished the processing at the current subject character. If no
2014     new states have been set for the next character, we have found all the
2015     matches that we are going to find. If we are at the top level and partial
2016     matching has been requested, check for appropriate conditions. */
2017    
2018     if (new_count <= 0)
2019     {
2020     if (match_count < 0 && /* No matches found */
2021     rlevel == 1 && /* Top level match function */
2022     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2023     ptr >= end_subject && /* Reached end of subject */
2024     ptr > current_subject) /* Matched non-empty string */
2025     {
2026     if (offsetcount >= 2)
2027     {
2028     offsets[0] = current_subject - start_subject;
2029     offsets[1] = end_subject - start_subject;
2030     }
2031     match_count = PCRE_ERROR_PARTIAL;
2032     }
2033    
2034     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2035     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2036     rlevel*2-2, SP));
2037 nigel 91 break; /* In effect, "return", but see the comment below */
2038 nigel 77 }
2039    
2040     /* One or more states are active for the next character. */
2041    
2042     ptr += clen; /* Advance to next subject character */
2043     } /* Loop to move along the subject string */
2044    
2045 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2046     if we use "return" above, we have compiler trouble. Some compilers warn if
2047     there's nothing here because they think the function doesn't return a value. On
2048     the other hand, if we put a dummy statement here, some more clever compilers
2049     complain that it can't be reached. Sigh. */
2050 nigel 77
2051 nigel 91 return match_count;
2052 nigel 77 }
2053    
2054    
2055    
2056    
2057     /*************************************************
2058     * Execute a Regular Expression - DFA engine *
2059     *************************************************/
2060    
2061     /* This external function applies a compiled re to a subject string using a DFA
2062     engine. This function calls the internal function multiple times if the pattern
2063     is not anchored.
2064    
2065     Arguments:
2066     argument_re points to the compiled expression
2067 ph10 97 extra_data points to extra data or is NULL
2068 nigel 77 subject points to the subject string
2069     length length of subject string (may contain binary zeros)
2070     start_offset where to start in the subject string
2071     options option bits
2072     offsets vector of match offsets
2073     offsetcount size of same
2074     workspace workspace vector
2075     wscount size of same
2076    
2077     Returns: > 0 => number of match offset pairs placed in offsets
2078     = 0 => offsets overflowed; longest matches are present
2079     -1 => failed to match
2080     < -1 => some kind of unexpected problem
2081     */
2082    
2083 ph10 145 PCRE_EXP_DEFN int
2084 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2085     const char *subject, int length, int start_offset, int options, int *offsets,
2086     int offsetcount, int *workspace, int wscount)
2087     {
2088     real_pcre *re = (real_pcre *)argument_re;
2089     dfa_match_data match_block;
2090 nigel 91 dfa_match_data *md = &match_block;
2091 nigel 77 BOOL utf8, anchored, startline, firstline;
2092     const uschar *current_subject, *end_subject, *lcc;
2093    
2094     pcre_study_data internal_study;
2095     const pcre_study_data *study = NULL;
2096     real_pcre internal_re;
2097    
2098     const uschar *req_byte_ptr;
2099     const uschar *start_bits = NULL;
2100     BOOL first_byte_caseless = FALSE;
2101     BOOL req_byte_caseless = FALSE;
2102     int first_byte = -1;
2103     int req_byte = -1;
2104     int req_byte2 = -1;
2105 nigel 91 int newline;
2106 nigel 77
2107     /* Plausibility checks */
2108    
2109     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2110     if (re == NULL || subject == NULL || workspace == NULL ||
2111     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2112     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2113     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2114    
2115     /* We need to find the pointer to any study data before we test for byte
2116     flipping, so we scan the extra_data block first. This may set two fields in the
2117     match block, so we must initialize them beforehand. However, the other fields
2118     in the match block must not be set until after the byte flipping. */
2119    
2120 nigel 91 md->tables = re->tables;
2121     md->callout_data = NULL;
2122 nigel 77
2123     if (extra_data != NULL)
2124     {
2125     unsigned int flags = extra_data->flags;
2126     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2127     study = (const pcre_study_data *)extra_data->study_data;
2128     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2129 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2130     return PCRE_ERROR_DFA_UMLIMIT;
2131 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2132 nigel 91 md->callout_data = extra_data->callout_data;
2133 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2134 nigel 91 md->tables = extra_data->tables;
2135 nigel 77 }
2136    
2137     /* Check that the first field in the block is the magic number. If it is not,
2138     test for a regex that was compiled on a host of opposite endianness. If this is
2139     the case, flipped values are put in internal_re and internal_study if there was
2140     study data too. */
2141    
2142     if (re->magic_number != MAGIC_NUMBER)
2143     {
2144     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2145     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2146     if (study != NULL) study = &internal_study;
2147     }
2148    
2149     /* Set some local values */
2150    
2151     current_subject = (const unsigned char *)subject + start_offset;
2152     end_subject = (const unsigned char *)subject + length;
2153     req_byte_ptr = current_subject - 1;
2154    
2155 nigel 91 #ifdef SUPPORT_UTF8
2156 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2157 nigel 91 #else
2158     utf8 = FALSE;
2159     #endif
2160 nigel 77
2161 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2162     (re->options & PCRE_ANCHORED) != 0;
2163    
2164 nigel 77 /* The remaining fixed data for passing around. */
2165    
2166 nigel 91 md->start_code = (const uschar *)argument_re +
2167 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2168 nigel 91 md->start_subject = (const unsigned char *)subject;
2169     md->end_subject = end_subject;
2170     md->moptions = options;
2171     md->poptions = re->options;
2172 nigel 77
2173 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2174     nothing is set at run time, whatever was used at compile time applies. */
2175 nigel 91
2176 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2177 nigel 93 PCRE_NEWLINE_BITS)
2178 nigel 91 {
2179 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2180 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2181     case PCRE_NEWLINE_LF: newline = '\n'; break;
2182     case PCRE_NEWLINE_CR+
2183     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2184 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2185 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2186 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2187 nigel 91 }
2188    
2189 ph10 149 if (newline == -2)
2190 nigel 91 {
2191 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2192     }
2193     else if (newline < 0)
2194     {
2195 nigel 93 md->nltype = NLTYPE_ANY;
2196 nigel 91 }
2197     else
2198     {
2199 nigel 93 md->nltype = NLTYPE_FIXED;
2200     if (newline > 255)
2201     {
2202     md->nllen = 2;
2203     md->nl[0] = (newline >> 8) & 255;
2204     md->nl[1] = newline & 255;
2205     }
2206     else
2207     {
2208     md->nllen = 1;
2209     md->nl[0] = newline;
2210     }
2211 nigel 91 }
2212    
2213 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2214     back the character offset. */
2215    
2216     #ifdef SUPPORT_UTF8
2217     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2218     {
2219     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2220     return PCRE_ERROR_BADUTF8;
2221     if (start_offset > 0 && start_offset < length)
2222     {
2223     int tb = ((uschar *)subject)[start_offset];
2224     if (tb > 127)
2225     {
2226     tb &= 0xc0;
2227     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2228     }
2229     }
2230     }
2231     #endif
2232    
2233     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2234     is a feature that makes it possible to save compiled regex and re-use them
2235     in other programs later. */
2236    
2237 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2238 nigel 77
2239     /* The lower casing table and the "must be at the start of a line" flag are
2240     used in a loop when finding where to start. */
2241    
2242 nigel 91 lcc = md->tables + lcc_offset;
2243 nigel 77 startline = (re->options & PCRE_STARTLINE) != 0;
2244     firstline = (re->options & PCRE_FIRSTLINE) != 0;
2245    
2246     /* Set up the first character to match, if available. The first_byte value is
2247     never set for an anchored regular expression, but the anchoring may be forced
2248     at run time, so we have to test for anchoring. The first char may be unset for
2249     an unanchored pattern, of course. If there's no first char and the pattern was
2250     studied, there may be a bitmap of possible first characters. */
2251    
2252     if (!anchored)
2253     {
2254     if ((re->options & PCRE_FIRSTSET) != 0)
2255     {
2256     first_byte = re->first_byte & 255;
2257     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2258     first_byte = lcc[first_byte];
2259     }
2260     else
2261     {
2262     if (startline && study != NULL &&
2263     (study->options & PCRE_STUDY_MAPPED) != 0)
2264     start_bits = study->start_bits;
2265     }
2266     }
2267    
2268     /* For anchored or unanchored matches, there may be a "last known required
2269     character" set. */
2270    
2271     if ((re->options & PCRE_REQCHSET) != 0)
2272     {
2273     req_byte = re->req_byte & 255;
2274     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2275 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2276 nigel 77 }
2277    
2278     /* Call the main matching function, looping for a non-anchored regex after a
2279     failed match. Unless restarting, optimize by moving to the first match
2280     character if possible, when not anchored. Then unless wanting a partial match,
2281     check for a required later character. */
2282    
2283     for (;;)
2284     {
2285     int rc;
2286    
2287     if ((options & PCRE_DFA_RESTART) == 0)
2288     {
2289     const uschar *save_end_subject = end_subject;
2290    
2291     /* Advance to a unique first char if possible. If firstline is TRUE, the
2292     start of the match is constrained to the first line of a multiline string.
2293 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2294     scanning at a newline. If the match fails at the newline, later code breaks
2295     this loop. */
2296 nigel 77
2297     if (firstline)
2298     {
2299     const uschar *t = current_subject;
2300 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2301 nigel 77 end_subject = t;
2302     }
2303    
2304     if (first_byte >= 0)
2305     {
2306     if (first_byte_caseless)
2307     while (current_subject < end_subject &&
2308     lcc[*current_subject] != first_byte)
2309     current_subject++;
2310     else
2311     while (current_subject < end_subject && *current_subject != first_byte)
2312     current_subject++;
2313     }
2314    
2315 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2316 nigel 77
2317     else if (startline)
2318     {
2319 nigel 93 if (current_subject > md->start_subject + start_offset)
2320 nigel 77 {
2321 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2322 nigel 77 current_subject++;
2323 ph10 130
2324 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2325     ANYCRLF, and we are now at a LF, advance the match position by one more
2326     character. */
2327 ph10 134
2328 ph10 130 if (current_subject[-1] == '\r' &&
2329 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2330 ph10 130 current_subject < end_subject &&
2331     *current_subject == '\n')
2332     current_subject++;
2333 nigel 77 }
2334     }
2335    
2336     /* Or to a non-unique first char after study */
2337    
2338     else if (start_bits != NULL)
2339     {
2340     while (current_subject < end_subject)
2341     {
2342     register unsigned int c = *current_subject;
2343     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2344     else break;
2345     }
2346     }
2347    
2348     /* Restore fudged end_subject */
2349    
2350     end_subject = save_end_subject;
2351     }
2352    
2353     /* If req_byte is set, we know that that character must appear in the subject
2354     for the match to succeed. If the first character is set, req_byte must be
2355     later in the subject; otherwise the test starts at the match point. This
2356     optimization can save a huge amount of work in patterns with nested unlimited
2357     repeats that aren't going to match. Writing separate code for cased/caseless
2358     versions makes it go faster, as does using an autoincrement and backing off
2359     on a match.
2360    
2361     HOWEVER: when the subject string is very, very long, searching to its end can
2362     take a long time, and give bad performance on quite ordinary patterns. This
2363     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2364     don't do this when the string is sufficiently long.
2365    
2366     ALSO: this processing is disabled when partial matching is requested.
2367     */
2368    
2369     if (req_byte >= 0 &&
2370     end_subject - current_subject < REQ_BYTE_MAX &&
2371     (options & PCRE_PARTIAL) == 0)
2372     {
2373     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2374    
2375     /* We don't need to repeat the search if we haven't yet reached the
2376     place we found it at last time. */
2377    
2378     if (p > req_byte_ptr)
2379     {
2380     if (req_byte_caseless)
2381     {
2382     while (p < end_subject)
2383     {
2384     register int pp = *p++;
2385     if (pp == req_byte || pp == req_byte2) { p--; break; }
2386     }
2387     }
2388     else
2389     {
2390     while (p < end_subject)
2391     {
2392     if (*p++ == req_byte) { p--; break; }
2393     }
2394     }
2395    
2396     /* If we can't find the required character, break the matching loop,
2397     which will cause a return or PCRE_ERROR_NOMATCH. */
2398    
2399     if (p >= end_subject) break;
2400    
2401     /* If we have found the required character, save the point where we
2402     found it, so that we don't search again next time round the loop if
2403     the start hasn't passed this character yet. */
2404    
2405     req_byte_ptr = p;
2406     }
2407     }
2408    
2409     /* OK, now we can do the business */
2410    
2411     rc = internal_dfa_exec(
2412 nigel 91 md, /* fixed match data */
2413     md->start_code, /* this subexpression's code */
2414     current_subject, /* where we currently are */
2415     start_offset, /* start offset in subject */
2416     offsets, /* offset vector */
2417     offsetcount, /* size of same */
2418     workspace, /* workspace vector */
2419     wscount, /* size of same */
2420 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2421 nigel 91 0, /* function recurse level */
2422     0); /* regex recurse level */
2423 nigel 77
2424     /* Anything other than "no match" means we are done, always; otherwise, carry
2425     on only if not anchored. */
2426    
2427     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2428    
2429     /* Advance to the next subject character unless we are at the end of a line
2430     and firstline is set. */
2431    
2432 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2433 nigel 77 current_subject++;
2434     if (utf8)
2435     {
2436     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2437     current_subject++;
2438     }
2439     if (current_subject > end_subject) break;
2440    
2441 ph10 150 /* If we have just passed a CR and the newline option is CRLF or ANY or
2442 ph10 149 ANYCRLF, and we are now at a LF, advance the match position by one more
2443     character. */
2444 nigel 93
2445     if (current_subject[-1] == '\r' &&
2446 ph10 150 (md->nltype == NLTYPE_ANY ||
2447     md->nltype == NLTYPE_ANYCRLF ||
2448 ph10 149 md->nllen == 2) &&
2449 nigel 93 current_subject < end_subject &&
2450     *current_subject == '\n')
2451     current_subject++;
2452    
2453     } /* "Bumpalong" loop */
2454    
2455 nigel 77 return PCRE_ERROR_NOMATCH;
2456     }
2457    
2458     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12