/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 172 - (hide annotations) (download)
Tue Jun 5 10:40:13 2007 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 81716 byte(s)
Drastically reduce workspace used for alternatives in groups; also some 
trailing space removals for a test release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl- compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 nigel 93 #define NLBLOCK md /* Block containing newline information */
48     #define PSSTART start_subject /* Field containing processed string start */
49     #define PSEND end_subject /* Field containing processed string end */
50    
51 nigel 77 #include "pcre_internal.h"
52    
53    
54     /* For use to indent debugging output */
55    
56     #define SP " "
57    
58    
59    
60     /*************************************************
61     * Code parameters and static tables *
62     *************************************************/
63    
64     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
65 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
66 nigel 77 enough. */
67    
68 nigel 93 #define OP_PROP_EXTRA 100
69     #define OP_EXTUNI_EXTRA 120
70     #define OP_ANYNL_EXTRA 140
71 nigel 77
72    
73     /* This table identifies those opcodes that are followed immediately by a
74     character that is to be tested in some way. This makes is possible to
75     centralize the loading of these characters. In the case of Type * etc, the
76     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
77 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
78 ph10 168 that follow must also be modified. */
79 nigel 77
80     static uschar coptable[] = {
81     0, /* End */
82 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
83     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
84 nigel 77 0, 0, /* Any, Anybyte */
85 nigel 93 0, 0, 0, 0, /* NOTPROP, PROP, EXTUNI, ANYNL */
86 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
87     1, /* Char */
88     1, /* Charnc */
89     1, /* not */
90     /* Positive single-char repeats */
91     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
92     3, 3, 3, /* upto, minupto, exact */
93 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
94 nigel 77 /* Negative single-char repeats - only for chars < 256 */
95     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
96     3, 3, 3, /* NOT upto, minupto, exact */
97 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
98 nigel 77 /* Positive type repeats */
99     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* Type upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
102 nigel 77 /* Character class & ref repeats */
103     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
104     0, 0, /* CRRANGE, CRMINRANGE */
105     0, /* CLASS */
106     0, /* NCLASS */
107     0, /* XCLASS - variable length */
108     0, /* REF */
109     0, /* RECURSE */
110     0, /* CALLOUT */
111     0, /* Alt */
112     0, /* Ket */
113     0, /* KetRmax */
114     0, /* KetRmin */
115     0, /* Assert */
116     0, /* Assert not */
117     0, /* Assert behind */
118     0, /* Assert behind not */
119     0, /* Reverse */
120 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
121     0, 0, 0, /* SBRA, SCBRA, SCOND */
122 nigel 77 0, /* CREF */
123 nigel 93 0, /* RREF */
124     0, /* DEF */
125     0, 0 /* BRAZERO, BRAMINZERO */
126 nigel 77 };
127    
128     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
129     and \w */
130    
131     static uschar toptable1[] = {
132 ph10 168 0, 0, 0, 0, 0, 0,
133 nigel 77 ctype_digit, ctype_digit,
134     ctype_space, ctype_space,
135     ctype_word, ctype_word,
136     0 /* OP_ANY */
137     };
138    
139     static uschar toptable2[] = {
140 ph10 168 0, 0, 0, 0, 0, 0,
141 nigel 77 ctype_digit, 0,
142     ctype_space, 0,
143     ctype_word, 0,
144     1 /* OP_ANY */
145     };
146    
147    
/* One entry in a state vector: the data for a single active path through the
match tree. The caller's workspace is a vector of ints and these blocks are
laid directly over it, so every member must be an int. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means
                                     "hold off entering the negated state" */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits in force for this state */
  int data;                       /* Extra data for states that need it, such
                                     as the number of characters to skip */
} stateblock;

/* Number of workspace ints taken up by one state block. */

#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
161    
162    
163     #ifdef DEBUG
164     /*************************************************
165     * Print character string *
166     *************************************************/
167    
/* Debugging aid: write a byte string to a stream in readable form. A byte
for which isprint() is true is written unchanged; any other byte is written
as a backslash, the letter x, and two lower-case hexadecimal digits.

Arguments:
  p           points to the first byte of the string
  length      number of bytes to write; nothing is written if it is <= 0
  f           the stream that receives the output

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
int i;
for (i = 0; i < length; i++)
  {
  int ch = p[i];
  if (isprint(ch))
    fputc(ch, f);
  else
    fprintf(f, "\\x%02x", ch);
  }
}
190     #endif
191    
192    
193    
194     /*************************************************
195     * Execute a Regular Expression - DFA engine *
196     *************************************************/
197    
198     /* This internal function applies a compiled pattern to a subject string,
199     starting at a given point, using a DFA engine. This function is called from the
200     external one, possibly multiple times if the pattern is not anchored. The
201     function calls itself recursively for some kinds of subpattern.
202    
203     Arguments:
204     md the match_data block with fixed information
205     this_start_code the opening bracket of this subexpression's code
206     current_subject where we currently are in the subject string
207     start_offset start offset in the subject string
208     offsets vector to contain the matching string offsets
209     offsetcount size of same
210     workspace vector of workspace
211     wscount size of same
212     ims the current ims flags
213     rlevel function call recursion level
214     recursing regex recursive call level
215    
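216     Returns: > 0 => number of matches whose offset pairs were stored in offsets
217     = 0 => matched, but offsets was too small to hold every offset pair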
218     -1 => failed to match
219     < -1 => some kind of unexpected problem
220    
221     The following macros are used for adding states to the two state vectors (one
222     for the current character, one for the following character). */
223    
/* Append a state to the list for the current character. x is the opcode
offset and y the repeat count; the ims value is taken from the local variable
of that name. If the vector already holds wscount states, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. Note that active_count is incremented
even when that happens. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
234    
/* Append a state to the list for the current character, also setting its
data field. x is the opcode offset, y the repeat count, and z the data value;
the ims value is taken from the local variable of that name. If the vector
already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
246    
/* Append a state to the list for the following character. x is the opcode
offset and y the repeat count; the ims value is taken from the local variable
of that name. If the vector already holds wscount states, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. Note that new_count is incremented
even when that happens. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
257    
/* Append a state to the list for the following character, also setting its
data field. x is the opcode offset, y the repeat count, and z the data value;
the ims value is taken from the local variable of that name. If the vector
already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
269    
270     /* And now, here is the code */
271    
272     static int
273     internal_dfa_exec(
274     dfa_match_data *md,
275     const uschar *this_start_code,
276     const uschar *current_subject,
277     int start_offset,
278     int *offsets,
279     int offsetcount,
280     int *workspace,
281     int wscount,
282     int ims,
283     int rlevel,
284     int recursing)
285     {
286     stateblock *active_states, *new_states, *temp_states;
287     stateblock *next_active_state, *next_new_state;
288    
289     const uschar *ctypes, *lcc, *fcc;
290     const uschar *ptr;
291 nigel 93 const uschar *end_code, *first_op;
292 nigel 77
293     int active_count, new_count, match_count;
294    
295     /* Some fields in the md block are frequently referenced, so we load them into
296     independent variables in the hope that this will perform better. */
297    
298     const uschar *start_subject = md->start_subject;
299     const uschar *end_subject = md->end_subject;
300     const uschar *start_code = md->start_code;
301    
302 nigel 87 #ifdef SUPPORT_UTF8
303 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
304 nigel 93 #else
305     BOOL utf8 = FALSE;
306 nigel 87 #endif
307 nigel 77
308     rlevel++;
309     offsetcount &= (-2);
310    
311     wscount -= 2;
312     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
313     (2 * INTS_PER_STATEBLOCK);
314    
315     DPRINTF(("\n%.*s---------------------\n"
316     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
317     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
318    
319     ctypes = md->tables + ctypes_offset;
320     lcc = md->tables + lcc_offset;
321     fcc = md->tables + fcc_offset;
322    
323     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
324    
325     active_states = (stateblock *)(workspace + 2);
326     next_new_state = new_states = active_states + wscount;
327     new_count = 0;
328    
329 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
330     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
331    
332 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
333     the alternative states onto the list, and find out where the end is. This
334     makes is possible to use this function recursively, when we want to stop at a
335     matching internal ket rather than at the end.
336    
337     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
338     a backward assertion. In that case, we have to find out the maximum amount to
339     move back, and set up each alternative appropriately. */
340    
341 nigel 93 if (*first_op == OP_REVERSE)
342 nigel 77 {
343     int max_back = 0;
344     int gone_back;
345    
346     end_code = this_start_code;
347     do
348     {
349     int back = GET(end_code, 2+LINK_SIZE);
350     if (back > max_back) max_back = back;
351     end_code += GET(end_code, 1);
352     }
353     while (*end_code == OP_ALT);
354    
355     /* If we can't go back the amount required for the longest lookbehind
356     pattern, go back as far as we can; some alternatives may still be viable. */
357    
358     #ifdef SUPPORT_UTF8
359     /* In character mode we have to step back character by character */
360    
361     if (utf8)
362     {
363     for (gone_back = 0; gone_back < max_back; gone_back++)
364     {
365     if (current_subject <= start_subject) break;
366     current_subject--;
367     while (current_subject > start_subject &&
368     (*current_subject & 0xc0) == 0x80)
369     current_subject--;
370     }
371     }
372     else
373     #endif
374    
375     /* In byte-mode we can do this quickly. */
376    
377     {
378     gone_back = (current_subject - max_back < start_subject)?
379     current_subject - start_subject : max_back;
380     current_subject -= gone_back;
381     }
382    
383     /* Now we can process the individual branches. */
384    
385     end_code = this_start_code;
386     do
387     {
388     int back = GET(end_code, 2+LINK_SIZE);
389     if (back <= gone_back)
390     {
391     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
392     ADD_NEW_DATA(-bstate, 0, gone_back - back);
393     }
394     end_code += GET(end_code, 1);
395     }
396     while (*end_code == OP_ALT);
397     }
398    
399     /* This is the code for a "normal" subpattern (not a backward assertion). The
400     start of a whole pattern is always one of these. If we are at the top level,
401     we may be asked to restart matching from the same point that we reached for a
402     previous partial match. We still have to scan through the top-level branches to
403     find the end state. */
404    
405     else
406     {
407     end_code = this_start_code;
408    
409     /* Restarting */
410    
411     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
412     {
413     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
414     new_count = workspace[1];
415     if (!workspace[0])
416     memcpy(new_states, active_states, new_count * sizeof(stateblock));
417     }
418    
419     /* Not restarting */
420    
421     else
422     {
423 nigel 93 int length = 1 + LINK_SIZE +
424     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
425 nigel 77 do
426     {
427 nigel 93 ADD_NEW(end_code - start_code + length, 0);
428 nigel 77 end_code += GET(end_code, 1);
429 nigel 93 length = 1 + LINK_SIZE;
430 nigel 77 }
431     while (*end_code == OP_ALT);
432     }
433     }
434    
435     workspace[0] = 0; /* Bit indicating which vector is current */
436    
437     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
438    
439     /* Loop for scanning the subject */
440    
441     ptr = current_subject;
442     for (;;)
443     {
444     int i, j;
445 nigel 91 int clen, dlen;
446     unsigned int c, d;
447 nigel 77
448     /* Make the new state list into the active state list and empty the
449     new state list. */
450    
451     temp_states = active_states;
452     active_states = new_states;
453     new_states = temp_states;
454     active_count = new_count;
455     new_count = 0;
456    
457     workspace[0] ^= 1; /* Remember for the restarting feature */
458     workspace[1] = active_count;
459    
460     #ifdef DEBUG
461     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
462     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
463     printf("\"\n");
464    
465     printf("%.*sActive states: ", rlevel*2-2, SP);
466     for (i = 0; i < active_count; i++)
467     printf("%d/%d ", active_states[i].offset, active_states[i].count);
468     printf("\n");
469     #endif
470    
471     /* Set the pointers for adding new states */
472    
473     next_active_state = active_states + active_count;
474     next_new_state = new_states;
475    
476     /* Load the current character from the subject outside the loop, as many
477     different states may want to look at it, and we assume that at least one
478     will. */
479    
480     if (ptr < end_subject)
481     {
482 nigel 93 clen = 1; /* Number of bytes in the character */
483 nigel 77 #ifdef SUPPORT_UTF8
484     if (utf8) { GETCHARLEN(c, ptr, clen); } else
485     #endif /* SUPPORT_UTF8 */
486     c = *ptr;
487     }
488     else
489     {
490 nigel 93 clen = 0; /* This indicates the end of the subject */
491     c = NOTACHAR; /* This value should never actually be used */
492 nigel 77 }
493    
494     /* Scan up the active states and act on each one. The result of an action
495     may be to add more states to the currently active list (e.g. on hitting a
496     parenthesis) or it may be to put states on the new list, for considering
497     when we move the character pointer on. */
498    
499     for (i = 0; i < active_count; i++)
500     {
501     stateblock *current_state = active_states + i;
502     const uschar *code;
503     int state_offset = current_state->offset;
504     int count, codevalue;
505 ph10 152 #ifdef SUPPORT_UCP
506 nigel 87 int chartype, script;
507 ph10 152 #endif
508 nigel 77
509     #ifdef DEBUG
510     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
511 nigel 93 if (clen == 0) printf("EOL\n");
512 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
513     else printf("0x%02x\n", c);
514     #endif
515    
516     /* This variable is referred to implicity in the ADD_xxx macros. */
517    
518     ims = current_state->ims;
519    
520     /* A negative offset is a special case meaning "hold off going to this
521     (negated) state until the number of characters in the data field have
522     been skipped". */
523    
524     if (state_offset < 0)
525     {
526     if (current_state->data > 0)
527     {
528     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
529     ADD_NEW_DATA(state_offset, current_state->count,
530     current_state->data - 1);
531     continue;
532     }
533     else
534     {
535     current_state->offset = state_offset = -state_offset;
536     }
537     }
538    
539     /* Check for a duplicate state with the same count, and skip if found. */
540    
541     for (j = 0; j < i; j++)
542     {
543     if (active_states[j].offset == state_offset &&
544     active_states[j].count == current_state->count)
545     {
546     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
547     goto NEXT_ACTIVE_STATE;
548     }
549     }
550    
551     /* The state offset is the offset to the opcode */
552    
553     code = start_code + state_offset;
554     codevalue = *code;
555    
556     /* If this opcode is followed by an inline character, load it. It is
557     tempting to test for the presence of a subject character here, but that
558     is wrong, because sometimes zero repetitions of the subject are
559     permitted.
560    
561     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
562     argument that is not a data character - but is always one byte long.
563     Unfortunately, we have to take special action to deal with \P, \p, and
564     \X in this case. To keep the other cases fast, convert these ones to new
565     opcodes. */
566    
567     if (coptable[codevalue] > 0)
568     {
569     dlen = 1;
570     #ifdef SUPPORT_UTF8
571     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
572     #endif /* SUPPORT_UTF8 */
573     d = code[coptable[codevalue]];
574     if (codevalue >= OP_TYPESTAR)
575     {
576 nigel 93 switch(d)
577     {
578     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
579     case OP_NOTPROP:
580     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
581     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
582     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
583     default: break;
584     }
585 nigel 77 }
586     }
587     else
588     {
589     dlen = 0; /* Not strictly necessary, but compilers moan */
590 nigel 93 d = NOTACHAR; /* if these variables are not set. */
591 nigel 77 }
592    
593    
594     /* Now process the individual opcodes */
595    
596     switch (codevalue)
597     {
598    
599     /* ========================================================================== */
600     /* Reached a closing bracket. If not at the end of the pattern, carry
601     on with the next opcode. Otherwise, unless we have an empty string and
602     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
603     matches so we always have the longest first. */
604    
605     case OP_KET:
606     case OP_KETRMIN:
607     case OP_KETRMAX:
608     if (code != end_code)
609     {
610     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
611     if (codevalue != OP_KET)
612     {
613     ADD_ACTIVE(state_offset - GET(code, 1), 0);
614     }
615     }
616     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
617     {
618     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
619     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
620     match_count = 0;
621     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
622     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
623     if (offsetcount >= 2)
624     {
625     offsets[0] = current_subject - start_subject;
626     offsets[1] = ptr - start_subject;
627     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
628     offsets[1] - offsets[0], current_subject));
629     }
630     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
631     {
632     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
633     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
634     match_count, rlevel*2-2, SP));
635     return match_count;
636     }
637     }
638     break;
639    
640     /* ========================================================================== */
641     /* These opcodes add to the current list of states without looking
642     at the current character. */
643    
644     /*-----------------------------------------------------------------*/
645     case OP_ALT:
646     do { code += GET(code, 1); } while (*code == OP_ALT);
647     ADD_ACTIVE(code - start_code, 0);
648     break;
649    
650     /*-----------------------------------------------------------------*/
651     case OP_BRA:
652 nigel 93 case OP_SBRA:
653 nigel 77 do
654     {
655     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
656     code += GET(code, 1);
657     }
658     while (*code == OP_ALT);
659     break;
660    
661     /*-----------------------------------------------------------------*/
662 nigel 93 case OP_CBRA:
663     case OP_SCBRA:
664     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
665     code += GET(code, 1);
666     while (*code == OP_ALT)
667     {
668     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
669     code += GET(code, 1);
670     }
671     break;
672    
673     /*-----------------------------------------------------------------*/
674 nigel 77 case OP_BRAZERO:
675     case OP_BRAMINZERO:
676     ADD_ACTIVE(state_offset + 1, 0);
677     code += 1 + GET(code, 2);
678     while (*code == OP_ALT) code += GET(code, 1);
679     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
680     break;
681    
682     /*-----------------------------------------------------------------*/
683     case OP_CIRC:
684     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
685 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
686     ptr != end_subject &&
687 nigel 93 WAS_NEWLINE(ptr)))
688 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
689     break;
690    
691     /*-----------------------------------------------------------------*/
692     case OP_EOD:
693     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
694     break;
695    
696     /*-----------------------------------------------------------------*/
697     case OP_OPT:
698     ims = code[1];
699     ADD_ACTIVE(state_offset + 2, 0);
700     break;
701    
702     /*-----------------------------------------------------------------*/
703     case OP_SOD:
704     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
705     break;
706    
707     /*-----------------------------------------------------------------*/
708     case OP_SOM:
709     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
710     break;
711    
712    
713     /* ========================================================================== */
714     /* These opcodes inspect the next subject character, and sometimes
715     the previous one as well, but do not have an argument. The variable
716     clen contains the length of the current character and is zero if we are
717     at the end of the subject. */
718    
719     /*-----------------------------------------------------------------*/
720     case OP_ANY:
721 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
722 nigel 77 { ADD_NEW(state_offset + 1, 0); }
723     break;
724    
725     /*-----------------------------------------------------------------*/
726     case OP_EODN:
727 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
728 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
729     break;
730    
731     /*-----------------------------------------------------------------*/
732     case OP_DOLL:
733     if ((md->moptions & PCRE_NOTEOL) == 0)
734     {
735 nigel 91 if (clen == 0 ||
736 nigel 93 (IS_NEWLINE(ptr) &&
737 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
738     ))
739 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
740     }
741 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
742 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
743     break;
744    
745     /*-----------------------------------------------------------------*/
746    
747     case OP_DIGIT:
748     case OP_WHITESPACE:
749     case OP_WORDCHAR:
750     if (clen > 0 && c < 256 &&
751     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
752     { ADD_NEW(state_offset + 1, 0); }
753     break;
754    
755     /*-----------------------------------------------------------------*/
756     case OP_NOT_DIGIT:
757     case OP_NOT_WHITESPACE:
758     case OP_NOT_WORDCHAR:
759     if (clen > 0 && (c >= 256 ||
760     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
761     { ADD_NEW(state_offset + 1, 0); }
762     break;
763    
764     /*-----------------------------------------------------------------*/
765     case OP_WORD_BOUNDARY:
766     case OP_NOT_WORD_BOUNDARY:
767     {
768     int left_word, right_word;
769    
770     if (ptr > start_subject)
771     {
772     const uschar *temp = ptr - 1;
773     #ifdef SUPPORT_UTF8
774     if (utf8) BACKCHAR(temp);
775     #endif
776     GETCHARTEST(d, temp);
777     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
778     }
779     else left_word = 0;
780    
781     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
782     else right_word = 0;
783    
784     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
785     { ADD_ACTIVE(state_offset + 1, 0); }
786     }
787     break;
788    
789    
790     /*-----------------------------------------------------------------*/
791     /* Check the next character by Unicode property. We will get here only
792     if the support is in the binary; otherwise a compile-time error occurs.
793     */
794    
795 ph10 151 #ifdef SUPPORT_UCP
796 nigel 77 case OP_PROP:
797     case OP_NOTPROP:
798     if (clen > 0)
799     {
800 nigel 87 BOOL OK;
801     int category = _pcre_ucp_findprop(c, &chartype, &script);
802     switch(code[1])
803 nigel 77 {
804 nigel 87 case PT_ANY:
805     OK = TRUE;
806     break;
807    
808     case PT_LAMP:
809     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
810     break;
811    
812     case PT_GC:
813     OK = category == code[2];
814     break;
815    
816     case PT_PC:
817     OK = chartype == code[2];
818     break;
819    
820     case PT_SC:
821     OK = script == code[2];
822     break;
823    
824     /* Should never occur, but keep compilers from grumbling. */
825    
826     default:
827     OK = codevalue != OP_PROP;
828     break;
829 nigel 77 }
830 nigel 87
831     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
832 nigel 77 }
833     break;
834     #endif
835    
836    
837    
838     /* ========================================================================== */
839     /* These opcodes likewise inspect the subject character, but have an
840     argument that is not a data character. It is one of these opcodes:
841     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
842     OP_NOT_WORDCHAR. The value is loaded into d. */
843    
844     case OP_TYPEPLUS:
845     case OP_TYPEMINPLUS:
846 nigel 93 case OP_TYPEPOSPLUS:
847 nigel 77 count = current_state->count; /* Already matched */
848     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
849     if (clen > 0)
850     {
851     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
852     (c < 256 &&
853 nigel 91 (d != OP_ANY ||
854     (ims & PCRE_DOTALL) != 0 ||
855     !IS_NEWLINE(ptr)
856     ) &&
857 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
858     {
859 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
860     {
861     active_count--; /* Remove non-match possibility */
862     next_active_state--;
863     }
864 nigel 77 count++;
865     ADD_NEW(state_offset, count);
866     }
867     }
868     break;
869    
870     /*-----------------------------------------------------------------*/
871     case OP_TYPEQUERY:
872     case OP_TYPEMINQUERY:
873 nigel 93 case OP_TYPEPOSQUERY:
874 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
875     if (clen > 0)
876     {
877     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
878     (c < 256 &&
879 nigel 91 (d != OP_ANY ||
880     (ims & PCRE_DOTALL) != 0 ||
881     !IS_NEWLINE(ptr)
882     ) &&
883 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
884     {
885 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
886     {
887     active_count--; /* Remove non-match possibility */
888     next_active_state--;
889     }
890 nigel 77 ADD_NEW(state_offset + 2, 0);
891     }
892     }
893     break;
894    
895     /*-----------------------------------------------------------------*/
896     case OP_TYPESTAR:
897     case OP_TYPEMINSTAR:
898 nigel 93 case OP_TYPEPOSSTAR:
899 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
900     if (clen > 0)
901     {
902     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
903     (c < 256 &&
904 nigel 91 (d != OP_ANY ||
905     (ims & PCRE_DOTALL) != 0 ||
906     !IS_NEWLINE(ptr)
907     ) &&
908 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
909     {
910 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
911     {
912     active_count--; /* Remove non-match possibility */
913     next_active_state--;
914     }
915 nigel 77 ADD_NEW(state_offset, 0);
916     }
917     }
918     break;
919    
920     /*-----------------------------------------------------------------*/
921     case OP_TYPEEXACT:
922 nigel 93 count = current_state->count; /* Number already matched */
923     if (clen > 0)
924     {
925     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
926     (c < 256 &&
927     (d != OP_ANY ||
928     (ims & PCRE_DOTALL) != 0 ||
929     !IS_NEWLINE(ptr)
930     ) &&
931     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
932     {
933     if (++count >= GET2(code, 1))
934     { ADD_NEW(state_offset + 4, 0); }
935     else
936     { ADD_NEW(state_offset, count); }
937     }
938     }
939     break;
940    
941     /*-----------------------------------------------------------------*/
942 nigel 77 case OP_TYPEUPTO:
943     case OP_TYPEMINUPTO:
944 nigel 93 case OP_TYPEPOSUPTO:
945     ADD_ACTIVE(state_offset + 4, 0);
946 nigel 77 count = current_state->count; /* Number already matched */
947     if (clen > 0)
948     {
949     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
950     (c < 256 &&
951 nigel 91 (d != OP_ANY ||
952     (ims & PCRE_DOTALL) != 0 ||
953     !IS_NEWLINE(ptr)
954     ) &&
955 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
956     {
957 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
958     {
959     active_count--; /* Remove non-match possibility */
960     next_active_state--;
961     }
962 nigel 77 if (++count >= GET2(code, 1))
963     { ADD_NEW(state_offset + 4, 0); }
964     else
965     { ADD_NEW(state_offset, count); }
966     }
967     }
968     break;
969    
970     /* ========================================================================== */
971     /* These are virtual opcodes that are used when something like
972 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
973     argument. It keeps the code above fast for the other cases. The argument
974     is in the d variable. */
975 nigel 77
976 ph10 151 #ifdef SUPPORT_UCP
977 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
978     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
979 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
980 nigel 77 count = current_state->count; /* Already matched */
981 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
982 nigel 77 if (clen > 0)
983     {
984 nigel 87 BOOL OK;
985     int category = _pcre_ucp_findprop(c, &chartype, &script);
986     switch(code[2])
987     {
988     case PT_ANY:
989     OK = TRUE;
990     break;
991    
992     case PT_LAMP:
993     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
994     break;
995    
996     case PT_GC:
997     OK = category == code[3];
998     break;
999    
1000     case PT_PC:
1001     OK = chartype == code[3];
1002     break;
1003    
1004     case PT_SC:
1005     OK = script == code[3];
1006     break;
1007    
1008     /* Should never occur, but keep compilers from grumbling. */
1009    
1010     default:
1011     OK = codevalue != OP_PROP;
1012     break;
1013     }
1014    
1015 nigel 93 if (OK == (d == OP_PROP))
1016     {
1017     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1018     {
1019     active_count--; /* Remove non-match possibility */
1020     next_active_state--;
1021     }
1022     count++;
1023     ADD_NEW(state_offset, count);
1024     }
1025 nigel 77 }
1026     break;
1027    
1028     /*-----------------------------------------------------------------*/
1029     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1030     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1031 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1032 nigel 77 count = current_state->count; /* Already matched */
1033     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1034 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1035 nigel 77 {
1036     const uschar *nptr = ptr + clen;
1037     int ncount = 0;
1038 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1039     {
1040     active_count--; /* Remove non-match possibility */
1041     next_active_state--;
1042     }
1043 nigel 77 while (nptr < end_subject)
1044     {
1045     int nd;
1046     int ndlen = 1;
1047     GETCHARLEN(nd, nptr, ndlen);
1048 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1049 nigel 77 ncount++;
1050     nptr += ndlen;
1051     }
1052     count++;
1053     ADD_NEW_DATA(-state_offset, count, ncount);
1054     }
1055     break;
1056 ph10 151 #endif
1057 nigel 77
1058     /*-----------------------------------------------------------------*/
1059 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1060     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1061     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1062     count = current_state->count; /* Already matched */
1063     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1064     if (clen > 0)
1065     {
1066     int ncount = 0;
1067     switch (c)
1068     {
1069     case 0x000d:
1070     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1071     /* Fall through */
1072     case 0x000a:
1073     case 0x000b:
1074     case 0x000c:
1075     case 0x0085:
1076     case 0x2028:
1077     case 0x2029:
1078     if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1079     {
1080     active_count--; /* Remove non-match possibility */
1081     next_active_state--;
1082     }
1083     count++;
1084     ADD_NEW_DATA(-state_offset, count, ncount);
1085     break;
1086     default:
1087     break;
1088     }
1089     }
1090     break;
1091    
1092     /*-----------------------------------------------------------------*/
1093 ph10 151 #ifdef SUPPORT_UCP
1094 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1095     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1096 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1097 nigel 87 count = 4;
1098 nigel 77 goto QS1;
1099    
1100     case OP_PROP_EXTRA + OP_TYPESTAR:
1101     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1102 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1103 nigel 77 count = 0;
1104    
1105     QS1:
1106    
1107 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1108 nigel 77 if (clen > 0)
1109     {
1110 nigel 87 BOOL OK;
1111     int category = _pcre_ucp_findprop(c, &chartype, &script);
1112     switch(code[2])
1113     {
1114     case PT_ANY:
1115     OK = TRUE;
1116     break;
1117    
1118     case PT_LAMP:
1119     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1120     break;
1121    
1122     case PT_GC:
1123     OK = category == code[3];
1124     break;
1125    
1126     case PT_PC:
1127     OK = chartype == code[3];
1128     break;
1129    
1130     case PT_SC:
1131     OK = script == code[3];
1132     break;
1133    
1134     /* Should never occur, but keep compilers from grumbling. */
1135    
1136     default:
1137     OK = codevalue != OP_PROP;
1138     break;
1139     }
1140    
1141 nigel 93 if (OK == (d == OP_PROP))
1142     {
1143     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1144     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1145     {
1146     active_count--; /* Remove non-match possibility */
1147     next_active_state--;
1148     }
1149     ADD_NEW(state_offset + count, 0);
1150     }
1151 nigel 77 }
1152     break;
1153    
1154     /*-----------------------------------------------------------------*/
1155     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1156     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1157 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1158 nigel 77 count = 2;
1159     goto QS2;
1160    
1161     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1162     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1163 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1164 nigel 77 count = 0;
1165    
1166     QS2:
1167    
1168     ADD_ACTIVE(state_offset + 2, 0);
1169 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1170 nigel 77 {
1171     const uschar *nptr = ptr + clen;
1172     int ncount = 0;
1173 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1174     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1175     {
1176     active_count--; /* Remove non-match possibility */
1177     next_active_state--;
1178     }
1179 nigel 77 while (nptr < end_subject)
1180     {
1181     int nd;
1182     int ndlen = 1;
1183     GETCHARLEN(nd, nptr, ndlen);
1184 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1185 nigel 77 ncount++;
1186     nptr += ndlen;
1187     }
1188     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1189     }
1190     break;
1191 ph10 151 #endif
1192 nigel 77
1193     /*-----------------------------------------------------------------*/
1194 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1195     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1196     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1197     count = 2;
1198     goto QS3;
1199    
1200     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1201     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1202     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1203     count = 0;
1204    
1205     QS3:
1206     ADD_ACTIVE(state_offset + 2, 0);
1207     if (clen > 0)
1208     {
1209     int ncount = 0;
1210     switch (c)
1211     {
1212     case 0x000d:
1213     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1214     /* Fall through */
1215     case 0x000a:
1216     case 0x000b:
1217     case 0x000c:
1218     case 0x0085:
1219     case 0x2028:
1220     case 0x2029:
1221     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1222     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1223     {
1224     active_count--; /* Remove non-match possibility */
1225     next_active_state--;
1226     }
1227     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1228     break;
1229     default:
1230     break;
1231     }
1232     }
1233     break;
1234    
1235     /*-----------------------------------------------------------------*/
1236 ph10 151 #ifdef SUPPORT_UCP
1237 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1238     case OP_PROP_EXTRA + OP_TYPEUPTO:
1239     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1240 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1241 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1242 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1243 nigel 77 count = current_state->count; /* Number already matched */
1244     if (clen > 0)
1245     {
1246 nigel 87 BOOL OK;
1247     int category = _pcre_ucp_findprop(c, &chartype, &script);
1248     switch(code[4])
1249 nigel 77 {
1250 nigel 87 case PT_ANY:
1251     OK = TRUE;
1252     break;
1253    
1254     case PT_LAMP:
1255     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1256     break;
1257    
1258     case PT_GC:
1259     OK = category == code[5];
1260     break;
1261    
1262     case PT_PC:
1263     OK = chartype == code[5];
1264     break;
1265    
1266     case PT_SC:
1267     OK = script == code[5];
1268     break;
1269    
1270     /* Should never occur, but keep compilers from grumbling. */
1271    
1272     default:
1273     OK = codevalue != OP_PROP;
1274     break;
1275     }
1276    
1277     if (OK == (d == OP_PROP))
1278     {
1279 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1280     {
1281     active_count--; /* Remove non-match possibility */
1282     next_active_state--;
1283     }
1284 nigel 77 if (++count >= GET2(code, 1))
1285 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1286 nigel 77 else
1287     { ADD_NEW(state_offset, count); }
1288     }
1289     }
1290     break;
1291    
1292     /*-----------------------------------------------------------------*/
1293     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1294     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1295     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1296 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1297 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1298     { ADD_ACTIVE(state_offset + 4, 0); }
1299     count = current_state->count; /* Number already matched */
1300 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1301 nigel 77 {
1302     const uschar *nptr = ptr + clen;
1303     int ncount = 0;
1304 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1305     {
1306     active_count--; /* Remove non-match possibility */
1307     next_active_state--;
1308     }
1309 nigel 77 while (nptr < end_subject)
1310     {
1311     int nd;
1312     int ndlen = 1;
1313     GETCHARLEN(nd, nptr, ndlen);
1314 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1315 nigel 77 ncount++;
1316     nptr += ndlen;
1317     }
1318     if (++count >= GET2(code, 1))
1319     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1320     else
1321     { ADD_NEW_DATA(-state_offset, count, ncount); }
1322     }
1323     break;
1324 ph10 151 #endif
1325 nigel 77
1326 nigel 93 /*-----------------------------------------------------------------*/
1327     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1328     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1329     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1330     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1331     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1332     { ADD_ACTIVE(state_offset + 4, 0); }
1333     count = current_state->count; /* Number already matched */
1334     if (clen > 0)
1335     {
1336     int ncount = 0;
1337     switch (c)
1338     {
1339     case 0x000d:
1340     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1341     /* Fall through */
1342     case 0x000a:
1343     case 0x000b:
1344     case 0x000c:
1345     case 0x0085:
1346     case 0x2028:
1347     case 0x2029:
1348     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1349     {
1350     active_count--; /* Remove non-match possibility */
1351     next_active_state--;
1352     }
1353     if (++count >= GET2(code, 1))
1354     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1355     else
1356     { ADD_NEW_DATA(-state_offset, count, ncount); }
1357     break;
1358     default:
1359     break;
1360     }
1361     }
1362     break;
1363    
1364 nigel 77 /* ========================================================================== */
1365     /* These opcodes are followed by a character that is usually compared
1366     to the current subject character; it is loaded into d. We still get
1367     here even if there is no subject character, because in some cases zero
1368     repetitions are permitted. */
1369    
1370     /*-----------------------------------------------------------------*/
1371     case OP_CHAR:
1372     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1373     break;
1374    
1375     /*-----------------------------------------------------------------*/
1376     case OP_CHARNC:
1377     if (clen == 0) break;
1378    
1379     #ifdef SUPPORT_UTF8
1380     if (utf8)
1381     {
1382     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1383     {
1384 nigel 93 unsigned int othercase;
1385 nigel 77 if (c < 128) othercase = fcc[c]; else
1386    
1387     /* If we have Unicode property support, we can use it to test the
1388 nigel 87 other case of the character. */
1389 nigel 77
1390     #ifdef SUPPORT_UCP
1391 nigel 87 othercase = _pcre_ucp_othercase(c);
1392     #else
1393 nigel 93 othercase = NOTACHAR;
1394 nigel 77 #endif
1395    
1396     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1397     }
1398     }
1399     else
1400     #endif /* SUPPORT_UTF8 */
1401    
1402     /* Non-UTF-8 mode */
1403     {
1404     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1405     }
1406     break;
1407    
1408    
1409     #ifdef SUPPORT_UCP
1410     /*-----------------------------------------------------------------*/
1411     /* This is a tricky one because it can match more than one character.
1412     Find out how many characters to skip, and then set up a negative state
1413     to wait for them to pass before continuing. */
1414    
1415     case OP_EXTUNI:
1416 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1417 nigel 77 {
1418     const uschar *nptr = ptr + clen;
1419     int ncount = 0;
1420     while (nptr < end_subject)
1421     {
1422     int nclen = 1;
1423     GETCHARLEN(c, nptr, nclen);
1424 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1425 nigel 77 ncount++;
1426     nptr += nclen;
1427     }
1428     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1429     }
1430     break;
1431     #endif
1432    
1433     /*-----------------------------------------------------------------*/
1434 nigel 93 /* This is a tricky one like EXTUNI because it too can match more than one
1435     character (when CR is followed by LF). In this case, set up a negative
1436     state to wait for one character to pass before continuing. */
1437    
1438     case OP_ANYNL:
1439     if (clen > 0) switch(c)
1440     {
1441     case 0x000a:
1442     case 0x000b:
1443     case 0x000c:
1444     case 0x0085:
1445     case 0x2028:
1446     case 0x2029:
1447     ADD_NEW(state_offset + 1, 0);
1448     break;
1449     case 0x000d:
1450     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1451     {
1452     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1453     }
1454     else
1455     {
1456     ADD_NEW(state_offset + 1, 0);
1457     }
1458     break;
1459     }
1460     break;
1461    
1462     /*-----------------------------------------------------------------*/
1463 nigel 77 /* Match a negated single character. This is only used for one-byte
1464     characters, that is, we know that d < 256. The character we are
1465     checking (c) can be multibyte. */
1466    
1467     case OP_NOT:
1468     if (clen > 0)
1469     {
1470 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1471 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1472     }
1473     break;
1474    
1475     /*-----------------------------------------------------------------*/
1476     case OP_PLUS:
1477     case OP_MINPLUS:
1478 nigel 93 case OP_POSPLUS:
1479 nigel 77 case OP_NOTPLUS:
1480     case OP_NOTMINPLUS:
1481 nigel 93 case OP_NOTPOSPLUS:
1482 nigel 77 count = current_state->count; /* Already matched */
1483     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1484     if (clen > 0)
1485     {
1486 nigel 93 unsigned int otherd = NOTACHAR;
1487 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1488     {
1489     #ifdef SUPPORT_UTF8
1490 nigel 87 if (utf8 && d >= 128)
1491 nigel 77 {
1492     #ifdef SUPPORT_UCP
1493 nigel 87 otherd = _pcre_ucp_othercase(d);
1494 nigel 77 #endif /* SUPPORT_UCP */
1495     }
1496     else
1497     #endif /* SUPPORT_UTF8 */
1498     otherd = fcc[d];
1499     }
1500     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1501 nigel 93 {
1502     if (count > 0 &&
1503     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1504     {
1505     active_count--; /* Remove non-match possibility */
1506     next_active_state--;
1507     }
1508     count++;
1509     ADD_NEW(state_offset, count);
1510     }
1511 nigel 77 }
1512     break;
1513    
1514     /*-----------------------------------------------------------------*/
1515     case OP_QUERY:
1516     case OP_MINQUERY:
1517 nigel 93 case OP_POSQUERY:
1518 nigel 77 case OP_NOTQUERY:
1519     case OP_NOTMINQUERY:
1520 nigel 93 case OP_NOTPOSQUERY:
1521 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1522     if (clen > 0)
1523     {
1524 nigel 93 unsigned int otherd = NOTACHAR;
1525 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1526 nigel 77 {
1527     #ifdef SUPPORT_UTF8
1528 nigel 87 if (utf8 && d >= 128)
1529 nigel 77 {
1530     #ifdef SUPPORT_UCP
1531 nigel 87 otherd = _pcre_ucp_othercase(d);
1532 nigel 77 #endif /* SUPPORT_UCP */
1533     }
1534     else
1535     #endif /* SUPPORT_UTF8 */
1536     otherd = fcc[d];
1537     }
1538     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1539 nigel 93 {
1540     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1541     {
1542     active_count--; /* Remove non-match possibility */
1543     next_active_state--;
1544     }
1545     ADD_NEW(state_offset + dlen + 1, 0);
1546     }
1547 nigel 77 }
1548     break;
1549    
1550     /*-----------------------------------------------------------------*/
1551     case OP_STAR:
1552     case OP_MINSTAR:
1553 nigel 93 case OP_POSSTAR:
1554 nigel 77 case OP_NOTSTAR:
1555     case OP_NOTMINSTAR:
1556 nigel 93 case OP_NOTPOSSTAR:
1557 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1558     if (clen > 0)
1559     {
1560 nigel 93 unsigned int otherd = NOTACHAR;
1561 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1562 nigel 77 {
1563     #ifdef SUPPORT_UTF8
1564 nigel 87 if (utf8 && d >= 128)
1565 nigel 77 {
1566     #ifdef SUPPORT_UCP
1567 nigel 87 otherd = _pcre_ucp_othercase(d);
1568 nigel 77 #endif /* SUPPORT_UCP */
1569     }
1570     else
1571     #endif /* SUPPORT_UTF8 */
1572     otherd = fcc[d];
1573     }
1574     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1575 nigel 93 {
1576     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1577     {
1578     active_count--; /* Remove non-match possibility */
1579     next_active_state--;
1580     }
1581     ADD_NEW(state_offset, 0);
1582     }
1583 nigel 77 }
1584     break;
1585    
1586     /*-----------------------------------------------------------------*/
1587     case OP_EXACT:
1588 nigel 93 case OP_NOTEXACT:
1589     count = current_state->count; /* Number already matched */
1590     if (clen > 0)
1591     {
1592     unsigned int otherd = NOTACHAR;
1593     if ((ims & PCRE_CASELESS) != 0)
1594     {
1595     #ifdef SUPPORT_UTF8
1596     if (utf8 && d >= 128)
1597     {
1598     #ifdef SUPPORT_UCP
1599     otherd = _pcre_ucp_othercase(d);
1600     #endif /* SUPPORT_UCP */
1601     }
1602     else
1603     #endif /* SUPPORT_UTF8 */
1604     otherd = fcc[d];
1605     }
1606     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1607     {
1608     if (++count >= GET2(code, 1))
1609     { ADD_NEW(state_offset + dlen + 3, 0); }
1610     else
1611     { ADD_NEW(state_offset, count); }
1612     }
1613     }
1614     break;
1615    
1616     /*-----------------------------------------------------------------*/
1617 nigel 77 case OP_UPTO:
1618     case OP_MINUPTO:
1619 nigel 93 case OP_POSUPTO:
1620 nigel 77 case OP_NOTUPTO:
1621     case OP_NOTMINUPTO:
1622 nigel 93 case OP_NOTPOSUPTO:
1623     ADD_ACTIVE(state_offset + dlen + 3, 0);
1624 nigel 77 count = current_state->count; /* Number already matched */
1625     if (clen > 0)
1626     {
1627 nigel 93 unsigned int otherd = NOTACHAR;
1628 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1629     {
1630     #ifdef SUPPORT_UTF8
1631 nigel 87 if (utf8 && d >= 128)
1632 nigel 77 {
1633     #ifdef SUPPORT_UCP
1634 nigel 87 otherd = _pcre_ucp_othercase(d);
1635 nigel 77 #endif /* SUPPORT_UCP */
1636     }
1637     else
1638     #endif /* SUPPORT_UTF8 */
1639     otherd = fcc[d];
1640     }
1641     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1642     {
1643 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
1644     {
1645     active_count--; /* Remove non-match possibility */
1646     next_active_state--;
1647     }
1648 nigel 77 if (++count >= GET2(code, 1))
1649     { ADD_NEW(state_offset + dlen + 3, 0); }
1650     else
1651     { ADD_NEW(state_offset, count); }
1652     }
1653     }
1654     break;
1655    
1656    
1657     /* ========================================================================== */
1658     /* These are the class-handling opcodes */
1659    
1660     case OP_CLASS:
1661     case OP_NCLASS:
1662     case OP_XCLASS:
1663     {
1664     BOOL isinclass = FALSE;
1665     int next_state_offset;
1666     const uschar *ecode;
1667    
1668     /* For a simple class, there is always just a 32-byte table, and we
1669     can set isinclass from it. */
1670    
1671     if (codevalue != OP_XCLASS)
1672     {
1673     ecode = code + 33;
1674     if (clen > 0)
1675     {
1676     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1677     ((code[1 + c/8] & (1 << (c&7))) != 0);
1678     }
1679     }
1680    
1681     /* An extended class may have a table or a list of single characters,
1682     ranges, or both, and it may be positive or negative. There's a
1683     function that sorts all this out. */
1684    
1685     else
1686     {
1687     ecode = code + GET(code, 1);
1688     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1689     }
1690    
1691     /* At this point, isinclass is set for all kinds of class, and ecode
1692     points to the byte after the end of the class. If there is a
1693     quantifier, this is where it will be. */
1694    
1695     next_state_offset = ecode - start_code;
1696    
1697     switch (*ecode)
1698     {
1699     case OP_CRSTAR:
1700     case OP_CRMINSTAR:
1701     ADD_ACTIVE(next_state_offset + 1, 0);
1702     if (isinclass) { ADD_NEW(state_offset, 0); }
1703     break;
1704    
1705     case OP_CRPLUS:
1706     case OP_CRMINPLUS:
1707     count = current_state->count; /* Already matched */
1708     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1709     if (isinclass) { count++; ADD_NEW(state_offset, count); }
1710     break;
1711    
1712     case OP_CRQUERY:
1713     case OP_CRMINQUERY:
1714     ADD_ACTIVE(next_state_offset + 1, 0);
1715     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1716     break;
1717    
1718     case OP_CRRANGE:
1719     case OP_CRMINRANGE:
1720     count = current_state->count; /* Already matched */
1721     if (count >= GET2(ecode, 1))
1722     { ADD_ACTIVE(next_state_offset + 5, 0); }
1723     if (isinclass)
1724     {
1725 nigel 91 int max = GET2(ecode, 3);
1726     if (++count >= max && max != 0) /* Max 0 => no limit */
1727 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
1728     else
1729     { ADD_NEW(state_offset, count); }
1730     }
1731     break;
1732    
1733     default:
1734     if (isinclass) { ADD_NEW(next_state_offset, 0); }
1735     break;
1736     }
1737     }
1738     break;
1739    
1740     /* ========================================================================== */
1741     /* These are the opcodes for fancy brackets of various kinds. We have
1742     to use recursion in order to handle them. */
1743    
1744     case OP_ASSERT:
1745     case OP_ASSERT_NOT:
1746     case OP_ASSERTBACK:
1747     case OP_ASSERTBACK_NOT:
1748     {
1749     int rc;
1750     int local_offsets[2];
1751     int local_workspace[1000];
1752     const uschar *endasscode = code + GET(code, 1);
1753    
1754     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1755    
1756     rc = internal_dfa_exec(
1757     md, /* static match data */
1758     code, /* this subexpression's code */
1759     ptr, /* where we currently are */
1760     ptr - start_subject, /* start offset */
1761     local_offsets, /* offset vector */
1762     sizeof(local_offsets)/sizeof(int), /* size of same */
1763     local_workspace, /* workspace vector */
1764     sizeof(local_workspace)/sizeof(int), /* size of same */
1765     ims, /* the current ims flags */
1766     rlevel, /* function recursion level */
1767     recursing); /* pass on regex recursion */
1768    
1769     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1770     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1771     }
1772     break;
1773    
1774     /*-----------------------------------------------------------------*/
1775     case OP_COND:
1776 nigel 93 case OP_SCOND:
1777 nigel 77 {
1778     int local_offsets[1000];
1779     int local_workspace[1000];
1780     int condcode = code[LINK_SIZE+1];
1781    
1782 nigel 93 /* Back reference conditions are not supported */
1783 nigel 77
1784 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
1785    
1786     /* The DEFINE condition is always false */
1787    
1788     if (condcode == OP_DEF)
1789 nigel 77 {
1790 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
1791     }
1792    
1793     /* The only supported version of OP_RREF is for the value RREF_ANY,
1794     which means "test if in any recursion". We can't test for specifically
1795     recursed groups. */
1796    
1797     else if (condcode == OP_RREF)
1798     {
1799 nigel 77 int value = GET2(code, LINK_SIZE+2);
1800 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
1801 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1802     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1803     }
1804    
1805     /* Otherwise, the condition is an assertion */
1806    
1807     else
1808     {
1809     int rc;
1810     const uschar *asscode = code + LINK_SIZE + 1;
1811     const uschar *endasscode = asscode + GET(asscode, 1);
1812    
1813     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1814    
1815     rc = internal_dfa_exec(
1816     md, /* fixed match data */
1817     asscode, /* this subexpression's code */
1818     ptr, /* where we currently are */
1819     ptr - start_subject, /* start offset */
1820     local_offsets, /* offset vector */
1821     sizeof(local_offsets)/sizeof(int), /* size of same */
1822     local_workspace, /* workspace vector */
1823     sizeof(local_workspace)/sizeof(int), /* size of same */
1824     ims, /* the current ims flags */
1825     rlevel, /* function recursion level */
1826     recursing); /* pass on regex recursion */
1827    
1828     if ((rc >= 0) ==
1829     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1830     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1831     else
1832     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1833     }
1834     }
1835     break;
1836    
1837     /*-----------------------------------------------------------------*/
1838     case OP_RECURSE:
1839     {
1840     int local_offsets[1000];
1841     int local_workspace[1000];
1842     int rc;
1843    
1844     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1845     recursing + 1));
1846    
1847     rc = internal_dfa_exec(
1848     md, /* fixed match data */
1849     start_code + GET(code, 1), /* this subexpression's code */
1850     ptr, /* where we currently are */
1851     ptr - start_subject, /* start offset */
1852     local_offsets, /* offset vector */
1853     sizeof(local_offsets)/sizeof(int), /* size of same */
1854     local_workspace, /* workspace vector */
1855     sizeof(local_workspace)/sizeof(int), /* size of same */
1856     ims, /* the current ims flags */
1857     rlevel, /* function recursion level */
1858     recursing + 1); /* regex recurse level */
1859    
1860     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1861     recursing + 1, rc));
1862    
1863     /* Ran out of internal offsets */
1864    
1865     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1866    
1867     /* For each successful matched substring, set up the next state with a
1868     count of characters to skip before trying it. Note that the count is in
1869     characters, not bytes. */
1870    
1871     if (rc > 0)
1872     {
1873     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1874     {
1875     const uschar *p = start_subject + local_offsets[rc];
1876     const uschar *pp = start_subject + local_offsets[rc+1];
1877     int charcount = local_offsets[rc+1] - local_offsets[rc];
1878     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1879     if (charcount > 0)
1880     {
1881     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1882     }
1883     else
1884     {
1885     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1886     }
1887     }
1888     }
1889     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1890     }
1891     break;
1892    
1893     /*-----------------------------------------------------------------*/
1894     case OP_ONCE:
1895     {
1896     int local_offsets[2];
1897     int local_workspace[1000];
1898    
1899     int rc = internal_dfa_exec(
1900     md, /* fixed match data */
1901     code, /* this subexpression's code */
1902     ptr, /* where we currently are */
1903     ptr - start_subject, /* start offset */
1904     local_offsets, /* offset vector */
1905     sizeof(local_offsets)/sizeof(int), /* size of same */
1906     local_workspace, /* workspace vector */
1907     sizeof(local_workspace)/sizeof(int), /* size of same */
1908     ims, /* the current ims flags */
1909     rlevel, /* function recursion level */
1910     recursing); /* pass on regex recursion */
1911    
1912     if (rc >= 0)
1913     {
1914     const uschar *end_subpattern = code;
1915     int charcount = local_offsets[1] - local_offsets[0];
1916     int next_state_offset, repeat_state_offset;
1917    
1918     do { end_subpattern += GET(end_subpattern, 1); }
1919     while (*end_subpattern == OP_ALT);
1920     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1921    
1922     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1923     arrange for the repeat state also to be added to the relevant list.
1924     Calculate the offset, or set -1 for no repeat. */
1925    
1926     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1927     *end_subpattern == OP_KETRMIN)?
1928     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1929    
1930     /* If we have matched an empty string, add the next state at the
1931     current character pointer. This is important so that the duplicate
1932     checking kicks in, which is what breaks infinite loops that match an
1933     empty string. */
1934    
1935     if (charcount == 0)
1936     {
1937     ADD_ACTIVE(next_state_offset, 0);
1938     }
1939    
1940     /* Optimization: if there are no more active states, and there
1941     are no new states yet set up, then skip over the subject string
1942     right here, to save looping. Otherwise, set up the new state to swing
1943     into action when the end of the substring is reached. */
1944    
1945     else if (i + 1 >= active_count && new_count == 0)
1946     {
1947     ptr += charcount;
1948     clen = 0;
1949     ADD_NEW(next_state_offset, 0);
1950    
1951     /* If we are adding a repeat state at the new character position,
1952     we must fudge things so that it is the only current state.
1953     Otherwise, it might be a duplicate of one we processed before, and
1954     that would cause it to be skipped. */
1955    
1956     if (repeat_state_offset >= 0)
1957     {
1958     next_active_state = active_states;
1959     active_count = 0;
1960     i = -1;
1961     ADD_ACTIVE(repeat_state_offset, 0);
1962     }
1963     }
1964     else
1965     {
1966     const uschar *p = start_subject + local_offsets[0];
1967     const uschar *pp = start_subject + local_offsets[1];
1968     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1969     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1970     if (repeat_state_offset >= 0)
1971     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1972     }
1973    
1974     }
1975     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1976     }
1977     break;
1978    
1979    
1980     /* ========================================================================== */
1981     /* Handle callouts */
1982    
1983     case OP_CALLOUT:
1984     if (pcre_callout != NULL)
1985     {
1986     int rrc;
1987     pcre_callout_block cb;
1988     cb.version = 1; /* Version 1 of the callout block */
1989     cb.callout_number = code[1];
1990     cb.offset_vector = offsets;
1991 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
1992 nigel 77 cb.subject_length = end_subject - start_subject;
1993     cb.start_match = current_subject - start_subject;
1994     cb.current_position = ptr - start_subject;
1995     cb.pattern_position = GET(code, 2);
1996     cb.next_item_length = GET(code, 2 + LINK_SIZE);
1997     cb.capture_top = 1;
1998     cb.capture_last = -1;
1999     cb.callout_data = md->callout_data;
2000     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2001     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2002     }
2003     break;
2004    
2005    
2006     /* ========================================================================== */
2007     default: /* Unsupported opcode */
2008     return PCRE_ERROR_DFA_UITEM;
2009     }
2010    
2011     NEXT_ACTIVE_STATE: continue;
2012    
2013     } /* End of loop scanning active states */
2014    
2015     /* We have finished the processing at the current subject character. If no
2016     new states have been set for the next character, we have found all the
2017     matches that we are going to find. If we are at the top level and partial
2018     matching has been requested, check for appropriate conditions. */
2019    
2020     if (new_count <= 0)
2021     {
2022     if (match_count < 0 && /* No matches found */
2023     rlevel == 1 && /* Top level match function */
2024     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2025     ptr >= end_subject && /* Reached end of subject */
2026     ptr > current_subject) /* Matched non-empty string */
2027     {
2028     if (offsetcount >= 2)
2029     {
2030     offsets[0] = current_subject - start_subject;
2031     offsets[1] = end_subject - start_subject;
2032     }
2033     match_count = PCRE_ERROR_PARTIAL;
2034     }
2035    
2036     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2037     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2038     rlevel*2-2, SP));
2039 nigel 91 break; /* In effect, "return", but see the comment below */
2040 nigel 77 }
2041    
2042     /* One or more states are active for the next character. */
2043    
2044     ptr += clen; /* Advance to next subject character */
2045     } /* Loop to move along the subject string */
2046    
2047 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2048     if we use "return" above, we have compiler trouble. Some compilers warn if
2049     there's nothing here because they think the function doesn't return a value. On
2050     the other hand, if we put a dummy statement here, some more clever compilers
2051     complain that it can't be reached. Sigh. */
2052 nigel 77
2053 nigel 91 return match_count;
2054 nigel 77 }
2055    
2056    
2057    
2058    
2059     /*************************************************
2060     * Execute a Regular Expression - DFA engine *
2061     *************************************************/
2062    
2063     /* This external function applies a compiled re to a subject string using a DFA
2064     engine. This function calls the internal function multiple times if the pattern
2065     is not anchored.
2066    
2067     Arguments:
2068     argument_re points to the compiled expression
2069 ph10 97 extra_data points to extra data or is NULL
2070 nigel 77 subject points to the subject string
2071     length length of subject string (may contain binary zeros)
2072     start_offset where to start in the subject string
2073     options option bits
2074     offsets vector of match offsets
2075     offsetcount size of same
2076     workspace workspace vector
2077     wscount size of same
2078    
2079     Returns: > 0 => number of match offset pairs placed in offsets
2080     = 0 => offsets overflowed; longest matches are present
2081     -1 => failed to match
2082     < -1 => some kind of unexpected problem
2083     */
2084    
2085 ph10 145 PCRE_EXP_DEFN int
2086 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2087     const char *subject, int length, int start_offset, int options, int *offsets,
2088     int offsetcount, int *workspace, int wscount)
2089     {
2090     real_pcre *re = (real_pcre *)argument_re;
2091     dfa_match_data match_block;
2092 nigel 91 dfa_match_data *md = &match_block;
2093 nigel 77 BOOL utf8, anchored, startline, firstline;
2094     const uschar *current_subject, *end_subject, *lcc;
2095    
2096     pcre_study_data internal_study;
2097     const pcre_study_data *study = NULL;
2098     real_pcre internal_re;
2099    
2100     const uschar *req_byte_ptr;
2101     const uschar *start_bits = NULL;
2102     BOOL first_byte_caseless = FALSE;
2103     BOOL req_byte_caseless = FALSE;
2104     int first_byte = -1;
2105     int req_byte = -1;
2106     int req_byte2 = -1;
2107 nigel 91 int newline;
2108 nigel 77
2109     /* Plausibility checks */
2110    
2111     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2112     if (re == NULL || subject == NULL || workspace == NULL ||
2113     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2114     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2115     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2116    
2117     /* We need to find the pointer to any study data before we test for byte
2118     flipping, so we scan the extra_data block first. This may set two fields in the
2119     match block, so we must initialize them beforehand. However, the other fields
2120     in the match block must not be set until after the byte flipping. */
2121    
2122 nigel 91 md->tables = re->tables;
2123     md->callout_data = NULL;
2124 nigel 77
2125     if (extra_data != NULL)
2126     {
2127     unsigned int flags = extra_data->flags;
2128     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2129     study = (const pcre_study_data *)extra_data->study_data;
2130     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2131 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2132     return PCRE_ERROR_DFA_UMLIMIT;
2133 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2134 nigel 91 md->callout_data = extra_data->callout_data;
2135 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2136 nigel 91 md->tables = extra_data->tables;
2137 nigel 77 }
2138    
2139     /* Check that the first field in the block is the magic number. If it is not,
2140     test for a regex that was compiled on a host of opposite endianness. If this is
2141     the case, flipped values are put in internal_re and internal_study if there was
2142     study data too. */
2143    
2144     if (re->magic_number != MAGIC_NUMBER)
2145     {
2146     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2147     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2148     if (study != NULL) study = &internal_study;
2149     }
2150    
2151     /* Set some local values */
2152    
2153     current_subject = (const unsigned char *)subject + start_offset;
2154     end_subject = (const unsigned char *)subject + length;
2155     req_byte_ptr = current_subject - 1;
2156    
2157 nigel 91 #ifdef SUPPORT_UTF8
2158 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2159 nigel 91 #else
2160     utf8 = FALSE;
2161     #endif
2162 nigel 77
2163 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2164     (re->options & PCRE_ANCHORED) != 0;
2165    
2166 nigel 77 /* The remaining fixed data for passing around. */
2167    
2168 nigel 91 md->start_code = (const uschar *)argument_re +
2169 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2170 nigel 91 md->start_subject = (const unsigned char *)subject;
2171     md->end_subject = end_subject;
2172     md->moptions = options;
2173     md->poptions = re->options;
2174 nigel 77
2175 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2176     nothing is set at run time, whatever was used at compile time applies. */
2177 nigel 91
2178 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2179 nigel 93 PCRE_NEWLINE_BITS)
2180 nigel 91 {
2181 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2182 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2183     case PCRE_NEWLINE_LF: newline = '\n'; break;
2184     case PCRE_NEWLINE_CR+
2185     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2186 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2187 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2188 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2189 nigel 91 }
2190    
2191 ph10 149 if (newline == -2)
2192 nigel 91 {
2193 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2194     }
2195     else if (newline < 0)
2196     {
2197 nigel 93 md->nltype = NLTYPE_ANY;
2198 nigel 91 }
2199     else
2200     {
2201 nigel 93 md->nltype = NLTYPE_FIXED;
2202     if (newline > 255)
2203     {
2204     md->nllen = 2;
2205     md->nl[0] = (newline >> 8) & 255;
2206     md->nl[1] = newline & 255;
2207     }
2208     else
2209     {
2210     md->nllen = 1;
2211     md->nl[0] = newline;
2212     }
2213 nigel 91 }
2214    
2215 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2216     back the character offset. */
2217    
2218     #ifdef SUPPORT_UTF8
2219     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2220     {
2221     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2222     return PCRE_ERROR_BADUTF8;
2223     if (start_offset > 0 && start_offset < length)
2224     {
2225     int tb = ((uschar *)subject)[start_offset];
2226     if (tb > 127)
2227     {
2228     tb &= 0xc0;
2229     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2230     }
2231     }
2232     }
2233     #endif
2234    
2235     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2236     is a feature that makes it possible to save compiled regex and re-use them
2237     in other programs later. */
2238    
2239 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2240 nigel 77
2241     /* The lower casing table and the "must be at the start of a line" flag are
2242     used in a loop when finding where to start. */
2243    
2244 nigel 91 lcc = md->tables + lcc_offset;
2245 nigel 77 startline = (re->options & PCRE_STARTLINE) != 0;
2246     firstline = (re->options & PCRE_FIRSTLINE) != 0;
2247    
2248     /* Set up the first character to match, if available. The first_byte value is
2249     never set for an anchored regular expression, but the anchoring may be forced
2250     at run time, so we have to test for anchoring. The first char may be unset for
2251     an unanchored pattern, of course. If there's no first char and the pattern was
2252     studied, there may be a bitmap of possible first characters. */
2253    
2254     if (!anchored)
2255     {
2256     if ((re->options & PCRE_FIRSTSET) != 0)
2257     {
2258     first_byte = re->first_byte & 255;
2259     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2260     first_byte = lcc[first_byte];
2261     }
2262     else
2263     {
2264     if (startline && study != NULL &&
2265     (study->options & PCRE_STUDY_MAPPED) != 0)
2266     start_bits = study->start_bits;
2267     }
2268     }
2269    
2270     /* For anchored or unanchored matches, there may be a "last known required
2271     character" set. */
2272    
2273     if ((re->options & PCRE_REQCHSET) != 0)
2274     {
2275     req_byte = re->req_byte & 255;
2276     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2277 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2278 nigel 77 }
2279    
2280     /* Call the main matching function, looping for a non-anchored regex after a
2281     failed match. Unless restarting, optimize by moving to the first match
2282     character if possible, when not anchored. Then unless wanting a partial match,
2283     check for a required later character. */
2284    
2285     for (;;)
2286     {
2287     int rc;
2288    
2289     if ((options & PCRE_DFA_RESTART) == 0)
2290     {
2291     const uschar *save_end_subject = end_subject;
2292    
2293     /* Advance to a unique first char if possible. If firstline is TRUE, the
2294     start of the match is constrained to the first line of a multiline string.
2295 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2296     scanning at a newline. If the match fails at the newline, later code breaks
2297     this loop. */
2298 nigel 77
2299     if (firstline)
2300     {
2301     const uschar *t = current_subject;
2302 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2303 nigel 77 end_subject = t;
2304     }
2305    
2306     if (first_byte >= 0)
2307     {
2308     if (first_byte_caseless)
2309     while (current_subject < end_subject &&
2310     lcc[*current_subject] != first_byte)
2311     current_subject++;
2312     else
2313     while (current_subject < end_subject && *current_subject != first_byte)
2314     current_subject++;
2315     }
2316    
2317 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2318 nigel 77
2319     else if (startline)
2320     {
2321 nigel 93 if (current_subject > md->start_subject + start_offset)
2322 nigel 77 {
2323 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2324 nigel 77 current_subject++;
2325 ph10 130
2326 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2327     ANYCRLF, and we are now at a LF, advance the match position by one more
2328     character. */
2329 ph10 134
2330 ph10 130 if (current_subject[-1] == '\r' &&
2331 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2332 ph10 130 current_subject < end_subject &&
2333     *current_subject == '\n')
2334     current_subject++;
2335 nigel 77 }
2336     }
2337    
2338     /* Or to a non-unique first char after study */
2339    
2340     else if (start_bits != NULL)
2341     {
2342     while (current_subject < end_subject)
2343     {
2344     register unsigned int c = *current_subject;
2345     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2346     else break;
2347     }
2348     }
2349    
2350     /* Restore fudged end_subject */
2351    
2352     end_subject = save_end_subject;
2353     }
2354    
2355     /* If req_byte is set, we know that that character must appear in the subject
2356     for the match to succeed. If the first character is set, req_byte must be
2357     later in the subject; otherwise the test starts at the match point. This
2358     optimization can save a huge amount of work in patterns with nested unlimited
2359     repeats that aren't going to match. Writing separate code for cased/caseless
2360     versions makes it go faster, as does using an autoincrement and backing off
2361     on a match.
2362    
2363     HOWEVER: when the subject string is very, very long, searching to its end can
2364     take a long time, and give bad performance on quite ordinary patterns. This
2365     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2366     don't do this when the string is sufficiently long.
2367    
2368     ALSO: this processing is disabled when partial matching is requested.
2369     */
2370    
2371     if (req_byte >= 0 &&
2372     end_subject - current_subject < REQ_BYTE_MAX &&
2373     (options & PCRE_PARTIAL) == 0)
2374     {
2375     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2376    
2377     /* We don't need to repeat the search if we haven't yet reached the
2378     place we found it at last time. */
2379    
2380     if (p > req_byte_ptr)
2381     {
2382     if (req_byte_caseless)
2383     {
2384     while (p < end_subject)
2385     {
2386     register int pp = *p++;
2387     if (pp == req_byte || pp == req_byte2) { p--; break; }
2388     }
2389     }
2390     else
2391     {
2392     while (p < end_subject)
2393     {
2394     if (*p++ == req_byte) { p--; break; }
2395     }
2396     }
2397    
2398     /* If we can't find the required character, break the matching loop,
2399     which will cause a return or PCRE_ERROR_NOMATCH. */
2400    
2401     if (p >= end_subject) break;
2402    
2403     /* If we have found the required character, save the point where we
2404     found it, so that we don't search again next time round the loop if
2405     the start hasn't passed this character yet. */
2406    
2407     req_byte_ptr = p;
2408     }
2409     }
2410    
2411     /* OK, now we can do the business */
2412    
2413     rc = internal_dfa_exec(
2414 nigel 91 md, /* fixed match data */
2415     md->start_code, /* this subexpression's code */
2416     current_subject, /* where we currently are */
2417     start_offset, /* start offset in subject */
2418     offsets, /* offset vector */
2419     offsetcount, /* size of same */
2420     workspace, /* workspace vector */
2421     wscount, /* size of same */
2422 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2423 nigel 91 0, /* function recurse level */
2424     0); /* regex recurse level */
2425 nigel 77
2426     /* Anything other than "no match" means we are done, always; otherwise, carry
2427     on only if not anchored. */
2428    
2429     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2430    
2431     /* Advance to the next subject character unless we are at the end of a line
2432     and firstline is set. */
2433    
2434 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2435 nigel 77 current_subject++;
2436     if (utf8)
2437     {
2438     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2439     current_subject++;
2440     }
2441     if (current_subject > end_subject) break;
2442    
2443 ph10 150 /* If we have just passed a CR and the newline option is CRLF or ANY or
2444 ph10 149 ANYCRLF, and we are now at a LF, advance the match position by one more
2445     character. */
2446 nigel 93
2447     if (current_subject[-1] == '\r' &&
2448 ph10 150 (md->nltype == NLTYPE_ANY ||
2449     md->nltype == NLTYPE_ANYCRLF ||
2450 ph10 149 md->nllen == 2) &&
2451 nigel 93 current_subject < end_subject &&
2452     *current_subject == '\n')
2453     current_subject++;
2454    
2455     } /* "Bumpalong" loop */
2456    
2457 nigel 77 return PCRE_ERROR_NOMATCH;
2458     }
2459    
2460     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12