/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 335 - (hide annotations) (download)
Sat Apr 12 14:36:14 2008 UTC (6 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 95430 byte(s)
Do not discard subpatterns with {0} quantifiers, as they may be called as 
subroutines.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 ph10 200 #ifdef HAVE_CONFIG_H
48 ph10 236 #include "config.h"
49 ph10 200 #endif
50 ph10 199
51 nigel 93 #define NLBLOCK md /* Block containing newline information */
52     #define PSSTART start_subject /* Field containing processed string start */
53     #define PSEND end_subject /* Field containing processed string end */
54    
55 nigel 77 #include "pcre_internal.h"
56    
57    
/* For use to indent debugging output. The string is printed with "%.*s" and a
precision of rlevel*2-2, so it has to contain enough spaces for the deepest
nesting level that should still be shown indented. A precision larger than the
string length is harmless: the whole string is printed and no more. */

#define SP "                                        "
61    
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
/* Pseudo-opcode offsets. When an opcode at or above OP_TYPESTAR carries an
argument that needs special treatment, one of these values is added to the
opcode value before it is switched on. The blocks are spaced 20 apart, which
should be enough. The resulting values do not have to be less than 256 because
they are never stored, so they are pushed well clear of the normal opcodes. */

#define OP_EXTRA_BASE    300   /* Start of the first pseudo-opcode block */
#define OP_EXTRA_GAP      20   /* Distance between successive blocks */

#define OP_PROP_EXTRA    (OP_EXTRA_BASE + 0*OP_EXTRA_GAP)
#define OP_EXTUNI_EXTRA  (OP_EXTRA_BASE + 1*OP_EXTRA_GAP)
#define OP_ANYNL_EXTRA   (OP_EXTRA_BASE + 2*OP_EXTRA_GAP)
#define OP_HSPACE_EXTRA  (OP_EXTRA_BASE + 3*OP_EXTRA_GAP)
#define OP_VSPACE_EXTRA  (OP_EXTRA_BASE + 4*OP_EXTRA_GAP)
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes is possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified. */
86 nigel 77
87 ph10 327 static const uschar coptable[] = {
88 nigel 77 0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 nigel 77 0, 0, /* Any, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
134     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 ph10 211 0, 0 /* FAIL, ACCEPT */
136 nigel 77 };
137    
138     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139     and \w */
140    
141 ph10 327 static const uschar toptable1[] = {
142 ph10 168 0, 0, 0, 0, 0, 0,
143 nigel 77 ctype_digit, ctype_digit,
144     ctype_space, ctype_space,
145     ctype_word, ctype_word,
146     0 /* OP_ANY */
147     };
148    
149 ph10 327 static const uschar toptable2[] = {
150 ph10 168 0, 0, 0, 0, 0, 0,
151 nigel 77 ctype_digit, 0,
152     ctype_space, 0,
153     ctype_word, 0,
154     1 /* OP_ANY */
155     };
156    
157    
/* One of these blocks describes a single state, that is, the current data for
one active path through the match tree. The working vector that is passed in,
and in which these blocks are placed, is a vector of ints, so every member must
be an int. */

typedef struct stateblock stateblock;

struct stateblock {
  /* Offset to opcode */
  int offset;
  /* Count for repeats */
  int count;
  /* ims flag bits */
  int ims;
  /* Some use extra data */
  int data;
};

/* Number of ints of workspace taken up by one state block */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
171    
172    
173     #ifdef DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Debugging aid: writes a string of bytes to a stream. Bytes for which
isprint() is true are written as they are; any other byte is written as \x
followed by two hexadecimal digits.

Arguments:
  p           points to the first byte
  length      number of bytes to write; nothing is written if not positive
  f           the stream to write to

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
unsigned char *stop;
if (length <= 0) return;
for (stop = p + length; p < stop; p++)
  {
  int ch = *p;
  if (!isprint(ch))
    fprintf(f, "\\x%02x", ch);
  else
    fprintf(f, "%c", ch);
  }
}
200     #endif
201    
202    
203    
204     /*************************************************
205     * Execute a Regular Expression - DFA engine *
206     *************************************************/
207    
208     /* This internal function applies a compiled pattern to a subject string,
209     starting at a given point, using a DFA engine. This function is called from the
210     external one, possibly multiple times if the pattern is not anchored. The
211     function calls itself recursively for some kinds of subpattern.
212    
213     Arguments:
214     md the match_data block with fixed information
215     this_start_code the opening bracket of this subexpression's code
216     current_subject where we currently are in the subject string
217     start_offset start offset in the subject string
218     offsets vector to contain the matching string offsets
219     offsetcount size of same
220     workspace vector of workspace
221     wscount size of same
222     ims the current ims flags
223     rlevel function call recursion level
224     recursing regex recursive call level
225    
226     Returns: > 0 =>
227     = 0 =>
228     -1 => failed to match
229     < -1 => some kind of unexpected problem
230    
231     The following macros are used for adding states to the two state vectors (one
232     for the current character, one for the following character). */
233    
/* Helper for the four macros below: stores the members that every new state
receives. The ims value is taken from the variable of that name which is in
scope at the point of use. */

#define DFA_FILL_STATE(slot,x,y) \
  (slot)->offset = (x); \
  (slot)->count = (y); \
  (slot)->ims = ims

/* Each of these macros appends one state to a list, provided the list holds
fewer than wscount states; otherwise it makes the enclosing function return
PCRE_ERROR_DFA_WSSIZE. The list counter is incremented in either case. The
"if ... else return" form is intentional: a use must be followed by a semicolon,
and should be enclosed in braces when it is the body of an if. */

/* Append to the active list, leaving the data member untouched */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    DFA_FILL_STATE(next_active_state, (x), (y)); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* Append to the active list, setting the data member to z */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    DFA_FILL_STATE(next_active_state, (x), (y)); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* Append to the new list, leaving the data member untouched */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    DFA_FILL_STATE(next_new_state, (x), (y)); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* Append to the new list, setting the data member to z */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    DFA_FILL_STATE(next_new_state, (x), (y)); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
279    
280     /* And now, here is the code */
281    
282     static int
283     internal_dfa_exec(
284     dfa_match_data *md,
285     const uschar *this_start_code,
286     const uschar *current_subject,
287     int start_offset,
288     int *offsets,
289     int offsetcount,
290     int *workspace,
291     int wscount,
292     int ims,
293     int rlevel,
294     int recursing)
295     {
296     stateblock *active_states, *new_states, *temp_states;
297     stateblock *next_active_state, *next_new_state;
298    
299     const uschar *ctypes, *lcc, *fcc;
300     const uschar *ptr;
301 nigel 93 const uschar *end_code, *first_op;
302 nigel 77
303     int active_count, new_count, match_count;
304    
305     /* Some fields in the md block are frequently referenced, so we load them into
306     independent variables in the hope that this will perform better. */
307    
308     const uschar *start_subject = md->start_subject;
309     const uschar *end_subject = md->end_subject;
310     const uschar *start_code = md->start_code;
311    
312 nigel 87 #ifdef SUPPORT_UTF8
313 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 nigel 93 #else
315     BOOL utf8 = FALSE;
316 nigel 87 #endif
317 nigel 77
318     rlevel++;
319     offsetcount &= (-2);
320    
321     wscount -= 2;
322     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323     (2 * INTS_PER_STATEBLOCK);
324    
325     DPRINTF(("\n%.*s---------------------\n"
326     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328    
329     ctypes = md->tables + ctypes_offset;
330     lcc = md->tables + lcc_offset;
331     fcc = md->tables + fcc_offset;
332    
333     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334    
335     active_states = (stateblock *)(workspace + 2);
336     next_new_state = new_states = active_states + wscount;
337     new_count = 0;
338    
339 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
340     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343     the alternative states onto the list, and find out where the end is. This
344     makes is possible to use this function recursively, when we want to stop at a
345     matching internal ket rather than at the end.
346    
347     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348     a backward assertion. In that case, we have to find out the maximum amount to
349     move back, and set up each alternative appropriately. */
350    
351 nigel 93 if (*first_op == OP_REVERSE)
352 nigel 77 {
353     int max_back = 0;
354     int gone_back;
355    
356     end_code = this_start_code;
357     do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363     while (*end_code == OP_ALT);
364    
365     /* If we can't go back the amount required for the longest lookbehind
366     pattern, go back as far as we can; some alternatives may still be viable. */
367    
368     #ifdef SUPPORT_UTF8
369     /* In character mode we have to step back character by character */
370    
371     if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374     {
375     if (current_subject <= start_subject) break;
376     current_subject--;
377     while (current_subject > start_subject &&
378     (*current_subject & 0xc0) == 0x80)
379     current_subject--;
380     }
381     }
382     else
383     #endif
384    
385     /* In byte-mode we can do this quickly. */
386    
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389     current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392    
393     /* Now we can process the individual branches. */
394    
395     end_code = this_start_code;
396     do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400     {
401     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402     ADD_NEW_DATA(-bstate, 0, gone_back - back);
403     }
404     end_code += GET(end_code, 1);
405     }
406     while (*end_code == OP_ALT);
407     }
408    
409     /* This is the code for a "normal" subpattern (not a backward assertion). The
410     start of a whole pattern is always one of these. If we are at the top level,
411     we may be asked to restart matching from the same point that we reached for a
412     previous partial match. We still have to scan through the top-level branches to
413     find the end state. */
414    
415     else
416     {
417     end_code = this_start_code;
418    
419     /* Restarting */
420    
421     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426     memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428    
429     /* Not restarting */
430    
431     else
432     {
433 nigel 93 int length = 1 + LINK_SIZE +
434     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 nigel 77 do
436     {
437 nigel 93 ADD_NEW(end_code - start_code + length, 0);
438 nigel 77 end_code += GET(end_code, 1);
439 nigel 93 length = 1 + LINK_SIZE;
440 nigel 77 }
441     while (*end_code == OP_ALT);
442     }
443     }
444    
445     workspace[0] = 0; /* Bit indicating which vector is current */
446    
447     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448    
449     /* Loop for scanning the subject */
450    
451     ptr = current_subject;
452     for (;;)
453     {
454     int i, j;
455 nigel 91 int clen, dlen;
456     unsigned int c, d;
457 nigel 77
458     /* Make the new state list into the active state list and empty the
459     new state list. */
460    
461     temp_states = active_states;
462     active_states = new_states;
463     new_states = temp_states;
464     active_count = new_count;
465     new_count = 0;
466    
467     workspace[0] ^= 1; /* Remember for the restarting feature */
468     workspace[1] = active_count;
469    
470     #ifdef DEBUG
471     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473     printf("\"\n");
474    
475     printf("%.*sActive states: ", rlevel*2-2, SP);
476     for (i = 0; i < active_count; i++)
477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
478     printf("\n");
479     #endif
480    
481     /* Set the pointers for adding new states */
482    
483     next_active_state = active_states + active_count;
484     next_new_state = new_states;
485    
486     /* Load the current character from the subject outside the loop, as many
487     different states may want to look at it, and we assume that at least one
488     will. */
489    
490     if (ptr < end_subject)
491     {
492 nigel 93 clen = 1; /* Number of bytes in the character */
493 nigel 77 #ifdef SUPPORT_UTF8
494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
495     #endif /* SUPPORT_UTF8 */
496     c = *ptr;
497     }
498     else
499     {
500 nigel 93 clen = 0; /* This indicates the end of the subject */
501     c = NOTACHAR; /* This value should never actually be used */
502 nigel 77 }
503    
504     /* Scan up the active states and act on each one. The result of an action
505     may be to add more states to the currently active list (e.g. on hitting a
506     parenthesis) or it may be to put states on the new list, for considering
507     when we move the character pointer on. */
508    
509     for (i = 0; i < active_count; i++)
510     {
511     stateblock *current_state = active_states + i;
512     const uschar *code;
513     int state_offset = current_state->offset;
514     int count, codevalue;
515 ph10 152 #ifdef SUPPORT_UCP
516 nigel 87 int chartype, script;
517 ph10 152 #endif
518 nigel 77
519     #ifdef DEBUG
520     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
521 nigel 93 if (clen == 0) printf("EOL\n");
522 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
523     else printf("0x%02x\n", c);
524     #endif
525    
526     /* This variable is referred to implicity in the ADD_xxx macros. */
527    
528     ims = current_state->ims;
529    
530     /* A negative offset is a special case meaning "hold off going to this
531     (negated) state until the number of characters in the data field have
532     been skipped". */
533    
534     if (state_offset < 0)
535     {
536     if (current_state->data > 0)
537     {
538     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
539     ADD_NEW_DATA(state_offset, current_state->count,
540     current_state->data - 1);
541     continue;
542     }
543     else
544     {
545     current_state->offset = state_offset = -state_offset;
546     }
547     }
548    
549     /* Check for a duplicate state with the same count, and skip if found. */
550    
551     for (j = 0; j < i; j++)
552     {
553     if (active_states[j].offset == state_offset &&
554     active_states[j].count == current_state->count)
555     {
556     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
557     goto NEXT_ACTIVE_STATE;
558     }
559     }
560    
561     /* The state offset is the offset to the opcode */
562    
563     code = start_code + state_offset;
564     codevalue = *code;
565    
566     /* If this opcode is followed by an inline character, load it. It is
567     tempting to test for the presence of a subject character here, but that
568     is wrong, because sometimes zero repetitions of the subject are
569     permitted.
570    
571     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572 ph10 178 argument that is not a data character - but is always one byte long. We
573     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
574     this case. To keep the other cases fast, convert these ones to new opcodes.
575     */
576 nigel 77
577     if (coptable[codevalue] > 0)
578     {
579     dlen = 1;
580     #ifdef SUPPORT_UTF8
581     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
582     #endif /* SUPPORT_UTF8 */
583     d = code[coptable[codevalue]];
584     if (codevalue >= OP_TYPESTAR)
585     {
586 nigel 93 switch(d)
587     {
588     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
589     case OP_NOTPROP:
590     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
593 ph10 178 case OP_NOT_HSPACE:
594 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
595 ph10 178 case OP_NOT_VSPACE:
596 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597 nigel 93 default: break;
598     }
599 nigel 77 }
600     }
601     else
602     {
603     dlen = 0; /* Not strictly necessary, but compilers moan */
604 nigel 93 d = NOTACHAR; /* if these variables are not set. */
605 nigel 77 }
606    
607    
608     /* Now process the individual opcodes */
609    
610     switch (codevalue)
611     {
612    
613     /* ========================================================================== */
614     /* Reached a closing bracket. If not at the end of the pattern, carry
615     on with the next opcode. Otherwise, unless we have an empty string and
616     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
617     matches so we always have the longest first. */
618    
619     case OP_KET:
620     case OP_KETRMIN:
621     case OP_KETRMAX:
622     if (code != end_code)
623     {
624     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
625     if (codevalue != OP_KET)
626     {
627     ADD_ACTIVE(state_offset - GET(code, 1), 0);
628     }
629     }
630     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
631     {
632     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
633     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
634     match_count = 0;
635     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
636     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
637     if (offsetcount >= 2)
638     {
639     offsets[0] = current_subject - start_subject;
640     offsets[1] = ptr - start_subject;
641     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
642     offsets[1] - offsets[0], current_subject));
643     }
644     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
645     {
646     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
647     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
648     match_count, rlevel*2-2, SP));
649     return match_count;
650     }
651     }
652     break;
653    
654     /* ========================================================================== */
655     /* These opcodes add to the current list of states without looking
656     at the current character. */
657    
658     /*-----------------------------------------------------------------*/
659     case OP_ALT:
660     do { code += GET(code, 1); } while (*code == OP_ALT);
661     ADD_ACTIVE(code - start_code, 0);
662     break;
663    
664     /*-----------------------------------------------------------------*/
665     case OP_BRA:
666 nigel 93 case OP_SBRA:
667 nigel 77 do
668     {
669     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
670     code += GET(code, 1);
671     }
672     while (*code == OP_ALT);
673     break;
674    
675     /*-----------------------------------------------------------------*/
676 nigel 93 case OP_CBRA:
677     case OP_SCBRA:
678     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
679     code += GET(code, 1);
680     while (*code == OP_ALT)
681     {
682     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
683     code += GET(code, 1);
684     }
685     break;
686    
687     /*-----------------------------------------------------------------*/
688 nigel 77 case OP_BRAZERO:
689     case OP_BRAMINZERO:
690     ADD_ACTIVE(state_offset + 1, 0);
691     code += 1 + GET(code, 2);
692     while (*code == OP_ALT) code += GET(code, 1);
693     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
694     break;
695    
696     /*-----------------------------------------------------------------*/
697 ph10 335 case OP_SKIPZERO:
698     code += 1 + GET(code, 2);
699     while (*code == OP_ALT) code += GET(code, 1);
700     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
701     break;
702    
703     /*-----------------------------------------------------------------*/
704 nigel 77 case OP_CIRC:
705     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
706 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
707     ptr != end_subject &&
708 nigel 93 WAS_NEWLINE(ptr)))
709 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
710     break;
711    
712     /*-----------------------------------------------------------------*/
713     case OP_EOD:
714     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
715     break;
716    
717     /*-----------------------------------------------------------------*/
718     case OP_OPT:
719     ims = code[1];
720     ADD_ACTIVE(state_offset + 2, 0);
721     break;
722    
723     /*-----------------------------------------------------------------*/
724     case OP_SOD:
725     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
726     break;
727    
728     /*-----------------------------------------------------------------*/
729     case OP_SOM:
730     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
731     break;
732    
733    
734     /* ========================================================================== */
735     /* These opcodes inspect the next subject character, and sometimes
736     the previous one as well, but do not have an argument. The variable
737     clen contains the length of the current character and is zero if we are
738     at the end of the subject. */
739    
740     /*-----------------------------------------------------------------*/
741     case OP_ANY:
742 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
743 nigel 77 { ADD_NEW(state_offset + 1, 0); }
744     break;
745    
746     /*-----------------------------------------------------------------*/
747     case OP_EODN:
748 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
749 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
750     break;
751    
752     /*-----------------------------------------------------------------*/
753     case OP_DOLL:
754     if ((md->moptions & PCRE_NOTEOL) == 0)
755     {
756 nigel 91 if (clen == 0 ||
757 nigel 93 (IS_NEWLINE(ptr) &&
758 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
759     ))
760 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
761     }
762 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
763 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
764     break;
765    
766     /*-----------------------------------------------------------------*/
767    
768     case OP_DIGIT:
769     case OP_WHITESPACE:
770     case OP_WORDCHAR:
771     if (clen > 0 && c < 256 &&
772     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
773     { ADD_NEW(state_offset + 1, 0); }
774     break;
775    
776     /*-----------------------------------------------------------------*/
777     case OP_NOT_DIGIT:
778     case OP_NOT_WHITESPACE:
779     case OP_NOT_WORDCHAR:
780     if (clen > 0 && (c >= 256 ||
781     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
782     { ADD_NEW(state_offset + 1, 0); }
783     break;
784    
785     /*-----------------------------------------------------------------*/
786     case OP_WORD_BOUNDARY:
787     case OP_NOT_WORD_BOUNDARY:
788     {
789     int left_word, right_word;
790    
791     if (ptr > start_subject)
792     {
793     const uschar *temp = ptr - 1;
794     #ifdef SUPPORT_UTF8
795     if (utf8) BACKCHAR(temp);
796     #endif
797     GETCHARTEST(d, temp);
798     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
799     }
800     else left_word = 0;
801    
802     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
803     else right_word = 0;
804    
805     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
806     { ADD_ACTIVE(state_offset + 1, 0); }
807     }
808     break;
809    
810    
811     /*-----------------------------------------------------------------*/
812     /* Check the next character by Unicode property. We will get here only
813     if the support is in the binary; otherwise a compile-time error occurs.
814     */
815    
816 ph10 151 #ifdef SUPPORT_UCP
817 nigel 77 case OP_PROP:
818     case OP_NOTPROP:
819     if (clen > 0)
820     {
821 nigel 87 BOOL OK;
822     int category = _pcre_ucp_findprop(c, &chartype, &script);
823     switch(code[1])
824 nigel 77 {
825 nigel 87 case PT_ANY:
826     OK = TRUE;
827     break;
828    
829     case PT_LAMP:
830     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
831     break;
832    
833     case PT_GC:
834     OK = category == code[2];
835     break;
836    
837     case PT_PC:
838     OK = chartype == code[2];
839     break;
840    
841     case PT_SC:
842     OK = script == code[2];
843     break;
844    
845     /* Should never occur, but keep compilers from grumbling. */
846    
847     default:
848     OK = codevalue != OP_PROP;
849     break;
850 nigel 77 }
851 nigel 87
852     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
853 nigel 77 }
854     break;
855     #endif
856    
857    
858    
859     /* ========================================================================== */
860     /* These opcodes likewise inspect the subject character, but have an
861     argument that is not a data character. It is one of these opcodes:
862     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
863     OP_NOT_WORDCHAR. The value is loaded into d. */
864    
865     case OP_TYPEPLUS:
866     case OP_TYPEMINPLUS:
867 nigel 93 case OP_TYPEPOSPLUS:
868 nigel 77 count = current_state->count; /* Already matched */
869     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
870     if (clen > 0)
871     {
872     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
873     (c < 256 &&
874 nigel 91 (d != OP_ANY ||
875     (ims & PCRE_DOTALL) != 0 ||
876     !IS_NEWLINE(ptr)
877     ) &&
878 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
879     {
880 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
881     {
882     active_count--; /* Remove non-match possibility */
883     next_active_state--;
884     }
885 nigel 77 count++;
886     ADD_NEW(state_offset, count);
887     }
888     }
889     break;
890    
891     /*-----------------------------------------------------------------*/
892     case OP_TYPEQUERY:
893     case OP_TYPEMINQUERY:
894 nigel 93 case OP_TYPEPOSQUERY:
895 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
896     if (clen > 0)
897     {
898     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
899     (c < 256 &&
900 nigel 91 (d != OP_ANY ||
901     (ims & PCRE_DOTALL) != 0 ||
902     !IS_NEWLINE(ptr)
903     ) &&
904 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
905     {
906 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
907     {
908     active_count--; /* Remove non-match possibility */
909     next_active_state--;
910     }
911 nigel 77 ADD_NEW(state_offset + 2, 0);
912     }
913     }
914     break;
915    
916     /*-----------------------------------------------------------------*/
917     case OP_TYPESTAR:
918     case OP_TYPEMINSTAR:
919 nigel 93 case OP_TYPEPOSSTAR:
920 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
921     if (clen > 0)
922     {
923     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
924     (c < 256 &&
925 nigel 91 (d != OP_ANY ||
926     (ims & PCRE_DOTALL) != 0 ||
927     !IS_NEWLINE(ptr)
928     ) &&
929 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
930     {
931 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
932     {
933     active_count--; /* Remove non-match possibility */
934     next_active_state--;
935     }
936 nigel 77 ADD_NEW(state_offset, 0);
937     }
938     }
939     break;
940    
941     /*-----------------------------------------------------------------*/
942     case OP_TYPEEXACT:
943 nigel 93 count = current_state->count; /* Number already matched */
944     if (clen > 0)
945     {
946     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
947     (c < 256 &&
948     (d != OP_ANY ||
949     (ims & PCRE_DOTALL) != 0 ||
950     !IS_NEWLINE(ptr)
951     ) &&
952     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
953     {
954     if (++count >= GET2(code, 1))
955     { ADD_NEW(state_offset + 4, 0); }
956     else
957     { ADD_NEW(state_offset, count); }
958     }
959     }
960     break;
961    
962     /*-----------------------------------------------------------------*/
963 nigel 77 case OP_TYPEUPTO:
964     case OP_TYPEMINUPTO:
965 nigel 93 case OP_TYPEPOSUPTO:
966     ADD_ACTIVE(state_offset + 4, 0);
967 nigel 77 count = current_state->count; /* Number already matched */
968     if (clen > 0)
969     {
970     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
971     (c < 256 &&
972 nigel 91 (d != OP_ANY ||
973     (ims & PCRE_DOTALL) != 0 ||
974     !IS_NEWLINE(ptr)
975     ) &&
976 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
977     {
978 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
979     {
980     active_count--; /* Remove non-match possibility */
981     next_active_state--;
982     }
983 nigel 77 if (++count >= GET2(code, 1))
984     { ADD_NEW(state_offset + 4, 0); }
985     else
986     { ADD_NEW(state_offset, count); }
987     }
988     }
989     break;
990    
991     /* ========================================================================== */
992     /* These are virtual opcodes that are used when something like
993 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE,
994     OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its argument. It keeps the
995     code above fast for the other cases. The argument is in the d variable. */
996 nigel 77
997 ph10 151 #ifdef SUPPORT_UCP
998 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
999     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1000 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1001 nigel 77 count = current_state->count; /* Already matched */
1002 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1003 nigel 77 if (clen > 0)
1004     {
1005 nigel 87 BOOL OK;
1006     int category = _pcre_ucp_findprop(c, &chartype, &script);
1007     switch(code[2])
1008     {
1009     case PT_ANY:
1010     OK = TRUE;
1011     break;
1012    
1013     case PT_LAMP:
1014     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1015     break;
1016    
1017     case PT_GC:
1018     OK = category == code[3];
1019     break;
1020    
1021     case PT_PC:
1022     OK = chartype == code[3];
1023     break;
1024    
1025     case PT_SC:
1026     OK = script == code[3];
1027     break;
1028    
1029     /* Should never occur, but keep compilers from grumbling. */
1030    
1031     default:
1032     OK = codevalue != OP_PROP;
1033     break;
1034     }
1035    
1036 nigel 93 if (OK == (d == OP_PROP))
1037     {
1038     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1039     {
1040     active_count--; /* Remove non-match possibility */
1041     next_active_state--;
1042     }
1043     count++;
1044     ADD_NEW(state_offset, count);
1045     }
1046 nigel 77 }
1047     break;
1048    
1049     /*-----------------------------------------------------------------*/
1050     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1051     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1052 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1053 nigel 77 count = current_state->count; /* Already matched */
1054     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1055 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1056 nigel 77 {
1057     const uschar *nptr = ptr + clen;
1058     int ncount = 0;
1059 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1060     {
1061     active_count--; /* Remove non-match possibility */
1062     next_active_state--;
1063     }
1064 nigel 77 while (nptr < end_subject)
1065     {
1066     int nd;
1067     int ndlen = 1;
1068     GETCHARLEN(nd, nptr, ndlen);
1069 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1070 nigel 77 ncount++;
1071     nptr += ndlen;
1072     }
1073     count++;
1074     ADD_NEW_DATA(-state_offset, count, ncount);
1075     }
1076     break;
1077 ph10 151 #endif
1078 nigel 77
1079     /*-----------------------------------------------------------------*/
1080 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1081     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1082     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1083     count = current_state->count; /* Already matched */
1084     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1085     if (clen > 0)
1086     {
1087     int ncount = 0;
1088     switch (c)
1089     {
1090     case 0x000b:
1091     case 0x000c:
1092     case 0x0085:
1093     case 0x2028:
1094     case 0x2029:
1095 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1096     goto ANYNL01;
1097    
1098     case 0x000d:
1099     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1100     /* Fall through */
1101    
1102     ANYNL01:
1103     case 0x000a:
1104 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1105     {
1106     active_count--; /* Remove non-match possibility */
1107     next_active_state--;
1108     }
1109     count++;
1110     ADD_NEW_DATA(-state_offset, count, ncount);
1111     break;
1112 ph10 231
1113 nigel 93 default:
1114     break;
1115     }
1116     }
1117     break;
1118    
1119     /*-----------------------------------------------------------------*/
1120 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1121     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1122     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1123     count = current_state->count; /* Already matched */
1124     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1125     if (clen > 0)
1126     {
1127 ph10 182 BOOL OK;
1128 ph10 178 switch (c)
1129     {
1130     case 0x000a:
1131     case 0x000b:
1132     case 0x000c:
1133     case 0x000d:
1134     case 0x0085:
1135     case 0x2028:
1136     case 0x2029:
1137     OK = TRUE;
1138 ph10 182 break;
1139 ph10 178
1140     default:
1141     OK = FALSE;
1142 ph10 182 break;
1143 ph10 178 }
1144    
1145     if (OK == (d == OP_VSPACE))
1146 ph10 182 {
1147 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1148     {
1149     active_count--; /* Remove non-match possibility */
1150     next_active_state--;
1151     }
1152     count++;
1153     ADD_NEW_DATA(-state_offset, count, 0);
1154     }
1155     }
1156     break;
1157    
1158     /*-----------------------------------------------------------------*/
1159     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1160     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1161     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1162     count = current_state->count; /* Already matched */
1163     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1164     if (clen > 0)
1165     {
1166 ph10 182 BOOL OK;
1167 ph10 178 switch (c)
1168     {
1169     case 0x09: /* HT */
1170     case 0x20: /* SPACE */
1171     case 0xa0: /* NBSP */
1172     case 0x1680: /* OGHAM SPACE MARK */
1173     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1174     case 0x2000: /* EN QUAD */
1175     case 0x2001: /* EM QUAD */
1176     case 0x2002: /* EN SPACE */
1177     case 0x2003: /* EM SPACE */
1178     case 0x2004: /* THREE-PER-EM SPACE */
1179     case 0x2005: /* FOUR-PER-EM SPACE */
1180     case 0x2006: /* SIX-PER-EM SPACE */
1181     case 0x2007: /* FIGURE SPACE */
1182     case 0x2008: /* PUNCTUATION SPACE */
1183     case 0x2009: /* THIN SPACE */
1184     case 0x200A: /* HAIR SPACE */
1185     case 0x202f: /* NARROW NO-BREAK SPACE */
1186     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1187     case 0x3000: /* IDEOGRAPHIC SPACE */
1188     OK = TRUE;
1189     break;
1190 ph10 182
1191 ph10 178 default:
1192     OK = FALSE;
1193     break;
1194     }
1195 ph10 182
1196 ph10 178 if (OK == (d == OP_HSPACE))
1197 ph10 182 {
1198 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1199     {
1200     active_count--; /* Remove non-match possibility */
1201     next_active_state--;
1202     }
1203     count++;
1204     ADD_NEW_DATA(-state_offset, count, 0);
1205     }
1206     }
1207     break;
1208    
1209     /*-----------------------------------------------------------------*/
1210 ph10 151 #ifdef SUPPORT_UCP
1211 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1212     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1213 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1214 nigel 87 count = 4;
1215 nigel 77 goto QS1;
1216    
1217     case OP_PROP_EXTRA + OP_TYPESTAR:
1218     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1219 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1220 nigel 77 count = 0;
1221    
1222     QS1:
1223    
1224 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1225 nigel 77 if (clen > 0)
1226     {
1227 nigel 87 BOOL OK;
1228     int category = _pcre_ucp_findprop(c, &chartype, &script);
1229     switch(code[2])
1230     {
1231     case PT_ANY:
1232     OK = TRUE;
1233     break;
1234    
1235     case PT_LAMP:
1236     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1237     break;
1238    
1239     case PT_GC:
1240     OK = category == code[3];
1241     break;
1242    
1243     case PT_PC:
1244     OK = chartype == code[3];
1245     break;
1246    
1247     case PT_SC:
1248     OK = script == code[3];
1249     break;
1250    
1251     /* Should never occur, but keep compilers from grumbling. */
1252    
1253     default:
1254     OK = codevalue != OP_PROP;
1255     break;
1256     }
1257    
1258 nigel 93 if (OK == (d == OP_PROP))
1259     {
1260     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1261     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1262     {
1263     active_count--; /* Remove non-match possibility */
1264     next_active_state--;
1265     }
1266     ADD_NEW(state_offset + count, 0);
1267     }
1268 nigel 77 }
1269     break;
1270    
1271     /*-----------------------------------------------------------------*/
1272     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1273     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1274 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1275 nigel 77 count = 2;
1276     goto QS2;
1277    
1278     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1279     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1280 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1281 nigel 77 count = 0;
1282    
1283     QS2:
1284    
1285     ADD_ACTIVE(state_offset + 2, 0);
1286 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1287 nigel 77 {
1288     const uschar *nptr = ptr + clen;
1289     int ncount = 0;
1290 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1291     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1292     {
1293     active_count--; /* Remove non-match possibility */
1294     next_active_state--;
1295     }
1296 nigel 77 while (nptr < end_subject)
1297     {
1298     int nd;
1299     int ndlen = 1;
1300     GETCHARLEN(nd, nptr, ndlen);
1301 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1302 nigel 77 ncount++;
1303     nptr += ndlen;
1304     }
1305     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1306     }
1307     break;
1308 ph10 151 #endif
1309 nigel 77
1310     /*-----------------------------------------------------------------*/
1311 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1312     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1313     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1314     count = 2;
1315     goto QS3;
1316    
1317     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1318     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1319     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1320     count = 0;
1321    
1322     QS3:
1323     ADD_ACTIVE(state_offset + 2, 0);
1324     if (clen > 0)
1325     {
1326     int ncount = 0;
1327     switch (c)
1328     {
1329     case 0x000b:
1330     case 0x000c:
1331     case 0x0085:
1332     case 0x2028:
1333     case 0x2029:
1334 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1335     goto ANYNL02;
1336    
1337     case 0x000d:
1338     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1339     /* Fall through */
1340    
1341     ANYNL02:
1342     case 0x000a:
1343 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1344     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1345     {
1346     active_count--; /* Remove non-match possibility */
1347     next_active_state--;
1348     }
1349     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1350     break;
1351 ph10 231
1352 nigel 93 default:
1353     break;
1354     }
1355     }
1356     break;
1357    
1358     /*-----------------------------------------------------------------*/
1359 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1360     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1361     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1362     count = 2;
1363     goto QS4;
1364    
1365     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1366     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1367     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1368     count = 0;
1369    
1370     QS4:
1371     ADD_ACTIVE(state_offset + 2, 0);
1372     if (clen > 0)
1373     {
1374 ph10 182 BOOL OK;
1375 ph10 178 switch (c)
1376     {
1377     case 0x000a:
1378     case 0x000b:
1379     case 0x000c:
1380     case 0x000d:
1381     case 0x0085:
1382     case 0x2028:
1383     case 0x2029:
1384     OK = TRUE;
1385     break;
1386 ph10 182
1387 ph10 178 default:
1388     OK = FALSE;
1389     break;
1390     }
1391     if (OK == (d == OP_VSPACE))
1392 ph10 182 {
1393 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1394     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1395     {
1396     active_count--; /* Remove non-match possibility */
1397     next_active_state--;
1398     }
1399     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1400     }
1401     }
1402     break;
1403    
1404     /*-----------------------------------------------------------------*/
1405     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1406     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1407     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1408     count = 2;
1409     goto QS5;
1410    
1411     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1412     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1413     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1414     count = 0;
1415    
1416     QS5:
1417     ADD_ACTIVE(state_offset + 2, 0);
1418     if (clen > 0)
1419     {
1420 ph10 182 BOOL OK;
1421 ph10 178 switch (c)
1422     {
1423     case 0x09: /* HT */
1424     case 0x20: /* SPACE */
1425     case 0xa0: /* NBSP */
1426     case 0x1680: /* OGHAM SPACE MARK */
1427     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1428     case 0x2000: /* EN QUAD */
1429     case 0x2001: /* EM QUAD */
1430     case 0x2002: /* EN SPACE */
1431     case 0x2003: /* EM SPACE */
1432     case 0x2004: /* THREE-PER-EM SPACE */
1433     case 0x2005: /* FOUR-PER-EM SPACE */
1434     case 0x2006: /* SIX-PER-EM SPACE */
1435     case 0x2007: /* FIGURE SPACE */
1436     case 0x2008: /* PUNCTUATION SPACE */
1437     case 0x2009: /* THIN SPACE */
1438     case 0x200A: /* HAIR SPACE */
1439     case 0x202f: /* NARROW NO-BREAK SPACE */
1440     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1441     case 0x3000: /* IDEOGRAPHIC SPACE */
1442     OK = TRUE;
1443     break;
1444 ph10 182
1445 ph10 178 default:
1446     OK = FALSE;
1447     break;
1448     }
1449 ph10 182
1450 ph10 178 if (OK == (d == OP_HSPACE))
1451 ph10 182 {
1452 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1453     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1454     {
1455     active_count--; /* Remove non-match possibility */
1456     next_active_state--;
1457     }
1458     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1459     }
1460     }
1461     break;
1462    
1463     /*-----------------------------------------------------------------*/
1464 ph10 151 #ifdef SUPPORT_UCP
1465 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1466     case OP_PROP_EXTRA + OP_TYPEUPTO:
1467     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1468 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1469 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1470 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1471 nigel 77 count = current_state->count; /* Number already matched */
1472     if (clen > 0)
1473     {
1474 nigel 87 BOOL OK;
1475     int category = _pcre_ucp_findprop(c, &chartype, &script);
1476     switch(code[4])
1477 nigel 77 {
1478 nigel 87 case PT_ANY:
1479     OK = TRUE;
1480     break;
1481    
1482     case PT_LAMP:
1483     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1484     break;
1485    
1486     case PT_GC:
1487     OK = category == code[5];
1488     break;
1489    
1490     case PT_PC:
1491     OK = chartype == code[5];
1492     break;
1493    
1494     case PT_SC:
1495     OK = script == code[5];
1496     break;
1497    
1498     /* Should never occur, but keep compilers from grumbling. */
1499    
1500     default:
1501     OK = codevalue != OP_PROP;
1502     break;
1503     }
1504    
1505     if (OK == (d == OP_PROP))
1506     {
1507 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1508     {
1509     active_count--; /* Remove non-match possibility */
1510     next_active_state--;
1511     }
1512 nigel 77 if (++count >= GET2(code, 1))
1513 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1514 nigel 77 else
1515     { ADD_NEW(state_offset, count); }
1516     }
1517     }
1518     break;
1519    
1520     /*-----------------------------------------------------------------*/
1521     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1522     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1523     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1524 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1525 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1526     { ADD_ACTIVE(state_offset + 4, 0); }
1527     count = current_state->count; /* Number already matched */
1528 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1529 nigel 77 {
1530     const uschar *nptr = ptr + clen;
1531     int ncount = 0;
1532 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1533     {
1534     active_count--; /* Remove non-match possibility */
1535     next_active_state--;
1536     }
1537 nigel 77 while (nptr < end_subject)
1538     {
1539     int nd;
1540     int ndlen = 1;
1541     GETCHARLEN(nd, nptr, ndlen);
1542 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1543 nigel 77 ncount++;
1544     nptr += ndlen;
1545     }
1546     if (++count >= GET2(code, 1))
1547     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1548     else
1549     { ADD_NEW_DATA(-state_offset, count, ncount); }
1550     }
1551     break;
1552 ph10 151 #endif
1553 nigel 77
1554 nigel 93 /*-----------------------------------------------------------------*/
1555     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1556     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1557     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1558     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1559     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1560     { ADD_ACTIVE(state_offset + 4, 0); }
1561     count = current_state->count; /* Number already matched */
1562     if (clen > 0)
1563     {
1564     int ncount = 0;
1565     switch (c)
1566     {
1567     case 0x000b:
1568     case 0x000c:
1569     case 0x0085:
1570     case 0x2028:
1571     case 0x2029:
1572 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1573     goto ANYNL03;
1574    
1575     case 0x000d:
1576     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1577     /* Fall through */
1578    
1579     ANYNL03:
1580     case 0x000a:
1581 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1582     {
1583     active_count--; /* Remove non-match possibility */
1584     next_active_state--;
1585     }
1586     if (++count >= GET2(code, 1))
1587     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1588     else
1589     { ADD_NEW_DATA(-state_offset, count, ncount); }
1590     break;
1591 ph10 231
1592 nigel 93 default:
1593     break;
1594     }
1595     }
1596     break;
1597    
1598 ph10 178 /*-----------------------------------------------------------------*/
1599     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1600     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1601     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1602     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1603     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1604     { ADD_ACTIVE(state_offset + 4, 0); }
1605     count = current_state->count; /* Number already matched */
1606     if (clen > 0)
1607     {
1608 ph10 182 BOOL OK;
1609 ph10 178 switch (c)
1610     {
1611     case 0x000a:
1612     case 0x000b:
1613     case 0x000c:
1614     case 0x000d:
1615     case 0x0085:
1616     case 0x2028:
1617     case 0x2029:
1618     OK = TRUE;
1619     break;
1620 ph10 182
1621 ph10 178 default:
1622     OK = FALSE;
1623     }
1624 ph10 182
1625 ph10 178 if (OK == (d == OP_VSPACE))
1626 ph10 182 {
1627 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1628     {
1629     active_count--; /* Remove non-match possibility */
1630     next_active_state--;
1631     }
1632     if (++count >= GET2(code, 1))
1633     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1634     else
1635     { ADD_NEW_DATA(-state_offset, count, 0); }
1636     }
1637     }
1638     break;
1639    
1640     /*-----------------------------------------------------------------*/
1641     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1642     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1643     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1644     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1645     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1646     { ADD_ACTIVE(state_offset + 4, 0); }
1647     count = current_state->count; /* Number already matched */
1648     if (clen > 0)
1649     {
1650 ph10 182 BOOL OK;
1651 ph10 178 switch (c)
1652     {
1653     case 0x09: /* HT */
1654     case 0x20: /* SPACE */
1655     case 0xa0: /* NBSP */
1656     case 0x1680: /* OGHAM SPACE MARK */
1657     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1658     case 0x2000: /* EN QUAD */
1659     case 0x2001: /* EM QUAD */
1660     case 0x2002: /* EN SPACE */
1661     case 0x2003: /* EM SPACE */
1662     case 0x2004: /* THREE-PER-EM SPACE */
1663     case 0x2005: /* FOUR-PER-EM SPACE */
1664     case 0x2006: /* SIX-PER-EM SPACE */
1665     case 0x2007: /* FIGURE SPACE */
1666     case 0x2008: /* PUNCTUATION SPACE */
1667     case 0x2009: /* THIN SPACE */
1668     case 0x200A: /* HAIR SPACE */
1669     case 0x202f: /* NARROW NO-BREAK SPACE */
1670     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1671     case 0x3000: /* IDEOGRAPHIC SPACE */
1672     OK = TRUE;
1673     break;
1674 ph10 182
1675 ph10 178 default:
1676     OK = FALSE;
1677     break;
1678     }
1679 ph10 182
1680 ph10 178 if (OK == (d == OP_HSPACE))
1681 ph10 182 {
1682 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1683     {
1684     active_count--; /* Remove non-match possibility */
1685     next_active_state--;
1686     }
1687     if (++count >= GET2(code, 1))
1688     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1689     else
1690     { ADD_NEW_DATA(-state_offset, count, 0); }
1691     }
1692     }
1693     break;
1694    
1695 nigel 77 /* ========================================================================== */
1696     /* These opcodes are followed by a character that is usually compared
1697     to the current subject character; it is loaded into d. We still get
1698     here even if there is no subject character, because in some cases zero
1699     repetitions are permitted. */
1700    
1701     /*-----------------------------------------------------------------*/
1702     case OP_CHAR:
1703     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1704     break;
1705    
1706     /*-----------------------------------------------------------------*/
1707     case OP_CHARNC:
1708     if (clen == 0) break;
1709    
1710     #ifdef SUPPORT_UTF8
1711     if (utf8)
1712     {
1713     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1714     {
1715 nigel 93 unsigned int othercase;
1716 nigel 77 if (c < 128) othercase = fcc[c]; else
1717    
1718     /* If we have Unicode property support, we can use it to test the
1719 nigel 87 other case of the character. */
1720 nigel 77
1721     #ifdef SUPPORT_UCP
1722 nigel 87 othercase = _pcre_ucp_othercase(c);
1723     #else
1724 nigel 93 othercase = NOTACHAR;
1725 nigel 77 #endif
1726    
1727     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1728     }
1729     }
1730     else
1731     #endif /* SUPPORT_UTF8 */
1732    
1733     /* Non-UTF-8 mode */
1734     {
1735     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1736     }
1737     break;
1738    
1739    
1740     #ifdef SUPPORT_UCP
1741     /*-----------------------------------------------------------------*/
1742     /* This is a tricky one because it can match more than one character.
1743     Find out how many characters to skip, and then set up a negative state
1744     to wait for them to pass before continuing. */
1745    
1746     case OP_EXTUNI:
1747 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1748 nigel 77 {
1749     const uschar *nptr = ptr + clen;
1750     int ncount = 0;
1751     while (nptr < end_subject)
1752     {
1753     int nclen = 1;
1754     GETCHARLEN(c, nptr, nclen);
1755 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1756 nigel 77 ncount++;
1757     nptr += nclen;
1758     }
1759     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1760     }
1761     break;
1762     #endif
1763    
1764     /*-----------------------------------------------------------------*/
1765 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1766     character (when CR is followed by LF). In this case, set up a negative
1767     state to wait for one character to pass before continuing. */
1768    
1769     case OP_ANYNL:
1770     if (clen > 0) switch(c)
1771     {
1772     case 0x000b:
1773     case 0x000c:
1774     case 0x0085:
1775     case 0x2028:
1776     case 0x2029:
1777 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1778    
1779     case 0x000a:
1780 nigel 93 ADD_NEW(state_offset + 1, 0);
1781     break;
1782 ph10 231
1783 nigel 93 case 0x000d:
1784     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1785     {
1786     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1787     }
1788     else
1789     {
1790     ADD_NEW(state_offset + 1, 0);
1791     }
1792     break;
1793     }
1794     break;
1795    
1796     /*-----------------------------------------------------------------*/
1797 ph10 178 case OP_NOT_VSPACE:
1798     if (clen > 0) switch(c)
1799     {
1800     case 0x000a:
1801     case 0x000b:
1802     case 0x000c:
1803     case 0x000d:
1804     case 0x0085:
1805     case 0x2028:
1806     case 0x2029:
1807     break;
1808 ph10 182
1809     default:
1810 ph10 178 ADD_NEW(state_offset + 1, 0);
1811     break;
1812     }
1813     break;
1814    
1815     /*-----------------------------------------------------------------*/
1816     case OP_VSPACE:
1817     if (clen > 0) switch(c)
1818     {
1819     case 0x000a:
1820     case 0x000b:
1821     case 0x000c:
1822     case 0x000d:
1823     case 0x0085:
1824     case 0x2028:
1825     case 0x2029:
1826     ADD_NEW(state_offset + 1, 0);
1827     break;
1828 ph10 182
1829 ph10 178 default: break;
1830     }
1831     break;
1832    
1833     /*-----------------------------------------------------------------*/
1834     case OP_NOT_HSPACE:
1835     if (clen > 0) switch(c)
1836     {
1837     case 0x09: /* HT */
1838     case 0x20: /* SPACE */
1839     case 0xa0: /* NBSP */
1840     case 0x1680: /* OGHAM SPACE MARK */
1841     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1842     case 0x2000: /* EN QUAD */
1843     case 0x2001: /* EM QUAD */
1844     case 0x2002: /* EN SPACE */
1845     case 0x2003: /* EM SPACE */
1846     case 0x2004: /* THREE-PER-EM SPACE */
1847     case 0x2005: /* FOUR-PER-EM SPACE */
1848     case 0x2006: /* SIX-PER-EM SPACE */
1849     case 0x2007: /* FIGURE SPACE */
1850     case 0x2008: /* PUNCTUATION SPACE */
1851     case 0x2009: /* THIN SPACE */
1852     case 0x200A: /* HAIR SPACE */
1853     case 0x202f: /* NARROW NO-BREAK SPACE */
1854     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1855     case 0x3000: /* IDEOGRAPHIC SPACE */
1856     break;
1857 ph10 182
1858     default:
1859 ph10 178 ADD_NEW(state_offset + 1, 0);
1860     break;
1861     }
1862     break;
1863    
1864     /*-----------------------------------------------------------------*/
1865     case OP_HSPACE:
1866     if (clen > 0) switch(c)
1867     {
1868     case 0x09: /* HT */
1869     case 0x20: /* SPACE */
1870     case 0xa0: /* NBSP */
1871     case 0x1680: /* OGHAM SPACE MARK */
1872     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1873     case 0x2000: /* EN QUAD */
1874     case 0x2001: /* EM QUAD */
1875     case 0x2002: /* EN SPACE */
1876     case 0x2003: /* EM SPACE */
1877     case 0x2004: /* THREE-PER-EM SPACE */
1878     case 0x2005: /* FOUR-PER-EM SPACE */
1879     case 0x2006: /* SIX-PER-EM SPACE */
1880     case 0x2007: /* FIGURE SPACE */
1881     case 0x2008: /* PUNCTUATION SPACE */
1882     case 0x2009: /* THIN SPACE */
1883     case 0x200A: /* HAIR SPACE */
1884     case 0x202f: /* NARROW NO-BREAK SPACE */
1885     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1886     case 0x3000: /* IDEOGRAPHIC SPACE */
1887     ADD_NEW(state_offset + 1, 0);
1888     break;
1889     }
1890     break;
1891    
1892     /*-----------------------------------------------------------------*/
1893 nigel 77 /* Match a negated single character. This is only used for one-byte
1894     characters, that is, we know that d < 256. The character we are
1895     checking (c) can be multibyte. */
1896    
1897     case OP_NOT:
1898     if (clen > 0)
1899     {
1900 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1901 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1902     }
1903     break;
1904    
1905     /*-----------------------------------------------------------------*/
1906     case OP_PLUS:
1907     case OP_MINPLUS:
1908 nigel 93 case OP_POSPLUS:
1909 nigel 77 case OP_NOTPLUS:
1910     case OP_NOTMINPLUS:
1911 nigel 93 case OP_NOTPOSPLUS:
1912 nigel 77 count = current_state->count; /* Already matched */
1913     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1914     if (clen > 0)
1915     {
1916 nigel 93 unsigned int otherd = NOTACHAR;
1917 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1918     {
1919     #ifdef SUPPORT_UTF8
1920 nigel 87 if (utf8 && d >= 128)
1921 nigel 77 {
1922     #ifdef SUPPORT_UCP
1923 nigel 87 otherd = _pcre_ucp_othercase(d);
1924 nigel 77 #endif /* SUPPORT_UCP */
1925     }
1926     else
1927     #endif /* SUPPORT_UTF8 */
1928     otherd = fcc[d];
1929     }
1930     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1931 nigel 93 {
1932     if (count > 0 &&
1933     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1934     {
1935     active_count--; /* Remove non-match possibility */
1936     next_active_state--;
1937     }
1938     count++;
1939     ADD_NEW(state_offset, count);
1940     }
1941 nigel 77 }
1942     break;
1943    
1944     /*-----------------------------------------------------------------*/
1945     case OP_QUERY:
1946     case OP_MINQUERY:
1947 nigel 93 case OP_POSQUERY:
1948 nigel 77 case OP_NOTQUERY:
1949     case OP_NOTMINQUERY:
1950 nigel 93 case OP_NOTPOSQUERY:
1951 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1952     if (clen > 0)
1953     {
1954 nigel 93 unsigned int otherd = NOTACHAR;
1955 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1956 nigel 77 {
1957     #ifdef SUPPORT_UTF8
1958 nigel 87 if (utf8 && d >= 128)
1959 nigel 77 {
1960     #ifdef SUPPORT_UCP
1961 nigel 87 otherd = _pcre_ucp_othercase(d);
1962 nigel 77 #endif /* SUPPORT_UCP */
1963     }
1964     else
1965     #endif /* SUPPORT_UTF8 */
1966     otherd = fcc[d];
1967     }
1968     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1969 nigel 93 {
1970     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1971     {
1972     active_count--; /* Remove non-match possibility */
1973     next_active_state--;
1974     }
1975     ADD_NEW(state_offset + dlen + 1, 0);
1976     }
1977 nigel 77 }
1978     break;
1979    
1980     /*-----------------------------------------------------------------*/
1981     case OP_STAR:
1982     case OP_MINSTAR:
1983 nigel 93 case OP_POSSTAR:
1984 nigel 77 case OP_NOTSTAR:
1985     case OP_NOTMINSTAR:
1986 nigel 93 case OP_NOTPOSSTAR:
1987 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1988     if (clen > 0)
1989     {
1990 nigel 93 unsigned int otherd = NOTACHAR;
1991 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1992 nigel 77 {
1993     #ifdef SUPPORT_UTF8
1994 nigel 87 if (utf8 && d >= 128)
1995 nigel 77 {
1996     #ifdef SUPPORT_UCP
1997 nigel 87 otherd = _pcre_ucp_othercase(d);
1998 nigel 77 #endif /* SUPPORT_UCP */
1999     }
2000     else
2001     #endif /* SUPPORT_UTF8 */
2002     otherd = fcc[d];
2003     }
2004     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2005 nigel 93 {
2006     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2007     {
2008     active_count--; /* Remove non-match possibility */
2009     next_active_state--;
2010     }
2011     ADD_NEW(state_offset, 0);
2012     }
2013 nigel 77 }
2014     break;
2015    
2016     /*-----------------------------------------------------------------*/
2017     case OP_EXACT:
2018 nigel 93 case OP_NOTEXACT:
2019     count = current_state->count; /* Number already matched */
2020     if (clen > 0)
2021     {
2022     unsigned int otherd = NOTACHAR;
2023     if ((ims & PCRE_CASELESS) != 0)
2024     {
2025     #ifdef SUPPORT_UTF8
2026     if (utf8 && d >= 128)
2027     {
2028     #ifdef SUPPORT_UCP
2029     otherd = _pcre_ucp_othercase(d);
2030     #endif /* SUPPORT_UCP */
2031     }
2032     else
2033     #endif /* SUPPORT_UTF8 */
2034     otherd = fcc[d];
2035     }
2036     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2037     {
2038     if (++count >= GET2(code, 1))
2039     { ADD_NEW(state_offset + dlen + 3, 0); }
2040     else
2041     { ADD_NEW(state_offset, count); }
2042     }
2043     }
2044     break;
2045    
2046     /*-----------------------------------------------------------------*/
2047 nigel 77 case OP_UPTO:
2048     case OP_MINUPTO:
2049 nigel 93 case OP_POSUPTO:
2050 nigel 77 case OP_NOTUPTO:
2051     case OP_NOTMINUPTO:
2052 nigel 93 case OP_NOTPOSUPTO:
2053     ADD_ACTIVE(state_offset + dlen + 3, 0);
2054 nigel 77 count = current_state->count; /* Number already matched */
2055     if (clen > 0)
2056     {
2057 nigel 93 unsigned int otherd = NOTACHAR;
2058 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2059     {
2060     #ifdef SUPPORT_UTF8
2061 nigel 87 if (utf8 && d >= 128)
2062 nigel 77 {
2063     #ifdef SUPPORT_UCP
2064 nigel 87 otherd = _pcre_ucp_othercase(d);
2065 nigel 77 #endif /* SUPPORT_UCP */
2066     }
2067     else
2068     #endif /* SUPPORT_UTF8 */
2069     otherd = fcc[d];
2070     }
2071     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2072     {
2073 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2074     {
2075     active_count--; /* Remove non-match possibility */
2076     next_active_state--;
2077     }
2078 nigel 77 if (++count >= GET2(code, 1))
2079     { ADD_NEW(state_offset + dlen + 3, 0); }
2080     else
2081     { ADD_NEW(state_offset, count); }
2082     }
2083     }
2084     break;
2085    
2086    
2087     /* ========================================================================== */
2088     /* These are the class-handling opcodes */
2089    
2090     case OP_CLASS:
2091     case OP_NCLASS:
2092     case OP_XCLASS:
2093     {
2094     BOOL isinclass = FALSE;
2095     int next_state_offset;
2096     const uschar *ecode;
2097    
2098     /* For a simple class, there is always just a 32-byte table, and we
2099     can set isinclass from it. */
2100    
2101     if (codevalue != OP_XCLASS)
2102     {
2103     ecode = code + 33;
2104     if (clen > 0)
2105     {
2106     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2107     ((code[1 + c/8] & (1 << (c&7))) != 0);
2108     }
2109     }
2110    
2111     /* An extended class may have a table or a list of single characters,
2112     ranges, or both, and it may be positive or negative. There's a
2113     function that sorts all this out. */
2114    
2115     else
2116     {
2117     ecode = code + GET(code, 1);
2118     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2119     }
2120    
2121     /* At this point, isinclass is set for all kinds of class, and ecode
2122     points to the byte after the end of the class. If there is a
2123     quantifier, this is where it will be. */
2124    
2125     next_state_offset = ecode - start_code;
2126    
2127     switch (*ecode)
2128     {
2129     case OP_CRSTAR:
2130     case OP_CRMINSTAR:
2131     ADD_ACTIVE(next_state_offset + 1, 0);
2132     if (isinclass) { ADD_NEW(state_offset, 0); }
2133     break;
2134    
2135     case OP_CRPLUS:
2136     case OP_CRMINPLUS:
2137     count = current_state->count; /* Already matched */
2138     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2139     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2140     break;
2141    
2142     case OP_CRQUERY:
2143     case OP_CRMINQUERY:
2144     ADD_ACTIVE(next_state_offset + 1, 0);
2145     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2146     break;
2147    
2148     case OP_CRRANGE:
2149     case OP_CRMINRANGE:
2150     count = current_state->count; /* Already matched */
2151     if (count >= GET2(ecode, 1))
2152     { ADD_ACTIVE(next_state_offset + 5, 0); }
2153     if (isinclass)
2154     {
2155 nigel 91 int max = GET2(ecode, 3);
2156     if (++count >= max && max != 0) /* Max 0 => no limit */
2157 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2158     else
2159     { ADD_NEW(state_offset, count); }
2160     }
2161     break;
2162    
2163     default:
2164     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2165     break;
2166     }
2167     }
2168     break;
2169    
2170     /* ========================================================================== */
2171     /* These are the opcodes for fancy brackets of various kinds. We have
2172     to use recursion in order to handle them. */
2173    
2174     case OP_ASSERT:
2175     case OP_ASSERT_NOT:
2176     case OP_ASSERTBACK:
2177     case OP_ASSERTBACK_NOT:
2178     {
2179     int rc;
2180     int local_offsets[2];
2181     int local_workspace[1000];
2182     const uschar *endasscode = code + GET(code, 1);
2183    
2184     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2185    
2186     rc = internal_dfa_exec(
2187     md, /* static match data */
2188     code, /* this subexpression's code */
2189     ptr, /* where we currently are */
2190     ptr - start_subject, /* start offset */
2191     local_offsets, /* offset vector */
2192     sizeof(local_offsets)/sizeof(int), /* size of same */
2193     local_workspace, /* workspace vector */
2194     sizeof(local_workspace)/sizeof(int), /* size of same */
2195     ims, /* the current ims flags */
2196     rlevel, /* function recursion level */
2197     recursing); /* pass on regex recursion */
2198    
2199     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2200     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2201     }
2202     break;
2203    
2204     /*-----------------------------------------------------------------*/
2205     case OP_COND:
2206 nigel 93 case OP_SCOND:
2207 nigel 77 {
2208     int local_offsets[1000];
2209     int local_workspace[1000];
2210     int condcode = code[LINK_SIZE+1];
2211    
2212 nigel 93 /* Back reference conditions are not supported */
2213 nigel 77
2214 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2215    
2216     /* The DEFINE condition is always false */
2217    
2218     if (condcode == OP_DEF)
2219 nigel 77 {
2220 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2221     }
2222    
2223     /* The only supported version of OP_RREF is for the value RREF_ANY,
2224     which means "test if in any recursion". We can't test for specifically
2225     recursed groups. */
2226    
2227     else if (condcode == OP_RREF)
2228     {
2229 nigel 77 int value = GET2(code, LINK_SIZE+2);
2230 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2231 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2232     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2233     }
2234    
2235     /* Otherwise, the condition is an assertion */
2236    
2237     else
2238     {
2239     int rc;
2240     const uschar *asscode = code + LINK_SIZE + 1;
2241     const uschar *endasscode = asscode + GET(asscode, 1);
2242    
2243     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2244    
2245     rc = internal_dfa_exec(
2246     md, /* fixed match data */
2247     asscode, /* this subexpression's code */
2248     ptr, /* where we currently are */
2249     ptr - start_subject, /* start offset */
2250     local_offsets, /* offset vector */
2251     sizeof(local_offsets)/sizeof(int), /* size of same */
2252     local_workspace, /* workspace vector */
2253     sizeof(local_workspace)/sizeof(int), /* size of same */
2254     ims, /* the current ims flags */
2255     rlevel, /* function recursion level */
2256     recursing); /* pass on regex recursion */
2257    
2258     if ((rc >= 0) ==
2259     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2260     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2261     else
2262     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2263     }
2264     }
2265     break;
2266    
2267     /*-----------------------------------------------------------------*/
2268     case OP_RECURSE:
2269     {
2270     int local_offsets[1000];
2271     int local_workspace[1000];
2272     int rc;
2273    
2274     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2275     recursing + 1));
2276    
2277     rc = internal_dfa_exec(
2278     md, /* fixed match data */
2279     start_code + GET(code, 1), /* this subexpression's code */
2280     ptr, /* where we currently are */
2281     ptr - start_subject, /* start offset */
2282     local_offsets, /* offset vector */
2283     sizeof(local_offsets)/sizeof(int), /* size of same */
2284     local_workspace, /* workspace vector */
2285     sizeof(local_workspace)/sizeof(int), /* size of same */
2286     ims, /* the current ims flags */
2287     rlevel, /* function recursion level */
2288     recursing + 1); /* regex recurse level */
2289    
2290     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2291     recursing + 1, rc));
2292    
2293     /* Ran out of internal offsets */
2294    
2295     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2296    
2297     /* For each successful matched substring, set up the next state with a
2298     count of characters to skip before trying it. Note that the count is in
2299     characters, not bytes. */
2300    
2301     if (rc > 0)
2302     {
2303     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2304     {
2305     const uschar *p = start_subject + local_offsets[rc];
2306     const uschar *pp = start_subject + local_offsets[rc+1];
2307     int charcount = local_offsets[rc+1] - local_offsets[rc];
2308     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2309     if (charcount > 0)
2310     {
2311     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2312     }
2313     else
2314     {
2315     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2316     }
2317     }
2318     }
2319     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2320     }
2321     break;
2322    
2323     /*-----------------------------------------------------------------*/
2324     case OP_ONCE:
2325     {
2326     int local_offsets[2];
2327     int local_workspace[1000];
2328    
2329     int rc = internal_dfa_exec(
2330     md, /* fixed match data */
2331     code, /* this subexpression's code */
2332     ptr, /* where we currently are */
2333     ptr - start_subject, /* start offset */
2334     local_offsets, /* offset vector */
2335     sizeof(local_offsets)/sizeof(int), /* size of same */
2336     local_workspace, /* workspace vector */
2337     sizeof(local_workspace)/sizeof(int), /* size of same */
2338     ims, /* the current ims flags */
2339     rlevel, /* function recursion level */
2340     recursing); /* pass on regex recursion */
2341    
2342     if (rc >= 0)
2343     {
2344     const uschar *end_subpattern = code;
2345     int charcount = local_offsets[1] - local_offsets[0];
2346     int next_state_offset, repeat_state_offset;
2347    
2348     do { end_subpattern += GET(end_subpattern, 1); }
2349     while (*end_subpattern == OP_ALT);
2350     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2351    
2352     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2353     arrange for the repeat state also to be added to the relevant list.
2354     Calculate the offset, or set -1 for no repeat. */
2355    
2356     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2357     *end_subpattern == OP_KETRMIN)?
2358     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2359    
2360     /* If we have matched an empty string, add the next state at the
2361     current character pointer. This is important so that the duplicate
2362     checking kicks in, which is what breaks infinite loops that match an
2363     empty string. */
2364    
2365     if (charcount == 0)
2366     {
2367     ADD_ACTIVE(next_state_offset, 0);
2368     }
2369    
2370     /* Optimization: if there are no more active states, and there
2371     are no new states yet set up, then skip over the subject string
2372     right here, to save looping. Otherwise, set up the new state to swing
2373     into action when the end of the substring is reached. */
2374    
2375     else if (i + 1 >= active_count && new_count == 0)
2376     {
2377     ptr += charcount;
2378     clen = 0;
2379     ADD_NEW(next_state_offset, 0);
2380    
2381     /* If we are adding a repeat state at the new character position,
2382     we must fudge things so that it is the only current state.
2383     Otherwise, it might be a duplicate of one we processed before, and
2384     that would cause it to be skipped. */
2385    
2386     if (repeat_state_offset >= 0)
2387     {
2388     next_active_state = active_states;
2389     active_count = 0;
2390     i = -1;
2391     ADD_ACTIVE(repeat_state_offset, 0);
2392     }
2393     }
2394     else
2395     {
2396     const uschar *p = start_subject + local_offsets[0];
2397     const uschar *pp = start_subject + local_offsets[1];
2398     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2399     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2400     if (repeat_state_offset >= 0)
2401     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2402     }
2403    
2404     }
2405     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2406     }
2407     break;
2408    
2409    
2410     /* ========================================================================== */
2411     /* Handle callouts */
2412    
2413     case OP_CALLOUT:
2414     if (pcre_callout != NULL)
2415     {
2416     int rrc;
2417     pcre_callout_block cb;
2418     cb.version = 1; /* Version 1 of the callout block */
2419     cb.callout_number = code[1];
2420     cb.offset_vector = offsets;
2421 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2422 nigel 77 cb.subject_length = end_subject - start_subject;
2423     cb.start_match = current_subject - start_subject;
2424     cb.current_position = ptr - start_subject;
2425     cb.pattern_position = GET(code, 2);
2426     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2427     cb.capture_top = 1;
2428     cb.capture_last = -1;
2429     cb.callout_data = md->callout_data;
2430     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2431     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2432     }
2433     break;
2434    
2435    
2436     /* ========================================================================== */
2437     default: /* Unsupported opcode */
2438     return PCRE_ERROR_DFA_UITEM;
2439     }
2440    
2441     NEXT_ACTIVE_STATE: continue;
2442    
2443     } /* End of loop scanning active states */
2444    
2445     /* We have finished the processing at the current subject character. If no
2446     new states have been set for the next character, we have found all the
2447     matches that we are going to find. If we are at the top level and partial
2448     matching has been requested, check for appropriate conditions. */
2449    
2450     if (new_count <= 0)
2451     {
2452     if (match_count < 0 && /* No matches found */
2453     rlevel == 1 && /* Top level match function */
2454     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2455     ptr >= end_subject && /* Reached end of subject */
2456     ptr > current_subject) /* Matched non-empty string */
2457     {
2458     if (offsetcount >= 2)
2459     {
2460     offsets[0] = current_subject - start_subject;
2461     offsets[1] = end_subject - start_subject;
2462     }
2463     match_count = PCRE_ERROR_PARTIAL;
2464     }
2465    
2466     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2467     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2468     rlevel*2-2, SP));
2469 nigel 91 break; /* In effect, "return", but see the comment below */
2470 nigel 77 }
2471    
2472     /* One or more states are active for the next character. */
2473    
2474     ptr += clen; /* Advance to next subject character */
2475     } /* Loop to move along the subject string */
2476    
2477 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2478     if we use "return" above, we have compiler trouble. Some compilers warn if
2479     there's nothing here because they think the function doesn't return a value. On
2480     the other hand, if we put a dummy statement here, some more clever compilers
2481     complain that it can't be reached. Sigh. */
2482 nigel 77
2483 nigel 91 return match_count;
2484 nigel 77 }
2485    
2486    
2487    
2488    
2489     /*************************************************
2490     * Execute a Regular Expression - DFA engine *
2491     *************************************************/
2492    
2493     /* This external function applies a compiled regular expression to a subject
2494     string using a DFA engine. If the pattern is not anchored, the internal matching
2495     function is called again after a failed match.
2496    
2497     Arguments:
2498     argument_re points to the compiled expression
2499 ph10 97 extra_data points to extra data or is NULL
2500 nigel 77 subject points to the subject string
2501     length length of subject string (may contain binary zeros)
2502     start_offset where to start in the subject string
2503     options option bits
2504     offsets vector of match offsets
2505     offsetcount size of same
2506     workspace workspace vector
2507     wscount size of same
2508    
2509     Returns: > 0 => number of match offset pairs placed in offsets
2510     = 0 => offsets overflowed; longest matches are present
2511     -1 => failed to match
2512     < -1 => some kind of unexpected problem
2513     */
2514    
2515 ph10 145 PCRE_EXP_DEFN int
2516 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2517     const char *subject, int length, int start_offset, int options, int *offsets,
2518     int offsetcount, int *workspace, int wscount)
2519     {
2520     real_pcre *re = (real_pcre *)argument_re;
2521     dfa_match_data match_block;
2522 nigel 91 dfa_match_data *md = &match_block;
2523 nigel 77 BOOL utf8, anchored, startline, firstline;
2524     const uschar *current_subject, *end_subject, *lcc;
2525    
2526     pcre_study_data internal_study;
2527     const pcre_study_data *study = NULL;
2528     real_pcre internal_re;
2529    
2530     const uschar *req_byte_ptr;
2531     const uschar *start_bits = NULL;
2532     BOOL first_byte_caseless = FALSE;
2533     BOOL req_byte_caseless = FALSE;
2534     int first_byte = -1;
2535     int req_byte = -1;
2536     int req_byte2 = -1;
2537 nigel 91 int newline;
2538 nigel 77
2539     /* Plausibility checks */
2540    
2541     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2542     if (re == NULL || subject == NULL || workspace == NULL ||
2543     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2544     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2545     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2546    
2547     /* We need to find the pointer to any study data before we test for byte
2548     flipping, so we scan the extra_data block first. This may set two fields in the
2549     match block, so we must initialize them beforehand. However, the other fields
2550     in the match block must not be set until after the byte flipping. */
2551    
2552 nigel 91 md->tables = re->tables;
2553     md->callout_data = NULL;
2554 nigel 77
2555     if (extra_data != NULL)
2556     {
2557     unsigned int flags = extra_data->flags;
2558     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2559     study = (const pcre_study_data *)extra_data->study_data;
2560     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2561 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2562     return PCRE_ERROR_DFA_UMLIMIT;
2563 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2564 nigel 91 md->callout_data = extra_data->callout_data;
2565 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2566 nigel 91 md->tables = extra_data->tables;
2567 nigel 77 }
2568    
2569     /* Check that the first field in the block is the magic number. If it is not,
2570     test for a regex that was compiled on a host of opposite endianness. If this is
2571     the case, flipped values are put in internal_re and internal_study if there was
2572     study data too. */
2573    
2574     if (re->magic_number != MAGIC_NUMBER)
2575     {
2576     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2577     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2578     if (study != NULL) study = &internal_study;
2579     }
2580    
2581     /* Set some local values */
2582    
2583     current_subject = (const unsigned char *)subject + start_offset;
2584     end_subject = (const unsigned char *)subject + length;
2585     req_byte_ptr = current_subject - 1;
2586    
2587 nigel 91 #ifdef SUPPORT_UTF8
2588 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2589 nigel 91 #else
2590     utf8 = FALSE;
2591     #endif
2592 nigel 77
2593 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2594     (re->options & PCRE_ANCHORED) != 0;
2595    
2596 nigel 77 /* The remaining fixed data for passing around. */
2597    
2598 nigel 91 md->start_code = (const uschar *)argument_re +
2599 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2600 nigel 91 md->start_subject = (const unsigned char *)subject;
2601     md->end_subject = end_subject;
2602     md->moptions = options;
2603     md->poptions = re->options;
2604 nigel 77
2605 ph10 231 /* If the BSR option is not set at match time, copy what was set
2606     at compile time. */
2607    
2608     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2609     {
2610     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2611     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2612     #ifdef BSR_ANYCRLF
2613     else md->moptions |= PCRE_BSR_ANYCRLF;
2614 ph10 243 #endif
2615     }
2616 ph10 231
2617 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2618     nothing is set at run time, whatever was used at compile time applies. */
2619 nigel 91
2620 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2621 nigel 93 PCRE_NEWLINE_BITS)
2622 nigel 91 {
2623 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2624 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2625     case PCRE_NEWLINE_LF: newline = '\n'; break;
2626     case PCRE_NEWLINE_CR+
2627     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2628 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2629 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2630 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2631 nigel 91 }
2632    
2633 ph10 149 if (newline == -2)
2634 nigel 91 {
2635 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2636     }
2637     else if (newline < 0)
2638     {
2639 nigel 93 md->nltype = NLTYPE_ANY;
2640 nigel 91 }
2641     else
2642     {
2643 nigel 93 md->nltype = NLTYPE_FIXED;
2644     if (newline > 255)
2645     {
2646     md->nllen = 2;
2647     md->nl[0] = (newline >> 8) & 255;
2648     md->nl[1] = newline & 255;
2649     }
2650     else
2651     {
2652     md->nllen = 1;
2653     md->nl[0] = newline;
2654     }
2655 nigel 91 }
2656    
2657 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2658     back the character offset. */
2659    
2660     #ifdef SUPPORT_UTF8
2661     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2662     {
2663     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2664     return PCRE_ERROR_BADUTF8;
2665     if (start_offset > 0 && start_offset < length)
2666     {
2667     int tb = ((uschar *)subject)[start_offset];
2668     if (tb > 127)
2669     {
2670     tb &= 0xc0;
2671     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2672     }
2673     }
2674     }
2675     #endif
2676    
2677     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2678     is a feature that makes it possible to save compiled regex and re-use them
2679     in other programs later. */
2680    
2681 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2682 nigel 77
2683     /* The lower casing table and the "must be at the start of a line" flag are
2684     used in a loop when finding where to start. */
2685    
2686 nigel 91 lcc = md->tables + lcc_offset;
2687 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2688 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2689    
2690     /* Set up the first character to match, if available. The first_byte value is
2691     never set for an anchored regular expression, but the anchoring may be forced
2692     at run time, so we have to test for anchoring. The first char may be unset for
2693     an unanchored pattern, of course. If there's no first char and the pattern was
2694     studied, there may be a bitmap of possible first characters. */
2695    
2696     if (!anchored)
2697     {
2698 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2699 nigel 77 {
2700     first_byte = re->first_byte & 255;
2701     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2702     first_byte = lcc[first_byte];
2703     }
2704     else
2705     {
2706     if (startline && study != NULL &&
2707     (study->options & PCRE_STUDY_MAPPED) != 0)
2708     start_bits = study->start_bits;
2709     }
2710     }
2711    
2712     /* For anchored or unanchored matches, there may be a "last known required
2713     character" set. */
2714    
2715 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2716 nigel 77 {
2717     req_byte = re->req_byte & 255;
2718     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2719 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2720 nigel 77 }
2721    
2722     /* Call the main matching function, looping for a non-anchored regex after a
2723     failed match. Unless restarting, optimize by moving to the first match
2724     character if possible, when not anchored. Then unless wanting a partial match,
2725     check for a required later character. */
2726    
2727     for (;;)
2728     {
2729     int rc;
2730    
2731     if ((options & PCRE_DFA_RESTART) == 0)
2732     {
2733     const uschar *save_end_subject = end_subject;
2734    
2735     /* Advance to a unique first char if possible. If firstline is TRUE, the
2736     start of the match is constrained to the first line of a multiline string.
2737 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2738     scanning at a newline. If the match fails at the newline, later code breaks
2739     this loop. */
2740 nigel 77
2741     if (firstline)
2742     {
2743     const uschar *t = current_subject;
2744 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2745 nigel 77 end_subject = t;
2746     }
2747    
2748     if (first_byte >= 0)
2749     {
2750     if (first_byte_caseless)
2751     while (current_subject < end_subject &&
2752     lcc[*current_subject] != first_byte)
2753     current_subject++;
2754     else
2755     while (current_subject < end_subject && *current_subject != first_byte)
2756     current_subject++;
2757     }
2758    
2759 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2760 nigel 77
2761     else if (startline)
2762     {
2763 nigel 93 if (current_subject > md->start_subject + start_offset)
2764 nigel 77 {
2765 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2766 nigel 77 current_subject++;
2767 ph10 130
2768 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2769     ANYCRLF, and we are now at a LF, advance the match position by one more
2770     character. */
2771 ph10 134
2772 ph10 130 if (current_subject[-1] == '\r' &&
2773 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2774 ph10 130 current_subject < end_subject &&
2775     *current_subject == '\n')
2776     current_subject++;
2777 nigel 77 }
2778     }
2779    
2780     /* Or to a non-unique first char after study */
2781    
2782     else if (start_bits != NULL)
2783     {
2784     while (current_subject < end_subject)
2785     {
2786     register unsigned int c = *current_subject;
2787     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2788     else break;
2789     }
2790     }
2791    
2792     /* Restore fudged end_subject */
2793    
2794     end_subject = save_end_subject;
2795     }
2796    
2797     /* If req_byte is set, we know that that character must appear in the subject
2798     for the match to succeed. If the first character is set, req_byte must be
2799     later in the subject; otherwise the test starts at the match point. This
2800     optimization can save a huge amount of work in patterns with nested unlimited
2801     repeats that aren't going to match. Writing separate code for cased/caseless
2802     versions makes it go faster, as does using an autoincrement and backing off
2803     on a match.
2804    
2805     HOWEVER: when the subject string is very, very long, searching to its end can
2806     take a long time, and give bad performance on quite ordinary patterns. This
2807     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2808     don't do this when the string is sufficiently long.
2809    
2810     ALSO: this processing is disabled when partial matching is requested.
2811     */
2812    
2813     if (req_byte >= 0 &&
2814     end_subject - current_subject < REQ_BYTE_MAX &&
2815     (options & PCRE_PARTIAL) == 0)
2816     {
2817     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2818    
2819     /* We don't need to repeat the search if we haven't yet reached the
2820     place we found it at last time. */
2821    
2822     if (p > req_byte_ptr)
2823     {
2824     if (req_byte_caseless)
2825     {
2826     while (p < end_subject)
2827     {
2828     register int pp = *p++;
2829     if (pp == req_byte || pp == req_byte2) { p--; break; }
2830     }
2831     }
2832     else
2833     {
2834     while (p < end_subject)
2835     {
2836     if (*p++ == req_byte) { p--; break; }
2837     }
2838     }
2839    
2840     /* If we can't find the required character, break the matching loop,
2841     which will cause a return of PCRE_ERROR_NOMATCH. */
2842    
2843     if (p >= end_subject) break;
2844    
2845     /* If we have found the required character, save the point where we
2846     found it, so that we don't search again next time round the loop if
2847     the start hasn't passed this character yet. */
2848    
2849     req_byte_ptr = p;
2850     }
2851     }
2852    
2853     /* OK, now we can do the business */
2854    
2855     rc = internal_dfa_exec(
2856 nigel 91 md, /* fixed match data */
2857     md->start_code, /* this subexpression's code */
2858     current_subject, /* where we currently are */
2859     start_offset, /* start offset in subject */
2860     offsets, /* offset vector */
2861     offsetcount, /* size of same */
2862     workspace, /* workspace vector */
2863     wscount, /* size of same */
2864 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2865 nigel 91 0, /* function recurse level */
2866     0); /* regex recurse level */
2867 nigel 77
2868     /* Anything other than "no match" means we are done, always; otherwise, carry
2869     on only if not anchored. */
2870    
2871     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2872    
2873     /* Advance to the next subject character unless we are at the end of a line
2874     and firstline is set. */
2875    
2876 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2877 nigel 77 current_subject++;
2878     if (utf8)
2879     {
2880     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2881     current_subject++;
2882     }
2883     if (current_subject > end_subject) break;
2884    
2885 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2886 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2887     or ANY or ANYCRLF, advance the match position by one more character. */
2888 nigel 93
2889     if (current_subject[-1] == '\r' &&
2890 ph10 226 current_subject < end_subject &&
2891     *current_subject == '\n' &&
2892 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2893 ph10 226 (md->nltype == NLTYPE_ANY ||
2894     md->nltype == NLTYPE_ANYCRLF ||
2895     md->nllen == 2))
2896 nigel 93 current_subject++;
2897    
2898     } /* "Bumpalong" loop */
2899    
2900 nigel 77 return PCRE_ERROR_NOMATCH;
2901     }
2902    
2903     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12