/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 345 - (hide annotations) (download)
Mon Apr 28 15:10:02 2008 UTC (6 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 95544 byte(s)
Tidies for the 7.7-RC1 distribution.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 ph10 200 #ifdef HAVE_CONFIG_H
48 ph10 236 #include "config.h"
49 ph10 200 #endif
50 ph10 199
51 nigel 93 #define NLBLOCK md /* Block containing newline information */
52     #define PSSTART start_subject /* Field containing processed string start */
53     #define PSEND end_subject /* Field containing processed string end */
54    
55 nigel 77 #include "pcre_internal.h"
56    
57    
/* For use to indent debugging output. The string is printed with "%.*s" and a
precision of rlevel*2-2, so the precision selects how many of these spaces
appear. The string therefore has to be at least as long as the deepest indent
wanted: a one-character string would cap every indent at a single column. */

#define SP "                   "
61    
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. One offset is
added to the opcode value when the item being repeated is, respectively, a
Unicode property, an extended Unicode sequence, \R, a horizontal-space test or
a vertical-space test. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes is possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified. */
86 nigel 77
87 ph10 327 static const uschar coptable[] = {
88 nigel 77 0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
134     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136 nigel 77 };
137    
138     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139     and \w */
140    
141 ph10 327 static const uschar toptable1[] = {
142 ph10 168 0, 0, 0, 0, 0, 0,
143 nigel 77 ctype_digit, ctype_digit,
144     ctype_space, ctype_space,
145     ctype_word, ctype_word,
146 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
147 nigel 77 };
148    
149 ph10 327 static const uschar toptable2[] = {
150 ph10 168 0, 0, 0, 0, 0, 0,
151 nigel 77 ctype_digit, 0,
152     ctype_space, 0,
153     ctype_word, 0,
154 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
155 nigel 77 };
156    
157    
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The value is cast to int because
it is combined with signed workspace sizes; leaving it as the size_t that
sizeof yields would silently convert that arithmetic to unsigned. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
171    
172    
173     #ifdef DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Each byte that isprint()
accepts is written as itself; every other byte is written as a backslash, the
letter x, and the byte value as two lower-case hexadecimal digits. The string
is only read, so the pointer is const-qualified.

Arguments:
  p           points to string
  length      number of bytes; nothing is written if this is not positive
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
200     #endif
201    
202    
203    
204     /*************************************************
205     * Execute a Regular Expression - DFA engine *
206     *************************************************/
207    
208     /* This internal function applies a compiled pattern to a subject string,
209     starting at a given point, using a DFA engine. This function is called from the
210     external one, possibly multiple times if the pattern is not anchored. The
211     function calls itself recursively for some kinds of subpattern.
212    
213     Arguments:
214     md the match_data block with fixed information
215     this_start_code the opening bracket of this subexpression's code
216     current_subject where we currently are in the subject string
217     start_offset start offset in the subject string
218     offsets vector to contain the matching string offsets
219     offsetcount size of same
220     workspace vector of workspace
221     wscount size of same
222     ims the current ims flags
223     rlevel function call recursion level
224     recursing regex recursive call level
225    
226 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
227 ph10 341 = 0 => offsets overflowed; longest matches are present
228 nigel 77 -1 => failed to match
229     < -1 => some kind of unexpected problem
230    
231     The following macros are used for adding states to the two state vectors (one
232     for the current character, one for the following character). */
233    
/* Append a state with opcode offset x and repeat count y to the active list.
The ims value is taken from the variable of that name in the invoking scope,
and the data field of the slot is left as it was. active_count is incremented
whether or not there is room; if there is none, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The body is wrapped in do { } while (0) so that an
invocation followed by a semicolon is exactly one statement. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
244    
/* Append a state with opcode offset x, repeat count y and extra data z to the
active list. The ims value is taken from the variable of that name in the
invoking scope. active_count is incremented whether or not there is room; if
there is none, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The body
is wrapped in do { } while (0) so that an invocation followed by a semicolon is
exactly one statement. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state->data = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
256    
/* Append a state with opcode offset x and repeat count y to the new list (the
one for the following character). The ims value is taken from the variable of
that name in the invoking scope, and the data field of the slot is left as it
was. new_count is incremented whether or not there is room; if there is none,
the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The body is wrapped in
do { } while (0) so that an invocation followed by a semicolon is exactly one
statement. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
267    
/* Append a state with opcode offset x, repeat count y and extra data z to the
new list (the one for the following character). The ims value is taken from the
variable of that name in the invoking scope. new_count is incremented whether
or not there is room; if there is none, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The body is wrapped in do { } while (0) so that an
invocation followed by a semicolon is exactly one statement. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state->data = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
279    
280     /* And now, here is the code */
281    
282     static int
283     internal_dfa_exec(
284     dfa_match_data *md,
285     const uschar *this_start_code,
286     const uschar *current_subject,
287     int start_offset,
288     int *offsets,
289     int offsetcount,
290     int *workspace,
291     int wscount,
292     int ims,
293     int rlevel,
294     int recursing)
295     {
296     stateblock *active_states, *new_states, *temp_states;
297     stateblock *next_active_state, *next_new_state;
298    
299     const uschar *ctypes, *lcc, *fcc;
300     const uschar *ptr;
301 nigel 93 const uschar *end_code, *first_op;
302 nigel 77
303     int active_count, new_count, match_count;
304    
305     /* Some fields in the md block are frequently referenced, so we load them into
306     independent variables in the hope that this will perform better. */
307    
308     const uschar *start_subject = md->start_subject;
309     const uschar *end_subject = md->end_subject;
310     const uschar *start_code = md->start_code;
311    
312 nigel 87 #ifdef SUPPORT_UTF8
313 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 nigel 93 #else
315     BOOL utf8 = FALSE;
316 nigel 87 #endif
317 nigel 77
318     rlevel++;
319     offsetcount &= (-2);
320    
321     wscount -= 2;
322     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323     (2 * INTS_PER_STATEBLOCK);
324    
325     DPRINTF(("\n%.*s---------------------\n"
326     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328    
329     ctypes = md->tables + ctypes_offset;
330     lcc = md->tables + lcc_offset;
331     fcc = md->tables + fcc_offset;
332    
333     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334    
335     active_states = (stateblock *)(workspace + 2);
336     next_new_state = new_states = active_states + wscount;
337     new_count = 0;
338    
339 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
340     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343     the alternative states onto the list, and find out where the end is. This
344     makes it possible to use this function recursively, when we want to stop at a
345     matching internal ket rather than at the end.
346    
347     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348     a backward assertion. In that case, we have to find out the maximum amount to
349     move back, and set up each alternative appropriately. */
350    
351 nigel 93 if (*first_op == OP_REVERSE)
352 nigel 77 {
353     int max_back = 0;
354     int gone_back;
355    
356     end_code = this_start_code;
357     do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363     while (*end_code == OP_ALT);
364    
365     /* If we can't go back the amount required for the longest lookbehind
366     pattern, go back as far as we can; some alternatives may still be viable. */
367    
368     #ifdef SUPPORT_UTF8
369     /* In character mode we have to step back character by character */
370    
371     if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374     {
375     if (current_subject <= start_subject) break;
376     current_subject--;
377     while (current_subject > start_subject &&
378     (*current_subject & 0xc0) == 0x80)
379     current_subject--;
380     }
381     }
382     else
383     #endif
384    
385     /* In byte-mode we can do this quickly. */
386    
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389     current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392    
393     /* Now we can process the individual branches. */
394    
395     end_code = this_start_code;
396     do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400     {
401     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402     ADD_NEW_DATA(-bstate, 0, gone_back - back);
403     }
404     end_code += GET(end_code, 1);
405     }
406     while (*end_code == OP_ALT);
407     }
408    
409     /* This is the code for a "normal" subpattern (not a backward assertion). The
410     start of a whole pattern is always one of these. If we are at the top level,
411     we may be asked to restart matching from the same point that we reached for a
412     previous partial match. We still have to scan through the top-level branches to
413     find the end state. */
414    
415     else
416     {
417     end_code = this_start_code;
418    
419     /* Restarting */
420    
421     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426     memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428    
429     /* Not restarting */
430    
431     else
432     {
433 nigel 93 int length = 1 + LINK_SIZE +
434     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 nigel 77 do
436     {
437 nigel 93 ADD_NEW(end_code - start_code + length, 0);
438 nigel 77 end_code += GET(end_code, 1);
439 nigel 93 length = 1 + LINK_SIZE;
440 nigel 77 }
441     while (*end_code == OP_ALT);
442     }
443     }
444    
445     workspace[0] = 0; /* Bit indicating which vector is current */
446    
447     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448    
449     /* Loop for scanning the subject */
450    
451     ptr = current_subject;
452     for (;;)
453     {
454     int i, j;
455 nigel 91 int clen, dlen;
456     unsigned int c, d;
457 nigel 77
458     /* Make the new state list into the active state list and empty the
459     new state list. */
460    
461     temp_states = active_states;
462     active_states = new_states;
463     new_states = temp_states;
464     active_count = new_count;
465     new_count = 0;
466    
467     workspace[0] ^= 1; /* Remember for the restarting feature */
468     workspace[1] = active_count;
469    
470     #ifdef DEBUG
471     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473     printf("\"\n");
474    
475     printf("%.*sActive states: ", rlevel*2-2, SP);
476     for (i = 0; i < active_count; i++)
477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
478     printf("\n");
479     #endif
480    
481     /* Set the pointers for adding new states */
482    
483     next_active_state = active_states + active_count;
484     next_new_state = new_states;
485    
486     /* Load the current character from the subject outside the loop, as many
487     different states may want to look at it, and we assume that at least one
488     will. */
489    
490     if (ptr < end_subject)
491     {
492 nigel 93 clen = 1; /* Number of bytes in the character */
493 nigel 77 #ifdef SUPPORT_UTF8
494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
495     #endif /* SUPPORT_UTF8 */
496     c = *ptr;
497     }
498     else
499     {
500 nigel 93 clen = 0; /* This indicates the end of the subject */
501     c = NOTACHAR; /* This value should never actually be used */
502 nigel 77 }
503    
504     /* Scan up the active states and act on each one. The result of an action
505     may be to add more states to the currently active list (e.g. on hitting a
506     parenthesis) or it may be to put states on the new list, for considering
507     when we move the character pointer on. */
508    
509     for (i = 0; i < active_count; i++)
510     {
511     stateblock *current_state = active_states + i;
512     const uschar *code;
513     int state_offset = current_state->offset;
514     int count, codevalue;
515 ph10 152 #ifdef SUPPORT_UCP
516 nigel 87 int chartype, script;
517 ph10 152 #endif
518 nigel 77
519     #ifdef DEBUG
520     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
521 nigel 93 if (clen == 0) printf("EOL\n");
522 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
523     else printf("0x%02x\n", c);
524     #endif
525    
526     /* This variable is referred to implicitly in the ADD_xxx macros. */
527    
528     ims = current_state->ims;
529    
530     /* A negative offset is a special case meaning "hold off going to this
531     (negated) state until the number of characters in the data field have
532     been skipped". */
533    
534     if (state_offset < 0)
535     {
536     if (current_state->data > 0)
537     {
538     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
539     ADD_NEW_DATA(state_offset, current_state->count,
540     current_state->data - 1);
541     continue;
542     }
543     else
544     {
545     current_state->offset = state_offset = -state_offset;
546     }
547     }
548    
549     /* Check for a duplicate state with the same count, and skip if found. */
550    
551     for (j = 0; j < i; j++)
552     {
553     if (active_states[j].offset == state_offset &&
554     active_states[j].count == current_state->count)
555     {
556     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
557     goto NEXT_ACTIVE_STATE;
558     }
559     }
560    
561     /* The state offset is the offset to the opcode */
562    
563     code = start_code + state_offset;
564     codevalue = *code;
565    
566     /* If this opcode is followed by an inline character, load it. It is
567     tempting to test for the presence of a subject character here, but that
568     is wrong, because sometimes zero repetitions of the subject are
569     permitted.
570    
571     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572 ph10 178 argument that is not a data character - but is always one byte long. We
573     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
574     this case. To keep the other cases fast, convert these ones to new opcodes.
575     */
576 nigel 77
577     if (coptable[codevalue] > 0)
578     {
579     dlen = 1;
580     #ifdef SUPPORT_UTF8
581     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
582     #endif /* SUPPORT_UTF8 */
583     d = code[coptable[codevalue]];
584     if (codevalue >= OP_TYPESTAR)
585     {
586 nigel 93 switch(d)
587     {
588     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
589     case OP_NOTPROP:
590     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
593 ph10 178 case OP_NOT_HSPACE:
594 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
595 ph10 178 case OP_NOT_VSPACE:
596 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597 nigel 93 default: break;
598     }
599 nigel 77 }
600     }
601     else
602     {
603     dlen = 0; /* Not strictly necessary, but compilers moan */
604 nigel 93 d = NOTACHAR; /* if these variables are not set. */
605 nigel 77 }
606    
607    
608     /* Now process the individual opcodes */
609    
610     switch (codevalue)
611     {
612    
613     /* ========================================================================== */
614     /* Reached a closing bracket. If not at the end of the pattern, carry
615     on with the next opcode. Otherwise, unless we have an empty string and
616     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
617     matches so we always have the longest first. */
618    
619     case OP_KET:
620     case OP_KETRMIN:
621     case OP_KETRMAX:
622     if (code != end_code)
623     {
624     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
625     if (codevalue != OP_KET)
626     {
627     ADD_ACTIVE(state_offset - GET(code, 1), 0);
628     }
629     }
630     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
631     {
632     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
633     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
634     match_count = 0;
635     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
636     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
637     if (offsetcount >= 2)
638     {
639     offsets[0] = current_subject - start_subject;
640     offsets[1] = ptr - start_subject;
641     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
642     offsets[1] - offsets[0], current_subject));
643     }
644     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
645     {
646     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
647     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
648     match_count, rlevel*2-2, SP));
649     return match_count;
650     }
651     }
652     break;
653    
654     /* ========================================================================== */
655     /* These opcodes add to the current list of states without looking
656     at the current character. */
657    
658     /*-----------------------------------------------------------------*/
659     case OP_ALT:
660     do { code += GET(code, 1); } while (*code == OP_ALT);
661     ADD_ACTIVE(code - start_code, 0);
662     break;
663    
664     /*-----------------------------------------------------------------*/
665     case OP_BRA:
666 nigel 93 case OP_SBRA:
667 nigel 77 do
668     {
669     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
670     code += GET(code, 1);
671     }
672     while (*code == OP_ALT);
673     break;
674    
675     /*-----------------------------------------------------------------*/
676 nigel 93 case OP_CBRA:
677     case OP_SCBRA:
678     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
679     code += GET(code, 1);
680     while (*code == OP_ALT)
681     {
682     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
683     code += GET(code, 1);
684     }
685     break;
686    
687     /*-----------------------------------------------------------------*/
688 nigel 77 case OP_BRAZERO:
689     case OP_BRAMINZERO:
690     ADD_ACTIVE(state_offset + 1, 0);
691     code += 1 + GET(code, 2);
692     while (*code == OP_ALT) code += GET(code, 1);
693     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
694     break;
695    
696     /*-----------------------------------------------------------------*/
697 ph10 335 case OP_SKIPZERO:
698     code += 1 + GET(code, 2);
699     while (*code == OP_ALT) code += GET(code, 1);
700     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
701     break;
702    
703     /*-----------------------------------------------------------------*/
704 nigel 77 case OP_CIRC:
705     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
706 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
707     ptr != end_subject &&
708 nigel 93 WAS_NEWLINE(ptr)))
709 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
710     break;
711    
712     /*-----------------------------------------------------------------*/
713     case OP_EOD:
714     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
715     break;
716    
717     /*-----------------------------------------------------------------*/
718     case OP_OPT:
719     ims = code[1];
720     ADD_ACTIVE(state_offset + 2, 0);
721     break;
722    
723     /*-----------------------------------------------------------------*/
724     case OP_SOD:
725     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
726     break;
727    
728     /*-----------------------------------------------------------------*/
729     case OP_SOM:
730     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
731     break;
732    
733    
734     /* ========================================================================== */
735     /* These opcodes inspect the next subject character, and sometimes
736     the previous one as well, but do not have an argument. The variable
737     clen contains the length of the current character and is zero if we are
738     at the end of the subject. */
739    
740     /*-----------------------------------------------------------------*/
741     case OP_ANY:
742 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
743 nigel 77 { ADD_NEW(state_offset + 1, 0); }
744     break;
745    
746     /*-----------------------------------------------------------------*/
747 ph10 341 case OP_ALLANY:
748     if (clen > 0)
749     { ADD_NEW(state_offset + 1, 0); }
750     break;
751    
752     /*-----------------------------------------------------------------*/
753 nigel 77 case OP_EODN:
754 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
755 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
756     break;
757    
758     /*-----------------------------------------------------------------*/
759     case OP_DOLL:
760     if ((md->moptions & PCRE_NOTEOL) == 0)
761     {
762 nigel 91 if (clen == 0 ||
763 nigel 93 (IS_NEWLINE(ptr) &&
764 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
765     ))
766 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
767     }
768 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
769 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
770     break;
771    
772     /*-----------------------------------------------------------------*/
773    
774     case OP_DIGIT:
775     case OP_WHITESPACE:
776     case OP_WORDCHAR:
777     if (clen > 0 && c < 256 &&
778     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
779     { ADD_NEW(state_offset + 1, 0); }
780     break;
781    
782     /*-----------------------------------------------------------------*/
783     case OP_NOT_DIGIT:
784     case OP_NOT_WHITESPACE:
785     case OP_NOT_WORDCHAR:
786     if (clen > 0 && (c >= 256 ||
787     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
788     { ADD_NEW(state_offset + 1, 0); }
789     break;
790    
791     /*-----------------------------------------------------------------*/
792     case OP_WORD_BOUNDARY:
793     case OP_NOT_WORD_BOUNDARY:
794     {
795     int left_word, right_word;
796    
797     if (ptr > start_subject)
798     {
799     const uschar *temp = ptr - 1;
800     #ifdef SUPPORT_UTF8
801     if (utf8) BACKCHAR(temp);
802     #endif
803     GETCHARTEST(d, temp);
804     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
805     }
806     else left_word = 0;
807    
808     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
809     else right_word = 0;
810    
811     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
812     { ADD_ACTIVE(state_offset + 1, 0); }
813     }
814     break;
815    
816    
817     /*-----------------------------------------------------------------*/
818     /* Check the next character by Unicode property. We will get here only
819     if the support is in the binary; otherwise a compile-time error occurs.
820     */
821    
822 ph10 151 #ifdef SUPPORT_UCP
823 nigel 77 case OP_PROP:
824     case OP_NOTPROP:
825     if (clen > 0)
826     {
827 nigel 87 BOOL OK;
828     int category = _pcre_ucp_findprop(c, &chartype, &script);
829     switch(code[1])
830 nigel 77 {
831 nigel 87 case PT_ANY:
832     OK = TRUE;
833     break;
834    
835     case PT_LAMP:
836     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
837     break;
838    
839     case PT_GC:
840     OK = category == code[2];
841     break;
842    
843     case PT_PC:
844     OK = chartype == code[2];
845     break;
846    
847     case PT_SC:
848     OK = script == code[2];
849     break;
850    
851     /* Should never occur, but keep compilers from grumbling. */
852    
853     default:
854     OK = codevalue != OP_PROP;
855     break;
856 nigel 77 }
857 nigel 87
858     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
859 nigel 77 }
860     break;
861     #endif
862    
863    
864    
865     /* ========================================================================== */
866     /* These opcodes likewise inspect the subject character, but have an
867     argument that is not a data character. It is one of these opcodes:
868 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
869     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
870 nigel 77
871     case OP_TYPEPLUS:
872     case OP_TYPEMINPLUS:
873 nigel 93 case OP_TYPEPOSPLUS:
874 nigel 77 count = current_state->count; /* Already matched */
875     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
876     if (clen > 0)
877     {
878     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
879     (c < 256 &&
880 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
881 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
882     {
883 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
884     {
885     active_count--; /* Remove non-match possibility */
886     next_active_state--;
887     }
888 nigel 77 count++;
889     ADD_NEW(state_offset, count);
890     }
891     }
892     break;
893    
894     /*-----------------------------------------------------------------*/
895     case OP_TYPEQUERY:
896     case OP_TYPEMINQUERY:
897 nigel 93 case OP_TYPEPOSQUERY:
898 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
899     if (clen > 0)
900     {
901     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
902     (c < 256 &&
903 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
904 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
905     {
906 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
907     {
908     active_count--; /* Remove non-match possibility */
909     next_active_state--;
910     }
911 nigel 77 ADD_NEW(state_offset + 2, 0);
912     }
913     }
914     break;
915    
916     /*-----------------------------------------------------------------*/
917     case OP_TYPESTAR:
918     case OP_TYPEMINSTAR:
919 nigel 93 case OP_TYPEPOSSTAR:
920 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
921     if (clen > 0)
922     {
923     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
924     (c < 256 &&
925 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
926 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
927     {
928 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
929     {
930     active_count--; /* Remove non-match possibility */
931     next_active_state--;
932     }
933 nigel 77 ADD_NEW(state_offset, 0);
934     }
935     }
936     break;
937    
938     /*-----------------------------------------------------------------*/
939     case OP_TYPEEXACT:
940 nigel 93 count = current_state->count; /* Number already matched */
941     if (clen > 0)
942     {
943     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
944     (c < 256 &&
945 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
946 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
947     {
948     if (++count >= GET2(code, 1))
949     { ADD_NEW(state_offset + 4, 0); }
950     else
951     { ADD_NEW(state_offset, count); }
952     }
953     }
954     break;
955    
956     /*-----------------------------------------------------------------*/
957 nigel 77 case OP_TYPEUPTO:
958     case OP_TYPEMINUPTO:
959 nigel 93 case OP_TYPEPOSUPTO:
960     ADD_ACTIVE(state_offset + 4, 0);
961 nigel 77 count = current_state->count; /* Number already matched */
962     if (clen > 0)
963     {
964     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
965     (c < 256 &&
966 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
967 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
968     {
969 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
970     {
971     active_count--; /* Remove non-match possibility */
972     next_active_state--;
973     }
974 nigel 77 if (++count >= GET2(code, 1))
975     { ADD_NEW(state_offset + 4, 0); }
976     else
977     { ADD_NEW(state_offset, count); }
978     }
979     }
980     break;
981    
982     /* ========================================================================== */
983     /* These are virtual opcodes that are used when something like
984 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or OP_HSPACE as its
985     argument. It keeps the code above fast for the other cases. The argument
986     is in the d variable. */
987 nigel 77
988 ph10 151 #ifdef SUPPORT_UCP
989 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
990     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
991 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
992 nigel 77 count = current_state->count; /* Already matched */
993 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
994 nigel 77 if (clen > 0)
995     {
996 nigel 87 BOOL OK;
997     int category = _pcre_ucp_findprop(c, &chartype, &script);
998     switch(code[2])
999     {
1000     case PT_ANY:
1001     OK = TRUE;
1002     break;
1003    
1004     case PT_LAMP:
1005     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1006     break;
1007    
1008     case PT_GC:
1009     OK = category == code[3];
1010     break;
1011    
1012     case PT_PC:
1013     OK = chartype == code[3];
1014     break;
1015    
1016     case PT_SC:
1017     OK = script == code[3];
1018     break;
1019    
1020     /* Should never occur, but keep compilers from grumbling. */
1021    
1022     default:
1023     OK = codevalue != OP_PROP;
1024     break;
1025     }
1026    
1027 nigel 93 if (OK == (d == OP_PROP))
1028     {
1029     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1030     {
1031     active_count--; /* Remove non-match possibility */
1032     next_active_state--;
1033     }
1034     count++;
1035     ADD_NEW(state_offset, count);
1036     }
1037 nigel 77 }
1038     break;
1039    
1040     /*-----------------------------------------------------------------*/
1041     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1042     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1043 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1044 nigel 77 count = current_state->count; /* Already matched */
1045     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1046 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1047 nigel 77 {
1048     const uschar *nptr = ptr + clen;
1049     int ncount = 0;
1050 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1051     {
1052     active_count--; /* Remove non-match possibility */
1053     next_active_state--;
1054     }
1055 nigel 77 while (nptr < end_subject)
1056     {
1057     int nd;
1058     int ndlen = 1;
1059     GETCHARLEN(nd, nptr, ndlen);
1060 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1061 nigel 77 ncount++;
1062     nptr += ndlen;
1063     }
1064     count++;
1065     ADD_NEW_DATA(-state_offset, count, ncount);
1066     }
1067     break;
1068 ph10 151 #endif
1069 nigel 77
1070     /*-----------------------------------------------------------------*/
1071 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1072     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1073     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1074     count = current_state->count; /* Already matched */
1075     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1076     if (clen > 0)
1077     {
1078     int ncount = 0;
1079     switch (c)
1080     {
1081     case 0x000b:
1082     case 0x000c:
1083     case 0x0085:
1084     case 0x2028:
1085     case 0x2029:
1086 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1087     goto ANYNL01;
1088    
1089     case 0x000d:
1090     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1091     /* Fall through */
1092    
1093     ANYNL01:
1094     case 0x000a:
1095 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1096     {
1097     active_count--; /* Remove non-match possibility */
1098     next_active_state--;
1099     }
1100     count++;
1101     ADD_NEW_DATA(-state_offset, count, ncount);
1102     break;
1103 ph10 231
1104 nigel 93 default:
1105     break;
1106     }
1107     }
1108     break;
1109    
1110     /*-----------------------------------------------------------------*/
1111 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1112     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1113     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1114     count = current_state->count; /* Already matched */
1115     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1116     if (clen > 0)
1117     {
1118 ph10 182 BOOL OK;
1119 ph10 178 switch (c)
1120     {
1121     case 0x000a:
1122     case 0x000b:
1123     case 0x000c:
1124     case 0x000d:
1125     case 0x0085:
1126     case 0x2028:
1127     case 0x2029:
1128     OK = TRUE;
1129 ph10 182 break;
1130 ph10 178
1131     default:
1132     OK = FALSE;
1133 ph10 182 break;
1134 ph10 178 }
1135    
1136     if (OK == (d == OP_VSPACE))
1137 ph10 182 {
1138 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1139     {
1140     active_count--; /* Remove non-match possibility */
1141     next_active_state--;
1142     }
1143     count++;
1144     ADD_NEW_DATA(-state_offset, count, 0);
1145     }
1146     }
1147     break;
1148    
1149     /*-----------------------------------------------------------------*/
1150     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1151     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1152     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1153     count = current_state->count; /* Already matched */
1154     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1155     if (clen > 0)
1156     {
1157 ph10 182 BOOL OK;
1158 ph10 178 switch (c)
1159     {
1160     case 0x09: /* HT */
1161     case 0x20: /* SPACE */
1162     case 0xa0: /* NBSP */
1163     case 0x1680: /* OGHAM SPACE MARK */
1164     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1165     case 0x2000: /* EN QUAD */
1166     case 0x2001: /* EM QUAD */
1167     case 0x2002: /* EN SPACE */
1168     case 0x2003: /* EM SPACE */
1169     case 0x2004: /* THREE-PER-EM SPACE */
1170     case 0x2005: /* FOUR-PER-EM SPACE */
1171     case 0x2006: /* SIX-PER-EM SPACE */
1172     case 0x2007: /* FIGURE SPACE */
1173     case 0x2008: /* PUNCTUATION SPACE */
1174     case 0x2009: /* THIN SPACE */
1175     case 0x200A: /* HAIR SPACE */
1176     case 0x202f: /* NARROW NO-BREAK SPACE */
1177     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1178     case 0x3000: /* IDEOGRAPHIC SPACE */
1179     OK = TRUE;
1180     break;
1181 ph10 182
1182 ph10 178 default:
1183     OK = FALSE;
1184     break;
1185     }
1186 ph10 182
1187 ph10 178 if (OK == (d == OP_HSPACE))
1188 ph10 182 {
1189 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1190     {
1191     active_count--; /* Remove non-match possibility */
1192     next_active_state--;
1193     }
1194     count++;
1195     ADD_NEW_DATA(-state_offset, count, 0);
1196     }
1197     }
1198     break;
1199    
1200     /*-----------------------------------------------------------------*/
1201 ph10 151 #ifdef SUPPORT_UCP
1202 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1203     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1204 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1205 nigel 87 count = 4;
1206 nigel 77 goto QS1;
1207    
1208     case OP_PROP_EXTRA + OP_TYPESTAR:
1209     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1210 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1211 nigel 77 count = 0;
1212    
1213     QS1:
1214    
1215 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1216 nigel 77 if (clen > 0)
1217     {
1218 nigel 87 BOOL OK;
1219     int category = _pcre_ucp_findprop(c, &chartype, &script);
1220     switch(code[2])
1221     {
1222     case PT_ANY:
1223     OK = TRUE;
1224     break;
1225    
1226     case PT_LAMP:
1227     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1228     break;
1229    
1230     case PT_GC:
1231     OK = category == code[3];
1232     break;
1233    
1234     case PT_PC:
1235     OK = chartype == code[3];
1236     break;
1237    
1238     case PT_SC:
1239     OK = script == code[3];
1240     break;
1241    
1242     /* Should never occur, but keep compilers from grumbling. */
1243    
1244     default:
1245     OK = codevalue != OP_PROP;
1246     break;
1247     }
1248    
1249 nigel 93 if (OK == (d == OP_PROP))
1250     {
1251     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1252     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1253     {
1254     active_count--; /* Remove non-match possibility */
1255     next_active_state--;
1256     }
1257     ADD_NEW(state_offset + count, 0);
1258     }
1259 nigel 77 }
1260     break;
1261    
1262     /*-----------------------------------------------------------------*/
1263     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1264     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1265 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1266 nigel 77 count = 2;
1267     goto QS2;
1268    
1269     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1270     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1271 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1272 nigel 77 count = 0;
1273    
1274     QS2:
1275    
1276     ADD_ACTIVE(state_offset + 2, 0);
1277 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1278 nigel 77 {
1279     const uschar *nptr = ptr + clen;
1280     int ncount = 0;
1281 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1282     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1283     {
1284     active_count--; /* Remove non-match possibility */
1285     next_active_state--;
1286     }
1287 nigel 77 while (nptr < end_subject)
1288     {
1289     int nd;
1290     int ndlen = 1;
1291     GETCHARLEN(nd, nptr, ndlen);
1292 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1293 nigel 77 ncount++;
1294     nptr += ndlen;
1295     }
1296     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1297     }
1298     break;
1299 ph10 151 #endif
1300 nigel 77
1301     /*-----------------------------------------------------------------*/
1302 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1303     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1304     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1305     count = 2;
1306     goto QS3;
1307    
1308     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1309     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1310     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1311     count = 0;
1312    
1313     QS3:
1314     ADD_ACTIVE(state_offset + 2, 0);
1315     if (clen > 0)
1316     {
1317     int ncount = 0;
1318     switch (c)
1319     {
1320     case 0x000b:
1321     case 0x000c:
1322     case 0x0085:
1323     case 0x2028:
1324     case 0x2029:
1325 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1326     goto ANYNL02;
1327    
1328     case 0x000d:
1329     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1330     /* Fall through */
1331    
1332     ANYNL02:
1333     case 0x000a:
1334 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1335     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1336     {
1337     active_count--; /* Remove non-match possibility */
1338     next_active_state--;
1339     }
1340     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1341     break;
1342 ph10 231
1343 nigel 93 default:
1344     break;
1345     }
1346     }
1347     break;
1348    
1349     /*-----------------------------------------------------------------*/
1350 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1351     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1352     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1353     count = 2;
1354     goto QS4;
1355    
1356     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1357     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1358     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1359     count = 0;
1360    
1361     QS4:
1362     ADD_ACTIVE(state_offset + 2, 0);
1363     if (clen > 0)
1364     {
1365 ph10 182 BOOL OK;
1366 ph10 178 switch (c)
1367     {
1368     case 0x000a:
1369     case 0x000b:
1370     case 0x000c:
1371     case 0x000d:
1372     case 0x0085:
1373     case 0x2028:
1374     case 0x2029:
1375     OK = TRUE;
1376     break;
1377 ph10 182
1378 ph10 178 default:
1379     OK = FALSE;
1380     break;
1381     }
1382     if (OK == (d == OP_VSPACE))
1383 ph10 182 {
1384 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1385     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1386     {
1387     active_count--; /* Remove non-match possibility */
1388     next_active_state--;
1389     }
1390     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1391     }
1392     }
1393     break;
1394    
1395     /*-----------------------------------------------------------------*/
1396     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1397     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1398     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1399     count = 2;
1400     goto QS5;
1401    
1402     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1403     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1404     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1405     count = 0;
1406    
1407     QS5:
1408     ADD_ACTIVE(state_offset + 2, 0);
1409     if (clen > 0)
1410     {
1411 ph10 182 BOOL OK;
1412 ph10 178 switch (c)
1413     {
1414     case 0x09: /* HT */
1415     case 0x20: /* SPACE */
1416     case 0xa0: /* NBSP */
1417     case 0x1680: /* OGHAM SPACE MARK */
1418     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1419     case 0x2000: /* EN QUAD */
1420     case 0x2001: /* EM QUAD */
1421     case 0x2002: /* EN SPACE */
1422     case 0x2003: /* EM SPACE */
1423     case 0x2004: /* THREE-PER-EM SPACE */
1424     case 0x2005: /* FOUR-PER-EM SPACE */
1425     case 0x2006: /* SIX-PER-EM SPACE */
1426     case 0x2007: /* FIGURE SPACE */
1427     case 0x2008: /* PUNCTUATION SPACE */
1428     case 0x2009: /* THIN SPACE */
1429     case 0x200A: /* HAIR SPACE */
1430     case 0x202f: /* NARROW NO-BREAK SPACE */
1431     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1432     case 0x3000: /* IDEOGRAPHIC SPACE */
1433     OK = TRUE;
1434     break;
1435 ph10 182
1436 ph10 178 default:
1437     OK = FALSE;
1438     break;
1439     }
1440 ph10 182
1441 ph10 178 if (OK == (d == OP_HSPACE))
1442 ph10 182 {
1443 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1444     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1445     {
1446     active_count--; /* Remove non-match possibility */
1447     next_active_state--;
1448     }
1449     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1450     }
1451     }
1452     break;
1453    
1454     /*-----------------------------------------------------------------*/
1455 ph10 151 #ifdef SUPPORT_UCP
1456 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1457     case OP_PROP_EXTRA + OP_TYPEUPTO:
1458     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1459 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1460 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1461 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1462 nigel 77 count = current_state->count; /* Number already matched */
1463     if (clen > 0)
1464     {
1465 nigel 87 BOOL OK;
1466     int category = _pcre_ucp_findprop(c, &chartype, &script);
1467     switch(code[4])
1468 nigel 77 {
1469 nigel 87 case PT_ANY:
1470     OK = TRUE;
1471     break;
1472    
1473     case PT_LAMP:
1474     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1475     break;
1476    
1477     case PT_GC:
1478     OK = category == code[5];
1479     break;
1480    
1481     case PT_PC:
1482     OK = chartype == code[5];
1483     break;
1484    
1485     case PT_SC:
1486     OK = script == code[5];
1487     break;
1488    
1489     /* Should never occur, but keep compilers from grumbling. */
1490    
1491     default:
1492     OK = codevalue != OP_PROP;
1493     break;
1494     }
1495    
1496     if (OK == (d == OP_PROP))
1497     {
1498 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1499     {
1500     active_count--; /* Remove non-match possibility */
1501     next_active_state--;
1502     }
1503 nigel 77 if (++count >= GET2(code, 1))
1504 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1505 nigel 77 else
1506     { ADD_NEW(state_offset, count); }
1507     }
1508     }
1509     break;
1510    
1511     /*-----------------------------------------------------------------*/
1512     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1513     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1514     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1515 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1516 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1517     { ADD_ACTIVE(state_offset + 4, 0); }
1518     count = current_state->count; /* Number already matched */
1519 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1520 nigel 77 {
1521     const uschar *nptr = ptr + clen;
1522     int ncount = 0;
1523 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1524     {
1525     active_count--; /* Remove non-match possibility */
1526     next_active_state--;
1527     }
1528 nigel 77 while (nptr < end_subject)
1529     {
1530     int nd;
1531     int ndlen = 1;
1532     GETCHARLEN(nd, nptr, ndlen);
1533 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1534 nigel 77 ncount++;
1535     nptr += ndlen;
1536     }
1537     if (++count >= GET2(code, 1))
1538     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1539     else
1540     { ADD_NEW_DATA(-state_offset, count, ncount); }
1541     }
1542     break;
1543 ph10 151 #endif
1544 nigel 77
1545 nigel 93 /*-----------------------------------------------------------------*/
1546     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1547     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1548     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1549     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1550     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1551     { ADD_ACTIVE(state_offset + 4, 0); }
1552     count = current_state->count; /* Number already matched */
1553     if (clen > 0)
1554     {
1555     int ncount = 0;
1556     switch (c)
1557     {
1558     case 0x000b:
1559     case 0x000c:
1560     case 0x0085:
1561     case 0x2028:
1562     case 0x2029:
1563 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1564     goto ANYNL03;
1565    
1566     case 0x000d:
1567     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1568     /* Fall through */
1569    
1570     ANYNL03:
1571     case 0x000a:
1572 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1573     {
1574     active_count--; /* Remove non-match possibility */
1575     next_active_state--;
1576     }
1577     if (++count >= GET2(code, 1))
1578     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1579     else
1580     { ADD_NEW_DATA(-state_offset, count, ncount); }
1581     break;
1582 ph10 231
1583 nigel 93 default:
1584     break;
1585     }
1586     }
1587     break;
1588    
1589 ph10 178 /*-----------------------------------------------------------------*/
1590     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1591     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1592     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1593     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1594     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1595     { ADD_ACTIVE(state_offset + 4, 0); }
1596     count = current_state->count; /* Number already matched */
1597     if (clen > 0)
1598     {
1599 ph10 182 BOOL OK;
1600 ph10 178 switch (c)
1601     {
1602     case 0x000a:
1603     case 0x000b:
1604     case 0x000c:
1605     case 0x000d:
1606     case 0x0085:
1607     case 0x2028:
1608     case 0x2029:
1609     OK = TRUE;
1610     break;
1611 ph10 182
1612 ph10 178 default:
1613     OK = FALSE;
1614     }
1615 ph10 182
1616 ph10 178 if (OK == (d == OP_VSPACE))
1617 ph10 182 {
1618 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1619     {
1620     active_count--; /* Remove non-match possibility */
1621     next_active_state--;
1622     }
1623     if (++count >= GET2(code, 1))
1624     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1625     else
1626     { ADD_NEW_DATA(-state_offset, count, 0); }
1627     }
1628     }
1629     break;
1630    
1631     /*-----------------------------------------------------------------*/
1632     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1633     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1634     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1635     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1636     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1637     { ADD_ACTIVE(state_offset + 4, 0); }
1638     count = current_state->count; /* Number already matched */
1639     if (clen > 0)
1640     {
1641 ph10 182 BOOL OK;
1642 ph10 178 switch (c)
1643     {
1644     case 0x09: /* HT */
1645     case 0x20: /* SPACE */
1646     case 0xa0: /* NBSP */
1647     case 0x1680: /* OGHAM SPACE MARK */
1648     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1649     case 0x2000: /* EN QUAD */
1650     case 0x2001: /* EM QUAD */
1651     case 0x2002: /* EN SPACE */
1652     case 0x2003: /* EM SPACE */
1653     case 0x2004: /* THREE-PER-EM SPACE */
1654     case 0x2005: /* FOUR-PER-EM SPACE */
1655     case 0x2006: /* SIX-PER-EM SPACE */
1656     case 0x2007: /* FIGURE SPACE */
1657     case 0x2008: /* PUNCTUATION SPACE */
1658     case 0x2009: /* THIN SPACE */
1659     case 0x200A: /* HAIR SPACE */
1660     case 0x202f: /* NARROW NO-BREAK SPACE */
1661     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1662     case 0x3000: /* IDEOGRAPHIC SPACE */
1663     OK = TRUE;
1664     break;
1665 ph10 182
1666 ph10 178 default:
1667     OK = FALSE;
1668     break;
1669     }
1670 ph10 182
1671 ph10 178 if (OK == (d == OP_HSPACE))
1672 ph10 182 {
1673 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1674     {
1675     active_count--; /* Remove non-match possibility */
1676     next_active_state--;
1677     }
1678     if (++count >= GET2(code, 1))
1679     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1680     else
1681     { ADD_NEW_DATA(-state_offset, count, 0); }
1682     }
1683     }
1684     break;
1685    
1686 nigel 77 /* ========================================================================== */
1687     /* These opcodes are followed by a character that is usually compared
1688     to the current subject character; it is loaded into d. We still get
1689     here even if there is no subject character, because in some cases zero
1690     repetitions are permitted. */
1691    
1692     /*-----------------------------------------------------------------*/
1693     case OP_CHAR:
1694     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1695     break;
1696    
1697     /*-----------------------------------------------------------------*/
1698     case OP_CHARNC:
1699     if (clen == 0) break;
1700    
1701     #ifdef SUPPORT_UTF8
1702     if (utf8)
1703     {
1704     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1705     {
1706 nigel 93 unsigned int othercase;
1707 nigel 77 if (c < 128) othercase = fcc[c]; else
1708    
1709     /* If we have Unicode property support, we can use it to test the
1710 nigel 87 other case of the character. */
1711 nigel 77
1712     #ifdef SUPPORT_UCP
1713 nigel 87 othercase = _pcre_ucp_othercase(c);
1714     #else
1715 nigel 93 othercase = NOTACHAR;
1716 nigel 77 #endif
1717    
1718     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1719     }
1720     }
1721     else
1722     #endif /* SUPPORT_UTF8 */
1723    
1724     /* Non-UTF-8 mode */
1725     {
1726     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1727     }
1728     break;
1729    
1730    
1731     #ifdef SUPPORT_UCP
1732     /*-----------------------------------------------------------------*/
1733     /* This is a tricky one because it can match more than one character.
1734     Find out how many characters to skip, and then set up a negative state
1735     to wait for them to pass before continuing. */
1736    
1737     case OP_EXTUNI:
1738 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1739 nigel 77 {
1740     const uschar *nptr = ptr + clen;
1741     int ncount = 0;
1742     while (nptr < end_subject)
1743     {
1744     int nclen = 1;
1745     GETCHARLEN(c, nptr, nclen);
1746 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1747 nigel 77 ncount++;
1748     nptr += nclen;
1749     }
1750     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1751     }
1752     break;
1753     #endif
1754    
1755     /*-----------------------------------------------------------------*/
1756 nigel 93 /* This is a tricky one, like EXTUNI, because it too can match more than one
1757     character (when CR is followed by LF). In this case, set up a negative
1758     state to wait for one character to pass before continuing. */
1759    
1760     case OP_ANYNL:
1761     if (clen > 0) switch(c)
1762     {
1763     case 0x000b:
1764     case 0x000c:
1765     case 0x0085:
1766     case 0x2028:
1767     case 0x2029:
1768 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1769    
1770     case 0x000a:
1771 nigel 93 ADD_NEW(state_offset + 1, 0);
1772     break;
1773 ph10 231
1774 nigel 93 case 0x000d:
1775     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1776     {
1777     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1778     }
1779     else
1780     {
1781     ADD_NEW(state_offset + 1, 0);
1782     }
1783     break;
1784     }
1785     break;
1786    
1787     /*-----------------------------------------------------------------*/
1788 ph10 178 case OP_NOT_VSPACE:
1789     if (clen > 0) switch(c)
1790     {
1791     case 0x000a:
1792     case 0x000b:
1793     case 0x000c:
1794     case 0x000d:
1795     case 0x0085:
1796     case 0x2028:
1797     case 0x2029:
1798     break;
1799 ph10 182
1800     default:
1801 ph10 178 ADD_NEW(state_offset + 1, 0);
1802     break;
1803     }
1804     break;
1805    
1806     /*-----------------------------------------------------------------*/
1807     case OP_VSPACE:
1808     if (clen > 0) switch(c)
1809     {
1810     case 0x000a:
1811     case 0x000b:
1812     case 0x000c:
1813     case 0x000d:
1814     case 0x0085:
1815     case 0x2028:
1816     case 0x2029:
1817     ADD_NEW(state_offset + 1, 0);
1818     break;
1819 ph10 182
1820 ph10 178 default: break;
1821     }
1822     break;
1823    
1824     /*-----------------------------------------------------------------*/
1825     case OP_NOT_HSPACE:
1826     if (clen > 0) switch(c)
1827     {
1828     case 0x09: /* HT */
1829     case 0x20: /* SPACE */
1830     case 0xa0: /* NBSP */
1831     case 0x1680: /* OGHAM SPACE MARK */
1832     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1833     case 0x2000: /* EN QUAD */
1834     case 0x2001: /* EM QUAD */
1835     case 0x2002: /* EN SPACE */
1836     case 0x2003: /* EM SPACE */
1837     case 0x2004: /* THREE-PER-EM SPACE */
1838     case 0x2005: /* FOUR-PER-EM SPACE */
1839     case 0x2006: /* SIX-PER-EM SPACE */
1840     case 0x2007: /* FIGURE SPACE */
1841     case 0x2008: /* PUNCTUATION SPACE */
1842     case 0x2009: /* THIN SPACE */
1843     case 0x200A: /* HAIR SPACE */
1844     case 0x202f: /* NARROW NO-BREAK SPACE */
1845     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1846     case 0x3000: /* IDEOGRAPHIC SPACE */
1847     break;
1848 ph10 182
1849     default:
1850 ph10 178 ADD_NEW(state_offset + 1, 0);
1851     break;
1852     }
1853     break;
1854    
1855     /*-----------------------------------------------------------------*/
1856     case OP_HSPACE:
1857     if (clen > 0) switch(c)
1858     {
1859     case 0x09: /* HT */
1860     case 0x20: /* SPACE */
1861     case 0xa0: /* NBSP */
1862     case 0x1680: /* OGHAM SPACE MARK */
1863     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1864     case 0x2000: /* EN QUAD */
1865     case 0x2001: /* EM QUAD */
1866     case 0x2002: /* EN SPACE */
1867     case 0x2003: /* EM SPACE */
1868     case 0x2004: /* THREE-PER-EM SPACE */
1869     case 0x2005: /* FOUR-PER-EM SPACE */
1870     case 0x2006: /* SIX-PER-EM SPACE */
1871     case 0x2007: /* FIGURE SPACE */
1872     case 0x2008: /* PUNCTUATION SPACE */
1873     case 0x2009: /* THIN SPACE */
1874     case 0x200A: /* HAIR SPACE */
1875     case 0x202f: /* NARROW NO-BREAK SPACE */
1876     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1877     case 0x3000: /* IDEOGRAPHIC SPACE */
1878     ADD_NEW(state_offset + 1, 0);
1879     break;
1880     }
1881     break;
1882    
1883     /*-----------------------------------------------------------------*/
1884 nigel 77 /* Match a negated single character. This is only used for one-byte
1885     characters, that is, we know that d < 256. The character we are
1886     checking (c) can be multibyte. */
1887    
1888     case OP_NOT:
1889     if (clen > 0)
1890     {
1891 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1892 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1893     }
1894     break;
1895    
1896     /*-----------------------------------------------------------------*/
1897     case OP_PLUS:
1898     case OP_MINPLUS:
1899 nigel 93 case OP_POSPLUS:
1900 nigel 77 case OP_NOTPLUS:
1901     case OP_NOTMINPLUS:
1902 nigel 93 case OP_NOTPOSPLUS:
1903 nigel 77 count = current_state->count; /* Already matched */
1904     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1905     if (clen > 0)
1906     {
1907 nigel 93 unsigned int otherd = NOTACHAR;
1908 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1909     {
1910     #ifdef SUPPORT_UTF8
1911 nigel 87 if (utf8 && d >= 128)
1912 nigel 77 {
1913     #ifdef SUPPORT_UCP
1914 nigel 87 otherd = _pcre_ucp_othercase(d);
1915 nigel 77 #endif /* SUPPORT_UCP */
1916     }
1917     else
1918     #endif /* SUPPORT_UTF8 */
1919     otherd = fcc[d];
1920     }
1921     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1922 nigel 93 {
1923     if (count > 0 &&
1924     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1925     {
1926     active_count--; /* Remove non-match possibility */
1927     next_active_state--;
1928     }
1929     count++;
1930     ADD_NEW(state_offset, count);
1931     }
1932 nigel 77 }
1933     break;
1934    
1935     /*-----------------------------------------------------------------*/
1936     case OP_QUERY:
1937     case OP_MINQUERY:
1938 nigel 93 case OP_POSQUERY:
1939 nigel 77 case OP_NOTQUERY:
1940     case OP_NOTMINQUERY:
1941 nigel 93 case OP_NOTPOSQUERY:
1942 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1943     if (clen > 0)
1944     {
1945 nigel 93 unsigned int otherd = NOTACHAR;
1946 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1947 nigel 77 {
1948     #ifdef SUPPORT_UTF8
1949 nigel 87 if (utf8 && d >= 128)
1950 nigel 77 {
1951     #ifdef SUPPORT_UCP
1952 nigel 87 otherd = _pcre_ucp_othercase(d);
1953 nigel 77 #endif /* SUPPORT_UCP */
1954     }
1955     else
1956     #endif /* SUPPORT_UTF8 */
1957     otherd = fcc[d];
1958     }
1959     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1960 nigel 93 {
1961     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1962     {
1963     active_count--; /* Remove non-match possibility */
1964     next_active_state--;
1965     }
1966     ADD_NEW(state_offset + dlen + 1, 0);
1967     }
1968 nigel 77 }
1969     break;
1970    
1971     /*-----------------------------------------------------------------*/
1972     case OP_STAR:
1973     case OP_MINSTAR:
1974 nigel 93 case OP_POSSTAR:
1975 nigel 77 case OP_NOTSTAR:
1976     case OP_NOTMINSTAR:
1977 nigel 93 case OP_NOTPOSSTAR:
1978 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1979     if (clen > 0)
1980     {
1981 nigel 93 unsigned int otherd = NOTACHAR;
1982 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1983 nigel 77 {
1984     #ifdef SUPPORT_UTF8
1985 nigel 87 if (utf8 && d >= 128)
1986 nigel 77 {
1987     #ifdef SUPPORT_UCP
1988 nigel 87 otherd = _pcre_ucp_othercase(d);
1989 nigel 77 #endif /* SUPPORT_UCP */
1990     }
1991     else
1992     #endif /* SUPPORT_UTF8 */
1993     otherd = fcc[d];
1994     }
1995     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1996 nigel 93 {
1997     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1998     {
1999     active_count--; /* Remove non-match possibility */
2000     next_active_state--;
2001     }
2002     ADD_NEW(state_offset, 0);
2003     }
2004 nigel 77 }
2005     break;
2006    
2007     /*-----------------------------------------------------------------*/
2008     case OP_EXACT:
2009 nigel 93 case OP_NOTEXACT:
2010     count = current_state->count; /* Number already matched */
2011     if (clen > 0)
2012     {
2013     unsigned int otherd = NOTACHAR;
2014     if ((ims & PCRE_CASELESS) != 0)
2015     {
2016     #ifdef SUPPORT_UTF8
2017     if (utf8 && d >= 128)
2018     {
2019     #ifdef SUPPORT_UCP
2020     otherd = _pcre_ucp_othercase(d);
2021     #endif /* SUPPORT_UCP */
2022     }
2023     else
2024     #endif /* SUPPORT_UTF8 */
2025     otherd = fcc[d];
2026     }
2027     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2028     {
2029     if (++count >= GET2(code, 1))
2030     { ADD_NEW(state_offset + dlen + 3, 0); }
2031     else
2032     { ADD_NEW(state_offset, count); }
2033     }
2034     }
2035     break;
2036    
2037     /*-----------------------------------------------------------------*/
2038 nigel 77 case OP_UPTO:
2039     case OP_MINUPTO:
2040 nigel 93 case OP_POSUPTO:
2041 nigel 77 case OP_NOTUPTO:
2042     case OP_NOTMINUPTO:
2043 nigel 93 case OP_NOTPOSUPTO:
2044     ADD_ACTIVE(state_offset + dlen + 3, 0);
2045 nigel 77 count = current_state->count; /* Number already matched */
2046     if (clen > 0)
2047     {
2048 nigel 93 unsigned int otherd = NOTACHAR;
2049 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2050     {
2051     #ifdef SUPPORT_UTF8
2052 nigel 87 if (utf8 && d >= 128)
2053 nigel 77 {
2054     #ifdef SUPPORT_UCP
2055 nigel 87 otherd = _pcre_ucp_othercase(d);
2056 nigel 77 #endif /* SUPPORT_UCP */
2057     }
2058     else
2059     #endif /* SUPPORT_UTF8 */
2060     otherd = fcc[d];
2061     }
2062     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2063     {
2064 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2065     {
2066     active_count--; /* Remove non-match possibility */
2067     next_active_state--;
2068     }
2069 nigel 77 if (++count >= GET2(code, 1))
2070     { ADD_NEW(state_offset + dlen + 3, 0); }
2071     else
2072     { ADD_NEW(state_offset, count); }
2073     }
2074     }
2075     break;
2076    
2077    
2078     /* ========================================================================== */
2079     /* These are the class-handling opcodes */
2080    
2081     case OP_CLASS:
2082     case OP_NCLASS:
2083     case OP_XCLASS:
2084     {
2085     BOOL isinclass = FALSE;
2086     int next_state_offset;
2087     const uschar *ecode;
2088    
2089     /* For a simple class, there is always just a 32-byte table, and we
2090     can set isinclass from it. */
2091    
2092     if (codevalue != OP_XCLASS)
2093     {
2094     ecode = code + 33;
2095     if (clen > 0)
2096     {
2097     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2098     ((code[1 + c/8] & (1 << (c&7))) != 0);
2099     }
2100     }
2101    
2102     /* An extended class may have a table or a list of single characters,
2103     ranges, or both, and it may be positive or negative. There's a
2104     function that sorts all this out. */
2105    
2106     else
2107     {
2108     ecode = code + GET(code, 1);
2109     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2110     }
2111    
2112     /* At this point, isinclass is set for all kinds of class, and ecode
2113     points to the byte after the end of the class. If there is a
2114     quantifier, this is where it will be. */
2115    
2116     next_state_offset = ecode - start_code;
2117    
2118     switch (*ecode)
2119     {
2120     case OP_CRSTAR:
2121     case OP_CRMINSTAR:
2122     ADD_ACTIVE(next_state_offset + 1, 0);
2123     if (isinclass) { ADD_NEW(state_offset, 0); }
2124     break;
2125    
2126     case OP_CRPLUS:
2127     case OP_CRMINPLUS:
2128     count = current_state->count; /* Already matched */
2129     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2130     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2131     break;
2132    
2133     case OP_CRQUERY:
2134     case OP_CRMINQUERY:
2135     ADD_ACTIVE(next_state_offset + 1, 0);
2136     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2137     break;
2138    
2139     case OP_CRRANGE:
2140     case OP_CRMINRANGE:
2141     count = current_state->count; /* Already matched */
2142     if (count >= GET2(ecode, 1))
2143     { ADD_ACTIVE(next_state_offset + 5, 0); }
2144     if (isinclass)
2145     {
2146 nigel 91 int max = GET2(ecode, 3);
2147     if (++count >= max && max != 0) /* Max 0 => no limit */
2148 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2149     else
2150     { ADD_NEW(state_offset, count); }
2151     }
2152     break;
2153    
2154     default:
2155     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2156     break;
2157     }
2158     }
2159     break;
2160    
2161     /* ========================================================================== */
2162     /* These are the opcodes for fancy brackets of various kinds. We have
2163 ph10 345 to use recursion in order to handle them. The "always failing" assertion
2164 ph10 341 (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2165     though the other "backtracking verbs" are not supported. */
2166 ph10 345
2167 ph10 341 case OP_FAIL:
2168 ph10 345 break;
2169 nigel 77
2170     case OP_ASSERT:
2171     case OP_ASSERT_NOT:
2172     case OP_ASSERTBACK:
2173     case OP_ASSERTBACK_NOT:
2174     {
2175     int rc;
2176     int local_offsets[2];
2177     int local_workspace[1000];
2178     const uschar *endasscode = code + GET(code, 1);
2179    
2180     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2181    
2182     rc = internal_dfa_exec(
2183     md, /* static match data */
2184     code, /* this subexpression's code */
2185     ptr, /* where we currently are */
2186     ptr - start_subject, /* start offset */
2187     local_offsets, /* offset vector */
2188     sizeof(local_offsets)/sizeof(int), /* size of same */
2189     local_workspace, /* workspace vector */
2190     sizeof(local_workspace)/sizeof(int), /* size of same */
2191     ims, /* the current ims flags */
2192     rlevel, /* function recursion level */
2193     recursing); /* pass on regex recursion */
2194    
2195     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2196     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2197     }
2198     break;
2199    
2200     /*-----------------------------------------------------------------*/
2201     case OP_COND:
2202 nigel 93 case OP_SCOND:
2203 nigel 77 {
2204     int local_offsets[1000];
2205     int local_workspace[1000];
2206     int condcode = code[LINK_SIZE+1];
2207    
2208 nigel 93 /* Back reference conditions are not supported */
2209 nigel 77
2210 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2211    
2212     /* The DEFINE condition is always false */
2213    
2214     if (condcode == OP_DEF)
2215 nigel 77 {
2216 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2217     }
2218    
2219     /* The only supported version of OP_RREF is for the value RREF_ANY,
2220     which means "test if in any recursion". We can't test for specifically
2221     recursed groups. */
2222    
2223     else if (condcode == OP_RREF)
2224     {
2225 nigel 77 int value = GET2(code, LINK_SIZE+2);
2226 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2227 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2228     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2229     }
2230    
2231     /* Otherwise, the condition is an assertion */
2232    
2233     else
2234     {
2235     int rc;
2236     const uschar *asscode = code + LINK_SIZE + 1;
2237     const uschar *endasscode = asscode + GET(asscode, 1);
2238    
2239     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2240    
2241     rc = internal_dfa_exec(
2242     md, /* fixed match data */
2243     asscode, /* this subexpression's code */
2244     ptr, /* where we currently are */
2245     ptr - start_subject, /* start offset */
2246     local_offsets, /* offset vector */
2247     sizeof(local_offsets)/sizeof(int), /* size of same */
2248     local_workspace, /* workspace vector */
2249     sizeof(local_workspace)/sizeof(int), /* size of same */
2250     ims, /* the current ims flags */
2251     rlevel, /* function recursion level */
2252     recursing); /* pass on regex recursion */
2253    
2254     if ((rc >= 0) ==
2255     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2256     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2257     else
2258     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2259     }
2260     }
2261     break;
2262    
2263     /*-----------------------------------------------------------------*/
2264     case OP_RECURSE:
2265     {
2266     int local_offsets[1000];
2267     int local_workspace[1000];
2268     int rc;
2269    
2270     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2271     recursing + 1));
2272    
2273     rc = internal_dfa_exec(
2274     md, /* fixed match data */
2275     start_code + GET(code, 1), /* this subexpression's code */
2276     ptr, /* where we currently are */
2277     ptr - start_subject, /* start offset */
2278     local_offsets, /* offset vector */
2279     sizeof(local_offsets)/sizeof(int), /* size of same */
2280     local_workspace, /* workspace vector */
2281     sizeof(local_workspace)/sizeof(int), /* size of same */
2282     ims, /* the current ims flags */
2283     rlevel, /* function recursion level */
2284     recursing + 1); /* regex recurse level */
2285    
2286     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2287     recursing + 1, rc));
2288    
2289     /* Ran out of internal offsets */
2290    
2291     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2292    
2293     /* For each successful matched substring, set up the next state with a
2294     count of characters to skip before trying it. Note that the count is in
2295     characters, not bytes. */
2296    
2297     if (rc > 0)
2298     {
2299     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2300     {
2301     const uschar *p = start_subject + local_offsets[rc];
2302     const uschar *pp = start_subject + local_offsets[rc+1];
2303     int charcount = local_offsets[rc+1] - local_offsets[rc];
2304     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2305     if (charcount > 0)
2306     {
2307     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2308     }
2309     else
2310     {
2311     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2312     }
2313     }
2314     }
2315     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2316     }
2317     break;
2318    
2319     /*-----------------------------------------------------------------*/
2320     case OP_ONCE:
2321     {
2322     int local_offsets[2];
2323     int local_workspace[1000];
2324    
2325     int rc = internal_dfa_exec(
2326     md, /* fixed match data */
2327     code, /* this subexpression's code */
2328     ptr, /* where we currently are */
2329     ptr - start_subject, /* start offset */
2330     local_offsets, /* offset vector */
2331     sizeof(local_offsets)/sizeof(int), /* size of same */
2332     local_workspace, /* workspace vector */
2333     sizeof(local_workspace)/sizeof(int), /* size of same */
2334     ims, /* the current ims flags */
2335     rlevel, /* function recursion level */
2336     recursing); /* pass on regex recursion */
2337    
2338     if (rc >= 0)
2339     {
2340     const uschar *end_subpattern = code;
2341     int charcount = local_offsets[1] - local_offsets[0];
2342     int next_state_offset, repeat_state_offset;
2343    
2344     do { end_subpattern += GET(end_subpattern, 1); }
2345     while (*end_subpattern == OP_ALT);
2346     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2347    
2348     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2349     arrange for the repeat state also to be added to the relevant list.
2350     Calculate the offset, or set -1 for no repeat. */
2351    
2352     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2353     *end_subpattern == OP_KETRMIN)?
2354     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2355    
2356     /* If we have matched an empty string, add the next state at the
2357     current character pointer. This is important so that the duplicate
2358     checking kicks in, which is what breaks infinite loops that match an
2359     empty string. */
2360    
2361     if (charcount == 0)
2362     {
2363     ADD_ACTIVE(next_state_offset, 0);
2364     }
2365    
2366     /* Optimization: if there are no more active states, and there
2367     are no new states yet set up, then skip over the subject string
2368     right here, to save looping. Otherwise, set up the new state to swing
2369     into action when the end of the substring is reached. */
2370    
2371     else if (i + 1 >= active_count && new_count == 0)
2372     {
2373     ptr += charcount;
2374     clen = 0;
2375     ADD_NEW(next_state_offset, 0);
2376    
2377     /* If we are adding a repeat state at the new character position,
2378     we must fudge things so that it is the only current state.
2379     Otherwise, it might be a duplicate of one we processed before, and
2380     that would cause it to be skipped. */
2381    
2382     if (repeat_state_offset >= 0)
2383     {
2384     next_active_state = active_states;
2385     active_count = 0;
2386     i = -1;
2387     ADD_ACTIVE(repeat_state_offset, 0);
2388     }
2389     }
2390     else
2391     {
2392     const uschar *p = start_subject + local_offsets[0];
2393     const uschar *pp = start_subject + local_offsets[1];
2394     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2395     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2396     if (repeat_state_offset >= 0)
2397     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2398     }
2399    
2400     }
2401     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2402     }
2403     break;
2404    
2405    
2406     /* ========================================================================== */
2407     /* Handle callouts */
2408    
2409     case OP_CALLOUT:
2410     if (pcre_callout != NULL)
2411     {
2412     int rrc;
2413     pcre_callout_block cb;
2414     cb.version = 1; /* Version 1 of the callout block */
2415     cb.callout_number = code[1];
2416     cb.offset_vector = offsets;
2417 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2418 nigel 77 cb.subject_length = end_subject - start_subject;
2419     cb.start_match = current_subject - start_subject;
2420     cb.current_position = ptr - start_subject;
2421     cb.pattern_position = GET(code, 2);
2422     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2423     cb.capture_top = 1;
2424     cb.capture_last = -1;
2425     cb.callout_data = md->callout_data;
2426     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2427     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2428     }
2429     break;
2430    
2431    
2432     /* ========================================================================== */
2433     default: /* Unsupported opcode */
2434     return PCRE_ERROR_DFA_UITEM;
2435     }
2436    
2437     NEXT_ACTIVE_STATE: continue;
2438    
2439     } /* End of loop scanning active states */
2440    
2441     /* We have finished the processing at the current subject character. If no
2442     new states have been set for the next character, we have found all the
2443     matches that we are going to find. If we are at the top level and partial
2444     matching has been requested, check for appropriate conditions. */
2445    
2446     if (new_count <= 0)
2447     {
2448     if (match_count < 0 && /* No matches found */
2449     rlevel == 1 && /* Top level match function */
2450     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2451     ptr >= end_subject && /* Reached end of subject */
2452     ptr > current_subject) /* Matched non-empty string */
2453     {
2454     if (offsetcount >= 2)
2455     {
2456     offsets[0] = current_subject - start_subject;
2457     offsets[1] = end_subject - start_subject;
2458     }
2459     match_count = PCRE_ERROR_PARTIAL;
2460     }
2461    
2462     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2463     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2464     rlevel*2-2, SP));
2465 nigel 91 break; /* In effect, "return", but see the comment below */
2466 nigel 77 }
2467    
2468     /* One or more states are active for the next character. */
2469    
2470     ptr += clen; /* Advance to next subject character */
2471     } /* Loop to move along the subject string */
2472    
2473 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2474     if we use "return" above, we have compiler trouble. Some compilers warn if
2475     there's nothing here because they think the function doesn't return a value. On
2476     the other hand, if we put a dummy statement here, some more clever compilers
2477     complain that it can't be reached. Sigh. */
2478 nigel 77
2479 nigel 91 return match_count;
2480 nigel 77 }
2481    
2482    
2483    
2484    
2485     /*************************************************
2486     * Execute a Regular Expression - DFA engine *
2487     *************************************************/
2488    
2489     /* This external function applies a compiled re to a subject string using a DFA
2490     engine. This function calls the internal function multiple times if the pattern
2491     is not anchored.
2492    
2493     Arguments:
2494     argument_re points to the compiled expression
2495 ph10 97 extra_data points to extra data or is NULL
2496 nigel 77 subject points to the subject string
2497     length length of subject string (may contain binary zeros)
2498     start_offset where to start in the subject string
2499     options option bits
2500     offsets vector of match offsets
2501     offsetcount size of same
2502     workspace workspace vector
2503     wscount size of same
2504    
2505     Returns: > 0 => number of match offset pairs placed in offsets
2506     = 0 => offsets overflowed; longest matches are present
2507     -1 => failed to match
2508     < -1 => some kind of unexpected problem
2509     */
2510    
2511 ph10 145 PCRE_EXP_DEFN int
2512 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2513     const char *subject, int length, int start_offset, int options, int *offsets,
2514     int offsetcount, int *workspace, int wscount)
2515     {
2516     real_pcre *re = (real_pcre *)argument_re;
2517     dfa_match_data match_block;
2518 nigel 91 dfa_match_data *md = &match_block;
2519 nigel 77 BOOL utf8, anchored, startline, firstline;
2520     const uschar *current_subject, *end_subject, *lcc;
2521    
2522     pcre_study_data internal_study;
2523     const pcre_study_data *study = NULL;
2524     real_pcre internal_re;
2525    
2526     const uschar *req_byte_ptr;
2527     const uschar *start_bits = NULL;
2528     BOOL first_byte_caseless = FALSE;
2529     BOOL req_byte_caseless = FALSE;
2530     int first_byte = -1;
2531     int req_byte = -1;
2532     int req_byte2 = -1;
2533 nigel 91 int newline;
2534 nigel 77
2535     /* Plausibility checks */
2536    
2537     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2538     if (re == NULL || subject == NULL || workspace == NULL ||
2539     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2540     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2541     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2542    
2543     /* We need to find the pointer to any study data before we test for byte
2544     flipping, so we scan the extra_data block first. This may set two fields in the
2545     match block, so we must initialize them beforehand. However, the other fields
2546     in the match block must not be set until after the byte flipping. */
2547    
2548 nigel 91 md->tables = re->tables;
2549     md->callout_data = NULL;
2550 nigel 77
2551     if (extra_data != NULL)
2552     {
2553     unsigned int flags = extra_data->flags;
2554     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2555     study = (const pcre_study_data *)extra_data->study_data;
2556     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2557 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2558     return PCRE_ERROR_DFA_UMLIMIT;
2559 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2560 nigel 91 md->callout_data = extra_data->callout_data;
2561 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2562 nigel 91 md->tables = extra_data->tables;
2563 nigel 77 }
2564    
2565     /* Check that the first field in the block is the magic number. If it is not,
2566     test for a regex that was compiled on a host of opposite endianness. If this is
2567     the case, flipped values are put in internal_re and internal_study if there was
2568     study data too. */
2569    
2570     if (re->magic_number != MAGIC_NUMBER)
2571     {
2572     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2573     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2574     if (study != NULL) study = &internal_study;
2575     }
2576    
2577     /* Set some local values */
2578    
2579     current_subject = (const unsigned char *)subject + start_offset;
2580     end_subject = (const unsigned char *)subject + length;
2581     req_byte_ptr = current_subject - 1;
2582    
2583 nigel 91 #ifdef SUPPORT_UTF8
2584 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2585 nigel 91 #else
2586     utf8 = FALSE;
2587     #endif
2588 nigel 77
2589 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2590     (re->options & PCRE_ANCHORED) != 0;
2591    
2592 nigel 77 /* The remaining fixed data for passing around. */
2593    
2594 nigel 91 md->start_code = (const uschar *)argument_re +
2595 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2596 nigel 91 md->start_subject = (const unsigned char *)subject;
2597     md->end_subject = end_subject;
2598     md->moptions = options;
2599     md->poptions = re->options;
2600 nigel 77
2601 ph10 231 /* If the BSR option is not set at match time, copy what was set
2602     at compile time. */
2603    
2604     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2605     {
2606     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2607     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2608     #ifdef BSR_ANYCRLF
2609     else md->moptions |= PCRE_BSR_ANYCRLF;
2610 ph10 243 #endif
2611     }
2612 ph10 231
2613 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2614     nothing is set at run time, whatever was used at compile time applies. */
2615 nigel 91
2616 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2617 nigel 93 PCRE_NEWLINE_BITS)
2618 nigel 91 {
2619 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2620 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2621     case PCRE_NEWLINE_LF: newline = '\n'; break;
2622     case PCRE_NEWLINE_CR+
2623     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2624 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2625 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2626 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2627 nigel 91 }
2628    
2629 ph10 149 if (newline == -2)
2630 nigel 91 {
2631 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2632     }
2633     else if (newline < 0)
2634     {
2635 nigel 93 md->nltype = NLTYPE_ANY;
2636 nigel 91 }
2637     else
2638     {
2639 nigel 93 md->nltype = NLTYPE_FIXED;
2640     if (newline > 255)
2641     {
2642     md->nllen = 2;
2643     md->nl[0] = (newline >> 8) & 255;
2644     md->nl[1] = newline & 255;
2645     }
2646     else
2647     {
2648     md->nllen = 1;
2649     md->nl[0] = newline;
2650     }
2651 nigel 91 }
2652    
2653 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2654     back the character offset. */
2655    
2656     #ifdef SUPPORT_UTF8
2657     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2658     {
2659     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2660     return PCRE_ERROR_BADUTF8;
2661     if (start_offset > 0 && start_offset < length)
2662     {
2663     int tb = ((uschar *)subject)[start_offset];
2664     if (tb > 127)
2665     {
2666     tb &= 0xc0;
2667     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2668     }
2669     }
2670     }
2671     #endif
2672    
2673     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2674     is a feature that makes it possible to save compiled regex and re-use them
2675     in other programs later. */
2676    
2677 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2678 nigel 77
2679     /* The lower casing table and the "must be at the start of a line" flag are
2680     used in a loop when finding where to start. */
2681    
2682 nigel 91 lcc = md->tables + lcc_offset;
2683 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2684 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2685    
2686     /* Set up the first character to match, if available. The first_byte value is
2687     never set for an anchored regular expression, but the anchoring may be forced
2688     at run time, so we have to test for anchoring. The first char may be unset for
2689     an unanchored pattern, of course. If there's no first char and the pattern was
2690     studied, there may be a bitmap of possible first characters. */
2691    
2692     if (!anchored)
2693     {
2694 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2695 nigel 77 {
2696     first_byte = re->first_byte & 255;
2697     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2698     first_byte = lcc[first_byte];
2699     }
2700     else
2701     {
2702     if (startline && study != NULL &&
2703     (study->options & PCRE_STUDY_MAPPED) != 0)
2704     start_bits = study->start_bits;
2705     }
2706     }
2707    
2708     /* For anchored or unanchored matches, there may be a "last known required
2709     character" set. */
2710    
2711 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2712 nigel 77 {
2713     req_byte = re->req_byte & 255;
2714     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2715 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2716 nigel 77 }
2717    
2718     /* Call the main matching function, looping for a non-anchored regex after a
2719     failed match. Unless restarting, optimize by moving to the first match
2720     character if possible, when not anchored. Then unless wanting a partial match,
2721     check for a required later character. */
2722    
2723     for (;;)
2724     {
2725     int rc;
2726    
2727     if ((options & PCRE_DFA_RESTART) == 0)
2728     {
2729     const uschar *save_end_subject = end_subject;
2730    
2731     /* Advance to a unique first char if possible. If firstline is TRUE, the
2732     start of the match is constrained to the first line of a multiline string.
2733 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2734     scanning at a newline. If the match fails at the newline, later code breaks
2735     this loop. */
2736 nigel 77
2737     if (firstline)
2738     {
2739     const uschar *t = current_subject;
2740 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2741 nigel 77 end_subject = t;
2742     }
2743    
2744     if (first_byte >= 0)
2745     {
2746     if (first_byte_caseless)
2747     while (current_subject < end_subject &&
2748     lcc[*current_subject] != first_byte)
2749     current_subject++;
2750     else
2751     while (current_subject < end_subject && *current_subject != first_byte)
2752     current_subject++;
2753     }
2754    
2755 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2756 nigel 77
2757     else if (startline)
2758     {
2759 nigel 93 if (current_subject > md->start_subject + start_offset)
2760 nigel 77 {
2761 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2762 nigel 77 current_subject++;
2763 ph10 130
2764 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2765     ANYCRLF, and we are now at a LF, advance the match position by one more
2766     character. */
2767 ph10 134
2768 ph10 130 if (current_subject[-1] == '\r' &&
2769 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2770 ph10 130 current_subject < end_subject &&
2771     *current_subject == '\n')
2772     current_subject++;
2773 nigel 77 }
2774     }
2775    
2776     /* Or to a non-unique first char after study */
2777    
2778     else if (start_bits != NULL)
2779     {
2780     while (current_subject < end_subject)
2781     {
2782     register unsigned int c = *current_subject;
2783     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2784     else break;
2785     }
2786     }
2787    
2788     /* Restore fudged end_subject */
2789    
2790     end_subject = save_end_subject;
2791     }
2792    
2793     /* If req_byte is set, we know that that character must appear in the subject
2794     for the match to succeed. If the first character is set, req_byte must be
2795     later in the subject; otherwise the test starts at the match point. This
2796     optimization can save a huge amount of work in patterns with nested unlimited
2797     repeats that aren't going to match. Writing separate code for cased/caseless
2798     versions makes it go faster, as does using an autoincrement and backing off
2799     on a match.
2800    
2801     HOWEVER: when the subject string is very, very long, searching to its end can
2802     take a long time, and give bad performance on quite ordinary patterns. This
2803     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2804     don't do this when the string is sufficiently long.
2805    
2806     ALSO: this processing is disabled when partial matching is requested.
2807     */
2808    
2809     if (req_byte >= 0 &&
2810     end_subject - current_subject < REQ_BYTE_MAX &&
2811     (options & PCRE_PARTIAL) == 0)
2812     {
2813     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2814    
2815     /* We don't need to repeat the search if we haven't yet reached the
2816     place we found it at last time. */
2817    
2818     if (p > req_byte_ptr)
2819     {
2820     if (req_byte_caseless)
2821     {
2822     while (p < end_subject)
2823     {
2824     register int pp = *p++;
2825     if (pp == req_byte || pp == req_byte2) { p--; break; }
2826     }
2827     }
2828     else
2829     {
2830     while (p < end_subject)
2831     {
2832     if (*p++ == req_byte) { p--; break; }
2833     }
2834     }
2835    
2836     /* If we can't find the required character, break the matching loop,
2837     which will cause a return of PCRE_ERROR_NOMATCH. */
2838    
2839     if (p >= end_subject) break;
2840    
2841     /* If we have found the required character, save the point where we
2842     found it, so that we don't search again next time round the loop if
2843     the start hasn't passed this character yet. */
2844    
2845     req_byte_ptr = p;
2846     }
2847     }
2848    
2849     /* OK, now we can do the business */
2850    
2851     rc = internal_dfa_exec(
2852 nigel 91 md, /* fixed match data */
2853     md->start_code, /* this subexpression's code */
2854     current_subject, /* where we currently are */
2855     start_offset, /* start offset in subject */
2856     offsets, /* offset vector */
2857     offsetcount, /* size of same */
2858     workspace, /* workspace vector */
2859     wscount, /* size of same */
2860 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2861 nigel 91 0, /* function recurse level */
2862     0); /* regex recurse level */
2863 nigel 77
2864     /* Anything other than "no match" means we are done, always; otherwise, carry
2865     on only if not anchored. */
2866    
2867     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2868    
2869     /* Advance to the next subject character unless we are at the end of a line
2870     and firstline is set. */
2871    
2872 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2873 nigel 77 current_subject++;
2874     if (utf8)
2875     {
2876     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2877     current_subject++;
2878     }
2879     if (current_subject > end_subject) break;
2880    
2881 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2882 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2883     or ANY or ANYCRLF, advance the match position by one more character. */
2884 nigel 93
2885     if (current_subject[-1] == '\r' &&
2886 ph10 226 current_subject < end_subject &&
2887     *current_subject == '\n' &&
2888 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2889 ph10 226 (md->nltype == NLTYPE_ANY ||
2890     md->nltype == NLTYPE_ANYCRLF ||
2891     md->nllen == 2))
2892 nigel 93 current_subject++;
2893    
2894     } /* "Bumpalong" loop */
2895    
2896 nigel 77 return PCRE_ERROR_NOMATCH;
2897     }
2898    
2899     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12