/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 428 - (hide annotations) (download)
Mon Aug 31 17:10:26 2009 UTC (5 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 99289 byte(s)
Further partial match change: add PCRE_PARTIAL_HARD and make more intuitive.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 383 Copyright (c) 1997-2009 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 200 #ifdef HAVE_CONFIG_H
49 ph10 236 #include "config.h"
50 ph10 200 #endif
51 ph10 199
/* Bind the generic names NLBLOCK, PSSTART and PSEND to the identifiers used in
this file: "md" is the match-data pointer parameter of internal_dfa_exec(), and
"start_subject"/"end_subject" are fields of that block. They are defined ahead
of the pcre_internal.h include - presumably because macros declared there (such
as IS_NEWLINE and WAS_NEWLINE, which are called below with only a pointer
argument) expand in terms of them; confirm against that header. */
52 nigel 93 #define NLBLOCK md /* Block containing newline information */
53     #define PSSTART start_subject /* Field containing processed string start */
54     #define PSEND end_subject /* Field containing processed string end */
55    
56 nigel 77 #include "pcre_internal.h"
57    
58    
/* Padding string used to indent debugging output. The debug print calls in
this file output it with "%.*s" and a precision of rlevel*2-2, so the string has
to be at least as wide as the deepest indentation that should be visible; for
deeper recursion levels the indentation is simply capped at this width. A
one-character string would cap every level at a single column. */

#define SP "                                "
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
/* Offsets added to the OP_TYPESTAR family of opcodes when their argument is
one of the special item types (\p/\P, \X, \R, \h/\H, \v/\V), turning them into
distinct pseudo-opcodes for the main switch. Each block sits 20 above the
previous one, which should be enough room for one family of repeat opcodes. The
resulting values are never stored in compiled code, so they need not fit in a
byte; the first block starts at 300 to stay well clear of the real opcodes. */

#define OP_PROP_EXTRA    300
#define OP_EXTUNI_EXTRA  (OP_PROP_EXTRA + 20)
#define OP_ANYNL_EXTRA   (OP_EXTUNI_EXTRA + 20)
#define OP_HSPACE_EXTRA  (OP_ANYNL_EXTRA + 20)
#define OP_VSPACE_EXTRA  (OP_HSPACE_EXTRA + 20)
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes it possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified. */
86 nigel 77
/* coptable is indexed by opcode value. A zero entry means the opcode has no
inline character to preload. A non-zero entry is the byte offset from the
opcode to that character: the matcher reads it from code + coptable[opcode]
before dispatching on the opcode. The entries of 3 belong to the upto/minupto/
exact forms - presumably skipping a 2-byte repeat count; confirm against the
opcode layout in pcre_internal.h. The order of entries must track the opcode
numbering exactly. */
87 ph10 327 static const uschar coptable[] = {
88 nigel 77 0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
134     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136 nigel 77 };
137    
138     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139     and \w */
140    
/* toptable1 is indexed by opcode and supplies the ctypes bit to select. The
matcher accepts a character c for a type opcode op when
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0. The six leading zeros cover
the opcodes that precede \D; the final pair makes the selected bits empty for
OP_ANY and OP_ALLANY. */
141 ph10 327 static const uschar toptable1[] = {
142 ph10 168 0, 0, 0, 0, 0, 0,
143 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
144     ctype_space, ctype_space, /* \S, \s */
145     ctype_word, ctype_word, /* \W, \w */
146 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
147 nigel 77 };
148    
/* toptable2 is indexed by opcode and supplies the value XORed with the bits
selected through toptable1, in the test
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0. For the negated types the
entry repeats the ctype bit, so the result is non-zero only when the bit is
absent from ctypes[c]; for the positive types it is 0, so the result is
non-zero only when the bit is present. The trailing 1s make the test always
succeed for OP_ANY and OP_ALLANY. */
149 ph10 327 static const uschar toptable2[] = {
150 ph10 168 0, 0, 0, 0, 0, 0,
151 nigel 77 ctype_digit, 0, /* \D, \d */
152     ctype_space, 0, /* \S, \s */
153     ctype_word, 0, /* \W, \w */
154 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
155 nigel 77 };
156    
157    
/* One entry in a state list: the data describing a single active path through
the match tree. Every member has to be an int, because these blocks are laid
out directly in the caller-supplied workspace, which is a vector of ints. */

struct stateblock {
  int offset;     /* Offset to opcode; negative while the state is held off */
  int count;      /* Count for repeats */
  int ims;        /* ims flag bits in force for this state */
  int data;       /* Extra data for the states that need it */
};

typedef struct stateblock stateblock;

/* Number of workspace ints occupied by one state block */

#define INTS_PER_STATEBLOCK (sizeof(struct stateblock)/sizeof(int))
171    
172    
173     #ifdef DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Debugging aid: write a byte string to a stream. Printable bytes are written
unchanged; any other byte is written as a backslash-x escape with two lower-case
hex digits.

Arguments:
  p           points to string
  length      number of bytes; nothing is written if this is <= 0
  f           where to print

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
const unsigned char *stop;

if (length <= 0) return;

for (stop = p + length; p < stop; p++)
  {
  int ch = *p;
  if (isprint(ch))
    fprintf(f, "%c", ch);
  else
    fprintf(f, "\\x%02x", ch);
  }
}
200     #endif
201    
202    
203    
204     /*************************************************
205     * Execute a Regular Expression - DFA engine *
206     *************************************************/
207    
208     /* This internal function applies a compiled pattern to a subject string,
209     starting at a given point, using a DFA engine. This function is called from the
210     external one, possibly multiple times if the pattern is not anchored. The
211     function calls itself recursively for some kinds of subpattern.
212    
213     Arguments:
214     md the match_data block with fixed information
215     this_start_code the opening bracket of this subexpression's code
216     current_subject where we currently are in the subject string
217     start_offset start offset in the subject string
218     offsets vector to contain the matching string offsets
219     offsetcount size of same
220     workspace vector of workspace
221     wscount size of same
222     ims the current ims flags
223     rlevel function call recursion level
224     recursing regex recursive call level
225    
226 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
227 ph10 341 = 0 => offsets overflowed; longest matches are present
228 nigel 77 -1 => failed to match
229     < -1 => some kind of unexpected problem
230    
231     The following macros are used for adding states to the two state vectors (one
232     for the current character, one for the following character). */
233    
/* All four macros below rely on variables of the function they are expanded
in: the list counter (active_count or new_count), the list capacity wscount,
the insertion pointer (next_active_state or next_new_state), the current ims
value, and rlevel for the debug print. Each one expands to an if/else with no
terminating semicolon - the caller supplies it - and the else arm returns
PCRE_ERROR_DFA_WSSIZE from the enclosing function when the list is full. The
counter is post-incremented by the capacity test itself, so it is bumped even
on the failing path just before that return. Because the expansion is a bare
if/else, a use inside another if should be enclosed in braces.

ADD_ACTIVE(x,y) appends a state with opcode offset x and repeat count y to the
active list (the one for the current character). The data field of the new
entry is not written. */
234     #define ADD_ACTIVE(x,y) \
235     if (active_count++ < wscount) \
236     { \
237     next_active_state->offset = (x); \
238     next_active_state->count = (y); \
239     next_active_state->ims = ims; \
240     next_active_state++; \
241     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
242     } \
243     else return PCRE_ERROR_DFA_WSSIZE
244    
/* ADD_ACTIVE_DATA(x,y,z) is ADD_ACTIVE that additionally stores z in the data
field of the new entry. */
245     #define ADD_ACTIVE_DATA(x,y,z) \
246     if (active_count++ < wscount) \
247     { \
248     next_active_state->offset = (x); \
249     next_active_state->count = (y); \
250     next_active_state->ims = ims; \
251     next_active_state->data = (z); \
252     next_active_state++; \
253     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
254     } \
255     else return PCRE_ERROR_DFA_WSSIZE
256    
/* ADD_NEW(x,y) appends a state with opcode offset x and repeat count y to the
new list (the one for the following character), counted by new_count and
written through next_new_state. The data field of the new entry is not
written. */
257     #define ADD_NEW(x,y) \
258     if (new_count++ < wscount) \
259     { \
260     next_new_state->offset = (x); \
261     next_new_state->count = (y); \
262     next_new_state->ims = ims; \
263     next_new_state++; \
264     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
265     } \
266     else return PCRE_ERROR_DFA_WSSIZE
267    
/* ADD_NEW_DATA(x,y,z) is ADD_NEW that additionally stores z in the data field
of the new entry. */
268     #define ADD_NEW_DATA(x,y,z) \
269     if (new_count++ < wscount) \
270     { \
271     next_new_state->offset = (x); \
272     next_new_state->count = (y); \
273     next_new_state->ims = ims; \
274     next_new_state->data = (z); \
275     next_new_state++; \
276     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
277     } \
278     else return PCRE_ERROR_DFA_WSSIZE
279    
280     /* And now, here is the code */
281    
282     static int
283     internal_dfa_exec(
284     dfa_match_data *md,
285     const uschar *this_start_code,
286     const uschar *current_subject,
287     int start_offset,
288     int *offsets,
289     int offsetcount,
290     int *workspace,
291     int wscount,
292     int ims,
293     int rlevel,
294     int recursing)
295     {
296     stateblock *active_states, *new_states, *temp_states;
297     stateblock *next_active_state, *next_new_state;
298    
299     const uschar *ctypes, *lcc, *fcc;
300     const uschar *ptr;
301 nigel 93 const uschar *end_code, *first_op;
302 nigel 77
303     int active_count, new_count, match_count;
304    
305     /* Some fields in the md block are frequently referenced, so we load them into
306     independent variables in the hope that this will perform better. */
307    
308     const uschar *start_subject = md->start_subject;
309     const uschar *end_subject = md->end_subject;
310     const uschar *start_code = md->start_code;
311    
312 nigel 87 #ifdef SUPPORT_UTF8
313 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 nigel 93 #else
315     BOOL utf8 = FALSE;
316 nigel 87 #endif
317 nigel 77
318     rlevel++;
319     offsetcount &= (-2);
320    
321     wscount -= 2;
322     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323     (2 * INTS_PER_STATEBLOCK);
324    
325     DPRINTF(("\n%.*s---------------------\n"
326     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328    
329     ctypes = md->tables + ctypes_offset;
330     lcc = md->tables + lcc_offset;
331     fcc = md->tables + fcc_offset;
332    
333     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334    
335     active_states = (stateblock *)(workspace + 2);
336     next_new_state = new_states = active_states + wscount;
337     new_count = 0;
338    
339 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
340     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343     the alternative states onto the list, and find out where the end is. This
344     makes is possible to use this function recursively, when we want to stop at a
345     matching internal ket rather than at the end.
346    
347     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348     a backward assertion. In that case, we have to find out the maximum amount to
349     move back, and set up each alternative appropriately. */
350    
351 nigel 93 if (*first_op == OP_REVERSE)
352 nigel 77 {
353     int max_back = 0;
354     int gone_back;
355    
356     end_code = this_start_code;
357     do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363     while (*end_code == OP_ALT);
364    
365     /* If we can't go back the amount required for the longest lookbehind
366     pattern, go back as far as we can; some alternatives may still be viable. */
367    
368     #ifdef SUPPORT_UTF8
369     /* In character mode we have to step back character by character */
370    
371     if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374     {
375     if (current_subject <= start_subject) break;
376     current_subject--;
377     while (current_subject > start_subject &&
378     (*current_subject & 0xc0) == 0x80)
379     current_subject--;
380     }
381     }
382     else
383     #endif
384    
385     /* In byte-mode we can do this quickly. */
386    
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389     current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392    
393     /* Now we can process the individual branches. */
394    
395     end_code = this_start_code;
396     do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400     {
401     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402     ADD_NEW_DATA(-bstate, 0, gone_back - back);
403     }
404     end_code += GET(end_code, 1);
405     }
406     while (*end_code == OP_ALT);
407     }
408    
409     /* This is the code for a "normal" subpattern (not a backward assertion). The
410     start of a whole pattern is always one of these. If we are at the top level,
411     we may be asked to restart matching from the same point that we reached for a
412     previous partial match. We still have to scan through the top-level branches to
413     find the end state. */
414    
415     else
416     {
417     end_code = this_start_code;
418    
419     /* Restarting */
420    
421     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426     memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428    
429     /* Not restarting */
430    
431     else
432     {
433 nigel 93 int length = 1 + LINK_SIZE +
434     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 nigel 77 do
436     {
437 nigel 93 ADD_NEW(end_code - start_code + length, 0);
438 nigel 77 end_code += GET(end_code, 1);
439 nigel 93 length = 1 + LINK_SIZE;
440 nigel 77 }
441     while (*end_code == OP_ALT);
442     }
443     }
444    
445     workspace[0] = 0; /* Bit indicating which vector is current */
446    
447     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448    
449     /* Loop for scanning the subject */
450    
451     ptr = current_subject;
452     for (;;)
453     {
454     int i, j;
455 nigel 91 int clen, dlen;
456     unsigned int c, d;
457 ph10 428 int forced_fail = 0;
458     int reached_end = 0;
459 nigel 77
460     /* Make the new state list into the active state list and empty the
461     new state list. */
462    
463     temp_states = active_states;
464     active_states = new_states;
465     new_states = temp_states;
466     active_count = new_count;
467     new_count = 0;
468    
469     workspace[0] ^= 1; /* Remember for the restarting feature */
470     workspace[1] = active_count;
471    
472     #ifdef DEBUG
473     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
474     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
475     printf("\"\n");
476    
477     printf("%.*sActive states: ", rlevel*2-2, SP);
478     for (i = 0; i < active_count; i++)
479     printf("%d/%d ", active_states[i].offset, active_states[i].count);
480     printf("\n");
481     #endif
482    
483     /* Set the pointers for adding new states */
484    
485     next_active_state = active_states + active_count;
486     next_new_state = new_states;
487    
488     /* Load the current character from the subject outside the loop, as many
489     different states may want to look at it, and we assume that at least one
490     will. */
491    
492     if (ptr < end_subject)
493     {
494 nigel 93 clen = 1; /* Number of bytes in the character */
495 nigel 77 #ifdef SUPPORT_UTF8
496     if (utf8) { GETCHARLEN(c, ptr, clen); } else
497     #endif /* SUPPORT_UTF8 */
498     c = *ptr;
499     }
500     else
501     {
502 nigel 93 clen = 0; /* This indicates the end of the subject */
503     c = NOTACHAR; /* This value should never actually be used */
504 nigel 77 }
505    
506     /* Scan up the active states and act on each one. The result of an action
507     may be to add more states to the currently active list (e.g. on hitting a
508     parenthesis) or it may be to put states on the new list, for considering
509     when we move the character pointer on. */
510    
511     for (i = 0; i < active_count; i++)
512     {
513     stateblock *current_state = active_states + i;
514     const uschar *code;
515     int state_offset = current_state->offset;
516 ph10 397 int count, codevalue, rrc;
517 nigel 77
518     #ifdef DEBUG
519     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
520 nigel 93 if (clen == 0) printf("EOL\n");
521 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
522     else printf("0x%02x\n", c);
523     #endif
524    
525     /* This variable is referred to implicity in the ADD_xxx macros. */
526    
527     ims = current_state->ims;
528    
529     /* A negative offset is a special case meaning "hold off going to this
530     (negated) state until the number of characters in the data field have
531     been skipped". */
532    
533     if (state_offset < 0)
534     {
535     if (current_state->data > 0)
536     {
537     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
538     ADD_NEW_DATA(state_offset, current_state->count,
539     current_state->data - 1);
540     continue;
541     }
542     else
543     {
544     current_state->offset = state_offset = -state_offset;
545     }
546     }
547    
548     /* Check for a duplicate state with the same count, and skip if found. */
549    
550     for (j = 0; j < i; j++)
551     {
552     if (active_states[j].offset == state_offset &&
553     active_states[j].count == current_state->count)
554     {
555     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
556     goto NEXT_ACTIVE_STATE;
557     }
558     }
559    
560     /* The state offset is the offset to the opcode */
561    
562     code = start_code + state_offset;
563     codevalue = *code;
564    
565     /* If this opcode is followed by an inline character, load it. It is
566     tempting to test for the presence of a subject character here, but that
567     is wrong, because sometimes zero repetitions of the subject are
568     permitted.
569    
570     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
571 ph10 178 argument that is not a data character - but is always one byte long. We
572     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
573     this case. To keep the other cases fast, convert these ones to new opcodes.
574     */
575 nigel 77
576     if (coptable[codevalue] > 0)
577     {
578     dlen = 1;
579     #ifdef SUPPORT_UTF8
580     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
581     #endif /* SUPPORT_UTF8 */
582     d = code[coptable[codevalue]];
583     if (codevalue >= OP_TYPESTAR)
584     {
585 nigel 93 switch(d)
586     {
587     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
588     case OP_NOTPROP:
589     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
590     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
591     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
592 ph10 178 case OP_NOT_HSPACE:
593 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
594 ph10 178 case OP_NOT_VSPACE:
595 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
596 nigel 93 default: break;
597     }
598 nigel 77 }
599     }
600     else
601     {
602     dlen = 0; /* Not strictly necessary, but compilers moan */
603 nigel 93 d = NOTACHAR; /* if these variables are not set. */
604 nigel 77 }
605    
606    
607     /* Now process the individual opcodes */
608    
609     switch (codevalue)
610     {
611    
612     /* ========================================================================== */
613     /* Reached a closing bracket. If not at the end of the pattern, carry
614     on with the next opcode. Otherwise, unless we have an empty string and
615     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
616     matches so we always have the longest first. */
617    
618     case OP_KET:
619     case OP_KETRMIN:
620     case OP_KETRMAX:
621     if (code != end_code)
622     {
623     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
624     if (codevalue != OP_KET)
625     {
626     ADD_ACTIVE(state_offset - GET(code, 1), 0);
627     }
628     }
629 ph10 428 else
630 nigel 77 {
631 ph10 428 reached_end++; /* Count branches that reach the end */
632     if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
633 nigel 77 {
634 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
635     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
636     match_count = 0;
637     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
638     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
639     if (offsetcount >= 2)
640     {
641     offsets[0] = current_subject - start_subject;
642     offsets[1] = ptr - start_subject;
643     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
644     offsets[1] - offsets[0], current_subject));
645     }
646     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
647     {
648     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
649     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
650     match_count, rlevel*2-2, SP));
651     return match_count;
652     }
653     }
654 nigel 77 }
655     break;
656    
657     /* ========================================================================== */
658     /* These opcodes add to the current list of states without looking
659     at the current character. */
660    
661     /*-----------------------------------------------------------------*/
662     case OP_ALT:
663     do { code += GET(code, 1); } while (*code == OP_ALT);
664     ADD_ACTIVE(code - start_code, 0);
665     break;
666    
667     /*-----------------------------------------------------------------*/
668     case OP_BRA:
669 nigel 93 case OP_SBRA:
670 nigel 77 do
671     {
672     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
673     code += GET(code, 1);
674     }
675     while (*code == OP_ALT);
676     break;
677    
678     /*-----------------------------------------------------------------*/
679 nigel 93 case OP_CBRA:
680     case OP_SCBRA:
681     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
682     code += GET(code, 1);
683     while (*code == OP_ALT)
684     {
685     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
686     code += GET(code, 1);
687     }
688     break;
689    
690     /*-----------------------------------------------------------------*/
691 nigel 77 case OP_BRAZERO:
692     case OP_BRAMINZERO:
693     ADD_ACTIVE(state_offset + 1, 0);
694     code += 1 + GET(code, 2);
695     while (*code == OP_ALT) code += GET(code, 1);
696     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
697     break;
698    
699     /*-----------------------------------------------------------------*/
700 ph10 335 case OP_SKIPZERO:
701     code += 1 + GET(code, 2);
702     while (*code == OP_ALT) code += GET(code, 1);
703     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
704     break;
705    
706     /*-----------------------------------------------------------------*/
707 nigel 77 case OP_CIRC:
708     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
709 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
710     ptr != end_subject &&
711 nigel 93 WAS_NEWLINE(ptr)))
712 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
713     break;
714    
715     /*-----------------------------------------------------------------*/
716     case OP_EOD:
717     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
718     break;
719    
720     /*-----------------------------------------------------------------*/
721     case OP_OPT:
722     ims = code[1];
723     ADD_ACTIVE(state_offset + 2, 0);
724     break;
725    
726     /*-----------------------------------------------------------------*/
727     case OP_SOD:
728     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
729     break;
730    
731     /*-----------------------------------------------------------------*/
732     case OP_SOM:
733     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
734     break;
735    
736    
737     /* ========================================================================== */
738     /* These opcodes inspect the next subject character, and sometimes
739     the previous one as well, but do not have an argument. The variable
740     clen contains the length of the current character and is zero if we are
741     at the end of the subject. */
742    
743     /*-----------------------------------------------------------------*/
744     case OP_ANY:
745 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
746 nigel 77 { ADD_NEW(state_offset + 1, 0); }
747     break;
748    
749     /*-----------------------------------------------------------------*/
750 ph10 341 case OP_ALLANY:
751     if (clen > 0)
752     { ADD_NEW(state_offset + 1, 0); }
753     break;
754    
755     /*-----------------------------------------------------------------*/
756 nigel 77 case OP_EODN:
757 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
758 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
759     break;
760    
761     /*-----------------------------------------------------------------*/
762     case OP_DOLL:
763     if ((md->moptions & PCRE_NOTEOL) == 0)
764     {
765 nigel 91 if (clen == 0 ||
766 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
767 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
768     ))
769 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
770     }
771 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
772 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
773     break;
774    
775     /*-----------------------------------------------------------------*/
776    
777     case OP_DIGIT:
778     case OP_WHITESPACE:
779     case OP_WORDCHAR:
780     if (clen > 0 && c < 256 &&
781     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
782     { ADD_NEW(state_offset + 1, 0); }
783     break;
784    
785     /*-----------------------------------------------------------------*/
786     case OP_NOT_DIGIT:
787     case OP_NOT_WHITESPACE:
788     case OP_NOT_WORDCHAR:
789     if (clen > 0 && (c >= 256 ||
790     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
791     { ADD_NEW(state_offset + 1, 0); }
792     break;
793    
794     /*-----------------------------------------------------------------*/
795     case OP_WORD_BOUNDARY:
796     case OP_NOT_WORD_BOUNDARY:
797     {
798     int left_word, right_word;
799    
800     if (ptr > start_subject)
801     {
802     const uschar *temp = ptr - 1;
803     #ifdef SUPPORT_UTF8
804     if (utf8) BACKCHAR(temp);
805     #endif
806     GETCHARTEST(d, temp);
807     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
808     }
809     else left_word = 0;
810    
811 ph10 428 if (clen > 0)
812     right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
813     else /* This is a fudge to ensure that if this is the */
814     { /* last item in the pattern, we don't count it as */
815     reached_end--; /* reached, thus disabling a partial match. */
816     right_word = 0;
817     }
818 nigel 77
819     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
820     { ADD_ACTIVE(state_offset + 1, 0); }
821     }
822     break;
823    
824    
825     /*-----------------------------------------------------------------*/
826     /* Check the next character by Unicode property. We will get here only
827     if the support is in the binary; otherwise a compile-time error occurs.
828     */
829    
830 ph10 151 #ifdef SUPPORT_UCP
831 nigel 77 case OP_PROP:
832     case OP_NOTPROP:
833     if (clen > 0)
834     {
835 nigel 87 BOOL OK;
836 ph10 349 const ucd_record * prop = GET_UCD(c);
837 nigel 87 switch(code[1])
838 nigel 77 {
839 nigel 87 case PT_ANY:
840     OK = TRUE;
841     break;
842    
843     case PT_LAMP:
844 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
845 nigel 87 break;
846    
847     case PT_GC:
848 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
849 nigel 87 break;
850    
851     case PT_PC:
852 ph10 349 OK = prop->chartype == code[2];
853 nigel 87 break;
854    
855     case PT_SC:
856 ph10 349 OK = prop->script == code[2];
857 nigel 87 break;
858    
859     /* Should never occur, but keep compilers from grumbling. */
860    
861     default:
862     OK = codevalue != OP_PROP;
863     break;
864 nigel 77 }
865 nigel 87
866     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
867 nigel 77 }
868     break;
869     #endif
870    
871    
872    
873     /* ========================================================================== */
874     /* These opcodes likewise inspect the subject character, but have an
875     argument that is not a data character. It is one of these opcodes:
876 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
877     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
878 nigel 77
879     case OP_TYPEPLUS:
880     case OP_TYPEMINPLUS:
881 nigel 93 case OP_TYPEPOSPLUS:
882 nigel 77 count = current_state->count; /* Already matched */
883     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
884     if (clen > 0)
885     {
886     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
887     (c < 256 &&
888 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
889 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
890     {
891 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
892     {
893     active_count--; /* Remove non-match possibility */
894     next_active_state--;
895     }
896 nigel 77 count++;
897     ADD_NEW(state_offset, count);
898     }
899     }
900     break;
901    
902     /*-----------------------------------------------------------------*/
903     case OP_TYPEQUERY:
904     case OP_TYPEMINQUERY:
905 nigel 93 case OP_TYPEPOSQUERY:
906 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
907     if (clen > 0)
908     {
909     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
910     (c < 256 &&
911 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
912 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
913     {
914 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
915     {
916     active_count--; /* Remove non-match possibility */
917     next_active_state--;
918     }
919 nigel 77 ADD_NEW(state_offset + 2, 0);
920     }
921     }
922     break;
923    
924     /*-----------------------------------------------------------------*/
925     case OP_TYPESTAR:
926     case OP_TYPEMINSTAR:
927 nigel 93 case OP_TYPEPOSSTAR:
928 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
929     if (clen > 0)
930     {
931     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
932     (c < 256 &&
933 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
934 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
935     {
936 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
937     {
938     active_count--; /* Remove non-match possibility */
939     next_active_state--;
940     }
941 nigel 77 ADD_NEW(state_offset, 0);
942     }
943     }
944     break;
945    
946     /*-----------------------------------------------------------------*/
947     case OP_TYPEEXACT:
948 nigel 93 count = current_state->count; /* Number already matched */
949     if (clen > 0)
950     {
951     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
952     (c < 256 &&
953 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
954 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
955     {
956     if (++count >= GET2(code, 1))
957     { ADD_NEW(state_offset + 4, 0); }
958     else
959     { ADD_NEW(state_offset, count); }
960     }
961     }
962     break;
963    
964     /*-----------------------------------------------------------------*/
965 nigel 77 case OP_TYPEUPTO:
966     case OP_TYPEMINUPTO:
967 nigel 93 case OP_TYPEPOSUPTO:
968     ADD_ACTIVE(state_offset + 4, 0);
969 nigel 77 count = current_state->count; /* Number already matched */
970     if (clen > 0)
971     {
972     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
973     (c < 256 &&
974 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
975 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
976     {
977 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
978     {
979     active_count--; /* Remove non-match possibility */
980     next_active_state--;
981     }
982 nigel 77 if (++count >= GET2(code, 1))
983     { ADD_NEW(state_offset + 4, 0); }
984     else
985     { ADD_NEW(state_offset, count); }
986     }
987     }
988     break;
989    
990     /* ========================================================================== */
991     /* These are virtual opcodes that are used when something like
992 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its
993     argument. It keeps the code above fast for the other cases. The argument
994     is in the d variable. */
995 nigel 77
996 ph10 151 #ifdef SUPPORT_UCP
997 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
998     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
999 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1000 nigel 77 count = current_state->count; /* Already matched */
1001 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1002 nigel 77 if (clen > 0)
1003     {
1004 nigel 87 BOOL OK;
1005 ph10 349 const ucd_record * prop = GET_UCD(c);
1006 nigel 87 switch(code[2])
1007     {
1008     case PT_ANY:
1009     OK = TRUE;
1010     break;
1011    
1012     case PT_LAMP:
1013 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1014 nigel 87 break;
1015    
1016     case PT_GC:
1017 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1018 nigel 87 break;
1019    
1020     case PT_PC:
1021 ph10 349 OK = prop->chartype == code[3];
1022 nigel 87 break;
1023    
1024     case PT_SC:
1025 ph10 349 OK = prop->script == code[3];
1026 nigel 87 break;
1027    
1028     /* Should never occur, but keep compilers from grumbling. */
1029    
1030     default:
1031     OK = codevalue != OP_PROP;
1032     break;
1033     }
1034    
1035 nigel 93 if (OK == (d == OP_PROP))
1036     {
1037     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1038     {
1039     active_count--; /* Remove non-match possibility */
1040     next_active_state--;
1041     }
1042     count++;
1043     ADD_NEW(state_offset, count);
1044     }
1045 nigel 77 }
1046     break;
1047    
1048     /*-----------------------------------------------------------------*/
1049     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1050     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1051 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1052 nigel 77 count = current_state->count; /* Already matched */
1053     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1054 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1055 nigel 77 {
1056     const uschar *nptr = ptr + clen;
1057     int ncount = 0;
1058 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1059     {
1060     active_count--; /* Remove non-match possibility */
1061     next_active_state--;
1062     }
1063 nigel 77 while (nptr < end_subject)
1064     {
1065     int nd;
1066     int ndlen = 1;
1067     GETCHARLEN(nd, nptr, ndlen);
1068 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1069 nigel 77 ncount++;
1070     nptr += ndlen;
1071     }
1072     count++;
1073     ADD_NEW_DATA(-state_offset, count, ncount);
1074     }
1075     break;
1076 ph10 151 #endif
1077 nigel 77
1078     /*-----------------------------------------------------------------*/
1079 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1080     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1081     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1082     count = current_state->count; /* Already matched */
1083     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1084     if (clen > 0)
1085     {
1086     int ncount = 0;
1087     switch (c)
1088     {
1089     case 0x000b:
1090     case 0x000c:
1091     case 0x0085:
1092     case 0x2028:
1093     case 0x2029:
1094 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1095     goto ANYNL01;
1096    
1097     case 0x000d:
1098     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1099     /* Fall through */
1100    
1101     ANYNL01:
1102     case 0x000a:
1103 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1104     {
1105     active_count--; /* Remove non-match possibility */
1106     next_active_state--;
1107     }
1108     count++;
1109     ADD_NEW_DATA(-state_offset, count, ncount);
1110     break;
1111 ph10 231
1112 nigel 93 default:
1113     break;
1114     }
1115     }
1116     break;
1117    
1118     /*-----------------------------------------------------------------*/
1119 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1120     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1121     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1122     count = current_state->count; /* Already matched */
1123     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1124     if (clen > 0)
1125     {
1126 ph10 182 BOOL OK;
1127 ph10 178 switch (c)
1128     {
1129     case 0x000a:
1130     case 0x000b:
1131     case 0x000c:
1132     case 0x000d:
1133     case 0x0085:
1134     case 0x2028:
1135     case 0x2029:
1136     OK = TRUE;
1137 ph10 182 break;
1138 ph10 178
1139     default:
1140     OK = FALSE;
1141 ph10 182 break;
1142 ph10 178 }
1143    
1144     if (OK == (d == OP_VSPACE))
1145 ph10 182 {
1146 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1147     {
1148     active_count--; /* Remove non-match possibility */
1149     next_active_state--;
1150     }
1151     count++;
1152     ADD_NEW_DATA(-state_offset, count, 0);
1153     }
1154     }
1155     break;
1156    
1157     /*-----------------------------------------------------------------*/
1158     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1159     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1160     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1161     count = current_state->count; /* Already matched */
1162     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1163     if (clen > 0)
1164     {
1165 ph10 182 BOOL OK;
1166 ph10 178 switch (c)
1167     {
1168     case 0x09: /* HT */
1169     case 0x20: /* SPACE */
1170     case 0xa0: /* NBSP */
1171     case 0x1680: /* OGHAM SPACE MARK */
1172     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1173     case 0x2000: /* EN QUAD */
1174     case 0x2001: /* EM QUAD */
1175     case 0x2002: /* EN SPACE */
1176     case 0x2003: /* EM SPACE */
1177     case 0x2004: /* THREE-PER-EM SPACE */
1178     case 0x2005: /* FOUR-PER-EM SPACE */
1179     case 0x2006: /* SIX-PER-EM SPACE */
1180     case 0x2007: /* FIGURE SPACE */
1181     case 0x2008: /* PUNCTUATION SPACE */
1182     case 0x2009: /* THIN SPACE */
1183     case 0x200A: /* HAIR SPACE */
1184     case 0x202f: /* NARROW NO-BREAK SPACE */
1185     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1186     case 0x3000: /* IDEOGRAPHIC SPACE */
1187     OK = TRUE;
1188     break;
1189 ph10 182
1190 ph10 178 default:
1191     OK = FALSE;
1192     break;
1193     }
1194 ph10 182
1195 ph10 178 if (OK == (d == OP_HSPACE))
1196 ph10 182 {
1197 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1198     {
1199     active_count--; /* Remove non-match possibility */
1200     next_active_state--;
1201     }
1202     count++;
1203     ADD_NEW_DATA(-state_offset, count, 0);
1204     }
1205     }
1206     break;
1207    
1208     /*-----------------------------------------------------------------*/
1209 ph10 151 #ifdef SUPPORT_UCP
1210 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1211     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1212 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1213 nigel 87 count = 4;
1214 nigel 77 goto QS1;
1215    
1216     case OP_PROP_EXTRA + OP_TYPESTAR:
1217     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1218 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1219 nigel 77 count = 0;
1220    
1221     QS1:
1222    
1223 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1224 nigel 77 if (clen > 0)
1225     {
1226 nigel 87 BOOL OK;
1227 ph10 349 const ucd_record * prop = GET_UCD(c);
1228 nigel 87 switch(code[2])
1229     {
1230     case PT_ANY:
1231     OK = TRUE;
1232     break;
1233    
1234     case PT_LAMP:
1235 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1236 nigel 87 break;
1237    
1238     case PT_GC:
1239 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1240 nigel 87 break;
1241    
1242     case PT_PC:
1243 ph10 349 OK = prop->chartype == code[3];
1244 nigel 87 break;
1245    
1246     case PT_SC:
1247 ph10 349 OK = prop->script == code[3];
1248 nigel 87 break;
1249    
1250     /* Should never occur, but keep compilers from grumbling. */
1251    
1252     default:
1253     OK = codevalue != OP_PROP;
1254     break;
1255     }
1256    
1257 nigel 93 if (OK == (d == OP_PROP))
1258     {
1259     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1260     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1261     {
1262     active_count--; /* Remove non-match possibility */
1263     next_active_state--;
1264     }
1265     ADD_NEW(state_offset + count, 0);
1266     }
1267 nigel 77 }
1268     break;
1269    
1270     /*-----------------------------------------------------------------*/
1271     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1272     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1273 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1274 nigel 77 count = 2;
1275     goto QS2;
1276    
1277     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1278     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1279 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1280 nigel 77 count = 0;
1281    
1282     QS2:
1283    
1284     ADD_ACTIVE(state_offset + 2, 0);
1285 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1286 nigel 77 {
1287     const uschar *nptr = ptr + clen;
1288     int ncount = 0;
1289 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1290     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1291     {
1292     active_count--; /* Remove non-match possibility */
1293     next_active_state--;
1294     }
1295 nigel 77 while (nptr < end_subject)
1296     {
1297     int nd;
1298     int ndlen = 1;
1299     GETCHARLEN(nd, nptr, ndlen);
1300 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1301 nigel 77 ncount++;
1302     nptr += ndlen;
1303     }
1304     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1305     }
1306     break;
1307 ph10 151 #endif
1308 nigel 77
1309     /*-----------------------------------------------------------------*/
1310 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1311     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1312     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1313     count = 2;
1314     goto QS3;
1315    
1316     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1317     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1318     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1319     count = 0;
1320    
1321     QS3:
1322     ADD_ACTIVE(state_offset + 2, 0);
1323     if (clen > 0)
1324     {
1325     int ncount = 0;
1326     switch (c)
1327     {
1328     case 0x000b:
1329     case 0x000c:
1330     case 0x0085:
1331     case 0x2028:
1332     case 0x2029:
1333 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1334     goto ANYNL02;
1335    
1336     case 0x000d:
1337     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1338     /* Fall through */
1339    
1340     ANYNL02:
1341     case 0x000a:
1342 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1343     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1344     {
1345     active_count--; /* Remove non-match possibility */
1346     next_active_state--;
1347     }
1348     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1349     break;
1350 ph10 231
1351 nigel 93 default:
1352     break;
1353     }
1354     }
1355     break;
1356    
1357     /*-----------------------------------------------------------------*/
1358 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1359     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1360     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1361     count = 2;
1362     goto QS4;
1363    
1364     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1365     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1366     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1367     count = 0;
1368    
1369     QS4:
1370     ADD_ACTIVE(state_offset + 2, 0);
1371     if (clen > 0)
1372     {
1373 ph10 182 BOOL OK;
1374 ph10 178 switch (c)
1375     {
1376     case 0x000a:
1377     case 0x000b:
1378     case 0x000c:
1379     case 0x000d:
1380     case 0x0085:
1381     case 0x2028:
1382     case 0x2029:
1383     OK = TRUE;
1384     break;
1385 ph10 182
1386 ph10 178 default:
1387     OK = FALSE;
1388     break;
1389     }
1390     if (OK == (d == OP_VSPACE))
1391 ph10 182 {
1392 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1393     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1394     {
1395     active_count--; /* Remove non-match possibility */
1396     next_active_state--;
1397     }
1398     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1399     }
1400     }
1401     break;
1402    
1403     /*-----------------------------------------------------------------*/
1404     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1405     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1406     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1407     count = 2;
1408     goto QS5;
1409    
1410     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1411     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1412     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1413     count = 0;
1414    
1415     QS5:
1416     ADD_ACTIVE(state_offset + 2, 0);
1417     if (clen > 0)
1418     {
1419 ph10 182 BOOL OK;
1420 ph10 178 switch (c)
1421     {
1422     case 0x09: /* HT */
1423     case 0x20: /* SPACE */
1424     case 0xa0: /* NBSP */
1425     case 0x1680: /* OGHAM SPACE MARK */
1426     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1427     case 0x2000: /* EN QUAD */
1428     case 0x2001: /* EM QUAD */
1429     case 0x2002: /* EN SPACE */
1430     case 0x2003: /* EM SPACE */
1431     case 0x2004: /* THREE-PER-EM SPACE */
1432     case 0x2005: /* FOUR-PER-EM SPACE */
1433     case 0x2006: /* SIX-PER-EM SPACE */
1434     case 0x2007: /* FIGURE SPACE */
1435     case 0x2008: /* PUNCTUATION SPACE */
1436     case 0x2009: /* THIN SPACE */
1437     case 0x200A: /* HAIR SPACE */
1438     case 0x202f: /* NARROW NO-BREAK SPACE */
1439     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1440     case 0x3000: /* IDEOGRAPHIC SPACE */
1441     OK = TRUE;
1442     break;
1443 ph10 182
1444 ph10 178 default:
1445     OK = FALSE;
1446     break;
1447     }
1448 ph10 182
1449 ph10 178 if (OK == (d == OP_HSPACE))
1450 ph10 182 {
1451 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1452     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1453     {
1454     active_count--; /* Remove non-match possibility */
1455     next_active_state--;
1456     }
1457     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1458     }
1459     }
1460     break;
1461    
1462     /*-----------------------------------------------------------------*/
1463 ph10 151 #ifdef SUPPORT_UCP
1464 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1465     case OP_PROP_EXTRA + OP_TYPEUPTO:
1466     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1467 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1468 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1469 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1470 nigel 77 count = current_state->count; /* Number already matched */
1471     if (clen > 0)
1472     {
1473 nigel 87 BOOL OK;
1474 ph10 349 const ucd_record * prop = GET_UCD(c);
1475 nigel 87 switch(code[4])
1476 nigel 77 {
1477 nigel 87 case PT_ANY:
1478     OK = TRUE;
1479     break;
1480    
1481     case PT_LAMP:
1482 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1483 nigel 87 break;
1484    
1485     case PT_GC:
1486 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1487 nigel 87 break;
1488    
1489     case PT_PC:
1490 ph10 349 OK = prop->chartype == code[5];
1491 nigel 87 break;
1492    
1493     case PT_SC:
1494 ph10 349 OK = prop->script == code[5];
1495 nigel 87 break;
1496    
1497     /* Should never occur, but keep compilers from grumbling. */
1498    
1499     default:
1500     OK = codevalue != OP_PROP;
1501     break;
1502     }
1503    
1504     if (OK == (d == OP_PROP))
1505     {
1506 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1507     {
1508     active_count--; /* Remove non-match possibility */
1509     next_active_state--;
1510     }
1511 nigel 77 if (++count >= GET2(code, 1))
1512 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1513 nigel 77 else
1514     { ADD_NEW(state_offset, count); }
1515     }
1516     }
1517     break;
1518    
1519     /*-----------------------------------------------------------------*/
1520     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1521     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1522     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1523 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1524 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1525     { ADD_ACTIVE(state_offset + 4, 0); }
1526     count = current_state->count; /* Number already matched */
1527 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1528 nigel 77 {
1529     const uschar *nptr = ptr + clen;
1530     int ncount = 0;
1531 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1532     {
1533     active_count--; /* Remove non-match possibility */
1534     next_active_state--;
1535     }
1536 nigel 77 while (nptr < end_subject)
1537     {
1538     int nd;
1539     int ndlen = 1;
1540     GETCHARLEN(nd, nptr, ndlen);
1541 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1542 nigel 77 ncount++;
1543     nptr += ndlen;
1544     }
1545     if (++count >= GET2(code, 1))
1546     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1547     else
1548     { ADD_NEW_DATA(-state_offset, count, ncount); }
1549     }
1550     break;
1551 ph10 151 #endif
1552 nigel 77
1553 nigel 93 /*-----------------------------------------------------------------*/
1554     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1555     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1556     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1557     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1558     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1559     { ADD_ACTIVE(state_offset + 4, 0); }
1560     count = current_state->count; /* Number already matched */
1561     if (clen > 0)
1562     {
1563     int ncount = 0;
1564     switch (c)
1565     {
1566     case 0x000b:
1567     case 0x000c:
1568     case 0x0085:
1569     case 0x2028:
1570     case 0x2029:
1571 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1572     goto ANYNL03;
1573    
1574     case 0x000d:
1575     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1576     /* Fall through */
1577    
1578     ANYNL03:
1579     case 0x000a:
1580 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1581     {
1582     active_count--; /* Remove non-match possibility */
1583     next_active_state--;
1584     }
1585     if (++count >= GET2(code, 1))
1586     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1587     else
1588     { ADD_NEW_DATA(-state_offset, count, ncount); }
1589     break;
1590 ph10 231
1591 nigel 93 default:
1592     break;
1593     }
1594     }
1595     break;
1596    
1597 ph10 178 /*-----------------------------------------------------------------*/
1598     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1599     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1600     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1601     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1602     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1603     { ADD_ACTIVE(state_offset + 4, 0); }
1604     count = current_state->count; /* Number already matched */
1605     if (clen > 0)
1606     {
1607 ph10 182 BOOL OK;
1608 ph10 178 switch (c)
1609     {
1610     case 0x000a:
1611     case 0x000b:
1612     case 0x000c:
1613     case 0x000d:
1614     case 0x0085:
1615     case 0x2028:
1616     case 0x2029:
1617     OK = TRUE;
1618     break;
1619 ph10 182
1620 ph10 178 default:
1621     OK = FALSE;
1622     }
1623 ph10 182
1624 ph10 178 if (OK == (d == OP_VSPACE))
1625 ph10 182 {
1626 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1627     {
1628     active_count--; /* Remove non-match possibility */
1629     next_active_state--;
1630     }
1631     if (++count >= GET2(code, 1))
1632     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1633     else
1634     { ADD_NEW_DATA(-state_offset, count, 0); }
1635     }
1636     }
1637     break;
1638    
1639     /*-----------------------------------------------------------------*/
1640     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1641     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1642     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1643     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1644     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1645     { ADD_ACTIVE(state_offset + 4, 0); }
1646     count = current_state->count; /* Number already matched */
1647     if (clen > 0)
1648     {
1649 ph10 182 BOOL OK;
1650 ph10 178 switch (c)
1651     {
1652     case 0x09: /* HT */
1653     case 0x20: /* SPACE */
1654     case 0xa0: /* NBSP */
1655     case 0x1680: /* OGHAM SPACE MARK */
1656     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1657     case 0x2000: /* EN QUAD */
1658     case 0x2001: /* EM QUAD */
1659     case 0x2002: /* EN SPACE */
1660     case 0x2003: /* EM SPACE */
1661     case 0x2004: /* THREE-PER-EM SPACE */
1662     case 0x2005: /* FOUR-PER-EM SPACE */
1663     case 0x2006: /* SIX-PER-EM SPACE */
1664     case 0x2007: /* FIGURE SPACE */
1665     case 0x2008: /* PUNCTUATION SPACE */
1666     case 0x2009: /* THIN SPACE */
1667     case 0x200A: /* HAIR SPACE */
1668     case 0x202f: /* NARROW NO-BREAK SPACE */
1669     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1670     case 0x3000: /* IDEOGRAPHIC SPACE */
1671     OK = TRUE;
1672     break;
1673 ph10 182
1674 ph10 178 default:
1675     OK = FALSE;
1676     break;
1677     }
1678 ph10 182
1679 ph10 178 if (OK == (d == OP_HSPACE))
1680 ph10 182 {
1681 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1682     {
1683     active_count--; /* Remove non-match possibility */
1684     next_active_state--;
1685     }
1686     if (++count >= GET2(code, 1))
1687     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1688     else
1689     { ADD_NEW_DATA(-state_offset, count, 0); }
1690     }
1691     }
1692     break;
1693    
1694 nigel 77 /* ========================================================================== */
1695     /* These opcodes are followed by a character that is usually compared
1696     to the current subject character; it is loaded into d. We still get
1697     here even if there is no subject character, because in some cases zero
1698     repetitions are permitted. */
1699    
1700     /*-----------------------------------------------------------------*/
1701     case OP_CHAR:
1702     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1703     break;
1704    
1705     /*-----------------------------------------------------------------*/
1706     case OP_CHARNC:
1707     if (clen == 0) break;
1708    
1709     #ifdef SUPPORT_UTF8
1710     if (utf8)
1711     {
1712     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1713     {
1714 nigel 93 unsigned int othercase;
1715 nigel 77 if (c < 128) othercase = fcc[c]; else
1716    
1717     /* If we have Unicode property support, we can use it to test the
1718 nigel 87 other case of the character. */
1719 nigel 77
1720     #ifdef SUPPORT_UCP
1721 ph10 349 othercase = UCD_OTHERCASE(c);
1722 nigel 87 #else
1723 nigel 93 othercase = NOTACHAR;
1724 nigel 77 #endif
1725    
1726     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1727     }
1728     }
1729     else
1730     #endif /* SUPPORT_UTF8 */
1731    
1732     /* Non-UTF-8 mode */
1733     {
1734     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1735     }
1736     break;
1737    
1738    
1739     #ifdef SUPPORT_UCP
1740     /*-----------------------------------------------------------------*/
1741     /* This is a tricky one because it can match more than one character.
1742     Find out how many characters to skip, and then set up a negative state
1743     to wait for them to pass before continuing. */
1744    
1745     case OP_EXTUNI:
1746 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1747 nigel 77 {
1748     const uschar *nptr = ptr + clen;
1749     int ncount = 0;
1750     while (nptr < end_subject)
1751     {
1752     int nclen = 1;
1753     GETCHARLEN(c, nptr, nclen);
1754 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1755 nigel 77 ncount++;
1756     nptr += nclen;
1757     }
1758     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1759     }
1760     break;
1761     #endif
1762    
1763     /*-----------------------------------------------------------------*/
1764 nigel 93 /* This is a tricky one like EXTUNI because it too can match more than one
1765     character (when CR is followed by LF). In this case, set up a negative
1766     state to wait for one character to pass before continuing. */
1767    
1768     case OP_ANYNL:
1769     if (clen > 0) switch(c)
1770     {
1771     case 0x000b:
1772     case 0x000c:
1773     case 0x0085:
1774     case 0x2028:
1775     case 0x2029:
1776 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1777    
1778     case 0x000a:
1779 nigel 93 ADD_NEW(state_offset + 1, 0);
1780     break;
1781 ph10 231
1782 nigel 93 case 0x000d:
1783     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1784     {
1785     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1786     }
1787     else
1788     {
1789     ADD_NEW(state_offset + 1, 0);
1790     }
1791     break;
1792     }
1793     break;
1794    
1795     /*-----------------------------------------------------------------*/
1796 ph10 178 case OP_NOT_VSPACE:
1797     if (clen > 0) switch(c)
1798     {
1799     case 0x000a:
1800     case 0x000b:
1801     case 0x000c:
1802     case 0x000d:
1803     case 0x0085:
1804     case 0x2028:
1805     case 0x2029:
1806     break;
1807 ph10 182
1808     default:
1809 ph10 178 ADD_NEW(state_offset + 1, 0);
1810     break;
1811     }
1812     break;
1813    
1814     /*-----------------------------------------------------------------*/
1815     case OP_VSPACE:
1816     if (clen > 0) switch(c)
1817     {
1818     case 0x000a:
1819     case 0x000b:
1820     case 0x000c:
1821     case 0x000d:
1822     case 0x0085:
1823     case 0x2028:
1824     case 0x2029:
1825     ADD_NEW(state_offset + 1, 0);
1826     break;
1827 ph10 182
1828 ph10 178 default: break;
1829     }
1830     break;
1831    
1832     /*-----------------------------------------------------------------*/
1833     case OP_NOT_HSPACE:
1834     if (clen > 0) switch(c)
1835     {
1836     case 0x09: /* HT */
1837     case 0x20: /* SPACE */
1838     case 0xa0: /* NBSP */
1839     case 0x1680: /* OGHAM SPACE MARK */
1840     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1841     case 0x2000: /* EN QUAD */
1842     case 0x2001: /* EM QUAD */
1843     case 0x2002: /* EN SPACE */
1844     case 0x2003: /* EM SPACE */
1845     case 0x2004: /* THREE-PER-EM SPACE */
1846     case 0x2005: /* FOUR-PER-EM SPACE */
1847     case 0x2006: /* SIX-PER-EM SPACE */
1848     case 0x2007: /* FIGURE SPACE */
1849     case 0x2008: /* PUNCTUATION SPACE */
1850     case 0x2009: /* THIN SPACE */
1851     case 0x200A: /* HAIR SPACE */
1852     case 0x202f: /* NARROW NO-BREAK SPACE */
1853     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1854     case 0x3000: /* IDEOGRAPHIC SPACE */
1855     break;
1856 ph10 182
1857     default:
1858 ph10 178 ADD_NEW(state_offset + 1, 0);
1859     break;
1860     }
1861     break;
1862    
1863     /*-----------------------------------------------------------------*/
1864     case OP_HSPACE:
1865     if (clen > 0) switch(c)
1866     {
1867     case 0x09: /* HT */
1868     case 0x20: /* SPACE */
1869     case 0xa0: /* NBSP */
1870     case 0x1680: /* OGHAM SPACE MARK */
1871     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1872     case 0x2000: /* EN QUAD */
1873     case 0x2001: /* EM QUAD */
1874     case 0x2002: /* EN SPACE */
1875     case 0x2003: /* EM SPACE */
1876     case 0x2004: /* THREE-PER-EM SPACE */
1877     case 0x2005: /* FOUR-PER-EM SPACE */
1878     case 0x2006: /* SIX-PER-EM SPACE */
1879     case 0x2007: /* FIGURE SPACE */
1880     case 0x2008: /* PUNCTUATION SPACE */
1881     case 0x2009: /* THIN SPACE */
1882     case 0x200A: /* HAIR SPACE */
1883     case 0x202f: /* NARROW NO-BREAK SPACE */
1884     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1885     case 0x3000: /* IDEOGRAPHIC SPACE */
1886     ADD_NEW(state_offset + 1, 0);
1887     break;
1888     }
1889     break;
1890    
1891     /*-----------------------------------------------------------------*/
1892 nigel 77 /* Match a negated single character. This is only used for one-byte
1893     characters, that is, we know that d < 256. The character we are
1894     checking (c) can be multibyte. */
1895    
1896     case OP_NOT:
1897     if (clen > 0)
1898     {
1899 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1900 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1901     }
1902     break;
1903    
1904     /*-----------------------------------------------------------------*/
1905     case OP_PLUS:
1906     case OP_MINPLUS:
1907 nigel 93 case OP_POSPLUS:
1908 nigel 77 case OP_NOTPLUS:
1909     case OP_NOTMINPLUS:
1910 nigel 93 case OP_NOTPOSPLUS:
1911 nigel 77 count = current_state->count; /* Already matched */
1912     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1913     if (clen > 0)
1914     {
1915 nigel 93 unsigned int otherd = NOTACHAR;
1916 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1917     {
1918     #ifdef SUPPORT_UTF8
1919 nigel 87 if (utf8 && d >= 128)
1920 nigel 77 {
1921     #ifdef SUPPORT_UCP
1922 ph10 349 otherd = UCD_OTHERCASE(d);
1923 nigel 77 #endif /* SUPPORT_UCP */
1924     }
1925     else
1926     #endif /* SUPPORT_UTF8 */
1927     otherd = fcc[d];
1928     }
1929     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1930 nigel 93 {
1931     if (count > 0 &&
1932     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1933     {
1934     active_count--; /* Remove non-match possibility */
1935     next_active_state--;
1936     }
1937     count++;
1938     ADD_NEW(state_offset, count);
1939     }
1940 nigel 77 }
1941     break;
1942    
1943     /*-----------------------------------------------------------------*/
1944     case OP_QUERY:
1945     case OP_MINQUERY:
1946 nigel 93 case OP_POSQUERY:
1947 nigel 77 case OP_NOTQUERY:
1948     case OP_NOTMINQUERY:
1949 nigel 93 case OP_NOTPOSQUERY:
1950 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1951     if (clen > 0)
1952     {
1953 nigel 93 unsigned int otherd = NOTACHAR;
1954 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1955 nigel 77 {
1956     #ifdef SUPPORT_UTF8
1957 nigel 87 if (utf8 && d >= 128)
1958 nigel 77 {
1959     #ifdef SUPPORT_UCP
1960 ph10 349 otherd = UCD_OTHERCASE(d);
1961 nigel 77 #endif /* SUPPORT_UCP */
1962     }
1963     else
1964     #endif /* SUPPORT_UTF8 */
1965     otherd = fcc[d];
1966     }
1967     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1968 nigel 93 {
1969     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1970     {
1971     active_count--; /* Remove non-match possibility */
1972     next_active_state--;
1973     }
1974     ADD_NEW(state_offset + dlen + 1, 0);
1975     }
1976 nigel 77 }
1977     break;
1978    
1979     /*-----------------------------------------------------------------*/
1980     case OP_STAR:
1981     case OP_MINSTAR:
1982 nigel 93 case OP_POSSTAR:
1983 nigel 77 case OP_NOTSTAR:
1984     case OP_NOTMINSTAR:
1985 nigel 93 case OP_NOTPOSSTAR:
1986 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1987     if (clen > 0)
1988     {
1989 nigel 93 unsigned int otherd = NOTACHAR;
1990 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1991 nigel 77 {
1992     #ifdef SUPPORT_UTF8
1993 nigel 87 if (utf8 && d >= 128)
1994 nigel 77 {
1995     #ifdef SUPPORT_UCP
1996 ph10 349 otherd = UCD_OTHERCASE(d);
1997 nigel 77 #endif /* SUPPORT_UCP */
1998     }
1999     else
2000     #endif /* SUPPORT_UTF8 */
2001     otherd = fcc[d];
2002     }
2003     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2004 nigel 93 {
2005     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2006     {
2007     active_count--; /* Remove non-match possibility */
2008     next_active_state--;
2009     }
2010     ADD_NEW(state_offset, 0);
2011     }
2012 nigel 77 }
2013     break;
2014    
2015     /*-----------------------------------------------------------------*/
2016     case OP_EXACT:
2017 nigel 93 case OP_NOTEXACT:
2018     count = current_state->count; /* Number already matched */
2019     if (clen > 0)
2020     {
2021     unsigned int otherd = NOTACHAR;
2022     if ((ims & PCRE_CASELESS) != 0)
2023     {
2024     #ifdef SUPPORT_UTF8
2025     if (utf8 && d >= 128)
2026     {
2027     #ifdef SUPPORT_UCP
2028 ph10 349 otherd = UCD_OTHERCASE(d);
2029 nigel 93 #endif /* SUPPORT_UCP */
2030     }
2031     else
2032     #endif /* SUPPORT_UTF8 */
2033     otherd = fcc[d];
2034     }
2035     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2036     {
2037     if (++count >= GET2(code, 1))
2038     { ADD_NEW(state_offset + dlen + 3, 0); }
2039     else
2040     { ADD_NEW(state_offset, count); }
2041     }
2042     }
2043     break;
2044    
2045     /*-----------------------------------------------------------------*/
2046 nigel 77 case OP_UPTO:
2047     case OP_MINUPTO:
2048 nigel 93 case OP_POSUPTO:
2049 nigel 77 case OP_NOTUPTO:
2050     case OP_NOTMINUPTO:
2051 nigel 93 case OP_NOTPOSUPTO:
2052     ADD_ACTIVE(state_offset + dlen + 3, 0);
2053 nigel 77 count = current_state->count; /* Number already matched */
2054     if (clen > 0)
2055     {
2056 nigel 93 unsigned int otherd = NOTACHAR;
2057 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2058     {
2059     #ifdef SUPPORT_UTF8
2060 nigel 87 if (utf8 && d >= 128)
2061 nigel 77 {
2062     #ifdef SUPPORT_UCP
2063 ph10 349 otherd = UCD_OTHERCASE(d);
2064 nigel 77 #endif /* SUPPORT_UCP */
2065     }
2066     else
2067     #endif /* SUPPORT_UTF8 */
2068     otherd = fcc[d];
2069     }
2070     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2071     {
2072 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2073     {
2074     active_count--; /* Remove non-match possibility */
2075     next_active_state--;
2076     }
2077 nigel 77 if (++count >= GET2(code, 1))
2078     { ADD_NEW(state_offset + dlen + 3, 0); }
2079     else
2080     { ADD_NEW(state_offset, count); }
2081     }
2082     }
2083     break;
2084    
2085    
2086     /* ========================================================================== */
2087     /* These are the class-handling opcodes */
2088    
2089     case OP_CLASS:
2090     case OP_NCLASS:
2091     case OP_XCLASS:
2092     {
2093     BOOL isinclass = FALSE;
2094     int next_state_offset;
2095     const uschar *ecode;
2096    
2097     /* For a simple class, there is always just a 32-byte table, and we
2098     can set isinclass from it. */
2099    
2100     if (codevalue != OP_XCLASS)
2101     {
2102     ecode = code + 33;
2103     if (clen > 0)
2104     {
2105     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2106     ((code[1 + c/8] & (1 << (c&7))) != 0);
2107     }
2108     }
2109    
2110     /* An extended class may have a table or a list of single characters,
2111     ranges, or both, and it may be positive or negative. There's a
2112     function that sorts all this out. */
2113    
2114     else
2115     {
2116     ecode = code + GET(code, 1);
2117     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2118     }
2119    
2120     /* At this point, isinclass is set for all kinds of class, and ecode
2121     points to the byte after the end of the class. If there is a
2122     quantifier, this is where it will be. */
2123    
2124     next_state_offset = ecode - start_code;
2125    
2126     switch (*ecode)
2127     {
2128     case OP_CRSTAR:
2129     case OP_CRMINSTAR:
2130     ADD_ACTIVE(next_state_offset + 1, 0);
2131     if (isinclass) { ADD_NEW(state_offset, 0); }
2132     break;
2133    
2134     case OP_CRPLUS:
2135     case OP_CRMINPLUS:
2136     count = current_state->count; /* Already matched */
2137     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2138     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2139     break;
2140    
2141     case OP_CRQUERY:
2142     case OP_CRMINQUERY:
2143     ADD_ACTIVE(next_state_offset + 1, 0);
2144     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2145     break;
2146    
2147     case OP_CRRANGE:
2148     case OP_CRMINRANGE:
2149     count = current_state->count; /* Already matched */
2150     if (count >= GET2(ecode, 1))
2151     { ADD_ACTIVE(next_state_offset + 5, 0); }
2152     if (isinclass)
2153     {
2154 nigel 91 int max = GET2(ecode, 3);
2155     if (++count >= max && max != 0) /* Max 0 => no limit */
2156 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2157     else
2158     { ADD_NEW(state_offset, count); }
2159     }
2160     break;
2161    
2162     default:
2163     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2164     break;
2165     }
2166     }
2167     break;
2168    
2169     /* ========================================================================== */
2170     /* These are the opcodes for fancy brackets of various kinds. We have
2171 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2172     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2173 ph10 341 though the other "backtracking verbs" are not supported. */
2174 ph10 345
2175 ph10 341 case OP_FAIL:
2176 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2177 ph10 345 break;
2178 nigel 77
2179     case OP_ASSERT:
2180     case OP_ASSERT_NOT:
2181     case OP_ASSERTBACK:
2182     case OP_ASSERTBACK_NOT:
2183     {
2184     int rc;
2185     int local_offsets[2];
2186     int local_workspace[1000];
2187     const uschar *endasscode = code + GET(code, 1);
2188    
2189     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2190    
2191     rc = internal_dfa_exec(
2192     md, /* static match data */
2193     code, /* this subexpression's code */
2194     ptr, /* where we currently are */
2195     ptr - start_subject, /* start offset */
2196     local_offsets, /* offset vector */
2197     sizeof(local_offsets)/sizeof(int), /* size of same */
2198     local_workspace, /* workspace vector */
2199     sizeof(local_workspace)/sizeof(int), /* size of same */
2200     ims, /* the current ims flags */
2201     rlevel, /* function recursion level */
2202     recursing); /* pass on regex recursion */
2203    
2204     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2205     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2206     }
2207     break;
2208    
2209     /*-----------------------------------------------------------------*/
2210     case OP_COND:
2211 nigel 93 case OP_SCOND:
2212 nigel 77 {
2213     int local_offsets[1000];
2214     int local_workspace[1000];
2215 ph10 406 int codelink = GET(code, 1);
2216 ph10 397 int condcode;
2217 ph10 406
2218 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2219 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2220 ph10 398 happen for the other conditions. */
2221 nigel 77
2222 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2223 ph10 406 {
2224     rrc = 0;
2225 ph10 397 if (pcre_callout != NULL)
2226     {
2227     pcre_callout_block cb;
2228     cb.version = 1; /* Version 1 of the callout block */
2229     cb.callout_number = code[LINK_SIZE+2];
2230     cb.offset_vector = offsets;
2231     cb.subject = (PCRE_SPTR)start_subject;
2232     cb.subject_length = end_subject - start_subject;
2233     cb.start_match = current_subject - start_subject;
2234     cb.current_position = ptr - start_subject;
2235     cb.pattern_position = GET(code, LINK_SIZE + 3);
2236     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2237     cb.capture_top = 1;
2238     cb.capture_last = -1;
2239     cb.callout_data = md->callout_data;
2240     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2241     }
2242 ph10 398 if (rrc > 0) break; /* Fail this thread */
2243     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2244 ph10 406 }
2245 ph10 398
2246 ph10 397 condcode = code[LINK_SIZE+1];
2247 ph10 406
2248 nigel 93 /* Back reference conditions are not supported */
2249 nigel 77
2250 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2251    
2252     /* The DEFINE condition is always false */
2253    
2254     if (condcode == OP_DEF)
2255 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2256 nigel 93
2257     /* The only supported version of OP_RREF is for the value RREF_ANY,
2258     which means "test if in any recursion". We can't test for specifically
2259     recursed groups. */
2260    
2261     else if (condcode == OP_RREF)
2262     {
2263 nigel 77 int value = GET2(code, LINK_SIZE+2);
2264 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2265 ph10 406 if (recursing > 0)
2266 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2267     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2268 nigel 77 }
2269    
2270     /* Otherwise, the condition is an assertion */
2271    
2272     else
2273     {
2274     int rc;
2275     const uschar *asscode = code + LINK_SIZE + 1;
2276     const uschar *endasscode = asscode + GET(asscode, 1);
2277    
2278     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2279    
2280     rc = internal_dfa_exec(
2281     md, /* fixed match data */
2282     asscode, /* this subexpression's code */
2283     ptr, /* where we currently are */
2284     ptr - start_subject, /* start offset */
2285     local_offsets, /* offset vector */
2286     sizeof(local_offsets)/sizeof(int), /* size of same */
2287     local_workspace, /* workspace vector */
2288     sizeof(local_workspace)/sizeof(int), /* size of same */
2289     ims, /* the current ims flags */
2290     rlevel, /* function recursion level */
2291     recursing); /* pass on regex recursion */
2292    
2293     if ((rc >= 0) ==
2294     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2295 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2296 nigel 77 else
2297 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2298 nigel 77 }
2299     }
2300     break;
2301    
2302     /*-----------------------------------------------------------------*/
2303     case OP_RECURSE:
2304     {
2305     int local_offsets[1000];
2306     int local_workspace[1000];
2307     int rc;
2308    
2309     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2310     recursing + 1));
2311    
2312     rc = internal_dfa_exec(
2313     md, /* fixed match data */
2314     start_code + GET(code, 1), /* this subexpression's code */
2315     ptr, /* where we currently are */
2316     ptr - start_subject, /* start offset */
2317     local_offsets, /* offset vector */
2318     sizeof(local_offsets)/sizeof(int), /* size of same */
2319     local_workspace, /* workspace vector */
2320     sizeof(local_workspace)/sizeof(int), /* size of same */
2321     ims, /* the current ims flags */
2322     rlevel, /* function recursion level */
2323     recursing + 1); /* regex recurse level */
2324    
2325     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2326     recursing + 1, rc));
2327    
2328     /* Ran out of internal offsets */
2329    
2330     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2331    
2332     /* For each successful matched substring, set up the next state with a
2333     count of characters to skip before trying it. Note that the count is in
2334     characters, not bytes. */
2335    
2336     if (rc > 0)
2337     {
2338     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2339     {
2340     const uschar *p = start_subject + local_offsets[rc];
2341     const uschar *pp = start_subject + local_offsets[rc+1];
2342     int charcount = local_offsets[rc+1] - local_offsets[rc];
2343     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2344     if (charcount > 0)
2345     {
2346     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2347     }
2348     else
2349     {
2350     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2351     }
2352     }
2353     }
2354     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2355     }
2356     break;
2357    
2358     /*-----------------------------------------------------------------*/
2359     case OP_ONCE:
2360     {
2361     int local_offsets[2];
2362     int local_workspace[1000];
2363    
2364     int rc = internal_dfa_exec(
2365     md, /* fixed match data */
2366     code, /* this subexpression's code */
2367     ptr, /* where we currently are */
2368     ptr - start_subject, /* start offset */
2369     local_offsets, /* offset vector */
2370     sizeof(local_offsets)/sizeof(int), /* size of same */
2371     local_workspace, /* workspace vector */
2372     sizeof(local_workspace)/sizeof(int), /* size of same */
2373     ims, /* the current ims flags */
2374     rlevel, /* function recursion level */
2375     recursing); /* pass on regex recursion */
2376    
2377     if (rc >= 0)
2378     {
2379     const uschar *end_subpattern = code;
2380     int charcount = local_offsets[1] - local_offsets[0];
2381     int next_state_offset, repeat_state_offset;
2382    
2383     do { end_subpattern += GET(end_subpattern, 1); }
2384     while (*end_subpattern == OP_ALT);
2385     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2386    
2387     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2388     arrange for the repeat state also to be added to the relevant list.
2389     Calculate the offset, or set -1 for no repeat. */
2390    
2391     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2392     *end_subpattern == OP_KETRMIN)?
2393     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2394    
2395     /* If we have matched an empty string, add the next state at the
2396     current character pointer. This is important so that the duplicate
2397     checking kicks in, which is what breaks infinite loops that match an
2398     empty string. */
2399    
2400     if (charcount == 0)
2401     {
2402     ADD_ACTIVE(next_state_offset, 0);
2403     }
2404    
2405     /* Optimization: if there are no more active states, and there
2406     are no new states yet set up, then skip over the subject string
2407     right here, to save looping. Otherwise, set up the new state to swing
2408     into action when the end of the substring is reached. */
2409    
2410     else if (i + 1 >= active_count && new_count == 0)
2411     {
2412     ptr += charcount;
2413     clen = 0;
2414     ADD_NEW(next_state_offset, 0);
2415    
2416     /* If we are adding a repeat state at the new character position,
2417     we must fudge things so that it is the only current state.
2418     Otherwise, it might be a duplicate of one we processed before, and
2419     that would cause it to be skipped. */
2420    
2421     if (repeat_state_offset >= 0)
2422     {
2423     next_active_state = active_states;
2424     active_count = 0;
2425     i = -1;
2426     ADD_ACTIVE(repeat_state_offset, 0);
2427     }
2428     }
2429     else
2430     {
2431     const uschar *p = start_subject + local_offsets[0];
2432     const uschar *pp = start_subject + local_offsets[1];
2433     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2434     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2435     if (repeat_state_offset >= 0)
2436     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2437     }
2438    
2439     }
2440     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2441     }
2442     break;
2443    
2444    
2445     /* ========================================================================== */
2446     /* Handle callouts */
2447    
2448     case OP_CALLOUT:
2449 ph10 406 rrc = 0;
2450 nigel 77 if (pcre_callout != NULL)
2451     {
2452     pcre_callout_block cb;
2453     cb.version = 1; /* Version 1 of the callout block */
2454     cb.callout_number = code[1];
2455     cb.offset_vector = offsets;
2456 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2457 nigel 77 cb.subject_length = end_subject - start_subject;
2458     cb.start_match = current_subject - start_subject;
2459     cb.current_position = ptr - start_subject;
2460     cb.pattern_position = GET(code, 2);
2461     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2462     cb.capture_top = 1;
2463     cb.capture_last = -1;
2464     cb.callout_data = md->callout_data;
2465     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2466 ph10 406 }
2467     if (rrc == 0)
2468     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2469 nigel 77 break;
2470    
2471    
2472     /* ========================================================================== */
2473     default: /* Unsupported opcode */
2474     return PCRE_ERROR_DFA_UITEM;
2475     }
2476    
2477     NEXT_ACTIVE_STATE: continue;
2478    
2479     } /* End of loop scanning active states */
2480    
2481     /* We have finished the processing at the current subject character. If no
2482     new states have been set for the next character, we have found all the
2483     matches that we are going to find. If we are at the top level and partial
2484 ph10 428 matching has been requested, check for appropriate conditions. The
2485     "forced_fail" variable counts the number of (*F) encountered for the
2486     character. If it is equal to the original active_count (saved in
2487     workspace[1]) it means that (*F) was found on every active state. In this
2488     case we don't want to give a partial match. */
2489 nigel 77
2490     if (new_count <= 0)
2491     {
2492 ph10 427 if (rlevel == 1 && /* Top level, and */
2493 ph10 428 reached_end != workspace[1] && /* Not all reached end */
2494     forced_fail != workspace[1] && /* Not all forced fail & */
2495 ph10 427 ( /* either... */
2496     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2497     || /* or... */
2498     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2499     match_count < 0) /* no matches */
2500     ) && /* And... */
2501     ptr >= end_subject && /* Reached end of subject */
2502     ptr > current_subject) /* Matched non-empty string */
2503 nigel 77 {
2504     if (offsetcount >= 2)
2505     {
2506     offsets[0] = current_subject - start_subject;
2507     offsets[1] = end_subject - start_subject;
2508     }
2509     match_count = PCRE_ERROR_PARTIAL;
2510     }
2511    
2512     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2513     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2514     rlevel*2-2, SP));
2515 nigel 91 break; /* In effect, "return", but see the comment below */
2516 nigel 77 }
2517    
2518     /* One or more states are active for the next character. */
2519    
2520     ptr += clen; /* Advance to next subject character */
2521     } /* Loop to move along the subject string */
2522    
2523 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2524     if we use "return" above, we have compiler trouble. Some compilers warn if
2525     there's nothing here because they think the function doesn't return a value. On
2526     the other hand, if we put a dummy statement here, some more clever compilers
2527     complain that it can't be reached. Sigh. */
2528 nigel 77
2529 nigel 91 return match_count;
2530 nigel 77 }
2531    
2532    
2533    
2534    
2535     /*************************************************
2536     * Execute a Regular Expression - DFA engine *
2537     *************************************************/
2538    
2539     /* This external function applies a compiled re to a subject string using a DFA
2540     engine. This function calls the internal function multiple times if the pattern
2541     is not anchored.
2542    
2543     Arguments:
2544     argument_re points to the compiled expression
2545 ph10 97 extra_data points to extra data or is NULL
2546 nigel 77 subject points to the subject string
2547     length length of subject string (may contain binary zeros)
2548     start_offset where to start in the subject string
2549     options option bits
2550     offsets vector of match offsets
2551     offsetcount size of same
2552     workspace workspace vector
2553     wscount size of same
2554    
2555     Returns: > 0 => number of match offset pairs placed in offsets
2556     = 0 => offsets overflowed; longest matches are present
2557     -1 => failed to match
2558     < -1 => partial match (PCRE_ERROR_PARTIAL) or some kind of error
2559     */
2560    
2561 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2562 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2563     const char *subject, int length, int start_offset, int options, int *offsets,
2564     int offsetcount, int *workspace, int wscount)
2565     {
2566     real_pcre *re = (real_pcre *)argument_re;
2567     dfa_match_data match_block;
2568 nigel 91 dfa_match_data *md = &match_block;
2569 nigel 77 BOOL utf8, anchored, startline, firstline;
2570     const uschar *current_subject, *end_subject, *lcc;
2571    
2572     pcre_study_data internal_study;
2573     const pcre_study_data *study = NULL;
2574     real_pcre internal_re;
2575    
2576     const uschar *req_byte_ptr;
2577     const uschar *start_bits = NULL;
2578     BOOL first_byte_caseless = FALSE;
2579     BOOL req_byte_caseless = FALSE;
2580     int first_byte = -1;
2581     int req_byte = -1;
2582     int req_byte2 = -1;
2583 nigel 91 int newline;
2584 nigel 77
2585     /* Plausibility checks */
2586    
2587     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2588     if (re == NULL || subject == NULL || workspace == NULL ||
2589     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2590     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2591     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2592    
2593     /* We need to find the pointer to any study data before we test for byte
2594     flipping, so we scan the extra_data block first. This may set two fields in the
2595     match block, so we must initialize them beforehand. However, the other fields
2596     in the match block must not be set until after the byte flipping. */
2597    
2598 nigel 91 md->tables = re->tables;
2599     md->callout_data = NULL;
2600 nigel 77
2601     if (extra_data != NULL)
2602     {
2603     unsigned int flags = extra_data->flags;
2604     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2605     study = (const pcre_study_data *)extra_data->study_data;
2606     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2607 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2608     return PCRE_ERROR_DFA_UMLIMIT;
2609 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2610 nigel 91 md->callout_data = extra_data->callout_data;
2611 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2612 nigel 91 md->tables = extra_data->tables;
2613 nigel 77 }
2614    
2615     /* Check that the first field in the block is the magic number. If it is not,
2616     test for a regex that was compiled on a host of opposite endianness. If this is
2617     the case, flipped values are put in internal_re and internal_study if there was
2618     study data too. */
2619    
2620     if (re->magic_number != MAGIC_NUMBER)
2621     {
2622     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2623     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2624     if (study != NULL) study = &internal_study;
2625     }
2626    
2627     /* Set some local values */
2628    
2629     current_subject = (const unsigned char *)subject + start_offset;
2630     end_subject = (const unsigned char *)subject + length;
2631     req_byte_ptr = current_subject - 1;
2632    
2633 nigel 91 #ifdef SUPPORT_UTF8
2634 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2635 nigel 91 #else
2636     utf8 = FALSE;
2637     #endif
2638 nigel 77
2639 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2640     (re->options & PCRE_ANCHORED) != 0;
2641    
2642 nigel 77 /* The remaining fixed data for passing around. */
2643    
2644 nigel 91 md->start_code = (const uschar *)argument_re +
2645 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2646 nigel 91 md->start_subject = (const unsigned char *)subject;
2647     md->end_subject = end_subject;
2648     md->moptions = options;
2649     md->poptions = re->options;
2650 nigel 77
2651 ph10 231 /* If the BSR option is not set at match time, copy what was set
2652     at compile time. */
2653    
2654     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2655     {
2656     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2657     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2658     #ifdef BSR_ANYCRLF
2659     else md->moptions |= PCRE_BSR_ANYCRLF;
2660 ph10 243 #endif
2661     }
2662 ph10 231
2663 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2664     nothing is set at run time, whatever was used at compile time applies. */
2665 nigel 91
2666 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2667 nigel 93 PCRE_NEWLINE_BITS)
2668 nigel 91 {
2669 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2670 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2671     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2672 nigel 91 case PCRE_NEWLINE_CR+
2673 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2674 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2675 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2676 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2677 nigel 91 }
2678    
2679 ph10 149 if (newline == -2)
2680 nigel 91 {
2681 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2682     }
2683     else if (newline < 0)
2684     {
2685 nigel 93 md->nltype = NLTYPE_ANY;
2686 nigel 91 }
2687     else
2688     {
2689 nigel 93 md->nltype = NLTYPE_FIXED;
2690     if (newline > 255)
2691     {
2692     md->nllen = 2;
2693     md->nl[0] = (newline >> 8) & 255;
2694     md->nl[1] = newline & 255;
2695     }
2696     else
2697     {
2698     md->nllen = 1;
2699     md->nl[0] = newline;
2700     }
2701 nigel 91 }
2702    
2703 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2704     back the character offset. */
2705    
2706     #ifdef SUPPORT_UTF8
2707     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2708     {
2709     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2710     return PCRE_ERROR_BADUTF8;
2711     if (start_offset > 0 && start_offset < length)
2712     {
2713     int tb = ((uschar *)subject)[start_offset];
2714     if (tb > 127)
2715     {
2716     tb &= 0xc0;
2717     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2718     }
2719     }
2720     }
2721     #endif
2722    
2723     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2724     is a feature that makes it possible to save compiled regexes and re-use them
2725     in other programs later. */
2726    
2727 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2728 nigel 77
2729     /* The lower casing table and the "must be at the start of a line" flag are
2730     used in a loop when finding where to start. */
2731    
2732 nigel 91 lcc = md->tables + lcc_offset;
2733 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2734 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2735    
2736     /* Set up the first character to match, if available. The first_byte value is
2737     never set for an anchored regular expression, but the anchoring may be forced
2738     at run time, so we have to test for anchoring. The first char may be unset for
2739     an unanchored pattern, of course. If there's no first char and the pattern was
2740     studied, there may be a bitmap of possible first characters. */
2741    
2742     if (!anchored)
2743     {
2744 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2745 nigel 77 {
2746     first_byte = re->first_byte & 255;
2747     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2748     first_byte = lcc[first_byte];
2749     }
2750     else
2751     {
2752     if (startline && study != NULL &&
2753     (study->options & PCRE_STUDY_MAPPED) != 0)
2754     start_bits = study->start_bits;
2755     }
2756     }
2757    
2758     /* For anchored or unanchored matches, there may be a "last known required
2759     character" set. */
2760    
2761 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2762 nigel 77 {
2763     req_byte = re->req_byte & 255;
2764     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2765 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2766 nigel 77 }
2767    
2768     /* Call the main matching function, looping for a non-anchored regex after a
2769 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2770     a match. */
2771 nigel 77
2772     for (;;)
2773     {
2774     int rc;
2775    
2776     if ((options & PCRE_DFA_RESTART) == 0)
2777     {
2778     const uschar *save_end_subject = end_subject;
2779    
2780 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2781     line of a multiline string. Implement this by temporarily adjusting
2782     end_subject so that we stop scanning at a newline. If the match fails at
2783     the newline, later code breaks this loop. */
2784 nigel 77
2785     if (firstline)
2786     {
2787 ph10 365 USPTR t = current_subject;
2788     #ifdef SUPPORT_UTF8
2789     if (utf8)
2790 ph10 371 {
2791     while (t < md->end_subject && !IS_NEWLINE(t))
2792 ph10 365 {
2793     t++;
2794     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2795 ph10 371 }
2796 ph10 365 }
2797     else
2798 ph10 371 #endif
2799 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2800 nigel 77 end_subject = t;
2801     }
2802 ph10 392
2803 ph10 389 /* There are some optimizations that avoid running the match if a known
2804     starting point is not found, or if a known later character is not present.
2805     However, there is an option that disables these, for testing and for
2806     ensuring that all callouts do actually occur. */
2807 nigel 77
2808 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2809 ph10 392 {
2810    
2811 ph10 389 /* Advance to a known first byte. */
2812 ph10 392
2813 ph10 389 if (first_byte >= 0)
2814 nigel 77 {
2815 ph10 389 if (first_byte_caseless)
2816     while (current_subject < end_subject &&
2817     lcc[*current_subject] != first_byte)
2818     current_subject++;
2819     else
2820 ph10 392 while (current_subject < end_subject &&
2821 ph10 389 *current_subject != first_byte)
2822     current_subject++;
2823     }
2824 ph10 392
2825 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2826 ph10 392
2827 ph10 389 else if (startline)
2828     {
2829     if (current_subject > md->start_subject + start_offset)
2830     {
2831 ph10 365 #ifdef SUPPORT_UTF8
2832 ph10 389 if (utf8)
2833 ph10 365 {
2834 ph10 392 while (current_subject < end_subject &&
2835 ph10 389 !WAS_NEWLINE(current_subject))
2836     {
2837 ph10 365 current_subject++;
2838 ph10 389 while(current_subject < end_subject &&
2839     (*current_subject & 0xc0) == 0x80)
2840     current_subject++;
2841     }
2842 ph10 371 }
2843 ph10 389 else
2844     #endif
2845     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2846     current_subject++;
2847 ph10 392
2848 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2849     ANYCRLF, and we are now at a LF, advance the match position by one
2850     more character. */
2851 ph10 392
2852 ph10 391 if (current_subject[-1] == CHAR_CR &&
2853 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2854     current_subject < end_subject &&
2855 ph10 391 *current_subject == CHAR_NL)
2856 ph10 389 current_subject++;
2857 ph10 365 }
2858 nigel 77 }
2859 ph10 392
2860 ph10 389 /* Or to a non-unique first char after study */
2861 ph10 392
2862 ph10 389 else if (start_bits != NULL)
2863 nigel 77 {
2864 ph10 389 while (current_subject < end_subject)
2865     {
2866     register unsigned int c = *current_subject;
2867     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2868     else break;
2869     }
2870 nigel 77 }
2871 ph10 392 }
2872 nigel 77
2873     /* Restore fudged end_subject */
2874    
2875     end_subject = save_end_subject;
2876     }
2877    
2878     /* If req_byte is set, we know that that character must appear in the subject
2879     for the match to succeed. If the first character is set, req_byte must be
2880     later in the subject; otherwise the test starts at the match point. This
2881     optimization can save a huge amount of work in patterns with nested unlimited
2882     repeats that aren't going to match. Writing separate code for cased/caseless
2883     versions makes it go faster, as does using an autoincrement and backing off
2884     on a match.
2885    
2886     HOWEVER: when the subject string is very, very long, searching to its end can
2887     take a long time, and give bad performance on quite ordinary patterns. This
2888     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2889     don't do this when the string is sufficiently long.
2890    
2891 ph10 392 ALSO: this processing is disabled when partial matching is requested, and can
2892 ph10 428 also be explicitly deactivated. Furthermore, we have to disable when
2893     restarting after a partial match, because the required character may have
2894     already been matched. */
2895 nigel 77
2896 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2897     req_byte >= 0 &&
2898 nigel 77 end_subject - current_subject < REQ_BYTE_MAX &&
2899 ph10 428 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2900 nigel 77 {
2901     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2902    
2903     /* We don't need to repeat the search if we haven't yet reached the
2904     place we found it at last time. */
2905    
2906     if (p > req_byte_ptr)
2907     {
2908     if (req_byte_caseless)
2909     {
2910     while (p < end_subject)
2911     {
2912     register int pp = *p++;
2913     if (pp == req_byte || pp == req_byte2) { p--; break; }
2914     }
2915     }
2916     else
2917     {
2918     while (p < end_subject)
2919     {
2920     if (*p++ == req_byte) { p--; break; }
2921     }
2922     }
2923    
2924     /* If we can't find the required character, break the matching loop,
2925     which will cause a return of PCRE_ERROR_NOMATCH. */
2926    
2927     if (p >= end_subject) break;
2928    
2929     /* If we have found the required character, save the point where we
2930     found it, so that we don't search again next time round the loop if
2931     the start hasn't passed this character yet. */
2932    
2933     req_byte_ptr = p;
2934     }
2935     }
2936    
2937     /* OK, now we can do the business */
2938    
2939     rc = internal_dfa_exec(
2940 nigel 91 md, /* fixed match data */
2941     md->start_code, /* this subexpression's code */
2942     current_subject, /* where we currently are */
2943     start_offset, /* start offset in subject */
2944     offsets, /* offset vector */
2945     offsetcount, /* size of same */
2946     workspace, /* workspace vector */
2947     wscount, /* size of same */
2948 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2949 nigel 91 0, /* function recurse level */
2950     0); /* regex recurse level */
2951 nigel 77
2952     /* Anything other than "no match" means we are done, always; otherwise, carry
2953     on only if not anchored. */
2954    
2955     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2956    
2957     /* Advance to the next subject character unless we are at the end of a line
2958     and firstline is set. */
2959    
2960 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2961 nigel 77 current_subject++;
2962     if (utf8)
2963     {
2964     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2965     current_subject++;
2966     }
2967     if (current_subject > end_subject) break;
2968    
2969 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2970 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2971     or ANY or ANYCRLF, advance the match position by one more character. */
2972 nigel 93
2973 ph10 391 if (current_subject[-1] == CHAR_CR &&
2974 ph10 226 current_subject < end_subject &&
2975 ph10 391 *current_subject == CHAR_NL &&
2976 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2977 ph10 226 (md->nltype == NLTYPE_ANY ||
2978     md->nltype == NLTYPE_ANYCRLF ||
2979     md->nllen == 2))
2980 nigel 93 current_subject++;
2981    
2982     } /* "Bumpalong" loop */
2983    
2984 nigel 77 return PCRE_ERROR_NOMATCH;
2985     }
2986    
2987     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12