/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 439 - (hide annotations) (download)
Tue Sep 8 17:27:24 2009 UTC (5 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 101264 byte(s)
Added performance comment to pcre_exec.c.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 383 Copyright (c) 1997-2009 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 439 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51     test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71     Overall, I concluded that the gains in some cases did not outweigh the losses
72     in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output: it is passed as the string argument
        of a "%.*s" conversion whose precision is computed from the recursion
        level (rlevel*2-2) in the DPRINTF/printf calls below. */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes.
        The offset is added to the local copy of the opcode value (codevalue) when
        the opcode is at or beyond OP_TYPESTAR and its one-byte type argument is
        one of the items named against each offset below, so that the main switch
        can dispatch on a distinct value. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300 /* argument is OP_PROP or OP_NOTPROP */
102     #define OP_EXTUNI_EXTRA 320 /* argument is OP_EXTUNI */
103     #define OP_ANYNL_EXTRA 340 /* argument is OP_ANYNL */
104     #define OP_HSPACE_EXTRA 360 /* argument is OP_HSPACE or OP_NOT_HSPACE */
105     #define OP_VSPACE_EXTRA 380 /* argument is OP_VSPACE or OP_NOT_VSPACE */
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109     character that is to be tested in some way. This makes it possible to
110     centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
113 ph10 168 that follow must also be modified.

        The table is indexed by opcode value. A non-zero entry is the offset from
        the opcode at which the character (or type opcode) is to be found; a zero
        entry means that the opcode is not followed by such a character. */
114 nigel 77
115 ph10 327 static const uschar coptable[] = {
116 nigel 77 0, /* End */
117 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
120 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
121     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
123     1, /* Char */
124     1, /* Charnc */
125     1, /* not */
126     /* Positive single-char repeats */
127     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
128     3, 3, 3, /* upto, minupto, exact */
129 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
130 nigel 77 /* Negative single-char repeats - only for chars < 256 */
131     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
132     3, 3, 3, /* NOT upto, minupto, exact */
133 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
134 nigel 77 /* Positive type repeats */
135     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
136     3, 3, 3, /* Type upto, minupto, exact */
137 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
138 nigel 77 /* Character class & ref repeats */
139     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
140     0, 0, /* CRRANGE, CRMINRANGE */
141     0, /* CLASS */
142     0, /* NCLASS */
143     0, /* XCLASS - variable length */
144     0, /* REF */
145     0, /* RECURSE */
146     0, /* CALLOUT */
147     0, /* Alt */
148     0, /* Ket */
149     0, /* KetRmax */
150     0, /* KetRmin */
151     0, /* Assert */
152     0, /* Assert not */
153     0, /* Assert behind */
154     0, /* Assert behind not */
155     0, /* Reverse */
156 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
157     0, 0, 0, /* SBRA, SCBRA, SCOND */
158 nigel 77 0, /* CREF */
159 nigel 93 0, /* RREF */
160     0, /* DEF */
161 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
162     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
163 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
164 nigel 77 };
165    
166     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
167     and \w. Both are indexed by opcode value, and the test that is applied to a
        character c is

          ((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0

        toptable1 supplies the ctype bit that is to be examined; toptable2 supplies
        a value that is XORed with the result, which reverses the sense of the test
        for the negated types. */
168    
169 ph10 327 static const uschar toptable1[] = {
170 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
171 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
172     ctype_space, ctype_space, /* \S, \s */
173     ctype_word, ctype_word, /* \W, \w */
174 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
175 nigel 77 };
176    
/* XOR values, indexed by opcode, that are applied after a character's ctype
bits have been masked with the corresponding toptable1 entry. For \D, \S and \W
the entry equals the bit that was tested, so the result is non-zero only when
the bit is absent; for \d, \s and \w it is zero, so the result is non-zero only
when the bit is present; for OP_ANY and OP_ALLANY it is 1 (their toptable1
entries are zero), so the result is always non-zero. */
177 ph10 327 static const uschar toptable2[] = {
178 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
179 nigel 77 ctype_digit, 0, /* \D, \d */
180     ctype_space, 0, /* \S, \s */
181     ctype_word, 0, /* \W, \w */
182 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
183 nigel 77 };
184    
185    
186     /* Structure for holding data about a particular state, which is in effect the
187     current data for an active path through the match tree. It must consist
188     entirely of ints because the working vector we are passed, and which we put
189     these structures in, is a vector of ints. */
190    
191     typedef struct stateblock {
192     int offset; /* Offset of the opcode from the start of the compiled code; a negative value is a state that is held off while characters are skipped (see data) */
193     int count; /* Count for repeats */
194     int ims; /* ims flag bits in force for this state */
195     int data; /* Extra data used by some states, e.g. the number of characters still to be skipped by a held-off (negative offset) state */
196     } stateblock;
197    
/* The number of ints that one stateblock occupies in the workspace vector; it
is used to convert the size of the workspace into a number of states. */
198     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
199    
200    
201     #ifdef DEBUG
202     /*************************************************
203     * Print character string *
204     *************************************************/
205    
206     /* Character string printing function for debugging. It is compiled only
        when DEBUG is defined. Exactly "length" bytes are written: a byte for which
        isprint() is true is written unchanged, and any other byte is written as a
        backslash, an 'x', and two lower case hexadecimal digits. The string does
        not have to be zero-terminated, and nothing is written if length is not
        positive.
207    
208     Arguments:
209     p points to string
210     length number of bytes
211     f where to print
212    
213     Returns: nothing
214     */
215    
216     static void
217     pchars(unsigned char *p, int length, FILE *f)
218     {
219     int c; /* The current byte, held as an int for isprint() and fprintf() */
220     while (length-- > 0)
221     {
222     if (isprint(c = *(p++))) /* Fetch the next byte and advance */
223     fprintf(f, "%c", c);
224     else
225     fprintf(f, "\\x%02x", c); /* Non-printing: show as a hex escape */
226     }
227     }
228     #endif
229    
230    
231    
232     /*************************************************
233     * Execute a Regular Expression - DFA engine *
234     *************************************************/
235    
236     /* This internal function applies a compiled pattern to a subject string,
237     starting at a given point, using a DFA engine. This function is called from the
238     external one, possibly multiple times if the pattern is not anchored. The
239     function calls itself recursively for some kinds of subpattern.
240    
241     Arguments:
242     md the match_data block with fixed information
243     this_start_code the opening bracket of this subexpression's code
244     current_subject where we currently are in the subject string
245     start_offset start offset in the subject string
246     offsets vector to contain the matching string offsets
247     offsetcount size of same
248     workspace vector of workspace
249     wscount size of same
250     ims the current ims flags
251     rlevel function call recursion level
252     recursing regex recursive call level
253    
254 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
255 ph10 341 = 0 => offsets overflowed; longest matches are present
256 nigel 77 -1 => failed to match
257     < -1 => some kind of unexpected problem
258    
259     The following macros are used for adding states to the two state vectors (one
260     for the current character, one for the following character). */
261    
/* State-adding macros. Each one appends a stateblock to one of the two state
lists (ADD_ACTIVE and ADD_ACTIVE_DATA to the list for the current character,
ADD_NEW and ADD_NEW_DATA to the list for the following character) provided that
the list's counter is still less than wscount; otherwise it makes the function
in which it is expanded return PCRE_ERROR_DFA_WSSIZE. The counter is
post-incremented as part of the test, so it is increased on the failing path as
well. The macros refer implicitly to these variables of the enclosing function:
active_count or new_count, next_active_state or next_new_state, wscount, ims,
and rlevel (the last only in the debugging output). Because each macro ends
with "else return ...", an invocation must be followed by a semicolon. */

/* Add state (offset x, count y) to the active list; the data field is not set. */
262     #define ADD_ACTIVE(x,y) \
263     if (active_count++ < wscount) \
264     { \
265     next_active_state->offset = (x); \
266     next_active_state->count = (y); \
267     next_active_state->ims = ims; \
268     next_active_state++; \
269     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
270     } \
271     else return PCRE_ERROR_DFA_WSSIZE
272    
/* As ADD_ACTIVE, but also sets the data field of the new state to z. */
273     #define ADD_ACTIVE_DATA(x,y,z) \
274     if (active_count++ < wscount) \
275     { \
276     next_active_state->offset = (x); \
277     next_active_state->count = (y); \
278     next_active_state->ims = ims; \
279     next_active_state->data = (z); \
280     next_active_state++; \
281     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
282     } \
283     else return PCRE_ERROR_DFA_WSSIZE
284    
/* Add state (offset x, count y) to the new list; the data field is not set. */
285     #define ADD_NEW(x,y) \
286     if (new_count++ < wscount) \
287     { \
288     next_new_state->offset = (x); \
289     next_new_state->count = (y); \
290     next_new_state->ims = ims; \
291     next_new_state++; \
292     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
293     } \
294     else return PCRE_ERROR_DFA_WSSIZE
295    
/* As ADD_NEW, but also sets the data field of the new state to z. */
296     #define ADD_NEW_DATA(x,y,z) \
297     if (new_count++ < wscount) \
298     { \
299     next_new_state->offset = (x); \
300     next_new_state->count = (y); \
301     next_new_state->ims = ims; \
302     next_new_state->data = (z); \
303     next_new_state++; \
304     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
305     } \
306     else return PCRE_ERROR_DFA_WSSIZE
307    
308     /* And now, here is the code */
309    
310     static int
311     internal_dfa_exec(
312     dfa_match_data *md,
313     const uschar *this_start_code,
314     const uschar *current_subject,
315     int start_offset,
316     int *offsets,
317     int offsetcount,
318     int *workspace,
319     int wscount,
320     int ims,
321     int rlevel,
322     int recursing)
323     {
324     stateblock *active_states, *new_states, *temp_states;
325     stateblock *next_active_state, *next_new_state;
326    
327     const uschar *ctypes, *lcc, *fcc;
328     const uschar *ptr;
329 nigel 93 const uschar *end_code, *first_op;
330 nigel 77
331     int active_count, new_count, match_count;
332    
333     /* Some fields in the md block are frequently referenced, so we load them into
334     independent variables in the hope that this will perform better. */
335    
336     const uschar *start_subject = md->start_subject;
337     const uschar *end_subject = md->end_subject;
338     const uschar *start_code = md->start_code;
339    
340 nigel 87 #ifdef SUPPORT_UTF8
341 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
342 nigel 93 #else
343     BOOL utf8 = FALSE;
344 nigel 87 #endif
345 nigel 77
346     rlevel++;
347     offsetcount &= (-2);
348    
349     wscount -= 2;
350     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
351     (2 * INTS_PER_STATEBLOCK);
352    
353     DPRINTF(("\n%.*s---------------------\n"
354     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
355     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
356    
357     ctypes = md->tables + ctypes_offset;
358     lcc = md->tables + lcc_offset;
359     fcc = md->tables + fcc_offset;
360    
361     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
362    
363     active_states = (stateblock *)(workspace + 2);
364     next_new_state = new_states = active_states + wscount;
365     new_count = 0;
366    
367 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
368     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
369    
370 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
371     the alternative states onto the list, and find out where the end is. This
372     makes it possible to use this function recursively, when we want to stop at a
373     matching internal ket rather than at the end.
374    
375     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
376     a backward assertion. In that case, we have to find out the maximum amount to
377     move back, and set up each alternative appropriately. */
378    
379 nigel 93 if (*first_op == OP_REVERSE)
380 nigel 77 {
381     int max_back = 0;
382     int gone_back;
383    
384     end_code = this_start_code;
385     do
386     {
387     int back = GET(end_code, 2+LINK_SIZE);
388     if (back > max_back) max_back = back;
389     end_code += GET(end_code, 1);
390     }
391     while (*end_code == OP_ALT);
392    
393     /* If we can't go back the amount required for the longest lookbehind
394     pattern, go back as far as we can; some alternatives may still be viable. */
395    
396     #ifdef SUPPORT_UTF8
397     /* In character mode we have to step back character by character */
398    
399     if (utf8)
400     {
401     for (gone_back = 0; gone_back < max_back; gone_back++)
402     {
403     if (current_subject <= start_subject) break;
404     current_subject--;
405     while (current_subject > start_subject &&
406     (*current_subject & 0xc0) == 0x80)
407     current_subject--;
408     }
409     }
410     else
411     #endif
412    
413     /* In byte-mode we can do this quickly. */
414    
415     {
416     gone_back = (current_subject - max_back < start_subject)?
417     current_subject - start_subject : max_back;
418     current_subject -= gone_back;
419     }
420 ph10 435
421     /* Save the earliest consulted character */
422    
423     if (current_subject < md->start_used_ptr)
424     md->start_used_ptr = current_subject;
425 nigel 77
426     /* Now we can process the individual branches. */
427    
428     end_code = this_start_code;
429     do
430     {
431     int back = GET(end_code, 2+LINK_SIZE);
432     if (back <= gone_back)
433     {
434     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
435     ADD_NEW_DATA(-bstate, 0, gone_back - back);
436     }
437     end_code += GET(end_code, 1);
438     }
439     while (*end_code == OP_ALT);
440     }
441    
442     /* This is the code for a "normal" subpattern (not a backward assertion). The
443     start of a whole pattern is always one of these. If we are at the top level,
444     we may be asked to restart matching from the same point that we reached for a
445     previous partial match. We still have to scan through the top-level branches to
446     find the end state. */
447    
448     else
449     {
450     end_code = this_start_code;
451    
452     /* Restarting */
453    
454     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
455     {
456     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
457     new_count = workspace[1];
458     if (!workspace[0])
459     memcpy(new_states, active_states, new_count * sizeof(stateblock));
460     }
461    
462     /* Not restarting */
463    
464     else
465     {
466 nigel 93 int length = 1 + LINK_SIZE +
467     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
468 nigel 77 do
469     {
470 nigel 93 ADD_NEW(end_code - start_code + length, 0);
471 nigel 77 end_code += GET(end_code, 1);
472 nigel 93 length = 1 + LINK_SIZE;
473 nigel 77 }
474     while (*end_code == OP_ALT);
475     }
476     }
477    
478     workspace[0] = 0; /* Bit indicating which vector is current */
479    
480     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
481    
482     /* Loop for scanning the subject */
483    
484     ptr = current_subject;
485     for (;;)
486     {
487     int i, j;
488 nigel 91 int clen, dlen;
489     unsigned int c, d;
490 ph10 428 int forced_fail = 0;
491     int reached_end = 0;
492 nigel 77
493     /* Make the new state list into the active state list and empty the
494     new state list. */
495    
496     temp_states = active_states;
497     active_states = new_states;
498     new_states = temp_states;
499     active_count = new_count;
500     new_count = 0;
501    
502     workspace[0] ^= 1; /* Remember for the restarting feature */
503     workspace[1] = active_count;
504    
505     #ifdef DEBUG
506     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
507     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
508     printf("\"\n");
509    
510     printf("%.*sActive states: ", rlevel*2-2, SP);
511     for (i = 0; i < active_count; i++)
512     printf("%d/%d ", active_states[i].offset, active_states[i].count);
513     printf("\n");
514     #endif
515    
516     /* Set the pointers for adding new states */
517    
518     next_active_state = active_states + active_count;
519     next_new_state = new_states;
520    
521     /* Load the current character from the subject outside the loop, as many
522     different states may want to look at it, and we assume that at least one
523     will. */
524    
525     if (ptr < end_subject)
526     {
527 nigel 93 clen = 1; /* Number of bytes in the character */
528 nigel 77 #ifdef SUPPORT_UTF8
529     if (utf8) { GETCHARLEN(c, ptr, clen); } else
530     #endif /* SUPPORT_UTF8 */
531     c = *ptr;
532     }
533     else
534     {
535 nigel 93 clen = 0; /* This indicates the end of the subject */
536     c = NOTACHAR; /* This value should never actually be used */
537 nigel 77 }
538    
539     /* Scan up the active states and act on each one. The result of an action
540     may be to add more states to the currently active list (e.g. on hitting a
541     parenthesis) or it may be to put states on the new list, for considering
542     when we move the character pointer on. */
543    
544     for (i = 0; i < active_count; i++)
545     {
546     stateblock *current_state = active_states + i;
547     const uschar *code;
548     int state_offset = current_state->offset;
549 ph10 397 int count, codevalue, rrc;
550 nigel 77
551     #ifdef DEBUG
552     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
553 nigel 93 if (clen == 0) printf("EOL\n");
554 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
555     else printf("0x%02x\n", c);
556     #endif
557    
558     /* This variable is referred to implicitly in the ADD_xxx macros. */
559    
560     ims = current_state->ims;
561    
562     /* A negative offset is a special case meaning "hold off going to this
563     (negated) state until the number of characters in the data field have
564     been skipped". */
565    
566     if (state_offset < 0)
567     {
568     if (current_state->data > 0)
569     {
570     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
571     ADD_NEW_DATA(state_offset, current_state->count,
572     current_state->data - 1);
573     continue;
574     }
575     else
576     {
577     current_state->offset = state_offset = -state_offset;
578     }
579     }
580    
581 ph10 439 /* Check for a duplicate state with the same count, and skip if found.
582     See the note at the head of this module about the possibility of improving
583     performance here. */
584 nigel 77
585     for (j = 0; j < i; j++)
586     {
587     if (active_states[j].offset == state_offset &&
588     active_states[j].count == current_state->count)
589     {
590     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
591     goto NEXT_ACTIVE_STATE;
592     }
593     }
594    
595     /* The state offset is the offset to the opcode */
596    
597     code = start_code + state_offset;
598     codevalue = *code;
599    
600     /* If this opcode is followed by an inline character, load it. It is
601     tempting to test for the presence of a subject character here, but that
602     is wrong, because sometimes zero repetitions of the subject are
603     permitted.
604    
605     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
606 ph10 178 argument that is not a data character - but is always one byte long. We
607     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
608     this case. To keep the other cases fast, convert these ones to new opcodes.
609     */
610 nigel 77
611     if (coptable[codevalue] > 0)
612     {
613     dlen = 1;
614     #ifdef SUPPORT_UTF8
615     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
616     #endif /* SUPPORT_UTF8 */
617     d = code[coptable[codevalue]];
618     if (codevalue >= OP_TYPESTAR)
619     {
620 nigel 93 switch(d)
621     {
622     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
623     case OP_NOTPROP:
624     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
625     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
626     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
627 ph10 178 case OP_NOT_HSPACE:
628 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
629 ph10 178 case OP_NOT_VSPACE:
630 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
631 nigel 93 default: break;
632     }
633 nigel 77 }
634     }
635     else
636     {
637     dlen = 0; /* Not strictly necessary, but compilers moan */
638 nigel 93 d = NOTACHAR; /* if these variables are not set. */
639 nigel 77 }
640    
641    
642     /* Now process the individual opcodes */
643    
644     switch (codevalue)
645     {
646    
647     /* ========================================================================== */
648     /* Reached a closing bracket. If not at the end of the pattern, carry
649     on with the next opcode. Otherwise, unless we have an empty string and
650     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
651     matches so we always have the longest first. */
652    
653     case OP_KET:
654     case OP_KETRMIN:
655     case OP_KETRMAX:
656     if (code != end_code)
657     {
658     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
659     if (codevalue != OP_KET)
660     {
661     ADD_ACTIVE(state_offset - GET(code, 1), 0);
662     }
663     }
664 ph10 428 else
665 nigel 77 {
666 ph10 428 reached_end++; /* Count branches that reach the end */
667     if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
668 nigel 77 {
669 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
670     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
671     match_count = 0;
672     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
673     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
674     if (offsetcount >= 2)
675     {
676     offsets[0] = current_subject - start_subject;
677     offsets[1] = ptr - start_subject;
678     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
679     offsets[1] - offsets[0], current_subject));
680     }
681     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
682     {
683     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
684     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
685     match_count, rlevel*2-2, SP));
686     return match_count;
687     }
688     }
689 nigel 77 }
690     break;
691    
692     /* ========================================================================== */
693     /* These opcodes add to the current list of states without looking
694     at the current character. */
695    
696     /*-----------------------------------------------------------------*/
697     case OP_ALT:
698     do { code += GET(code, 1); } while (*code == OP_ALT);
699     ADD_ACTIVE(code - start_code, 0);
700     break;
701    
702     /*-----------------------------------------------------------------*/
703     case OP_BRA:
704 nigel 93 case OP_SBRA:
705 nigel 77 do
706     {
707     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
708     code += GET(code, 1);
709     }
710     while (*code == OP_ALT);
711     break;
712    
713     /*-----------------------------------------------------------------*/
714 nigel 93 case OP_CBRA:
715     case OP_SCBRA:
716     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
717     code += GET(code, 1);
718     while (*code == OP_ALT)
719     {
720     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
721     code += GET(code, 1);
722     }
723     break;
724    
725     /*-----------------------------------------------------------------*/
726 nigel 77 case OP_BRAZERO:
727     case OP_BRAMINZERO:
728     ADD_ACTIVE(state_offset + 1, 0);
729     code += 1 + GET(code, 2);
730     while (*code == OP_ALT) code += GET(code, 1);
731     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
732     break;
733    
734     /*-----------------------------------------------------------------*/
735 ph10 335 case OP_SKIPZERO:
736     code += 1 + GET(code, 2);
737     while (*code == OP_ALT) code += GET(code, 1);
738     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
739     break;
740    
741     /*-----------------------------------------------------------------*/
742 nigel 77 case OP_CIRC:
743     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
744 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
745     ptr != end_subject &&
746 nigel 93 WAS_NEWLINE(ptr)))
747 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
748     break;
749    
750     /*-----------------------------------------------------------------*/
751     case OP_EOD:
752     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
753     break;
754    
755     /*-----------------------------------------------------------------*/
756     case OP_OPT:
757     ims = code[1];
758     ADD_ACTIVE(state_offset + 2, 0);
759     break;
760    
761     /*-----------------------------------------------------------------*/
762     case OP_SOD:
763     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
764     break;
765    
766     /*-----------------------------------------------------------------*/
767     case OP_SOM:
768     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
769     break;
770    
771    
772     /* ========================================================================== */
773     /* These opcodes inspect the next subject character, and sometimes
774     the previous one as well, but do not have an argument. The variable
775     clen contains the length of the current character and is zero if we are
776     at the end of the subject. */
777    
778     /*-----------------------------------------------------------------*/
779     case OP_ANY:
780 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
781 nigel 77 { ADD_NEW(state_offset + 1, 0); }
782     break;
783    
784     /*-----------------------------------------------------------------*/
785 ph10 341 case OP_ALLANY:
786     if (clen > 0)
787     { ADD_NEW(state_offset + 1, 0); }
788     break;
789    
790     /*-----------------------------------------------------------------*/
791 nigel 77 case OP_EODN:
792 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
793 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
794     break;
795    
796     /*-----------------------------------------------------------------*/
797     case OP_DOLL:
798     if ((md->moptions & PCRE_NOTEOL) == 0)
799     {
800 nigel 91 if (clen == 0 ||
801 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
802 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
803     ))
804 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
805     }
806 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
807 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
808     break;
809    
810     /*-----------------------------------------------------------------*/
811    
812     case OP_DIGIT:
813     case OP_WHITESPACE:
814     case OP_WORDCHAR:
815     if (clen > 0 && c < 256 &&
816     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
817     { ADD_NEW(state_offset + 1, 0); }
818     break;
819    
820     /*-----------------------------------------------------------------*/
821     case OP_NOT_DIGIT:
822     case OP_NOT_WHITESPACE:
823     case OP_NOT_WORDCHAR:
824     if (clen > 0 && (c >= 256 ||
825     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
826     { ADD_NEW(state_offset + 1, 0); }
827     break;
828    
829     /*-----------------------------------------------------------------*/
830     case OP_WORD_BOUNDARY:
831     case OP_NOT_WORD_BOUNDARY:
832     {
833     int left_word, right_word;
834    
835     if (ptr > start_subject)
836     {
837     const uschar *temp = ptr - 1;
838 ph10 435 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
839 nigel 77 #ifdef SUPPORT_UTF8
840     if (utf8) BACKCHAR(temp);
841     #endif
842     GETCHARTEST(d, temp);
843     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
844     }
845     else left_word = 0;
846    
847 ph10 428 if (clen > 0)
848     right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
849     else /* This is a fudge to ensure that if this is the */
850     { /* last item in the pattern, we don't count it as */
851     reached_end--; /* reached, thus disabling a partial match. */
852     right_word = 0;
853     }
854 nigel 77
855     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
856     { ADD_ACTIVE(state_offset + 1, 0); }
857     }
858     break;
859    
860    
861     /*-----------------------------------------------------------------*/
862     /* Check the next character by Unicode property. We will get here only
863     if the support is in the binary; otherwise a compile-time error occurs.
864     */
865    
866 ph10 151 #ifdef SUPPORT_UCP
867 nigel 77 case OP_PROP:
868     case OP_NOTPROP:
869     if (clen > 0)
870     {
871 nigel 87 BOOL OK;
872 ph10 349 const ucd_record * prop = GET_UCD(c);
873 nigel 87 switch(code[1])
874 nigel 77 {
875 nigel 87 case PT_ANY:
876     OK = TRUE;
877     break;
878    
879     case PT_LAMP:
880 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
881 nigel 87 break;
882    
883     case PT_GC:
884 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
885 nigel 87 break;
886    
887     case PT_PC:
888 ph10 349 OK = prop->chartype == code[2];
889 nigel 87 break;
890    
891     case PT_SC:
892 ph10 349 OK = prop->script == code[2];
893 nigel 87 break;
894    
895     /* Should never occur, but keep compilers from grumbling. */
896    
897     default:
898     OK = codevalue != OP_PROP;
899     break;
900 nigel 77 }
901 nigel 87
902     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
903 nigel 77 }
904     break;
905     #endif
906    
907    
908    
909     /* ========================================================================== */
910     /* These opcodes likewise inspect the subject character, but have an
911     argument that is not a data character. It is one of these opcodes:
912 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
913     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
914 nigel 77
915     case OP_TYPEPLUS:
916     case OP_TYPEMINPLUS:
917 nigel 93 case OP_TYPEPOSPLUS:
918 nigel 77 count = current_state->count; /* Already matched */
919     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
920     if (clen > 0)
921     {
922     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
923     (c < 256 &&
924 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
925 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
926     {
927 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
928     {
929     active_count--; /* Remove non-match possibility */
930     next_active_state--;
931     }
932 nigel 77 count++;
933     ADD_NEW(state_offset, count);
934     }
935     }
936     break;
937    
938     /*-----------------------------------------------------------------*/
939     case OP_TYPEQUERY:
940     case OP_TYPEMINQUERY:
941 nigel 93 case OP_TYPEPOSQUERY:
942 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
943     if (clen > 0)
944     {
945     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
946     (c < 256 &&
947 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
948 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
949     {
950 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
951     {
952     active_count--; /* Remove non-match possibility */
953     next_active_state--;
954     }
955 nigel 77 ADD_NEW(state_offset + 2, 0);
956     }
957     }
958     break;
959    
960     /*-----------------------------------------------------------------*/
961     case OP_TYPESTAR:
962     case OP_TYPEMINSTAR:
963 nigel 93 case OP_TYPEPOSSTAR:
964 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
965     if (clen > 0)
966     {
967     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
968     (c < 256 &&
969 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
970 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
971     {
972 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
973     {
974     active_count--; /* Remove non-match possibility */
975     next_active_state--;
976     }
977 nigel 77 ADD_NEW(state_offset, 0);
978     }
979     }
980     break;
981    
982     /*-----------------------------------------------------------------*/
983     case OP_TYPEEXACT:
984 nigel 93 count = current_state->count; /* Number already matched */
985     if (clen > 0)
986     {
987     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
988     (c < 256 &&
989 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
990 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
991     {
992     if (++count >= GET2(code, 1))
993     { ADD_NEW(state_offset + 4, 0); }
994     else
995     { ADD_NEW(state_offset, count); }
996     }
997     }
998     break;
999    
1000     /*-----------------------------------------------------------------*/
1001 nigel 77 case OP_TYPEUPTO:
1002     case OP_TYPEMINUPTO:
1003 nigel 93 case OP_TYPEPOSUPTO:
1004     ADD_ACTIVE(state_offset + 4, 0);
1005 nigel 77 count = current_state->count; /* Number already matched */
1006     if (clen > 0)
1007     {
1008     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1009     (c < 256 &&
1010 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1011 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1012     {
1013 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1014     {
1015     active_count--; /* Remove non-match possibility */
1016     next_active_state--;
1017     }
1018 nigel 77 if (++count >= GET2(code, 1))
1019     { ADD_NEW(state_offset + 4, 0); }
1020     else
1021     { ADD_NEW(state_offset, count); }
1022     }
1023     }
1024     break;
1025    
1026     /* ========================================================================== */
1027     /* These are virtual opcodes that are used when something like
1028 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_[NOT_]VSPACE, or OP_[NOT_]HSPACE as its
1029     argument. It keeps the code above fast for the other cases. The argument
1030     is in the d variable. */
1031 nigel 77
1032 ph10 151 #ifdef SUPPORT_UCP
1033 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1034     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1035 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1036 nigel 77 count = current_state->count; /* Already matched */
1037 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1038 nigel 77 if (clen > 0)
1039     {
1040 nigel 87 BOOL OK;
1041 ph10 349 const ucd_record * prop = GET_UCD(c);
1042 nigel 87 switch(code[2])
1043     {
1044     case PT_ANY:
1045     OK = TRUE;
1046     break;
1047    
1048     case PT_LAMP:
1049 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1050 nigel 87 break;
1051    
1052     case PT_GC:
1053 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1054 nigel 87 break;
1055    
1056     case PT_PC:
1057 ph10 349 OK = prop->chartype == code[3];
1058 nigel 87 break;
1059    
1060     case PT_SC:
1061 ph10 349 OK = prop->script == code[3];
1062 nigel 87 break;
1063    
1064     /* Should never occur, but keep compilers from grumbling. */
1065    
1066     default:
1067     OK = codevalue != OP_PROP;
1068     break;
1069     }
1070    
1071 nigel 93 if (OK == (d == OP_PROP))
1072     {
1073     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1074     {
1075     active_count--; /* Remove non-match possibility */
1076     next_active_state--;
1077     }
1078     count++;
1079     ADD_NEW(state_offset, count);
1080     }
1081 nigel 77 }
1082     break;
1083    
1084     /*-----------------------------------------------------------------*/
1085     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1086     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1087 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1088 nigel 77 count = current_state->count; /* Already matched */
1089     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1090 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1091 nigel 77 {
1092     const uschar *nptr = ptr + clen;
1093     int ncount = 0;
1094 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1095     {
1096     active_count--; /* Remove non-match possibility */
1097     next_active_state--;
1098     }
1099 nigel 77 while (nptr < end_subject)
1100     {
1101     int nd;
1102     int ndlen = 1;
1103     GETCHARLEN(nd, nptr, ndlen);
1104 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1105 nigel 77 ncount++;
1106     nptr += ndlen;
1107     }
1108     count++;
1109     ADD_NEW_DATA(-state_offset, count, ncount);
1110     }
1111     break;
1112 ph10 151 #endif
1113 nigel 77
1114     /*-----------------------------------------------------------------*/
1115 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1116     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1117     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1118     count = current_state->count; /* Already matched */
1119     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1120     if (clen > 0)
1121     {
1122     int ncount = 0;
1123     switch (c)
1124     {
1125     case 0x000b:
1126     case 0x000c:
1127     case 0x0085:
1128     case 0x2028:
1129     case 0x2029:
1130 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1131     goto ANYNL01;
1132    
1133     case 0x000d:
1134     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1135     /* Fall through */
1136    
1137     ANYNL01:
1138     case 0x000a:
1139 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1140     {
1141     active_count--; /* Remove non-match possibility */
1142     next_active_state--;
1143     }
1144     count++;
1145     ADD_NEW_DATA(-state_offset, count, ncount);
1146     break;
1147 ph10 231
1148 nigel 93 default:
1149     break;
1150     }
1151     }
1152     break;
1153    
1154     /*-----------------------------------------------------------------*/
1155 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1156     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1157     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1158     count = current_state->count; /* Already matched */
1159     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1160     if (clen > 0)
1161     {
1162 ph10 182 BOOL OK;
1163 ph10 178 switch (c)
1164     {
1165     case 0x000a:
1166     case 0x000b:
1167     case 0x000c:
1168     case 0x000d:
1169     case 0x0085:
1170     case 0x2028:
1171     case 0x2029:
1172     OK = TRUE;
1173 ph10 182 break;
1174 ph10 178
1175     default:
1176     OK = FALSE;
1177 ph10 182 break;
1178 ph10 178 }
1179    
1180     if (OK == (d == OP_VSPACE))
1181 ph10 182 {
1182 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1183     {
1184     active_count--; /* Remove non-match possibility */
1185     next_active_state--;
1186     }
1187     count++;
1188     ADD_NEW_DATA(-state_offset, count, 0);
1189     }
1190     }
1191     break;
1192    
1193     /*-----------------------------------------------------------------*/
1194     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1195     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1196     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1197     count = current_state->count; /* Already matched */
1198     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1199     if (clen > 0)
1200     {
1201 ph10 182 BOOL OK;
1202 ph10 178 switch (c)
1203     {
1204     case 0x09: /* HT */
1205     case 0x20: /* SPACE */
1206     case 0xa0: /* NBSP */
1207     case 0x1680: /* OGHAM SPACE MARK */
1208     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1209     case 0x2000: /* EN QUAD */
1210     case 0x2001: /* EM QUAD */
1211     case 0x2002: /* EN SPACE */
1212     case 0x2003: /* EM SPACE */
1213     case 0x2004: /* THREE-PER-EM SPACE */
1214     case 0x2005: /* FOUR-PER-EM SPACE */
1215     case 0x2006: /* SIX-PER-EM SPACE */
1216     case 0x2007: /* FIGURE SPACE */
1217     case 0x2008: /* PUNCTUATION SPACE */
1218     case 0x2009: /* THIN SPACE */
1219     case 0x200A: /* HAIR SPACE */
1220     case 0x202f: /* NARROW NO-BREAK SPACE */
1221     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1222     case 0x3000: /* IDEOGRAPHIC SPACE */
1223     OK = TRUE;
1224     break;
1225 ph10 182
1226 ph10 178 default:
1227     OK = FALSE;
1228     break;
1229     }
1230 ph10 182
1231 ph10 178 if (OK == (d == OP_HSPACE))
1232 ph10 182 {
1233 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1234     {
1235     active_count--; /* Remove non-match possibility */
1236     next_active_state--;
1237     }
1238     count++;
1239     ADD_NEW_DATA(-state_offset, count, 0);
1240     }
1241     }
1242     break;
1243    
1244     /*-----------------------------------------------------------------*/
1245 ph10 151 #ifdef SUPPORT_UCP
1246 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1247     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1248 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1249 nigel 87 count = 4;
1250 nigel 77 goto QS1;
1251    
1252     case OP_PROP_EXTRA + OP_TYPESTAR:
1253     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1254 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1255 nigel 77 count = 0;
1256    
1257     QS1:
1258    
1259 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1260 nigel 77 if (clen > 0)
1261     {
1262 nigel 87 BOOL OK;
1263 ph10 349 const ucd_record * prop = GET_UCD(c);
1264 nigel 87 switch(code[2])
1265     {
1266     case PT_ANY:
1267     OK = TRUE;
1268     break;
1269    
1270     case PT_LAMP:
1271 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1272 nigel 87 break;
1273    
1274     case PT_GC:
1275 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1276 nigel 87 break;
1277    
1278     case PT_PC:
1279 ph10 349 OK = prop->chartype == code[3];
1280 nigel 87 break;
1281    
1282     case PT_SC:
1283 ph10 349 OK = prop->script == code[3];
1284 nigel 87 break;
1285    
1286     /* Should never occur, but keep compilers from grumbling. */
1287    
1288     default:
1289     OK = codevalue != OP_PROP;
1290     break;
1291     }
1292    
1293 nigel 93 if (OK == (d == OP_PROP))
1294     {
1295     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1296     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1297     {
1298     active_count--; /* Remove non-match possibility */
1299     next_active_state--;
1300     }
1301     ADD_NEW(state_offset + count, 0);
1302     }
1303 nigel 77 }
1304     break;
1305    
1306     /*-----------------------------------------------------------------*/
1307     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1308     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1309 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1310 nigel 77 count = 2;
1311     goto QS2;
1312    
1313     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1314     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1315 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1316 nigel 77 count = 0;
1317    
1318     QS2:
1319    
1320     ADD_ACTIVE(state_offset + 2, 0);
1321 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1322 nigel 77 {
1323     const uschar *nptr = ptr + clen;
1324     int ncount = 0;
1325 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1326     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1327     {
1328     active_count--; /* Remove non-match possibility */
1329     next_active_state--;
1330     }
1331 nigel 77 while (nptr < end_subject)
1332     {
1333     int nd;
1334     int ndlen = 1;
1335     GETCHARLEN(nd, nptr, ndlen);
1336 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1337 nigel 77 ncount++;
1338     nptr += ndlen;
1339     }
1340     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1341     }
1342     break;
1343 ph10 151 #endif
1344 nigel 77
1345     /*-----------------------------------------------------------------*/
1346 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1347     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1348     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1349     count = 2;
1350     goto QS3;
1351    
1352     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1353     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1354     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1355     count = 0;
1356    
1357     QS3:
1358     ADD_ACTIVE(state_offset + 2, 0);
1359     if (clen > 0)
1360     {
1361     int ncount = 0;
1362     switch (c)
1363     {
1364     case 0x000b:
1365     case 0x000c:
1366     case 0x0085:
1367     case 0x2028:
1368     case 0x2029:
1369 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1370     goto ANYNL02;
1371    
1372     case 0x000d:
1373     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1374     /* Fall through */
1375    
1376     ANYNL02:
1377     case 0x000a:
1378 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1379     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1380     {
1381     active_count--; /* Remove non-match possibility */
1382     next_active_state--;
1383     }
1384     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1385     break;
1386 ph10 231
1387 nigel 93 default:
1388     break;
1389     }
1390     }
1391     break;
1392    
1393     /*-----------------------------------------------------------------*/
1394 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1395     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1396     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1397     count = 2;
1398     goto QS4;
1399    
1400     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1401     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1402     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1403     count = 0;
1404    
1405     QS4:
1406     ADD_ACTIVE(state_offset + 2, 0);
1407     if (clen > 0)
1408     {
1409 ph10 182 BOOL OK;
1410 ph10 178 switch (c)
1411     {
1412     case 0x000a:
1413     case 0x000b:
1414     case 0x000c:
1415     case 0x000d:
1416     case 0x0085:
1417     case 0x2028:
1418     case 0x2029:
1419     OK = TRUE;
1420     break;
1421 ph10 182
1422 ph10 178 default:
1423     OK = FALSE;
1424     break;
1425     }
1426     if (OK == (d == OP_VSPACE))
1427 ph10 182 {
1428 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1429     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1430     {
1431     active_count--; /* Remove non-match possibility */
1432     next_active_state--;
1433     }
1434     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1435     }
1436     }
1437     break;
1438    
1439     /*-----------------------------------------------------------------*/
1440     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1441     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1442     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1443     count = 2;
1444     goto QS5;
1445    
1446     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1447     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1448     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1449     count = 0;
1450    
1451     QS5:
1452     ADD_ACTIVE(state_offset + 2, 0);
1453     if (clen > 0)
1454     {
1455 ph10 182 BOOL OK;
1456 ph10 178 switch (c)
1457     {
1458     case 0x09: /* HT */
1459     case 0x20: /* SPACE */
1460     case 0xa0: /* NBSP */
1461     case 0x1680: /* OGHAM SPACE MARK */
1462     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1463     case 0x2000: /* EN QUAD */
1464     case 0x2001: /* EM QUAD */
1465     case 0x2002: /* EN SPACE */
1466     case 0x2003: /* EM SPACE */
1467     case 0x2004: /* THREE-PER-EM SPACE */
1468     case 0x2005: /* FOUR-PER-EM SPACE */
1469     case 0x2006: /* SIX-PER-EM SPACE */
1470     case 0x2007: /* FIGURE SPACE */
1471     case 0x2008: /* PUNCTUATION SPACE */
1472     case 0x2009: /* THIN SPACE */
1473     case 0x200A: /* HAIR SPACE */
1474     case 0x202f: /* NARROW NO-BREAK SPACE */
1475     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1476     case 0x3000: /* IDEOGRAPHIC SPACE */
1477     OK = TRUE;
1478     break;
1479 ph10 182
1480 ph10 178 default:
1481     OK = FALSE;
1482     break;
1483     }
1484 ph10 182
1485 ph10 178 if (OK == (d == OP_HSPACE))
1486 ph10 182 {
1487 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1488     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1489     {
1490     active_count--; /* Remove non-match possibility */
1491     next_active_state--;
1492     }
1493     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1494     }
1495     }
1496     break;
1497    
1498     /*-----------------------------------------------------------------*/
1499 ph10 151 #ifdef SUPPORT_UCP
1500 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1501     case OP_PROP_EXTRA + OP_TYPEUPTO:
1502     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1503 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1504 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1505 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1506 nigel 77 count = current_state->count; /* Number already matched */
1507     if (clen > 0)
1508     {
1509 nigel 87 BOOL OK;
1510 ph10 349 const ucd_record * prop = GET_UCD(c);
1511 nigel 87 switch(code[4])
1512 nigel 77 {
1513 nigel 87 case PT_ANY:
1514     OK = TRUE;
1515     break;
1516    
1517     case PT_LAMP:
1518 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1519 nigel 87 break;
1520    
1521     case PT_GC:
1522 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1523 nigel 87 break;
1524    
1525     case PT_PC:
1526 ph10 349 OK = prop->chartype == code[5];
1527 nigel 87 break;
1528    
1529     case PT_SC:
1530 ph10 349 OK = prop->script == code[5];
1531 nigel 87 break;
1532    
1533     /* Should never occur, but keep compilers from grumbling. */
1534    
1535     default:
1536     OK = codevalue != OP_PROP;
1537     break;
1538     }
1539    
1540     if (OK == (d == OP_PROP))
1541     {
1542 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1543     {
1544     active_count--; /* Remove non-match possibility */
1545     next_active_state--;
1546     }
1547 nigel 77 if (++count >= GET2(code, 1))
1548 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1549 nigel 77 else
1550     { ADD_NEW(state_offset, count); }
1551     }
1552     }
1553     break;
1554    
1555     /*-----------------------------------------------------------------*/
1556     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1557     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1558     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1559 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1560 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1561     { ADD_ACTIVE(state_offset + 4, 0); }
1562     count = current_state->count; /* Number already matched */
1563 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1564 nigel 77 {
1565     const uschar *nptr = ptr + clen;
1566     int ncount = 0;
1567 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1568     {
1569     active_count--; /* Remove non-match possibility */
1570     next_active_state--;
1571     }
1572 nigel 77 while (nptr < end_subject)
1573     {
1574     int nd;
1575     int ndlen = 1;
1576     GETCHARLEN(nd, nptr, ndlen);
1577 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1578 nigel 77 ncount++;
1579     nptr += ndlen;
1580     }
1581     if (++count >= GET2(code, 1))
1582     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1583     else
1584     { ADD_NEW_DATA(-state_offset, count, ncount); }
1585     }
1586     break;
1587 ph10 151 #endif
1588 nigel 77
1589 nigel 93 /*-----------------------------------------------------------------*/
1590     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1591     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1592     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1593     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1594     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1595     { ADD_ACTIVE(state_offset + 4, 0); }
1596     count = current_state->count; /* Number already matched */
1597     if (clen > 0)
1598     {
1599     int ncount = 0;
1600     switch (c)
1601     {
1602     case 0x000b:
1603     case 0x000c:
1604     case 0x0085:
1605     case 0x2028:
1606     case 0x2029:
1607 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1608     goto ANYNL03;
1609    
1610     case 0x000d:
1611     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1612     /* Fall through */
1613    
1614     ANYNL03:
1615     case 0x000a:
1616 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1617     {
1618     active_count--; /* Remove non-match possibility */
1619     next_active_state--;
1620     }
1621     if (++count >= GET2(code, 1))
1622     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1623     else
1624     { ADD_NEW_DATA(-state_offset, count, ncount); }
1625     break;
1626 ph10 231
1627 nigel 93 default:
1628     break;
1629     }
1630     }
1631     break;
1632    
1633 ph10 178 /*-----------------------------------------------------------------*/
1634     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1635     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1636     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1637     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1638     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1639     { ADD_ACTIVE(state_offset + 4, 0); }
1640     count = current_state->count; /* Number already matched */
1641     if (clen > 0)
1642     {
1643 ph10 182 BOOL OK;
1644 ph10 178 switch (c)
1645     {
1646     case 0x000a:
1647     case 0x000b:
1648     case 0x000c:
1649     case 0x000d:
1650     case 0x0085:
1651     case 0x2028:
1652     case 0x2029:
1653     OK = TRUE;
1654     break;
1655 ph10 182
1656 ph10 178 default:
1657     OK = FALSE;
1658     }
1659 ph10 182
1660 ph10 178 if (OK == (d == OP_VSPACE))
1661 ph10 182 {
1662 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1663     {
1664     active_count--; /* Remove non-match possibility */
1665     next_active_state--;
1666     }
1667     if (++count >= GET2(code, 1))
1668     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1669     else
1670     { ADD_NEW_DATA(-state_offset, count, 0); }
1671     }
1672     }
1673     break;
1674    
1675     /*-----------------------------------------------------------------*/
1676     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1677     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1678     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1679     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1680     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1681     { ADD_ACTIVE(state_offset + 4, 0); }
1682     count = current_state->count; /* Number already matched */
1683     if (clen > 0)
1684     {
1685 ph10 182 BOOL OK;
1686 ph10 178 switch (c)
1687     {
1688     case 0x09: /* HT */
1689     case 0x20: /* SPACE */
1690     case 0xa0: /* NBSP */
1691     case 0x1680: /* OGHAM SPACE MARK */
1692     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1693     case 0x2000: /* EN QUAD */
1694     case 0x2001: /* EM QUAD */
1695     case 0x2002: /* EN SPACE */
1696     case 0x2003: /* EM SPACE */
1697     case 0x2004: /* THREE-PER-EM SPACE */
1698     case 0x2005: /* FOUR-PER-EM SPACE */
1699     case 0x2006: /* SIX-PER-EM SPACE */
1700     case 0x2007: /* FIGURE SPACE */
1701     case 0x2008: /* PUNCTUATION SPACE */
1702     case 0x2009: /* THIN SPACE */
1703     case 0x200A: /* HAIR SPACE */
1704     case 0x202f: /* NARROW NO-BREAK SPACE */
1705     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1706     case 0x3000: /* IDEOGRAPHIC SPACE */
1707     OK = TRUE;
1708     break;
1709 ph10 182
1710 ph10 178 default:
1711     OK = FALSE;
1712     break;
1713     }
1714 ph10 182
1715 ph10 178 if (OK == (d == OP_HSPACE))
1716 ph10 182 {
1717 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1718     {
1719     active_count--; /* Remove non-match possibility */
1720     next_active_state--;
1721     }
1722     if (++count >= GET2(code, 1))
1723     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1724     else
1725     { ADD_NEW_DATA(-state_offset, count, 0); }
1726     }
1727     }
1728     break;
1729    
1730 nigel 77 /* ========================================================================== */
1731     /* These opcodes are followed by a character that is usually compared
1732     to the current subject character; it is loaded into d. We still get
1733     here even if there is no subject character, because in some cases zero
1734     repetitions are permitted. */
1735    
1736     /*-----------------------------------------------------------------*/
1737     case OP_CHAR:
1738     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1739     break;
1740    
1741     /*-----------------------------------------------------------------*/
1742     case OP_CHARNC:
1743     if (clen == 0) break;
1744    
1745     #ifdef SUPPORT_UTF8
1746     if (utf8)
1747     {
1748     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1749     {
1750 nigel 93 unsigned int othercase;
1751 nigel 77 if (c < 128) othercase = fcc[c]; else
1752    
1753     /* If we have Unicode property support, we can use it to test the
1754 nigel 87 other case of the character. */
1755 nigel 77
1756     #ifdef SUPPORT_UCP
1757 ph10 349 othercase = UCD_OTHERCASE(c);
1758 nigel 87 #else
1759 nigel 93 othercase = NOTACHAR;
1760 nigel 77 #endif
1761    
1762     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1763     }
1764     }
1765     else
1766     #endif /* SUPPORT_UTF8 */
1767    
1768     /* Non-UTF-8 mode */
1769     {
1770     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1771     }
1772     break;
1773    
1774    
1775     #ifdef SUPPORT_UCP
1776     /*-----------------------------------------------------------------*/
1777     /* This is a tricky one because it can match more than one character.
1778     Find out how many characters to skip, and then set up a negative state
1779     to wait for them to pass before continuing. */
1780    
1781     case OP_EXTUNI:
1782 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1783 nigel 77 {
1784     const uschar *nptr = ptr + clen;
1785     int ncount = 0;
1786     while (nptr < end_subject)
1787     {
1788     int nclen = 1;
1789     GETCHARLEN(c, nptr, nclen);
1790 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1791 nigel 77 ncount++;
1792     nptr += nclen;
1793     }
1794     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1795     }
1796     break;
1797     #endif
1798    
1799     /*-----------------------------------------------------------------*/
1800 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1801     character (when CR is followed by LF). In this case, set up a negative
1802     state to wait for one character to pass before continuing. */
1803    
1804     case OP_ANYNL:
1805     if (clen > 0) switch(c)
1806     {
1807     case 0x000b:
1808     case 0x000c:
1809     case 0x0085:
1810     case 0x2028:
1811     case 0x2029:
1812 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1813    
1814     case 0x000a:
1815 nigel 93 ADD_NEW(state_offset + 1, 0);
1816     break;
1817 ph10 231
1818 nigel 93 case 0x000d:
1819     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1820     {
1821     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1822     }
1823     else
1824     {
1825     ADD_NEW(state_offset + 1, 0);
1826     }
1827     break;
1828     }
1829     break;
1830    
1831     /*-----------------------------------------------------------------*/
1832 ph10 178 case OP_NOT_VSPACE:
1833     if (clen > 0) switch(c)
1834     {
1835     case 0x000a:
1836     case 0x000b:
1837     case 0x000c:
1838     case 0x000d:
1839     case 0x0085:
1840     case 0x2028:
1841     case 0x2029:
1842     break;
1843 ph10 182
1844     default:
1845 ph10 178 ADD_NEW(state_offset + 1, 0);
1846     break;
1847     }
1848     break;
1849    
1850     /*-----------------------------------------------------------------*/
1851     case OP_VSPACE:
1852     if (clen > 0) switch(c)
1853     {
1854     case 0x000a:
1855     case 0x000b:
1856     case 0x000c:
1857     case 0x000d:
1858     case 0x0085:
1859     case 0x2028:
1860     case 0x2029:
1861     ADD_NEW(state_offset + 1, 0);
1862     break;
1863 ph10 182
1864 ph10 178 default: break;
1865     }
1866     break;
1867    
1868     /*-----------------------------------------------------------------*/
1869     case OP_NOT_HSPACE:
1870     if (clen > 0) switch(c)
1871     {
1872     case 0x09: /* HT */
1873     case 0x20: /* SPACE */
1874     case 0xa0: /* NBSP */
1875     case 0x1680: /* OGHAM SPACE MARK */
1876     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1877     case 0x2000: /* EN QUAD */
1878     case 0x2001: /* EM QUAD */
1879     case 0x2002: /* EN SPACE */
1880     case 0x2003: /* EM SPACE */
1881     case 0x2004: /* THREE-PER-EM SPACE */
1882     case 0x2005: /* FOUR-PER-EM SPACE */
1883     case 0x2006: /* SIX-PER-EM SPACE */
1884     case 0x2007: /* FIGURE SPACE */
1885     case 0x2008: /* PUNCTUATION SPACE */
1886     case 0x2009: /* THIN SPACE */
1887     case 0x200A: /* HAIR SPACE */
1888     case 0x202f: /* NARROW NO-BREAK SPACE */
1889     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1890     case 0x3000: /* IDEOGRAPHIC SPACE */
1891     break;
1892 ph10 182
1893     default:
1894 ph10 178 ADD_NEW(state_offset + 1, 0);
1895     break;
1896     }
1897     break;
1898    
1899     /*-----------------------------------------------------------------*/
1900     case OP_HSPACE:
1901     if (clen > 0) switch(c)
1902     {
1903     case 0x09: /* HT */
1904     case 0x20: /* SPACE */
1905     case 0xa0: /* NBSP */
1906     case 0x1680: /* OGHAM SPACE MARK */
1907     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1908     case 0x2000: /* EN QUAD */
1909     case 0x2001: /* EM QUAD */
1910     case 0x2002: /* EN SPACE */
1911     case 0x2003: /* EM SPACE */
1912     case 0x2004: /* THREE-PER-EM SPACE */
1913     case 0x2005: /* FOUR-PER-EM SPACE */
1914     case 0x2006: /* SIX-PER-EM SPACE */
1915     case 0x2007: /* FIGURE SPACE */
1916     case 0x2008: /* PUNCTUATION SPACE */
1917     case 0x2009: /* THIN SPACE */
1918     case 0x200A: /* HAIR SPACE */
1919     case 0x202f: /* NARROW NO-BREAK SPACE */
1920     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1921     case 0x3000: /* IDEOGRAPHIC SPACE */
1922     ADD_NEW(state_offset + 1, 0);
1923     break;
1924     }
1925     break;
1926    
1927     /*-----------------------------------------------------------------*/
1928 nigel 77 /* Match a negated single character. This is only used for one-byte
1929     characters, that is, we know that d < 256. The character we are
1930     checking (c) can be multibyte. */
1931    
1932     case OP_NOT:
1933     if (clen > 0)
1934     {
1935 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1936 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1937     }
1938     break;
1939    
1940     /*-----------------------------------------------------------------*/
1941     case OP_PLUS:
1942     case OP_MINPLUS:
1943 nigel 93 case OP_POSPLUS:
1944 nigel 77 case OP_NOTPLUS:
1945     case OP_NOTMINPLUS:
1946 nigel 93 case OP_NOTPOSPLUS:
1947 nigel 77 count = current_state->count; /* Already matched */
1948     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1949     if (clen > 0)
1950     {
1951 nigel 93 unsigned int otherd = NOTACHAR;
1952 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1953     {
1954     #ifdef SUPPORT_UTF8
1955 nigel 87 if (utf8 && d >= 128)
1956 nigel 77 {
1957     #ifdef SUPPORT_UCP
1958 ph10 349 otherd = UCD_OTHERCASE(d);
1959 nigel 77 #endif /* SUPPORT_UCP */
1960     }
1961     else
1962     #endif /* SUPPORT_UTF8 */
1963     otherd = fcc[d];
1964     }
1965     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1966 nigel 93 {
1967     if (count > 0 &&
1968     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1969     {
1970     active_count--; /* Remove non-match possibility */
1971     next_active_state--;
1972     }
1973     count++;
1974     ADD_NEW(state_offset, count);
1975     }
1976 nigel 77 }
1977     break;
1978    
1979     /*-----------------------------------------------------------------*/
1980     case OP_QUERY:
1981     case OP_MINQUERY:
1982 nigel 93 case OP_POSQUERY:
1983 nigel 77 case OP_NOTQUERY:
1984     case OP_NOTMINQUERY:
1985 nigel 93 case OP_NOTPOSQUERY:
1986 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1987     if (clen > 0)
1988     {
1989 nigel 93 unsigned int otherd = NOTACHAR;
1990 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1991 nigel 77 {
1992     #ifdef SUPPORT_UTF8
1993 nigel 87 if (utf8 && d >= 128)
1994 nigel 77 {
1995     #ifdef SUPPORT_UCP
1996 ph10 349 otherd = UCD_OTHERCASE(d);
1997 nigel 77 #endif /* SUPPORT_UCP */
1998     }
1999     else
2000     #endif /* SUPPORT_UTF8 */
2001     otherd = fcc[d];
2002     }
2003     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2004 nigel 93 {
2005     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2006     {
2007     active_count--; /* Remove non-match possibility */
2008     next_active_state--;
2009     }
2010     ADD_NEW(state_offset + dlen + 1, 0);
2011     }
2012 nigel 77 }
2013     break;
2014    
2015     /*-----------------------------------------------------------------*/
2016     case OP_STAR:
2017     case OP_MINSTAR:
2018 nigel 93 case OP_POSSTAR:
2019 nigel 77 case OP_NOTSTAR:
2020     case OP_NOTMINSTAR:
2021 nigel 93 case OP_NOTPOSSTAR:
2022 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2023     if (clen > 0)
2024     {
2025 nigel 93 unsigned int otherd = NOTACHAR;
2026 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2027 nigel 77 {
2028     #ifdef SUPPORT_UTF8
2029 nigel 87 if (utf8 && d >= 128)
2030 nigel 77 {
2031     #ifdef SUPPORT_UCP
2032 ph10 349 otherd = UCD_OTHERCASE(d);
2033 nigel 77 #endif /* SUPPORT_UCP */
2034     }
2035     else
2036     #endif /* SUPPORT_UTF8 */
2037     otherd = fcc[d];
2038     }
2039     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2040 nigel 93 {
2041     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2042     {
2043     active_count--; /* Remove non-match possibility */
2044     next_active_state--;
2045     }
2046     ADD_NEW(state_offset, 0);
2047     }
2048 nigel 77 }
2049     break;
2050    
2051     /*-----------------------------------------------------------------*/
2052     case OP_EXACT:
2053 nigel 93 case OP_NOTEXACT:
2054     count = current_state->count; /* Number already matched */
2055     if (clen > 0)
2056     {
2057     unsigned int otherd = NOTACHAR;
2058     if ((ims & PCRE_CASELESS) != 0)
2059     {
2060     #ifdef SUPPORT_UTF8
2061     if (utf8 && d >= 128)
2062     {
2063     #ifdef SUPPORT_UCP
2064 ph10 349 otherd = UCD_OTHERCASE(d);
2065 nigel 93 #endif /* SUPPORT_UCP */
2066     }
2067     else
2068     #endif /* SUPPORT_UTF8 */
2069     otherd = fcc[d];
2070     }
2071     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2072     {
2073     if (++count >= GET2(code, 1))
2074     { ADD_NEW(state_offset + dlen + 3, 0); }
2075     else
2076     { ADD_NEW(state_offset, count); }
2077     }
2078     }
2079     break;
2080    
2081     /*-----------------------------------------------------------------*/
2082 nigel 77 case OP_UPTO:
2083     case OP_MINUPTO:
2084 nigel 93 case OP_POSUPTO:
2085 nigel 77 case OP_NOTUPTO:
2086     case OP_NOTMINUPTO:
2087 nigel 93 case OP_NOTPOSUPTO:
2088     ADD_ACTIVE(state_offset + dlen + 3, 0);
2089 nigel 77 count = current_state->count; /* Number already matched */
2090     if (clen > 0)
2091     {
2092 nigel 93 unsigned int otherd = NOTACHAR;
2093 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2094     {
2095     #ifdef SUPPORT_UTF8
2096 nigel 87 if (utf8 && d >= 128)
2097 nigel 77 {
2098     #ifdef SUPPORT_UCP
2099 ph10 349 otherd = UCD_OTHERCASE(d);
2100 nigel 77 #endif /* SUPPORT_UCP */
2101     }
2102     else
2103     #endif /* SUPPORT_UTF8 */
2104     otherd = fcc[d];
2105     }
2106     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2107     {
2108 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2109     {
2110     active_count--; /* Remove non-match possibility */
2111     next_active_state--;
2112     }
2113 nigel 77 if (++count >= GET2(code, 1))
2114     { ADD_NEW(state_offset + dlen + 3, 0); }
2115     else
2116     { ADD_NEW(state_offset, count); }
2117     }
2118     }
2119     break;
2120    
2121    
2122     /* ========================================================================== */
2123     /* These are the class-handling opcodes */
2124    
2125     case OP_CLASS:
2126     case OP_NCLASS:
2127     case OP_XCLASS:
2128     {
2129     BOOL isinclass = FALSE;
2130     int next_state_offset;
2131     const uschar *ecode;
2132    
2133     /* For a simple class, there is always just a 32-byte table, and we
2134     can set isinclass from it. */
2135    
2136     if (codevalue != OP_XCLASS)
2137     {
2138     ecode = code + 33;
2139     if (clen > 0)
2140     {
2141     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2142     ((code[1 + c/8] & (1 << (c&7))) != 0);
2143     }
2144     }
2145    
2146     /* An extended class may have a table or a list of single characters,
2147     ranges, or both, and it may be positive or negative. There's a
2148     function that sorts all this out. */
2149    
2150     else
2151     {
2152     ecode = code + GET(code, 1);
2153     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2154     }
2155    
2156     /* At this point, isinclass is set for all kinds of class, and ecode
2157     points to the byte after the end of the class. If there is a
2158     quantifier, this is where it will be. */
2159    
2160     next_state_offset = ecode - start_code;
2161    
2162     switch (*ecode)
2163     {
2164     case OP_CRSTAR:
2165     case OP_CRMINSTAR:
2166     ADD_ACTIVE(next_state_offset + 1, 0);
2167     if (isinclass) { ADD_NEW(state_offset, 0); }
2168     break;
2169    
2170     case OP_CRPLUS:
2171     case OP_CRMINPLUS:
2172     count = current_state->count; /* Already matched */
2173     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2174     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2175     break;
2176    
2177     case OP_CRQUERY:
2178     case OP_CRMINQUERY:
2179     ADD_ACTIVE(next_state_offset + 1, 0);
2180     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2181     break;
2182    
2183     case OP_CRRANGE:
2184     case OP_CRMINRANGE:
2185     count = current_state->count; /* Already matched */
2186     if (count >= GET2(ecode, 1))
2187     { ADD_ACTIVE(next_state_offset + 5, 0); }
2188     if (isinclass)
2189     {
2190 nigel 91 int max = GET2(ecode, 3);
2191     if (++count >= max && max != 0) /* Max 0 => no limit */
2192 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2193     else
2194     { ADD_NEW(state_offset, count); }
2195     }
2196     break;
2197    
2198     default:
2199     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2200     break;
2201     }
2202     }
2203     break;
2204    
2205     /* ========================================================================== */
2206     /* These are the opcodes for fancy brackets of various kinds. We have
2207 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2208     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2209 ph10 341 though the other "backtracking verbs" are not supported. */
2210 ph10 345
2211 ph10 341 case OP_FAIL:
2212 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2213 ph10 345 break;
2214 nigel 77
2215     case OP_ASSERT:
2216     case OP_ASSERT_NOT:
2217     case OP_ASSERTBACK:
2218     case OP_ASSERTBACK_NOT:
2219     {
2220     int rc;
2221     int local_offsets[2];
2222     int local_workspace[1000];
2223     const uschar *endasscode = code + GET(code, 1);
2224    
2225     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2226    
2227     rc = internal_dfa_exec(
2228     md, /* static match data */
2229     code, /* this subexpression's code */
2230     ptr, /* where we currently are */
2231     ptr - start_subject, /* start offset */
2232     local_offsets, /* offset vector */
2233     sizeof(local_offsets)/sizeof(int), /* size of same */
2234     local_workspace, /* workspace vector */
2235     sizeof(local_workspace)/sizeof(int), /* size of same */
2236     ims, /* the current ims flags */
2237     rlevel, /* function recursion level */
2238     recursing); /* pass on regex recursion */
2239    
2240     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2241     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2242     }
2243     break;
2244    
2245     /*-----------------------------------------------------------------*/
2246     case OP_COND:
2247 nigel 93 case OP_SCOND:
2248 nigel 77 {
2249     int local_offsets[1000];
2250     int local_workspace[1000];
2251 ph10 406 int codelink = GET(code, 1);
2252 ph10 397 int condcode;
2253 ph10 406
2254 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2255 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2256 ph10 398 happen for the other conditions. */
2257 nigel 77
2258 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2259 ph10 406 {
2260     rrc = 0;
2261 ph10 397 if (pcre_callout != NULL)
2262     {
2263     pcre_callout_block cb;
2264     cb.version = 1; /* Version 1 of the callout block */
2265     cb.callout_number = code[LINK_SIZE+2];
2266     cb.offset_vector = offsets;
2267     cb.subject = (PCRE_SPTR)start_subject;
2268     cb.subject_length = end_subject - start_subject;
2269     cb.start_match = current_subject - start_subject;
2270     cb.current_position = ptr - start_subject;
2271     cb.pattern_position = GET(code, LINK_SIZE + 3);
2272     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2273     cb.capture_top = 1;
2274     cb.capture_last = -1;
2275     cb.callout_data = md->callout_data;
2276     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2277     }
2278 ph10 398 if (rrc > 0) break; /* Fail this thread */
2279     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2280 ph10 406 }
2281 ph10 398
2282 ph10 397 condcode = code[LINK_SIZE+1];
2283 ph10 406
2284 nigel 93 /* Back reference conditions are not supported */
2285 nigel 77
2286 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2287    
2288     /* The DEFINE condition is always false */
2289    
2290     if (condcode == OP_DEF)
2291 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2292 nigel 93
2293     /* The only supported version of OP_RREF is for the value RREF_ANY,
2294     which means "test if in any recursion". We can't test for specifically
2295     recursed groups. */
2296    
2297     else if (condcode == OP_RREF)
2298     {
2299 nigel 77 int value = GET2(code, LINK_SIZE+2);
2300 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2301 ph10 406 if (recursing > 0)
2302 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2303     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2304 nigel 77 }
2305    
2306     /* Otherwise, the condition is an assertion */
2307    
2308     else
2309     {
2310     int rc;
2311     const uschar *asscode = code + LINK_SIZE + 1;
2312     const uschar *endasscode = asscode + GET(asscode, 1);
2313    
2314     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2315    
2316     rc = internal_dfa_exec(
2317     md, /* fixed match data */
2318     asscode, /* this subexpression's code */
2319     ptr, /* where we currently are */
2320     ptr - start_subject, /* start offset */
2321     local_offsets, /* offset vector */
2322     sizeof(local_offsets)/sizeof(int), /* size of same */
2323     local_workspace, /* workspace vector */
2324     sizeof(local_workspace)/sizeof(int), /* size of same */
2325     ims, /* the current ims flags */
2326     rlevel, /* function recursion level */
2327     recursing); /* pass on regex recursion */
2328    
2329     if ((rc >= 0) ==
2330     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2331 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2332 nigel 77 else
2333 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2334 nigel 77 }
2335     }
2336     break;
2337    
2338     /*-----------------------------------------------------------------*/
2339     case OP_RECURSE:
2340     {
2341     int local_offsets[1000];
2342     int local_workspace[1000];
2343     int rc;
2344    
2345     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2346     recursing + 1));
2347    
2348     rc = internal_dfa_exec(
2349     md, /* fixed match data */
2350     start_code + GET(code, 1), /* this subexpression's code */
2351     ptr, /* where we currently are */
2352     ptr - start_subject, /* start offset */
2353     local_offsets, /* offset vector */
2354     sizeof(local_offsets)/sizeof(int), /* size of same */
2355     local_workspace, /* workspace vector */
2356     sizeof(local_workspace)/sizeof(int), /* size of same */
2357     ims, /* the current ims flags */
2358     rlevel, /* function recursion level */
2359     recursing + 1); /* regex recurse level */
2360    
2361     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2362     recursing + 1, rc));
2363    
2364     /* Ran out of internal offsets */
2365    
2366     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2367    
2368     /* For each successful matched substring, set up the next state with a
2369     count of characters to skip before trying it. Note that the count is in
2370     characters, not bytes. */
2371    
2372     if (rc > 0)
2373     {
2374     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2375     {
2376     const uschar *p = start_subject + local_offsets[rc];
2377     const uschar *pp = start_subject + local_offsets[rc+1];
2378     int charcount = local_offsets[rc+1] - local_offsets[rc];
2379     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2380     if (charcount > 0)
2381     {
2382     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2383     }
2384     else
2385     {
2386     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2387     }
2388     }
2389     }
2390     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2391     }
2392     break;
2393    
2394     /*-----------------------------------------------------------------*/
2395     case OP_ONCE:
2396     {
2397     int local_offsets[2];
2398     int local_workspace[1000];
2399    
2400     int rc = internal_dfa_exec(
2401     md, /* fixed match data */
2402     code, /* this subexpression's code */
2403     ptr, /* where we currently are */
2404     ptr - start_subject, /* start offset */
2405     local_offsets, /* offset vector */
2406     sizeof(local_offsets)/sizeof(int), /* size of same */
2407     local_workspace, /* workspace vector */
2408     sizeof(local_workspace)/sizeof(int), /* size of same */
2409     ims, /* the current ims flags */
2410     rlevel, /* function recursion level */
2411     recursing); /* pass on regex recursion */
2412    
2413     if (rc >= 0)
2414     {
2415     const uschar *end_subpattern = code;
2416     int charcount = local_offsets[1] - local_offsets[0];
2417     int next_state_offset, repeat_state_offset;
2418    
2419     do { end_subpattern += GET(end_subpattern, 1); }
2420     while (*end_subpattern == OP_ALT);
2421     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2422    
2423     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2424     arrange for the repeat state also to be added to the relevant list.
2425     Calculate the offset, or set -1 for no repeat. */
2426    
2427     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2428     *end_subpattern == OP_KETRMIN)?
2429     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2430    
2431     /* If we have matched an empty string, add the next state at the
2432     current character pointer. This is important so that the duplicate
2433     checking kicks in, which is what breaks infinite loops that match an
2434     empty string. */
2435    
2436     if (charcount == 0)
2437     {
2438     ADD_ACTIVE(next_state_offset, 0);
2439     }
2440    
2441     /* Optimization: if there are no more active states, and there
2442     are no new states yet set up, then skip over the subject string
2443     right here, to save looping. Otherwise, set up the new state to swing
2444     into action when the end of the substring is reached. */
2445    
2446     else if (i + 1 >= active_count && new_count == 0)
2447     {
2448     ptr += charcount;
2449     clen = 0;
2450     ADD_NEW(next_state_offset, 0);
2451    
2452     /* If we are adding a repeat state at the new character position,
2453     we must fudge things so that it is the only current state.
2454     Otherwise, it might be a duplicate of one we processed before, and
2455     that would cause it to be skipped. */
2456    
2457     if (repeat_state_offset >= 0)
2458     {
2459     next_active_state = active_states;
2460     active_count = 0;
2461     i = -1;
2462     ADD_ACTIVE(repeat_state_offset, 0);
2463     }
2464     }
2465     else
2466     {
2467     const uschar *p = start_subject + local_offsets[0];
2468     const uschar *pp = start_subject + local_offsets[1];
2469     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2470     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2471     if (repeat_state_offset >= 0)
2472     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2473     }
2474    
2475     }
2476     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2477     }
2478     break;
2479    
2480    
2481     /* ========================================================================== */
2482     /* Handle callouts */
2483    
2484     case OP_CALLOUT:
2485 ph10 406 rrc = 0;
2486 nigel 77 if (pcre_callout != NULL)
2487     {
2488     pcre_callout_block cb;
2489     cb.version = 1; /* Version 1 of the callout block */
2490     cb.callout_number = code[1];
2491     cb.offset_vector = offsets;
2492 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2493 nigel 77 cb.subject_length = end_subject - start_subject;
2494     cb.start_match = current_subject - start_subject;
2495     cb.current_position = ptr - start_subject;
2496     cb.pattern_position = GET(code, 2);
2497     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2498     cb.capture_top = 1;
2499     cb.capture_last = -1;
2500     cb.callout_data = md->callout_data;
2501     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2502 ph10 406 }
2503     if (rrc == 0)
2504     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2505 nigel 77 break;
2506    
2507    
2508     /* ========================================================================== */
2509     default: /* Unsupported opcode */
2510     return PCRE_ERROR_DFA_UITEM;
2511     }
2512    
2513     NEXT_ACTIVE_STATE: continue;
2514    
2515     } /* End of loop scanning active states */
2516    
2517     /* We have finished the processing at the current subject character. If no
2518     new states have been set for the next character, we have found all the
2519     matches that we are going to find. If we are at the top level and partial
2520 ph10 428 matching has been requested, check for appropriate conditions. The
2521     "forced_fail" variable counts the number of (*F) encountered for the
2522     character. If it is equal to the original active_count (saved in workspace[1])
2523     it means that (*F) was found on every active state. In this case we don't want
2524     to give a partial match. */
2525 nigel 77
2526     if (new_count <= 0)
2527     {
2528 ph10 427 if (rlevel == 1 && /* Top level, and */
2529 ph10 428 reached_end != workspace[1] && /* Not all reached end */
2530     forced_fail != workspace[1] && /* Not all forced fail & */
2531 ph10 427 ( /* either... */
2532     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2533     || /* or... */
2534     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2535     match_count < 0) /* no matches */
2536     ) && /* And... */
2537     ptr >= end_subject && /* Reached end of subject */
2538     ptr > current_subject) /* Matched non-empty string */
2539 nigel 77 {
2540     if (offsetcount >= 2)
2541     {
2542 ph10 435 offsets[0] = md->start_used_ptr - start_subject;
2543 nigel 77 offsets[1] = end_subject - start_subject;
2544     }
2545     match_count = PCRE_ERROR_PARTIAL;
2546     }
2547    
2548     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2549     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2550     rlevel*2-2, SP));
2551 nigel 91 break; /* In effect, "return", but see the comment below */
2552 nigel 77 }
2553    
2554     /* One or more states are active for the next character. */
2555    
2556     ptr += clen; /* Advance to next subject character */
2557     } /* Loop to move along the subject string */
2558    
2559 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2560     if we use "return" above, we have compiler trouble. Some compilers warn if
2561     there's nothing here because they think the function doesn't return a value. On
2562     the other hand, if we put a dummy statement here, some more clever compilers
2563     complain that it can't be reached. Sigh. */
2564 nigel 77
2565 nigel 91 return match_count;
2566 nigel 77 }
2567    
2568    
2569    
2570    
2571     /*************************************************
2572     * Execute a Regular Expression - DFA engine *
2573     *************************************************/
2574    
2575     /* This external function applies a compiled re to a subject string using a DFA
2576     engine. This function calls the internal function multiple times if the pattern
2577     is not anchored.
2578    
2579     Arguments:
2580     argument_re points to the compiled expression
2581 ph10 97 extra_data points to extra data or is NULL
2582 nigel 77 subject points to the subject string
2583     length length of subject string (may contain binary zeros)
2584     start_offset where to start in the subject string
2585     options option bits
2586     offsets vector of match offsets
2587     offsetcount size of same
2588     workspace workspace vector
2589     wscount size of same
2590    
2591     Returns: > 0 => number of match offset pairs placed in offsets
2592     = 0 => offsets overflowed; longest matches are present
2593     -1 => failed to match
2594     < -1 => another negative PCRE_ERROR_xxx error code
2595     */
2596    
2597 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2598 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2599     const char *subject, int length, int start_offset, int options, int *offsets,
2600     int offsetcount, int *workspace, int wscount)
2601     {
2602     real_pcre *re = (real_pcre *)argument_re;
2603     dfa_match_data match_block;
2604 nigel 91 dfa_match_data *md = &match_block;
2605 nigel 77 BOOL utf8, anchored, startline, firstline;
2606     const uschar *current_subject, *end_subject, *lcc;
2607    
2608     pcre_study_data internal_study;
2609     const pcre_study_data *study = NULL;
2610     real_pcre internal_re;
2611    
2612     const uschar *req_byte_ptr;
2613     const uschar *start_bits = NULL;
2614     BOOL first_byte_caseless = FALSE;
2615     BOOL req_byte_caseless = FALSE;
2616     int first_byte = -1;
2617     int req_byte = -1;
2618     int req_byte2 = -1;
2619 nigel 91 int newline;
2620 nigel 77
2621     /* Plausibility checks */
2622    
2623     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2624     if (re == NULL || subject == NULL || workspace == NULL ||
2625     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2626     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2627     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2628    
2629     /* We need to find the pointer to any study data before we test for byte
2630     flipping, so we scan the extra_data block first. This may set two fields in the
2631     match block, so we must initialize them beforehand. However, the other fields
2632     in the match block must not be set until after the byte flipping. */
2633    
2634 nigel 91 md->tables = re->tables;
2635     md->callout_data = NULL;
2636 nigel 77
2637     if (extra_data != NULL)
2638     {
2639     unsigned int flags = extra_data->flags;
2640     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2641     study = (const pcre_study_data *)extra_data->study_data;
2642     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2643 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2644     return PCRE_ERROR_DFA_UMLIMIT;
2645 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2646 nigel 91 md->callout_data = extra_data->callout_data;
2647 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2648 nigel 91 md->tables = extra_data->tables;
2649 nigel 77 }
2650    
2651     /* Check that the first field in the block is the magic number. If it is not,
2652     test for a regex that was compiled on a host of opposite endianness. If this is
2653     the case, flipped values are put in internal_re and internal_study if there was
2654     study data too. */
2655    
2656     if (re->magic_number != MAGIC_NUMBER)
2657     {
2658     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2659     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2660     if (study != NULL) study = &internal_study;
2661     }
2662    
2663     /* Set some local values */
2664    
2665     current_subject = (const unsigned char *)subject + start_offset;
2666     end_subject = (const unsigned char *)subject + length;
2667     req_byte_ptr = current_subject - 1;
2668    
2669 nigel 91 #ifdef SUPPORT_UTF8
2670 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2671 nigel 91 #else
2672     utf8 = FALSE;
2673     #endif
2674 nigel 77
2675 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2676     (re->options & PCRE_ANCHORED) != 0;
2677    
2678 nigel 77 /* The remaining fixed data for passing around. */
2679    
2680 nigel 91 md->start_code = (const uschar *)argument_re +
2681 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2682 nigel 91 md->start_subject = (const unsigned char *)subject;
2683     md->end_subject = end_subject;
2684     md->moptions = options;
2685     md->poptions = re->options;
2686 nigel 77
2687 ph10 231 /* If the BSR option is not set at match time, copy what was set
2688     at compile time. */
2689    
2690     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2691     {
2692     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2693     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2694     #ifdef BSR_ANYCRLF
2695     else md->moptions |= PCRE_BSR_ANYCRLF;
2696 ph10 243 #endif
2697     }
2698 ph10 231
2699 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2700     nothing is set at run time, whatever was used at compile time applies. */
2701 nigel 91
2702 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2703 nigel 93 PCRE_NEWLINE_BITS)
2704 nigel 91 {
2705 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2706 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2707     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2708 nigel 91 case PCRE_NEWLINE_CR+
2709 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2710 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2711 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2712 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2713 nigel 91 }
2714    
2715 ph10 149 if (newline == -2)
2716 nigel 91 {
2717 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2718     }
2719     else if (newline < 0)
2720     {
2721 nigel 93 md->nltype = NLTYPE_ANY;
2722 nigel 91 }
2723     else
2724     {
2725 nigel 93 md->nltype = NLTYPE_FIXED;
2726     if (newline > 255)
2727     {
2728     md->nllen = 2;
2729     md->nl[0] = (newline >> 8) & 255;
2730     md->nl[1] = newline & 255;
2731     }
2732     else
2733     {
2734     md->nllen = 1;
2735     md->nl[0] = newline;
2736     }
2737 nigel 91 }
2738    
2739 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2740     back the character offset. */
2741    
2742     #ifdef SUPPORT_UTF8
2743     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2744     {
2745     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2746     return PCRE_ERROR_BADUTF8;
2747     if (start_offset > 0 && start_offset < length)
2748     {
2749     int tb = ((uschar *)subject)[start_offset];
2750     if (tb > 127)
2751     {
2752     tb &= 0xc0;
2753     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2754     }
2755     }
2756     }
2757     #endif
2758    
2759     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2760     is a feature that makes it possible to save compiled regex and re-use them
2761     in other programs later. */
2762    
2763 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2764 nigel 77
2765     /* The lower casing table and the "must be at the start of a line" flag are
2766     used in a loop when finding where to start. */
2767    
2768 nigel 91 lcc = md->tables + lcc_offset;
2769 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2770 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2771    
2772     /* Set up the first character to match, if available. The first_byte value is
2773     never set for an anchored regular expression, but the anchoring may be forced
2774     at run time, so we have to test for anchoring. The first char may be unset for
2775     an unanchored pattern, of course. If there's no first char and the pattern was
2776     studied, there may be a bitmap of possible first characters. */
2777    
2778     if (!anchored)
2779     {
2780 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2781 nigel 77 {
2782     first_byte = re->first_byte & 255;
2783     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2784     first_byte = lcc[first_byte];
2785     }
2786     else
2787     {
2788     if (startline && study != NULL &&
2789     (study->options & PCRE_STUDY_MAPPED) != 0)
2790     start_bits = study->start_bits;
2791     }
2792     }
2793    
2794     /* For anchored or unanchored matches, there may be a "last known required
2795     character" set. */
2796    
2797 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2798 nigel 77 {
2799     req_byte = re->req_byte & 255;
2800     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2801 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2802 nigel 77 }
2803    
2804     /* Call the main matching function, looping for a non-anchored regex after a
2805 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2806     a match. */
2807 nigel 77
2808     for (;;)
2809     {
2810     int rc;
2811    
2812     if ((options & PCRE_DFA_RESTART) == 0)
2813     {
2814     const uschar *save_end_subject = end_subject;
2815    
2816 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2817     line of a multiline string. Implement this by temporarily adjusting
2818     end_subject so that we stop scanning at a newline. If the match fails at
2819     the newline, later code breaks this loop. */
2820 nigel 77
2821     if (firstline)
2822     {
2823 ph10 365 USPTR t = current_subject;
2824     #ifdef SUPPORT_UTF8
2825     if (utf8)
2826 ph10 371 {
2827     while (t < md->end_subject && !IS_NEWLINE(t))
2828 ph10 365 {
2829     t++;
2830     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2831 ph10 371 }
2832 ph10 365 }
2833     else
2834 ph10 371 #endif
2835 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2836 nigel 77 end_subject = t;
2837     }
2838 ph10 392
2839 ph10 389 /* There are some optimizations that avoid running the match if a known
2840     starting point is not found, or if a known later character is not present.
2841     However, there is an option that disables these, for testing and for
2842     ensuring that all callouts do actually occur. */
2843 nigel 77
2844 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2845 ph10 392 {
2846    
2847 ph10 389 /* Advance to a known first byte. */
2848 ph10 392
2849 ph10 389 if (first_byte >= 0)
2850 nigel 77 {
2851 ph10 389 if (first_byte_caseless)
2852     while (current_subject < end_subject &&
2853     lcc[*current_subject] != first_byte)
2854     current_subject++;
2855     else
2856 ph10 392 while (current_subject < end_subject &&
2857 ph10 389 *current_subject != first_byte)
2858     current_subject++;
2859     }
2860 ph10 392
2861 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2862 ph10 392
2863 ph10 389 else if (startline)
2864     {
2865     if (current_subject > md->start_subject + start_offset)
2866     {
2867 ph10 365 #ifdef SUPPORT_UTF8
2868 ph10 389 if (utf8)
2869 ph10 365 {
2870 ph10 392 while (current_subject < end_subject &&
2871 ph10 389 !WAS_NEWLINE(current_subject))
2872     {
2873 ph10 365 current_subject++;
2874 ph10 389 while(current_subject < end_subject &&
2875     (*current_subject & 0xc0) == 0x80)
2876     current_subject++;
2877     }
2878 ph10 371 }
2879 ph10 389 else
2880     #endif
2881     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2882     current_subject++;
2883 ph10 392
2884 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2885     ANYCRLF, and we are now at a LF, advance the match position by one
2886     more character. */
2887 ph10 392
2888 ph10 391 if (current_subject[-1] == CHAR_CR &&
2889 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2890     current_subject < end_subject &&
2891 ph10 391 *current_subject == CHAR_NL)
2892 ph10 389 current_subject++;
2893 ph10 365 }
2894 nigel 77 }
2895 ph10 392
2896 ph10 389 /* Or to a non-unique first char after study */
2897 ph10 392
2898 ph10 389 else if (start_bits != NULL)
2899 nigel 77 {
2900 ph10 389 while (current_subject < end_subject)
2901     {
2902     register unsigned int c = *current_subject;
2903     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2904     else break;
2905     }
2906 nigel 77 }
2907 ph10 392 }
2908 nigel 77
2909     /* Restore fudged end_subject */
2910    
2911     end_subject = save_end_subject;
2912     }
2913    
2914     /* If req_byte is set, we know that that character must appear in the subject
2915     for the match to succeed. If the first character is set, req_byte must be
2916     later in the subject; otherwise the test starts at the match point. This
2917     optimization can save a huge amount of work in patterns with nested unlimited
2918     repeats that aren't going to match. Writing separate code for cased/caseless
2919     versions makes it go faster, as does using an autoincrement and backing off
2920     on a match.
2921    
2922     HOWEVER: when the subject string is very, very long, searching to its end can
2923     take a long time, and give bad performance on quite ordinary patterns. This
2924     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2925     don't do this when the string is sufficiently long.
2926    
2927 ph10 392 ALSO: this processing is disabled when partial matching is requested, and can
2928 ph10 428 also be explicitly deactivated. Furthermore, we have to disable it when
2929     restarting after a partial match, because the required character may have
2930     already been matched. */
2931 nigel 77
2932 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2933     req_byte >= 0 &&
2934 nigel 77 end_subject - current_subject < REQ_BYTE_MAX &&
2935 ph10 428 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2936 nigel 77 {
2937     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2938    
2939     /* We don't need to repeat the search if we haven't yet reached the
2940     place we found it at last time. */
2941    
2942     if (p > req_byte_ptr)
2943     {
2944     if (req_byte_caseless)
2945     {
2946     while (p < end_subject)
2947     {
2948     register int pp = *p++;
2949     if (pp == req_byte || pp == req_byte2) { p--; break; }
2950     }
2951     }
2952     else
2953     {
2954     while (p < end_subject)
2955     {
2956     if (*p++ == req_byte) { p--; break; }
2957     }
2958     }
2959    
2960     /* If we can't find the required character, break the matching loop,
2961     which will cause a return of PCRE_ERROR_NOMATCH. */
2962    
2963     if (p >= end_subject) break;
2964    
2965     /* If we have found the required character, save the point where we
2966     found it, so that we don't search again next time round the loop if
2967     the start hasn't passed this character yet. */
2968    
2969     req_byte_ptr = p;
2970     }
2971     }
2972    
2973     /* OK, now we can do the business */
2974    
2975 ph10 435 md->start_used_ptr = current_subject;
2976    
2977 nigel 77 rc = internal_dfa_exec(
2978 nigel 91 md, /* fixed match data */
2979     md->start_code, /* this subexpression's code */
2980     current_subject, /* where we currently are */
2981     start_offset, /* start offset in subject */
2982     offsets, /* offset vector */
2983     offsetcount, /* size of same */
2984     workspace, /* workspace vector */
2985     wscount, /* size of same */
2986 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2987 nigel 91 0, /* function recurse level */
2988     0); /* regex recurse level */
2989 nigel 77
2990     /* Anything other than "no match" means we are done, always; otherwise, carry
2991     on only if not anchored. */
2992    
2993     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2994    
2995     /* Advance to the next subject character unless we are at the end of a line
2996     and firstline is set. */
2997    
2998 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2999 nigel 77 current_subject++;
3000     if (utf8)
3001     {
3002     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3003     current_subject++;
3004     }
3005     if (current_subject > end_subject) break;
3006    
3007 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3008 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3009     or ANY or ANYCRLF, advance the match position by one more character. */
3010 nigel 93
3011 ph10 391 if (current_subject[-1] == CHAR_CR &&
3012 ph10 226 current_subject < end_subject &&
3013 ph10 391 *current_subject == CHAR_NL &&
3014 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3015 ph10 226 (md->nltype == NLTYPE_ANY ||
3016     md->nltype == NLTYPE_ANYCRLF ||
3017     md->nllen == 2))
3018 nigel 93 current_subject++;
3019    
3020     } /* "Bumpalong" loop */
3021    
3022 nigel 77 return PCRE_ERROR_NOMATCH;
3023     }
3024    
3025     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12