/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 442 - (hide annotations) (download)
Fri Sep 11 10:21:02 2009 UTC (3 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 101524 byte(s)
Added PCRE_NOTEMPTY_ATSTART to fix /g bug when \K is present.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 383 Copyright (c) 1997-2009 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 439 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51     test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71     Overall, I concluded that the gains in some cases did not outweigh the losses
72     in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300
102     #define OP_EXTUNI_EXTRA 320
103     #define OP_ANYNL_EXTRA 340
104     #define OP_HSPACE_EXTRA 360
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109     character that is to be tested in some way. This makes it possible to
110     centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
113 ph10 168 that follow must also be modified. */
114 nigel 77
115 ph10 327 static const uschar coptable[] = {
/* Indexed by opcode value. A non-zero entry is the offset from the opcode to
the character (or, for the Type repeats, the type opcode) that follows it; the
matching loop loads that item when the entry is greater than zero. A zero
entry means the opcode is not followed by such an item. */
116 nigel 77 0, /* End */
117 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
120 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
121     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
123     1, /* Char */
124     1, /* Charnc */
125     1, /* not */
126     /* Positive single-char repeats */
127     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
128     3, 3, 3, /* upto, minupto, exact */
129 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
130 nigel 77 /* Negative single-char repeats - only for chars < 256 */
131     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
132     3, 3, 3, /* NOT upto, minupto, exact */
133 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
134 nigel 77 /* Positive type repeats */
135     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
136     3, 3, 3, /* Type upto, minupto, exact */
137 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
138 nigel 77 /* Character class & ref repeats */
139     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
140     0, 0, /* CRRANGE, CRMINRANGE */
141     0, /* CLASS */
142     0, /* NCLASS */
143     0, /* XCLASS - variable length */
144     0, /* REF */
145     0, /* RECURSE */
146     0, /* CALLOUT */
147     0, /* Alt */
148     0, /* Ket */
149     0, /* KetRmax */
150     0, /* KetRmin */
151     0, /* Assert */
152     0, /* Assert not */
153     0, /* Assert behind */
154     0, /* Assert behind not */
155     0, /* Reverse */
156 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
157     0, 0, 0, /* SBRA, SCBRA, SCOND */
158 nigel 77 0, /* CREF */
159 nigel 93 0, /* RREF */
160     0, /* DEF */
161 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
162     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
163 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
164 nigel 77 };
165    
166     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
167     and \w */
168    
169 ph10 327 static const uschar toptable1[] = {
/* Indexed by opcode value, in the same order as the start of coptable. Each
entry is the ctype bit mask that is ANDed with ctypes[c] when testing a
character against \D, \d, \S, \s, \W, or \w. */
170 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
171 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
172     ctype_space, ctype_space, /* \S, \s */
173     ctype_word, ctype_word, /* \W, \w */
174 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
175 nigel 77 };
176    
177 ph10 327 static const uschar toptable2[] = {
/* Indexed by opcode value, in the same order as the start of coptable. Each
entry is XORed with the masked ctype bits (ctypes[c] & toptable1[op]); the
character is accepted when the result is non-zero. A non-zero entry therefore
inverts the test for the negated types, and the value 1 paired with a zero
mask makes OP_ANY and OP_ALLANY always yield non-zero. */
178 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
179 nigel 77 ctype_digit, 0, /* \D, \d */
180     ctype_space, 0, /* \S, \s */
181     ctype_word, 0, /* \W, \w */
182 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
183 nigel 77 };
184    
185    
186     /* Structure for holding data about a particular state, which is in effect the
187     current data for an active path through the match tree. It must consist
188     entirely of ints because the working vector we are passed, and which we put
189     these structures in, is a vector of ints. */
190    
191     typedef struct stateblock {
192     int offset; /* Offset to opcode from the start of the compiled code; a negative value means "hold off going to the (negated) state until the characters counted in data have been skipped" */
193     int count; /* Count for repeats; compared together with offset when checking for duplicate states */
194     int ims; /* ims flag bits in force for this state; set from the current ims when the state is added */
195     int data; /* Some use extra data; for a negative offset this is the number of characters still to skip */
196     } stateblock;
197    
198     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
199    
200    
201     #ifdef DEBUG
202     /*************************************************
203     * Print character string *
204     *************************************************/
205    
206     /* Character string printing function for debugging.
207    
208     Arguments:
209     p points to string
210     length number of bytes
211     f where to print
212    
213     Returns: nothing
214     */
215    
/* Debugging helper: write the `length` bytes starting at `p` to stream `f`.
A byte for which isprint() is true is written unchanged; any other byte is
written as a backslash, an 'x', and two lower-case hexadecimal digits. If
`length` is zero or negative nothing is written. The bytes are only read,
so the pointer is const-qualified. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
int c;
while (length-- > 0)
  {
  c = *p++;                  /* unsigned char, so always a valid isprint() argument */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
228     #endif
229    
230    
231    
232     /*************************************************
233     * Execute a Regular Expression - DFA engine *
234     *************************************************/
235    
236     /* This internal function applies a compiled pattern to a subject string,
237     starting at a given point, using a DFA engine. This function is called from the
238     external one, possibly multiple times if the pattern is not anchored. The
239     function calls itself recursively for some kinds of subpattern.
240    
241     Arguments:
242     md the match_data block with fixed information
243     this_start_code the opening bracket of this subexpression's code
244     current_subject where we currently are in the subject string
245     start_offset start offset in the subject string
246     offsets vector to contain the matching string offsets
247     offsetcount size of same
248     workspace vector of workspace
249     wscount size of same
250     ims the current ims flags
251     rlevel function call recursion level
252     recursing regex recursive call level
253    
254 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
255 ph10 341 = 0 => offsets overflowed; longest matches are present
256 nigel 77 -1 => failed to match
257     < -1 => some kind of unexpected problem
258    
259     The following macros are used for adding states to the two state vectors (one
260     for the current character, one for the following character). */
261    
/* ADD_ACTIVE(x,y): append a state with opcode offset x and repeat count y to
the list of active states, stamping it with the current value of ims. The
data field of the new entry is not written. If the list already holds wscount
entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead. The
macro relies on the locals active_count, wscount, next_active_state and ims
being in scope, and is wrapped in do { } while (0) so that it behaves as a
single statement; it must be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
272    
/* ADD_ACTIVE_DATA(x,y,z): as for adding a plain active state, but also
stores z in the data field. Appends a state with opcode offset x and repeat
count y to the list of active states, stamping it with the current value of
ims. If the list already holds wscount entries, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE instead. The macro relies on the locals
active_count, wscount, next_active_state and ims being in scope, and is
wrapped in do { } while (0) so that it behaves as a single statement; it must
be followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state->data = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
284    
/* ADD_NEW(x,y): append a state with opcode offset x and repeat count y to
the list of new states (those for the following character), stamping it with
the current value of ims. The data field of the new entry is not written. If
the list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead. The macro relies on the locals new_count,
wscount, next_new_state and ims being in scope, and is wrapped in
do { } while (0) so that it behaves as a single statement; it must be
followed by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
295    
/* ADD_NEW_DATA(x,y,z): append a state with opcode offset x, repeat count y
and data value z to the list of new states (those for the following
character), stamping it with the current value of ims. If the list already
holds wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE
instead. The macro relies on the locals new_count, wscount, next_new_state
and ims being in scope, and is wrapped in do { } while (0) so that it behaves
as a single statement; it must be followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state->data = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
307    
308     /* And now, here is the code */
309    
310     static int
311     internal_dfa_exec(
312     dfa_match_data *md,
313     const uschar *this_start_code,
314     const uschar *current_subject,
315     int start_offset,
316     int *offsets,
317     int offsetcount,
318     int *workspace,
319     int wscount,
320     int ims,
321     int rlevel,
322     int recursing)
323     {
324     stateblock *active_states, *new_states, *temp_states;
325     stateblock *next_active_state, *next_new_state;
326    
327     const uschar *ctypes, *lcc, *fcc;
328     const uschar *ptr;
329 nigel 93 const uschar *end_code, *first_op;
330 nigel 77
331     int active_count, new_count, match_count;
332    
333     /* Some fields in the md block are frequently referenced, so we load them into
334     independent variables in the hope that this will perform better. */
335    
336     const uschar *start_subject = md->start_subject;
337     const uschar *end_subject = md->end_subject;
338     const uschar *start_code = md->start_code;
339    
340 nigel 87 #ifdef SUPPORT_UTF8
341 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
342 nigel 93 #else
343     BOOL utf8 = FALSE;
344 nigel 87 #endif
345 nigel 77
346     rlevel++;
347     offsetcount &= (-2);
348    
349     wscount -= 2;
350     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
351     (2 * INTS_PER_STATEBLOCK);
352    
353     DPRINTF(("\n%.*s---------------------\n"
354     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
355     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
356    
357     ctypes = md->tables + ctypes_offset;
358     lcc = md->tables + lcc_offset;
359     fcc = md->tables + fcc_offset;
360    
361     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
362    
363     active_states = (stateblock *)(workspace + 2);
364     next_new_state = new_states = active_states + wscount;
365     new_count = 0;
366    
367 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
368     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
369    
370 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
371     the alternative states onto the list, and find out where the end is. This
372     makes it possible to use this function recursively, when we want to stop at a
373     matching internal ket rather than at the end.
374    
375     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
376     a backward assertion. In that case, we have to find out the maximum amount to
377     move back, and set up each alternative appropriately. */
378    
379 nigel 93 if (*first_op == OP_REVERSE)
380 nigel 77 {
381     int max_back = 0;
382     int gone_back;
383    
384     end_code = this_start_code;
385     do
386     {
387     int back = GET(end_code, 2+LINK_SIZE);
388     if (back > max_back) max_back = back;
389     end_code += GET(end_code, 1);
390     }
391     while (*end_code == OP_ALT);
392    
393     /* If we can't go back the amount required for the longest lookbehind
394     pattern, go back as far as we can; some alternatives may still be viable. */
395    
396     #ifdef SUPPORT_UTF8
397     /* In character mode we have to step back character by character */
398    
399     if (utf8)
400     {
401     for (gone_back = 0; gone_back < max_back; gone_back++)
402     {
403     if (current_subject <= start_subject) break;
404     current_subject--;
405     while (current_subject > start_subject &&
406     (*current_subject & 0xc0) == 0x80)
407     current_subject--;
408     }
409     }
410     else
411     #endif
412    
413     /* In byte-mode we can do this quickly. */
414    
415     {
416     gone_back = (current_subject - max_back < start_subject)?
417     current_subject - start_subject : max_back;
418     current_subject -= gone_back;
419     }
420 ph10 435
421     /* Save the earliest consulted character */
422    
423     if (current_subject < md->start_used_ptr)
424     md->start_used_ptr = current_subject;
425 nigel 77
426     /* Now we can process the individual branches. */
427    
428     end_code = this_start_code;
429     do
430     {
431     int back = GET(end_code, 2+LINK_SIZE);
432     if (back <= gone_back)
433     {
434     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
435     ADD_NEW_DATA(-bstate, 0, gone_back - back);
436     }
437     end_code += GET(end_code, 1);
438     }
439     while (*end_code == OP_ALT);
440     }
441    
442     /* This is the code for a "normal" subpattern (not a backward assertion). The
443     start of a whole pattern is always one of these. If we are at the top level,
444     we may be asked to restart matching from the same point that we reached for a
445     previous partial match. We still have to scan through the top-level branches to
446     find the end state. */
447    
448     else
449     {
450     end_code = this_start_code;
451    
452     /* Restarting */
453    
454     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
455     {
456     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
457     new_count = workspace[1];
458     if (!workspace[0])
459     memcpy(new_states, active_states, new_count * sizeof(stateblock));
460     }
461    
462     /* Not restarting */
463    
464     else
465     {
466 nigel 93 int length = 1 + LINK_SIZE +
467     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
468 nigel 77 do
469     {
470 nigel 93 ADD_NEW(end_code - start_code + length, 0);
471 nigel 77 end_code += GET(end_code, 1);
472 nigel 93 length = 1 + LINK_SIZE;
473 nigel 77 }
474     while (*end_code == OP_ALT);
475     }
476     }
477    
478     workspace[0] = 0; /* Bit indicating which vector is current */
479    
480     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
481    
482     /* Loop for scanning the subject */
483    
484     ptr = current_subject;
485     for (;;)
486     {
487     int i, j;
488 nigel 91 int clen, dlen;
489     unsigned int c, d;
490 ph10 428 int forced_fail = 0;
491     int reached_end = 0;
492 nigel 77
493     /* Make the new state list into the active state list and empty the
494     new state list. */
495    
496     temp_states = active_states;
497     active_states = new_states;
498     new_states = temp_states;
499     active_count = new_count;
500     new_count = 0;
501    
502     workspace[0] ^= 1; /* Remember for the restarting feature */
503     workspace[1] = active_count;
504    
505     #ifdef DEBUG
506     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
507     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
508     printf("\"\n");
509    
510     printf("%.*sActive states: ", rlevel*2-2, SP);
511     for (i = 0; i < active_count; i++)
512     printf("%d/%d ", active_states[i].offset, active_states[i].count);
513     printf("\n");
514     #endif
515    
516     /* Set the pointers for adding new states */
517    
518     next_active_state = active_states + active_count;
519     next_new_state = new_states;
520    
521     /* Load the current character from the subject outside the loop, as many
522     different states may want to look at it, and we assume that at least one
523     will. */
524    
525     if (ptr < end_subject)
526     {
527 nigel 93 clen = 1; /* Number of bytes in the character */
528 nigel 77 #ifdef SUPPORT_UTF8
529     if (utf8) { GETCHARLEN(c, ptr, clen); } else
530     #endif /* SUPPORT_UTF8 */
531     c = *ptr;
532     }
533     else
534     {
535 nigel 93 clen = 0; /* This indicates the end of the subject */
536     c = NOTACHAR; /* This value should never actually be used */
537 nigel 77 }
538    
539     /* Scan up the active states and act on each one. The result of an action
540     may be to add more states to the currently active list (e.g. on hitting a
541     parenthesis) or it may be to put states on the new list, for considering
542     when we move the character pointer on. */
543    
544     for (i = 0; i < active_count; i++)
545     {
546     stateblock *current_state = active_states + i;
547     const uschar *code;
548     int state_offset = current_state->offset;
549 ph10 397 int count, codevalue, rrc;
550 nigel 77
551     #ifdef DEBUG
552     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
553 nigel 93 if (clen == 0) printf("EOL\n");
554 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
555     else printf("0x%02x\n", c);
556     #endif
557    
558     /* This variable is referred to implicitly in the ADD_xxx macros. */
559    
560     ims = current_state->ims;
561    
562     /* A negative offset is a special case meaning "hold off going to this
563     (negated) state until the number of characters in the data field have
564     been skipped". */
565    
566     if (state_offset < 0)
567     {
568     if (current_state->data > 0)
569     {
570     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
571     ADD_NEW_DATA(state_offset, current_state->count,
572     current_state->data - 1);
573     continue;
574     }
575     else
576     {
577     current_state->offset = state_offset = -state_offset;
578     }
579     }
580    
581 ph10 439 /* Check for a duplicate state with the same count, and skip if found.
582     See the note at the head of this module about the possibility of improving
583     performance here. */
584 nigel 77
585     for (j = 0; j < i; j++)
586     {
587     if (active_states[j].offset == state_offset &&
588     active_states[j].count == current_state->count)
589     {
590     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
591     goto NEXT_ACTIVE_STATE;
592     }
593     }
594    
595     /* The state offset is the offset to the opcode */
596    
597     code = start_code + state_offset;
598     codevalue = *code;
599    
600     /* If this opcode is followed by an inline character, load it. It is
601     tempting to test for the presence of a subject character here, but that
602     is wrong, because sometimes zero repetitions of the subject are
603     permitted.
604    
605     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
606 ph10 178 argument that is not a data character - but is always one byte long. We
607     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
608     this case. To keep the other cases fast, convert these ones to new opcodes.
609     */
610 nigel 77
611     if (coptable[codevalue] > 0)
612     {
613     dlen = 1;
614     #ifdef SUPPORT_UTF8
615     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
616     #endif /* SUPPORT_UTF8 */
617     d = code[coptable[codevalue]];
618     if (codevalue >= OP_TYPESTAR)
619     {
620 nigel 93 switch(d)
621     {
622     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
623     case OP_NOTPROP:
624     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
625     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
626     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
627 ph10 178 case OP_NOT_HSPACE:
628 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
629 ph10 178 case OP_NOT_VSPACE:
630 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
631 nigel 93 default: break;
632     }
633 nigel 77 }
634     }
635     else
636     {
637     dlen = 0; /* Not strictly necessary, but compilers moan */
638 nigel 93 d = NOTACHAR; /* if these variables are not set. */
639 nigel 77 }
640    
641    
642     /* Now process the individual opcodes */
643    
644     switch (codevalue)
645     {
646    
647     /* ========================================================================== */
648     /* Reached a closing bracket. If not at the end of the pattern, carry
649     on with the next opcode. Otherwise, unless we have an empty string and
650 ph10 442 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
651     start of the subject, save the match data, shifting up all previous
652 nigel 77 matches so we always have the longest first. */
653    
654     case OP_KET:
655     case OP_KETRMIN:
656     case OP_KETRMAX:
657     if (code != end_code)
658     {
659     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
660     if (codevalue != OP_KET)
661     {
662     ADD_ACTIVE(state_offset - GET(code, 1), 0);
663     }
664     }
665 ph10 428 else
666 nigel 77 {
667 ph10 428 reached_end++; /* Count branches that reach the end */
668 ph10 442 if (ptr > current_subject ||
669     ((md->moptions & PCRE_NOTEMPTY) == 0 &&
670     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
671     current_subject > start_subject + md->start_offset)))
672 nigel 77 {
673 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
674     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
675     match_count = 0;
676     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
677     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
678     if (offsetcount >= 2)
679     {
680     offsets[0] = current_subject - start_subject;
681     offsets[1] = ptr - start_subject;
682     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
683     offsets[1] - offsets[0], current_subject));
684     }
685     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
686     {
687     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
688     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
689     match_count, rlevel*2-2, SP));
690     return match_count;
691     }
692     }
693 nigel 77 }
694     break;
695    
696     /* ========================================================================== */
697     /* These opcodes add to the current list of states without looking
698     at the current character. */
699    
700     /*-----------------------------------------------------------------*/
701     case OP_ALT:
702     do { code += GET(code, 1); } while (*code == OP_ALT);
703     ADD_ACTIVE(code - start_code, 0);
704     break;
705    
706     /*-----------------------------------------------------------------*/
707     case OP_BRA:
708 nigel 93 case OP_SBRA:
709 nigel 77 do
710     {
711     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
712     code += GET(code, 1);
713     }
714     while (*code == OP_ALT);
715     break;
716    
717     /*-----------------------------------------------------------------*/
718 nigel 93 case OP_CBRA:
719     case OP_SCBRA:
720     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
721     code += GET(code, 1);
722     while (*code == OP_ALT)
723     {
724     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
725     code += GET(code, 1);
726     }
727     break;
728    
729     /*-----------------------------------------------------------------*/
730 nigel 77 case OP_BRAZERO:
731     case OP_BRAMINZERO:
732     ADD_ACTIVE(state_offset + 1, 0);
733     code += 1 + GET(code, 2);
734     while (*code == OP_ALT) code += GET(code, 1);
735     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
736     break;
737    
738     /*-----------------------------------------------------------------*/
739 ph10 335 case OP_SKIPZERO:
740     code += 1 + GET(code, 2);
741     while (*code == OP_ALT) code += GET(code, 1);
742     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
743     break;
744    
745     /*-----------------------------------------------------------------*/
746 nigel 77 case OP_CIRC:
747     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
748 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
749     ptr != end_subject &&
750 nigel 93 WAS_NEWLINE(ptr)))
751 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
752     break;
753    
754     /*-----------------------------------------------------------------*/
755     case OP_EOD:
756     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
757     break;
758    
759     /*-----------------------------------------------------------------*/
760     case OP_OPT:
761     ims = code[1];
762     ADD_ACTIVE(state_offset + 2, 0);
763     break;
764    
765     /*-----------------------------------------------------------------*/
766     case OP_SOD:
767     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
768     break;
769    
770     /*-----------------------------------------------------------------*/
771     case OP_SOM:
772     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
773     break;
774    
775    
776     /* ========================================================================== */
777     /* These opcodes inspect the next subject character, and sometimes
778     the previous one as well, but do not have an argument. The variable
779     clen contains the length of the current character and is zero if we are
780     at the end of the subject. */
781    
782     /*-----------------------------------------------------------------*/
783     case OP_ANY:
784 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
785 nigel 77 { ADD_NEW(state_offset + 1, 0); }
786     break;
787    
788     /*-----------------------------------------------------------------*/
789 ph10 341 case OP_ALLANY:
790     if (clen > 0)
791     { ADD_NEW(state_offset + 1, 0); }
792     break;
793    
794     /*-----------------------------------------------------------------*/
795 nigel 77 case OP_EODN:
796 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
797 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
798     break;
799    
800     /*-----------------------------------------------------------------*/
801     case OP_DOLL:
802     if ((md->moptions & PCRE_NOTEOL) == 0)
803     {
804 nigel 91 if (clen == 0 ||
805 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
806 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
807     ))
808 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
809     }
810 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
811 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
812     break;
813    
814     /*-----------------------------------------------------------------*/
815    
816     case OP_DIGIT:
817     case OP_WHITESPACE:
818     case OP_WORDCHAR:
819     if (clen > 0 && c < 256 &&
820     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
821     { ADD_NEW(state_offset + 1, 0); }
822     break;
823    
824     /*-----------------------------------------------------------------*/
825     case OP_NOT_DIGIT:
826     case OP_NOT_WHITESPACE:
827     case OP_NOT_WORDCHAR:
828     if (clen > 0 && (c >= 256 ||
829     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
830     { ADD_NEW(state_offset + 1, 0); }
831     break;
832    
833     /*-----------------------------------------------------------------*/
834     case OP_WORD_BOUNDARY:
835     case OP_NOT_WORD_BOUNDARY:
836     {
837     int left_word, right_word;
838    
839     if (ptr > start_subject)
840     {
841     const uschar *temp = ptr - 1;
842 ph10 435 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
843 nigel 77 #ifdef SUPPORT_UTF8
844     if (utf8) BACKCHAR(temp);
845     #endif
846     GETCHARTEST(d, temp);
847     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
848     }
849     else left_word = 0;
850    
851 ph10 428 if (clen > 0)
852     right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
853     else /* This is a fudge to ensure that if this is the */
854     { /* last item in the pattern, we don't count it as */
855     reached_end--; /* reached, thus disabling a partial match. */
856     right_word = 0;
857     }
858 nigel 77
859     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
860     { ADD_ACTIVE(state_offset + 1, 0); }
861     }
862     break;
863    
864    
865     /*-----------------------------------------------------------------*/
866     /* Check the next character by Unicode property. We will get here only
867     if the support is in the binary; otherwise a compile-time error occurs.
868     */
869    
870 ph10 151 #ifdef SUPPORT_UCP
871 nigel 77 case OP_PROP:
872     case OP_NOTPROP:
873     if (clen > 0)
874     {
875 nigel 87 BOOL OK;
876 ph10 349 const ucd_record * prop = GET_UCD(c);
877 nigel 87 switch(code[1])
878 nigel 77 {
879 nigel 87 case PT_ANY:
880     OK = TRUE;
881     break;
882    
883     case PT_LAMP:
884 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
885 nigel 87 break;
886    
887     case PT_GC:
888 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
889 nigel 87 break;
890    
891     case PT_PC:
892 ph10 349 OK = prop->chartype == code[2];
893 nigel 87 break;
894    
895     case PT_SC:
896 ph10 349 OK = prop->script == code[2];
897 nigel 87 break;
898    
899     /* Should never occur, but keep compilers from grumbling. */
900    
901     default:
902     OK = codevalue != OP_PROP;
903     break;
904 nigel 77 }
905 nigel 87
906     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
907 nigel 77 }
908     break;
909     #endif
910    
911    
912    
913     /* ========================================================================== */
914     /* These opcodes likewise inspect the subject character, but have an
915     argument that is not a data character. It is one of these opcodes:
916 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
917     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
918 nigel 77
919     case OP_TYPEPLUS:
920     case OP_TYPEMINPLUS:
921 nigel 93 case OP_TYPEPOSPLUS:
922 nigel 77 count = current_state->count; /* Already matched */
923     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
924     if (clen > 0)
925     {
926     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
927     (c < 256 &&
928 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
929 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
930     {
931 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
932     {
933     active_count--; /* Remove non-match possibility */
934     next_active_state--;
935     }
936 nigel 77 count++;
937     ADD_NEW(state_offset, count);
938     }
939     }
940     break;
941    
942     /*-----------------------------------------------------------------*/
943     case OP_TYPEQUERY:
944     case OP_TYPEMINQUERY:
945 nigel 93 case OP_TYPEPOSQUERY:
946 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
947     if (clen > 0)
948     {
949     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
950     (c < 256 &&
951 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
952 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
953     {
954 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
955     {
956     active_count--; /* Remove non-match possibility */
957     next_active_state--;
958     }
959 nigel 77 ADD_NEW(state_offset + 2, 0);
960     }
961     }
962     break;
963    
964     /*-----------------------------------------------------------------*/
965     case OP_TYPESTAR:
966     case OP_TYPEMINSTAR:
967 nigel 93 case OP_TYPEPOSSTAR:
968 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
969     if (clen > 0)
970     {
971     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
972     (c < 256 &&
973 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
974 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
975     {
976 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
977     {
978     active_count--; /* Remove non-match possibility */
979     next_active_state--;
980     }
981 nigel 77 ADD_NEW(state_offset, 0);
982     }
983     }
984     break;
985    
986     /*-----------------------------------------------------------------*/
987     case OP_TYPEEXACT:
988 nigel 93 count = current_state->count; /* Number already matched */
989     if (clen > 0)
990     {
991     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
992     (c < 256 &&
993 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
994 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
995     {
996     if (++count >= GET2(code, 1))
997     { ADD_NEW(state_offset + 4, 0); }
998     else
999     { ADD_NEW(state_offset, count); }
1000     }
1001     }
1002     break;
1003    
1004     /*-----------------------------------------------------------------*/
1005 nigel 77 case OP_TYPEUPTO:
1006     case OP_TYPEMINUPTO:
1007 nigel 93 case OP_TYPEPOSUPTO:
1008     ADD_ACTIVE(state_offset + 4, 0);
1009 nigel 77 count = current_state->count; /* Number already matched */
1010     if (clen > 0)
1011     {
1012     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1013     (c < 256 &&
1014 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1015 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1016     {
1017 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1018     {
1019     active_count--; /* Remove non-match possibility */
1020     next_active_state--;
1021     }
1022 nigel 77 if (++count >= GET2(code, 1))
1023     { ADD_NEW(state_offset + 4, 0); }
1024     else
1025     { ADD_NEW(state_offset, count); }
1026     }
1027     }
1028     break;
1029    
1030     /* ========================================================================== */
1031     /* These are virtual opcodes that are used when something like
1032 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or
1033     OP_HSPACE (or a negated space type) as its argument. It keeps the code
1034     above fast for the other cases. The argument is in the d variable. */
1035 nigel 77
1036 ph10 151 #ifdef SUPPORT_UCP
1037 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1038     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1039 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1040 nigel 77 count = current_state->count; /* Already matched */
1041 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1042 nigel 77 if (clen > 0)
1043     {
1044 nigel 87 BOOL OK;
1045 ph10 349 const ucd_record * prop = GET_UCD(c);
1046 nigel 87 switch(code[2])
1047     {
1048     case PT_ANY:
1049     OK = TRUE;
1050     break;
1051    
1052     case PT_LAMP:
1053 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1054 nigel 87 break;
1055    
1056     case PT_GC:
1057 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1058 nigel 87 break;
1059    
1060     case PT_PC:
1061 ph10 349 OK = prop->chartype == code[3];
1062 nigel 87 break;
1063    
1064     case PT_SC:
1065 ph10 349 OK = prop->script == code[3];
1066 nigel 87 break;
1067    
1068     /* Should never occur, but keep compilers from grumbling. */
1069    
1070     default:
1071     OK = codevalue != OP_PROP;
1072     break;
1073     }
1074    
1075 nigel 93 if (OK == (d == OP_PROP))
1076     {
1077     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1078     {
1079     active_count--; /* Remove non-match possibility */
1080     next_active_state--;
1081     }
1082     count++;
1083     ADD_NEW(state_offset, count);
1084     }
1085 nigel 77 }
1086     break;
1087    
1088     /*-----------------------------------------------------------------*/
1089     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1090     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1091 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1092 nigel 77 count = current_state->count; /* Already matched */
1093     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1094 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1095 nigel 77 {
1096     const uschar *nptr = ptr + clen;
1097     int ncount = 0;
1098 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1099     {
1100     active_count--; /* Remove non-match possibility */
1101     next_active_state--;
1102     }
1103 nigel 77 while (nptr < end_subject)
1104     {
1105     int nd;
1106     int ndlen = 1;
1107     GETCHARLEN(nd, nptr, ndlen);
1108 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1109 nigel 77 ncount++;
1110     nptr += ndlen;
1111     }
1112     count++;
1113     ADD_NEW_DATA(-state_offset, count, ncount);
1114     }
1115     break;
1116 ph10 151 #endif
1117 nigel 77
1118     /*-----------------------------------------------------------------*/
1119 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1120     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1121     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1122     count = current_state->count; /* Already matched */
1123     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1124     if (clen > 0)
1125     {
1126     int ncount = 0;
1127     switch (c)
1128     {
1129     case 0x000b:
1130     case 0x000c:
1131     case 0x0085:
1132     case 0x2028:
1133     case 0x2029:
1134 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1135     goto ANYNL01;
1136    
1137     case 0x000d:
1138     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1139     /* Fall through */
1140    
1141     ANYNL01:
1142     case 0x000a:
1143 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1144     {
1145     active_count--; /* Remove non-match possibility */
1146     next_active_state--;
1147     }
1148     count++;
1149     ADD_NEW_DATA(-state_offset, count, ncount);
1150     break;
1151 ph10 231
1152 nigel 93 default:
1153     break;
1154     }
1155     }
1156     break;
1157    
1158     /*-----------------------------------------------------------------*/
1159 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1160     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1161     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1162     count = current_state->count; /* Already matched */
1163     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1164     if (clen > 0)
1165     {
1166 ph10 182 BOOL OK;
1167 ph10 178 switch (c)
1168     {
1169     case 0x000a:
1170     case 0x000b:
1171     case 0x000c:
1172     case 0x000d:
1173     case 0x0085:
1174     case 0x2028:
1175     case 0x2029:
1176     OK = TRUE;
1177 ph10 182 break;
1178 ph10 178
1179     default:
1180     OK = FALSE;
1181 ph10 182 break;
1182 ph10 178 }
1183    
1184     if (OK == (d == OP_VSPACE))
1185 ph10 182 {
1186 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1187     {
1188     active_count--; /* Remove non-match possibility */
1189     next_active_state--;
1190     }
1191     count++;
1192     ADD_NEW_DATA(-state_offset, count, 0);
1193     }
1194     }
1195     break;
1196    
1197     /*-----------------------------------------------------------------*/
1198     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1199     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1200     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1201     count = current_state->count; /* Already matched */
1202     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1203     if (clen > 0)
1204     {
1205 ph10 182 BOOL OK;
1206 ph10 178 switch (c)
1207     {
1208     case 0x09: /* HT */
1209     case 0x20: /* SPACE */
1210     case 0xa0: /* NBSP */
1211     case 0x1680: /* OGHAM SPACE MARK */
1212     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1213     case 0x2000: /* EN QUAD */
1214     case 0x2001: /* EM QUAD */
1215     case 0x2002: /* EN SPACE */
1216     case 0x2003: /* EM SPACE */
1217     case 0x2004: /* THREE-PER-EM SPACE */
1218     case 0x2005: /* FOUR-PER-EM SPACE */
1219     case 0x2006: /* SIX-PER-EM SPACE */
1220     case 0x2007: /* FIGURE SPACE */
1221     case 0x2008: /* PUNCTUATION SPACE */
1222     case 0x2009: /* THIN SPACE */
1223     case 0x200A: /* HAIR SPACE */
1224     case 0x202f: /* NARROW NO-BREAK SPACE */
1225     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1226     case 0x3000: /* IDEOGRAPHIC SPACE */
1227     OK = TRUE;
1228     break;
1229 ph10 182
1230 ph10 178 default:
1231     OK = FALSE;
1232     break;
1233     }
1234 ph10 182
1235 ph10 178 if (OK == (d == OP_HSPACE))
1236 ph10 182 {
1237 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1238     {
1239     active_count--; /* Remove non-match possibility */
1240     next_active_state--;
1241     }
1242     count++;
1243     ADD_NEW_DATA(-state_offset, count, 0);
1244     }
1245     }
1246     break;
1247    
1248     /*-----------------------------------------------------------------*/
1249 ph10 151 #ifdef SUPPORT_UCP
1250 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1251     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1252 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1253 nigel 87 count = 4;
1254 nigel 77 goto QS1;
1255    
1256     case OP_PROP_EXTRA + OP_TYPESTAR:
1257     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1258 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1259 nigel 77 count = 0;
1260    
1261     QS1:
1262    
1263 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1264 nigel 77 if (clen > 0)
1265     {
1266 nigel 87 BOOL OK;
1267 ph10 349 const ucd_record * prop = GET_UCD(c);
1268 nigel 87 switch(code[2])
1269     {
1270     case PT_ANY:
1271     OK = TRUE;
1272     break;
1273    
1274     case PT_LAMP:
1275 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1276 nigel 87 break;
1277    
1278     case PT_GC:
1279 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1280 nigel 87 break;
1281    
1282     case PT_PC:
1283 ph10 349 OK = prop->chartype == code[3];
1284 nigel 87 break;
1285    
1286     case PT_SC:
1287 ph10 349 OK = prop->script == code[3];
1288 nigel 87 break;
1289    
1290     /* Should never occur, but keep compilers from grumbling. */
1291    
1292     default:
1293     OK = codevalue != OP_PROP;
1294     break;
1295     }
1296    
1297 nigel 93 if (OK == (d == OP_PROP))
1298     {
1299     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1300     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1301     {
1302     active_count--; /* Remove non-match possibility */
1303     next_active_state--;
1304     }
1305     ADD_NEW(state_offset + count, 0);
1306     }
1307 nigel 77 }
1308     break;
1309    
1310     /*-----------------------------------------------------------------*/
1311     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1312     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1313 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1314 nigel 77 count = 2;
1315     goto QS2;
1316    
1317     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1318     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1319 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1320 nigel 77 count = 0;
1321    
1322     QS2:
1323    
1324     ADD_ACTIVE(state_offset + 2, 0);
1325 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1326 nigel 77 {
1327     const uschar *nptr = ptr + clen;
1328     int ncount = 0;
1329 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1330     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1331     {
1332     active_count--; /* Remove non-match possibility */
1333     next_active_state--;
1334     }
1335 nigel 77 while (nptr < end_subject)
1336     {
1337     int nd;
1338     int ndlen = 1;
1339     GETCHARLEN(nd, nptr, ndlen);
1340 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1341 nigel 77 ncount++;
1342     nptr += ndlen;
1343     }
1344     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1345     }
1346     break;
1347 ph10 151 #endif
1348 nigel 77
1349     /*-----------------------------------------------------------------*/
1350 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1351     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1352     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1353     count = 2;
1354     goto QS3;
1355    
1356     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1357     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1358     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1359     count = 0;
1360    
1361     QS3:
1362     ADD_ACTIVE(state_offset + 2, 0);
1363     if (clen > 0)
1364     {
1365     int ncount = 0;
1366     switch (c)
1367     {
1368     case 0x000b:
1369     case 0x000c:
1370     case 0x0085:
1371     case 0x2028:
1372     case 0x2029:
1373 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1374     goto ANYNL02;
1375    
1376     case 0x000d:
1377     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1378     /* Fall through */
1379    
1380     ANYNL02:
1381     case 0x000a:
1382 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1383     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1384     {
1385     active_count--; /* Remove non-match possibility */
1386     next_active_state--;
1387     }
1388     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1389     break;
1390 ph10 231
1391 nigel 93 default:
1392     break;
1393     }
1394     }
1395     break;
1396    
1397     /*-----------------------------------------------------------------*/
1398 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1399     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1400     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1401     count = 2;
1402     goto QS4;
1403    
1404     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1405     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1406     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1407     count = 0;
1408    
1409     QS4:
1410     ADD_ACTIVE(state_offset + 2, 0);
1411     if (clen > 0)
1412     {
1413 ph10 182 BOOL OK;
1414 ph10 178 switch (c)
1415     {
1416     case 0x000a:
1417     case 0x000b:
1418     case 0x000c:
1419     case 0x000d:
1420     case 0x0085:
1421     case 0x2028:
1422     case 0x2029:
1423     OK = TRUE;
1424     break;
1425 ph10 182
1426 ph10 178 default:
1427     OK = FALSE;
1428     break;
1429     }
1430     if (OK == (d == OP_VSPACE))
1431 ph10 182 {
1432 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1433     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1434     {
1435     active_count--; /* Remove non-match possibility */
1436     next_active_state--;
1437     }
1438     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1439     }
1440     }
1441     break;
1442    
1443     /*-----------------------------------------------------------------*/
1444     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1445     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1446     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1447     count = 2;
1448     goto QS5;
1449    
1450     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1451     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1452     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1453     count = 0;
1454    
1455     QS5:
1456     ADD_ACTIVE(state_offset + 2, 0);
1457     if (clen > 0)
1458     {
1459 ph10 182 BOOL OK;
1460 ph10 178 switch (c)
1461     {
1462     case 0x09: /* HT */
1463     case 0x20: /* SPACE */
1464     case 0xa0: /* NBSP */
1465     case 0x1680: /* OGHAM SPACE MARK */
1466     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1467     case 0x2000: /* EN QUAD */
1468     case 0x2001: /* EM QUAD */
1469     case 0x2002: /* EN SPACE */
1470     case 0x2003: /* EM SPACE */
1471     case 0x2004: /* THREE-PER-EM SPACE */
1472     case 0x2005: /* FOUR-PER-EM SPACE */
1473     case 0x2006: /* SIX-PER-EM SPACE */
1474     case 0x2007: /* FIGURE SPACE */
1475     case 0x2008: /* PUNCTUATION SPACE */
1476     case 0x2009: /* THIN SPACE */
1477     case 0x200A: /* HAIR SPACE */
1478     case 0x202f: /* NARROW NO-BREAK SPACE */
1479     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1480     case 0x3000: /* IDEOGRAPHIC SPACE */
1481     OK = TRUE;
1482     break;
1483 ph10 182
1484 ph10 178 default:
1485     OK = FALSE;
1486     break;
1487     }
1488 ph10 182
1489 ph10 178 if (OK == (d == OP_HSPACE))
1490 ph10 182 {
1491 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1492     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1493     {
1494     active_count--; /* Remove non-match possibility */
1495     next_active_state--;
1496     }
1497     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1498     }
1499     }
1500     break;
1501    
1502     /*-----------------------------------------------------------------*/
1503 ph10 151 #ifdef SUPPORT_UCP
1504 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1505     case OP_PROP_EXTRA + OP_TYPEUPTO:
1506     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1507 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1508 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1509 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1510 nigel 77 count = current_state->count; /* Number already matched */
1511     if (clen > 0)
1512     {
1513 nigel 87 BOOL OK;
1514 ph10 349 const ucd_record * prop = GET_UCD(c);
1515 nigel 87 switch(code[4])
1516 nigel 77 {
1517 nigel 87 case PT_ANY:
1518     OK = TRUE;
1519     break;
1520    
1521     case PT_LAMP:
1522 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1523 nigel 87 break;
1524    
1525     case PT_GC:
1526 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1527 nigel 87 break;
1528    
1529     case PT_PC:
1530 ph10 349 OK = prop->chartype == code[5];
1531 nigel 87 break;
1532    
1533     case PT_SC:
1534 ph10 349 OK = prop->script == code[5];
1535 nigel 87 break;
1536    
1537     /* Should never occur, but keep compilers from grumbling. */
1538    
1539     default:
1540     OK = codevalue != OP_PROP;
1541     break;
1542     }
1543    
1544     if (OK == (d == OP_PROP))
1545     {
1546 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1547     {
1548     active_count--; /* Remove non-match possibility */
1549     next_active_state--;
1550     }
1551 nigel 77 if (++count >= GET2(code, 1))
1552 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1553 nigel 77 else
1554     { ADD_NEW(state_offset, count); }
1555     }
1556     }
1557     break;
1558    
1559     /*-----------------------------------------------------------------*/
1560     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1561     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1562     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1563 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1564 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1565     { ADD_ACTIVE(state_offset + 4, 0); }
1566     count = current_state->count; /* Number already matched */
1567 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1568 nigel 77 {
1569     const uschar *nptr = ptr + clen;
1570     int ncount = 0;
1571 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1572     {
1573     active_count--; /* Remove non-match possibility */
1574     next_active_state--;
1575     }
1576 nigel 77 while (nptr < end_subject)
1577     {
1578     int nd;
1579     int ndlen = 1;
1580     GETCHARLEN(nd, nptr, ndlen);
1581 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1582 nigel 77 ncount++;
1583     nptr += ndlen;
1584     }
1585     if (++count >= GET2(code, 1))
1586     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1587     else
1588     { ADD_NEW_DATA(-state_offset, count, ncount); }
1589     }
1590     break;
1591 ph10 151 #endif
1592 nigel 77
1593 nigel 93 /*-----------------------------------------------------------------*/
1594     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1595     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1596     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1597     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1598     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1599     { ADD_ACTIVE(state_offset + 4, 0); }
1600     count = current_state->count; /* Number already matched */
1601     if (clen > 0)
1602     {
1603     int ncount = 0;
1604     switch (c)
1605     {
1606     case 0x000b:
1607     case 0x000c:
1608     case 0x0085:
1609     case 0x2028:
1610     case 0x2029:
1611 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1612     goto ANYNL03;
1613    
1614     case 0x000d:
1615     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1616     /* Fall through */
1617    
1618     ANYNL03:
1619     case 0x000a:
1620 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1621     {
1622     active_count--; /* Remove non-match possibility */
1623     next_active_state--;
1624     }
1625     if (++count >= GET2(code, 1))
1626     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1627     else
1628     { ADD_NEW_DATA(-state_offset, count, ncount); }
1629     break;
1630 ph10 231
1631 nigel 93 default:
1632     break;
1633     }
1634     }
1635     break;
1636    
1637 ph10 178 /*-----------------------------------------------------------------*/
1638     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1639     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1640     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1641     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1642     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1643     { ADD_ACTIVE(state_offset + 4, 0); }
1644     count = current_state->count; /* Number already matched */
1645     if (clen > 0)
1646     {
1647 ph10 182 BOOL OK;
1648 ph10 178 switch (c)
1649     {
1650     case 0x000a:
1651     case 0x000b:
1652     case 0x000c:
1653     case 0x000d:
1654     case 0x0085:
1655     case 0x2028:
1656     case 0x2029:
1657     OK = TRUE;
1658     break;
1659 ph10 182
1660 ph10 178 default:
1661     OK = FALSE;
1662     }
1663 ph10 182
1664 ph10 178 if (OK == (d == OP_VSPACE))
1665 ph10 182 {
1666 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1667     {
1668     active_count--; /* Remove non-match possibility */
1669     next_active_state--;
1670     }
1671     if (++count >= GET2(code, 1))
1672     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1673     else
1674     { ADD_NEW_DATA(-state_offset, count, 0); }
1675     }
1676     }
1677     break;
1678    
1679     /*-----------------------------------------------------------------*/
1680     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1681     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1682     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1683     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1684     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1685     { ADD_ACTIVE(state_offset + 4, 0); }
1686     count = current_state->count; /* Number already matched */
1687     if (clen > 0)
1688     {
1689 ph10 182 BOOL OK;
1690 ph10 178 switch (c)
1691     {
1692     case 0x09: /* HT */
1693     case 0x20: /* SPACE */
1694     case 0xa0: /* NBSP */
1695     case 0x1680: /* OGHAM SPACE MARK */
1696     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1697     case 0x2000: /* EN QUAD */
1698     case 0x2001: /* EM QUAD */
1699     case 0x2002: /* EN SPACE */
1700     case 0x2003: /* EM SPACE */
1701     case 0x2004: /* THREE-PER-EM SPACE */
1702     case 0x2005: /* FOUR-PER-EM SPACE */
1703     case 0x2006: /* SIX-PER-EM SPACE */
1704     case 0x2007: /* FIGURE SPACE */
1705     case 0x2008: /* PUNCTUATION SPACE */
1706     case 0x2009: /* THIN SPACE */
1707     case 0x200A: /* HAIR SPACE */
1708     case 0x202f: /* NARROW NO-BREAK SPACE */
1709     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1710     case 0x3000: /* IDEOGRAPHIC SPACE */
1711     OK = TRUE;
1712     break;
1713 ph10 182
1714 ph10 178 default:
1715     OK = FALSE;
1716     break;
1717     }
1718 ph10 182
1719 ph10 178 if (OK == (d == OP_HSPACE))
1720 ph10 182 {
1721 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1722     {
1723     active_count--; /* Remove non-match possibility */
1724     next_active_state--;
1725     }
1726     if (++count >= GET2(code, 1))
1727     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1728     else
1729     { ADD_NEW_DATA(-state_offset, count, 0); }
1730     }
1731     }
1732     break;
1733    
1734 nigel 77 /* ========================================================================== */
1735     /* These opcodes are followed by a character that is usually compared
1736     to the current subject character; it is loaded into d. We still get
1737     here even if there is no subject character, because in some cases zero
1738     repetitions are permitted. */
1739    
1740     /*-----------------------------------------------------------------*/
1741     case OP_CHAR:
1742     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1743     break;
1744    
1745     /*-----------------------------------------------------------------*/
1746     case OP_CHARNC:
1747     if (clen == 0) break;
1748    
1749     #ifdef SUPPORT_UTF8
1750     if (utf8)
1751     {
1752     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1753     {
1754 nigel 93 unsigned int othercase;
1755 nigel 77 if (c < 128) othercase = fcc[c]; else
1756    
1757     /* If we have Unicode property support, we can use it to test the
1758 nigel 87 other case of the character. */
1759 nigel 77
1760     #ifdef SUPPORT_UCP
1761 ph10 349 othercase = UCD_OTHERCASE(c);
1762 nigel 87 #else
1763 nigel 93 othercase = NOTACHAR;
1764 nigel 77 #endif
1765    
1766     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1767     }
1768     }
1769     else
1770     #endif /* SUPPORT_UTF8 */
1771    
1772     /* Non-UTF-8 mode */
1773     {
1774     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1775     }
1776     break;
1777    
1778    
1779     #ifdef SUPPORT_UCP
1780     /*-----------------------------------------------------------------*/
1781     /* This is a tricky one because it can match more than one character.
1782     Find out how many characters to skip, and then set up a negative state
1783     to wait for them to pass before continuing. */
1784    
1785     case OP_EXTUNI:
1786 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1787 nigel 77 {
1788     const uschar *nptr = ptr + clen;
1789     int ncount = 0;
1790     while (nptr < end_subject)
1791     {
1792     int nclen = 1;
1793     GETCHARLEN(c, nptr, nclen);
1794 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1795 nigel 77 ncount++;
1796     nptr += nclen;
1797     }
1798     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1799     }
1800     break;
1801     #endif
1802    
1803     /*-----------------------------------------------------------------*/
1804 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1805     character (when CR is followed by LF). In this case, set up a negative
1806     state to wait for one character to pass before continuing. */
1807    
1808     case OP_ANYNL:
1809     if (clen > 0) switch(c)
1810     {
1811     case 0x000b:
1812     case 0x000c:
1813     case 0x0085:
1814     case 0x2028:
1815     case 0x2029:
1816 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1817    
1818     case 0x000a:
1819 nigel 93 ADD_NEW(state_offset + 1, 0);
1820     break;
1821 ph10 231
1822 nigel 93 case 0x000d:
1823     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1824     {
1825     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1826     }
1827     else
1828     {
1829     ADD_NEW(state_offset + 1, 0);
1830     }
1831     break;
1832     }
1833     break;
1834    
1835     /*-----------------------------------------------------------------*/
1836 ph10 178 case OP_NOT_VSPACE:
1837     if (clen > 0) switch(c)
1838     {
1839     case 0x000a:
1840     case 0x000b:
1841     case 0x000c:
1842     case 0x000d:
1843     case 0x0085:
1844     case 0x2028:
1845     case 0x2029:
1846     break;
1847 ph10 182
1848     default:
1849 ph10 178 ADD_NEW(state_offset + 1, 0);
1850     break;
1851     }
1852     break;
1853    
1854     /*-----------------------------------------------------------------*/
1855     case OP_VSPACE:
1856     if (clen > 0) switch(c)
1857     {
1858     case 0x000a:
1859     case 0x000b:
1860     case 0x000c:
1861     case 0x000d:
1862     case 0x0085:
1863     case 0x2028:
1864     case 0x2029:
1865     ADD_NEW(state_offset + 1, 0);
1866     break;
1867 ph10 182
1868 ph10 178 default: break;
1869     }
1870     break;
1871    
1872     /*-----------------------------------------------------------------*/
1873     case OP_NOT_HSPACE:
1874     if (clen > 0) switch(c)
1875     {
1876     case 0x09: /* HT */
1877     case 0x20: /* SPACE */
1878     case 0xa0: /* NBSP */
1879     case 0x1680: /* OGHAM SPACE MARK */
1880     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1881     case 0x2000: /* EN QUAD */
1882     case 0x2001: /* EM QUAD */
1883     case 0x2002: /* EN SPACE */
1884     case 0x2003: /* EM SPACE */
1885     case 0x2004: /* THREE-PER-EM SPACE */
1886     case 0x2005: /* FOUR-PER-EM SPACE */
1887     case 0x2006: /* SIX-PER-EM SPACE */
1888     case 0x2007: /* FIGURE SPACE */
1889     case 0x2008: /* PUNCTUATION SPACE */
1890     case 0x2009: /* THIN SPACE */
1891     case 0x200A: /* HAIR SPACE */
1892     case 0x202f: /* NARROW NO-BREAK SPACE */
1893     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1894     case 0x3000: /* IDEOGRAPHIC SPACE */
1895     break;
1896 ph10 182
1897     default:
1898 ph10 178 ADD_NEW(state_offset + 1, 0);
1899     break;
1900     }
1901     break;
1902    
1903     /*-----------------------------------------------------------------*/
1904     case OP_HSPACE:
1905     if (clen > 0) switch(c)
1906     {
1907     case 0x09: /* HT */
1908     case 0x20: /* SPACE */
1909     case 0xa0: /* NBSP */
1910     case 0x1680: /* OGHAM SPACE MARK */
1911     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1912     case 0x2000: /* EN QUAD */
1913     case 0x2001: /* EM QUAD */
1914     case 0x2002: /* EN SPACE */
1915     case 0x2003: /* EM SPACE */
1916     case 0x2004: /* THREE-PER-EM SPACE */
1917     case 0x2005: /* FOUR-PER-EM SPACE */
1918     case 0x2006: /* SIX-PER-EM SPACE */
1919     case 0x2007: /* FIGURE SPACE */
1920     case 0x2008: /* PUNCTUATION SPACE */
1921     case 0x2009: /* THIN SPACE */
1922     case 0x200A: /* HAIR SPACE */
1923     case 0x202f: /* NARROW NO-BREAK SPACE */
1924     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1925     case 0x3000: /* IDEOGRAPHIC SPACE */
1926     ADD_NEW(state_offset + 1, 0);
1927     break;
1928     }
1929     break;
1930    
1931     /*-----------------------------------------------------------------*/
1932 nigel 77 /* Match a negated single character. This is only used for one-byte
1933     characters, that is, we know that d < 256. The character we are
1934     checking (c) can be multibyte. */
1935    
1936     case OP_NOT:
1937     if (clen > 0)
1938     {
1939 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1940 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1941     }
1942     break;
1943    
1944     /*-----------------------------------------------------------------*/
1945     case OP_PLUS:
1946     case OP_MINPLUS:
1947 nigel 93 case OP_POSPLUS:
1948 nigel 77 case OP_NOTPLUS:
1949     case OP_NOTMINPLUS:
1950 nigel 93 case OP_NOTPOSPLUS:
1951 nigel 77 count = current_state->count; /* Already matched */
1952     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1953     if (clen > 0)
1954     {
1955 nigel 93 unsigned int otherd = NOTACHAR;
1956 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1957     {
1958     #ifdef SUPPORT_UTF8
1959 nigel 87 if (utf8 && d >= 128)
1960 nigel 77 {
1961     #ifdef SUPPORT_UCP
1962 ph10 349 otherd = UCD_OTHERCASE(d);
1963 nigel 77 #endif /* SUPPORT_UCP */
1964     }
1965     else
1966     #endif /* SUPPORT_UTF8 */
1967     otherd = fcc[d];
1968     }
1969     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1970 nigel 93 {
1971     if (count > 0 &&
1972     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1973     {
1974     active_count--; /* Remove non-match possibility */
1975     next_active_state--;
1976     }
1977     count++;
1978     ADD_NEW(state_offset, count);
1979     }
1980 nigel 77 }
1981     break;
1982    
1983     /*-----------------------------------------------------------------*/
1984     case OP_QUERY:
1985     case OP_MINQUERY:
1986 nigel 93 case OP_POSQUERY:
1987 nigel 77 case OP_NOTQUERY:
1988     case OP_NOTMINQUERY:
1989 nigel 93 case OP_NOTPOSQUERY:
1990 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1991     if (clen > 0)
1992     {
1993 nigel 93 unsigned int otherd = NOTACHAR;
1994 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1995 nigel 77 {
1996     #ifdef SUPPORT_UTF8
1997 nigel 87 if (utf8 && d >= 128)
1998 nigel 77 {
1999     #ifdef SUPPORT_UCP
2000 ph10 349 otherd = UCD_OTHERCASE(d);
2001 nigel 77 #endif /* SUPPORT_UCP */
2002     }
2003     else
2004     #endif /* SUPPORT_UTF8 */
2005     otherd = fcc[d];
2006     }
2007     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2008 nigel 93 {
2009     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2010     {
2011     active_count--; /* Remove non-match possibility */
2012     next_active_state--;
2013     }
2014     ADD_NEW(state_offset + dlen + 1, 0);
2015     }
2016 nigel 77 }
2017     break;
2018    
2019     /*-----------------------------------------------------------------*/
2020     case OP_STAR:
2021     case OP_MINSTAR:
2022 nigel 93 case OP_POSSTAR:
2023 nigel 77 case OP_NOTSTAR:
2024     case OP_NOTMINSTAR:
2025 nigel 93 case OP_NOTPOSSTAR:
2026 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2027     if (clen > 0)
2028     {
2029 nigel 93 unsigned int otherd = NOTACHAR;
2030 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2031 nigel 77 {
2032     #ifdef SUPPORT_UTF8
2033 nigel 87 if (utf8 && d >= 128)
2034 nigel 77 {
2035     #ifdef SUPPORT_UCP
2036 ph10 349 otherd = UCD_OTHERCASE(d);
2037 nigel 77 #endif /* SUPPORT_UCP */
2038     }
2039     else
2040     #endif /* SUPPORT_UTF8 */
2041     otherd = fcc[d];
2042     }
2043     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2044 nigel 93 {
2045     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2046     {
2047     active_count--; /* Remove non-match possibility */
2048     next_active_state--;
2049     }
2050     ADD_NEW(state_offset, 0);
2051     }
2052 nigel 77 }
2053     break;
2054    
2055     /*-----------------------------------------------------------------*/
2056     case OP_EXACT:
2057 nigel 93 case OP_NOTEXACT:
2058     count = current_state->count; /* Number already matched */
2059     if (clen > 0)
2060     {
2061     unsigned int otherd = NOTACHAR;
2062     if ((ims & PCRE_CASELESS) != 0)
2063     {
2064     #ifdef SUPPORT_UTF8
2065     if (utf8 && d >= 128)
2066     {
2067     #ifdef SUPPORT_UCP
2068 ph10 349 otherd = UCD_OTHERCASE(d);
2069 nigel 93 #endif /* SUPPORT_UCP */
2070     }
2071     else
2072     #endif /* SUPPORT_UTF8 */
2073     otherd = fcc[d];
2074     }
2075     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2076     {
2077     if (++count >= GET2(code, 1))
2078     { ADD_NEW(state_offset + dlen + 3, 0); }
2079     else
2080     { ADD_NEW(state_offset, count); }
2081     }
2082     }
2083     break;
2084    
2085     /*-----------------------------------------------------------------*/
2086 nigel 77 case OP_UPTO:
2087     case OP_MINUPTO:
2088 nigel 93 case OP_POSUPTO:
2089 nigel 77 case OP_NOTUPTO:
2090     case OP_NOTMINUPTO:
2091 nigel 93 case OP_NOTPOSUPTO:
2092     ADD_ACTIVE(state_offset + dlen + 3, 0);
2093 nigel 77 count = current_state->count; /* Number already matched */
2094     if (clen > 0)
2095     {
2096 nigel 93 unsigned int otherd = NOTACHAR;
2097 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2098     {
2099     #ifdef SUPPORT_UTF8
2100 nigel 87 if (utf8 && d >= 128)
2101 nigel 77 {
2102     #ifdef SUPPORT_UCP
2103 ph10 349 otherd = UCD_OTHERCASE(d);
2104 nigel 77 #endif /* SUPPORT_UCP */
2105     }
2106     else
2107     #endif /* SUPPORT_UTF8 */
2108     otherd = fcc[d];
2109     }
2110     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2111     {
2112 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2113     {
2114     active_count--; /* Remove non-match possibility */
2115     next_active_state--;
2116     }
2117 nigel 77 if (++count >= GET2(code, 1))
2118     { ADD_NEW(state_offset + dlen + 3, 0); }
2119     else
2120     { ADD_NEW(state_offset, count); }
2121     }
2122     }
2123     break;
2124    
2125    
2126     /* ========================================================================== */
2127     /* These are the class-handling opcodes */
2128    
2129     case OP_CLASS:
2130     case OP_NCLASS:
2131     case OP_XCLASS:
2132     {
2133     BOOL isinclass = FALSE;
2134     int next_state_offset;
2135     const uschar *ecode;
2136    
2137     /* For a simple class, there is always just a 32-byte table, and we
2138     can set isinclass from it. */
2139    
2140     if (codevalue != OP_XCLASS)
2141     {
2142     ecode = code + 33;
2143     if (clen > 0)
2144     {
2145     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2146     ((code[1 + c/8] & (1 << (c&7))) != 0);
2147     }
2148     }
2149    
2150     /* An extended class may have a table or a list of single characters,
2151     ranges, or both, and it may be positive or negative. There's a
2152     function that sorts all this out. */
2153    
2154     else
2155     {
2156     ecode = code + GET(code, 1);
2157     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2158     }
2159    
2160     /* At this point, isinclass is set for all kinds of class, and ecode
2161     points to the byte after the end of the class. If there is a
2162     quantifier, this is where it will be. */
2163    
2164     next_state_offset = ecode - start_code;
2165    
2166     switch (*ecode)
2167     {
2168     case OP_CRSTAR:
2169     case OP_CRMINSTAR:
2170     ADD_ACTIVE(next_state_offset + 1, 0);
2171     if (isinclass) { ADD_NEW(state_offset, 0); }
2172     break;
2173    
2174     case OP_CRPLUS:
2175     case OP_CRMINPLUS:
2176     count = current_state->count; /* Already matched */
2177     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2178     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2179     break;
2180    
2181     case OP_CRQUERY:
2182     case OP_CRMINQUERY:
2183     ADD_ACTIVE(next_state_offset + 1, 0);
2184     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2185     break;
2186    
2187     case OP_CRRANGE:
2188     case OP_CRMINRANGE:
2189     count = current_state->count; /* Already matched */
2190     if (count >= GET2(ecode, 1))
2191     { ADD_ACTIVE(next_state_offset + 5, 0); }
2192     if (isinclass)
2193     {
2194 nigel 91 int max = GET2(ecode, 3);
2195     if (++count >= max && max != 0) /* Max 0 => no limit */
2196 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2197     else
2198     { ADD_NEW(state_offset, count); }
2199     }
2200     break;
2201    
2202     default:
2203     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2204     break;
2205     }
2206     }
2207     break;
2208    
2209     /* ========================================================================== */
2210     /* These are the opcodes for fancy brackets of various kinds. We have
2211 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2212     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2213 ph10 341 though the other "backtracking verbs" are not supported. */
2214 ph10 345
2215 ph10 341 case OP_FAIL:
2216 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2217 ph10 345 break;
2218 nigel 77
2219     case OP_ASSERT:
2220     case OP_ASSERT_NOT:
2221     case OP_ASSERTBACK:
2222     case OP_ASSERTBACK_NOT:
2223     {
2224     int rc;
2225     int local_offsets[2];
2226     int local_workspace[1000];
2227     const uschar *endasscode = code + GET(code, 1);
2228    
2229     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2230    
2231     rc = internal_dfa_exec(
2232     md, /* static match data */
2233     code, /* this subexpression's code */
2234     ptr, /* where we currently are */
2235     ptr - start_subject, /* start offset */
2236     local_offsets, /* offset vector */
2237     sizeof(local_offsets)/sizeof(int), /* size of same */
2238     local_workspace, /* workspace vector */
2239     sizeof(local_workspace)/sizeof(int), /* size of same */
2240     ims, /* the current ims flags */
2241     rlevel, /* function recursion level */
2242     recursing); /* pass on regex recursion */
2243    
2244     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2245     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2246     }
2247     break;
2248    
2249     /*-----------------------------------------------------------------*/
2250     case OP_COND:
2251 nigel 93 case OP_SCOND:
2252 nigel 77 {
2253     int local_offsets[1000];
2254     int local_workspace[1000];
2255 ph10 406 int codelink = GET(code, 1);
2256 ph10 397 int condcode;
2257 ph10 406
2258 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2259 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2260 ph10 398 happen for the other conditions. */
2261 nigel 77
2262 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2263 ph10 406 {
2264     rrc = 0;
2265 ph10 397 if (pcre_callout != NULL)
2266     {
2267     pcre_callout_block cb;
2268     cb.version = 1; /* Version 1 of the callout block */
2269     cb.callout_number = code[LINK_SIZE+2];
2270     cb.offset_vector = offsets;
2271     cb.subject = (PCRE_SPTR)start_subject;
2272     cb.subject_length = end_subject - start_subject;
2273     cb.start_match = current_subject - start_subject;
2274     cb.current_position = ptr - start_subject;
2275     cb.pattern_position = GET(code, LINK_SIZE + 3);
2276     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2277     cb.capture_top = 1;
2278     cb.capture_last = -1;
2279     cb.callout_data = md->callout_data;
2280     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2281     }
2282 ph10 398 if (rrc > 0) break; /* Fail this thread */
2283     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2284 ph10 406 }
2285 ph10 398
2286 ph10 397 condcode = code[LINK_SIZE+1];
2287 ph10 406
2288 nigel 93 /* Back reference conditions are not supported */
2289 nigel 77
2290 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2291    
2292     /* The DEFINE condition is always false */
2293    
2294     if (condcode == OP_DEF)
2295 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2296 nigel 93
2297     /* The only supported version of OP_RREF is for the value RREF_ANY,
2298     which means "test if in any recursion". We can't test for specifically
2299     recursed groups. */
2300    
2301     else if (condcode == OP_RREF)
2302     {
2303 nigel 77 int value = GET2(code, LINK_SIZE+2);
2304 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2305 ph10 406 if (recursing > 0)
2306 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2307     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2308 nigel 77 }
2309    
2310     /* Otherwise, the condition is an assertion */
2311    
2312     else
2313     {
2314     int rc;
2315     const uschar *asscode = code + LINK_SIZE + 1;
2316     const uschar *endasscode = asscode + GET(asscode, 1);
2317    
2318     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2319    
2320     rc = internal_dfa_exec(
2321     md, /* fixed match data */
2322     asscode, /* this subexpression's code */
2323     ptr, /* where we currently are */
2324     ptr - start_subject, /* start offset */
2325     local_offsets, /* offset vector */
2326     sizeof(local_offsets)/sizeof(int), /* size of same */
2327     local_workspace, /* workspace vector */
2328     sizeof(local_workspace)/sizeof(int), /* size of same */
2329     ims, /* the current ims flags */
2330     rlevel, /* function recursion level */
2331     recursing); /* pass on regex recursion */
2332    
2333     if ((rc >= 0) ==
2334     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2335 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2336 nigel 77 else
2337 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2338 nigel 77 }
2339     }
2340     break;
2341    
2342     /*-----------------------------------------------------------------*/
2343     case OP_RECURSE:
2344     {
2345     int local_offsets[1000];
2346     int local_workspace[1000];
2347     int rc;
2348    
2349     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2350     recursing + 1));
2351    
2352     rc = internal_dfa_exec(
2353     md, /* fixed match data */
2354     start_code + GET(code, 1), /* this subexpression's code */
2355     ptr, /* where we currently are */
2356     ptr - start_subject, /* start offset */
2357     local_offsets, /* offset vector */
2358     sizeof(local_offsets)/sizeof(int), /* size of same */
2359     local_workspace, /* workspace vector */
2360     sizeof(local_workspace)/sizeof(int), /* size of same */
2361     ims, /* the current ims flags */
2362     rlevel, /* function recursion level */
2363     recursing + 1); /* regex recurse level */
2364    
2365     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2366     recursing + 1, rc));
2367    
2368     /* Ran out of internal offsets */
2369    
2370     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2371    
2372     /* For each successful matched substring, set up the next state with a
2373     count of characters to skip before trying it. Note that the count is in
2374     characters, not bytes. */
2375    
2376     if (rc > 0)
2377     {
2378     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2379     {
2380     const uschar *p = start_subject + local_offsets[rc];
2381     const uschar *pp = start_subject + local_offsets[rc+1];
2382     int charcount = local_offsets[rc+1] - local_offsets[rc];
2383     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2384     if (charcount > 0)
2385     {
2386     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2387     }
2388     else
2389     {
2390     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2391     }
2392     }
2393     }
2394     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2395     }
2396     break;
2397    
2398     /*-----------------------------------------------------------------*/
2399     case OP_ONCE:
2400     {
2401     int local_offsets[2];
2402     int local_workspace[1000];
2403    
2404     int rc = internal_dfa_exec(
2405     md, /* fixed match data */
2406     code, /* this subexpression's code */
2407     ptr, /* where we currently are */
2408     ptr - start_subject, /* start offset */
2409     local_offsets, /* offset vector */
2410     sizeof(local_offsets)/sizeof(int), /* size of same */
2411     local_workspace, /* workspace vector */
2412     sizeof(local_workspace)/sizeof(int), /* size of same */
2413     ims, /* the current ims flags */
2414     rlevel, /* function recursion level */
2415     recursing); /* pass on regex recursion */
2416    
2417     if (rc >= 0)
2418     {
2419     const uschar *end_subpattern = code;
2420     int charcount = local_offsets[1] - local_offsets[0];
2421     int next_state_offset, repeat_state_offset;
2422    
2423     do { end_subpattern += GET(end_subpattern, 1); }
2424     while (*end_subpattern == OP_ALT);
2425     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2426    
2427     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2428     arrange for the repeat state also to be added to the relevant list.
2429     Calculate the offset, or set -1 for no repeat. */
2430    
2431     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2432     *end_subpattern == OP_KETRMIN)?
2433     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2434    
2435     /* If we have matched an empty string, add the next state at the
2436     current character pointer. This is important so that the duplicate
2437     checking kicks in, which is what breaks infinite loops that match an
2438     empty string. */
2439    
2440     if (charcount == 0)
2441     {
2442     ADD_ACTIVE(next_state_offset, 0);
2443     }
2444    
2445     /* Optimization: if there are no more active states, and there
2446     are no new states yet set up, then skip over the subject string
2447     right here, to save looping. Otherwise, set up the new state to swing
2448     into action when the end of the substring is reached. */
2449    
2450     else if (i + 1 >= active_count && new_count == 0)
2451     {
2452     ptr += charcount;
2453     clen = 0;
2454     ADD_NEW(next_state_offset, 0);
2455    
2456     /* If we are adding a repeat state at the new character position,
2457     we must fudge things so that it is the only current state.
2458     Otherwise, it might be a duplicate of one we processed before, and
2459     that would cause it to be skipped. */
2460    
2461     if (repeat_state_offset >= 0)
2462     {
2463     next_active_state = active_states;
2464     active_count = 0;
2465     i = -1;
2466     ADD_ACTIVE(repeat_state_offset, 0);
2467     }
2468     }
2469     else
2470     {
2471     const uschar *p = start_subject + local_offsets[0];
2472     const uschar *pp = start_subject + local_offsets[1];
2473     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2474     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2475     if (repeat_state_offset >= 0)
2476     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2477     }
2478    
2479     }
2480     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2481     }
2482     break;
2483    
2484    
2485     /* ========================================================================== */
2486     /* Handle callouts */
2487    
2488     case OP_CALLOUT:
2489 ph10 406 rrc = 0;
2490 nigel 77 if (pcre_callout != NULL)
2491     {
2492     pcre_callout_block cb;
2493     cb.version = 1; /* Version 1 of the callout block */
2494     cb.callout_number = code[1];
2495     cb.offset_vector = offsets;
2496 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2497 nigel 77 cb.subject_length = end_subject - start_subject;
2498     cb.start_match = current_subject - start_subject;
2499     cb.current_position = ptr - start_subject;
2500     cb.pattern_position = GET(code, 2);
2501     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2502     cb.capture_top = 1;
2503     cb.capture_last = -1;
2504     cb.callout_data = md->callout_data;
2505     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2506 ph10 406 }
2507     if (rrc == 0)
2508     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2509 nigel 77 break;
2510    
2511    
2512     /* ========================================================================== */
2513     default: /* Unsupported opcode */
2514     return PCRE_ERROR_DFA_UITEM;
2515     }
2516    
2517     NEXT_ACTIVE_STATE: continue;
2518    
2519     } /* End of loop scanning active states */
2520    
2521     /* We have finished the processing at the current subject character. If no
2522     new states have been set for the next character, we have found all the
2523     matches that we are going to find. If we are at the top level and partial
2524 ph10 428 matching has been requested, check for appropriate conditions. The
2525     "forced_fail" variable counts the number of (*F) encountered for the character. If it
2526     is equal to the original active_count (saved in workspace[1]) it means that
2527     (*F) was found on every active state. In this case we don't want to give a
2528     partial match. */
2529 nigel 77
2530     if (new_count <= 0)
2531     {
2532 ph10 427 if (rlevel == 1 && /* Top level, and */
2533 ph10 428 reached_end != workspace[1] && /* Not all reached end */
2534     forced_fail != workspace[1] && /* Not all forced fail & */
2535 ph10 427 ( /* either... */
2536     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2537     || /* or... */
2538     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2539     match_count < 0) /* no matches */
2540     ) && /* And... */
2541     ptr >= end_subject && /* Reached end of subject */
2542     ptr > current_subject) /* Matched non-empty string */
2543 nigel 77 {
2544     if (offsetcount >= 2)
2545     {
2546 ph10 435 offsets[0] = md->start_used_ptr - start_subject;
2547 nigel 77 offsets[1] = end_subject - start_subject;
2548     }
2549     match_count = PCRE_ERROR_PARTIAL;
2550     }
2551    
2552     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2553     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2554     rlevel*2-2, SP));
2555 nigel 91 break; /* In effect, "return", but see the comment below */
2556 nigel 77 }
2557    
2558     /* One or more states are active for the next character. */
2559    
2560     ptr += clen; /* Advance to next subject character */
2561     } /* Loop to move along the subject string */
2562    
2563 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2564     if we use "return" above, we have compiler trouble. Some compilers warn if
2565     there's nothing here because they think the function doesn't return a value. On
2566     the other hand, if we put a dummy statement here, some more clever compilers
2567     complain that it can't be reached. Sigh. */
2568 nigel 77
2569 nigel 91 return match_count;
2570 nigel 77 }
2571    
2572    
2573    
2574    
2575     /*************************************************
2576     * Execute a Regular Expression - DFA engine *
2577     *************************************************/
2578    
2579     /* This external function applies a compiled re to a subject string using a DFA
2580     engine. This function calls the internal function multiple times if the pattern
2581     is not anchored.
2582    
2583     Arguments:
2584     argument_re points to the compiled expression
2585 ph10 97 extra_data points to extra data or is NULL
2586 nigel 77 subject points to the subject string
2587     length length of subject string (may contain binary zeros)
2588     start_offset where to start in the subject string
2589     options option bits
2590     offsets vector of match offsets
2591     offsetcount size of same
2592     workspace workspace vector
2593     wscount size of same
2594    
2595     Returns: > 0 => number of match offset pairs placed in offsets
2596     = 0 => offsets overflowed; longest matches are present
2597     -1 => failed to match
2598     < -1 => some kind of unexpected problem
2599     */
2600    
2601 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2602 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2603     const char *subject, int length, int start_offset, int options, int *offsets,
2604     int offsetcount, int *workspace, int wscount)
2605     {
2606     real_pcre *re = (real_pcre *)argument_re;
2607     dfa_match_data match_block;
2608 nigel 91 dfa_match_data *md = &match_block;
2609 nigel 77 BOOL utf8, anchored, startline, firstline;
2610     const uschar *current_subject, *end_subject, *lcc;
2611    
2612     pcre_study_data internal_study;
2613     const pcre_study_data *study = NULL;
2614     real_pcre internal_re;
2615    
2616     const uschar *req_byte_ptr;
2617     const uschar *start_bits = NULL;
2618     BOOL first_byte_caseless = FALSE;
2619     BOOL req_byte_caseless = FALSE;
2620     int first_byte = -1;
2621     int req_byte = -1;
2622     int req_byte2 = -1;
2623 nigel 91 int newline;
2624 nigel 77
2625     /* Plausibility checks */
2626    
2627     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2628     if (re == NULL || subject == NULL || workspace == NULL ||
2629     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2630     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2631     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2632    
2633     /* We need to find the pointer to any study data before we test for byte
2634     flipping, so we scan the extra_data block first. This may set two fields in the
2635     match block, so we must initialize them beforehand. However, the other fields
2636     in the match block must not be set until after the byte flipping. */
2637    
2638 nigel 91 md->tables = re->tables;
2639     md->callout_data = NULL;
2640 nigel 77
2641     if (extra_data != NULL)
2642     {
2643     unsigned int flags = extra_data->flags;
2644     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2645     study = (const pcre_study_data *)extra_data->study_data;
2646     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2647 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2648     return PCRE_ERROR_DFA_UMLIMIT;
2649 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2650 nigel 91 md->callout_data = extra_data->callout_data;
2651 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2652 nigel 91 md->tables = extra_data->tables;
2653 nigel 77 }
2654    
2655     /* Check that the first field in the block is the magic number. If it is not,
2656     test for a regex that was compiled on a host of opposite endianness. If this is
2657     the case, flipped values are put in internal_re and internal_study if there was
2658     study data too. */
2659    
2660     if (re->magic_number != MAGIC_NUMBER)
2661     {
2662     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2663     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2664     if (study != NULL) study = &internal_study;
2665     }
2666    
2667     /* Set some local values */
2668    
2669     current_subject = (const unsigned char *)subject + start_offset;
2670     end_subject = (const unsigned char *)subject + length;
2671     req_byte_ptr = current_subject - 1;
2672    
2673 nigel 91 #ifdef SUPPORT_UTF8
2674 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2675 nigel 91 #else
2676     utf8 = FALSE;
2677     #endif
2678 nigel 77
2679 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2680     (re->options & PCRE_ANCHORED) != 0;
2681    
2682 nigel 77 /* The remaining fixed data for passing around. */
2683    
2684 nigel 91 md->start_code = (const uschar *)argument_re +
2685 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2686 nigel 91 md->start_subject = (const unsigned char *)subject;
2687     md->end_subject = end_subject;
2688 ph10 442 md->start_offset = start_offset;
2689 nigel 91 md->moptions = options;
2690     md->poptions = re->options;
2691 nigel 77
2692 ph10 231 /* If the BSR option is not set at match time, copy what was set
2693     at compile time. */
2694    
2695     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2696     {
2697     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2698     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2699     #ifdef BSR_ANYCRLF
2700     else md->moptions |= PCRE_BSR_ANYCRLF;
2701 ph10 243 #endif
2702     }
2703 ph10 231
2704 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2705     nothing is set at run time, whatever was used at compile time applies. */
2706 nigel 91
2707 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2708 nigel 93 PCRE_NEWLINE_BITS)
2709 nigel 91 {
2710 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2711 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2712     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2713 nigel 91 case PCRE_NEWLINE_CR+
2714 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2715 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2716 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2717 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2718 nigel 91 }
2719    
2720 ph10 149 if (newline == -2)
2721 nigel 91 {
2722 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2723     }
2724     else if (newline < 0)
2725     {
2726 nigel 93 md->nltype = NLTYPE_ANY;
2727 nigel 91 }
2728     else
2729     {
2730 nigel 93 md->nltype = NLTYPE_FIXED;
2731     if (newline > 255)
2732     {
2733     md->nllen = 2;
2734     md->nl[0] = (newline >> 8) & 255;
2735     md->nl[1] = newline & 255;
2736     }
2737     else
2738     {
2739     md->nllen = 1;
2740     md->nl[0] = newline;
2741     }
2742 nigel 91 }
2743    
2744 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2745     back the character offset. */
2746    
2747     #ifdef SUPPORT_UTF8
2748     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2749     {
2750     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2751     return PCRE_ERROR_BADUTF8;
2752     if (start_offset > 0 && start_offset < length)
2753     {
2754     int tb = ((uschar *)subject)[start_offset];
2755     if (tb > 127)
2756     {
2757     tb &= 0xc0;
2758     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2759     }
2760     }
2761     }
2762     #endif
2763    
2764     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2765     is a feature that makes it possible to save compiled regexes and re-use them
2766     in other programs later. */
2767    
2768 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2769 nigel 77
2770     /* The lower casing table and the "must be at the start of a line" flag are
2771     used in a loop when finding where to start. */
2772    
2773 nigel 91 lcc = md->tables + lcc_offset;
2774 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2775 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2776    
2777     /* Set up the first character to match, if available. The first_byte value is
2778     never set for an anchored regular expression, but the anchoring may be forced
2779     at run time, so we have to test for anchoring. The first char may be unset for
2780     an unanchored pattern, of course. If there's no first char and the pattern was
2781     studied, there may be a bitmap of possible first characters. */
2782    
2783     if (!anchored)
2784     {
2785 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2786 nigel 77 {
2787     first_byte = re->first_byte & 255;
2788     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2789     first_byte = lcc[first_byte];
2790     }
2791     else
2792     {
2793     if (startline && study != NULL &&
2794     (study->options & PCRE_STUDY_MAPPED) != 0)
2795     start_bits = study->start_bits;
2796     }
2797     }
2798    
2799     /* For anchored or unanchored matches, there may be a "last known required
2800     character" set. */
2801    
2802 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2803 nigel 77 {
2804     req_byte = re->req_byte & 255;
2805     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2806 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2807 nigel 77 }
2808    
2809     /* Call the main matching function, looping for a non-anchored regex after a
2810 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2811     a match. */
2812 nigel 77
2813     for (;;)
2814     {
2815     int rc;
2816    
2817     if ((options & PCRE_DFA_RESTART) == 0)
2818     {
2819     const uschar *save_end_subject = end_subject;
2820    
2821 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2822     line of a multiline string. Implement this by temporarily adjusting
2823     end_subject so that we stop scanning at a newline. If the match fails at
2824     the newline, later code breaks this loop. */
2825 nigel 77
2826     if (firstline)
2827     {
2828 ph10 365 USPTR t = current_subject;
2829     #ifdef SUPPORT_UTF8
2830     if (utf8)
2831 ph10 371 {
2832     while (t < md->end_subject && !IS_NEWLINE(t))
2833 ph10 365 {
2834     t++;
2835     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2836 ph10 371 }
2837 ph10 365 }
2838     else
2839 ph10 371 #endif
2840 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2841 nigel 77 end_subject = t;
2842     }
2843 ph10 392
2844 ph10 389 /* There are some optimizations that avoid running the match if a known
2845     starting point is not found, or if a known later character is not present.
2846     However, there is an option that disables these, for testing and for
2847     ensuring that all callouts do actually occur. */
2848 nigel 77
2849 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2850 ph10 392 {
2851    
2852 ph10 389 /* Advance to a known first byte. */
2853 ph10 392
2854 ph10 389 if (first_byte >= 0)
2855 nigel 77 {
2856 ph10 389 if (first_byte_caseless)
2857     while (current_subject < end_subject &&
2858     lcc[*current_subject] != first_byte)
2859     current_subject++;
2860     else
2861 ph10 392 while (current_subject < end_subject &&
2862 ph10 389 *current_subject != first_byte)
2863     current_subject++;
2864     }
2865 ph10 392
2866 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2867 ph10 392
2868 ph10 389 else if (startline)
2869     {
2870     if (current_subject > md->start_subject + start_offset)
2871     {
2872 ph10 365 #ifdef SUPPORT_UTF8
2873 ph10 389 if (utf8)
2874 ph10 365 {
2875 ph10 392 while (current_subject < end_subject &&
2876 ph10 389 !WAS_NEWLINE(current_subject))
2877     {
2878 ph10 365 current_subject++;
2879 ph10 389 while(current_subject < end_subject &&
2880     (*current_subject & 0xc0) == 0x80)
2881     current_subject++;
2882     }
2883 ph10 371 }
2884 ph10 389 else
2885     #endif
2886     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2887     current_subject++;
2888 ph10 392
2889 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2890     ANYCRLF, and we are now at a LF, advance the match position by one
2891     more character. */
2892 ph10 392
2893 ph10 391 if (current_subject[-1] == CHAR_CR &&
2894 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2895     current_subject < end_subject &&
2896 ph10 391 *current_subject == CHAR_NL)
2897 ph10 389 current_subject++;
2898 ph10 365 }
2899 nigel 77 }
2900 ph10 392
2901 ph10 389 /* Or to a non-unique first char after study */
2902 ph10 392
2903 ph10 389 else if (start_bits != NULL)
2904 nigel 77 {
2905 ph10 389 while (current_subject < end_subject)
2906     {
2907     register unsigned int c = *current_subject;
2908     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2909     else break;
2910     }
2911 nigel 77 }
2912 ph10 392 }
2913 nigel 77
2914     /* Restore fudged end_subject */
2915    
2916     end_subject = save_end_subject;
2917     }
2918    
2919     /* If req_byte is set, we know that that character must appear in the subject
2920     for the match to succeed. If the first character is set, req_byte must be
2921     later in the subject; otherwise the test starts at the match point. This
2922     optimization can save a huge amount of work in patterns with nested unlimited
2923     repeats that aren't going to match. Writing separate code for cased/caseless
2924     versions makes it go faster, as does using an autoincrement and backing off
2925     on a match.
2926    
2927     HOWEVER: when the subject string is very, very long, searching to its end can
2928     take a long time, and give bad performance on quite ordinary patterns. This
2929     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2930     don't do this when the string is sufficiently long.
2931    
2932 ph10 392 ALSO: this processing is disabled when partial matching is requested, and can
2933 ph10 428 also be explicitly deactivated. Furthermore, we have to disable when
2934     restarting after a partial match, because the required character may have
2935     already been matched. */
2936 nigel 77
2937 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2938     req_byte >= 0 &&
2939 nigel 77 end_subject - current_subject < REQ_BYTE_MAX &&
2940 ph10 428 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2941 nigel 77 {
2942     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2943    
2944     /* We don't need to repeat the search if we haven't yet reached the
2945     place we found it at last time. */
2946    
2947     if (p > req_byte_ptr)
2948     {
2949     if (req_byte_caseless)
2950     {
2951     while (p < end_subject)
2952     {
2953     register int pp = *p++;
2954     if (pp == req_byte || pp == req_byte2) { p--; break; }
2955     }
2956     }
2957     else
2958     {
2959     while (p < end_subject)
2960     {
2961     if (*p++ == req_byte) { p--; break; }
2962     }
2963     }
2964    
2965     /* If we can't find the required character, break the matching loop,
2966     which will cause a return of PCRE_ERROR_NOMATCH. */
2967    
2968     if (p >= end_subject) break;
2969    
2970     /* If we have found the required character, save the point where we
2971     found it, so that we don't search again next time round the loop if
2972     the start hasn't passed this character yet. */
2973    
2974     req_byte_ptr = p;
2975     }
2976     }
2977    
2978     /* OK, now we can do the business */
2979    
2980 ph10 435 md->start_used_ptr = current_subject;
2981    
2982 nigel 77 rc = internal_dfa_exec(
2983 nigel 91 md, /* fixed match data */
2984     md->start_code, /* this subexpression's code */
2985     current_subject, /* where we currently are */
2986     start_offset, /* start offset in subject */
2987     offsets, /* offset vector */
2988     offsetcount, /* size of same */
2989     workspace, /* workspace vector */
2990     wscount, /* size of same */
2991 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2992 nigel 91 0, /* function recurse level */
2993     0); /* regex recurse level */
2994 nigel 77
2995     /* Anything other than "no match" means we are done, always; otherwise, carry
2996     on only if not anchored. */
2997    
2998     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2999    
3000     /* Advance to the next subject character unless we are at the end of a line
3001     and firstline is set. */
3002    
3003 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3004 nigel 77 current_subject++;
3005     if (utf8)
3006     {
3007     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3008     current_subject++;
3009     }
3010     if (current_subject > end_subject) break;
3011    
3012 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3013 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3014     or ANY or ANYCRLF, advance the match position by one more character. */
3015 nigel 93
3016 ph10 391 if (current_subject[-1] == CHAR_CR &&
3017 ph10 226 current_subject < end_subject &&
3018 ph10 391 *current_subject == CHAR_NL &&
3019 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3020 ph10 226 (md->nltype == NLTYPE_ANY ||
3021     md->nltype == NLTYPE_ANYCRLF ||
3022     md->nllen == 2))
3023 nigel 93 current_subject++;
3024    
3025     } /* "Bumpalong" loop */
3026    
3027 nigel 77 return PCRE_ERROR_NOMATCH;
3028     }
3029    
3030     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12