/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 463 - (hide annotations) (download)
Sun Oct 18 10:02:46 2009 UTC (4 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 106400 byte(s)
Further tidies to partial matching.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 383 Copyright (c) 1997-2009 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300
102     #define OP_EXTUNI_EXTRA 320
103     #define OP_ANYNL_EXTRA 340
104     #define OP_HSPACE_EXTRA 360
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109     character that is to be tested in some way. This makes it possible to
110     centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
/* coptable is indexed by opcode value: the matcher looks up
coptable[codevalue] for the opcode of each active state. A non-zero entry is
the byte offset from the opcode to the item that is preloaded into "d" (a data
character or, for the Type repeats, the one-byte type opcode); zero means the
opcode has no such inline item. The rows must stay in opcode-number order, and
poptable (the next table in this file) uses the same order.
NOTE(review): the entries of 3 presumably step over a 2-byte repeat count that
follows the opcode - confirm against the opcode layout in pcre_internal.h. */
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
122     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
124     1, /* Char */
125     1, /* Charnc */
126     1, /* not */
127     /* Positive single-char repeats */
128     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
129     3, 3, 3, /* upto, minupto, exact */
130 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
131 nigel 77 /* Negative single-char repeats - only for chars < 256 */
132     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
133     3, 3, 3, /* NOT upto, minupto, exact */
134 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
135 nigel 77 /* Positive type repeats */
136     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
137     3, 3, 3, /* Type upto, minupto, exact */
138 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
139 nigel 77 /* Character class & ref repeats */
140     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
141     0, 0, /* CRRANGE, CRMINRANGE */
142     0, /* CLASS */
143     0, /* NCLASS */
144     0, /* XCLASS - variable length */
145     0, /* REF */
146     0, /* RECURSE */
147     0, /* CALLOUT */
148     0, /* Alt */
149     0, /* Ket */
150     0, /* KetRmax */
151     0, /* KetRmin */
152     0, /* Assert */
153     0, /* Assert not */
154     0, /* Assert behind */
155     0, /* Assert behind not */
156     0, /* Reverse */
157 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
158     0, 0, 0, /* SBRA, SCBRA, SCOND */
159 nigel 77 0, /* CREF */
160 nigel 93 0, /* RREF */
161     0, /* DEF */
162 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
163     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
164 ph10 462 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
165 nigel 77 };
166    
167 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
168 ph10 462 remember the fact that a character could have been inspected when the end of
169 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
170     two tables that follow must also be modified. */
171 ph10 462
/* poptable is indexed by opcode value, in the same row order as coptable.
A non-zero entry marks an opcode that looks at a subject character. The matcher
consults poptable[codevalue] only when it is at the end of the subject
(clen == 0); a non-zero entry there sets could_continue, which records that
more input could have been inspected. */
172     static const uschar poptable[] = {
173     0, /* End */
174 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b - only \B and \b inspect characters */
175 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
176     1, 1, 1, /* Any, AllAny, Anybyte */
177     1, 1, 1, /* NOTPROP, PROP, EXTUNI */
178     1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
179     0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
180     1, /* Char */
181     1, /* Charnc */
182     1, /* not */
183     /* Positive single-char repeats */
184     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
185     1, 1, 1, /* upto, minupto, exact */
186     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
187     /* Negative single-char repeats - only for chars < 256 */
188     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
189     1, 1, 1, /* NOT upto, minupto, exact */
190     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
191     /* Positive type repeats */
192     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
193     1, 1, 1, /* Type upto, minupto, exact */
194     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
195     /* Character class & ref repeats */
196     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
197     1, 1, /* CRRANGE, CRMINRANGE */
198     1, /* CLASS */
199     1, /* NCLASS */
200     1, /* XCLASS - variable length */
201     0, /* REF */
202     0, /* RECURSE */
203     0, /* CALLOUT */
204     0, /* Alt */
205     0, /* Ket */
206     0, /* KetRmax */
207     0, /* KetRmin */
208     0, /* Assert */
209     0, /* Assert not */
210     0, /* Assert behind */
211     0, /* Assert behind not */
212     0, /* Reverse */
213     0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
214     0, 0, 0, /* SBRA, SCBRA, SCOND */
215     0, /* CREF */
216     0, /* RREF */
217     0, /* DEF */
218     0, 0, /* BRAZERO, BRAMINZERO */
219     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
220     0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
221     };
222    
223 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
224     and \w */
225    
/* toptable1: one entry per opcode from End up to AllAny, in the opcode order
documented row by row in coptable. Both the negated and the plain form of each
character type carry the same ctype_xxx bit here.
NOTE(review): the code that consumes this table is outside this view;
presumably the entry is a mask applied to the character's ctypes value -
confirm at the use site. */
226 ph10 327 static const uschar toptable1[] = {
227 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b - no character type */
228 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
229     ctype_space, ctype_space, /* \S, \s */
230     ctype_word, ctype_word, /* \W, \w */
231 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
232 nigel 77 };
233    
/* toptable2: companion to toptable1, with the same indexing. The negated
types (\D, \S, \W) carry their ctype_xxx bit, the plain types carry 0, and the
two "any" opcodes carry 1.
NOTE(review): the use site is outside this view; presumably this value is
XORed with the masked ctypes value so that the negated types invert the test
and OP_ANY/OP_ALLANY always succeed - confirm where the tables are read. */
234 ph10 327 static const uschar toptable2[] = {
235 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b - no character type */
236 nigel 77 ctype_digit, 0, /* \D, \d */
237     ctype_space, 0, /* \S, \s */
238     ctype_word, 0, /* \W, \w */
239 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
240 nigel 77 };
241    
242    
243     /* Structure for holding data about a particular state, which is in effect the
244     current data for an active path through the match tree. It must consist
245     entirely of ints because the working vector we are passed, and which we put
246     these structures in, is a vector of ints. */
247    
/* One entry in the active or new state list. The lists are carved directly
out of the caller's int workspace, so every member has to be an int. */
248     typedef struct stateblock {
249     int offset; /* Offset to opcode from start_code; a negative value means
                   "hold off entering state -offset until 'data' more
                   characters have been skipped" */
250     int count; /* Count for repeats */
251     int ims; /* ims flag bits - copied from the local "ims" by the ADD_xxx
                macros and reloaded when the state is processed */
252     int data; /* Some use extra data - only written by the ADD_xxx_DATA
                 macros; the plain ADD_xxx macros leave it untouched */
253     } stateblock;
254    
/* Number of workspace ints taken by one stateblock; internal_dfa_exec() uses
this to turn the workspace size in ints into a per-list capacity in states. */
255     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
256    
257    
258     #ifdef DEBUG
259     /*************************************************
260     * Print character string *
261     *************************************************/
262    
263     /* Character string printing function for debugging.
264    
265     Arguments:
266     p points to string
267     length number of bytes
268     f where to print
269    
270     Returns: nothing
271     */
272    
/* Debug helper: write "length" bytes starting at p to f. Bytes accepted by
isprint() are written as themselves; every other byte is written as a
four-character \xhh escape. A zero or negative length writes nothing. The
buffer is only read, so it is taken as pointer-to-const (callers passing a
non-const pointer are unaffected). */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  /* The source is unsigned char, so c is always in 0..255, which is the range
  isprint() is defined for. */
  int c = *p++;

  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", (unsigned int)c);   /* %x expects unsigned int */
  }
}
285     #endif
286    
287    
288    
289     /*************************************************
290     * Execute a Regular Expression - DFA engine *
291     *************************************************/
292    
293     /* This internal function applies a compiled pattern to a subject string,
294     starting at a given point, using a DFA engine. This function is called from the
295     external one, possibly multiple times if the pattern is not anchored. The
296     function calls itself recursively for some kinds of subpattern.
297    
298     Arguments:
299     md the match_data block with fixed information
300     this_start_code the opening bracket of this subexpression's code
301     current_subject where we currently are in the subject string
302     start_offset start offset in the subject string
303     offsets vector to contain the matching string offsets
304     offsetcount size of same
305     workspace vector of workspace
306     wscount size of same
307     ims the current ims flags
308     rlevel function call recursion level
309     recursing regex recursive call level
310    
311 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
312 ph10 341 = 0 => offsets overflowed; longest matches are present
313 nigel 77 -1 => failed to match
314     < -1 => some kind of unexpected problem
315    
316     The following macros are used for adding states to the two state vectors (one
317     for the current character, one for the following character). */
318    
/* ADD_ACTIVE(x,y): append a state with opcode offset x and repeat count y to
the list for the current character. All four ADD_xxx macros share this shape.
They are only usable inside internal_dfa_exec(), because they refer to its
locals (active_count/new_count, next_active_state/next_new_state, wscount, ims,
rlevel). If the list is already full, they make the enclosing function return
PCRE_ERROR_DFA_WSSIZE. Each one expands to a bare "if ... else return ..." with
no trailing semicolon and no do/while wrapper, so a call placed directly under
an "if" must be enclosed in braces, as the visible call sites do. This variant
does not write the "data" member. */
319     #define ADD_ACTIVE(x,y) \
320     if (active_count++ < wscount) \
321     { \
322     next_active_state->offset = (x); \
323     next_active_state->count = (y); \
324     next_active_state->ims = ims; \
325     next_active_state++; \
326     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
327     } \
328     else return PCRE_ERROR_DFA_WSSIZE
329    
/* ADD_ACTIVE_DATA(x,y,z): as ADD_ACTIVE, and also stores z in the "data"
member of the new state. */
330     #define ADD_ACTIVE_DATA(x,y,z) \
331     if (active_count++ < wscount) \
332     { \
333     next_active_state->offset = (x); \
334     next_active_state->count = (y); \
335     next_active_state->ims = ims; \
336     next_active_state->data = (z); \
337     next_active_state++; \
338     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
339     } \
340     else return PCRE_ERROR_DFA_WSSIZE
341    
/* ADD_NEW(x,y): append a state with opcode offset x and repeat count y to the
list for the following character (new_count / next_new_state). Does not write
the "data" member. */
342     #define ADD_NEW(x,y) \
343     if (new_count++ < wscount) \
344     { \
345     next_new_state->offset = (x); \
346     next_new_state->count = (y); \
347     next_new_state->ims = ims; \
348     next_new_state++; \
349     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
350     } \
351     else return PCRE_ERROR_DFA_WSSIZE
352    
/* ADD_NEW_DATA(x,y,z): as ADD_NEW, and also stores z in the "data" member of
the new state. */
353     #define ADD_NEW_DATA(x,y,z) \
354     if (new_count++ < wscount) \
355     { \
356     next_new_state->offset = (x); \
357     next_new_state->count = (y); \
358     next_new_state->ims = ims; \
359     next_new_state->data = (z); \
360     next_new_state++; \
361     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
362     } \
363     else return PCRE_ERROR_DFA_WSSIZE
364    
365     /* And now, here is the code */
366    
367     static int
368     internal_dfa_exec(
369     dfa_match_data *md,
370     const uschar *this_start_code,
371     const uschar *current_subject,
372     int start_offset,
373     int *offsets,
374     int offsetcount,
375     int *workspace,
376     int wscount,
377     int ims,
378     int rlevel,
379     int recursing)
380     {
381     stateblock *active_states, *new_states, *temp_states;
382     stateblock *next_active_state, *next_new_state;
383    
384     const uschar *ctypes, *lcc, *fcc;
385     const uschar *ptr;
386 nigel 93 const uschar *end_code, *first_op;
387 nigel 77
388     int active_count, new_count, match_count;
389    
390     /* Some fields in the md block are frequently referenced, so we load them into
391     independent variables in the hope that this will perform better. */
392    
393     const uschar *start_subject = md->start_subject;
394     const uschar *end_subject = md->end_subject;
395     const uschar *start_code = md->start_code;
396    
397 nigel 87 #ifdef SUPPORT_UTF8
398 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
399 nigel 93 #else
400     BOOL utf8 = FALSE;
401 nigel 87 #endif
402 nigel 77
403     rlevel++;
404     offsetcount &= (-2);
405    
406     wscount -= 2;
407     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
408     (2 * INTS_PER_STATEBLOCK);
409    
410     DPRINTF(("\n%.*s---------------------\n"
411     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
412     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
413    
414     ctypes = md->tables + ctypes_offset;
415     lcc = md->tables + lcc_offset;
416     fcc = md->tables + fcc_offset;
417    
418     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
419    
420     active_states = (stateblock *)(workspace + 2);
421     next_new_state = new_states = active_states + wscount;
422     new_count = 0;
423    
424 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
425     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
426    
427 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
428     the alternative states onto the list, and find out where the end is. This
429     makes it possible to use this function recursively, when we want to stop at a
430     matching internal ket rather than at the end.
431    
432     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
433     a backward assertion. In that case, we have to find out the maximum amount to
434     move back, and set up each alternative appropriately. */
435    
436 nigel 93 if (*first_op == OP_REVERSE)
437 nigel 77 {
438     int max_back = 0;
439     int gone_back;
440    
441     end_code = this_start_code;
442     do
443     {
444     int back = GET(end_code, 2+LINK_SIZE);
445     if (back > max_back) max_back = back;
446     end_code += GET(end_code, 1);
447     }
448     while (*end_code == OP_ALT);
449    
450     /* If we can't go back the amount required for the longest lookbehind
451     pattern, go back as far as we can; some alternatives may still be viable. */
452    
453     #ifdef SUPPORT_UTF8
454     /* In character mode we have to step back character by character */
455    
456     if (utf8)
457     {
458     for (gone_back = 0; gone_back < max_back; gone_back++)
459     {
460     if (current_subject <= start_subject) break;
461     current_subject--;
462     while (current_subject > start_subject &&
463     (*current_subject & 0xc0) == 0x80)
464     current_subject--;
465     }
466     }
467     else
468     #endif
469    
470     /* In byte-mode we can do this quickly. */
471    
472     {
473     gone_back = (current_subject - max_back < start_subject)?
474     current_subject - start_subject : max_back;
475     current_subject -= gone_back;
476     }
477 ph10 461
478 ph10 435 /* Save the earliest consulted character */
479 nigel 77
480 ph10 461 if (current_subject < md->start_used_ptr)
481     md->start_used_ptr = current_subject;
482    
483 nigel 77 /* Now we can process the individual branches. */
484    
485     end_code = this_start_code;
486     do
487     {
488     int back = GET(end_code, 2+LINK_SIZE);
489     if (back <= gone_back)
490     {
491     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
492     ADD_NEW_DATA(-bstate, 0, gone_back - back);
493     }
494     end_code += GET(end_code, 1);
495     }
496     while (*end_code == OP_ALT);
497     }
498    
499     /* This is the code for a "normal" subpattern (not a backward assertion). The
500     start of a whole pattern is always one of these. If we are at the top level,
501     we may be asked to restart matching from the same point that we reached for a
502     previous partial match. We still have to scan through the top-level branches to
503     find the end state. */
504    
505     else
506     {
507     end_code = this_start_code;
508    
509     /* Restarting */
510    
511     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
512     {
513     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
514     new_count = workspace[1];
515     if (!workspace[0])
516     memcpy(new_states, active_states, new_count * sizeof(stateblock));
517     }
518    
519     /* Not restarting */
520    
521     else
522     {
523 nigel 93 int length = 1 + LINK_SIZE +
524     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
525 nigel 77 do
526     {
527 nigel 93 ADD_NEW(end_code - start_code + length, 0);
528 nigel 77 end_code += GET(end_code, 1);
529 nigel 93 length = 1 + LINK_SIZE;
530 nigel 77 }
531     while (*end_code == OP_ALT);
532     }
533     }
534    
535     workspace[0] = 0; /* Bit indicating which vector is current */
536    
537     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
538    
539     /* Loop for scanning the subject */
540    
541     ptr = current_subject;
542     for (;;)
543     {
544     int i, j;
545 nigel 91 int clen, dlen;
546     unsigned int c, d;
547 ph10 428 int forced_fail = 0;
548 ph10 462 BOOL could_continue = FALSE;
549 nigel 77
550     /* Make the new state list into the active state list and empty the
551     new state list. */
552    
553     temp_states = active_states;
554     active_states = new_states;
555     new_states = temp_states;
556     active_count = new_count;
557     new_count = 0;
558    
559     workspace[0] ^= 1; /* Remember for the restarting feature */
560     workspace[1] = active_count;
561    
562     #ifdef DEBUG
563     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
564     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
565     printf("\"\n");
566    
567     printf("%.*sActive states: ", rlevel*2-2, SP);
568     for (i = 0; i < active_count; i++)
569     printf("%d/%d ", active_states[i].offset, active_states[i].count);
570     printf("\n");
571     #endif
572    
573     /* Set the pointers for adding new states */
574    
575     next_active_state = active_states + active_count;
576     next_new_state = new_states;
577    
578     /* Load the current character from the subject outside the loop, as many
579     different states may want to look at it, and we assume that at least one
580     will. */
581    
582     if (ptr < end_subject)
583     {
584 nigel 93 clen = 1; /* Number of bytes in the character */
585 nigel 77 #ifdef SUPPORT_UTF8
586     if (utf8) { GETCHARLEN(c, ptr, clen); } else
587     #endif /* SUPPORT_UTF8 */
588     c = *ptr;
589     }
590     else
591     {
592 nigel 93 clen = 0; /* This indicates the end of the subject */
593     c = NOTACHAR; /* This value should never actually be used */
594 nigel 77 }
595    
596     /* Scan up the active states and act on each one. The result of an action
597     may be to add more states to the currently active list (e.g. on hitting a
598     parenthesis) or it may be to put states on the new list, for considering
599     when we move the character pointer on. */
600    
601     for (i = 0; i < active_count; i++)
602     {
603     stateblock *current_state = active_states + i;
604     const uschar *code;
605     int state_offset = current_state->offset;
606 ph10 397 int count, codevalue, rrc;
607 nigel 77
608     #ifdef DEBUG
609     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
610 nigel 93 if (clen == 0) printf("EOL\n");
611 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
612     else printf("0x%02x\n", c);
613     #endif
614    
615     /* This variable is referred to implicitly in the ADD_xxx macros. */
616    
617     ims = current_state->ims;
618    
619     /* A negative offset is a special case meaning "hold off going to this
620     (negated) state until the number of characters in the data field have
621     been skipped". */
622    
623     if (state_offset < 0)
624     {
625     if (current_state->data > 0)
626     {
627     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
628     ADD_NEW_DATA(state_offset, current_state->count,
629     current_state->data - 1);
630     continue;
631     }
632     else
633     {
634     current_state->offset = state_offset = -state_offset;
635     }
636     }
637    
638 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
639 ph10 439 See the note at the head of this module about the possibility of improving
640     performance here. */
641 nigel 77
642     for (j = 0; j < i; j++)
643     {
644     if (active_states[j].offset == state_offset &&
645     active_states[j].count == current_state->count)
646     {
647     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
648     goto NEXT_ACTIVE_STATE;
649     }
650     }
651    
652     /* The state offset is the offset to the opcode */
653    
654     code = start_code + state_offset;
655     codevalue = *code;
656    
657 ph10 463 /* If this opcode inspects a character, but we are at the end of the
658     subject, remember the fact for use when testing for a partial match. */
659    
660 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
661 ph10 463 could_continue = TRUE;
662 ph10 462
663 nigel 77 /* If this opcode is followed by an inline character, load it. It is
664     tempting to test for the presence of a subject character here, but that
665     is wrong, because sometimes zero repetitions of the subject are
666     permitted.
667    
668     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
669 ph10 178 argument that is not a data character - but is always one byte long. We
670     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
671     this case. To keep the other cases fast, convert these ones to new opcodes.
672     */
673 nigel 77
674     if (coptable[codevalue] > 0)
675     {
676     dlen = 1;
677     #ifdef SUPPORT_UTF8
678     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
679     #endif /* SUPPORT_UTF8 */
680     d = code[coptable[codevalue]];
681     if (codevalue >= OP_TYPESTAR)
682     {
683 nigel 93 switch(d)
684     {
685     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
686     case OP_NOTPROP:
687     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
688     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
689     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
690 ph10 178 case OP_NOT_HSPACE:
691 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
692 ph10 178 case OP_NOT_VSPACE:
693 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
694 nigel 93 default: break;
695     }
696 nigel 77 }
697     }
698     else
699     {
700     dlen = 0; /* Not strictly necessary, but compilers moan */
701 nigel 93 d = NOTACHAR; /* if these variables are not set. */
702 nigel 77 }
703    
704    
705     /* Now process the individual opcodes */
706    
707     switch (codevalue)
708     {
709    
710     /* ========================================================================== */
711     /* Reached a closing bracket. If not at the end of the pattern, carry
712     on with the next opcode. Otherwise, unless we have an empty string and
713 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
714 ph10 442 start of the subject, save the match data, shifting up all previous
715 nigel 77 matches so we always have the longest first. */
716    
717     case OP_KET:
718     case OP_KETRMIN:
719     case OP_KETRMAX:
720     if (code != end_code)
721     {
722     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
723     if (codevalue != OP_KET)
724     {
725     ADD_ACTIVE(state_offset - GET(code, 1), 0);
726     }
727     }
728 ph10 461 else
729 nigel 77 {
730 ph10 461 if (ptr > current_subject ||
731 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
732     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
733     current_subject > start_subject + md->start_offset)))
734 nigel 77 {
735 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
736     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
737     match_count = 0;
738     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
739     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
740     if (offsetcount >= 2)
741     {
742     offsets[0] = current_subject - start_subject;
743     offsets[1] = ptr - start_subject;
744     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
745     offsets[1] - offsets[0], current_subject));
746     }
747     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
748     {
749     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
750     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
751     match_count, rlevel*2-2, SP));
752     return match_count;
753     }
754 ph10 461 }
755 nigel 77 }
756     break;
757    
758     /* ========================================================================== */
759     /* These opcodes add to the current list of states without looking
760     at the current character. */
761    
762     /*-----------------------------------------------------------------*/
763     case OP_ALT:
764     do { code += GET(code, 1); } while (*code == OP_ALT);
765     ADD_ACTIVE(code - start_code, 0);
766     break;
767    
768     /*-----------------------------------------------------------------*/
769     case OP_BRA:
770 nigel 93 case OP_SBRA:
771 nigel 77 do
772     {
773     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
774     code += GET(code, 1);
775     }
776     while (*code == OP_ALT);
777     break;
778    
779     /*-----------------------------------------------------------------*/
780 nigel 93 case OP_CBRA:
781     case OP_SCBRA:
782     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
783     code += GET(code, 1);
784     while (*code == OP_ALT)
785     {
786     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
787     code += GET(code, 1);
788     }
789     break;
790    
791     /*-----------------------------------------------------------------*/
792 nigel 77 case OP_BRAZERO:
793     case OP_BRAMINZERO:
794     ADD_ACTIVE(state_offset + 1, 0);
795     code += 1 + GET(code, 2);
796     while (*code == OP_ALT) code += GET(code, 1);
797     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
798     break;
799    
800     /*-----------------------------------------------------------------*/
801 ph10 335 case OP_SKIPZERO:
802     code += 1 + GET(code, 2);
803     while (*code == OP_ALT) code += GET(code, 1);
804     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
805     break;
806    
807     /*-----------------------------------------------------------------*/
808 nigel 77 case OP_CIRC:
809     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
810 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
811     ptr != end_subject &&
812 nigel 93 WAS_NEWLINE(ptr)))
813 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
814     break;
815    
816     /*-----------------------------------------------------------------*/
817     case OP_EOD:
818     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
819     break;
820    
821     /*-----------------------------------------------------------------*/
822     case OP_OPT:
823     ims = code[1];
824     ADD_ACTIVE(state_offset + 2, 0);
825     break;
826    
827     /*-----------------------------------------------------------------*/
828     case OP_SOD:
829     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
830     break;
831    
832     /*-----------------------------------------------------------------*/
833     case OP_SOM:
834     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
835     break;
836    
837    
838     /* ========================================================================== */
839     /* These opcodes inspect the next subject character, and sometimes
840     the previous one as well, but do not have an argument. The variable
841     clen contains the length of the current character and is zero if we are
842     at the end of the subject. */
843    
844     /*-----------------------------------------------------------------*/
845     case OP_ANY:
846 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
847 nigel 77 { ADD_NEW(state_offset + 1, 0); }
848     break;
849    
850     /*-----------------------------------------------------------------*/
851 ph10 341 case OP_ALLANY:
852     if (clen > 0)
853     { ADD_NEW(state_offset + 1, 0); }
854     break;
855    
856     /*-----------------------------------------------------------------*/
857 nigel 77 case OP_EODN:
858 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
859 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
860     break;
861    
862     /*-----------------------------------------------------------------*/
863     case OP_DOLL:
864     if ((md->moptions & PCRE_NOTEOL) == 0)
865     {
866 nigel 91 if (clen == 0 ||
867 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
868 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
869     ))
870 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
871     }
872 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
873 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
874     break;
875    
876     /*-----------------------------------------------------------------*/
877    
878     case OP_DIGIT:
879     case OP_WHITESPACE:
880     case OP_WORDCHAR:
881     if (clen > 0 && c < 256 &&
882     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
883     { ADD_NEW(state_offset + 1, 0); }
884     break;
885    
886     /*-----------------------------------------------------------------*/
887     case OP_NOT_DIGIT:
888     case OP_NOT_WHITESPACE:
889     case OP_NOT_WORDCHAR:
890     if (clen > 0 && (c >= 256 ||
891     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
892     { ADD_NEW(state_offset + 1, 0); }
893     break;
894    
895     /*-----------------------------------------------------------------*/
896     case OP_WORD_BOUNDARY:
897     case OP_NOT_WORD_BOUNDARY:
898     {
899     int left_word, right_word;
900    
901     if (ptr > start_subject)
902     {
903     const uschar *temp = ptr - 1;
904 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
905 nigel 77 #ifdef SUPPORT_UTF8
906     if (utf8) BACKCHAR(temp);
907     #endif
908     GETCHARTEST(d, temp);
909     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
910     }
911     else left_word = 0;
912    
913 ph10 461 if (clen > 0)
914 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
915 ph10 463 else right_word = 0;
916 nigel 77
917     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
918     { ADD_ACTIVE(state_offset + 1, 0); }
919     }
920     break;
921    
922    
923     /*-----------------------------------------------------------------*/
924     /* Check the next character by Unicode property. We will get here only
925     if the support is in the binary; otherwise a compile-time error occurs.
926     */
927    
928 ph10 151 #ifdef SUPPORT_UCP
929 nigel 77 case OP_PROP:
930     case OP_NOTPROP:
931     if (clen > 0)
932     {
933 nigel 87 BOOL OK;
934 ph10 349 const ucd_record * prop = GET_UCD(c);
935 nigel 87 switch(code[1])
936 nigel 77 {
937 nigel 87 case PT_ANY:
938     OK = TRUE;
939     break;
940    
941     case PT_LAMP:
942 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
943 nigel 87 break;
944    
945     case PT_GC:
946 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
947 nigel 87 break;
948    
949     case PT_PC:
950 ph10 349 OK = prop->chartype == code[2];
951 nigel 87 break;
952    
953     case PT_SC:
954 ph10 349 OK = prop->script == code[2];
955 nigel 87 break;
956    
957     /* Should never occur, but keep compilers from grumbling. */
958    
959     default:
960     OK = codevalue != OP_PROP;
961     break;
962 nigel 77 }
963 nigel 87
964     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
965 nigel 77 }
966     break;
967     #endif
968    
969    
970    
971     /* ========================================================================== */
972     /* These opcodes likewise inspect the subject character, but have an
973     argument that is not a data character. It is one of these opcodes:
974 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
975     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
976 nigel 77
977     case OP_TYPEPLUS:
978     case OP_TYPEMINPLUS:
979 nigel 93 case OP_TYPEPOSPLUS:
980 nigel 77 count = current_state->count; /* Already matched */
981     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
982     if (clen > 0)
983     {
984     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
985     (c < 256 &&
986 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
987 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
988     {
989 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
990     {
991     active_count--; /* Remove non-match possibility */
992     next_active_state--;
993     }
994 nigel 77 count++;
995     ADD_NEW(state_offset, count);
996     }
997     }
998     break;
999    
1000     /*-----------------------------------------------------------------*/
1001     case OP_TYPEQUERY:
1002     case OP_TYPEMINQUERY:
1003 nigel 93 case OP_TYPEPOSQUERY:
1004 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1005     if (clen > 0)
1006     {
1007     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1008     (c < 256 &&
1009 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1010 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1011     {
1012 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1013     {
1014     active_count--; /* Remove non-match possibility */
1015     next_active_state--;
1016     }
1017 nigel 77 ADD_NEW(state_offset + 2, 0);
1018     }
1019     }
1020     break;
1021    
1022     /*-----------------------------------------------------------------*/
1023     case OP_TYPESTAR:
1024     case OP_TYPEMINSTAR:
1025 nigel 93 case OP_TYPEPOSSTAR:
1026 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1027     if (clen > 0)
1028     {
1029     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1030     (c < 256 &&
1031 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1032 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1033     {
1034 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1035     {
1036     active_count--; /* Remove non-match possibility */
1037     next_active_state--;
1038     }
1039 nigel 77 ADD_NEW(state_offset, 0);
1040     }
1041     }
1042     break;
1043    
1044     /*-----------------------------------------------------------------*/
1045     case OP_TYPEEXACT:
1046 nigel 93 count = current_state->count; /* Number already matched */
1047     if (clen > 0)
1048     {
1049     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1050     (c < 256 &&
1051 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1052 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1053     {
1054     if (++count >= GET2(code, 1))
1055     { ADD_NEW(state_offset + 4, 0); }
1056     else
1057     { ADD_NEW(state_offset, count); }
1058     }
1059     }
1060     break;
1061    
1062     /*-----------------------------------------------------------------*/
1063 nigel 77 case OP_TYPEUPTO:
1064     case OP_TYPEMINUPTO:
1065 nigel 93 case OP_TYPEPOSUPTO:
1066     ADD_ACTIVE(state_offset + 4, 0);
1067 nigel 77 count = current_state->count; /* Number already matched */
1068     if (clen > 0)
1069     {
1070     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1071     (c < 256 &&
1072 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1073 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1074     {
1075 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1076     {
1077     active_count--; /* Remove non-match possibility */
1078     next_active_state--;
1079     }
1080 nigel 77 if (++count >= GET2(code, 1))
1081     { ADD_NEW(state_offset + 4, 0); }
1082     else
1083     { ADD_NEW(state_offset, count); }
1084     }
1085     }
1086     break;
1087    
1088     /* ========================================================================== */
1089     /* These are virtual opcodes that are used when something like
1090 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the OP_VSPACE/OP_HSPACE family as its
1091     argument. It keeps the code above fast for the other cases. The argument
1092     is in the d variable. */
1093 nigel 77
1094 ph10 151 #ifdef SUPPORT_UCP
1095 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1096     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1097 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1098 nigel 77 count = current_state->count; /* Already matched */
1099 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1100 nigel 77 if (clen > 0)
1101     {
1102 nigel 87 BOOL OK;
1103 ph10 349 const ucd_record * prop = GET_UCD(c);
1104 nigel 87 switch(code[2])
1105     {
1106     case PT_ANY:
1107     OK = TRUE;
1108     break;
1109    
1110     case PT_LAMP:
1111 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1112 nigel 87 break;
1113    
1114     case PT_GC:
1115 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1116 nigel 87 break;
1117    
1118     case PT_PC:
1119 ph10 349 OK = prop->chartype == code[3];
1120 nigel 87 break;
1121    
1122     case PT_SC:
1123 ph10 349 OK = prop->script == code[3];
1124 nigel 87 break;
1125    
1126     /* Should never occur, but keep compilers from grumbling. */
1127    
1128     default:
1129     OK = codevalue != OP_PROP;
1130     break;
1131     }
1132    
1133 nigel 93 if (OK == (d == OP_PROP))
1134     {
1135     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1136     {
1137     active_count--; /* Remove non-match possibility */
1138     next_active_state--;
1139     }
1140     count++;
1141     ADD_NEW(state_offset, count);
1142     }
1143 nigel 77 }
1144     break;
1145    
1146     /*-----------------------------------------------------------------*/
1147     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1148     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1149 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1150 nigel 77 count = current_state->count; /* Already matched */
1151     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1152 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1153 nigel 77 {
1154     const uschar *nptr = ptr + clen;
1155     int ncount = 0;
1156 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1157     {
1158     active_count--; /* Remove non-match possibility */
1159     next_active_state--;
1160     }
1161 nigel 77 while (nptr < end_subject)
1162     {
1163     int nd;
1164     int ndlen = 1;
1165     GETCHARLEN(nd, nptr, ndlen);
1166 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1167 nigel 77 ncount++;
1168     nptr += ndlen;
1169     }
1170     count++;
1171     ADD_NEW_DATA(-state_offset, count, ncount);
1172     }
1173     break;
1174 ph10 151 #endif
1175 nigel 77
1176     /*-----------------------------------------------------------------*/
1177 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1178     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1179     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1180     count = current_state->count; /* Already matched */
1181     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1182     if (clen > 0)
1183     {
1184     int ncount = 0;
1185     switch (c)
1186     {
1187     case 0x000b:
1188     case 0x000c:
1189     case 0x0085:
1190     case 0x2028:
1191     case 0x2029:
1192 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1193     goto ANYNL01;
1194    
1195     case 0x000d:
1196     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1197     /* Fall through */
1198    
1199     ANYNL01:
1200     case 0x000a:
1201 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1202     {
1203     active_count--; /* Remove non-match possibility */
1204     next_active_state--;
1205     }
1206     count++;
1207     ADD_NEW_DATA(-state_offset, count, ncount);
1208     break;
1209 ph10 231
1210 nigel 93 default:
1211     break;
1212     }
1213     }
1214     break;
1215    
1216     /*-----------------------------------------------------------------*/
1217 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1218     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1219     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1220     count = current_state->count; /* Already matched */
1221     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1222     if (clen > 0)
1223     {
1224 ph10 182 BOOL OK;
1225 ph10 178 switch (c)
1226     {
1227     case 0x000a:
1228     case 0x000b:
1229     case 0x000c:
1230     case 0x000d:
1231     case 0x0085:
1232     case 0x2028:
1233     case 0x2029:
1234     OK = TRUE;
1235 ph10 182 break;
1236 ph10 178
1237     default:
1238     OK = FALSE;
1239 ph10 182 break;
1240 ph10 178 }
1241    
1242     if (OK == (d == OP_VSPACE))
1243 ph10 182 {
1244 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1245     {
1246     active_count--; /* Remove non-match possibility */
1247     next_active_state--;
1248     }
1249     count++;
1250     ADD_NEW_DATA(-state_offset, count, 0);
1251     }
1252     }
1253     break;
1254    
1255     /*-----------------------------------------------------------------*/
1256     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1257     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1258     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1259     count = current_state->count; /* Already matched */
1260     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1261     if (clen > 0)
1262     {
1263 ph10 182 BOOL OK;
1264 ph10 178 switch (c)
1265     {
1266     case 0x09: /* HT */
1267     case 0x20: /* SPACE */
1268     case 0xa0: /* NBSP */
1269     case 0x1680: /* OGHAM SPACE MARK */
1270     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1271     case 0x2000: /* EN QUAD */
1272     case 0x2001: /* EM QUAD */
1273     case 0x2002: /* EN SPACE */
1274     case 0x2003: /* EM SPACE */
1275     case 0x2004: /* THREE-PER-EM SPACE */
1276     case 0x2005: /* FOUR-PER-EM SPACE */
1277     case 0x2006: /* SIX-PER-EM SPACE */
1278     case 0x2007: /* FIGURE SPACE */
1279     case 0x2008: /* PUNCTUATION SPACE */
1280     case 0x2009: /* THIN SPACE */
1281     case 0x200A: /* HAIR SPACE */
1282     case 0x202f: /* NARROW NO-BREAK SPACE */
1283     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1284     case 0x3000: /* IDEOGRAPHIC SPACE */
1285     OK = TRUE;
1286     break;
1287 ph10 182
1288 ph10 178 default:
1289     OK = FALSE;
1290     break;
1291     }
1292 ph10 182
1293 ph10 178 if (OK == (d == OP_HSPACE))
1294 ph10 182 {
1295 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1296     {
1297     active_count--; /* Remove non-match possibility */
1298     next_active_state--;
1299     }
1300     count++;
1301     ADD_NEW_DATA(-state_offset, count, 0);
1302     }
1303     }
1304     break;
1305    
1306     /*-----------------------------------------------------------------*/
1307 ph10 151 #ifdef SUPPORT_UCP
1308 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1309     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1310 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1311 nigel 87 count = 4;
1312 nigel 77 goto QS1;
1313    
1314     case OP_PROP_EXTRA + OP_TYPESTAR:
1315     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1316 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1317 nigel 77 count = 0;
1318    
1319     QS1:
1320    
1321 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1322 nigel 77 if (clen > 0)
1323     {
1324 nigel 87 BOOL OK;
1325 ph10 349 const ucd_record * prop = GET_UCD(c);
1326 nigel 87 switch(code[2])
1327     {
1328     case PT_ANY:
1329     OK = TRUE;
1330     break;
1331    
1332     case PT_LAMP:
1333 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1334 nigel 87 break;
1335    
1336     case PT_GC:
1337 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1338 nigel 87 break;
1339    
1340     case PT_PC:
1341 ph10 349 OK = prop->chartype == code[3];
1342 nigel 87 break;
1343    
1344     case PT_SC:
1345 ph10 349 OK = prop->script == code[3];
1346 nigel 87 break;
1347    
1348     /* Should never occur, but keep compilers from grumbling. */
1349    
1350     default:
1351     OK = codevalue != OP_PROP;
1352     break;
1353     }
1354    
1355 nigel 93 if (OK == (d == OP_PROP))
1356     {
1357     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1358     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1359     {
1360     active_count--; /* Remove non-match possibility */
1361     next_active_state--;
1362     }
1363     ADD_NEW(state_offset + count, 0);
1364     }
1365 nigel 77 }
1366     break;
1367    
1368     /*-----------------------------------------------------------------*/
1369     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1370     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1371 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1372 nigel 77 count = 2;
1373     goto QS2;
1374    
1375     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1376     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1377 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1378 nigel 77 count = 0;
1379    
1380     QS2:
1381    
1382     ADD_ACTIVE(state_offset + 2, 0);
1383 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1384 nigel 77 {
1385     const uschar *nptr = ptr + clen;
1386     int ncount = 0;
1387 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1388     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1389     {
1390     active_count--; /* Remove non-match possibility */
1391     next_active_state--;
1392     }
1393 nigel 77 while (nptr < end_subject)
1394     {
1395     int nd;
1396     int ndlen = 1;
1397     GETCHARLEN(nd, nptr, ndlen);
1398 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1399 nigel 77 ncount++;
1400     nptr += ndlen;
1401     }
1402     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1403     }
1404     break;
1405 ph10 151 #endif
1406 nigel 77
1407     /*-----------------------------------------------------------------*/
1408 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1409     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1410     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1411     count = 2;
1412     goto QS3;
1413    
1414     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1415     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1416     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1417     count = 0;
1418    
1419     QS3:
1420     ADD_ACTIVE(state_offset + 2, 0);
1421     if (clen > 0)
1422     {
1423     int ncount = 0;
1424     switch (c)
1425     {
1426     case 0x000b:
1427     case 0x000c:
1428     case 0x0085:
1429     case 0x2028:
1430     case 0x2029:
1431 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1432     goto ANYNL02;
1433    
1434     case 0x000d:
1435     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1436     /* Fall through */
1437    
1438     ANYNL02:
1439     case 0x000a:
1440 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1441     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1442     {
1443     active_count--; /* Remove non-match possibility */
1444     next_active_state--;
1445     }
1446     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1447     break;
1448 ph10 231
1449 nigel 93 default:
1450     break;
1451     }
1452     }
1453     break;
1454    
1455     /*-----------------------------------------------------------------*/
1456 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1457     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1458     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1459     count = 2;
1460     goto QS4;
1461    
1462     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1463     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1464     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1465     count = 0;
1466    
1467     QS4:
1468     ADD_ACTIVE(state_offset + 2, 0);
1469     if (clen > 0)
1470     {
1471 ph10 182 BOOL OK;
1472 ph10 178 switch (c)
1473     {
1474     case 0x000a:
1475     case 0x000b:
1476     case 0x000c:
1477     case 0x000d:
1478     case 0x0085:
1479     case 0x2028:
1480     case 0x2029:
1481     OK = TRUE;
1482     break;
1483 ph10 182
1484 ph10 178 default:
1485     OK = FALSE;
1486     break;
1487     }
1488     if (OK == (d == OP_VSPACE))
1489 ph10 182 {
1490 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1491     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1492     {
1493     active_count--; /* Remove non-match possibility */
1494     next_active_state--;
1495     }
1496     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1497     }
1498     }
1499     break;
1500    
1501     /*-----------------------------------------------------------------*/
1502     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1503     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1504     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1505     count = 2;
1506     goto QS5;
1507    
1508     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1509     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1510     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1511     count = 0;
1512    
1513     QS5:
1514     ADD_ACTIVE(state_offset + 2, 0);
1515     if (clen > 0)
1516     {
1517 ph10 182 BOOL OK;
1518 ph10 178 switch (c)
1519     {
1520     case 0x09: /* HT */
1521     case 0x20: /* SPACE */
1522     case 0xa0: /* NBSP */
1523     case 0x1680: /* OGHAM SPACE MARK */
1524     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1525     case 0x2000: /* EN QUAD */
1526     case 0x2001: /* EM QUAD */
1527     case 0x2002: /* EN SPACE */
1528     case 0x2003: /* EM SPACE */
1529     case 0x2004: /* THREE-PER-EM SPACE */
1530     case 0x2005: /* FOUR-PER-EM SPACE */
1531     case 0x2006: /* SIX-PER-EM SPACE */
1532     case 0x2007: /* FIGURE SPACE */
1533     case 0x2008: /* PUNCTUATION SPACE */
1534     case 0x2009: /* THIN SPACE */
1535     case 0x200A: /* HAIR SPACE */
1536     case 0x202f: /* NARROW NO-BREAK SPACE */
1537     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1538     case 0x3000: /* IDEOGRAPHIC SPACE */
1539     OK = TRUE;
1540     break;
1541 ph10 182
1542 ph10 178 default:
1543     OK = FALSE;
1544     break;
1545     }
1546 ph10 182
1547 ph10 178 if (OK == (d == OP_HSPACE))
1548 ph10 182 {
1549 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1550     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1551     {
1552     active_count--; /* Remove non-match possibility */
1553     next_active_state--;
1554     }
1555     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1556     }
1557     }
1558     break;
1559    
1560     /*-----------------------------------------------------------------*/
1561 ph10 151 #ifdef SUPPORT_UCP
1562 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1563     case OP_PROP_EXTRA + OP_TYPEUPTO:
1564     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1565 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1566 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1567 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1568 nigel 77 count = current_state->count; /* Number already matched */
1569     if (clen > 0)
1570     {
1571 nigel 87 BOOL OK;
1572 ph10 349 const ucd_record * prop = GET_UCD(c);
1573 nigel 87 switch(code[4])
1574 nigel 77 {
1575 nigel 87 case PT_ANY:
1576     OK = TRUE;
1577     break;
1578    
1579     case PT_LAMP:
1580 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1581 nigel 87 break;
1582    
1583     case PT_GC:
1584 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1585 nigel 87 break;
1586    
1587     case PT_PC:
1588 ph10 349 OK = prop->chartype == code[5];
1589 nigel 87 break;
1590    
1591     case PT_SC:
1592 ph10 349 OK = prop->script == code[5];
1593 nigel 87 break;
1594    
1595     /* Should never occur, but keep compilers from grumbling. */
1596    
1597     default:
1598     OK = codevalue != OP_PROP;
1599     break;
1600     }
1601    
1602     if (OK == (d == OP_PROP))
1603     {
1604 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1605     {
1606     active_count--; /* Remove non-match possibility */
1607     next_active_state--;
1608     }
1609 nigel 77 if (++count >= GET2(code, 1))
1610 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1611 nigel 77 else
1612     { ADD_NEW(state_offset, count); }
1613     }
1614     }
1615     break;
1616    
1617     /*-----------------------------------------------------------------*/
1618     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1619     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1620     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1621 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1622 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1623     { ADD_ACTIVE(state_offset + 4, 0); }
1624     count = current_state->count; /* Number already matched */
1625 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1626 nigel 77 {
1627     const uschar *nptr = ptr + clen;
1628     int ncount = 0;
1629 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1630     {
1631     active_count--; /* Remove non-match possibility */
1632     next_active_state--;
1633     }
1634 nigel 77 while (nptr < end_subject)
1635     {
1636     int nd;
1637     int ndlen = 1;
1638     GETCHARLEN(nd, nptr, ndlen);
1639 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1640 nigel 77 ncount++;
1641     nptr += ndlen;
1642     }
1643     if (++count >= GET2(code, 1))
1644     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1645     else
1646     { ADD_NEW_DATA(-state_offset, count, ncount); }
1647     }
1648     break;
1649 ph10 151 #endif
1650 nigel 77
1651 nigel 93 /*-----------------------------------------------------------------*/
1652     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1653     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1654     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1655     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1656     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1657     { ADD_ACTIVE(state_offset + 4, 0); }
1658     count = current_state->count; /* Number already matched */
1659     if (clen > 0)
1660     {
1661     int ncount = 0;
1662     switch (c)
1663     {
1664     case 0x000b:
1665     case 0x000c:
1666     case 0x0085:
1667     case 0x2028:
1668     case 0x2029:
1669 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1670     goto ANYNL03;
1671    
1672     case 0x000d:
1673     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1674     /* Fall through */
1675    
1676     ANYNL03:
1677     case 0x000a:
1678 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1679     {
1680     active_count--; /* Remove non-match possibility */
1681     next_active_state--;
1682     }
1683     if (++count >= GET2(code, 1))
1684     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1685     else
1686     { ADD_NEW_DATA(-state_offset, count, ncount); }
1687     break;
1688 ph10 231
1689 nigel 93 default:
1690     break;
1691     }
1692     }
1693     break;
1694    
1695 ph10 178 /*-----------------------------------------------------------------*/
1696     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1697     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1698     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1699     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1700     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1701     { ADD_ACTIVE(state_offset + 4, 0); }
1702     count = current_state->count; /* Number already matched */
1703     if (clen > 0)
1704     {
1705 ph10 182 BOOL OK;
1706 ph10 178 switch (c)
1707     {
1708     case 0x000a:
1709     case 0x000b:
1710     case 0x000c:
1711     case 0x000d:
1712     case 0x0085:
1713     case 0x2028:
1714     case 0x2029:
1715     OK = TRUE;
1716     break;
1717 ph10 182
1718 ph10 178 default:
1719     OK = FALSE;
1720     }
1721 ph10 182
1722 ph10 178 if (OK == (d == OP_VSPACE))
1723 ph10 182 {
1724 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1725     {
1726     active_count--; /* Remove non-match possibility */
1727     next_active_state--;
1728     }
1729     if (++count >= GET2(code, 1))
1730     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1731     else
1732     { ADD_NEW_DATA(-state_offset, count, 0); }
1733     }
1734     }
1735     break;
1736    
1737     /*-----------------------------------------------------------------*/
1738     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1739     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1740     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1741     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1742     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1743     { ADD_ACTIVE(state_offset + 4, 0); }
1744     count = current_state->count; /* Number already matched */
1745     if (clen > 0)
1746     {
1747 ph10 182 BOOL OK;
1748 ph10 178 switch (c)
1749     {
1750     case 0x09: /* HT */
1751     case 0x20: /* SPACE */
1752     case 0xa0: /* NBSP */
1753     case 0x1680: /* OGHAM SPACE MARK */
1754     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1755     case 0x2000: /* EN QUAD */
1756     case 0x2001: /* EM QUAD */
1757     case 0x2002: /* EN SPACE */
1758     case 0x2003: /* EM SPACE */
1759     case 0x2004: /* THREE-PER-EM SPACE */
1760     case 0x2005: /* FOUR-PER-EM SPACE */
1761     case 0x2006: /* SIX-PER-EM SPACE */
1762     case 0x2007: /* FIGURE SPACE */
1763     case 0x2008: /* PUNCTUATION SPACE */
1764     case 0x2009: /* THIN SPACE */
1765     case 0x200A: /* HAIR SPACE */
1766     case 0x202f: /* NARROW NO-BREAK SPACE */
1767     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1768     case 0x3000: /* IDEOGRAPHIC SPACE */
1769     OK = TRUE;
1770     break;
1771 ph10 182
1772 ph10 178 default:
1773     OK = FALSE;
1774     break;
1775     }
1776 ph10 182
1777 ph10 178 if (OK == (d == OP_HSPACE))
1778 ph10 182 {
1779 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1780     {
1781     active_count--; /* Remove non-match possibility */
1782     next_active_state--;
1783     }
1784     if (++count >= GET2(code, 1))
1785     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1786     else
1787     { ADD_NEW_DATA(-state_offset, count, 0); }
1788     }
1789     }
1790     break;
1791    
1792 nigel 77 /* ========================================================================== */
1793     /* These opcodes are followed by a character that is usually compared
1794     to the current subject character; it is loaded into d. We still get
1795     here even if there is no subject character, because in some cases zero
1796     repetitions are permitted. */
1797    
1798     /*-----------------------------------------------------------------*/
1799     case OP_CHAR:
1800     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1801     break;
1802    
1803     /*-----------------------------------------------------------------*/
1804     case OP_CHARNC:
1805     if (clen == 0) break;
1806    
1807     #ifdef SUPPORT_UTF8
1808     if (utf8)
1809     {
1810     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1811     {
1812 nigel 93 unsigned int othercase;
1813 nigel 77 if (c < 128) othercase = fcc[c]; else
1814    
1815     /* If we have Unicode property support, we can use it to test the
1816 nigel 87 other case of the character. */
1817 nigel 77
1818     #ifdef SUPPORT_UCP
1819 ph10 349 othercase = UCD_OTHERCASE(c);
1820 nigel 87 #else
1821 nigel 93 othercase = NOTACHAR;
1822 nigel 77 #endif
1823    
1824     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1825     }
1826     }
1827     else
1828     #endif /* SUPPORT_UTF8 */
1829    
1830     /* Non-UTF-8 mode */
1831     {
1832     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1833     }
1834     break;
1835    
1836    
1837     #ifdef SUPPORT_UCP
1838     /*-----------------------------------------------------------------*/
1839     /* This is a tricky one because it can match more than one character.
1840     Find out how many characters to skip, and then set up a negative state
1841     to wait for them to pass before continuing. */
1842    
1843     case OP_EXTUNI:
1844 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1845 nigel 77 {
1846     const uschar *nptr = ptr + clen;
1847     int ncount = 0;
1848     while (nptr < end_subject)
1849     {
1850     int nclen = 1;
1851     GETCHARLEN(c, nptr, nclen);
1852 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1853 nigel 77 ncount++;
1854     nptr += nclen;
1855     }
1856     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1857     }
1858     break;
1859     #endif
1860    
1861     /*-----------------------------------------------------------------*/
1862 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1863     character (when CR is followed by LF). In this case, set up a negative
1864     state to wait for one character to pass before continuing. */
1865    
1866     case OP_ANYNL:
1867     if (clen > 0) switch(c)
1868     {
1869     case 0x000b:
1870     case 0x000c:
1871     case 0x0085:
1872     case 0x2028:
1873     case 0x2029:
1874 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1875    
1876     case 0x000a:
1877 nigel 93 ADD_NEW(state_offset + 1, 0);
1878     break;
1879 ph10 231
1880 nigel 93 case 0x000d:
1881     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1882     {
1883     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1884     }
1885     else
1886     {
1887     ADD_NEW(state_offset + 1, 0);
1888     }
1889     break;
1890     }
1891     break;
1892    
1893     /*-----------------------------------------------------------------*/
1894 ph10 178 case OP_NOT_VSPACE:
1895     if (clen > 0) switch(c)
1896     {
1897     case 0x000a:
1898     case 0x000b:
1899     case 0x000c:
1900     case 0x000d:
1901     case 0x0085:
1902     case 0x2028:
1903     case 0x2029:
1904     break;
1905 ph10 182
1906     default:
1907 ph10 178 ADD_NEW(state_offset + 1, 0);
1908     break;
1909     }
1910     break;
1911    
1912     /*-----------------------------------------------------------------*/
1913     case OP_VSPACE:
1914     if (clen > 0) switch(c)
1915     {
1916     case 0x000a:
1917     case 0x000b:
1918     case 0x000c:
1919     case 0x000d:
1920     case 0x0085:
1921     case 0x2028:
1922     case 0x2029:
1923     ADD_NEW(state_offset + 1, 0);
1924     break;
1925 ph10 182
1926 ph10 178 default: break;
1927     }
1928     break;
1929    
1930     /*-----------------------------------------------------------------*/
1931     case OP_NOT_HSPACE:
1932     if (clen > 0) switch(c)
1933     {
1934     case 0x09: /* HT */
1935     case 0x20: /* SPACE */
1936     case 0xa0: /* NBSP */
1937     case 0x1680: /* OGHAM SPACE MARK */
1938     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1939     case 0x2000: /* EN QUAD */
1940     case 0x2001: /* EM QUAD */
1941     case 0x2002: /* EN SPACE */
1942     case 0x2003: /* EM SPACE */
1943     case 0x2004: /* THREE-PER-EM SPACE */
1944     case 0x2005: /* FOUR-PER-EM SPACE */
1945     case 0x2006: /* SIX-PER-EM SPACE */
1946     case 0x2007: /* FIGURE SPACE */
1947     case 0x2008: /* PUNCTUATION SPACE */
1948     case 0x2009: /* THIN SPACE */
1949     case 0x200A: /* HAIR SPACE */
1950     case 0x202f: /* NARROW NO-BREAK SPACE */
1951     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1952     case 0x3000: /* IDEOGRAPHIC SPACE */
1953     break;
1954 ph10 182
1955     default:
1956 ph10 178 ADD_NEW(state_offset + 1, 0);
1957     break;
1958     }
1959     break;
1960    
1961     /*-----------------------------------------------------------------*/
1962     case OP_HSPACE:
1963     if (clen > 0) switch(c)
1964     {
1965     case 0x09: /* HT */
1966     case 0x20: /* SPACE */
1967     case 0xa0: /* NBSP */
1968     case 0x1680: /* OGHAM SPACE MARK */
1969     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1970     case 0x2000: /* EN QUAD */
1971     case 0x2001: /* EM QUAD */
1972     case 0x2002: /* EN SPACE */
1973     case 0x2003: /* EM SPACE */
1974     case 0x2004: /* THREE-PER-EM SPACE */
1975     case 0x2005: /* FOUR-PER-EM SPACE */
1976     case 0x2006: /* SIX-PER-EM SPACE */
1977     case 0x2007: /* FIGURE SPACE */
1978     case 0x2008: /* PUNCTUATION SPACE */
1979     case 0x2009: /* THIN SPACE */
1980     case 0x200A: /* HAIR SPACE */
1981     case 0x202f: /* NARROW NO-BREAK SPACE */
1982     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1983     case 0x3000: /* IDEOGRAPHIC SPACE */
1984     ADD_NEW(state_offset + 1, 0);
1985     break;
1986     }
1987     break;
1988    
1989     /*-----------------------------------------------------------------*/
1990 nigel 77 /* Match a negated single character. This is only used for one-byte
1991     characters, that is, we know that d < 256. The character we are
1992     checking (c) can be multibyte. */
1993    
1994     case OP_NOT:
1995     if (clen > 0)
1996     {
1997 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1998 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1999     }
2000     break;
2001    
2002     /*-----------------------------------------------------------------*/
2003     case OP_PLUS:
2004     case OP_MINPLUS:
2005 nigel 93 case OP_POSPLUS:
2006 nigel 77 case OP_NOTPLUS:
2007     case OP_NOTMINPLUS:
2008 nigel 93 case OP_NOTPOSPLUS:
2009 nigel 77 count = current_state->count; /* Already matched */
2010     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2011     if (clen > 0)
2012     {
2013 nigel 93 unsigned int otherd = NOTACHAR;
2014 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2015     {
2016     #ifdef SUPPORT_UTF8
2017 nigel 87 if (utf8 && d >= 128)
2018 nigel 77 {
2019     #ifdef SUPPORT_UCP
2020 ph10 349 otherd = UCD_OTHERCASE(d);
2021 nigel 77 #endif /* SUPPORT_UCP */
2022     }
2023     else
2024     #endif /* SUPPORT_UTF8 */
2025     otherd = fcc[d];
2026     }
2027     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2028 nigel 93 {
2029     if (count > 0 &&
2030     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2031     {
2032     active_count--; /* Remove non-match possibility */
2033     next_active_state--;
2034     }
2035     count++;
2036     ADD_NEW(state_offset, count);
2037     }
2038 nigel 77 }
2039     break;
2040    
2041     /*-----------------------------------------------------------------*/
2042     case OP_QUERY:
2043     case OP_MINQUERY:
2044 nigel 93 case OP_POSQUERY:
2045 nigel 77 case OP_NOTQUERY:
2046     case OP_NOTMINQUERY:
2047 nigel 93 case OP_NOTPOSQUERY:
2048 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2049     if (clen > 0)
2050     {
2051 nigel 93 unsigned int otherd = NOTACHAR;
2052 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2053 nigel 77 {
2054     #ifdef SUPPORT_UTF8
2055 nigel 87 if (utf8 && d >= 128)
2056 nigel 77 {
2057     #ifdef SUPPORT_UCP
2058 ph10 349 otherd = UCD_OTHERCASE(d);
2059 nigel 77 #endif /* SUPPORT_UCP */
2060     }
2061     else
2062     #endif /* SUPPORT_UTF8 */
2063     otherd = fcc[d];
2064     }
2065     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2066 nigel 93 {
2067     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2068     {
2069     active_count--; /* Remove non-match possibility */
2070     next_active_state--;
2071     }
2072     ADD_NEW(state_offset + dlen + 1, 0);
2073     }
2074 nigel 77 }
2075     break;
2076    
2077     /*-----------------------------------------------------------------*/
2078     case OP_STAR:
2079     case OP_MINSTAR:
2080 nigel 93 case OP_POSSTAR:
2081 nigel 77 case OP_NOTSTAR:
2082     case OP_NOTMINSTAR:
2083 nigel 93 case OP_NOTPOSSTAR:
2084 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2085     if (clen > 0)
2086     {
2087 nigel 93 unsigned int otherd = NOTACHAR;
2088 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2089 nigel 77 {
2090     #ifdef SUPPORT_UTF8
2091 nigel 87 if (utf8 && d >= 128)
2092 nigel 77 {
2093     #ifdef SUPPORT_UCP
2094 ph10 349 otherd = UCD_OTHERCASE(d);
2095 nigel 77 #endif /* SUPPORT_UCP */
2096     }
2097     else
2098     #endif /* SUPPORT_UTF8 */
2099     otherd = fcc[d];
2100     }
2101     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2102 nigel 93 {
2103     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2104     {
2105     active_count--; /* Remove non-match possibility */
2106     next_active_state--;
2107     }
2108     ADD_NEW(state_offset, 0);
2109     }
2110 nigel 77 }
2111     break;
2112    
2113     /*-----------------------------------------------------------------*/
2114     case OP_EXACT:
2115 nigel 93 case OP_NOTEXACT:
2116     count = current_state->count; /* Number already matched */
2117     if (clen > 0)
2118     {
2119     unsigned int otherd = NOTACHAR;
2120     if ((ims & PCRE_CASELESS) != 0)
2121     {
2122     #ifdef SUPPORT_UTF8
2123     if (utf8 && d >= 128)
2124     {
2125     #ifdef SUPPORT_UCP
2126 ph10 349 otherd = UCD_OTHERCASE(d);
2127 nigel 93 #endif /* SUPPORT_UCP */
2128     }
2129     else
2130     #endif /* SUPPORT_UTF8 */
2131     otherd = fcc[d];
2132     }
2133     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2134     {
2135     if (++count >= GET2(code, 1))
2136     { ADD_NEW(state_offset + dlen + 3, 0); }
2137     else
2138     { ADD_NEW(state_offset, count); }
2139     }
2140     }
2141     break;
2142    
2143     /*-----------------------------------------------------------------*/
2144 nigel 77 case OP_UPTO:
2145     case OP_MINUPTO:
2146 nigel 93 case OP_POSUPTO:
2147 nigel 77 case OP_NOTUPTO:
2148     case OP_NOTMINUPTO:
2149 nigel 93 case OP_NOTPOSUPTO:
2150     ADD_ACTIVE(state_offset + dlen + 3, 0);
2151 nigel 77 count = current_state->count; /* Number already matched */
2152     if (clen > 0)
2153     {
2154 nigel 93 unsigned int otherd = NOTACHAR;
2155 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2156     {
2157     #ifdef SUPPORT_UTF8
2158 nigel 87 if (utf8 && d >= 128)
2159 nigel 77 {
2160     #ifdef SUPPORT_UCP
2161 ph10 349 otherd = UCD_OTHERCASE(d);
2162 nigel 77 #endif /* SUPPORT_UCP */
2163     }
2164     else
2165     #endif /* SUPPORT_UTF8 */
2166     otherd = fcc[d];
2167     }
2168     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2169     {
2170 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2171     {
2172     active_count--; /* Remove non-match possibility */
2173     next_active_state--;
2174     }
2175 nigel 77 if (++count >= GET2(code, 1))
2176     { ADD_NEW(state_offset + dlen + 3, 0); }
2177     else
2178     { ADD_NEW(state_offset, count); }
2179     }
2180     }
2181     break;
2182    
2183    
2184     /* ========================================================================== */
2185     /* These are the class-handling opcodes */
2186    
2187     case OP_CLASS:
2188     case OP_NCLASS:
2189     case OP_XCLASS:
2190     {
2191     BOOL isinclass = FALSE;
2192     int next_state_offset;
2193     const uschar *ecode;
2194    
2195     /* For a simple class, there is always just a 32-byte table, and we
2196     can set isinclass from it. */
2197    
2198     if (codevalue != OP_XCLASS)
2199     {
2200     ecode = code + 33;
2201     if (clen > 0)
2202     {
2203     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2204     ((code[1 + c/8] & (1 << (c&7))) != 0);
2205     }
2206     }
2207    
2208     /* An extended class may have a table or a list of single characters,
2209     ranges, or both, and it may be positive or negative. There's a
2210     function that sorts all this out. */
2211    
2212     else
2213     {
2214     ecode = code + GET(code, 1);
2215     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2216     }
2217    
2218     /* At this point, isinclass is set for all kinds of class, and ecode
2219     points to the byte after the end of the class. If there is a
2220     quantifier, this is where it will be. */
2221    
2222     next_state_offset = ecode - start_code;
2223    
2224     switch (*ecode)
2225     {
2226     case OP_CRSTAR:
2227     case OP_CRMINSTAR:
2228     ADD_ACTIVE(next_state_offset + 1, 0);
2229     if (isinclass) { ADD_NEW(state_offset, 0); }
2230     break;
2231    
2232     case OP_CRPLUS:
2233     case OP_CRMINPLUS:
2234     count = current_state->count; /* Already matched */
2235     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2236     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2237     break;
2238    
2239     case OP_CRQUERY:
2240     case OP_CRMINQUERY:
2241     ADD_ACTIVE(next_state_offset + 1, 0);
2242     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2243     break;
2244    
2245     case OP_CRRANGE:
2246     case OP_CRMINRANGE:
2247     count = current_state->count; /* Already matched */
2248     if (count >= GET2(ecode, 1))
2249     { ADD_ACTIVE(next_state_offset + 5, 0); }
2250     if (isinclass)
2251     {
2252 nigel 91 int max = GET2(ecode, 3);
2253     if (++count >= max && max != 0) /* Max 0 => no limit */
2254 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2255     else
2256     { ADD_NEW(state_offset, count); }
2257     }
2258     break;
2259    
2260     default:
2261     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2262     break;
2263     }
2264     }
2265     break;
2266    
2267     /* ========================================================================== */
2268     /* These are the opcodes for fancy brackets of various kinds. We have
2269 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2270     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2271 ph10 341 though the other "backtracking verbs" are not supported. */
2272 ph10 345
2273 ph10 341 case OP_FAIL:
2274 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2275 ph10 345 break;
2276 nigel 77
2277     case OP_ASSERT:
2278     case OP_ASSERT_NOT:
2279     case OP_ASSERTBACK:
2280     case OP_ASSERTBACK_NOT:
2281     {
2282     int rc;
2283     int local_offsets[2];
2284     int local_workspace[1000];
2285     const uschar *endasscode = code + GET(code, 1);
2286    
2287     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2288    
2289     rc = internal_dfa_exec(
2290     md, /* static match data */
2291     code, /* this subexpression's code */
2292     ptr, /* where we currently are */
2293     ptr - start_subject, /* start offset */
2294     local_offsets, /* offset vector */
2295     sizeof(local_offsets)/sizeof(int), /* size of same */
2296     local_workspace, /* workspace vector */
2297     sizeof(local_workspace)/sizeof(int), /* size of same */
2298     ims, /* the current ims flags */
2299     rlevel, /* function recursion level */
2300     recursing); /* pass on regex recursion */
2301    
2302     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2303     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2304     }
2305     break;
2306    
2307     /*-----------------------------------------------------------------*/
2308     case OP_COND:
2309 nigel 93 case OP_SCOND:
2310 nigel 77 {
2311     int local_offsets[1000];
2312     int local_workspace[1000];
2313 ph10 406 int codelink = GET(code, 1);
2314 ph10 397 int condcode;
2315 ph10 406
2316 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2317 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2318 ph10 398 happen for the other conditions. */
2319 nigel 77
2320 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2321 ph10 406 {
2322     rrc = 0;
2323 ph10 397 if (pcre_callout != NULL)
2324     {
2325     pcre_callout_block cb;
2326     cb.version = 1; /* Version 1 of the callout block */
2327     cb.callout_number = code[LINK_SIZE+2];
2328     cb.offset_vector = offsets;
2329     cb.subject = (PCRE_SPTR)start_subject;
2330     cb.subject_length = end_subject - start_subject;
2331     cb.start_match = current_subject - start_subject;
2332     cb.current_position = ptr - start_subject;
2333     cb.pattern_position = GET(code, LINK_SIZE + 3);
2334     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2335     cb.capture_top = 1;
2336     cb.capture_last = -1;
2337     cb.callout_data = md->callout_data;
2338     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2339     }
2340 ph10 398 if (rrc > 0) break; /* Fail this thread */
2341     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2342 ph10 406 }
2343 ph10 398
2344 ph10 397 condcode = code[LINK_SIZE+1];
2345 ph10 406
2346 nigel 93 /* Back reference conditions are not supported */
2347 nigel 77
2348 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2349 ph10 459 return PCRE_ERROR_DFA_UCOND;
2350 nigel 93
2351     /* The DEFINE condition is always false */
2352    
2353     if (condcode == OP_DEF)
2354 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2355 nigel 93
2356     /* The only supported version of OP_RREF is for the value RREF_ANY,
2357     which means "test if in any recursion". We can't test for specifically
2358     recursed groups. */
2359    
2360 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2361 nigel 93 {
2362 nigel 77 int value = GET2(code, LINK_SIZE+2);
2363 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2364 ph10 406 if (recursing > 0)
2365 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2366     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2367 nigel 77 }
2368    
2369     /* Otherwise, the condition is an assertion */
2370    
2371     else
2372     {
2373     int rc;
2374     const uschar *asscode = code + LINK_SIZE + 1;
2375     const uschar *endasscode = asscode + GET(asscode, 1);
2376    
2377     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2378    
2379     rc = internal_dfa_exec(
2380     md, /* fixed match data */
2381     asscode, /* this subexpression's code */
2382     ptr, /* where we currently are */
2383     ptr - start_subject, /* start offset */
2384     local_offsets, /* offset vector */
2385     sizeof(local_offsets)/sizeof(int), /* size of same */
2386     local_workspace, /* workspace vector */
2387     sizeof(local_workspace)/sizeof(int), /* size of same */
2388     ims, /* the current ims flags */
2389     rlevel, /* function recursion level */
2390     recursing); /* pass on regex recursion */
2391    
2392     if ((rc >= 0) ==
2393     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2394 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2395 nigel 77 else
2396 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2397 nigel 77 }
2398     }
2399     break;
2400    
2401     /*-----------------------------------------------------------------*/
2402     case OP_RECURSE:
2403     {
2404     int local_offsets[1000];
2405     int local_workspace[1000];
2406     int rc;
2407    
2408     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2409     recursing + 1));
2410    
2411     rc = internal_dfa_exec(
2412     md, /* fixed match data */
2413     start_code + GET(code, 1), /* this subexpression's code */
2414     ptr, /* where we currently are */
2415     ptr - start_subject, /* start offset */
2416     local_offsets, /* offset vector */
2417     sizeof(local_offsets)/sizeof(int), /* size of same */
2418     local_workspace, /* workspace vector */
2419     sizeof(local_workspace)/sizeof(int), /* size of same */
2420     ims, /* the current ims flags */
2421     rlevel, /* function recursion level */
2422     recursing + 1); /* regex recurse level */
2423    
2424     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2425     recursing + 1, rc));
2426    
2427     /* Ran out of internal offsets */
2428    
2429     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2430    
2431     /* For each successful matched substring, set up the next state with a
2432     count of characters to skip before trying it. Note that the count is in
2433     characters, not bytes. */
2434    
2435     if (rc > 0)
2436     {
2437     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2438     {
2439     const uschar *p = start_subject + local_offsets[rc];
2440     const uschar *pp = start_subject + local_offsets[rc+1];
2441     int charcount = local_offsets[rc+1] - local_offsets[rc];
2442     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2443     if (charcount > 0)
2444     {
2445     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2446     }
2447     else
2448     {
2449     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2450     }
2451     }
2452     }
2453     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2454     }
2455     break;
2456    
2457     /*-----------------------------------------------------------------*/
2458     case OP_ONCE:
2459     {
2460     int local_offsets[2];
2461     int local_workspace[1000];
2462    
2463     int rc = internal_dfa_exec(
2464     md, /* fixed match data */
2465     code, /* this subexpression's code */
2466     ptr, /* where we currently are */
2467     ptr - start_subject, /* start offset */
2468     local_offsets, /* offset vector */
2469     sizeof(local_offsets)/sizeof(int), /* size of same */
2470     local_workspace, /* workspace vector */
2471     sizeof(local_workspace)/sizeof(int), /* size of same */
2472     ims, /* the current ims flags */
2473     rlevel, /* function recursion level */
2474     recursing); /* pass on regex recursion */
2475    
2476     if (rc >= 0)
2477     {
2478     const uschar *end_subpattern = code;
2479     int charcount = local_offsets[1] - local_offsets[0];
2480     int next_state_offset, repeat_state_offset;
2481    
2482     do { end_subpattern += GET(end_subpattern, 1); }
2483     while (*end_subpattern == OP_ALT);
2484     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2485    
2486     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2487     arrange for the repeat state also to be added to the relevant list.
2488     Calculate the offset, or set -1 for no repeat. */
2489    
2490     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2491     *end_subpattern == OP_KETRMIN)?
2492     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2493    
2494     /* If we have matched an empty string, add the next state at the
2495     current character pointer. This is important so that the duplicate
2496     checking kicks in, which is what breaks infinite loops that match an
2497     empty string. */
2498    
2499     if (charcount == 0)
2500     {
2501     ADD_ACTIVE(next_state_offset, 0);
2502     }
2503    
2504     /* Optimization: if there are no more active states, and there
2505     are no new states yet set up, then skip over the subject string
2506     right here, to save looping. Otherwise, set up the new state to swing
2507     into action when the end of the substring is reached. */
2508    
2509     else if (i + 1 >= active_count && new_count == 0)
2510     {
2511     ptr += charcount;
2512     clen = 0;
2513     ADD_NEW(next_state_offset, 0);
2514    
2515     /* If we are adding a repeat state at the new character position,
2516     we must fudge things so that it is the only current state.
2517     Otherwise, it might be a duplicate of one we processed before, and
2518     that would cause it to be skipped. */
2519    
2520     if (repeat_state_offset >= 0)
2521     {
2522     next_active_state = active_states;
2523     active_count = 0;
2524     i = -1;
2525     ADD_ACTIVE(repeat_state_offset, 0);
2526     }
2527     }
2528     else
2529     {
2530     const uschar *p = start_subject + local_offsets[0];
2531     const uschar *pp = start_subject + local_offsets[1];
2532     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2533     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2534     if (repeat_state_offset >= 0)
2535     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2536     }
2537    
2538     }
2539     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2540     }
2541     break;
2542    
2543    
2544     /* ========================================================================== */
2545     /* Handle callouts */
2546    
2547     case OP_CALLOUT:
2548 ph10 406 rrc = 0;
2549 nigel 77 if (pcre_callout != NULL)
2550     {
2551     pcre_callout_block cb;
2552     cb.version = 1; /* Version 1 of the callout block */
2553     cb.callout_number = code[1];
2554     cb.offset_vector = offsets;
2555 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2556 nigel 77 cb.subject_length = end_subject - start_subject;
2557     cb.start_match = current_subject - start_subject;
2558     cb.current_position = ptr - start_subject;
2559     cb.pattern_position = GET(code, 2);
2560     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2561     cb.capture_top = 1;
2562     cb.capture_last = -1;
2563     cb.callout_data = md->callout_data;
2564     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2565 ph10 406 }
2566     if (rrc == 0)
2567     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2568 nigel 77 break;
2569    
2570    
2571     /* ========================================================================== */
2572     default: /* Unsupported opcode */
2573     return PCRE_ERROR_DFA_UITEM;
2574     }
2575    
2576     NEXT_ACTIVE_STATE: continue;
2577    
2578     } /* End of loop scanning active states */
2579    
2580     /* We have finished the processing at the current subject character. If no
2581     new states have been set for the next character, we have found all the
2582     matches that we are going to find. If we are at the top level and partial
2583 ph10 463 matching has been requested, check for appropriate conditions.
2584    
2585 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2586     character. If it is equal to the original active_count (saved in
2587     workspace[1]) it means that (*F) was found on every active state. In this
2588 ph10 463 case we don't want to give a partial match.
2589 nigel 77
2590 ph10 463 The "could_continue" variable is true if a state could have continued but
2591     for the fact that the end of the subject was reached. */
2592    
2593 nigel 77 if (new_count <= 0)
2594     {
2595 ph10 427 if (rlevel == 1 && /* Top level, and */
2596 ph10 463 could_continue && /* Some could go on */
2597 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2598 ph10 427 ( /* either... */
2599     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2600     || /* or... */
2601     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2602     match_count < 0) /* no matches */
2603     ) && /* And... */
2604     ptr >= end_subject && /* Reached end of subject */
2605     ptr > current_subject) /* Matched non-empty string */
2606 nigel 77 {
2607     if (offsetcount >= 2)
2608     {
2609 ph10 435 offsets[0] = md->start_used_ptr - start_subject;
2610 nigel 77 offsets[1] = end_subject - start_subject;
2611     }
2612     match_count = PCRE_ERROR_PARTIAL;
2613     }
2614    
2615     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2616     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2617     rlevel*2-2, SP));
2618 nigel 91 break; /* In effect, "return", but see the comment below */
2619 nigel 77 }
2620    
2621     /* One or more states are active for the next character. */
2622    
2623     ptr += clen; /* Advance to next subject character */
2624     } /* Loop to move along the subject string */
2625    
2626 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2627     if we use "return" above, we have compiler trouble. Some compilers warn if
2628     there's nothing here because they think the function doesn't return a value. On
2629     the other hand, if we put a dummy statement here, some more clever compilers
2630     complain that it can't be reached. Sigh. */
2631 nigel 77
2632 nigel 91 return match_count;
2633 nigel 77 }
2634    
2635    
2636    
2637    
2638     /*************************************************
2639     * Execute a Regular Expression - DFA engine *
2640     *************************************************/
2641    
2642     /* This external function applies a compiled re to a subject string using a DFA
2643     engine. This function calls the internal function multiple times if the pattern
2644     is not anchored.
2645    
2646     Arguments:
2647     argument_re points to the compiled expression
2648 ph10 97 extra_data points to extra data or is NULL
2649 nigel 77 subject points to the subject string
2650     length length of subject string (may contain binary zeros)
2651     start_offset where to start in the subject string
2652     options option bits
2653     offsets vector of match offsets
2654     offsetcount size of same
2655     workspace workspace vector
2656     wscount size of same
2657    
2658     Returns: > 0 => number of match offset pairs placed in offsets
2659     = 0 => offsets overflowed; longest matches are present
2660     -1 => failed to match
2661     < -1 => some kind of unexpected problem
2662     */
2663    
/* Public entry point for DFA matching: checks the arguments, fills in the
fixed match data block, and then runs the "bumpalong" loop that calls
internal_dfa_exec() at successive starting positions in the subject. */
2664 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2665 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2666     const char *subject, int length, int start_offset, int options, int *offsets,
2667     int offsetcount, int *workspace, int wscount)
2668     {
2669     real_pcre *re = (real_pcre *)argument_re;
2670     dfa_match_data match_block;
2671 nigel 91 dfa_match_data *md = &match_block;
2672 nigel 77 BOOL utf8, anchored, startline, firstline;
2673     const uschar *current_subject, *end_subject, *lcc;
2674    
/* internal_study and internal_re receive the flipped copies if the pattern
turns out to have been compiled on a host of opposite endianness (see the
magic number test below). */
2675     pcre_study_data internal_study;
2676     const pcre_study_data *study = NULL;
2677     real_pcre internal_re;
2678    
2679     const uschar *req_byte_ptr;
2680     const uschar *start_bits = NULL;
2681     BOOL first_byte_caseless = FALSE;
2682     BOOL req_byte_caseless = FALSE;
2683     int first_byte = -1;
2684     int req_byte = -1;
2685     int req_byte2 = -1;
2686 nigel 91 int newline;
2687 nigel 77
2688     /* Plausibility checks */
2689    
2690     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2691     if (re == NULL || subject == NULL || workspace == NULL ||
2692     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2693     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2694     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2695    
2696     /* We need to find the pointer to any study data before we test for byte
2697     flipping, so we scan the extra_data block first. This may set two fields in the
2698     match block, so we must initialize them beforehand. However, the other fields
2699     in the match block must not be set until after the byte flipping. */
2700    
2701 nigel 91 md->tables = re->tables;
2702     md->callout_data = NULL;
2703 nigel 77
/* Match limits are rejected outright here: if either limit flag is set in
extra_data, the call fails with PCRE_ERROR_DFA_UMLIMIT. */
2704     if (extra_data != NULL)
2705     {
2706     unsigned int flags = extra_data->flags;
2707     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2708     study = (const pcre_study_data *)extra_data->study_data;
2709     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2710 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2711     return PCRE_ERROR_DFA_UMLIMIT;
2712 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2713 nigel 91 md->callout_data = extra_data->callout_data;
2714 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2715 nigel 91 md->tables = extra_data->tables;
2716 nigel 77 }
2717 ph10 461
2718 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
2719     test for a regex that was compiled on a host of opposite endianness. If this is
2720     the case, flipped values are put in internal_re and internal_study if there was
2721     study data too. */
2722    
2723     if (re->magic_number != MAGIC_NUMBER)
2724     {
2725     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2726     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2727     if (study != NULL) study = &internal_study;
2728     }
2729    
2730     /* Set some local values */
2731    
2732     current_subject = (const unsigned char *)subject + start_offset;
2733     end_subject = (const unsigned char *)subject + length;
/* req_byte_ptr starts one byte before the first candidate position, so that
the "p > req_byte_ptr" test in the main loop always triggers a search the
first time round. */
2734     req_byte_ptr = current_subject - 1;
2735    
2736 nigel 91 #ifdef SUPPORT_UTF8
2737 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2738 nigel 91 #else
2739     utf8 = FALSE;
2740     #endif
2741 nigel 77
/* The match is treated as anchored if PCRE_ANCHORED is set at either compile
time or match time, or if this call is a restart (PCRE_DFA_RESTART). */
2742 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2743     (re->options & PCRE_ANCHORED) != 0;
2744    
2745 nigel 77 /* The remaining fixed data for passing around. */
2746    
2747 nigel 91 md->start_code = (const uschar *)argument_re +
2748 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2749 nigel 91 md->start_subject = (const unsigned char *)subject;
2750     md->end_subject = end_subject;
2751 ph10 442 md->start_offset = start_offset;
2752 nigel 91 md->moptions = options;
2753     md->poptions = re->options;
2754 nigel 77
2755 ph10 231 /* If the BSR option is not set at match time, copy what was set
2756     at compile time. */
2757    
2758     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2759     {
2760     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2761     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2762     #ifdef BSR_ANYCRLF
2763     else md->moptions |= PCRE_BSR_ANYCRLF;
2764 ph10 243 #endif
2765     }
2766 ph10 231
2767 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2768     nothing is set at run time, whatever was used at compile time applies. */
2769 nigel 91
2770 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2771 nigel 93 PCRE_NEWLINE_BITS)
2772 nigel 91 {
2773 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2774 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2775     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2776 nigel 91 case PCRE_NEWLINE_CR+
2777 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2778 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2779 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2780 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2781 nigel 91 }
2782    
/* Translate the value chosen above into the match block: -2 and other negative
values select the ANYCRLF and ANY types respectively; anything else is a fixed
newline of one byte, or of two bytes packed as (first << 8) | second. */
2783 ph10 149 if (newline == -2)
2784 nigel 91 {
2785 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2786     }
2787     else if (newline < 0)
2788     {
2789 nigel 93 md->nltype = NLTYPE_ANY;
2790 nigel 91 }
2791     else
2792     {
2793 nigel 93 md->nltype = NLTYPE_FIXED;
2794     if (newline > 255)
2795     {
2796     md->nllen = 2;
2797     md->nl[0] = (newline >> 8) & 255;
2798     md->nl[1] = newline & 255;
2799     }
2800     else
2801     {
2802     md->nllen = 1;
2803     md->nl[0] = newline;
2804     }
2805 nigel 91 }
2806    
2807 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2808     back the character offset. */
2809    
2810     #ifdef SUPPORT_UTF8
2811     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2812     {
/* A non-negative result from _pcre_valid_utf8() is treated as "invalid". */
2813     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2814     return PCRE_ERROR_BADUTF8;
/* A start offset strictly inside the subject must not point at a byte whose
top two bits are 10 (a UTF-8 continuation byte, that is, the middle of a
character). */
2815     if (start_offset > 0 && start_offset < length)
2816     {
2817     int tb = ((uschar *)subject)[start_offset];
2818     if (tb > 127)
2819     {
2820     tb &= 0xc0;
2821     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2822     }
2823     }
2824     }
2825     #endif
2826    
2827     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2828     is a feature that makes it possible to save compiled regex and re-use them
2829     in other programs later. */
2830    
2831 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2832 nigel 77
2833     /* The lower casing table and the "must be at the start of a line" flag are
2834     used in a loop when finding where to start. */
2835    
2836 nigel 91 lcc = md->tables + lcc_offset;
2837 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2838 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2839    
2840     /* Set up the first character to match, if available. The first_byte value is
2841     never set for an anchored regular expression, but the anchoring may be forced
2842     at run time, so we have to test for anchoring. The first char may be unset for
2843     an unanchored pattern, of course. If there's no first char and the pattern was
2844     studied, there may be a bitmap of possible first characters. */
2845    
2846     if (!anchored)
2847     {
2848 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2849 nigel 77 {
2850     first_byte = re->first_byte & 255;
2851     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2852     first_byte = lcc[first_byte];
2853     }
2854     else
2855     {
2856 ph10 455 if (!startline && study != NULL &&
2857     (study->flags & PCRE_STUDY_MAPPED) != 0)
2858 nigel 77 start_bits = study->start_bits;
2859     }
2860     }
2861    
2862     /* For anchored or unanchored matches, there may be a "last known required
2863     character" set. */
2864    
2865 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2866 nigel 77 {
2867     req_byte = re->req_byte & 255;
2868     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2869 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2870 nigel 77 }
2871    
2872     /* Call the main matching function, looping for a non-anchored regex after a
2873 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2874     a match. */
2875 nigel 77
2876     for (;;)
2877     {
2878     int rc;
2879    
2880     if ((options & PCRE_DFA_RESTART) == 0)
2881     {
2882     const uschar *save_end_subject = end_subject;
2883    
2884 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2885     line of a multiline string. Implement this by temporarily adjusting
2886     end_subject so that we stop scanning at a newline. If the match fails at
2887     the newline, later code breaks this loop. */
2888 nigel 77
2889     if (firstline)
2890     {
2891 ph10 365 USPTR t = current_subject;
2892     #ifdef SUPPORT_UTF8
2893     if (utf8)
2894 ph10 371 {
2895     while (t < md->end_subject && !IS_NEWLINE(t))
2896 ph10 365 {
2897     t++;
2898     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2899 ph10 371 }
2900 ph10 365 }
2901     else
2902 ph10 371 #endif
2903 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2904 nigel 77 end_subject = t;
2905     }
2906 ph10 392
2907 ph10 389 /* There are some optimizations that avoid running the match if a known
2908 ph10 455 starting point is not found. However, there is an option that disables
2909     these, for testing and for ensuring that all callouts do actually occur. */
2910 nigel 77
2911 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2912 ph10 392 {
2913 ph10 389 /* Advance to a known first byte. */
2914 ph10 392
2915 ph10 389 if (first_byte >= 0)
2916 nigel 77 {
2917 ph10 389 if (first_byte_caseless)
2918     while (current_subject < end_subject &&
2919     lcc[*current_subject] != first_byte)
2920     current_subject++;
2921     else
2922 ph10 392 while (current_subject < end_subject &&
2923 ph10 389 *current_subject != first_byte)
2924     current_subject++;
2925     }
2926 ph10 392
2927 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2928 ph10 392
2929 ph10 389 else if (startline)
2930     {
/* No skipping is done while still at the initial start position
(start_subject + start_offset). */
2931     if (current_subject > md->start_subject + start_offset)
2932     {
2933 ph10 365 #ifdef SUPPORT_UTF8
2934 ph10 389 if (utf8)
2935 ph10 365 {
2936 ph10 392 while (current_subject < end_subject &&
2937 ph10 389 !WAS_NEWLINE(current_subject))
2938     {
2939 ph10 365 current_subject++;
2940 ph10 389 while(current_subject < end_subject &&
2941     (*current_subject & 0xc0) == 0x80)
2942     current_subject++;
2943     }
2944 ph10 371 }
2945 ph10 389 else
2946     #endif
2947     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2948     current_subject++;
2949 ph10 392
2950 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2951     ANYCRLF, and we are now at a LF, advance the match position by one
2952     more character. */
2953 ph10 392
2954 ph10 391 if (current_subject[-1] == CHAR_CR &&
2955 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2956     current_subject < end_subject &&
2957 ph10 391 *current_subject == CHAR_NL)
2958 ph10 389 current_subject++;
2959 ph10 365 }
2960 nigel 77 }
2961 ph10 392
2962 ph10 389 /* Or to a non-unique first char after study */
2963 ph10 392
2964 ph10 389 else if (start_bits != NULL)
2965 nigel 77 {
/* start_bits is used as a bitmap indexed by byte value (bit c&7 of byte c/8);
bytes whose bit is clear are skipped. */
2966 ph10 389 while (current_subject < end_subject)
2967     {
2968     register unsigned int c = *current_subject;
2969     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2970     else break;
2971     }
2972 nigel 77 }
2973 ph10 392 }
2974 nigel 77
2975     /* Restore fudged end_subject */
2976    
2977     end_subject = save_end_subject;
2978    
2979 ph10 461 /* The following two optimizations are disabled for partial matching or if
2980     disabling is explicitly requested (and of course, by the test above, this
2981 ph10 455 code is not obeyed when restarting after a partial match). */
2982 ph10 461
2983 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2984     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
2985 ph10 461 {
2986 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
2987     is a lower bound; no actual string of that length may actually match the
2988     pattern. Although the value is, strictly, in characters, we treat it as
2989     bytes to avoid spending too much time in this optimization. */
2990 nigel 77
2991 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
2992     end_subject - current_subject < study->minlength)
2993     return PCRE_ERROR_NOMATCH;
2994 ph10 461
2995 ph10 455 /* If req_byte is set, we know that that character must appear in the
2996     subject for the match to succeed. If the first character is set, req_byte
2997     must be later in the subject; otherwise the test starts at the match
2998     point. This optimization can save a huge amount of work in patterns with
2999     nested unlimited repeats that aren't going to match. Writing separate
3000     code for cased/caseless versions makes it go faster, as does using an
3001     autoincrement and backing off on a match.
3002 ph10 461
3003 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3004     can take a long time, and give bad performance on quite ordinary
3005     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3006     string... so we don't do this when the string is sufficiently long. */
3007 ph10 461
3008 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3009 nigel 77 {
3010 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3011 ph10 461
3012 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3013     place we found it at last time. */
3014 ph10 461
3015 ph10 455 if (p > req_byte_ptr)
3016 nigel 77 {
3017 ph10 455 if (req_byte_caseless)
3018     {
3019     while (p < end_subject)
3020     {
3021     register int pp = *p++;
3022     if (pp == req_byte || pp == req_byte2) { p--; break; }
3023     }
3024     }
3025     else
3026     {
3027     while (p < end_subject)
3028     {
3029     if (*p++ == req_byte) { p--; break; }
3030     }
3031     }
3032 ph10 461
3033 ph10 455 /* If we can't find the required character, break the matching loop,
3034     which will cause a return of PCRE_ERROR_NOMATCH. */
3035 ph10 461
3036 ph10 455 if (p >= end_subject) break;
3037 ph10 461
3038 ph10 455 /* If we have found the required character, save the point where we
3039     found it, so that we don't search again next time round the loop if
3040     the start hasn't passed this character yet. */
3041 ph10 461
3042 ph10 455 req_byte_ptr = p;
3043 nigel 77 }
3044 ph10 461 }
3045 nigel 77 }
3046 ph10 455 } /* End of optimizations that are done when not restarting */
3047 nigel 77
3048     /* OK, now we can do the business */
3049    
3050 ph10 435 md->start_used_ptr = current_subject;
3051 ph10 461
3052 nigel 77 rc = internal_dfa_exec(
3053 nigel 91 md, /* fixed match data */
3054     md->start_code, /* this subexpression's code */
3055     current_subject, /* where we currently are */
3056     start_offset, /* start offset in subject */
3057     offsets, /* offset vector */
3058     offsetcount, /* size of same */
3059     workspace, /* workspace vector */
3060     wscount, /* size of same */
3061 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3062 nigel 91 0, /* function recurse level */
3063     0); /* regex recurse level */
3064 nigel 77
3065     /* Anything other than "no match" means we are done, always; otherwise, carry
3066     on only if not anchored. */
3067    
3068     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3069    
3070     /* Advance to the next subject character unless we are at the end of a line
3071     and firstline is set. */
3072    
3073 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3074 nigel 77 current_subject++;
3075     if (utf8)
3076     {
3077     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3078     current_subject++;
3079     }
/* The test is ">" rather than ">=", so one more attempt is made with
current_subject equal to end_subject before the loop gives up. */
3080     if (current_subject > end_subject) break;
3081    
3082 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3083 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3084     or ANY or ANYCRLF, advance the match position by one more character. */
3085 nigel 93
3086 ph10 391 if (current_subject[-1] == CHAR_CR &&
3087 ph10 226 current_subject < end_subject &&
3088 ph10 391 *current_subject == CHAR_NL &&
3089 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3090 ph10 226 (md->nltype == NLTYPE_ANY ||
3091     md->nltype == NLTYPE_ANYCRLF ||
3092     md->nllen == 2))
3093 nigel 93 current_subject++;
3094    
3095     } /* "Bumpalong" loop */
3096    
3097 nigel 77 return PCRE_ERROR_NOMATCH;
3098     }
3099    
3100     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12