/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 462 - (hide annotations) (download)
Sat Oct 17 19:55:02 2009 UTC (3 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 107111 byte(s)
Fix PCRE_PARTIAL_HARD for patterns that end optionally, e.g. abc*

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 383 Copyright (c) 1997-2009 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300
102     #define OP_EXTUNI_EXTRA 320
103     #define OP_ANYNL_EXTRA 340
104     #define OP_HSPACE_EXTRA 360
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
/* This table identifies those opcodes that are followed immediately by a
character that is to be tested in some way. This makes it possible to
centralize the loading of these characters. In the case of Type * etc, the
"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
small value. Non-zero values in the table are the offsets from the opcode where
the character is to be found. ***NOTE*** If the start of this table is
modified, the three tables that follow must also be modified. */
115 nigel 77
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
122     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
124     1, /* Char */
125     1, /* Charnc */
126     1, /* not */
127     /* Positive single-char repeats */
128     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
129     3, 3, 3, /* upto, minupto, exact */
130 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
131 nigel 77 /* Negative single-char repeats - only for chars < 256 */
132     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
133     3, 3, 3, /* NOT upto, minupto, exact */
134 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
135 nigel 77 /* Positive type repeats */
136     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
137     3, 3, 3, /* Type upto, minupto, exact */
138 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
139 nigel 77 /* Character class & ref repeats */
140     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
141     0, 0, /* CRRANGE, CRMINRANGE */
142     0, /* CLASS */
143     0, /* NCLASS */
144     0, /* XCLASS - variable length */
145     0, /* REF */
146     0, /* RECURSE */
147     0, /* CALLOUT */
148     0, /* Alt */
149     0, /* Ket */
150     0, /* KetRmax */
151     0, /* KetRmin */
152     0, /* Assert */
153     0, /* Assert not */
154     0, /* Assert behind */
155     0, /* Assert behind not */
156     0, /* Reverse */
157 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
158     0, 0, 0, /* SBRA, SCBRA, SCOND */
159 nigel 77 0, /* CREF */
160 nigel 93 0, /* RREF */
161     0, /* DEF */
162 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
163     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
164 ph10 462 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
165 nigel 77 };
166    
167 ph10 462 /* This table identifies those opcodes that inspect a character. It is used to
168     remember the fact that a character could have been inspected when the end of
169     the subject is reached, in order to support PCRE_PARTIAL_HARD behaviour.
170     ***NOTE*** If the start of this table is modified, the two tables that follow
171     must also be modified. */
172    
173     static const uschar poptable[] = {
174     0, /* End */
175     0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
176     1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
177     1, 1, 1, /* Any, AllAny, Anybyte */
178     1, 1, 1, /* NOTPROP, PROP, EXTUNI */
179     1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
180     0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
181     1, /* Char */
182     1, /* Charnc */
183     1, /* not */
184     /* Positive single-char repeats */
185     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
186     1, 1, 1, /* upto, minupto, exact */
187     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
188     /* Negative single-char repeats - only for chars < 256 */
189     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
190     1, 1, 1, /* NOT upto, minupto, exact */
191     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
192     /* Positive type repeats */
193     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
194     1, 1, 1, /* Type upto, minupto, exact */
195     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
196     /* Character class & ref repeats */
197     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
198     1, 1, /* CRRANGE, CRMINRANGE */
199     1, /* CLASS */
200     1, /* NCLASS */
201     1, /* XCLASS - variable length */
202     0, /* REF */
203     0, /* RECURSE */
204     0, /* CALLOUT */
205     0, /* Alt */
206     0, /* Ket */
207     0, /* KetRmax */
208     0, /* KetRmin */
209     0, /* Assert */
210     0, /* Assert not */
211     0, /* Assert behind */
212     0, /* Assert behind not */
213     0, /* Reverse */
214     0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
215     0, 0, 0, /* SBRA, SCBRA, SCOND */
216     0, /* CREF */
217     0, /* RREF */
218     0, /* DEF */
219     0, 0, /* BRAZERO, BRAMINZERO */
220     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
221     0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
222     };
223    
224 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
225     and \w */
226    
227 ph10 327 static const uschar toptable1[] = {
228 ph10 168 0, 0, 0, 0, 0, 0,
229 nigel 77 ctype_digit, ctype_digit,
230     ctype_space, ctype_space,
231     ctype_word, ctype_word,
232 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
233 nigel 77 };
234    
235 ph10 327 static const uschar toptable2[] = {
236 ph10 168 0, 0, 0, 0, 0, 0,
237 nigel 77 ctype_digit, 0,
238     ctype_space, 0,
239     ctype_word, 0,
240 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
241 nigel 77 };
242    
243    
244     /* Structure for holding data about a particular state, which is in effect the
245     current data for an active path through the match tree. It must consist
246     entirely of ints because the working vector we are passed, and which we put
247     these structures in, is a vector of ints. */
248    
/* One entry in a state list. Every member is an int because the blocks are
laid out directly in the caller-supplied int workspace vector. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of workspace ints occupied by one stateblock. The value is cast to
int on purpose: the raw sizeof quotient has type size_t, and mixing it with
the (signed) int workspace count would silently switch that arithmetic to
unsigned. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
257    
258    
259     #ifdef DEBUG
260     /*************************************************
261     * Print character string *
262     *************************************************/
263    
264     /* Character string printing function for debugging.
265    
266     Arguments:
267     p points to string
268     length number of bytes
269     f where to print
270    
271     Returns: nothing
272     */
273    
/* Writes the "length" bytes starting at "p" to the stream "f". A byte for
which isprint() is true is written unchanged; any other byte is written as a
backslash, the letter x, and two lower-case hexadecimal digits. A length of
zero or less writes nothing. The buffer is only read, so it is const. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
int c;
while (length-- > 0)
  {
  c = *p++;             /* unsigned char, so always a valid isprint() argument */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
286     #endif
287    
288    
289    
290     /*************************************************
291     * Execute a Regular Expression - DFA engine *
292     *************************************************/
293    
294     /* This internal function applies a compiled pattern to a subject string,
295     starting at a given point, using a DFA engine. This function is called from the
296     external one, possibly multiple times if the pattern is not anchored. The
297     function calls itself recursively for some kinds of subpattern.
298    
299     Arguments:
300     md the match_data block with fixed information
301     this_start_code the opening bracket of this subexpression's code
302     current_subject where we currently are in the subject string
303     start_offset start offset in the subject string
304     offsets vector to contain the matching string offsets
305     offsetcount size of same
306     workspace vector of workspace
307     wscount size of same
308     ims the current ims flags
309     rlevel function call recursion level
310     recursing regex recursive call level
311    
312 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
313 ph10 341 = 0 => offsets overflowed; longest matches are present
314 nigel 77 -1 => failed to match
315     < -1 => some kind of unexpected problem
316    
317     The following macros are used for adding states to the two state vectors (one
318     for the current character, one for the following character). */
319    
/* Appends a state with opcode offset x and repeat count y to the active list.
The macro relies on these names being in scope at the point of use:
active_count, wscount, next_active_state and ims (plus rlevel when debug
output is enabled). The data field of the new entry is not written. If the
list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. Use as a statement, followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
330    
/* Appends a state with opcode offset x, repeat count y and extra data z to
the active list. The macro relies on these names being in scope at the point
of use: active_count, wscount, next_active_state and ims (plus rlevel when
debug output is enabled). If the list already holds wscount entries, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. Use as a statement, followed
by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
342    
/* Appends a state with opcode offset x and repeat count y to the new list
(the one considered after the subject pointer moves on). The macro relies on
these names being in scope at the point of use: new_count, wscount,
next_new_state and ims (plus rlevel when debug output is enabled). The data
field of the new entry is not written. If the list already holds wscount
entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Use as a
statement, followed by a semicolon. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
353    
/* Appends a state with opcode offset x, repeat count y and extra data z to
the new list (the one considered after the subject pointer moves on). The
macro relies on these names being in scope at the point of use: new_count,
wscount, next_new_state and ims (plus rlevel when debug output is enabled). If
the list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. Use as a statement, followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
365    
366     /* And now, here is the code */
367    
368     static int
369     internal_dfa_exec(
370     dfa_match_data *md,
371     const uschar *this_start_code,
372     const uschar *current_subject,
373     int start_offset,
374     int *offsets,
375     int offsetcount,
376     int *workspace,
377     int wscount,
378     int ims,
379     int rlevel,
380     int recursing)
381     {
382     stateblock *active_states, *new_states, *temp_states;
383     stateblock *next_active_state, *next_new_state;
384    
385     const uschar *ctypes, *lcc, *fcc;
386     const uschar *ptr;
387 nigel 93 const uschar *end_code, *first_op;
388 nigel 77
389     int active_count, new_count, match_count;
390    
391     /* Some fields in the md block are frequently referenced, so we load them into
392     independent variables in the hope that this will perform better. */
393    
394     const uschar *start_subject = md->start_subject;
395     const uschar *end_subject = md->end_subject;
396     const uschar *start_code = md->start_code;
397    
398 nigel 87 #ifdef SUPPORT_UTF8
399 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
400 nigel 93 #else
401     BOOL utf8 = FALSE;
402 nigel 87 #endif
403 nigel 77
404     rlevel++;
405     offsetcount &= (-2);
406    
407     wscount -= 2;
408     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
409     (2 * INTS_PER_STATEBLOCK);
410    
411     DPRINTF(("\n%.*s---------------------\n"
412     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
413     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
414    
415     ctypes = md->tables + ctypes_offset;
416     lcc = md->tables + lcc_offset;
417     fcc = md->tables + fcc_offset;
418    
419     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
420    
421     active_states = (stateblock *)(workspace + 2);
422     next_new_state = new_states = active_states + wscount;
423     new_count = 0;
424    
425 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
426     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
427    
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
matching internal ket rather than at the end.

If the first opcode in the first alternative is OP_REVERSE, we are dealing with
a backward assertion. In that case, we have to find out the maximum amount to
move back, and set up each alternative appropriately. */
436    
437 nigel 93 if (*first_op == OP_REVERSE)
438 nigel 77 {
439     int max_back = 0;
440     int gone_back;
441    
442     end_code = this_start_code;
443     do
444     {
445     int back = GET(end_code, 2+LINK_SIZE);
446     if (back > max_back) max_back = back;
447     end_code += GET(end_code, 1);
448     }
449     while (*end_code == OP_ALT);
450    
451     /* If we can't go back the amount required for the longest lookbehind
452     pattern, go back as far as we can; some alternatives may still be viable. */
453    
454     #ifdef SUPPORT_UTF8
455     /* In character mode we have to step back character by character */
456    
457     if (utf8)
458     {
459     for (gone_back = 0; gone_back < max_back; gone_back++)
460     {
461     if (current_subject <= start_subject) break;
462     current_subject--;
463     while (current_subject > start_subject &&
464     (*current_subject & 0xc0) == 0x80)
465     current_subject--;
466     }
467     }
468     else
469     #endif
470    
471     /* In byte-mode we can do this quickly. */
472    
473     {
474     gone_back = (current_subject - max_back < start_subject)?
475     current_subject - start_subject : max_back;
476     current_subject -= gone_back;
477     }
478 ph10 461
479 ph10 435 /* Save the earliest consulted character */
480 nigel 77
481 ph10 461 if (current_subject < md->start_used_ptr)
482     md->start_used_ptr = current_subject;
483    
484 nigel 77 /* Now we can process the individual branches. */
485    
486     end_code = this_start_code;
487     do
488     {
489     int back = GET(end_code, 2+LINK_SIZE);
490     if (back <= gone_back)
491     {
492     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
493     ADD_NEW_DATA(-bstate, 0, gone_back - back);
494     }
495     end_code += GET(end_code, 1);
496     }
497     while (*end_code == OP_ALT);
498     }
499    
500     /* This is the code for a "normal" subpattern (not a backward assertion). The
501     start of a whole pattern is always one of these. If we are at the top level,
502     we may be asked to restart matching from the same point that we reached for a
503     previous partial match. We still have to scan through the top-level branches to
504     find the end state. */
505    
506     else
507     {
508     end_code = this_start_code;
509    
510     /* Restarting */
511    
512     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
513     {
514     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
515     new_count = workspace[1];
516     if (!workspace[0])
517     memcpy(new_states, active_states, new_count * sizeof(stateblock));
518     }
519    
520     /* Not restarting */
521    
522     else
523     {
524 nigel 93 int length = 1 + LINK_SIZE +
525     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
526 nigel 77 do
527     {
528 nigel 93 ADD_NEW(end_code - start_code + length, 0);
529 nigel 77 end_code += GET(end_code, 1);
530 nigel 93 length = 1 + LINK_SIZE;
531 nigel 77 }
532     while (*end_code == OP_ALT);
533     }
534     }
535    
536     workspace[0] = 0; /* Bit indicating which vector is current */
537    
538     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
539    
540     /* Loop for scanning the subject */
541    
542     ptr = current_subject;
543     for (;;)
544     {
545     int i, j;
546 nigel 91 int clen, dlen;
547     unsigned int c, d;
548 ph10 428 int forced_fail = 0;
549 ph10 461 int reached_end = 0;
550 ph10 462 BOOL could_continue = FALSE;
551 nigel 77
552     /* Make the new state list into the active state list and empty the
553     new state list. */
554    
555     temp_states = active_states;
556     active_states = new_states;
557     new_states = temp_states;
558     active_count = new_count;
559     new_count = 0;
560    
561     workspace[0] ^= 1; /* Remember for the restarting feature */
562     workspace[1] = active_count;
563    
564     #ifdef DEBUG
565     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
566     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
567     printf("\"\n");
568    
569     printf("%.*sActive states: ", rlevel*2-2, SP);
570     for (i = 0; i < active_count; i++)
571     printf("%d/%d ", active_states[i].offset, active_states[i].count);
572     printf("\n");
573     #endif
574    
575     /* Set the pointers for adding new states */
576    
577     next_active_state = active_states + active_count;
578     next_new_state = new_states;
579    
580     /* Load the current character from the subject outside the loop, as many
581     different states may want to look at it, and we assume that at least one
582     will. */
583    
584     if (ptr < end_subject)
585     {
586 nigel 93 clen = 1; /* Number of bytes in the character */
587 nigel 77 #ifdef SUPPORT_UTF8
588     if (utf8) { GETCHARLEN(c, ptr, clen); } else
589     #endif /* SUPPORT_UTF8 */
590     c = *ptr;
591     }
592     else
593     {
594 nigel 93 clen = 0; /* This indicates the end of the subject */
595     c = NOTACHAR; /* This value should never actually be used */
596 nigel 77 }
597    
598     /* Scan up the active states and act on each one. The result of an action
599     may be to add more states to the currently active list (e.g. on hitting a
600     parenthesis) or it may be to put states on the new list, for considering
601     when we move the character pointer on. */
602    
603     for (i = 0; i < active_count; i++)
604     {
605     stateblock *current_state = active_states + i;
606     const uschar *code;
607     int state_offset = current_state->offset;
608 ph10 397 int count, codevalue, rrc;
609 nigel 77
610     #ifdef DEBUG
611     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
612 nigel 93 if (clen == 0) printf("EOL\n");
613 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
614     else printf("0x%02x\n", c);
615     #endif
616    
    /* This variable is referred to implicitly in the ADD_xxx macros. */
618    
619     ims = current_state->ims;
620    
621     /* A negative offset is a special case meaning "hold off going to this
622     (negated) state until the number of characters in the data field have
623     been skipped". */
624    
625     if (state_offset < 0)
626     {
627     if (current_state->data > 0)
628     {
629     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
630     ADD_NEW_DATA(state_offset, current_state->count,
631     current_state->data - 1);
632     continue;
633     }
634     else
635     {
636     current_state->offset = state_offset = -state_offset;
637     }
638     }
639    
640 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
641 ph10 439 See the note at the head of this module about the possibility of improving
642     performance here. */
643 nigel 77
644     for (j = 0; j < i; j++)
645     {
646     if (active_states[j].offset == state_offset &&
647     active_states[j].count == current_state->count)
648     {
649     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
650     goto NEXT_ACTIVE_STATE;
651     }
652     }
653    
654     /* The state offset is the offset to the opcode */
655    
656     code = start_code + state_offset;
657     codevalue = *code;
658 ph10 462
659     /* If this opcode inspects a character, but we are at the end of the
660     subject, remember the fact so that we can support PCRE_PARTIAL_HARD. */
661 nigel 77
662 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
663     could_continue = TRUE;
664    
665 nigel 77 /* If this opcode is followed by an inline character, load it. It is
666     tempting to test for the presence of a subject character here, but that
667     is wrong, because sometimes zero repetitions of the subject are
668     permitted.
669    
670     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
671 ph10 178 argument that is not a data character - but is always one byte long. We
672     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
673     this case. To keep the other cases fast, convert these ones to new opcodes.
674     */
675 nigel 77
676     if (coptable[codevalue] > 0)
677     {
678     dlen = 1;
679     #ifdef SUPPORT_UTF8
680     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
681     #endif /* SUPPORT_UTF8 */
682     d = code[coptable[codevalue]];
683     if (codevalue >= OP_TYPESTAR)
684     {
685 nigel 93 switch(d)
686     {
687     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
688     case OP_NOTPROP:
689     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
690     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
691     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
692 ph10 178 case OP_NOT_HSPACE:
693 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
694 ph10 178 case OP_NOT_VSPACE:
695 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
696 nigel 93 default: break;
697     }
698 nigel 77 }
699     }
700     else
701     {
702     dlen = 0; /* Not strictly necessary, but compilers moan */
703 nigel 93 d = NOTACHAR; /* if these variables are not set. */
704 nigel 77 }
705    
706    
707     /* Now process the individual opcodes */
708    
709     switch (codevalue)
710     {
711    
712     /* ========================================================================== */
713     /* Reached a closing bracket. If not at the end of the pattern, carry
714     on with the next opcode. Otherwise, unless we have an empty string and
715 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
716 ph10 442 start of the subject, save the match data, shifting up all previous
717 nigel 77 matches so we always have the longest first. */
718    
719     case OP_KET:
720     case OP_KETRMIN:
721     case OP_KETRMAX:
722     if (code != end_code)
723     {
724     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
725     if (codevalue != OP_KET)
726     {
727     ADD_ACTIVE(state_offset - GET(code, 1), 0);
728     }
729     }
730 ph10 461 else
731 nigel 77 {
732 ph10 461 reached_end++; /* Count branches that reach the end */
733     if (ptr > current_subject ||
734 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
735     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
736     current_subject > start_subject + md->start_offset)))
737 nigel 77 {
738 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
739     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
740     match_count = 0;
741     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
742     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
743     if (offsetcount >= 2)
744     {
745     offsets[0] = current_subject - start_subject;
746     offsets[1] = ptr - start_subject;
747     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
748     offsets[1] - offsets[0], current_subject));
749     }
750     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
751     {
752     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
753     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
754     match_count, rlevel*2-2, SP));
755     return match_count;
756     }
757 ph10 461 }
758 nigel 77 }
759     break;
760    
761     /* ========================================================================== */
762     /* These opcodes add to the current list of states without looking
763     at the current character. */
764    
765     /*-----------------------------------------------------------------*/
766     case OP_ALT:
767     do { code += GET(code, 1); } while (*code == OP_ALT);
768     ADD_ACTIVE(code - start_code, 0);
769     break;
770    
771     /*-----------------------------------------------------------------*/
772     case OP_BRA:
773 nigel 93 case OP_SBRA:
774 nigel 77 do
775     {
776     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
777     code += GET(code, 1);
778     }
779     while (*code == OP_ALT);
780     break;
781    
782     /*-----------------------------------------------------------------*/
783 nigel 93 case OP_CBRA:
784     case OP_SCBRA:
785     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
786     code += GET(code, 1);
787     while (*code == OP_ALT)
788     {
789     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
790     code += GET(code, 1);
791     }
792     break;
793    
794     /*-----------------------------------------------------------------*/
795 nigel 77 case OP_BRAZERO:
796     case OP_BRAMINZERO:
797     ADD_ACTIVE(state_offset + 1, 0);
798     code += 1 + GET(code, 2);
799     while (*code == OP_ALT) code += GET(code, 1);
800     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
801     break;
802    
803     /*-----------------------------------------------------------------*/
804 ph10 335 case OP_SKIPZERO:
805     code += 1 + GET(code, 2);
806     while (*code == OP_ALT) code += GET(code, 1);
807     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
808     break;
809    
810     /*-----------------------------------------------------------------*/
811 nigel 77 case OP_CIRC:
812     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
813 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
814     ptr != end_subject &&
815 nigel 93 WAS_NEWLINE(ptr)))
816 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
817     break;
818    
819     /*-----------------------------------------------------------------*/
820     case OP_EOD:
821     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
822     break;
823    
824     /*-----------------------------------------------------------------*/
825     case OP_OPT:
826     ims = code[1];
827     ADD_ACTIVE(state_offset + 2, 0);
828     break;
829    
830     /*-----------------------------------------------------------------*/
831     case OP_SOD:
832     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
833     break;
834    
835     /*-----------------------------------------------------------------*/
836     case OP_SOM:
837     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
838     break;
839    
840    
841     /* ========================================================================== */
842     /* These opcodes inspect the next subject character, and sometimes
843     the previous one as well, but do not have an argument. The variable
844     clen contains the length of the current character and is zero if we are
845     at the end of the subject. */
846    
847     /*-----------------------------------------------------------------*/
848     case OP_ANY:
849 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
850 nigel 77 { ADD_NEW(state_offset + 1, 0); }
851     break;
852    
853     /*-----------------------------------------------------------------*/
854 ph10 341 case OP_ALLANY:
855     if (clen > 0)
856     { ADD_NEW(state_offset + 1, 0); }
857     break;
858    
859     /*-----------------------------------------------------------------*/
860 nigel 77 case OP_EODN:
861 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
862 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
863     break;
864    
865     /*-----------------------------------------------------------------*/
866     case OP_DOLL:
867     if ((md->moptions & PCRE_NOTEOL) == 0)
868     {
869 nigel 91 if (clen == 0 ||
870 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
871 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
872     ))
873 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
874     }
875 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
876 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
877     break;
878    
879     /*-----------------------------------------------------------------*/
880    
881     case OP_DIGIT:
882     case OP_WHITESPACE:
883     case OP_WORDCHAR:
884     if (clen > 0 && c < 256 &&
885     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
886     { ADD_NEW(state_offset + 1, 0); }
887     break;
888    
889     /*-----------------------------------------------------------------*/
890     case OP_NOT_DIGIT:
891     case OP_NOT_WHITESPACE:
892     case OP_NOT_WORDCHAR:
893     if (clen > 0 && (c >= 256 ||
894     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
895     { ADD_NEW(state_offset + 1, 0); }
896     break;
897    
898     /*-----------------------------------------------------------------*/
899     case OP_WORD_BOUNDARY:
900     case OP_NOT_WORD_BOUNDARY:
901     {
902     int left_word, right_word;
903    
904     if (ptr > start_subject)
905     {
906     const uschar *temp = ptr - 1;
907 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
908 nigel 77 #ifdef SUPPORT_UTF8
909     if (utf8) BACKCHAR(temp);
910     #endif
911     GETCHARTEST(d, temp);
912     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
913     }
914     else left_word = 0;
915    
916 ph10 461 if (clen > 0)
917 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
918     else /* This is a fudge to ensure that if this is the */
919     { /* last item in the pattern, we don't count it as */
920     reached_end--; /* reached, thus disabling a partial match. */
921     right_word = 0;
922 ph10 461 }
923 nigel 77
924     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
925     { ADD_ACTIVE(state_offset + 1, 0); }
926     }
927     break;
928    
929    
930     /*-----------------------------------------------------------------*/
931     /* Check the next character by Unicode property. We will get here only
932     if the support is in the binary; otherwise a compile-time error occurs.
933     */
934    
935 ph10 151 #ifdef SUPPORT_UCP
936 nigel 77 case OP_PROP:
937     case OP_NOTPROP:
938     if (clen > 0)
939     {
940 nigel 87 BOOL OK;
941 ph10 349 const ucd_record * prop = GET_UCD(c);
942 nigel 87 switch(code[1])
943 nigel 77 {
944 nigel 87 case PT_ANY:
945     OK = TRUE;
946     break;
947    
948     case PT_LAMP:
949 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
950 nigel 87 break;
951    
952     case PT_GC:
953 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
954 nigel 87 break;
955    
956     case PT_PC:
957 ph10 349 OK = prop->chartype == code[2];
958 nigel 87 break;
959    
960     case PT_SC:
961 ph10 349 OK = prop->script == code[2];
962 nigel 87 break;
963    
964     /* Should never occur, but keep compilers from grumbling. */
965    
966     default:
967     OK = codevalue != OP_PROP;
968     break;
969 nigel 77 }
970 nigel 87
971     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
972 nigel 77 }
973     break;
974     #endif
975    
976    
977    
978     /* ========================================================================== */
979     /* These opcodes likewise inspect the subject character, but have an
980     argument that is not a data character. It is one of these opcodes:
981 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
982     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
983 nigel 77
984     case OP_TYPEPLUS:
985     case OP_TYPEMINPLUS:
986 nigel 93 case OP_TYPEPOSPLUS:
987 nigel 77 count = current_state->count; /* Already matched */
988     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
989     if (clen > 0)
990     {
991     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
992     (c < 256 &&
993 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
994 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
995     {
996 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
997     {
998     active_count--; /* Remove non-match possibility */
999     next_active_state--;
1000     }
1001 nigel 77 count++;
1002     ADD_NEW(state_offset, count);
1003     }
1004     }
1005     break;
1006    
1007     /*-----------------------------------------------------------------*/
1008     case OP_TYPEQUERY:
1009     case OP_TYPEMINQUERY:
1010 nigel 93 case OP_TYPEPOSQUERY:
1011 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1012     if (clen > 0)
1013     {
1014     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1015     (c < 256 &&
1016 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1017 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1018     {
1019 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1020     {
1021     active_count--; /* Remove non-match possibility */
1022     next_active_state--;
1023     }
1024 nigel 77 ADD_NEW(state_offset + 2, 0);
1025     }
1026     }
1027     break;
1028    
1029     /*-----------------------------------------------------------------*/
1030     case OP_TYPESTAR:
1031     case OP_TYPEMINSTAR:
1032 nigel 93 case OP_TYPEPOSSTAR:
1033 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1034     if (clen > 0)
1035     {
1036     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1037     (c < 256 &&
1038 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1039 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1040     {
1041 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1042     {
1043     active_count--; /* Remove non-match possibility */
1044     next_active_state--;
1045     }
1046 nigel 77 ADD_NEW(state_offset, 0);
1047     }
1048     }
1049     break;
1050    
1051     /*-----------------------------------------------------------------*/
1052     case OP_TYPEEXACT:
1053 nigel 93 count = current_state->count; /* Number already matched */
1054     if (clen > 0)
1055     {
1056     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1057     (c < 256 &&
1058 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1059 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1060     {
1061     if (++count >= GET2(code, 1))
1062     { ADD_NEW(state_offset + 4, 0); }
1063     else
1064     { ADD_NEW(state_offset, count); }
1065     }
1066     }
1067     break;
1068    
1069     /*-----------------------------------------------------------------*/
1070 nigel 77 case OP_TYPEUPTO:
1071     case OP_TYPEMINUPTO:
1072 nigel 93 case OP_TYPEPOSUPTO:
1073     ADD_ACTIVE(state_offset + 4, 0);
1074 nigel 77 count = current_state->count; /* Number already matched */
1075     if (clen > 0)
1076     {
1077     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1078     (c < 256 &&
1079 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1080 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1081     {
1082 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1083     {
1084     active_count--; /* Remove non-match possibility */
1085     next_active_state--;
1086     }
1087 nigel 77 if (++count >= GET2(code, 1))
1088     { ADD_NEW(state_offset + 4, 0); }
1089     else
1090     { ADD_NEW(state_offset, count); }
1091     }
1092     }
1093     break;
1094    
1095     /* ========================================================================== */
1096     /* These are virtual opcodes that are used when something like
1097 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1098     argument. It keeps the code above fast for the other cases. The argument
1099     is in the d variable. */
1100 nigel 77
1101 ph10 151 #ifdef SUPPORT_UCP
1102 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1103     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1104 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1105 nigel 77 count = current_state->count; /* Already matched */
1106 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1107 nigel 77 if (clen > 0)
1108     {
1109 nigel 87 BOOL OK;
1110 ph10 349 const ucd_record * prop = GET_UCD(c);
1111 nigel 87 switch(code[2])
1112     {
1113     case PT_ANY:
1114     OK = TRUE;
1115     break;
1116    
1117     case PT_LAMP:
1118 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1119 nigel 87 break;
1120    
1121     case PT_GC:
1122 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1123 nigel 87 break;
1124    
1125     case PT_PC:
1126 ph10 349 OK = prop->chartype == code[3];
1127 nigel 87 break;
1128    
1129     case PT_SC:
1130 ph10 349 OK = prop->script == code[3];
1131 nigel 87 break;
1132    
1133     /* Should never occur, but keep compilers from grumbling. */
1134    
1135     default:
1136     OK = codevalue != OP_PROP;
1137     break;
1138     }
1139    
1140 nigel 93 if (OK == (d == OP_PROP))
1141     {
1142     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1143     {
1144     active_count--; /* Remove non-match possibility */
1145     next_active_state--;
1146     }
1147     count++;
1148     ADD_NEW(state_offset, count);
1149     }
1150 nigel 77 }
1151     break;
1152    
1153     /*-----------------------------------------------------------------*/
1154     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1155     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1156 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1157 nigel 77 count = current_state->count; /* Already matched */
1158     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1159 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1160 nigel 77 {
1161     const uschar *nptr = ptr + clen;
1162     int ncount = 0;
1163 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1164     {
1165     active_count--; /* Remove non-match possibility */
1166     next_active_state--;
1167     }
1168 nigel 77 while (nptr < end_subject)
1169     {
1170     int nd;
1171     int ndlen = 1;
1172     GETCHARLEN(nd, nptr, ndlen);
1173 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1174 nigel 77 ncount++;
1175     nptr += ndlen;
1176     }
1177     count++;
1178     ADD_NEW_DATA(-state_offset, count, ncount);
1179     }
1180     break;
1181 ph10 151 #endif
1182 nigel 77
1183     /*-----------------------------------------------------------------*/
1184 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1185     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1186     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1187     count = current_state->count; /* Already matched */
1188     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1189     if (clen > 0)
1190     {
1191     int ncount = 0;
1192     switch (c)
1193     {
1194     case 0x000b:
1195     case 0x000c:
1196     case 0x0085:
1197     case 0x2028:
1198     case 0x2029:
1199 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1200     goto ANYNL01;
1201    
1202     case 0x000d:
1203     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1204     /* Fall through */
1205    
1206     ANYNL01:
1207     case 0x000a:
1208 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1209     {
1210     active_count--; /* Remove non-match possibility */
1211     next_active_state--;
1212     }
1213     count++;
1214     ADD_NEW_DATA(-state_offset, count, ncount);
1215     break;
1216 ph10 231
1217 nigel 93 default:
1218     break;
1219     }
1220     }
1221     break;
1222    
1223     /*-----------------------------------------------------------------*/
1224 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1225     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1226     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1227     count = current_state->count; /* Already matched */
1228     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1229     if (clen > 0)
1230     {
1231 ph10 182 BOOL OK;
1232 ph10 178 switch (c)
1233     {
1234     case 0x000a:
1235     case 0x000b:
1236     case 0x000c:
1237     case 0x000d:
1238     case 0x0085:
1239     case 0x2028:
1240     case 0x2029:
1241     OK = TRUE;
1242 ph10 182 break;
1243 ph10 178
1244     default:
1245     OK = FALSE;
1246 ph10 182 break;
1247 ph10 178 }
1248    
1249     if (OK == (d == OP_VSPACE))
1250 ph10 182 {
1251 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1252     {
1253     active_count--; /* Remove non-match possibility */
1254     next_active_state--;
1255     }
1256     count++;
1257     ADD_NEW_DATA(-state_offset, count, 0);
1258     }
1259     }
1260     break;
1261    
1262     /*-----------------------------------------------------------------*/
1263     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1264     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1265     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1266     count = current_state->count; /* Already matched */
1267     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1268     if (clen > 0)
1269     {
1270 ph10 182 BOOL OK;
1271 ph10 178 switch (c)
1272     {
1273     case 0x09: /* HT */
1274     case 0x20: /* SPACE */
1275     case 0xa0: /* NBSP */
1276     case 0x1680: /* OGHAM SPACE MARK */
1277     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1278     case 0x2000: /* EN QUAD */
1279     case 0x2001: /* EM QUAD */
1280     case 0x2002: /* EN SPACE */
1281     case 0x2003: /* EM SPACE */
1282     case 0x2004: /* THREE-PER-EM SPACE */
1283     case 0x2005: /* FOUR-PER-EM SPACE */
1284     case 0x2006: /* SIX-PER-EM SPACE */
1285     case 0x2007: /* FIGURE SPACE */
1286     case 0x2008: /* PUNCTUATION SPACE */
1287     case 0x2009: /* THIN SPACE */
1288     case 0x200A: /* HAIR SPACE */
1289     case 0x202f: /* NARROW NO-BREAK SPACE */
1290     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1291     case 0x3000: /* IDEOGRAPHIC SPACE */
1292     OK = TRUE;
1293     break;
1294 ph10 182
1295 ph10 178 default:
1296     OK = FALSE;
1297     break;
1298     }
1299 ph10 182
1300 ph10 178 if (OK == (d == OP_HSPACE))
1301 ph10 182 {
1302 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1303     {
1304     active_count--; /* Remove non-match possibility */
1305     next_active_state--;
1306     }
1307     count++;
1308     ADD_NEW_DATA(-state_offset, count, 0);
1309     }
1310     }
1311     break;
1312    
1313     /*-----------------------------------------------------------------*/
1314 ph10 151 #ifdef SUPPORT_UCP
1315 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1316     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1317 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1318 nigel 87 count = 4;
1319 nigel 77 goto QS1;
1320    
1321     case OP_PROP_EXTRA + OP_TYPESTAR:
1322     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1323 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1324 nigel 77 count = 0;
1325    
1326     QS1:
1327    
1328 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1329 nigel 77 if (clen > 0)
1330     {
1331 nigel 87 BOOL OK;
1332 ph10 349 const ucd_record * prop = GET_UCD(c);
1333 nigel 87 switch(code[2])
1334     {
1335     case PT_ANY:
1336     OK = TRUE;
1337     break;
1338    
1339     case PT_LAMP:
1340 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1341 nigel 87 break;
1342    
1343     case PT_GC:
1344 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1345 nigel 87 break;
1346    
1347     case PT_PC:
1348 ph10 349 OK = prop->chartype == code[3];
1349 nigel 87 break;
1350    
1351     case PT_SC:
1352 ph10 349 OK = prop->script == code[3];
1353 nigel 87 break;
1354    
1355     /* Should never occur, but keep compilers from grumbling. */
1356    
1357     default:
1358     OK = codevalue != OP_PROP;
1359     break;
1360     }
1361    
1362 nigel 93 if (OK == (d == OP_PROP))
1363     {
1364     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1365     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1366     {
1367     active_count--; /* Remove non-match possibility */
1368     next_active_state--;
1369     }
1370     ADD_NEW(state_offset + count, 0);
1371     }
1372 nigel 77 }
1373     break;
1374    
1375     /*-----------------------------------------------------------------*/
1376     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1377     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1378 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1379 nigel 77 count = 2;
1380     goto QS2;
1381    
1382     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1383     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1384 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1385 nigel 77 count = 0;
1386    
1387     QS2:
1388    
1389     ADD_ACTIVE(state_offset + 2, 0);
1390 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1391 nigel 77 {
1392     const uschar *nptr = ptr + clen;
1393     int ncount = 0;
1394 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1395     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1396     {
1397     active_count--; /* Remove non-match possibility */
1398     next_active_state--;
1399     }
1400 nigel 77 while (nptr < end_subject)
1401     {
1402     int nd;
1403     int ndlen = 1;
1404     GETCHARLEN(nd, nptr, ndlen);
1405 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1406 nigel 77 ncount++;
1407     nptr += ndlen;
1408     }
1409     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1410     }
1411     break;
1412 ph10 151 #endif
1413 nigel 77
1414     /*-----------------------------------------------------------------*/
1415 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1416     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1417     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1418     count = 2;
1419     goto QS3;
1420    
1421     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1422     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1423     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1424     count = 0;
1425    
1426     QS3:
1427     ADD_ACTIVE(state_offset + 2, 0);
1428     if (clen > 0)
1429     {
1430     int ncount = 0;
1431     switch (c)
1432     {
1433     case 0x000b:
1434     case 0x000c:
1435     case 0x0085:
1436     case 0x2028:
1437     case 0x2029:
1438 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1439     goto ANYNL02;
1440    
1441     case 0x000d:
1442     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1443     /* Fall through */
1444    
1445     ANYNL02:
1446     case 0x000a:
1447 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1448     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1449     {
1450     active_count--; /* Remove non-match possibility */
1451     next_active_state--;
1452     }
1453     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1454     break;
1455 ph10 231
1456 nigel 93 default:
1457     break;
1458     }
1459     }
1460     break;
1461    
1462     /*-----------------------------------------------------------------*/
1463 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1464     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1465     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1466     count = 2;
1467     goto QS4;
1468    
1469     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1470     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1471     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1472     count = 0;
1473    
1474     QS4:
1475     ADD_ACTIVE(state_offset + 2, 0);
1476     if (clen > 0)
1477     {
1478 ph10 182 BOOL OK;
1479 ph10 178 switch (c)
1480     {
1481     case 0x000a:
1482     case 0x000b:
1483     case 0x000c:
1484     case 0x000d:
1485     case 0x0085:
1486     case 0x2028:
1487     case 0x2029:
1488     OK = TRUE;
1489     break;
1490 ph10 182
1491 ph10 178 default:
1492     OK = FALSE;
1493     break;
1494     }
1495     if (OK == (d == OP_VSPACE))
1496 ph10 182 {
1497 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1498     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1499     {
1500     active_count--; /* Remove non-match possibility */
1501     next_active_state--;
1502     }
1503     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1504     }
1505     }
1506     break;
1507    
1508     /*-----------------------------------------------------------------*/
1509     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1510     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1511     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1512     count = 2;
1513     goto QS5;
1514    
1515     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1516     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1517     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1518     count = 0;
1519    
1520     QS5:
1521     ADD_ACTIVE(state_offset + 2, 0);
1522     if (clen > 0)
1523     {
1524 ph10 182 BOOL OK;
1525 ph10 178 switch (c)
1526     {
1527     case 0x09: /* HT */
1528     case 0x20: /* SPACE */
1529     case 0xa0: /* NBSP */
1530     case 0x1680: /* OGHAM SPACE MARK */
1531     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1532     case 0x2000: /* EN QUAD */
1533     case 0x2001: /* EM QUAD */
1534     case 0x2002: /* EN SPACE */
1535     case 0x2003: /* EM SPACE */
1536     case 0x2004: /* THREE-PER-EM SPACE */
1537     case 0x2005: /* FOUR-PER-EM SPACE */
1538     case 0x2006: /* SIX-PER-EM SPACE */
1539     case 0x2007: /* FIGURE SPACE */
1540     case 0x2008: /* PUNCTUATION SPACE */
1541     case 0x2009: /* THIN SPACE */
1542     case 0x200A: /* HAIR SPACE */
1543     case 0x202f: /* NARROW NO-BREAK SPACE */
1544     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1545     case 0x3000: /* IDEOGRAPHIC SPACE */
1546     OK = TRUE;
1547     break;
1548 ph10 182
1549 ph10 178 default:
1550     OK = FALSE;
1551     break;
1552     }
1553 ph10 182
1554 ph10 178 if (OK == (d == OP_HSPACE))
1555 ph10 182 {
1556 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1557     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1558     {
1559     active_count--; /* Remove non-match possibility */
1560     next_active_state--;
1561     }
1562     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1563     }
1564     }
1565     break;
1566    
1567     /*-----------------------------------------------------------------*/
1568 ph10 151 #ifdef SUPPORT_UCP
1569 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1570     case OP_PROP_EXTRA + OP_TYPEUPTO:
1571     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1572 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1573 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1574 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1575 nigel 77 count = current_state->count; /* Number already matched */
1576     if (clen > 0)
1577     {
1578 nigel 87 BOOL OK;
1579 ph10 349 const ucd_record * prop = GET_UCD(c);
1580 nigel 87 switch(code[4])
1581 nigel 77 {
1582 nigel 87 case PT_ANY:
1583     OK = TRUE;
1584     break;
1585    
1586     case PT_LAMP:
1587 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1588 nigel 87 break;
1589    
1590     case PT_GC:
1591 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1592 nigel 87 break;
1593    
1594     case PT_PC:
1595 ph10 349 OK = prop->chartype == code[5];
1596 nigel 87 break;
1597    
1598     case PT_SC:
1599 ph10 349 OK = prop->script == code[5];
1600 nigel 87 break;
1601    
1602     /* Should never occur, but keep compilers from grumbling. */
1603    
1604     default:
1605     OK = codevalue != OP_PROP;
1606     break;
1607     }
1608    
1609     if (OK == (d == OP_PROP))
1610     {
1611 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1612     {
1613     active_count--; /* Remove non-match possibility */
1614     next_active_state--;
1615     }
1616 nigel 77 if (++count >= GET2(code, 1))
1617 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1618 nigel 77 else
1619     { ADD_NEW(state_offset, count); }
1620     }
1621     }
1622     break;
1623    
1624     /*-----------------------------------------------------------------*/
1625     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1626     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1627     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1628 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1629 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1630     { ADD_ACTIVE(state_offset + 4, 0); }
1631     count = current_state->count; /* Number already matched */
1632 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1633 nigel 77 {
1634     const uschar *nptr = ptr + clen;
1635     int ncount = 0;
1636 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1637     {
1638     active_count--; /* Remove non-match possibility */
1639     next_active_state--;
1640     }
1641 nigel 77 while (nptr < end_subject)
1642     {
1643     int nd;
1644     int ndlen = 1;
1645     GETCHARLEN(nd, nptr, ndlen);
1646 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1647 nigel 77 ncount++;
1648     nptr += ndlen;
1649     }
1650     if (++count >= GET2(code, 1))
1651     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1652     else
1653     { ADD_NEW_DATA(-state_offset, count, ncount); }
1654     }
1655     break;
1656 ph10 151 #endif
1657 nigel 77
1658 nigel 93 /*-----------------------------------------------------------------*/
1659     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1660     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1661     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1662     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1663     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1664     { ADD_ACTIVE(state_offset + 4, 0); }
1665     count = current_state->count; /* Number already matched */
1666     if (clen > 0)
1667     {
1668     int ncount = 0;
1669     switch (c)
1670     {
1671     case 0x000b:
1672     case 0x000c:
1673     case 0x0085:
1674     case 0x2028:
1675     case 0x2029:
1676 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1677     goto ANYNL03;
1678    
1679     case 0x000d:
1680     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1681     /* Fall through */
1682    
1683     ANYNL03:
1684     case 0x000a:
1685 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1686     {
1687     active_count--; /* Remove non-match possibility */
1688     next_active_state--;
1689     }
1690     if (++count >= GET2(code, 1))
1691     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1692     else
1693     { ADD_NEW_DATA(-state_offset, count, ncount); }
1694     break;
1695 ph10 231
1696 nigel 93 default:
1697     break;
1698     }
1699     }
1700     break;
1701    
1702 ph10 178 /*-----------------------------------------------------------------*/
1703     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1704     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1705     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1706     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1707     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1708     { ADD_ACTIVE(state_offset + 4, 0); }
1709     count = current_state->count; /* Number already matched */
1710     if (clen > 0)
1711     {
1712 ph10 182 BOOL OK;
1713 ph10 178 switch (c)
1714     {
1715     case 0x000a:
1716     case 0x000b:
1717     case 0x000c:
1718     case 0x000d:
1719     case 0x0085:
1720     case 0x2028:
1721     case 0x2029:
1722     OK = TRUE;
1723     break;
1724 ph10 182
1725 ph10 178 default:
1726     OK = FALSE;
1727     }
1728 ph10 182
1729 ph10 178 if (OK == (d == OP_VSPACE))
1730 ph10 182 {
1731 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1732     {
1733     active_count--; /* Remove non-match possibility */
1734     next_active_state--;
1735     }
1736     if (++count >= GET2(code, 1))
1737     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1738     else
1739     { ADD_NEW_DATA(-state_offset, count, 0); }
1740     }
1741     }
1742     break;
1743    
1744     /*-----------------------------------------------------------------*/
1745     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1746     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1747     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1748     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1749     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1750     { ADD_ACTIVE(state_offset + 4, 0); }
1751     count = current_state->count; /* Number already matched */
1752     if (clen > 0)
1753     {
1754 ph10 182 BOOL OK;
1755 ph10 178 switch (c)
1756     {
1757     case 0x09: /* HT */
1758     case 0x20: /* SPACE */
1759     case 0xa0: /* NBSP */
1760     case 0x1680: /* OGHAM SPACE MARK */
1761     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1762     case 0x2000: /* EN QUAD */
1763     case 0x2001: /* EM QUAD */
1764     case 0x2002: /* EN SPACE */
1765     case 0x2003: /* EM SPACE */
1766     case 0x2004: /* THREE-PER-EM SPACE */
1767     case 0x2005: /* FOUR-PER-EM SPACE */
1768     case 0x2006: /* SIX-PER-EM SPACE */
1769     case 0x2007: /* FIGURE SPACE */
1770     case 0x2008: /* PUNCTUATION SPACE */
1771     case 0x2009: /* THIN SPACE */
1772     case 0x200A: /* HAIR SPACE */
1773     case 0x202f: /* NARROW NO-BREAK SPACE */
1774     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1775     case 0x3000: /* IDEOGRAPHIC SPACE */
1776     OK = TRUE;
1777     break;
1778 ph10 182
1779 ph10 178 default:
1780     OK = FALSE;
1781     break;
1782     }
1783 ph10 182
1784 ph10 178 if (OK == (d == OP_HSPACE))
1785 ph10 182 {
1786 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1787     {
1788     active_count--; /* Remove non-match possibility */
1789     next_active_state--;
1790     }
1791     if (++count >= GET2(code, 1))
1792     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1793     else
1794     { ADD_NEW_DATA(-state_offset, count, 0); }
1795     }
1796     }
1797     break;
1798    
1799 nigel 77 /* ========================================================================== */
1800     /* These opcodes are followed by a character that is usually compared
1801     to the current subject character; it is loaded into d. We still get
1802     here even if there is no subject character, because in some cases zero
1803     repetitions are permitted. */
1804    
1805     /*-----------------------------------------------------------------*/
1806     case OP_CHAR:
1807     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1808     break;
1809    
1810     /*-----------------------------------------------------------------*/
1811     case OP_CHARNC:
1812     if (clen == 0) break;
1813    
1814     #ifdef SUPPORT_UTF8
1815     if (utf8)
1816     {
1817     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1818     {
1819 nigel 93 unsigned int othercase;
1820 nigel 77 if (c < 128) othercase = fcc[c]; else
1821    
1822     /* If we have Unicode property support, we can use it to test the
1823 nigel 87 other case of the character. */
1824 nigel 77
1825     #ifdef SUPPORT_UCP
1826 ph10 349 othercase = UCD_OTHERCASE(c);
1827 nigel 87 #else
1828 nigel 93 othercase = NOTACHAR;
1829 nigel 77 #endif
1830    
1831     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1832     }
1833     }
1834     else
1835     #endif /* SUPPORT_UTF8 */
1836    
1837     /* Non-UTF-8 mode */
1838     {
1839     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1840     }
1841     break;
1842    
1843    
1844     #ifdef SUPPORT_UCP
1845     /*-----------------------------------------------------------------*/
1846     /* This is a tricky one because it can match more than one character.
1847     Find out how many characters to skip, and then set up a negative state
1848     to wait for them to pass before continuing. */
1849    
1850     case OP_EXTUNI:
1851 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1852 nigel 77 {
1853     const uschar *nptr = ptr + clen;
1854     int ncount = 0;
1855     while (nptr < end_subject)
1856     {
1857     int nclen = 1;
1858     GETCHARLEN(c, nptr, nclen);
1859 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1860 nigel 77 ncount++;
1861     nptr += nclen;
1862     }
1863     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1864     }
1865     break;
1866     #endif
1867    
1868     /*-----------------------------------------------------------------*/
1869 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
1870     character (when CR is followed by LF). In this case, set up a negative
1871     state to wait for one character to pass before continuing. */
1872    
1873     case OP_ANYNL:
1874     if (clen > 0) switch(c)
1875     {
1876     case 0x000b:
1877     case 0x000c:
1878     case 0x0085:
1879     case 0x2028:
1880     case 0x2029:
1881 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1882    
1883     case 0x000a:
1884 nigel 93 ADD_NEW(state_offset + 1, 0);
1885     break;
1886 ph10 231
1887 nigel 93 case 0x000d:
1888     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1889     {
1890     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1891     }
1892     else
1893     {
1894     ADD_NEW(state_offset + 1, 0);
1895     }
1896     break;
1897     }
1898     break;
1899    
1900     /*-----------------------------------------------------------------*/
1901 ph10 178 case OP_NOT_VSPACE:
1902     if (clen > 0) switch(c)
1903     {
1904     case 0x000a:
1905     case 0x000b:
1906     case 0x000c:
1907     case 0x000d:
1908     case 0x0085:
1909     case 0x2028:
1910     case 0x2029:
1911     break;
1912 ph10 182
1913     default:
1914 ph10 178 ADD_NEW(state_offset + 1, 0);
1915     break;
1916     }
1917     break;
1918    
1919     /*-----------------------------------------------------------------*/
1920     case OP_VSPACE:
1921     if (clen > 0) switch(c)
1922     {
1923     case 0x000a:
1924     case 0x000b:
1925     case 0x000c:
1926     case 0x000d:
1927     case 0x0085:
1928     case 0x2028:
1929     case 0x2029:
1930     ADD_NEW(state_offset + 1, 0);
1931     break;
1932 ph10 182
1933 ph10 178 default: break;
1934     }
1935     break;
1936    
1937     /*-----------------------------------------------------------------*/
1938     case OP_NOT_HSPACE:
1939     if (clen > 0) switch(c)
1940     {
1941     case 0x09: /* HT */
1942     case 0x20: /* SPACE */
1943     case 0xa0: /* NBSP */
1944     case 0x1680: /* OGHAM SPACE MARK */
1945     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1946     case 0x2000: /* EN QUAD */
1947     case 0x2001: /* EM QUAD */
1948     case 0x2002: /* EN SPACE */
1949     case 0x2003: /* EM SPACE */
1950     case 0x2004: /* THREE-PER-EM SPACE */
1951     case 0x2005: /* FOUR-PER-EM SPACE */
1952     case 0x2006: /* SIX-PER-EM SPACE */
1953     case 0x2007: /* FIGURE SPACE */
1954     case 0x2008: /* PUNCTUATION SPACE */
1955     case 0x2009: /* THIN SPACE */
1956     case 0x200A: /* HAIR SPACE */
1957     case 0x202f: /* NARROW NO-BREAK SPACE */
1958     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1959     case 0x3000: /* IDEOGRAPHIC SPACE */
1960     break;
1961 ph10 182
1962     default:
1963 ph10 178 ADD_NEW(state_offset + 1, 0);
1964     break;
1965     }
1966     break;
1967    
1968     /*-----------------------------------------------------------------*/
1969     case OP_HSPACE:
1970     if (clen > 0) switch(c)
1971     {
1972     case 0x09: /* HT */
1973     case 0x20: /* SPACE */
1974     case 0xa0: /* NBSP */
1975     case 0x1680: /* OGHAM SPACE MARK */
1976     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1977     case 0x2000: /* EN QUAD */
1978     case 0x2001: /* EM QUAD */
1979     case 0x2002: /* EN SPACE */
1980     case 0x2003: /* EM SPACE */
1981     case 0x2004: /* THREE-PER-EM SPACE */
1982     case 0x2005: /* FOUR-PER-EM SPACE */
1983     case 0x2006: /* SIX-PER-EM SPACE */
1984     case 0x2007: /* FIGURE SPACE */
1985     case 0x2008: /* PUNCTUATION SPACE */
1986     case 0x2009: /* THIN SPACE */
1987     case 0x200A: /* HAIR SPACE */
1988     case 0x202f: /* NARROW NO-BREAK SPACE */
1989     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1990     case 0x3000: /* IDEOGRAPHIC SPACE */
1991     ADD_NEW(state_offset + 1, 0);
1992     break;
1993     }
1994     break;
1995    
1996     /*-----------------------------------------------------------------*/
1997 nigel 77 /* Match a negated single character. This is only used for one-byte
1998     characters, that is, we know that d < 256. The character we are
1999     checking (c) can be multibyte. */
2000    
2001     case OP_NOT:
2002     if (clen > 0)
2003     {
2004 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2005 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2006     }
2007     break;
2008    
2009     /*-----------------------------------------------------------------*/
2010     case OP_PLUS:
2011     case OP_MINPLUS:
2012 nigel 93 case OP_POSPLUS:
2013 nigel 77 case OP_NOTPLUS:
2014     case OP_NOTMINPLUS:
2015 nigel 93 case OP_NOTPOSPLUS:
2016 nigel 77 count = current_state->count; /* Already matched */
2017     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2018     if (clen > 0)
2019     {
2020 nigel 93 unsigned int otherd = NOTACHAR;
2021 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2022     {
2023     #ifdef SUPPORT_UTF8
2024 nigel 87 if (utf8 && d >= 128)
2025 nigel 77 {
2026     #ifdef SUPPORT_UCP
2027 ph10 349 otherd = UCD_OTHERCASE(d);
2028 nigel 77 #endif /* SUPPORT_UCP */
2029     }
2030     else
2031     #endif /* SUPPORT_UTF8 */
2032     otherd = fcc[d];
2033     }
2034     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2035 nigel 93 {
2036     if (count > 0 &&
2037     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2038     {
2039     active_count--; /* Remove non-match possibility */
2040     next_active_state--;
2041     }
2042     count++;
2043     ADD_NEW(state_offset, count);
2044     }
2045 nigel 77 }
2046     break;
2047    
2048     /*-----------------------------------------------------------------*/
2049     case OP_QUERY:
2050     case OP_MINQUERY:
2051 nigel 93 case OP_POSQUERY:
2052 nigel 77 case OP_NOTQUERY:
2053     case OP_NOTMINQUERY:
2054 nigel 93 case OP_NOTPOSQUERY:
2055 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2056     if (clen > 0)
2057     {
2058 nigel 93 unsigned int otherd = NOTACHAR;
2059 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2060 nigel 77 {
2061     #ifdef SUPPORT_UTF8
2062 nigel 87 if (utf8 && d >= 128)
2063 nigel 77 {
2064     #ifdef SUPPORT_UCP
2065 ph10 349 otherd = UCD_OTHERCASE(d);
2066 nigel 77 #endif /* SUPPORT_UCP */
2067     }
2068     else
2069     #endif /* SUPPORT_UTF8 */
2070     otherd = fcc[d];
2071     }
2072     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2073 nigel 93 {
2074     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2075     {
2076     active_count--; /* Remove non-match possibility */
2077     next_active_state--;
2078     }
2079     ADD_NEW(state_offset + dlen + 1, 0);
2080     }
2081 nigel 77 }
2082     break;
2083    
2084     /*-----------------------------------------------------------------*/
2085     case OP_STAR:
2086     case OP_MINSTAR:
2087 nigel 93 case OP_POSSTAR:
2088 nigel 77 case OP_NOTSTAR:
2089     case OP_NOTMINSTAR:
2090 nigel 93 case OP_NOTPOSSTAR:
2091 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2092     if (clen > 0)
2093     {
2094 nigel 93 unsigned int otherd = NOTACHAR;
2095 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2096 nigel 77 {
2097     #ifdef SUPPORT_UTF8
2098 nigel 87 if (utf8 && d >= 128)
2099 nigel 77 {
2100     #ifdef SUPPORT_UCP
2101 ph10 349 otherd = UCD_OTHERCASE(d);
2102 nigel 77 #endif /* SUPPORT_UCP */
2103     }
2104     else
2105     #endif /* SUPPORT_UTF8 */
2106     otherd = fcc[d];
2107     }
2108     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2109 nigel 93 {
2110     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2111     {
2112     active_count--; /* Remove non-match possibility */
2113     next_active_state--;
2114     }
2115     ADD_NEW(state_offset, 0);
2116     }
2117 nigel 77 }
2118     break;
2119    
2120     /*-----------------------------------------------------------------*/
2121     case OP_EXACT:
2122 nigel 93 case OP_NOTEXACT:
2123     count = current_state->count; /* Number already matched */
2124     if (clen > 0)
2125     {
2126     unsigned int otherd = NOTACHAR;
2127     if ((ims & PCRE_CASELESS) != 0)
2128     {
2129     #ifdef SUPPORT_UTF8
2130     if (utf8 && d >= 128)
2131     {
2132     #ifdef SUPPORT_UCP
2133 ph10 349 otherd = UCD_OTHERCASE(d);
2134 nigel 93 #endif /* SUPPORT_UCP */
2135     }
2136     else
2137     #endif /* SUPPORT_UTF8 */
2138     otherd = fcc[d];
2139     }
2140     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2141     {
2142     if (++count >= GET2(code, 1))
2143     { ADD_NEW(state_offset + dlen + 3, 0); }
2144     else
2145     { ADD_NEW(state_offset, count); }
2146     }
2147     }
2148     break;
2149    
2150     /*-----------------------------------------------------------------*/
2151 nigel 77 case OP_UPTO:
2152     case OP_MINUPTO:
2153 nigel 93 case OP_POSUPTO:
2154 nigel 77 case OP_NOTUPTO:
2155     case OP_NOTMINUPTO:
2156 nigel 93 case OP_NOTPOSUPTO:
2157     ADD_ACTIVE(state_offset + dlen + 3, 0);
2158 nigel 77 count = current_state->count; /* Number already matched */
2159     if (clen > 0)
2160     {
2161 nigel 93 unsigned int otherd = NOTACHAR;
2162 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2163     {
2164     #ifdef SUPPORT_UTF8
2165 nigel 87 if (utf8 && d >= 128)
2166 nigel 77 {
2167     #ifdef SUPPORT_UCP
2168 ph10 349 otherd = UCD_OTHERCASE(d);
2169 nigel 77 #endif /* SUPPORT_UCP */
2170     }
2171     else
2172     #endif /* SUPPORT_UTF8 */
2173     otherd = fcc[d];
2174     }
2175     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2176     {
2177 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2178     {
2179     active_count--; /* Remove non-match possibility */
2180     next_active_state--;
2181     }
2182 nigel 77 if (++count >= GET2(code, 1))
2183     { ADD_NEW(state_offset + dlen + 3, 0); }
2184     else
2185     { ADD_NEW(state_offset, count); }
2186     }
2187     }
2188     break;
2189    
2190    
2191     /* ========================================================================== */
2192     /* These are the class-handling opcodes */
2193    
2194     case OP_CLASS:
2195     case OP_NCLASS:
2196     case OP_XCLASS:
2197     {
2198     BOOL isinclass = FALSE;
2199     int next_state_offset;
2200     const uschar *ecode;
2201    
2202     /* For a simple class, there is always just a 32-byte table, and we
2203     can set isinclass from it. */
2204    
2205     if (codevalue != OP_XCLASS)
2206     {
2207     ecode = code + 33;
2208     if (clen > 0)
2209     {
2210     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2211     ((code[1 + c/8] & (1 << (c&7))) != 0);
2212     }
2213     }
2214    
2215     /* An extended class may have a table or a list of single characters,
2216     ranges, or both, and it may be positive or negative. There's a
2217     function that sorts all this out. */
2218    
2219     else
2220     {
2221     ecode = code + GET(code, 1);
2222     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2223     }
2224    
2225     /* At this point, isinclass is set for all kinds of class, and ecode
2226     points to the byte after the end of the class. If there is a
2227     quantifier, this is where it will be. */
2228    
2229     next_state_offset = ecode - start_code;
2230    
2231     switch (*ecode)
2232     {
2233     case OP_CRSTAR:
2234     case OP_CRMINSTAR:
2235     ADD_ACTIVE(next_state_offset + 1, 0);
2236     if (isinclass) { ADD_NEW(state_offset, 0); }
2237     break;
2238    
2239     case OP_CRPLUS:
2240     case OP_CRMINPLUS:
2241     count = current_state->count; /* Already matched */
2242     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2243     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2244     break;
2245    
2246     case OP_CRQUERY:
2247     case OP_CRMINQUERY:
2248     ADD_ACTIVE(next_state_offset + 1, 0);
2249     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2250     break;
2251    
2252     case OP_CRRANGE:
2253     case OP_CRMINRANGE:
2254     count = current_state->count; /* Already matched */
2255     if (count >= GET2(ecode, 1))
2256     { ADD_ACTIVE(next_state_offset + 5, 0); }
2257     if (isinclass)
2258     {
2259 nigel 91 int max = GET2(ecode, 3);
2260     if (++count >= max && max != 0) /* Max 0 => no limit */
2261 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2262     else
2263     { ADD_NEW(state_offset, count); }
2264     }
2265     break;
2266    
2267     default:
2268     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2269     break;
2270     }
2271     }
2272     break;
2273    
2274     /* ========================================================================== */
2275     /* These are the opcodes for fancy brackets of various kinds. We have
2276 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2277     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2278 ph10 341 though the other "backtracking verbs" are not supported. */
2279 ph10 345
2280 ph10 341 case OP_FAIL:
2281 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2282 ph10 345 break;
2283 nigel 77
2284     case OP_ASSERT:
2285     case OP_ASSERT_NOT:
2286     case OP_ASSERTBACK:
2287     case OP_ASSERTBACK_NOT:
2288     {
2289     int rc;
2290     int local_offsets[2];
2291     int local_workspace[1000];
2292     const uschar *endasscode = code + GET(code, 1);
2293    
2294     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2295    
2296     rc = internal_dfa_exec(
2297     md, /* static match data */
2298     code, /* this subexpression's code */
2299     ptr, /* where we currently are */
2300     ptr - start_subject, /* start offset */
2301     local_offsets, /* offset vector */
2302     sizeof(local_offsets)/sizeof(int), /* size of same */
2303     local_workspace, /* workspace vector */
2304     sizeof(local_workspace)/sizeof(int), /* size of same */
2305     ims, /* the current ims flags */
2306     rlevel, /* function recursion level */
2307     recursing); /* pass on regex recursion */
2308    
2309     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2310     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2311     }
2312     break;
2313    
2314     /*-----------------------------------------------------------------*/
2315     case OP_COND:
2316 nigel 93 case OP_SCOND:
2317 nigel 77 {
2318     int local_offsets[1000];
2319     int local_workspace[1000];
2320 ph10 406 int codelink = GET(code, 1);
2321 ph10 397 int condcode;
2322 ph10 406
2323 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2324 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2325 ph10 398 happen for the other conditions. */
2326 nigel 77
2327 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2328 ph10 406 {
2329     rrc = 0;
2330 ph10 397 if (pcre_callout != NULL)
2331     {
2332     pcre_callout_block cb;
2333     cb.version = 1; /* Version 1 of the callout block */
2334     cb.callout_number = code[LINK_SIZE+2];
2335     cb.offset_vector = offsets;
2336     cb.subject = (PCRE_SPTR)start_subject;
2337     cb.subject_length = end_subject - start_subject;
2338     cb.start_match = current_subject - start_subject;
2339     cb.current_position = ptr - start_subject;
2340     cb.pattern_position = GET(code, LINK_SIZE + 3);
2341     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2342     cb.capture_top = 1;
2343     cb.capture_last = -1;
2344     cb.callout_data = md->callout_data;
2345     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2346     }
2347 ph10 398 if (rrc > 0) break; /* Fail this thread */
2348     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2349 ph10 406 }
2350 ph10 398
2351 ph10 397 condcode = code[LINK_SIZE+1];
2352 ph10 406
2353 nigel 93 /* Back reference conditions are not supported */
2354 nigel 77
2355 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2356 ph10 459 return PCRE_ERROR_DFA_UCOND;
2357 nigel 93
2358     /* The DEFINE condition is always false */
2359    
2360     if (condcode == OP_DEF)
2361 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2362 nigel 93
2363     /* The only supported version of OP_RREF is for the value RREF_ANY,
2364     which means "test if in any recursion". We can't test for specifically
2365     recursed groups. */
2366    
2367 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2368 nigel 93 {
2369 nigel 77 int value = GET2(code, LINK_SIZE+2);
2370 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2371 ph10 406 if (recursing > 0)
2372 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2373     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2374 nigel 77 }
2375    
2376     /* Otherwise, the condition is an assertion */
2377    
2378     else
2379     {
2380     int rc;
2381     const uschar *asscode = code + LINK_SIZE + 1;
2382     const uschar *endasscode = asscode + GET(asscode, 1);
2383    
2384     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2385    
2386     rc = internal_dfa_exec(
2387     md, /* fixed match data */
2388     asscode, /* this subexpression's code */
2389     ptr, /* where we currently are */
2390     ptr - start_subject, /* start offset */
2391     local_offsets, /* offset vector */
2392     sizeof(local_offsets)/sizeof(int), /* size of same */
2393     local_workspace, /* workspace vector */
2394     sizeof(local_workspace)/sizeof(int), /* size of same */
2395     ims, /* the current ims flags */
2396     rlevel, /* function recursion level */
2397     recursing); /* pass on regex recursion */
2398    
2399     if ((rc >= 0) ==
2400     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2401 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2402 nigel 77 else
2403 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2404 nigel 77 }
2405     }
2406     break;
2407    
2408     /*-----------------------------------------------------------------*/
2409     case OP_RECURSE:
2410     {
2411     int local_offsets[1000];
2412     int local_workspace[1000];
2413     int rc;
2414    
2415     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2416     recursing + 1));
2417    
2418     rc = internal_dfa_exec(
2419     md, /* fixed match data */
2420     start_code + GET(code, 1), /* this subexpression's code */
2421     ptr, /* where we currently are */
2422     ptr - start_subject, /* start offset */
2423     local_offsets, /* offset vector */
2424     sizeof(local_offsets)/sizeof(int), /* size of same */
2425     local_workspace, /* workspace vector */
2426     sizeof(local_workspace)/sizeof(int), /* size of same */
2427     ims, /* the current ims flags */
2428     rlevel, /* function recursion level */
2429     recursing + 1); /* regex recurse level */
2430    
2431     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2432     recursing + 1, rc));
2433    
2434     /* Ran out of internal offsets */
2435    
2436     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2437    
2438     /* For each successful matched substring, set up the next state with a
2439     count of characters to skip before trying it. Note that the count is in
2440     characters, not bytes. */
2441    
2442     if (rc > 0)
2443     {
2444     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2445     {
2446     const uschar *p = start_subject + local_offsets[rc];
2447     const uschar *pp = start_subject + local_offsets[rc+1];
2448     int charcount = local_offsets[rc+1] - local_offsets[rc];
2449     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2450     if (charcount > 0)
2451     {
2452     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2453     }
2454     else
2455     {
2456     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2457     }
2458     }
2459     }
2460     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2461     }
2462     break;
2463    
2464     /*-----------------------------------------------------------------*/
2465     case OP_ONCE:
2466     {
2467     int local_offsets[2];
2468     int local_workspace[1000];
2469    
2470     int rc = internal_dfa_exec(
2471     md, /* fixed match data */
2472     code, /* this subexpression's code */
2473     ptr, /* where we currently are */
2474     ptr - start_subject, /* start offset */
2475     local_offsets, /* offset vector */
2476     sizeof(local_offsets)/sizeof(int), /* size of same */
2477     local_workspace, /* workspace vector */
2478     sizeof(local_workspace)/sizeof(int), /* size of same */
2479     ims, /* the current ims flags */
2480     rlevel, /* function recursion level */
2481     recursing); /* pass on regex recursion */
2482    
2483     if (rc >= 0)
2484     {
2485     const uschar *end_subpattern = code;
2486     int charcount = local_offsets[1] - local_offsets[0];
2487     int next_state_offset, repeat_state_offset;
2488    
2489     do { end_subpattern += GET(end_subpattern, 1); }
2490     while (*end_subpattern == OP_ALT);
2491     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2492    
2493     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2494     arrange for the repeat state also to be added to the relevant list.
2495     Calculate the offset, or set -1 for no repeat. */
2496    
2497     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2498     *end_subpattern == OP_KETRMIN)?
2499     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2500    
2501     /* If we have matched an empty string, add the next state at the
2502     current character pointer. This is important so that the duplicate
2503     checking kicks in, which is what breaks infinite loops that match an
2504     empty string. */
2505    
2506     if (charcount == 0)
2507     {
2508     ADD_ACTIVE(next_state_offset, 0);
2509     }
2510    
2511     /* Optimization: if there are no more active states, and there
2512     are no new states yet set up, then skip over the subject string
2513     right here, to save looping. Otherwise, set up the new state to swing
2514     into action when the end of the substring is reached. */
2515    
2516     else if (i + 1 >= active_count && new_count == 0)
2517     {
2518     ptr += charcount;
2519     clen = 0;
2520     ADD_NEW(next_state_offset, 0);
2521    
2522     /* If we are adding a repeat state at the new character position,
2523     we must fudge things so that it is the only current state.
2524     Otherwise, it might be a duplicate of one we processed before, and
2525     that would cause it to be skipped. */
2526    
2527     if (repeat_state_offset >= 0)
2528     {
2529     next_active_state = active_states;
2530     active_count = 0;
2531     i = -1;
2532     ADD_ACTIVE(repeat_state_offset, 0);
2533     }
2534     }
2535     else
2536     {
2537     const uschar *p = start_subject + local_offsets[0];
2538     const uschar *pp = start_subject + local_offsets[1];
2539     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2540     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2541     if (repeat_state_offset >= 0)
2542     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2543     }
2544    
2545     }
2546     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2547     }
2548     break;
2549    
2550    
2551     /* ========================================================================== */
2552     /* Handle callouts */
2553    
2554     case OP_CALLOUT:
2555 ph10 406 rrc = 0;
2556 nigel 77 if (pcre_callout != NULL)
2557     {
2558     pcre_callout_block cb;
2559     cb.version = 1; /* Version 1 of the callout block */
2560     cb.callout_number = code[1];
2561     cb.offset_vector = offsets;
2562 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2563 nigel 77 cb.subject_length = end_subject - start_subject;
2564     cb.start_match = current_subject - start_subject;
2565     cb.current_position = ptr - start_subject;
2566     cb.pattern_position = GET(code, 2);
2567     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2568     cb.capture_top = 1;
2569     cb.capture_last = -1;
2570     cb.callout_data = md->callout_data;
2571     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2572 ph10 406 }
2573     if (rrc == 0)
2574     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2575 nigel 77 break;
2576    
2577    
2578     /* ========================================================================== */
2579     default: /* Unsupported opcode */
2580     return PCRE_ERROR_DFA_UITEM;
2581     }
2582    
2583     NEXT_ACTIVE_STATE: continue;
2584    
2585     } /* End of loop scanning active states */
2586    
2587     /* We have finished the processing at the current subject character. If no
2588     new states have been set for the next character, we have found all the
2589     matches that we are going to find. If we are at the top level and partial
2590 ph10 462 matching has been requested, check for appropriate conditions.
2591    
1592     The "forced_fail" variable counts the number of (*F) encountered for the
2593     character. If it is equal to the original active_count (saved in
2594     workspace[1]) it means that (*F) was found on every active state. In this
2595     case we don't want to give a partial match.
2596    
2597     The "reached_end" variable counts the number of threads that have reached the
2598     end of the pattern. The "could_continue" variable is true if a thread could
2599     have continued but for the fact that the end of the subject was reached. */
2600 nigel 77
2601     if (new_count <= 0)
2602     {
2603 ph10 427 if (rlevel == 1 && /* Top level, and */
2604 ph10 462 ( /* either... */
2605     reached_end != workspace[1] || /* Not all reached end */
2606     could_continue /* or some could go on */
2607     ) && /* and... */
2608 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2609 ph10 427 ( /* either... */
2610     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2611     || /* or... */
2612     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2613     match_count < 0) /* no matches */
2614     ) && /* And... */
2615     ptr >= end_subject && /* Reached end of subject */
2616     ptr > current_subject) /* Matched non-empty string */
2617 nigel 77 {
2618     if (offsetcount >= 2)
2619     {
2620 ph10 435 offsets[0] = md->start_used_ptr - start_subject;
2621 nigel 77 offsets[1] = end_subject - start_subject;
2622     }
2623     match_count = PCRE_ERROR_PARTIAL;
2624     }
2625    
2626     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2627     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2628     rlevel*2-2, SP));
2629 nigel 91 break; /* In effect, "return", but see the comment below */
2630 nigel 77 }
2631    
2632     /* One or more states are active for the next character. */
2633    
2634     ptr += clen; /* Advance to next subject character */
2635     } /* Loop to move along the subject string */
2636    
2637 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2638     if we use "return" above, we have compiler trouble. Some compilers warn if
2639     there's nothing here because they think the function doesn't return a value. On
2640     the other hand, if we put a dummy statement here, some more clever compilers
2641     complain that it can't be reached. Sigh. */
2642 nigel 77
2643 nigel 91 return match_count;
2644 nigel 77 }
2645    
2646    
2647    
2648    
2649     /*************************************************
2650     * Execute a Regular Expression - DFA engine *
2651     *************************************************/
2652    
2653     /* This external function applies a compiled re to a subject string using a DFA
2654     engine. This function calls the internal function multiple times if the pattern
2655     is not anchored.
2656    
2657     Arguments:
2658     argument_re points to the compiled expression
2659 ph10 97 extra_data points to extra data or is NULL
2660 nigel 77 subject points to the subject string
2661     length length of subject string (may contain binary zeros)
2662     start_offset where to start in the subject string
2663     options option bits
2664     offsets vector of match offsets
2665     offsetcount size of same
2666     workspace workspace vector
2667     wscount size of same
2668    
2669     Returns: > 0 => number of match offset pairs placed in offsets
2670     = 0 => offsets overflowed; longest matches are present
2671     -1 => failed to match
2672     < -1 => some kind of unexpected problem
2673     */
2674    
2675 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2676 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2677     const char *subject, int length, int start_offset, int options, int *offsets,
2678     int offsetcount, int *workspace, int wscount)
2679     {
         /* Overview of what follows: check the arguments, pick up optional settings
         from extra_data, fill in the match block, and then call internal_dfa_exec()
         at successive start positions until it returns something other than
         PCRE_ERROR_NOMATCH, the match is anchored, or the subject is exhausted. */
2680     real_pcre *re = (real_pcre *)argument_re;
2681     dfa_match_data match_block;
2682 nigel 91 dfa_match_data *md = &match_block;
2683 nigel 77 BOOL utf8, anchored, startline, firstline;
2684     const uschar *current_subject, *end_subject, *lcc;
2685    
2686     pcre_study_data internal_study;
2687     const pcre_study_data *study = NULL;
2688     real_pcre internal_re;
2689    
2690     const uschar *req_byte_ptr;
2691     const uschar *start_bits = NULL;
2692     BOOL first_byte_caseless = FALSE;
2693     BOOL req_byte_caseless = FALSE;
2694     int first_byte = -1;
2695     int req_byte = -1;
2696     int req_byte2 = -1;
2697 nigel 91 int newline;
2698 nigel 77
2699     /* Plausibility checks */
2700    
2701     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2702     if (re == NULL || subject == NULL || workspace == NULL ||
2703     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2704     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
         /* A workspace of fewer than 20 ints is rejected outright. */
2705     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2706    
2707     /* We need to find the pointer to any study data before we test for byte
2708     flipping, so we scan the extra_data block first. This may set two fields in the
2709     match block, so we must initialize them beforehand. However, the other fields
2710     in the match block must not be set until after the byte flipping. */
2711    
2712 nigel 91 md->tables = re->tables;
2713     md->callout_data = NULL;
2714 nigel 77
2715     if (extra_data != NULL)
2716     {
2717     unsigned int flags = extra_data->flags;
2718     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2719     study = (const pcre_study_data *)extra_data->study_data;
         /* Match limits cannot be used with this function: either limit flag being
         set is reported as an error rather than being silently ignored. */
2720     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2721 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2722     return PCRE_ERROR_DFA_UMLIMIT;
2723 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2724 nigel 91 md->callout_data = extra_data->callout_data;
2725 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2726 nigel 91 md->tables = extra_data->tables;
2727 nigel 77 }
2728 ph10 461
2729 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
2730     test for a regex that was compiled on a host of opposite endianness. If this is
2731     the case, flipped values are put in internal_re and internal_study if there was
2732     study data too. */
2733    
2734     if (re->magic_number != MAGIC_NUMBER)
2735     {
2736     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2737     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2738     if (study != NULL) study = &internal_study;
2739     }
2740    
2741     /* Set some local values */
2742    
2743     current_subject = (const unsigned char *)subject + start_offset;
2744     end_subject = (const unsigned char *)subject + length;
         /* Starting req_byte_ptr one byte before the start point guarantees that the
         "p > req_byte_ptr" test in the main loop is true the first time round. */
2745     req_byte_ptr = current_subject - 1;
2746    
2747 nigel 91 #ifdef SUPPORT_UTF8
2748 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2749 nigel 91 #else
2750     utf8 = FALSE;
2751     #endif
2752 nigel 77
         /* A restart (PCRE_DFA_RESTART) is treated as anchored, as is a run-time
         PCRE_ANCHORED request or a pattern that was compiled with PCRE_ANCHORED. */
2753 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2754     (re->options & PCRE_ANCHORED) != 0;
2755    
2756 nigel 77 /* The remaining fixed data for passing around. */
2757    
         /* NOTE(review): the code start is computed from argument_re rather than re,
         presumably because re may have been redirected by _pcre_try_flipped() above
         while the compiled code still follows the caller's block - confirm. */
2758 nigel 91 md->start_code = (const uschar *)argument_re +
2759 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2760 nigel 91 md->start_subject = (const unsigned char *)subject;
2761     md->end_subject = end_subject;
2762 ph10 442 md->start_offset = start_offset;
2763 nigel 91 md->moptions = options;
2764     md->poptions = re->options;
2765 nigel 77
2766 ph10 231 /* If the BSR option is not set at match time, copy what was set
2767     at compile time. */
2768    
2769     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2770     {
2771     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2772     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2773     #ifdef BSR_ANYCRLF
2774     else md->moptions |= PCRE_BSR_ANYCRLF;
2775 ph10 243 #endif
2776     }
2777 ph10 231
2778 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2779     nothing is set at run time, whatever was used at compile time applies. */
2780 nigel 91
         /* After the switch, newline holds either a one- or two-byte sequence packed
         into an int (first byte in bits 8-15 when there are two), or a negative
         sentinel: -1 for "any" and -2 for "anycrlf". It is decoded just below. */
2781 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2782 nigel 93 PCRE_NEWLINE_BITS)
2783 nigel 91 {
2784 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2785 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2786     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2787 nigel 91 case PCRE_NEWLINE_CR+
2788 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2789 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2790 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2791 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2792 nigel 91 }
2793    
2794 ph10 149 if (newline == -2)
2795 nigel 91 {
2796 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2797     }
2798     else if (newline < 0)
2799     {
2800 nigel 93 md->nltype = NLTYPE_ANY;
2801 nigel 91 }
2802     else
2803     {
2804 nigel 93 md->nltype = NLTYPE_FIXED;
2805     if (newline > 255)
2806     {
2807     md->nllen = 2;
2808     md->nl[0] = (newline >> 8) & 255;
2809     md->nl[1] = newline & 255;
2810     }
2811     else
2812     {
2813     md->nllen = 1;
2814     md->nl[0] = newline;
2815     }
2816 nigel 91 }
2817    
2818 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2819     back the character offset. */
2820    
2821     #ifdef SUPPORT_UTF8
2822     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2823     {
         /* A non-negative result from _pcre_valid_utf8() is treated as "invalid". */
2824     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2825     return PCRE_ERROR_BADUTF8;
2826     if (start_offset > 0 && start_offset < length)
2827     {
         /* Only a byte above 127 is examined further. After masking with 0xc0 such
         a byte gives either 0xc0 or 0x80; the 0x80 case is the one rejected as a
         bad starting offset. */
2828     int tb = ((uschar *)subject)[start_offset];
2829     if (tb > 127)
2830     {
2831     tb &= 0xc0;
2832     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2833     }
2834     }
2835     }
2836     #endif
2837    
2838     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2839     is a feature that makes it possible to save compiled regex and re-use them
2840     in other programs later. */
2841    
2842 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2843 nigel 77
2844     /* The lower casing table and the "must be at the start of a line" flag are
2845     used in a loop when finding where to start. */
2846    
2847 nigel 91 lcc = md->tables + lcc_offset;
2848 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2849 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2850    
2851     /* Set up the first character to match, if available. The first_byte value is
2852     never set for an anchored regular expression, but the anchoring may be forced
2853     at run time, so we have to test for anchoring. The first char may be unset for
2854     an unanchored pattern, of course. If there's no first char and the pattern was
2855     studied, there may be a bitmap of possible first characters. */
2856    
2857     if (!anchored)
2858     {
2859 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2860 nigel 77 {
         /* For a caseless first byte, store its lcc[] mapping so that it can be
         compared with lcc[*current_subject] in the scan inside the main loop. */
2861     first_byte = re->first_byte & 255;
2862     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2863     first_byte = lcc[first_byte];
2864     }
2865     else
2866     {
2867 ph10 455 if (!startline && study != NULL &&
2868     (study->flags & PCRE_STUDY_MAPPED) != 0)
2869 nigel 77 start_bits = study->start_bits;
2870     }
2871     }
2872    
2873     /* For anchored or unanchored matches, there may be a "last known required
2874     character" set. */
2875    
2876 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2877 nigel 77 {
2878     req_byte = re->req_byte & 255;
2879     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2880 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2881 nigel 77 }
2882    
2883     /* Call the main matching function, looping for a non-anchored regex after a
2884 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2885     a match. */
2886 nigel 77
2887     for (;;)
2888     {
2889     int rc;
2890    
2891     if ((options & PCRE_DFA_RESTART) == 0)
2892     {
2893     const uschar *save_end_subject = end_subject;
2894    
2895 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2896     line of a multiline string. Implement this by temporarily adjusting
2897     end_subject so that we stop scanning at a newline. If the match fails at
2898     the newline, later code breaks this loop. */
2899 nigel 77
2900     if (firstline)
2901     {
2902 ph10 365 USPTR t = current_subject;
2903     #ifdef SUPPORT_UTF8
2904     if (utf8)
2905 ph10 371 {
2906     while (t < md->end_subject && !IS_NEWLINE(t))
2907 ph10 365 {
2908     t++;
2909     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2910 ph10 371 }
2911 ph10 365 }
2912     else
2913 ph10 371 #endif
2914 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2915 nigel 77 end_subject = t;
2916     }
2917 ph10 392
2918 ph10 389 /* There are some optimizations that avoid running the match if a known
2919 ph10 455 starting point is not found. However, there is an option that disables
2920     these, for testing and for ensuring that all callouts do actually occur. */
2921 nigel 77
2922 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2923 ph10 392 {
2924 ph10 389 /* Advance to a known first byte. */
2925 ph10 392
2926 ph10 389 if (first_byte >= 0)
2927 nigel 77 {
2928 ph10 389 if (first_byte_caseless)
2929     while (current_subject < end_subject &&
2930     lcc[*current_subject] != first_byte)
2931     current_subject++;
2932     else
2933 ph10 392 while (current_subject < end_subject &&
2934 ph10 389 *current_subject != first_byte)
2935     current_subject++;
2936     }
2937 ph10 392
2938 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2939 ph10 392
2940 ph10 389 else if (startline)
2941     {
2942     if (current_subject > md->start_subject + start_offset)
2943     {
2944 ph10 365 #ifdef SUPPORT_UTF8
2945 ph10 389 if (utf8)
2946 ph10 365 {
2947 ph10 392 while (current_subject < end_subject &&
2948 ph10 389 !WAS_NEWLINE(current_subject))
2949     {
2950 ph10 365 current_subject++;
2951 ph10 389 while(current_subject < end_subject &&
2952     (*current_subject & 0xc0) == 0x80)
2953     current_subject++;
2954     }
2955 ph10 371 }
2956 ph10 389 else
2957     #endif
2958     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2959     current_subject++;
2960 ph10 392
2961 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2962     ANYCRLF, and we are now at a LF, advance the match position by one
2963     more character. */
2964 ph10 392
2965 ph10 391 if (current_subject[-1] == CHAR_CR &&
2966 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2967     current_subject < end_subject &&
2968 ph10 391 *current_subject == CHAR_NL)
2969 ph10 389 current_subject++;
2970 ph10 365 }
2971 nigel 77 }
2972 ph10 392
2973 ph10 389 /* Or to a non-unique first char after study */
2974 ph10 392
2975 ph10 389 else if (start_bits != NULL)
2976 nigel 77 {
2977 ph10 389 while (current_subject < end_subject)
2978     {
2979     register unsigned int c = *current_subject;
2980     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2981     else break;
2982     }
2983 nigel 77 }
2984 ph10 392 }
2985 nigel 77
2986     /* Restore fudged end_subject */
2987    
2988     end_subject = save_end_subject;
2989    
2990 ph10 461 /* The following two optimizations are disabled for partial matching or if
2991     disabling is explicitly requested (and of course, by the test above, this
2992 ph10 455 code is not obeyed when restarting after a partial match). */
2993 ph10 461
2994 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2995     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
2996 ph10 461 {
2997 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
2998     is a lower bound; no actual string of that length may actually match the
2999     pattern. Although the value is, strictly, in characters, we treat it as
3000     bytes to avoid spending too much time in this optimization. */
3001 nigel 77
3002 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3003     end_subject - current_subject < study->minlength)
3004     return PCRE_ERROR_NOMATCH;
3005 ph10 461
3006 ph10 455 /* If req_byte is set, we know that that character must appear in the
3007     subject for the match to succeed. If the first character is set, req_byte
3008     must be later in the subject; otherwise the test starts at the match
3009     point. This optimization can save a huge amount of work in patterns with
3010     nested unlimited repeats that aren't going to match. Writing separate
3011     code for cased/caseless versions makes it go faster, as does using an
3012     autoincrement and backing off on a match.
3013 ph10 461
3014 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3015     can take a long time, and give bad performance on quite ordinary
3016     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3017     string... so we don't do this when the string is sufficiently long. */
3018 ph10 461
3019 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3020 nigel 77 {
3021 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3022 ph10 461
3023 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3024     place we found it at last time. */
3025 ph10 461
3026 ph10 455 if (p > req_byte_ptr)
3027 nigel 77 {
         /* In both loops below the post-increment is undone on a hit, so that p
         is left pointing at the byte that matched. */
3028 ph10 455 if (req_byte_caseless)
3029     {
3030     while (p < end_subject)
3031     {
3032     register int pp = *p++;
3033     if (pp == req_byte || pp == req_byte2) { p--; break; }
3034     }
3035     }
3036     else
3037     {
3038     while (p < end_subject)
3039     {
3040     if (*p++ == req_byte) { p--; break; }
3041     }
3042     }
3043 ph10 461
3044 ph10 455 /* If we can't find the required character, break the matching loop,
3045     which will cause a return of PCRE_ERROR_NOMATCH. */
3046 ph10 461
3047 ph10 455 if (p >= end_subject) break;
3048 ph10 461
3049 ph10 455 /* If we have found the required character, save the point where we
3050     found it, so that we don't search again next time round the loop if
3051     the start hasn't passed this character yet. */
3052 ph10 461
3053 ph10 455 req_byte_ptr = p;
3054 nigel 77 }
3055 ph10 461 }
3056 nigel 77 }
3057 ph10 455 } /* End of optimizations that are done when not restarting */
3058 nigel 77
3059     /* OK, now we can do the business */
3060    
3061 ph10 435 md->start_used_ptr = current_subject;
3062 ph10 461
3063 nigel 77 rc = internal_dfa_exec(
3064 nigel 91 md, /* fixed match data */
3065     md->start_code, /* this subexpression's code */
3066     current_subject, /* where we currently are */
3067     start_offset, /* start offset in subject */
3068     offsets, /* offset vector */
3069     offsetcount, /* size of same */
3070     workspace, /* workspace vector */
3071     wscount, /* size of same */
3072 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3073 nigel 91 0, /* function recurse level */
3074     0); /* regex recurse level */
3075 nigel 77
3076     /* Anything other than "no match" means we are done, always; otherwise, carry
3077     on only if not anchored. */
3078    
3079     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3080    
3081     /* Advance to the next subject character unless we are at the end of a line
3082     and firstline is set. */
3083    
3084 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3085 nigel 77 current_subject++;
3086     if (utf8)
3087     {
3088     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3089     current_subject++;
3090     }
         /* Note ">" rather than ">=": one more match attempt is still made with the
         start point exactly at the end of the subject. */
3091     if (current_subject > end_subject) break;
3092    
3093 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3094 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3095     or ANY or ANYCRLF, advance the match position by one more character. */
3096 nigel 93
3097 ph10 391 if (current_subject[-1] == CHAR_CR &&
3098 ph10 226 current_subject < end_subject &&
3099 ph10 391 *current_subject == CHAR_NL &&
3100 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3101 ph10 226 (md->nltype == NLTYPE_ANY ||
3102     md->nltype == NLTYPE_ANYCRLF ||
3103     md->nllen == 2))
3104 nigel 93 current_subject++;
3105    
3106     } /* "Bumpalong" loop */
3107    
3108 nigel 77 return PCRE_ERROR_NOMATCH;
3109     }
3110    
3111     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12