/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 517 - (hide annotations) (download)
Wed May 5 10:44:20 2010 UTC (3 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 111110 byte(s)
Add new special properties Xan, Xps, Xsp, Xwd to help with \w etc.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 473 Copyright (c) 1997-2010 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
/* Each value is added to a type-repeat opcode (OP_TYPESTAR and above) when
the repeated item is one of the listed types, producing a private opcode that
is only ever used as a switch value and never stored. */

#define OP_PROP_EXTRA       300   /* item is OP_PROP or OP_NOTPROP */
#define OP_EXTUNI_EXTRA     320   /* item is OP_EXTUNI */
#define OP_ANYNL_EXTRA      340   /* item is OP_ANYNL */
#define OP_HSPACE_EXTRA     360   /* item is OP_HSPACE or OP_NOT_HSPACE */
#define OP_VSPACE_EXTRA     380   /* item is OP_VSPACE or OP_NOT_VSPACE */
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
125     1, /* Char */
126     1, /* Charnc */
127     1, /* not */
128     /* Positive single-char repeats */
129     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130     3, 3, 3, /* upto, minupto, exact */
131 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
132 nigel 77 /* Negative single-char repeats - only for chars < 256 */
133     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
134     3, 3, 3, /* NOT upto, minupto, exact */
135 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
136 nigel 77 /* Positive type repeats */
137     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* Type upto, minupto, exact */
139 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
140 nigel 77 /* Character class & ref repeats */
141     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
142     0, 0, /* CRRANGE, CRMINRANGE */
143     0, /* CLASS */
144     0, /* NCLASS */
145     0, /* XCLASS - variable length */
146     0, /* REF */
147     0, /* RECURSE */
148     0, /* CALLOUT */
149     0, /* Alt */
150     0, /* Ket */
151     0, /* KetRmax */
152     0, /* KetRmin */
153     0, /* Assert */
154     0, /* Assert not */
155     0, /* Assert behind */
156     0, /* Assert behind not */
157     0, /* Reverse */
158 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
159     0, 0, 0, /* SBRA, SCBRA, SCOND */
160 ph10 498 0, 0, /* CREF, NCREF */
161     0, 0, /* RREF, NRREF */
162 nigel 93 0, /* DEF */
163 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
164 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
165     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
166     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
167 nigel 77 };
168    
169 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
170 ph10 462 remember the fact that a character could have been inspected when the end of
171 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
172     two tables that follow must also be modified. */
173 ph10 462
174     static const uschar poptable[] = {
175     0, /* End */
176 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
177 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
178     1, 1, 1, /* Any, AllAny, Anybyte */
179 ph10 498 1, 1, /* \P, \p */
180 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
181 ph10 498 1, /* \X */
182 ph10 462 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
183     1, /* Char */
184     1, /* Charnc */
185     1, /* not */
186     /* Positive single-char repeats */
187     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
188     1, 1, 1, /* upto, minupto, exact */
189     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
190     /* Negative single-char repeats - only for chars < 256 */
191     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
192     1, 1, 1, /* NOT upto, minupto, exact */
193     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
194     /* Positive type repeats */
195     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
196     1, 1, 1, /* Type upto, minupto, exact */
197     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
198     /* Character class & ref repeats */
199     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200     1, 1, /* CRRANGE, CRMINRANGE */
201     1, /* CLASS */
202     1, /* NCLASS */
203     1, /* XCLASS - variable length */
204     0, /* REF */
205     0, /* RECURSE */
206     0, /* CALLOUT */
207     0, /* Alt */
208     0, /* Ket */
209     0, /* KetRmax */
210     0, /* KetRmin */
211     0, /* Assert */
212     0, /* Assert not */
213     0, /* Assert behind */
214     0, /* Assert behind not */
215     0, /* Reverse */
216     0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
217     0, 0, 0, /* SBRA, SCBRA, SCOND */
218 ph10 498 0, 0, /* CREF, NCREF */
219     0, 0, /* RREF, NRREF */
220 ph10 462 0, /* DEF */
221     0, 0, /* BRAZERO, BRAMINZERO */
222 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
223     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
224     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
225 ph10 462 };
226    
227 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
228     and \w */
229    
230 ph10 327 static const uschar toptable1[] = {
231 ph10 168 0, 0, 0, 0, 0, 0,
232 nigel 77 ctype_digit, ctype_digit,
233     ctype_space, ctype_space,
234     ctype_word, ctype_word,
235 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
236 nigel 77 };
237    
238 ph10 327 static const uschar toptable2[] = {
239 ph10 168 0, 0, 0, 0, 0, 0,
240 nigel 77 ctype_digit, 0,
241     ctype_space, 0,
242     ctype_word, 0,
243 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
244 nigel 77 };
245    
246    
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means */
                                  /*   "wait until 'data' characters have     */
                                  /*   been skipped, then go to -offset"      */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock; used to convert a workspace size
given in ints into a number of states. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
260    
261    
262 ph10 475 #ifdef PCRE_DEBUG
/*************************************************
*        Print character string                  *
*************************************************/

/* Character string printing function for debugging. Bytes for which isprint()
is true are written unchanged; every other byte is written as a backslash, an
'x' and two lower-case hex digits. Nothing is written if length is zero or
negative.

Arguments:
  p           points to string (not modified)
  length      number of bytes
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
const unsigned char *end;

if (length <= 0) return;

for (end = p + length; p < end; p++)
  {
  int c = *p;                     /* unsigned char value: safe for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
289     #endif
290    
291    
292    
293     /*************************************************
294     * Execute a Regular Expression - DFA engine *
295     *************************************************/
296    
297     /* This internal function applies a compiled pattern to a subject string,
298     starting at a given point, using a DFA engine. This function is called from the
299     external one, possibly multiple times if the pattern is not anchored. The
300     function calls itself recursively for some kinds of subpattern.
301    
302     Arguments:
303     md the match_data block with fixed information
304     this_start_code the opening bracket of this subexpression's code
305     current_subject where we currently are in the subject string
306     start_offset start offset in the subject string
307     offsets vector to contain the matching string offsets
308     offsetcount size of same
309     workspace vector of workspace
310     wscount size of same
311     ims the current ims flags
312     rlevel function call recursion level
313     recursing regex recursive call level
314    
315 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
316 ph10 341 = 0 => offsets overflowed; longest matches are present
317 nigel 77 -1 => failed to match
318     < -1 => some kind of unexpected problem
319    
320     The following macros are used for adding states to the two state vectors (one
321     for the current character, one for the following character). */
322    
/* Append a state with opcode offset x and repeat count y to the vector of
states for the current character. The state's ims field is taken from the
variable ims in the calling scope; its data field is left untouched. If the
vector is already full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
The macro uses active_count, wscount, next_active_state, ims and rlevel from
the scope in which it is expanded, and must be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
333    
/* Append a state with opcode offset x, repeat count y and extra data z to the
vector of states for the current character. The state's ims field is taken
from the variable ims in the calling scope. If the vector is already full, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. The macro uses active_count,
wscount, next_active_state, ims and rlevel from the scope in which it is
expanded, and must be followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state->data = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
345    
/* Append a state with opcode offset x and repeat count y to the vector of
states for the following character. The state's ims field is taken from the
variable ims in the calling scope; its data field is left untouched. If the
vector is already full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
The macro uses new_count, wscount, next_new_state, ims and rlevel from the
scope in which it is expanded, and must be followed by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
356    
/* Append a state with opcode offset x, repeat count y and extra data z to the
vector of states for the following character. The state's ims field is taken
from the variable ims in the calling scope. If the vector is already full, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. The macro uses new_count,
wscount, next_new_state, ims and rlevel from the scope in which it is
expanded, and must be followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state->data = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
368    
369     /* And now, here is the code */
370    
371     static int
372     internal_dfa_exec(
373     dfa_match_data *md,
374     const uschar *this_start_code,
375     const uschar *current_subject,
376     int start_offset,
377     int *offsets,
378     int offsetcount,
379     int *workspace,
380     int wscount,
381     int ims,
382     int rlevel,
383     int recursing)
384     {
385     stateblock *active_states, *new_states, *temp_states;
386     stateblock *next_active_state, *next_new_state;
387    
388     const uschar *ctypes, *lcc, *fcc;
389     const uschar *ptr;
390 nigel 93 const uschar *end_code, *first_op;
391 nigel 77
392     int active_count, new_count, match_count;
393    
394     /* Some fields in the md block are frequently referenced, so we load them into
395     independent variables in the hope that this will perform better. */
396    
397     const uschar *start_subject = md->start_subject;
398     const uschar *end_subject = md->end_subject;
399     const uschar *start_code = md->start_code;
400    
401 nigel 87 #ifdef SUPPORT_UTF8
402 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
403 nigel 93 #else
404     BOOL utf8 = FALSE;
405 nigel 87 #endif
406 nigel 77
407     rlevel++;
408     offsetcount &= (-2);
409    
410     wscount -= 2;
411     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
412     (2 * INTS_PER_STATEBLOCK);
413    
414     DPRINTF(("\n%.*s---------------------\n"
415     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
416     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
417    
418     ctypes = md->tables + ctypes_offset;
419     lcc = md->tables + lcc_offset;
420     fcc = md->tables + fcc_offset;
421    
422     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
423    
424     active_states = (stateblock *)(workspace + 2);
425     next_new_state = new_states = active_states + wscount;
426     new_count = 0;
427    
428 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
429     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
430    
/* The first thing in any (sub) pattern is a bracket of some sort. Push all
the alternative states onto the list, and find out where the end is. This
makes it possible to use this function recursively, when we want to stop at a
matching internal ket rather than at the end.

If the first opcode in the first alternative is OP_REVERSE, we are dealing with
a backward assertion. In that case, we have to find out the maximum amount to
move back, and set up each alternative appropriately. */
439    
440 nigel 93 if (*first_op == OP_REVERSE)
441 nigel 77 {
442     int max_back = 0;
443     int gone_back;
444    
445     end_code = this_start_code;
446     do
447     {
448     int back = GET(end_code, 2+LINK_SIZE);
449     if (back > max_back) max_back = back;
450     end_code += GET(end_code, 1);
451     }
452     while (*end_code == OP_ALT);
453    
454     /* If we can't go back the amount required for the longest lookbehind
455     pattern, go back as far as we can; some alternatives may still be viable. */
456    
457     #ifdef SUPPORT_UTF8
458     /* In character mode we have to step back character by character */
459    
460     if (utf8)
461     {
462     for (gone_back = 0; gone_back < max_back; gone_back++)
463     {
464     if (current_subject <= start_subject) break;
465     current_subject--;
466     while (current_subject > start_subject &&
467     (*current_subject & 0xc0) == 0x80)
468     current_subject--;
469     }
470     }
471     else
472     #endif
473    
474     /* In byte-mode we can do this quickly. */
475    
476     {
477     gone_back = (current_subject - max_back < start_subject)?
478     current_subject - start_subject : max_back;
479     current_subject -= gone_back;
480     }
481 ph10 461
482 ph10 435 /* Save the earliest consulted character */
483 nigel 77
484 ph10 461 if (current_subject < md->start_used_ptr)
485     md->start_used_ptr = current_subject;
486    
487 nigel 77 /* Now we can process the individual branches. */
488    
489     end_code = this_start_code;
490     do
491     {
492     int back = GET(end_code, 2+LINK_SIZE);
493     if (back <= gone_back)
494     {
495     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
496     ADD_NEW_DATA(-bstate, 0, gone_back - back);
497     }
498     end_code += GET(end_code, 1);
499     }
500     while (*end_code == OP_ALT);
501     }
502    
503     /* This is the code for a "normal" subpattern (not a backward assertion). The
504     start of a whole pattern is always one of these. If we are at the top level,
505     we may be asked to restart matching from the same point that we reached for a
506     previous partial match. We still have to scan through the top-level branches to
507     find the end state. */
508    
509     else
510     {
511     end_code = this_start_code;
512    
513     /* Restarting */
514    
515     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
516     {
517     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
518     new_count = workspace[1];
519     if (!workspace[0])
520     memcpy(new_states, active_states, new_count * sizeof(stateblock));
521     }
522    
523     /* Not restarting */
524    
525     else
526     {
527 nigel 93 int length = 1 + LINK_SIZE +
528     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
529 nigel 77 do
530     {
531 nigel 93 ADD_NEW(end_code - start_code + length, 0);
532 nigel 77 end_code += GET(end_code, 1);
533 nigel 93 length = 1 + LINK_SIZE;
534 nigel 77 }
535     while (*end_code == OP_ALT);
536     }
537     }
538    
539     workspace[0] = 0; /* Bit indicating which vector is current */
540    
541     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
542    
543     /* Loop for scanning the subject */
544    
545     ptr = current_subject;
546     for (;;)
547     {
548     int i, j;
549 nigel 91 int clen, dlen;
550     unsigned int c, d;
551 ph10 428 int forced_fail = 0;
552 ph10 462 BOOL could_continue = FALSE;
553 nigel 77
554     /* Make the new state list into the active state list and empty the
555     new state list. */
556    
557     temp_states = active_states;
558     active_states = new_states;
559     new_states = temp_states;
560     active_count = new_count;
561     new_count = 0;
562    
563     workspace[0] ^= 1; /* Remember for the restarting feature */
564     workspace[1] = active_count;
565    
566 ph10 475 #ifdef PCRE_DEBUG
567 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
568     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
569     printf("\"\n");
570    
571     printf("%.*sActive states: ", rlevel*2-2, SP);
572     for (i = 0; i < active_count; i++)
573     printf("%d/%d ", active_states[i].offset, active_states[i].count);
574     printf("\n");
575     #endif
576    
577     /* Set the pointers for adding new states */
578    
579     next_active_state = active_states + active_count;
580     next_new_state = new_states;
581    
582     /* Load the current character from the subject outside the loop, as many
583     different states may want to look at it, and we assume that at least one
584     will. */
585    
586     if (ptr < end_subject)
587     {
588 nigel 93 clen = 1; /* Number of bytes in the character */
589 nigel 77 #ifdef SUPPORT_UTF8
590     if (utf8) { GETCHARLEN(c, ptr, clen); } else
591     #endif /* SUPPORT_UTF8 */
592     c = *ptr;
593     }
594     else
595     {
596 nigel 93 clen = 0; /* This indicates the end of the subject */
597     c = NOTACHAR; /* This value should never actually be used */
598 nigel 77 }
599    
600     /* Scan up the active states and act on each one. The result of an action
601     may be to add more states to the currently active list (e.g. on hitting a
602     parenthesis) or it may be to put states on the new list, for considering
603     when we move the character pointer on. */
604    
605     for (i = 0; i < active_count; i++)
606     {
607     stateblock *current_state = active_states + i;
608     const uschar *code;
609     int state_offset = current_state->offset;
610 ph10 397 int count, codevalue, rrc;
611 nigel 77
612 ph10 475 #ifdef PCRE_DEBUG
613 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
614 nigel 93 if (clen == 0) printf("EOL\n");
615 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
616     else printf("0x%02x\n", c);
617     #endif
618    
    /* This variable is referred to implicitly in the ADD_xxx macros. */
620    
621     ims = current_state->ims;
622    
623     /* A negative offset is a special case meaning "hold off going to this
624     (negated) state until the number of characters in the data field have
625     been skipped". */
626    
627     if (state_offset < 0)
628     {
629     if (current_state->data > 0)
630     {
631     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
632     ADD_NEW_DATA(state_offset, current_state->count,
633     current_state->data - 1);
634     continue;
635     }
636     else
637     {
638     current_state->offset = state_offset = -state_offset;
639     }
640     }
641    
642 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
643 ph10 439 See the note at the head of this module about the possibility of improving
644     performance here. */
645 nigel 77
646     for (j = 0; j < i; j++)
647     {
648     if (active_states[j].offset == state_offset &&
649     active_states[j].count == current_state->count)
650     {
651     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
652     goto NEXT_ACTIVE_STATE;
653     }
654     }
655    
656     /* The state offset is the offset to the opcode */
657    
658     code = start_code + state_offset;
659     codevalue = *code;
660    
661 ph10 463 /* If this opcode inspects a character, but we are at the end of the
662     subject, remember the fact for use when testing for a partial match. */
663    
664 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
665 ph10 463 could_continue = TRUE;
666 ph10 462
667 nigel 77 /* If this opcode is followed by an inline character, load it. It is
668     tempting to test for the presence of a subject character here, but that
669     is wrong, because sometimes zero repetitions of the subject are
670     permitted.
671    
672     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
673 ph10 178 argument that is not a data character - but is always one byte long. We
674     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
675     this case. To keep the other cases fast, convert these ones to new opcodes.
676     */
677 nigel 77
678     if (coptable[codevalue] > 0)
679     {
680     dlen = 1;
681     #ifdef SUPPORT_UTF8
682     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
683     #endif /* SUPPORT_UTF8 */
684     d = code[coptable[codevalue]];
685     if (codevalue >= OP_TYPESTAR)
686     {
687 nigel 93 switch(d)
688     {
689     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
690     case OP_NOTPROP:
691     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
692     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
693     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
694 ph10 178 case OP_NOT_HSPACE:
695 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
696 ph10 178 case OP_NOT_VSPACE:
697 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
698 nigel 93 default: break;
699     }
700 nigel 77 }
701     }
702     else
703     {
704     dlen = 0; /* Not strictly necessary, but compilers moan */
705 nigel 93 d = NOTACHAR; /* if these variables are not set. */
706 nigel 77 }
707    
708    
709     /* Now process the individual opcodes */
710    
711     switch (codevalue)
712     {
713 ph10 498 /* ========================================================================== */
714     /* These cases are never obeyed. This is a fudge that causes a compile-
715     time error if the vectors coptable or poptable, which are indexed by
716     opcode, are not the correct length. It seems to be the only way to do
717     such a check at compile time, as the sizeof() operator does not work
718     in the C preprocessor. */
719 ph10 507
720 ph10 498 case OP_TABLE_LENGTH:
721 ph10 507 case OP_TABLE_LENGTH +
722 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
723     (sizeof(poptable) == OP_TABLE_LENGTH)):
724 ph10 507 break;
725 nigel 77
726     /* ========================================================================== */
727     /* Reached a closing bracket. If not at the end of the pattern, carry
728     on with the next opcode. Otherwise, unless we have an empty string and
729 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
730 ph10 442 start of the subject, save the match data, shifting up all previous
731 nigel 77 matches so we always have the longest first. */
732    
733     case OP_KET:
734     case OP_KETRMIN:
735     case OP_KETRMAX:
736     if (code != end_code)
737     {
738     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
739     if (codevalue != OP_KET)
740     {
741     ADD_ACTIVE(state_offset - GET(code, 1), 0);
742     }
743     }
744 ph10 461 else
745 nigel 77 {
746 ph10 461 if (ptr > current_subject ||
747 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
748     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
749     current_subject > start_subject + md->start_offset)))
750 nigel 77 {
751 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
752     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
753     match_count = 0;
754     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
755     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
756     if (offsetcount >= 2)
757     {
758     offsets[0] = current_subject - start_subject;
759     offsets[1] = ptr - start_subject;
760     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
761     offsets[1] - offsets[0], current_subject));
762     }
763     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
764     {
765     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
766     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
767     match_count, rlevel*2-2, SP));
768     return match_count;
769     }
770 ph10 461 }
771 nigel 77 }
772     break;
773    
774     /* ========================================================================== */
775     /* These opcodes add to the current list of states without looking
776     at the current character. */
777    
778     /*-----------------------------------------------------------------*/
779     case OP_ALT:
780     do { code += GET(code, 1); } while (*code == OP_ALT);
781     ADD_ACTIVE(code - start_code, 0);
782     break;
783    
784     /*-----------------------------------------------------------------*/
785     case OP_BRA:
786 nigel 93 case OP_SBRA:
787 nigel 77 do
788     {
789     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
790     code += GET(code, 1);
791     }
792     while (*code == OP_ALT);
793     break;
794    
795     /*-----------------------------------------------------------------*/
796 nigel 93 case OP_CBRA:
797     case OP_SCBRA:
798     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
799     code += GET(code, 1);
800     while (*code == OP_ALT)
801     {
802     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
803     code += GET(code, 1);
804     }
805     break;
806    
807     /*-----------------------------------------------------------------*/
808 nigel 77 case OP_BRAZERO:
809     case OP_BRAMINZERO:
810     ADD_ACTIVE(state_offset + 1, 0);
811     code += 1 + GET(code, 2);
812     while (*code == OP_ALT) code += GET(code, 1);
813     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
814     break;
815    
816     /*-----------------------------------------------------------------*/
817 ph10 335 case OP_SKIPZERO:
818     code += 1 + GET(code, 2);
819     while (*code == OP_ALT) code += GET(code, 1);
820     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
821     break;
822    
823     /*-----------------------------------------------------------------*/
824 nigel 77 case OP_CIRC:
825     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
826 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
827     ptr != end_subject &&
828 nigel 93 WAS_NEWLINE(ptr)))
829 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
830     break;
831    
832     /*-----------------------------------------------------------------*/
833     case OP_EOD:
834     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
835     break;
836    
837     /*-----------------------------------------------------------------*/
838     case OP_OPT:
839     ims = code[1];
840     ADD_ACTIVE(state_offset + 2, 0);
841     break;
842    
843     /*-----------------------------------------------------------------*/
844     case OP_SOD:
845     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
846     break;
847    
848     /*-----------------------------------------------------------------*/
849     case OP_SOM:
850     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
851     break;
852    
853    
854     /* ========================================================================== */
855     /* These opcodes inspect the next subject character, and sometimes
856     the previous one as well, but do not have an argument. The variable
857     clen contains the length of the current character and is zero if we are
858     at the end of the subject. */
859    
860     /*-----------------------------------------------------------------*/
861     case OP_ANY:
862 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
863 nigel 77 { ADD_NEW(state_offset + 1, 0); }
864     break;
865    
866     /*-----------------------------------------------------------------*/
867 ph10 341 case OP_ALLANY:
868     if (clen > 0)
869     { ADD_NEW(state_offset + 1, 0); }
870     break;
871    
872     /*-----------------------------------------------------------------*/
873 nigel 77 case OP_EODN:
874 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
875 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
876     break;
877    
878     /*-----------------------------------------------------------------*/
879     case OP_DOLL:
880     if ((md->moptions & PCRE_NOTEOL) == 0)
881     {
882 nigel 91 if (clen == 0 ||
883 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
884 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
885     ))
886 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
887     }
888 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
889 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
890     break;
891    
892     /*-----------------------------------------------------------------*/
893    
894     case OP_DIGIT:
895     case OP_WHITESPACE:
896     case OP_WORDCHAR:
897     if (clen > 0 && c < 256 &&
898     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
899     { ADD_NEW(state_offset + 1, 0); }
900     break;
901    
902     /*-----------------------------------------------------------------*/
903     case OP_NOT_DIGIT:
904     case OP_NOT_WHITESPACE:
905     case OP_NOT_WORDCHAR:
906     if (clen > 0 && (c >= 256 ||
907     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
908     { ADD_NEW(state_offset + 1, 0); }
909     break;
910    
911     /*-----------------------------------------------------------------*/
912     case OP_WORD_BOUNDARY:
913     case OP_NOT_WORD_BOUNDARY:
914     {
915     int left_word, right_word;
916    
917     if (ptr > start_subject)
918     {
919     const uschar *temp = ptr - 1;
920 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
921 nigel 77 #ifdef SUPPORT_UTF8
922     if (utf8) BACKCHAR(temp);
923     #endif
924     GETCHARTEST(d, temp);
925     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
926     }
927     else left_word = 0;
928    
929 ph10 461 if (clen > 0)
930 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
931 ph10 463 else right_word = 0;
932 nigel 77
933     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
934     { ADD_ACTIVE(state_offset + 1, 0); }
935     }
936     break;
937    
938    
939     /*-----------------------------------------------------------------*/
940     /* Check the next character by Unicode property. We will get here only
941     if the support is in the binary; otherwise a compile-time error occurs.
942     */
943    
944 ph10 151 #ifdef SUPPORT_UCP
945 nigel 77 case OP_PROP:
946     case OP_NOTPROP:
947     if (clen > 0)
948     {
949 nigel 87 BOOL OK;
950 ph10 349 const ucd_record * prop = GET_UCD(c);
951 nigel 87 switch(code[1])
952 nigel 77 {
953 nigel 87 case PT_ANY:
954     OK = TRUE;
955     break;
956    
957     case PT_LAMP:
958 ph10 517 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
959     prop->chartype == ucp_Lt;
960 nigel 87 break;
961    
962     case PT_GC:
963 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
964 nigel 87 break;
965    
966     case PT_PC:
967 ph10 349 OK = prop->chartype == code[2];
968 nigel 87 break;
969    
970     case PT_SC:
971 ph10 349 OK = prop->script == code[2];
972 nigel 87 break;
973 ph10 517
974     /* These are specials for combination cases. */
975    
976     case PT_ALNUM:
977     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
978     _pcre_ucp_gentype[prop->chartype] == ucp_N;
979     break;
980    
981     case PT_SPACE: /* Perl space */
982     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
983     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
984     break;
985    
986     case PT_PXSPACE: /* POSIX space */
987     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
988     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
989     c == CHAR_FF || c == CHAR_CR;
990     break;
991    
992     case PT_WORD:
993     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
994     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
995     c == CHAR_UNDERSCORE;
996     break;
997 nigel 87
998     /* Should never occur, but keep compilers from grumbling. */
999    
1000     default:
1001     OK = codevalue != OP_PROP;
1002     break;
1003 nigel 77 }
1004 nigel 87
1005     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1006 nigel 77 }
1007     break;
1008     #endif
1009    
1010    
1011    
1012     /* ========================================================================== */
1013     /* These opcodes likewise inspect the subject character, but have an
1014     argument that is not a data character. It is one of these opcodes:
1015 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1016     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1017 nigel 77
1018     case OP_TYPEPLUS:
1019     case OP_TYPEMINPLUS:
1020 nigel 93 case OP_TYPEPOSPLUS:
1021 nigel 77 count = current_state->count; /* Already matched */
1022     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1023     if (clen > 0)
1024     {
1025     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1026     (c < 256 &&
1027 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1028 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1029     {
1030 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1031     {
1032     active_count--; /* Remove non-match possibility */
1033     next_active_state--;
1034     }
1035 nigel 77 count++;
1036     ADD_NEW(state_offset, count);
1037     }
1038     }
1039     break;
1040    
1041     /*-----------------------------------------------------------------*/
1042     case OP_TYPEQUERY:
1043     case OP_TYPEMINQUERY:
1044 nigel 93 case OP_TYPEPOSQUERY:
1045 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1046     if (clen > 0)
1047     {
1048     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1049     (c < 256 &&
1050 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1051 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1052     {
1053 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1054     {
1055     active_count--; /* Remove non-match possibility */
1056     next_active_state--;
1057     }
1058 nigel 77 ADD_NEW(state_offset + 2, 0);
1059     }
1060     }
1061     break;
1062    
1063     /*-----------------------------------------------------------------*/
1064     case OP_TYPESTAR:
1065     case OP_TYPEMINSTAR:
1066 nigel 93 case OP_TYPEPOSSTAR:
1067 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1068     if (clen > 0)
1069     {
1070     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1071     (c < 256 &&
1072 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1073 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1074     {
1075 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1076     {
1077     active_count--; /* Remove non-match possibility */
1078     next_active_state--;
1079     }
1080 nigel 77 ADD_NEW(state_offset, 0);
1081     }
1082     }
1083     break;
1084    
1085     /*-----------------------------------------------------------------*/
1086     case OP_TYPEEXACT:
1087 nigel 93 count = current_state->count; /* Number already matched */
1088     if (clen > 0)
1089     {
1090     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1091     (c < 256 &&
1092 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1093 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1094     {
1095     if (++count >= GET2(code, 1))
1096     { ADD_NEW(state_offset + 4, 0); }
1097     else
1098     { ADD_NEW(state_offset, count); }
1099     }
1100     }
1101     break;
1102    
1103     /*-----------------------------------------------------------------*/
1104 nigel 77 case OP_TYPEUPTO:
1105     case OP_TYPEMINUPTO:
1106 nigel 93 case OP_TYPEPOSUPTO:
1107     ADD_ACTIVE(state_offset + 4, 0);
1108 nigel 77 count = current_state->count; /* Number already matched */
1109     if (clen > 0)
1110     {
1111     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1112     (c < 256 &&
1113 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1114 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1115     {
1116 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1117     {
1118     active_count--; /* Remove non-match possibility */
1119     next_active_state--;
1120     }
1121 nigel 77 if (++count >= GET2(code, 1))
1122     { ADD_NEW(state_offset + 4, 0); }
1123     else
1124     { ADD_NEW(state_offset, count); }
1125     }
1126     }
1127     break;
1128    
1129     /* ========================================================================== */
1130     /* These are virtual opcodes that are used when something like
1131 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1132     argument. It keeps the code above fast for the other cases. The argument
1133     is in the d variable. */
1134 nigel 77
1135 ph10 151 #ifdef SUPPORT_UCP
1136 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1137     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1138 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1139 nigel 77 count = current_state->count; /* Already matched */
1140 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1141 nigel 77 if (clen > 0)
1142     {
1143 nigel 87 BOOL OK;
1144 ph10 349 const ucd_record * prop = GET_UCD(c);
1145 nigel 87 switch(code[2])
1146     {
1147     case PT_ANY:
1148     OK = TRUE;
1149     break;
1150    
1151     case PT_LAMP:
1152 ph10 517 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1153     prop->chartype == ucp_Lt;
1154 nigel 87 break;
1155    
1156     case PT_GC:
1157 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1158 nigel 87 break;
1159    
1160     case PT_PC:
1161 ph10 349 OK = prop->chartype == code[3];
1162 nigel 87 break;
1163    
1164     case PT_SC:
1165 ph10 349 OK = prop->script == code[3];
1166 nigel 87 break;
1167    
1168 ph10 517 /* These are specials for combination cases. */
1169    
1170     case PT_ALNUM:
1171     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1172     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1173     break;
1174    
1175     case PT_SPACE: /* Perl space */
1176     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1177     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1178     break;
1179    
1180     case PT_PXSPACE: /* POSIX space */
1181     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1182     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1183     c == CHAR_FF || c == CHAR_CR;
1184     break;
1185    
1186     case PT_WORD:
1187     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1188     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1189     c == CHAR_UNDERSCORE;
1190     break;
1191    
1192 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1193    
1194     default:
1195     OK = codevalue != OP_PROP;
1196     break;
1197     }
1198    
1199 nigel 93 if (OK == (d == OP_PROP))
1200     {
1201     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1202     {
1203     active_count--; /* Remove non-match possibility */
1204     next_active_state--;
1205     }
1206     count++;
1207     ADD_NEW(state_offset, count);
1208     }
1209 nigel 77 }
1210     break;
1211    
1212     /*-----------------------------------------------------------------*/
1213     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1214     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1215 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1216 nigel 77 count = current_state->count; /* Already matched */
1217     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1218 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1219 nigel 77 {
1220     const uschar *nptr = ptr + clen;
1221     int ncount = 0;
1222 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1223     {
1224     active_count--; /* Remove non-match possibility */
1225     next_active_state--;
1226     }
1227 nigel 77 while (nptr < end_subject)
1228     {
1229     int nd;
1230     int ndlen = 1;
1231     GETCHARLEN(nd, nptr, ndlen);
1232 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1233 nigel 77 ncount++;
1234     nptr += ndlen;
1235     }
1236     count++;
1237     ADD_NEW_DATA(-state_offset, count, ncount);
1238     }
1239     break;
1240 ph10 151 #endif
1241 nigel 77
1242     /*-----------------------------------------------------------------*/
1243 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1244     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1245     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1246     count = current_state->count; /* Already matched */
1247     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1248     if (clen > 0)
1249     {
1250     int ncount = 0;
1251     switch (c)
1252     {
1253     case 0x000b:
1254     case 0x000c:
1255     case 0x0085:
1256     case 0x2028:
1257     case 0x2029:
1258 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1259     goto ANYNL01;
1260    
1261     case 0x000d:
1262     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1263     /* Fall through */
1264    
1265     ANYNL01:
1266     case 0x000a:
1267 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1268     {
1269     active_count--; /* Remove non-match possibility */
1270     next_active_state--;
1271     }
1272     count++;
1273     ADD_NEW_DATA(-state_offset, count, ncount);
1274     break;
1275 ph10 231
1276 nigel 93 default:
1277     break;
1278     }
1279     }
1280     break;
1281    
1282     /*-----------------------------------------------------------------*/
1283 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1284     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1285     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1286     count = current_state->count; /* Already matched */
1287     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1288     if (clen > 0)
1289     {
1290 ph10 182 BOOL OK;
1291 ph10 178 switch (c)
1292     {
1293     case 0x000a:
1294     case 0x000b:
1295     case 0x000c:
1296     case 0x000d:
1297     case 0x0085:
1298     case 0x2028:
1299     case 0x2029:
1300     OK = TRUE;
1301 ph10 182 break;
1302 ph10 178
1303     default:
1304     OK = FALSE;
1305 ph10 182 break;
1306 ph10 178 }
1307    
1308     if (OK == (d == OP_VSPACE))
1309 ph10 182 {
1310 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1311     {
1312     active_count--; /* Remove non-match possibility */
1313     next_active_state--;
1314     }
1315     count++;
1316     ADD_NEW_DATA(-state_offset, count, 0);
1317     }
1318     }
1319     break;
1320    
1321     /*-----------------------------------------------------------------*/
1322     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1323     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1324     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1325     count = current_state->count; /* Already matched */
1326     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1327     if (clen > 0)
1328     {
1329 ph10 182 BOOL OK;
1330 ph10 178 switch (c)
1331     {
1332     case 0x09: /* HT */
1333     case 0x20: /* SPACE */
1334     case 0xa0: /* NBSP */
1335     case 0x1680: /* OGHAM SPACE MARK */
1336     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1337     case 0x2000: /* EN QUAD */
1338     case 0x2001: /* EM QUAD */
1339     case 0x2002: /* EN SPACE */
1340     case 0x2003: /* EM SPACE */
1341     case 0x2004: /* THREE-PER-EM SPACE */
1342     case 0x2005: /* FOUR-PER-EM SPACE */
1343     case 0x2006: /* SIX-PER-EM SPACE */
1344     case 0x2007: /* FIGURE SPACE */
1345     case 0x2008: /* PUNCTUATION SPACE */
1346     case 0x2009: /* THIN SPACE */
1347     case 0x200A: /* HAIR SPACE */
1348     case 0x202f: /* NARROW NO-BREAK SPACE */
1349     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1350     case 0x3000: /* IDEOGRAPHIC SPACE */
1351     OK = TRUE;
1352     break;
1353 ph10 182
1354 ph10 178 default:
1355     OK = FALSE;
1356     break;
1357     }
1358 ph10 182
1359 ph10 178 if (OK == (d == OP_HSPACE))
1360 ph10 182 {
1361 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1362     {
1363     active_count--; /* Remove non-match possibility */
1364     next_active_state--;
1365     }
1366     count++;
1367     ADD_NEW_DATA(-state_offset, count, 0);
1368     }
1369     }
1370     break;
1371    
1372     /*-----------------------------------------------------------------*/
1373 ph10 151 #ifdef SUPPORT_UCP
1374 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1375     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1376 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1377 nigel 87 count = 4;
1378 nigel 77 goto QS1;
1379    
1380     case OP_PROP_EXTRA + OP_TYPESTAR:
1381     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1382 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1383 nigel 77 count = 0;
1384    
1385     QS1:
1386    
1387 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1388 nigel 77 if (clen > 0)
1389     {
1390 nigel 87 BOOL OK;
1391 ph10 349 const ucd_record * prop = GET_UCD(c);
1392 nigel 87 switch(code[2])
1393     {
1394     case PT_ANY:
1395     OK = TRUE;
1396     break;
1397    
1398     case PT_LAMP:
1399 ph10 517 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1400     prop->chartype == ucp_Lt;
1401 nigel 87 break;
1402    
1403     case PT_GC:
1404 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1405 nigel 87 break;
1406    
1407     case PT_PC:
1408 ph10 349 OK = prop->chartype == code[3];
1409 nigel 87 break;
1410    
1411     case PT_SC:
1412 ph10 349 OK = prop->script == code[3];
1413 nigel 87 break;
1414 ph10 517
1415     /* These are specials for combination cases. */
1416    
1417     case PT_ALNUM:
1418     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1419     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1420     break;
1421    
1422     case PT_SPACE: /* Perl space */
1423     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1424     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1425     break;
1426    
1427     case PT_PXSPACE: /* POSIX space */
1428     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1429     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1430     c == CHAR_FF || c == CHAR_CR;
1431     break;
1432    
1433     case PT_WORD:
1434     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1435     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1436     c == CHAR_UNDERSCORE;
1437     break;
1438 nigel 87
1439     /* Should never occur, but keep compilers from grumbling. */
1440    
1441     default:
1442     OK = codevalue != OP_PROP;
1443     break;
1444     }
1445    
1446 nigel 93 if (OK == (d == OP_PROP))
1447     {
1448     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1449     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1450     {
1451     active_count--; /* Remove non-match possibility */
1452     next_active_state--;
1453     }
1454     ADD_NEW(state_offset + count, 0);
1455     }
1456 nigel 77 }
1457     break;
1458    
1459     /*-----------------------------------------------------------------*/
1460     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1461     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1462 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1463 nigel 77 count = 2;
1464     goto QS2;
1465    
1466     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1467     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1468 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1469 nigel 77 count = 0;
1470    
1471     QS2:
1472    
1473     ADD_ACTIVE(state_offset + 2, 0);
1474 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1475 nigel 77 {
1476     const uschar *nptr = ptr + clen;
1477     int ncount = 0;
1478 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1479     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1480     {
1481     active_count--; /* Remove non-match possibility */
1482     next_active_state--;
1483     }
1484 nigel 77 while (nptr < end_subject)
1485     {
1486     int nd;
1487     int ndlen = 1;
1488     GETCHARLEN(nd, nptr, ndlen);
1489 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1490 nigel 77 ncount++;
1491     nptr += ndlen;
1492     }
1493     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1494     }
1495     break;
1496 ph10 151 #endif
1497 nigel 77
1498     /*-----------------------------------------------------------------*/
1499 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1500     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1501     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1502     count = 2;
1503     goto QS3;
1504    
1505     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1506     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1507     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1508     count = 0;
1509    
1510     QS3:
1511     ADD_ACTIVE(state_offset + 2, 0);
1512     if (clen > 0)
1513     {
1514     int ncount = 0;
1515     switch (c)
1516     {
1517     case 0x000b:
1518     case 0x000c:
1519     case 0x0085:
1520     case 0x2028:
1521     case 0x2029:
1522 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1523     goto ANYNL02;
1524    
1525     case 0x000d:
1526     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1527     /* Fall through */
1528    
1529     ANYNL02:
1530     case 0x000a:
1531 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1532     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1533     {
1534     active_count--; /* Remove non-match possibility */
1535     next_active_state--;
1536     }
1537     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1538     break;
1539 ph10 231
1540 nigel 93 default:
1541     break;
1542     }
1543     }
1544     break;
1545    
1546     /*-----------------------------------------------------------------*/
1547 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1548     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1549     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1550     count = 2;
1551     goto QS4;
1552    
1553     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1554     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1555     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1556     count = 0;
1557    
1558     QS4:
1559     ADD_ACTIVE(state_offset + 2, 0);
1560     if (clen > 0)
1561     {
1562 ph10 182 BOOL OK;
1563 ph10 178 switch (c)
1564     {
1565     case 0x000a:
1566     case 0x000b:
1567     case 0x000c:
1568     case 0x000d:
1569     case 0x0085:
1570     case 0x2028:
1571     case 0x2029:
1572     OK = TRUE;
1573     break;
1574 ph10 182
1575 ph10 178 default:
1576     OK = FALSE;
1577     break;
1578     }
1579     if (OK == (d == OP_VSPACE))
1580 ph10 182 {
1581 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1582     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1583     {
1584     active_count--; /* Remove non-match possibility */
1585     next_active_state--;
1586     }
1587     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1588     }
1589     }
1590     break;
1591    
1592     /*-----------------------------------------------------------------*/
1593     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1594     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1595     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1596     count = 2;
1597     goto QS5;
1598    
1599     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1600     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1601     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1602     count = 0;
1603    
1604     QS5:
1605     ADD_ACTIVE(state_offset + 2, 0);
1606     if (clen > 0)
1607     {
1608 ph10 182 BOOL OK;
1609 ph10 178 switch (c)
1610     {
1611     case 0x09: /* HT */
1612     case 0x20: /* SPACE */
1613     case 0xa0: /* NBSP */
1614     case 0x1680: /* OGHAM SPACE MARK */
1615     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1616     case 0x2000: /* EN QUAD */
1617     case 0x2001: /* EM QUAD */
1618     case 0x2002: /* EN SPACE */
1619     case 0x2003: /* EM SPACE */
1620     case 0x2004: /* THREE-PER-EM SPACE */
1621     case 0x2005: /* FOUR-PER-EM SPACE */
1622     case 0x2006: /* SIX-PER-EM SPACE */
1623     case 0x2007: /* FIGURE SPACE */
1624     case 0x2008: /* PUNCTUATION SPACE */
1625     case 0x2009: /* THIN SPACE */
1626     case 0x200A: /* HAIR SPACE */
1627     case 0x202f: /* NARROW NO-BREAK SPACE */
1628     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1629     case 0x3000: /* IDEOGRAPHIC SPACE */
1630     OK = TRUE;
1631     break;
1632 ph10 182
1633 ph10 178 default:
1634     OK = FALSE;
1635     break;
1636     }
1637 ph10 182
1638 ph10 178 if (OK == (d == OP_HSPACE))
1639 ph10 182 {
1640 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1641     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1642     {
1643     active_count--; /* Remove non-match possibility */
1644     next_active_state--;
1645     }
1646     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1647     }
1648     }
1649     break;
1650    
1651     /*-----------------------------------------------------------------*/
1652 ph10 151 #ifdef SUPPORT_UCP
1653 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1654     case OP_PROP_EXTRA + OP_TYPEUPTO:
1655     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1656 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1657 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1658 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1659 nigel 77 count = current_state->count; /* Number already matched */
1660     if (clen > 0)
1661     {
1662 nigel 87 BOOL OK;
1663 ph10 349 const ucd_record * prop = GET_UCD(c);
1664 nigel 87 switch(code[4])
1665 nigel 77 {
1666 nigel 87 case PT_ANY:
1667     OK = TRUE;
1668     break;
1669    
1670     case PT_LAMP:
1671 ph10 517 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1672     prop->chartype == ucp_Lt;
1673 nigel 87 break;
1674    
1675     case PT_GC:
1676 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1677 nigel 87 break;
1678    
1679     case PT_PC:
1680 ph10 349 OK = prop->chartype == code[5];
1681 nigel 87 break;
1682    
1683     case PT_SC:
1684 ph10 349 OK = prop->script == code[5];
1685 nigel 87 break;
1686 ph10 517
1687     /* These are specials for combination cases. */
1688    
1689     case PT_ALNUM:
1690     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1691     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1692     break;
1693    
1694     case PT_SPACE: /* Perl space */
1695     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1696     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1697     break;
1698    
1699     case PT_PXSPACE: /* POSIX space */
1700     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1701     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1702     c == CHAR_FF || c == CHAR_CR;
1703     break;
1704    
1705     case PT_WORD:
1706     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1707     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1708     c == CHAR_UNDERSCORE;
1709     break;
1710 nigel 87
1711     /* Should never occur, but keep compilers from grumbling. */
1712    
1713     default:
1714     OK = codevalue != OP_PROP;
1715     break;
1716     }
1717    
1718     if (OK == (d == OP_PROP))
1719     {
1720 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1721     {
1722     active_count--; /* Remove non-match possibility */
1723     next_active_state--;
1724     }
1725 nigel 77 if (++count >= GET2(code, 1))
1726 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1727 nigel 77 else
1728     { ADD_NEW(state_offset, count); }
1729     }
1730     }
1731     break;
1732    
1733     /*-----------------------------------------------------------------*/
1734     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1735     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1736     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1737 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1738 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1739     { ADD_ACTIVE(state_offset + 4, 0); }
1740     count = current_state->count; /* Number already matched */
1741 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1742 nigel 77 {
1743     const uschar *nptr = ptr + clen;
1744     int ncount = 0;
1745 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1746     {
1747     active_count--; /* Remove non-match possibility */
1748     next_active_state--;
1749     }
1750 nigel 77 while (nptr < end_subject)
1751     {
1752     int nd;
1753     int ndlen = 1;
1754     GETCHARLEN(nd, nptr, ndlen);
1755 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1756 nigel 77 ncount++;
1757     nptr += ndlen;
1758     }
1759     if (++count >= GET2(code, 1))
1760     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1761     else
1762     { ADD_NEW_DATA(-state_offset, count, ncount); }
1763     }
1764     break;
1765 ph10 151 #endif
1766 nigel 77
1767 nigel 93 /*-----------------------------------------------------------------*/
1768     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1769     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1770     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1771     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1772     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1773     { ADD_ACTIVE(state_offset + 4, 0); }
1774     count = current_state->count; /* Number already matched */
1775     if (clen > 0)
1776     {
1777     int ncount = 0;
1778     switch (c)
1779     {
1780     case 0x000b:
1781     case 0x000c:
1782     case 0x0085:
1783     case 0x2028:
1784     case 0x2029:
1785 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1786     goto ANYNL03;
1787    
1788     case 0x000d:
1789     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1790     /* Fall through */
1791    
1792     ANYNL03:
1793     case 0x000a:
1794 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1795     {
1796     active_count--; /* Remove non-match possibility */
1797     next_active_state--;
1798     }
1799     if (++count >= GET2(code, 1))
1800     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1801     else
1802     { ADD_NEW_DATA(-state_offset, count, ncount); }
1803     break;
1804 ph10 231
1805 nigel 93 default:
1806     break;
1807     }
1808     }
1809     break;
1810    
1811 ph10 178 /*-----------------------------------------------------------------*/
1812     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1813     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1814     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1815     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1816     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1817     { ADD_ACTIVE(state_offset + 4, 0); }
1818     count = current_state->count; /* Number already matched */
1819     if (clen > 0)
1820     {
1821 ph10 182 BOOL OK;
1822 ph10 178 switch (c)
1823     {
1824     case 0x000a:
1825     case 0x000b:
1826     case 0x000c:
1827     case 0x000d:
1828     case 0x0085:
1829     case 0x2028:
1830     case 0x2029:
1831     OK = TRUE;
1832     break;
1833 ph10 182
1834 ph10 178 default:
1835     OK = FALSE;
1836     }
1837 ph10 182
1838 ph10 178 if (OK == (d == OP_VSPACE))
1839 ph10 182 {
1840 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1841     {
1842     active_count--; /* Remove non-match possibility */
1843     next_active_state--;
1844     }
1845     if (++count >= GET2(code, 1))
1846     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1847     else
1848     { ADD_NEW_DATA(-state_offset, count, 0); }
1849     }
1850     }
1851     break;
1852    
1853     /*-----------------------------------------------------------------*/
1854     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1855     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1856     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1857     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1858     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1859     { ADD_ACTIVE(state_offset + 4, 0); }
1860     count = current_state->count; /* Number already matched */
1861     if (clen > 0)
1862     {
1863 ph10 182 BOOL OK;
1864 ph10 178 switch (c)
1865     {
1866     case 0x09: /* HT */
1867     case 0x20: /* SPACE */
1868     case 0xa0: /* NBSP */
1869     case 0x1680: /* OGHAM SPACE MARK */
1870     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1871     case 0x2000: /* EN QUAD */
1872     case 0x2001: /* EM QUAD */
1873     case 0x2002: /* EN SPACE */
1874     case 0x2003: /* EM SPACE */
1875     case 0x2004: /* THREE-PER-EM SPACE */
1876     case 0x2005: /* FOUR-PER-EM SPACE */
1877     case 0x2006: /* SIX-PER-EM SPACE */
1878     case 0x2007: /* FIGURE SPACE */
1879     case 0x2008: /* PUNCTUATION SPACE */
1880     case 0x2009: /* THIN SPACE */
1881     case 0x200A: /* HAIR SPACE */
1882     case 0x202f: /* NARROW NO-BREAK SPACE */
1883     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1884     case 0x3000: /* IDEOGRAPHIC SPACE */
1885     OK = TRUE;
1886     break;
1887 ph10 182
1888 ph10 178 default:
1889     OK = FALSE;
1890     break;
1891     }
1892 ph10 182
1893 ph10 178 if (OK == (d == OP_HSPACE))
1894 ph10 182 {
1895 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1896     {
1897     active_count--; /* Remove non-match possibility */
1898     next_active_state--;
1899     }
1900     if (++count >= GET2(code, 1))
1901     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1902     else
1903     { ADD_NEW_DATA(-state_offset, count, 0); }
1904     }
1905     }
1906     break;
1907    
1908 nigel 77 /* ========================================================================== */
1909     /* These opcodes are followed by a character that is usually compared
1910     to the current subject character; it is loaded into d. We still get
1911     here even if there is no subject character, because in some cases zero
1912     repetitions are permitted. */
1913    
1914     /*-----------------------------------------------------------------*/
1915     case OP_CHAR:
1916     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1917     break;
1918    
1919     /*-----------------------------------------------------------------*/
1920     case OP_CHARNC:
1921     if (clen == 0) break;
1922    
1923     #ifdef SUPPORT_UTF8
1924     if (utf8)
1925     {
1926     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1927     {
1928 nigel 93 unsigned int othercase;
1929 nigel 77 if (c < 128) othercase = fcc[c]; else
1930    
1931     /* If we have Unicode property support, we can use it to test the
1932 nigel 87 other case of the character. */
1933 nigel 77
1934     #ifdef SUPPORT_UCP
1935 ph10 349 othercase = UCD_OTHERCASE(c);
1936 nigel 87 #else
1937 nigel 93 othercase = NOTACHAR;
1938 nigel 77 #endif
1939    
1940     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1941     }
1942     }
1943     else
1944     #endif /* SUPPORT_UTF8 */
1945    
1946     /* Non-UTF-8 mode */
1947     {
1948     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1949     }
1950     break;
1951    
1952    
1953     #ifdef SUPPORT_UCP
1954     /*-----------------------------------------------------------------*/
1955     /* This is a tricky one because it can match more than one character.
1956     Find out how many characters to skip, and then set up a negative state
1957     to wait for them to pass before continuing. */
1958    
1959     case OP_EXTUNI:
1960 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1961 nigel 77 {
1962     const uschar *nptr = ptr + clen;
1963     int ncount = 0;
1964     while (nptr < end_subject)
1965     {
1966     int nclen = 1;
1967     GETCHARLEN(c, nptr, nclen);
1968 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1969 nigel 77 ncount++;
1970     nptr += nclen;
1971     }
1972     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1973     }
1974     break;
1975     #endif
1976    
1977     /*-----------------------------------------------------------------*/
1978 nigel 93 /* This is a tricky one, like EXTUNI, because it too can match more than one
1979     character (when CR is followed by LF). In this case, set up a negative
1980     state to wait for one character to pass before continuing. */
1981    
1982     case OP_ANYNL:
1983     if (clen > 0) switch(c)
1984     {
1985     case 0x000b:
1986     case 0x000c:
1987     case 0x0085:
1988     case 0x2028:
1989     case 0x2029:
1990 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1991    
1992     case 0x000a:
1993 nigel 93 ADD_NEW(state_offset + 1, 0);
1994     break;
1995 ph10 231
1996 nigel 93 case 0x000d:
1997     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1998     {
1999     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2000     }
2001     else
2002     {
2003     ADD_NEW(state_offset + 1, 0);
2004     }
2005     break;
2006     }
2007     break;
2008    
2009     /*-----------------------------------------------------------------*/
2010 ph10 178 case OP_NOT_VSPACE:
2011     if (clen > 0) switch(c)
2012     {
2013     case 0x000a:
2014     case 0x000b:
2015     case 0x000c:
2016     case 0x000d:
2017     case 0x0085:
2018     case 0x2028:
2019     case 0x2029:
2020     break;
2021 ph10 182
2022     default:
2023 ph10 178 ADD_NEW(state_offset + 1, 0);
2024     break;
2025     }
2026     break;
2027    
2028     /*-----------------------------------------------------------------*/
2029     case OP_VSPACE:
2030     if (clen > 0) switch(c)
2031     {
2032     case 0x000a:
2033     case 0x000b:
2034     case 0x000c:
2035     case 0x000d:
2036     case 0x0085:
2037     case 0x2028:
2038     case 0x2029:
2039     ADD_NEW(state_offset + 1, 0);
2040     break;
2041 ph10 182
2042 ph10 178 default: break;
2043     }
2044     break;
2045    
2046     /*-----------------------------------------------------------------*/
2047     case OP_NOT_HSPACE:
2048     if (clen > 0) switch(c)
2049     {
2050     case 0x09: /* HT */
2051     case 0x20: /* SPACE */
2052     case 0xa0: /* NBSP */
2053     case 0x1680: /* OGHAM SPACE MARK */
2054     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2055     case 0x2000: /* EN QUAD */
2056     case 0x2001: /* EM QUAD */
2057     case 0x2002: /* EN SPACE */
2058     case 0x2003: /* EM SPACE */
2059     case 0x2004: /* THREE-PER-EM SPACE */
2060     case 0x2005: /* FOUR-PER-EM SPACE */
2061     case 0x2006: /* SIX-PER-EM SPACE */
2062     case 0x2007: /* FIGURE SPACE */
2063     case 0x2008: /* PUNCTUATION SPACE */
2064     case 0x2009: /* THIN SPACE */
2065     case 0x200A: /* HAIR SPACE */
2066     case 0x202f: /* NARROW NO-BREAK SPACE */
2067     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2068     case 0x3000: /* IDEOGRAPHIC SPACE */
2069     break;
2070 ph10 182
2071     default:
2072 ph10 178 ADD_NEW(state_offset + 1, 0);
2073     break;
2074     }
2075     break;
2076    
2077     /*-----------------------------------------------------------------*/
2078     case OP_HSPACE:
2079     if (clen > 0) switch(c)
2080     {
2081     case 0x09: /* HT */
2082     case 0x20: /* SPACE */
2083     case 0xa0: /* NBSP */
2084     case 0x1680: /* OGHAM SPACE MARK */
2085     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2086     case 0x2000: /* EN QUAD */
2087     case 0x2001: /* EM QUAD */
2088     case 0x2002: /* EN SPACE */
2089     case 0x2003: /* EM SPACE */
2090     case 0x2004: /* THREE-PER-EM SPACE */
2091     case 0x2005: /* FOUR-PER-EM SPACE */
2092     case 0x2006: /* SIX-PER-EM SPACE */
2093     case 0x2007: /* FIGURE SPACE */
2094     case 0x2008: /* PUNCTUATION SPACE */
2095     case 0x2009: /* THIN SPACE */
2096     case 0x200A: /* HAIR SPACE */
2097     case 0x202f: /* NARROW NO-BREAK SPACE */
2098     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2099     case 0x3000: /* IDEOGRAPHIC SPACE */
2100     ADD_NEW(state_offset + 1, 0);
2101     break;
2102     }
2103     break;
2104    
2105     /*-----------------------------------------------------------------*/
2106 nigel 77 /* Match a negated single character. This is only used for one-byte
2107     characters, that is, we know that d < 256. The character we are
2108     checking (c) can be multibyte. */
2109    
2110     case OP_NOT:
2111     if (clen > 0)
2112     {
2113 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2114 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2115     }
2116     break;
2117    
2118     /*-----------------------------------------------------------------*/
2119     case OP_PLUS:
2120     case OP_MINPLUS:
2121 nigel 93 case OP_POSPLUS:
2122 nigel 77 case OP_NOTPLUS:
2123     case OP_NOTMINPLUS:
2124 nigel 93 case OP_NOTPOSPLUS:
2125 nigel 77 count = current_state->count; /* Already matched */
2126     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2127     if (clen > 0)
2128     {
2129 nigel 93 unsigned int otherd = NOTACHAR;
2130 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2131     {
2132     #ifdef SUPPORT_UTF8
2133 nigel 87 if (utf8 && d >= 128)
2134 nigel 77 {
2135     #ifdef SUPPORT_UCP
2136 ph10 349 otherd = UCD_OTHERCASE(d);
2137 nigel 77 #endif /* SUPPORT_UCP */
2138     }
2139     else
2140     #endif /* SUPPORT_UTF8 */
2141     otherd = fcc[d];
2142     }
2143     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2144 nigel 93 {
2145     if (count > 0 &&
2146     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2147     {
2148     active_count--; /* Remove non-match possibility */
2149     next_active_state--;
2150     }
2151     count++;
2152     ADD_NEW(state_offset, count);
2153     }
2154 nigel 77 }
2155     break;
2156    
2157     /*-----------------------------------------------------------------*/
2158     case OP_QUERY:
2159     case OP_MINQUERY:
2160 nigel 93 case OP_POSQUERY:
2161 nigel 77 case OP_NOTQUERY:
2162     case OP_NOTMINQUERY:
2163 nigel 93 case OP_NOTPOSQUERY:
2164 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2165     if (clen > 0)
2166     {
2167 nigel 93 unsigned int otherd = NOTACHAR;
2168 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2169 nigel 77 {
2170     #ifdef SUPPORT_UTF8
2171 nigel 87 if (utf8 && d >= 128)
2172 nigel 77 {
2173     #ifdef SUPPORT_UCP
2174 ph10 349 otherd = UCD_OTHERCASE(d);
2175 nigel 77 #endif /* SUPPORT_UCP */
2176     }
2177     else
2178     #endif /* SUPPORT_UTF8 */
2179     otherd = fcc[d];
2180     }
2181     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2182 nigel 93 {
2183     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2184     {
2185     active_count--; /* Remove non-match possibility */
2186     next_active_state--;
2187     }
2188     ADD_NEW(state_offset + dlen + 1, 0);
2189     }
2190 nigel 77 }
2191     break;
2192    
2193     /*-----------------------------------------------------------------*/
2194     case OP_STAR:
2195     case OP_MINSTAR:
2196 nigel 93 case OP_POSSTAR:
2197 nigel 77 case OP_NOTSTAR:
2198     case OP_NOTMINSTAR:
2199 nigel 93 case OP_NOTPOSSTAR:
2200 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2201     if (clen > 0)
2202     {
2203 nigel 93 unsigned int otherd = NOTACHAR;
2204 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2205 nigel 77 {
2206     #ifdef SUPPORT_UTF8
2207 nigel 87 if (utf8 && d >= 128)
2208 nigel 77 {
2209     #ifdef SUPPORT_UCP
2210 ph10 349 otherd = UCD_OTHERCASE(d);
2211 nigel 77 #endif /* SUPPORT_UCP */
2212     }
2213     else
2214     #endif /* SUPPORT_UTF8 */
2215     otherd = fcc[d];
2216     }
2217     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2218 nigel 93 {
2219     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2220     {
2221     active_count--; /* Remove non-match possibility */
2222     next_active_state--;
2223     }
2224     ADD_NEW(state_offset, 0);
2225     }
2226 nigel 77 }
2227     break;
2228    
2229     /*-----------------------------------------------------------------*/
2230     case OP_EXACT:
2231 nigel 93 case OP_NOTEXACT:
2232     count = current_state->count; /* Number already matched */
2233     if (clen > 0)
2234     {
2235     unsigned int otherd = NOTACHAR;
2236     if ((ims & PCRE_CASELESS) != 0)
2237     {
2238     #ifdef SUPPORT_UTF8
2239     if (utf8 && d >= 128)
2240     {
2241     #ifdef SUPPORT_UCP
2242 ph10 349 otherd = UCD_OTHERCASE(d);
2243 nigel 93 #endif /* SUPPORT_UCP */
2244     }
2245     else
2246     #endif /* SUPPORT_UTF8 */
2247     otherd = fcc[d];
2248     }
2249     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2250     {
2251     if (++count >= GET2(code, 1))
2252     { ADD_NEW(state_offset + dlen + 3, 0); }
2253     else
2254     { ADD_NEW(state_offset, count); }
2255     }
2256     }
2257     break;
2258    
2259     /*-----------------------------------------------------------------*/
2260 nigel 77 case OP_UPTO:
2261     case OP_MINUPTO:
2262 nigel 93 case OP_POSUPTO:
2263 nigel 77 case OP_NOTUPTO:
2264     case OP_NOTMINUPTO:
2265 nigel 93 case OP_NOTPOSUPTO:
2266     ADD_ACTIVE(state_offset + dlen + 3, 0);
2267 nigel 77 count = current_state->count; /* Number already matched */
2268     if (clen > 0)
2269     {
2270 nigel 93 unsigned int otherd = NOTACHAR;
2271 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2272     {
2273     #ifdef SUPPORT_UTF8
2274 nigel 87 if (utf8 && d >= 128)
2275 nigel 77 {
2276     #ifdef SUPPORT_UCP
2277 ph10 349 otherd = UCD_OTHERCASE(d);
2278 nigel 77 #endif /* SUPPORT_UCP */
2279     }
2280     else
2281     #endif /* SUPPORT_UTF8 */
2282     otherd = fcc[d];
2283     }
2284     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2285     {
2286 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2287     {
2288     active_count--; /* Remove non-match possibility */
2289     next_active_state--;
2290     }
2291 nigel 77 if (++count >= GET2(code, 1))
2292     { ADD_NEW(state_offset + dlen + 3, 0); }
2293     else
2294     { ADD_NEW(state_offset, count); }
2295     }
2296     }
2297     break;
2298    
2299    
2300     /* ========================================================================== */
2301     /* These are the class-handling opcodes */
2302    
2303     case OP_CLASS:
2304     case OP_NCLASS:
2305     case OP_XCLASS:
2306     {
2307     BOOL isinclass = FALSE;
2308     int next_state_offset;
2309     const uschar *ecode;
2310    
2311     /* For a simple class, there is always just a 32-byte table, and we
2312     can set isinclass from it. */
2313    
2314     if (codevalue != OP_XCLASS)
2315     {
2316     ecode = code + 33;
2317     if (clen > 0)
2318     {
2319     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2320     ((code[1 + c/8] & (1 << (c&7))) != 0);
2321     }
2322     }
2323    
2324     /* An extended class may have a table or a list of single characters,
2325     ranges, or both, and it may be positive or negative. There's a
2326     function that sorts all this out. */
2327    
2328     else
2329     {
2330     ecode = code + GET(code, 1);
2331     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2332     }
2333    
2334     /* At this point, isinclass is set for all kinds of class, and ecode
2335     points to the byte after the end of the class. If there is a
2336     quantifier, this is where it will be. */
2337    
2338     next_state_offset = ecode - start_code;
2339    
2340     switch (*ecode)
2341     {
2342     case OP_CRSTAR:
2343     case OP_CRMINSTAR:
2344     ADD_ACTIVE(next_state_offset + 1, 0);
2345     if (isinclass) { ADD_NEW(state_offset, 0); }
2346     break;
2347    
2348     case OP_CRPLUS:
2349     case OP_CRMINPLUS:
2350     count = current_state->count; /* Already matched */
2351     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2352     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2353     break;
2354    
2355     case OP_CRQUERY:
2356     case OP_CRMINQUERY:
2357     ADD_ACTIVE(next_state_offset + 1, 0);
2358     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2359     break;
2360    
2361     case OP_CRRANGE:
2362     case OP_CRMINRANGE:
2363     count = current_state->count; /* Already matched */
2364     if (count >= GET2(ecode, 1))
2365     { ADD_ACTIVE(next_state_offset + 5, 0); }
2366     if (isinclass)
2367     {
2368 nigel 91 int max = GET2(ecode, 3);
2369     if (++count >= max && max != 0) /* Max 0 => no limit */
2370 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2371     else
2372     { ADD_NEW(state_offset, count); }
2373     }
2374     break;
2375    
2376     default:
2377     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2378     break;
2379     }
2380     }
2381     break;
2382    
2383     /* ========================================================================== */
2384     /* These are the opcodes for fancy brackets of various kinds. We have
2385 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2386     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2387 ph10 341 though the other "backtracking verbs" are not supported. */
2388 ph10 345
2389 ph10 341 case OP_FAIL:
2390 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2391 ph10 345 break;
2392 nigel 77
2393     case OP_ASSERT:
2394     case OP_ASSERT_NOT:
2395     case OP_ASSERTBACK:
2396     case OP_ASSERTBACK_NOT:
2397     {
2398     int rc;
2399     int local_offsets[2];
2400     int local_workspace[1000];
2401     const uschar *endasscode = code + GET(code, 1);
2402    
2403     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2404    
2405     rc = internal_dfa_exec(
2406     md, /* static match data */
2407     code, /* this subexpression's code */
2408     ptr, /* where we currently are */
2409     ptr - start_subject, /* start offset */
2410     local_offsets, /* offset vector */
2411     sizeof(local_offsets)/sizeof(int), /* size of same */
2412     local_workspace, /* workspace vector */
2413     sizeof(local_workspace)/sizeof(int), /* size of same */
2414     ims, /* the current ims flags */
2415     rlevel, /* function recursion level */
2416     recursing); /* pass on regex recursion */
2417 ph10 487
2418 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2419 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2420     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2421     }
2422     break;
2423    
2424     /*-----------------------------------------------------------------*/
2425     case OP_COND:
2426 nigel 93 case OP_SCOND:
2427 nigel 77 {
2428     int local_offsets[1000];
2429     int local_workspace[1000];
2430 ph10 406 int codelink = GET(code, 1);
2431 ph10 397 int condcode;
2432 ph10 406
2433 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2434 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2435 ph10 398 happen for the other conditions. */
2436 nigel 77
2437 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2438 ph10 406 {
2439     rrc = 0;
2440 ph10 397 if (pcre_callout != NULL)
2441     {
2442     pcre_callout_block cb;
2443     cb.version = 1; /* Version 1 of the callout block */
2444     cb.callout_number = code[LINK_SIZE+2];
2445     cb.offset_vector = offsets;
2446     cb.subject = (PCRE_SPTR)start_subject;
2447     cb.subject_length = end_subject - start_subject;
2448     cb.start_match = current_subject - start_subject;
2449     cb.current_position = ptr - start_subject;
2450     cb.pattern_position = GET(code, LINK_SIZE + 3);
2451     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2452     cb.capture_top = 1;
2453     cb.capture_last = -1;
2454     cb.callout_data = md->callout_data;
2455     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2456     }
2457 ph10 398 if (rrc > 0) break; /* Fail this thread */
2458     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2459 ph10 406 }
2460 ph10 398
2461 ph10 397 condcode = code[LINK_SIZE+1];
2462 ph10 406
2463 nigel 93 /* Back reference conditions are not supported */
2464 nigel 77
2465 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2466 ph10 459 return PCRE_ERROR_DFA_UCOND;
2467 nigel 93
2468     /* The DEFINE condition is always false */
2469    
2470     if (condcode == OP_DEF)
2471 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2472 nigel 93
2473     /* The only supported version of OP_RREF is for the value RREF_ANY,
2474     which means "test if in any recursion". We can't test for specifically
2475     recursed groups. */
2476    
2477 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2478 nigel 93 {
2479 nigel 77 int value = GET2(code, LINK_SIZE+2);
2480 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2481 ph10 406 if (recursing > 0)
2482 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2483     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2484 nigel 77 }
2485    
2486     /* Otherwise, the condition is an assertion */
2487    
2488     else
2489     {
2490     int rc;
2491     const uschar *asscode = code + LINK_SIZE + 1;
2492     const uschar *endasscode = asscode + GET(asscode, 1);
2493    
2494     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2495    
2496     rc = internal_dfa_exec(
2497     md, /* fixed match data */
2498     asscode, /* this subexpression's code */
2499     ptr, /* where we currently are */
2500     ptr - start_subject, /* start offset */
2501     local_offsets, /* offset vector */
2502     sizeof(local_offsets)/sizeof(int), /* size of same */
2503     local_workspace, /* workspace vector */
2504     sizeof(local_workspace)/sizeof(int), /* size of same */
2505     ims, /* the current ims flags */
2506     rlevel, /* function recursion level */
2507     recursing); /* pass on regex recursion */
2508    
2509 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2510 nigel 77 if ((rc >= 0) ==
2511     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2512 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2513 nigel 77 else
2514 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2515 nigel 77 }
2516     }
2517     break;
2518    
2519     /*-----------------------------------------------------------------*/
2520     case OP_RECURSE:
2521     {
2522     int local_offsets[1000];
2523     int local_workspace[1000];
2524     int rc;
2525    
2526     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2527     recursing + 1));
2528    
2529     rc = internal_dfa_exec(
2530     md, /* fixed match data */
2531     start_code + GET(code, 1), /* this subexpression's code */
2532     ptr, /* where we currently are */
2533     ptr - start_subject, /* start offset */
2534     local_offsets, /* offset vector */
2535     sizeof(local_offsets)/sizeof(int), /* size of same */
2536     local_workspace, /* workspace vector */
2537     sizeof(local_workspace)/sizeof(int), /* size of same */
2538     ims, /* the current ims flags */
2539     rlevel, /* function recursion level */
2540     recursing + 1); /* regex recurse level */
2541    
2542     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2543     recursing + 1, rc));
2544    
2545     /* Ran out of internal offsets */
2546    
2547     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2548    
2549     /* For each successful matched substring, set up the next state with a
2550     count of characters to skip before trying it. Note that the count is in
2551     characters, not bytes. */
2552    
2553     if (rc > 0)
2554     {
2555     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2556     {
2557     const uschar *p = start_subject + local_offsets[rc];
2558     const uschar *pp = start_subject + local_offsets[rc+1];
2559     int charcount = local_offsets[rc+1] - local_offsets[rc];
2560     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2561     if (charcount > 0)
2562     {
2563     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2564     }
2565     else
2566     {
2567     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2568     }
2569     }
2570     }
2571     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2572     }
2573     break;
2574    
2575     /*-----------------------------------------------------------------*/
2576     case OP_ONCE:
2577     {
2578     int local_offsets[2];
2579     int local_workspace[1000];
2580    
2581     int rc = internal_dfa_exec(
2582     md, /* fixed match data */
2583     code, /* this subexpression's code */
2584     ptr, /* where we currently are */
2585     ptr - start_subject, /* start offset */
2586     local_offsets, /* offset vector */
2587     sizeof(local_offsets)/sizeof(int), /* size of same */
2588     local_workspace, /* workspace vector */
2589     sizeof(local_workspace)/sizeof(int), /* size of same */
2590     ims, /* the current ims flags */
2591     rlevel, /* function recursion level */
2592     recursing); /* pass on regex recursion */
2593    
2594     if (rc >= 0)
2595     {
2596     const uschar *end_subpattern = code;
2597     int charcount = local_offsets[1] - local_offsets[0];
2598     int next_state_offset, repeat_state_offset;
2599    
2600     do { end_subpattern += GET(end_subpattern, 1); }
2601     while (*end_subpattern == OP_ALT);
2602     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2603    
2604     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2605     arrange for the repeat state also to be added to the relevant list.
2606     Calculate the offset, or set -1 for no repeat. */
2607    
2608     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2609     *end_subpattern == OP_KETRMIN)?
2610     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2611    
2612     /* If we have matched an empty string, add the next state at the
2613     current character pointer. This is important so that the duplicate
2614     checking kicks in, which is what breaks infinite loops that match an
2615     empty string. */
2616    
2617     if (charcount == 0)
2618     {
2619     ADD_ACTIVE(next_state_offset, 0);
2620     }
2621    
2622     /* Optimization: if there are no more active states, and there
2623     are no new states yet set up, then skip over the subject string
2624     right here, to save looping. Otherwise, set up the new state to swing
2625     into action when the end of the substring is reached. */
2626    
2627     else if (i + 1 >= active_count && new_count == 0)
2628     {
2629     ptr += charcount;
2630     clen = 0;
2631     ADD_NEW(next_state_offset, 0);
2632    
2633     /* If we are adding a repeat state at the new character position,
2634     we must fudge things so that it is the only current state.
2635     Otherwise, it might be a duplicate of one we processed before, and
2636     that would cause it to be skipped. */
2637    
2638     if (repeat_state_offset >= 0)
2639     {
2640     next_active_state = active_states;
2641     active_count = 0;
2642     i = -1;
2643     ADD_ACTIVE(repeat_state_offset, 0);
2644     }
2645     }
2646     else
2647     {
2648     const uschar *p = start_subject + local_offsets[0];
2649     const uschar *pp = start_subject + local_offsets[1];
2650     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2651     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2652     if (repeat_state_offset >= 0)
2653     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2654     }
2655    
2656     }
2657     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2658     }
2659     break;
2660    
2661    
2662     /* ========================================================================== */
2663     /* Handle callouts */
2664    
2665     case OP_CALLOUT:
2666 ph10 406 rrc = 0;
2667 nigel 77 if (pcre_callout != NULL)
2668     {
2669     pcre_callout_block cb;
2670     cb.version = 1; /* Version 1 of the callout block */
2671     cb.callout_number = code[1];
2672     cb.offset_vector = offsets;
2673 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2674 nigel 77 cb.subject_length = end_subject - start_subject;
2675     cb.start_match = current_subject - start_subject;
2676     cb.current_position = ptr - start_subject;
2677     cb.pattern_position = GET(code, 2);
2678     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2679     cb.capture_top = 1;
2680     cb.capture_last = -1;
2681     cb.callout_data = md->callout_data;
2682     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2683 ph10 406 }
2684     if (rrc == 0)
2685     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2686 nigel 77 break;
2687    
2688    
2689     /* ========================================================================== */
2690     default: /* Unsupported opcode */
2691     return PCRE_ERROR_DFA_UITEM;
2692     }
2693    
2694     NEXT_ACTIVE_STATE: continue;
2695    
2696     } /* End of loop scanning active states */
2697    
2698     /* We have finished the processing at the current subject character. If no
2699     new states have been set for the next character, we have found all the
2700     matches that we are going to find. If we are at the top level and partial
2701 ph10 463 matching has been requested, check for appropriate conditions.
2702    
2703 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2704     character. If it is equal to the original active_count (saved in
2705     workspace[1]) it means that (*F) was found on every active state. In this
2706 ph10 463 case we don't want to give a partial match.
2707 nigel 77
2708 ph10 463 The "could_continue" variable is true if a state could have continued but
2709     for the fact that the end of the subject was reached. */
2710    
2711 nigel 77 if (new_count <= 0)
2712     {
2713 ph10 427 if (rlevel == 1 && /* Top level, and */
2714 ph10 463 could_continue && /* Some could go on */
2715 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2716 ph10 427 ( /* either... */
2717     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2718     || /* or... */
2719     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2720     match_count < 0) /* no matches */
2721     ) && /* And... */
2722     ptr >= end_subject && /* Reached end of subject */
2723     ptr > current_subject) /* Matched non-empty string */
2724 nigel 77 {
2725     if (offsetcount >= 2)
2726     {
2727 ph10 435 offsets[0] = md->start_used_ptr - start_subject;
2728 nigel 77 offsets[1] = end_subject - start_subject;
2729     }
2730     match_count = PCRE_ERROR_PARTIAL;
2731     }
2732    
2733     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2734     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2735     rlevel*2-2, SP));
2736 nigel 91 break; /* In effect, "return", but see the comment below */
2737 nigel 77 }
2738    
2739     /* One or more states are active for the next character. */
2740    
2741     ptr += clen; /* Advance to next subject character */
2742     } /* Loop to move along the subject string */
2743    
2744 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2745     if we use "return" above, we have compiler trouble. Some compilers warn if
2746     there's nothing here because they think the function doesn't return a value. On
2747     the other hand, if we put a dummy statement here, some more clever compilers
2748     complain that it can't be reached. Sigh. */
2749 nigel 77
2750 nigel 91 return match_count;
2751 nigel 77 }
2752    
2753    
2754    
2755    
2756     /*************************************************
2757     * Execute a Regular Expression - DFA engine *
2758     *************************************************/
2759    
2760     /* This external function applies a compiled re to a subject string using a DFA
2761     engine. This function calls the internal function multiple times if the pattern
2762     is not anchored.
2763    
2764     Arguments:
2765     argument_re points to the compiled expression
2766 ph10 97 extra_data points to extra data or is NULL
2767 nigel 77 subject points to the subject string
2768     length length of subject string (may contain binary zeros)
2769     start_offset where to start in the subject string
2770     options option bits
2771     offsets vector of match offsets
2772     offsetcount size of same
2773     workspace workspace vector
2774     wscount size of same
2775    
2776     Returns: > 0 => number of match offset pairs placed in offsets
2777     = 0 => offsets overflowed; longest matches are present
2778     -1 => failed to match
2779     < -1 => some kind of unexpected problem
2780     */
2781    
2782 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2783 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2784     const char *subject, int length, int start_offset, int options, int *offsets,
2785     int offsetcount, int *workspace, int wscount)
2786     {
/* Overview: validate the arguments, fill in the fixed match block (md), work
out the newline convention and any first-byte / required-byte / start-bitmap
information, and then call internal_dfa_exec() at successive start positions
(the "bumpalong" loop) until it returns something other than
PCRE_ERROR_NOMATCH, the match is anchored, or the subject is exhausted. */
2787     real_pcre *re = (real_pcre *)argument_re;
2788     dfa_match_data match_block;
2789 nigel 91 dfa_match_data *md = &match_block;
2790 nigel 77 BOOL utf8, anchored, startline, firstline;
2791     const uschar *current_subject, *end_subject, *lcc;
2792    
2793     pcre_study_data internal_study;
2794     const pcre_study_data *study = NULL;
2795     real_pcre internal_re;
2796    
2797     const uschar *req_byte_ptr; /* where req_byte was last found by the search in the bumpalong loop */
2798     const uschar *start_bits = NULL;
2799     BOOL first_byte_caseless = FALSE;
2800     BOOL req_byte_caseless = FALSE;
2801     int first_byte = -1;
2802     int req_byte = -1;
2803     int req_byte2 = -1;
2804 nigel 91 int newline;
2805 nigel 77
2806     /* Plausibility checks */
2807    
2808     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2809     if (re == NULL || subject == NULL || workspace == NULL ||
2810     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2811     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2812     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; /* NOTE(review): the reason the minimum is 20 is not visible in this function */
2813    
2814     /* We need to find the pointer to any study data before we test for byte
2815     flipping, so we scan the extra_data block first. This may set two fields in the
2816     match block, so we must initialize them beforehand. However, the other fields
2817     in the match block must not be set until after the byte flipping. */
2818    
2819 nigel 91 md->tables = re->tables;
2820     md->callout_data = NULL;
2821 nigel 77
2822     if (extra_data != NULL)
2823     {
2824     unsigned int flags = extra_data->flags;
2825     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2826     study = (const pcre_study_data *)extra_data->study_data;
/* Match limits (plain or recursion) are rejected with an error here rather
than being silently ignored. */
2827     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2828 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2829     return PCRE_ERROR_DFA_UMLIMIT;
2830 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2831 nigel 91 md->callout_data = extra_data->callout_data;
2832 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2833 nigel 91 md->tables = extra_data->tables;
2834 nigel 77 }
2835 ph10 461
2836 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
2837     test for a regex that was compiled on a host of opposite endianness. If this is
2838     the case, flipped values are put in internal_re and internal_study if there was
2839     study data too. */
2840    
2841     if (re->magic_number != MAGIC_NUMBER)
2842     {
2843     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2844     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2845     if (study != NULL) study = &internal_study;
2846     }
2847    
2848     /* Set some local values */
2849    
2850     current_subject = (const unsigned char *)subject + start_offset;
2851     end_subject = (const unsigned char *)subject + length;
/* Start one byte before the first match position so that the first required-
byte search in the bumpalong loop (guarded by "p > req_byte_ptr") is always
carried out. */
2852     req_byte_ptr = current_subject - 1;
2853    
2854 nigel 91 #ifdef SUPPORT_UTF8
2855 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2856 nigel 91 #else
2857     utf8 = FALSE;
2858     #endif
2859 nigel 77
/* The match is anchored if PCRE_ANCHORED was given at compile time or at match
time, and also whenever this call is a restart (PCRE_DFA_RESTART). */
2860 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2861     (re->options & PCRE_ANCHORED) != 0;
2862    
2863 nigel 77 /* The remaining fixed data for passing around. */
2864    
/* The compiled code is located after the name table. The base address is the
caller's argument_re; re may by now point at internal_re (see the byte-flipping
test above) and is used here only for the name table offset, count and entry
size. */
2865 nigel 91 md->start_code = (const uschar *)argument_re +
2866 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2867 nigel 91 md->start_subject = (const unsigned char *)subject;
2868     md->end_subject = end_subject;
2869 ph10 442 md->start_offset = start_offset;
2870 nigel 91 md->moptions = options;
2871     md->poptions = re->options;
2872 nigel 77
2873 ph10 231 /* If the BSR option is not set at match time, copy what was set
2874     at compile time. */
2875    
2876     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2877     {
2878     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2879     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2880     #ifdef BSR_ANYCRLF
2881     else md->moptions |= PCRE_BSR_ANYCRLF;
2882 ph10 243 #endif
2883     }
2884 ph10 231
2885 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2886     nothing is set at run time, whatever was used at compile time applies. */
2887 nigel 91
2888 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2889 nigel 93 PCRE_NEWLINE_BITS)
2890 nigel 91 {
2891 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2892 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2893     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2894 nigel 91 case PCRE_NEWLINE_CR+
2895 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2896 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2897 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2898 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2899 nigel 91 }
2900    
/* Translate the newline code into the match block: -2 selects the ANYCRLF
type and any other negative value the ANY type. Otherwise the newline is fixed:
either one character, or two characters packed as (first << 8) | second, which
is why a value above 255 indicates a two-character sequence. */
2901 ph10 149 if (newline == -2)
2902 nigel 91 {
2903 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2904     }
2905     else if (newline < 0)
2906     {
2907 nigel 93 md->nltype = NLTYPE_ANY;
2908 nigel 91 }
2909     else
2910     {
2911 nigel 93 md->nltype = NLTYPE_FIXED;
2912     if (newline > 255)
2913     {
2914     md->nllen = 2;
2915     md->nl[0] = (newline >> 8) & 255;
2916     md->nl[1] = newline & 255;
2917     }
2918     else
2919     {
2920     md->nllen = 1;
2921     md->nl[0] = newline;
2922     }
2923 nigel 91 }
2924    
2925 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2926     back the character offset. */
2927    
2928     #ifdef SUPPORT_UTF8
2929     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2930     {
2931     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2932     return PCRE_ERROR_BADUTF8;
/* If the byte at start_offset has its top bit set, its top two bits must both
be set (0xc0). A byte of the form 10xxxxxx (0x80 after masking) is rejected
with PCRE_ERROR_BADUTF8_OFFSET. The test is skipped when start_offset is zero
or is not less than length. */
2933     if (start_offset > 0 && start_offset < length)
2934     {
2935     int tb = ((uschar *)subject)[start_offset];
2936     if (tb > 127)
2937     {
2938     tb &= 0xc0;
2939     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2940     }
2941     }
2942     }
2943     #endif
2944    
2945     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2946     is a feature that makes it possible to save compiled regex and re-use them
2947     in other programs later. */
2948    
2949 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2950 nigel 77
2951     /* The lower casing table and the "must be at the start of a line" flag are
2952     used in a loop when finding where to start. */
2953    
2954 nigel 91 lcc = md->tables + lcc_offset;
2955 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2956 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2957    
2958     /* Set up the first character to match, if available. The first_byte value is
2959     never set for an anchored regular expression, but the anchoring may be forced
2960     at run time, so we have to test for anchoring. The first char may be unset for
2961     an unanchored pattern, of course. If there's no first char and the pattern was
2962     studied, there may be a bitmap of possible first characters. */
2963    
2964     if (!anchored)
2965     {
2966 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2967 nigel 77 {
2968     first_byte = re->first_byte & 255;
2969     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2970     first_byte = lcc[first_byte];
2971     }
2972     else
2973     {
2974 ph10 455 if (!startline && study != NULL &&
2975     (study->flags & PCRE_STUDY_MAPPED) != 0)
2976 nigel 77 start_bits = study->start_bits;
2977     }
2978     }
2979    
2980     /* For anchored or unanchored matches, there may be a "last known required
2981     character" set. */
2982    
2983 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2984 nigel 77 {
2985     req_byte = re->req_byte & 255;
2986     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2987 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2988 nigel 77 }
2989    
2990     /* Call the main matching function, looping for a non-anchored regex after a
2991 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2992     a match. */
2993 nigel 77
2994     for (;;)
2995     {
2996     int rc;
2997    
2998     if ((options & PCRE_DFA_RESTART) == 0)
2999     {
3000     const uschar *save_end_subject = end_subject;
3001    
3002 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3003     line of a multiline string. Implement this by temporarily adjusting
3004     end_subject so that we stop scanning at a newline. If the match fails at
3005     the newline, later code breaks this loop. */
3006 nigel 77
3007     if (firstline)
3008     {
3009 ph10 365 USPTR t = current_subject;
3010     #ifdef SUPPORT_UTF8
3011     if (utf8)
3012 ph10 371 {
3013     while (t < md->end_subject && !IS_NEWLINE(t))
3014 ph10 365 {
3015     t++;
3016     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3017 ph10 371 }
3018 ph10 365 }
3019     else
3020 ph10 371 #endif
3021 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3022 nigel 77 end_subject = t;
3023     }
3024 ph10 392
3025 ph10 389 /* There are some optimizations that avoid running the match if a known
3026 ph10 455 starting point is not found. However, there is an option that disables
3027     these, for testing and for ensuring that all callouts do actually occur. */
3028 nigel 77
3029 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
3030 ph10 392 {
3031 ph10 389 /* Advance to a known first byte. */
3032 ph10 392
3033 ph10 389 if (first_byte >= 0)
3034 nigel 77 {
3035 ph10 389 if (first_byte_caseless)
3036     while (current_subject < end_subject &&
3037     lcc[*current_subject] != first_byte)
3038     current_subject++;
3039     else
3040 ph10 392 while (current_subject < end_subject &&
3041 ph10 389 *current_subject != first_byte)
3042     current_subject++;
3043     }
3044 ph10 392
3045 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3046 ph10 392
3047 ph10 389 else if (startline)
3048     {
/* Nothing is skipped on the first attempt (current_subject still equal to the
original start position); this test also guarantees that current_subject[-1],
read below, is not before the start position. */
3049     if (current_subject > md->start_subject + start_offset)
3050     {
3051 ph10 365 #ifdef SUPPORT_UTF8
3052 ph10 389 if (utf8)
3053 ph10 365 {
3054 ph10 392 while (current_subject < end_subject &&
3055 ph10 389 !WAS_NEWLINE(current_subject))
3056     {
3057 ph10 365 current_subject++;
3058 ph10 389 while(current_subject < end_subject &&
3059     (*current_subject & 0xc0) == 0x80)
3060     current_subject++;
3061     }
3062 ph10 371 }
3063 ph10 389 else
3064     #endif
3065     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3066     current_subject++;
3067 ph10 392
3068 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3069     ANYCRLF, and we are now at a LF, advance the match position by one
3070     more character. */
3071 ph10 392
3072 ph10 391 if (current_subject[-1] == CHAR_CR &&
3073 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3074     current_subject < end_subject &&
3075 ph10 391 *current_subject == CHAR_NL)
3076 ph10 389 current_subject++;
3077 ph10 365 }
3078 nigel 77 }
3079 ph10 392
3080 ph10 389 /* Or to a non-unique first char after study */
3081 ph10 392
3082 ph10 389 else if (start_bits != NULL)
3083 nigel 77 {
3084 ph10 389 while (current_subject < end_subject)
3085     {
3086     register unsigned int c = *current_subject;
/* Test bit (c & 7) of byte c/8 in the bitmap; if it is clear, skip this
subject byte, otherwise stop here. */
3087     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
3088     else break;
3089     }
3090 nigel 77 }
3091 ph10 392 }
3092 nigel 77
3093     /* Restore fudged end_subject */
3094    
3095     end_subject = save_end_subject;
3096    
3097 ph10 461 /* The following two optimizations are disabled for partial matching or if
3098     disabling is explicitly requested (and of course, by the test above, this
3099 ph10 455 code is not obeyed when restarting after a partial match). */
3100 ph10 461
3101 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3102     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3103 ph10 461 {
3104 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3105     is a lower bound; no actual string of that length may actually match the
3106     pattern. Although the value is, strictly, in characters, we treat it as
3107     bytes to avoid spending too much time in this optimization. */
3108 nigel 77
3109 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3110 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3111 ph10 455 return PCRE_ERROR_NOMATCH;
3112 ph10 461
3113 ph10 455 /* If req_byte is set, we know that that character must appear in the
3114     subject for the match to succeed. If the first character is set, req_byte
3115     must be later in the subject; otherwise the test starts at the match
3116     point. This optimization can save a huge amount of work in patterns with
3117     nested unlimited repeats that aren't going to match. Writing separate
3118     code for cased/caseless versions makes it go faster, as does using an
3119     autoincrement and backing off on a match.
3120 ph10 461
3121 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3122     can take a long time, and give bad performance on quite ordinary
3123     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3124     string... so we don't do this when the string is sufficiently long. */
3125 ph10 461
3126 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3127 nigel 77 {
3128 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3129 ph10 461
3130 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3131     place we found it at last time. */
3132 ph10 461
3133 ph10 455 if (p > req_byte_ptr)
3134 nigel 77 {
3135 ph10 455 if (req_byte_caseless)
3136     {
3137     while (p < end_subject)
3138     {
3139     register int pp = *p++;
3140     if (pp == req_byte || pp == req_byte2) { p--; break; }
3141     }
3142     }
3143     else
3144     {
3145     while (p < end_subject)
3146     {
3147     if (*p++ == req_byte) { p--; break; }
3148     }
3149     }
3150 ph10 461
3151 ph10 455 /* If we can't find the required character, break the matching loop,
3152     which will cause a return of PCRE_ERROR_NOMATCH. */
3153 ph10 461
3154 ph10 455 if (p >= end_subject) break;
3155 ph10 461
3156 ph10 455 /* If we have found the required character, save the point where we
3157     found it, so that we don't search again next time round the loop if
3158     the start hasn't passed this character yet. */
3159 ph10 461
3160 ph10 455 req_byte_ptr = p;
3161 nigel 77 }
3162 ph10 461 }
3163 nigel 77 }
3164 ph10 455 } /* End of optimizations that are done when not restarting */
3165 nigel 77
3166     /* OK, now we can do the business */
3167    
3168 ph10 435 md->start_used_ptr = current_subject;
3169 ph10 461
3170 nigel 77 rc = internal_dfa_exec(
3171 nigel 91 md, /* fixed match data */
3172     md->start_code, /* this subexpression's code */
3173     current_subject, /* where we currently are */
3174     start_offset, /* start offset in subject */
3175     offsets, /* offset vector */
3176     offsetcount, /* size of same */
3177     workspace, /* workspace vector */
3178     wscount, /* size of same */
3179 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3180 nigel 91 0, /* function recurse level */
3181     0); /* regex recurse level */
3182 nigel 77
3183     /* Anything other than "no match" means we are done, always; otherwise, carry
3184     on only if not anchored. */
3185    
3186     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3187    
3188     /* Advance to the next subject character unless we are at the end of a line
3189     and firstline is set. */
3190    
3191 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3192 nigel 77 current_subject++;
/* In UTF-8 mode, also step over any following bytes of the form 10xxxxxx so
that the next attempt does not start on one of them. */
3193     if (utf8)
3194     {
3195     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3196     current_subject++;
3197     }
/* The comparison is strictly "greater than", so one further attempt is still
made with current_subject equal to end_subject. */
3198     if (current_subject > end_subject) break;
3199    
3200 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3201 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3202     or ANY or ANYCRLF, advance the match position by one more character. */
3203 nigel 93
3204 ph10 391 if (current_subject[-1] == CHAR_CR &&
3205 ph10 226 current_subject < end_subject &&
3206 ph10 391 *current_subject == CHAR_NL &&
3207 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3208 ph10 226 (md->nltype == NLTYPE_ANY ||
3209     md->nltype == NLTYPE_ANYCRLF ||
3210     md->nllen == 2))
3211 nigel 93 current_subject++;
3212    
3213     } /* "Bumpalong" loop */
3214    
3215 nigel 77 return PCRE_ERROR_NOMATCH;
3216     }
3217    
3218     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12