/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 510 - (hide annotations) (download)
Sat Mar 27 17:45:29 2010 UTC (4 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 107433 byte(s)
Add support for *MARK and names for *PRUNE, *SKIP, *THEN.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 473 Copyright (c) 1997-2010 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300
102     #define OP_EXTUNI_EXTRA 320
103     #define OP_ANYNL_EXTRA 340
104     #define OP_HSPACE_EXTRA 360
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
116 ph10 327 static const uschar coptable[] = {
/* One entry per opcode, in opcode order. A non-zero entry is the offset from
the opcode at which the following "character" is found; internal_dfa_exec()
loads it with code[coptable[codevalue]] before dispatching on the opcode.
The entries with value 3 presumably step over a two-byte repeat count that
precedes the character - the opcode layout is not visible here; confirm
against pcre_internal.h. The table length is checked against OP_TABLE_LENGTH
by the dummy case labels at the top of the opcode switch. */
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
125     1, /* Char */
126     1, /* Charnc */
127     1, /* not */
128     /* Positive single-char repeats */
129     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130     3, 3, 3, /* upto, minupto, exact */
131 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
132 nigel 77 /* Negative single-char repeats - only for chars < 256 */
133     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
134     3, 3, 3, /* NOT upto, minupto, exact */
135 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
136 nigel 77 /* Positive type repeats */
137     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* Type upto, minupto, exact */
139 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
140 nigel 77 /* Character class & ref repeats */
141     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
142     0, 0, /* CRRANGE, CRMINRANGE */
143     0, /* CLASS */
144     0, /* NCLASS */
145     0, /* XCLASS - variable length */
146     0, /* REF */
147     0, /* RECURSE */
148     0, /* CALLOUT */
149     0, /* Alt */
150     0, /* Ket */
151     0, /* KetRmax */
152     0, /* KetRmin */
153     0, /* Assert */
154     0, /* Assert not */
155     0, /* Assert behind */
156     0, /* Assert behind not */
157     0, /* Reverse */
158 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
159     0, 0, 0, /* SBRA, SCBRA, SCOND */
160 ph10 498 0, 0, /* CREF, NCREF */
161     0, 0, /* RREF, NRREF */
162 nigel 93 0, /* DEF */
163 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
164 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
165     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
166     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
167 nigel 77 };
168    
169 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
170 ph10 462 remember the fact that a character could have been inspected when the end of
171 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
172     two tables that follow must also be modified. */
173 ph10 462
174     static const uschar poptable[] = {
/* One entry per opcode, in the same order as coptable. A value of 1 marks an
opcode that inspects a subject character. internal_dfa_exec() consults this
only when it is at the end of the subject (clen == 0): a non-zero entry sets
could_continue, which is remembered for the partial-match test. The table
length is checked against OP_TABLE_LENGTH by the dummy case labels at the
top of the opcode switch. */
175     0, /* End */
176 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
177 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
178     1, 1, 1, /* Any, AllAny, Anybyte */
179 ph10 498 1, 1, /* \P, \p */
180 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
181 ph10 498 1, /* \X */
182 ph10 462 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
183     1, /* Char */
184     1, /* Charnc */
185     1, /* not */
186     /* Positive single-char repeats */
187     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
188     1, 1, 1, /* upto, minupto, exact */
189     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
190     /* Negative single-char repeats - only for chars < 256 */
191     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
192     1, 1, 1, /* NOT upto, minupto, exact */
193     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
194     /* Positive type repeats */
195     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
196     1, 1, 1, /* Type upto, minupto, exact */
197     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
198     /* Character class & ref repeats */
199     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200     1, 1, /* CRRANGE, CRMINRANGE */
201     1, /* CLASS */
202     1, /* NCLASS */
203     1, /* XCLASS - variable length */
204     0, /* REF */
205     0, /* RECURSE */
206     0, /* CALLOUT */
207     0, /* Alt */
208     0, /* Ket */
209     0, /* KetRmax */
210     0, /* KetRmin */
211     0, /* Assert */
212     0, /* Assert not */
213     0, /* Assert behind */
214     0, /* Assert behind not */
215     0, /* Reverse */
216     0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
217     0, 0, 0, /* SBRA, SCBRA, SCOND */
218 ph10 498 0, 0, /* CREF, NCREF */
219     0, 0, /* RREF, NRREF */
220 ph10 462 0, /* DEF */
221     0, 0, /* BRAZERO, BRAMINZERO */
222 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
223     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
224     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
225 ph10 462 };
226    
227 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
228     and \w */
229    
230 ph10 327 static const uschar toptable1[] = {
/* First of the two character-type tables. Entries are presumably indexed by
opcode in the same order as the start of coptable (End, \A, \G, \K, \B, \b,
then \D, \d, \S, \s, \W, \w, then Any, AllAny) - the code that reads this
table is not in view here; confirm. Each negated/positive pair holds the
same ctype_xxx bit. */
231 ph10 168 0, 0, 0, 0, 0, 0,
232 nigel 77 ctype_digit, ctype_digit,
233     ctype_space, ctype_space,
234     ctype_word, ctype_word,
235 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
236 nigel 77 };
237    
238 ph10 327 static const uschar toptable2[] = {
/* Second of the two character-type tables, laid out like toptable1. Here
only the first entry of each pair carries the ctype_xxx bit and the second
is 0, so the pair members differ; presumably this value is combined with
the toptable1 result to separate the negated type from the positive one
(NOTE(review): the consuming expression is outside this view - confirm).
OP_ANY and OP_ALLANY are 1 here but 0 in toptable1. */
239 ph10 168 0, 0, 0, 0, 0, 0,
240 nigel 77 ctype_digit, 0,
241     ctype_space, 0,
242     ctype_word, 0,
243 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
244 nigel 77 };
245    
246    
247     /* Structure for holding data about a particular state, which is in effect the
248     current data for an active path through the match tree. It must consist
249     entirely of ints because the working vector we are passed, and which we put
250     these structures in, is a vector of ints. */
251    
/* One active matching state. Arrays of these are overlaid on the caller's
int workspace (starting at workspace + 2), which is why every member is an
int. */
252     typedef struct stateblock {
253     int offset; /* Offset to opcode from start_code; a negative value means "hold off going to the negated offset until data characters have been skipped" */
254     int count; /* Count for repeats; compared together with offset when checking for duplicate states */
255     int ims; /* ims flag bits; copied into ims when the state is processed */
256     int data; /* Some use extra data, e.g. the remaining skip count for a negative-offset state */
257     } stateblock;
258    
/* Number of ints occupied by one stateblock; used to convert the workspace
size, given in ints, into a number of stateblocks. */
259     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
260    
261    
262 ph10 475 #ifdef PCRE_DEBUG
263 nigel 77 /*************************************************
264     * Print character string *
265     *************************************************/
266    
267     /* Character string printing function for debugging.
268    
269     Arguments:
270     p points to string
271     length number of bytes
272     f where to print
273    
274     Returns: nothing
275     */
276    
/* Debug helper (compiled only under PCRE_DEBUG): writes exactly length bytes
starting at p to f. It is driven by the count, not by a terminating zero. */
277     static void
278     pchars(unsigned char *p, int length, FILE *f)
279     {
280     int c;
281     while (length-- > 0)
282     {
/* p points to unsigned char, so the value given to isprint() is never
negative. Printable bytes are written unchanged. */
283     if (isprint(c = *(p++)))
284     fprintf(f, "%c", c);
285     else
/* Everything else is written as a backslash, 'x', and two hex digits. */
286     fprintf(f, "\\x%02x", c);
287     }
288     }
289     #endif
290    
291    
292    
293     /*************************************************
294     * Execute a Regular Expression - DFA engine *
295     *************************************************/
296    
297     /* This internal function applies a compiled pattern to a subject string,
298     starting at a given point, using a DFA engine. This function is called from the
299     external one, possibly multiple times if the pattern is not anchored. The
300     function calls itself recursively for some kinds of subpattern.
301    
302     Arguments:
303     md the match_data block with fixed information
304     this_start_code the opening bracket of this subexpression's code
305     current_subject where we currently are in the subject string
306     start_offset start offset in the subject string
307     offsets vector to contain the matching string offsets
308     offsetcount size of same
309     workspace vector of workspace
310     wscount size of same
311     ims the current ims flags
312     rlevel function call recursion level
313     recursing regex recursive call level
314    
315 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
316 ph10 341 = 0 => offsets overflowed; longest matches are present
317 nigel 77 -1 => failed to match
318     < -1 => some kind of unexpected problem
319    
320     The following macros are used for adding states to the two state vectors (one
321     for the current character, one for the following character). */
322    
/* ADD_ACTIVE(x,y): append a state with offset x and count y to the active
list. The state's ims field is taken from the variable ims; its data field is
not written. The macro refers to active_count, wscount, next_active_state,
ims and rlevel, which must be in scope where it is used. If the list is
already full (active_count >= wscount) the enclosing function returns
PCRE_ERROR_DFA_WSSIZE; active_count is incremented either way. The expansion
is a bare if/else with no trailing semicolon, so each use must supply one.
x and y also appear as DPRINTF arguments (DPRINTF is defined elsewhere). */
323     #define ADD_ACTIVE(x,y) \
324     if (active_count++ < wscount) \
325     { \
326     next_active_state->offset = (x); \
327     next_active_state->count = (y); \
328     next_active_state->ims = ims; \
329     next_active_state++; \
330     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
331     } \
332     else return PCRE_ERROR_DFA_WSSIZE
333    
/* ADD_ACTIVE_DATA(x,y,z): append a state with offset x, count y and data z
to the active list; the ims field is taken from the variable ims. The macro
refers to active_count, wscount, next_active_state, ims and rlevel, which
must be in scope where it is used. If the list is already full
(active_count >= wscount) the enclosing function returns
PCRE_ERROR_DFA_WSSIZE; active_count is incremented either way. The expansion
is a bare if/else with no trailing semicolon, so each use must supply one. */
334     #define ADD_ACTIVE_DATA(x,y,z) \
335     if (active_count++ < wscount) \
336     { \
337     next_active_state->offset = (x); \
338     next_active_state->count = (y); \
339     next_active_state->ims = ims; \
340     next_active_state->data = (z); \
341     next_active_state++; \
342     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
343     } \
344     else return PCRE_ERROR_DFA_WSSIZE
345    
/* ADD_NEW(x,y): append a state with offset x and count y to the new list
(the states for the following character). The state's ims field is taken from
the variable ims; its data field is not written. The macro refers to
new_count, wscount, next_new_state, ims and rlevel, which must be in scope
where it is used. If the list is already full (new_count >= wscount) the
enclosing function returns PCRE_ERROR_DFA_WSSIZE; new_count is incremented
either way. The expansion is a bare if/else with no trailing semicolon, so
each use must supply one. */
346     #define ADD_NEW(x,y) \
347     if (new_count++ < wscount) \
348     { \
349     next_new_state->offset = (x); \
350     next_new_state->count = (y); \
351     next_new_state->ims = ims; \
352     next_new_state++; \
353     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
354     } \
355     else return PCRE_ERROR_DFA_WSSIZE
356    
/* ADD_NEW_DATA(x,y,z): append a state with offset x, count y and data z to
the new list (the states for the following character); the ims field is taken
from the variable ims. internal_dfa_exec() uses this with a negated offset
and a skip count in z for states that must be held back. The macro refers to
new_count, wscount, next_new_state, ims and rlevel, which must be in scope
where it is used. If the list is already full (new_count >= wscount) the
enclosing function returns PCRE_ERROR_DFA_WSSIZE; new_count is incremented
either way. The expansion is a bare if/else with no trailing semicolon, so
each use must supply one. */
357     #define ADD_NEW_DATA(x,y,z) \
358     if (new_count++ < wscount) \
359     { \
360     next_new_state->offset = (x); \
361     next_new_state->count = (y); \
362     next_new_state->ims = ims; \
363     next_new_state->data = (z); \
364     next_new_state++; \
365     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
366     } \
367     else return PCRE_ERROR_DFA_WSSIZE
368    
369     /* And now, here is the code */
370    
371     static int
372     internal_dfa_exec(
373     dfa_match_data *md,
374     const uschar *this_start_code,
375     const uschar *current_subject,
376     int start_offset,
377     int *offsets,
378     int offsetcount,
379     int *workspace,
380     int wscount,
381     int ims,
382     int rlevel,
383     int recursing)
384     {
385     stateblock *active_states, *new_states, *temp_states;
386     stateblock *next_active_state, *next_new_state;
387    
388     const uschar *ctypes, *lcc, *fcc;
389     const uschar *ptr;
390 nigel 93 const uschar *end_code, *first_op;
391 nigel 77
392     int active_count, new_count, match_count;
393    
394     /* Some fields in the md block are frequently referenced, so we load them into
395     independent variables in the hope that this will perform better. */
396    
397     const uschar *start_subject = md->start_subject;
398     const uschar *end_subject = md->end_subject;
399     const uschar *start_code = md->start_code;
400    
401 nigel 87 #ifdef SUPPORT_UTF8
402 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
403 nigel 93 #else
404     BOOL utf8 = FALSE;
405 nigel 87 #endif
406 nigel 77
407     rlevel++;
408     offsetcount &= (-2);
409    
410     wscount -= 2;
411     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
412     (2 * INTS_PER_STATEBLOCK);
413    
414     DPRINTF(("\n%.*s---------------------\n"
415     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
416     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
417    
418     ctypes = md->tables + ctypes_offset;
419     lcc = md->tables + lcc_offset;
420     fcc = md->tables + fcc_offset;
421    
422     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
423    
424     active_states = (stateblock *)(workspace + 2);
425     next_new_state = new_states = active_states + wscount;
426     new_count = 0;
427    
428 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
429     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
430    
431 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
432     the alternative states onto the list, and find out where the end is. This
433     makes it possible to use this function recursively, when we want to stop at a
434     matching internal ket rather than at the end.
435    
436     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
437     a backward assertion. In that case, we have to find out the maximum amount to
438     move back, and set up each alternative appropriately. */
439    
440 nigel 93 if (*first_op == OP_REVERSE)
441 nigel 77 {
442     int max_back = 0;
443     int gone_back;
444    
445     end_code = this_start_code;
446     do
447     {
448     int back = GET(end_code, 2+LINK_SIZE);
449     if (back > max_back) max_back = back;
450     end_code += GET(end_code, 1);
451     }
452     while (*end_code == OP_ALT);
453    
454     /* If we can't go back the amount required for the longest lookbehind
455     pattern, go back as far as we can; some alternatives may still be viable. */
456    
457     #ifdef SUPPORT_UTF8
458     /* In character mode we have to step back character by character */
459    
460     if (utf8)
461     {
462     for (gone_back = 0; gone_back < max_back; gone_back++)
463     {
464     if (current_subject <= start_subject) break;
465     current_subject--;
466     while (current_subject > start_subject &&
467     (*current_subject & 0xc0) == 0x80)
468     current_subject--;
469     }
470     }
471     else
472     #endif
473    
474     /* In byte-mode we can do this quickly. */
475    
476     {
477     gone_back = (current_subject - max_back < start_subject)?
478     current_subject - start_subject : max_back;
479     current_subject -= gone_back;
480     }
481 ph10 461
482 ph10 435 /* Save the earliest consulted character */
483 nigel 77
484 ph10 461 if (current_subject < md->start_used_ptr)
485     md->start_used_ptr = current_subject;
486    
487 nigel 77 /* Now we can process the individual branches. */
488    
489     end_code = this_start_code;
490     do
491     {
492     int back = GET(end_code, 2+LINK_SIZE);
493     if (back <= gone_back)
494     {
495     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
496     ADD_NEW_DATA(-bstate, 0, gone_back - back);
497     }
498     end_code += GET(end_code, 1);
499     }
500     while (*end_code == OP_ALT);
501     }
502    
503     /* This is the code for a "normal" subpattern (not a backward assertion). The
504     start of a whole pattern is always one of these. If we are at the top level,
505     we may be asked to restart matching from the same point that we reached for a
506     previous partial match. We still have to scan through the top-level branches to
507     find the end state. */
508    
509     else
510     {
511     end_code = this_start_code;
512    
513     /* Restarting */
514    
515     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
516     {
517     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
518     new_count = workspace[1];
519     if (!workspace[0])
520     memcpy(new_states, active_states, new_count * sizeof(stateblock));
521     }
522    
523     /* Not restarting */
524    
525     else
526     {
527 nigel 93 int length = 1 + LINK_SIZE +
528     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
529 nigel 77 do
530     {
531 nigel 93 ADD_NEW(end_code - start_code + length, 0);
532 nigel 77 end_code += GET(end_code, 1);
533 nigel 93 length = 1 + LINK_SIZE;
534 nigel 77 }
535     while (*end_code == OP_ALT);
536     }
537     }
538    
539     workspace[0] = 0; /* Bit indicating which vector is current */
540    
541     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
542    
543     /* Loop for scanning the subject */
544    
545     ptr = current_subject;
546     for (;;)
547     {
548     int i, j;
549 nigel 91 int clen, dlen;
550     unsigned int c, d;
551 ph10 428 int forced_fail = 0;
552 ph10 462 BOOL could_continue = FALSE;
553 nigel 77
554     /* Make the new state list into the active state list and empty the
555     new state list. */
556    
557     temp_states = active_states;
558     active_states = new_states;
559     new_states = temp_states;
560     active_count = new_count;
561     new_count = 0;
562    
563     workspace[0] ^= 1; /* Remember for the restarting feature */
564     workspace[1] = active_count;
565    
566 ph10 475 #ifdef PCRE_DEBUG
567 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
568     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
569     printf("\"\n");
570    
571     printf("%.*sActive states: ", rlevel*2-2, SP);
572     for (i = 0; i < active_count; i++)
573     printf("%d/%d ", active_states[i].offset, active_states[i].count);
574     printf("\n");
575     #endif
576    
577     /* Set the pointers for adding new states */
578    
579     next_active_state = active_states + active_count;
580     next_new_state = new_states;
581    
582     /* Load the current character from the subject outside the loop, as many
583     different states may want to look at it, and we assume that at least one
584     will. */
585    
586     if (ptr < end_subject)
587     {
588 nigel 93 clen = 1; /* Number of bytes in the character */
589 nigel 77 #ifdef SUPPORT_UTF8
590     if (utf8) { GETCHARLEN(c, ptr, clen); } else
591     #endif /* SUPPORT_UTF8 */
592     c = *ptr;
593     }
594     else
595     {
596 nigel 93 clen = 0; /* This indicates the end of the subject */
597     c = NOTACHAR; /* This value should never actually be used */
598 nigel 77 }
599    
600     /* Scan up the active states and act on each one. The result of an action
601     may be to add more states to the currently active list (e.g. on hitting a
602     parenthesis) or it may be to put states on the new list, for considering
603     when we move the character pointer on. */
604    
605     for (i = 0; i < active_count; i++)
606     {
607     stateblock *current_state = active_states + i;
608     const uschar *code;
609     int state_offset = current_state->offset;
610 ph10 397 int count, codevalue, rrc;
611 nigel 77
612 ph10 475 #ifdef PCRE_DEBUG
613 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
614 nigel 93 if (clen == 0) printf("EOL\n");
615 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
616     else printf("0x%02x\n", c);
617     #endif
618    
619     /* This variable is referred to implicitly in the ADD_xxx macros. */
620    
621     ims = current_state->ims;
622    
623     /* A negative offset is a special case meaning "hold off going to this
624     (negated) state until the number of characters in the data field have
625     been skipped". */
626    
627     if (state_offset < 0)
628     {
629     if (current_state->data > 0)
630     {
631     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
632     ADD_NEW_DATA(state_offset, current_state->count,
633     current_state->data - 1);
634     continue;
635     }
636     else
637     {
638     current_state->offset = state_offset = -state_offset;
639     }
640     }
641    
642 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
643 ph10 439 See the note at the head of this module about the possibility of improving
644     performance here. */
645 nigel 77
646     for (j = 0; j < i; j++)
647     {
648     if (active_states[j].offset == state_offset &&
649     active_states[j].count == current_state->count)
650     {
651     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
652     goto NEXT_ACTIVE_STATE;
653     }
654     }
655    
656     /* The state offset is the offset to the opcode */
657    
658     code = start_code + state_offset;
659     codevalue = *code;
660    
661 ph10 463 /* If this opcode inspects a character, but we are at the end of the
662     subject, remember the fact for use when testing for a partial match. */
663    
664 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
665 ph10 463 could_continue = TRUE;
666 ph10 462
667 nigel 77 /* If this opcode is followed by an inline character, load it. It is
668     tempting to test for the presence of a subject character here, but that
669     is wrong, because sometimes zero repetitions of the subject are
670     permitted.
671    
672     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
673 ph10 178 argument that is not a data character - but is always one byte long. We
674     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
675     this case. To keep the other cases fast, convert these ones to new opcodes.
676     */
677 nigel 77
678     if (coptable[codevalue] > 0)
679     {
680     dlen = 1;
681     #ifdef SUPPORT_UTF8
682     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
683     #endif /* SUPPORT_UTF8 */
684     d = code[coptable[codevalue]];
685     if (codevalue >= OP_TYPESTAR)
686     {
687 nigel 93 switch(d)
688     {
689     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
690     case OP_NOTPROP:
691     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
692     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
693     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
694 ph10 178 case OP_NOT_HSPACE:
695 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
696 ph10 178 case OP_NOT_VSPACE:
697 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
698 nigel 93 default: break;
699     }
700 nigel 77 }
701     }
702     else
703     {
704     dlen = 0; /* Not strictly necessary, but compilers moan */
705 nigel 93 d = NOTACHAR; /* if these variables are not set. */
706 nigel 77 }
707    
708    
709     /* Now process the individual opcodes */
710    
711     switch (codevalue)
712     {
713 ph10 498 /* ========================================================================== */
714     /* These cases are never obeyed. This is a fudge that causes a compile-
715     time error if the vectors coptable or poptable, which are indexed by
716     opcode, are not the correct length. It seems to be the only way to do
717     such a check at compile time, as the sizeof() operator does not work
718     in the C preprocessor. */
719 ph10 507
720 ph10 498 case OP_TABLE_LENGTH:
721 ph10 507 case OP_TABLE_LENGTH +
722 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
723     (sizeof(poptable) == OP_TABLE_LENGTH)):
724 ph10 507 break;
725 nigel 77
726     /* ========================================================================== */
727     /* Reached a closing bracket. If not at the end of the pattern, carry
728     on with the next opcode. Otherwise, unless we have an empty string and
729 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
730 ph10 442 start of the subject, save the match data, shifting up all previous
731 nigel 77 matches so we always have the longest first. */
732    
733     case OP_KET:
734     case OP_KETRMIN:
735     case OP_KETRMAX:
736     if (code != end_code)
737     {
738     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
739     if (codevalue != OP_KET)
740     {
741     ADD_ACTIVE(state_offset - GET(code, 1), 0);
742     }
743     }
744 ph10 461 else
745 nigel 77 {
746 ph10 461 if (ptr > current_subject ||
747 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
748     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
749     current_subject > start_subject + md->start_offset)))
750 nigel 77 {
751 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
752     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
753     match_count = 0;
754     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
755     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
756     if (offsetcount >= 2)
757     {
758     offsets[0] = current_subject - start_subject;
759     offsets[1] = ptr - start_subject;
760     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
761     offsets[1] - offsets[0], current_subject));
762     }
763     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
764     {
765     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
766     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
767     match_count, rlevel*2-2, SP));
768     return match_count;
769     }
770 ph10 461 }
771 nigel 77 }
772     break;
773    
774     /* ========================================================================== */
775     /* These opcodes add to the current list of states without looking
776     at the current character. */
777    
778     /*-----------------------------------------------------------------*/
779     case OP_ALT:
780     do { code += GET(code, 1); } while (*code == OP_ALT);
781     ADD_ACTIVE(code - start_code, 0);
782     break;
783    
784     /*-----------------------------------------------------------------*/
785     case OP_BRA:
786 nigel 93 case OP_SBRA:
787 nigel 77 do
788     {
789     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
790     code += GET(code, 1);
791     }
792     while (*code == OP_ALT);
793     break;
794    
795     /*-----------------------------------------------------------------*/
796 nigel 93 case OP_CBRA:
797     case OP_SCBRA:
798     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
799     code += GET(code, 1);
800     while (*code == OP_ALT)
801     {
802     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
803     code += GET(code, 1);
804     }
805     break;
806    
807     /*-----------------------------------------------------------------*/
808 nigel 77 case OP_BRAZERO:
809     case OP_BRAMINZERO:
810     ADD_ACTIVE(state_offset + 1, 0);
811     code += 1 + GET(code, 2);
812     while (*code == OP_ALT) code += GET(code, 1);
813     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
814     break;
815    
816     /*-----------------------------------------------------------------*/
817 ph10 335 case OP_SKIPZERO:
818     code += 1 + GET(code, 2);
819     while (*code == OP_ALT) code += GET(code, 1);
820     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
821     break;
822    
823     /*-----------------------------------------------------------------*/
824 nigel 77 case OP_CIRC:
825     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
826 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
827     ptr != end_subject &&
828 nigel 93 WAS_NEWLINE(ptr)))
829 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
830     break;
831    
832     /*-----------------------------------------------------------------*/
833     case OP_EOD:
834     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
835     break;
836    
837     /*-----------------------------------------------------------------*/
838     case OP_OPT:
839     ims = code[1];
840     ADD_ACTIVE(state_offset + 2, 0);
841     break;
842    
843     /*-----------------------------------------------------------------*/
844     case OP_SOD:
845     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
846     break;
847    
848     /*-----------------------------------------------------------------*/
849     case OP_SOM:
850     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
851     break;
852    
853    
854     /* ========================================================================== */
855     /* These opcodes inspect the next subject character, and sometimes
856     the previous one as well, but do not have an argument. The variable
857     clen contains the length of the current character and is zero if we are
858     at the end of the subject. */
859    
860     /*-----------------------------------------------------------------*/
861     case OP_ANY:
862 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
863 nigel 77 { ADD_NEW(state_offset + 1, 0); }
864     break;
865    
866     /*-----------------------------------------------------------------*/
867 ph10 341 case OP_ALLANY:
868     if (clen > 0)
869     { ADD_NEW(state_offset + 1, 0); }
870     break;
871    
872     /*-----------------------------------------------------------------*/
873 nigel 77 case OP_EODN:
874 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
875 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
876     break;
877    
878     /*-----------------------------------------------------------------*/
879     case OP_DOLL:
880     if ((md->moptions & PCRE_NOTEOL) == 0)
881     {
882 nigel 91 if (clen == 0 ||
883 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
884 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
885     ))
886 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
887     }
888 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
889 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
890     break;
891    
892     /*-----------------------------------------------------------------*/
893    
894     case OP_DIGIT:
895     case OP_WHITESPACE:
896     case OP_WORDCHAR:
897     if (clen > 0 && c < 256 &&
898     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
899     { ADD_NEW(state_offset + 1, 0); }
900     break;
901    
902     /*-----------------------------------------------------------------*/
903     case OP_NOT_DIGIT:
904     case OP_NOT_WHITESPACE:
905     case OP_NOT_WORDCHAR:
906     if (clen > 0 && (c >= 256 ||
907     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
908     { ADD_NEW(state_offset + 1, 0); }
909     break;
910    
911     /*-----------------------------------------------------------------*/
912     case OP_WORD_BOUNDARY:
913     case OP_NOT_WORD_BOUNDARY:
914     {
915     int left_word, right_word;
916    
917     if (ptr > start_subject)
918     {
919     const uschar *temp = ptr - 1;
920 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
921 nigel 77 #ifdef SUPPORT_UTF8
922     if (utf8) BACKCHAR(temp);
923     #endif
924     GETCHARTEST(d, temp);
925     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
926     }
927     else left_word = 0;
928    
929 ph10 461 if (clen > 0)
930 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
931 ph10 463 else right_word = 0;
932 nigel 77
933     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
934     { ADD_ACTIVE(state_offset + 1, 0); }
935     }
936     break;
937    
938    
939     /*-----------------------------------------------------------------*/
940     /* Check the next character by Unicode property. We will get here only
941     if the support is in the binary; otherwise a compile-time error occurs.
942     */
943    
944 ph10 151 #ifdef SUPPORT_UCP
945 nigel 77 case OP_PROP:
946     case OP_NOTPROP:
947     if (clen > 0)
948     {
949 nigel 87 BOOL OK;
950 ph10 349 const ucd_record * prop = GET_UCD(c);
951 nigel 87 switch(code[1])
952 nigel 77 {
953 nigel 87 case PT_ANY:
954     OK = TRUE;
955     break;
956    
957     case PT_LAMP:
958 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
959 nigel 87 break;
960    
961     case PT_GC:
962 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
963 nigel 87 break;
964    
965     case PT_PC:
966 ph10 349 OK = prop->chartype == code[2];
967 nigel 87 break;
968    
969     case PT_SC:
970 ph10 349 OK = prop->script == code[2];
971 nigel 87 break;
972    
973     /* Should never occur, but keep compilers from grumbling. */
974    
975     default:
976     OK = codevalue != OP_PROP;
977     break;
978 nigel 77 }
979 nigel 87
980     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
981 nigel 77 }
982     break;
983     #endif
984    
985    
986    
987     /* ========================================================================== */
988     /* These opcodes likewise inspect the subject character, but have an
989     argument that is not a data character. It is one of these opcodes:
990 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
991     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
992 nigel 77
993     case OP_TYPEPLUS:
994     case OP_TYPEMINPLUS:
995 nigel 93 case OP_TYPEPOSPLUS:
996 nigel 77 count = current_state->count; /* Already matched */
997     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
998     if (clen > 0)
999     {
1000     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1001     (c < 256 &&
1002 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1003 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1004     {
1005 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1006     {
1007     active_count--; /* Remove non-match possibility */
1008     next_active_state--;
1009     }
1010 nigel 77 count++;
1011     ADD_NEW(state_offset, count);
1012     }
1013     }
1014     break;
1015    
1016     /*-----------------------------------------------------------------*/
1017     case OP_TYPEQUERY:
1018     case OP_TYPEMINQUERY:
1019 nigel 93 case OP_TYPEPOSQUERY:
1020 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1021     if (clen > 0)
1022     {
1023     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1024     (c < 256 &&
1025 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1026 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1027     {
1028 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1029     {
1030     active_count--; /* Remove non-match possibility */
1031     next_active_state--;
1032     }
1033 nigel 77 ADD_NEW(state_offset + 2, 0);
1034     }
1035     }
1036     break;
1037    
1038     /*-----------------------------------------------------------------*/
1039     case OP_TYPESTAR:
1040     case OP_TYPEMINSTAR:
1041 nigel 93 case OP_TYPEPOSSTAR:
1042 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1043     if (clen > 0)
1044     {
1045     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1046     (c < 256 &&
1047 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1048 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1049     {
1050 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1051     {
1052     active_count--; /* Remove non-match possibility */
1053     next_active_state--;
1054     }
1055 nigel 77 ADD_NEW(state_offset, 0);
1056     }
1057     }
1058     break;
1059    
1060     /*-----------------------------------------------------------------*/
1061     case OP_TYPEEXACT:
1062 nigel 93 count = current_state->count; /* Number already matched */
1063     if (clen > 0)
1064     {
1065     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1066     (c < 256 &&
1067 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1068 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1069     {
1070     if (++count >= GET2(code, 1))
1071     { ADD_NEW(state_offset + 4, 0); }
1072     else
1073     { ADD_NEW(state_offset, count); }
1074     }
1075     }
1076     break;
1077    
1078     /*-----------------------------------------------------------------*/
1079 nigel 77 case OP_TYPEUPTO:
1080     case OP_TYPEMINUPTO:
1081 nigel 93 case OP_TYPEPOSUPTO:
1082     ADD_ACTIVE(state_offset + 4, 0);
1083 nigel 77 count = current_state->count; /* Number already matched */
1084     if (clen > 0)
1085     {
1086     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1087     (c < 256 &&
1088 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1089 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1090     {
1091 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1092     {
1093     active_count--; /* Remove non-match possibility */
1094     next_active_state--;
1095     }
1096 nigel 77 if (++count >= GET2(code, 1))
1097     { ADD_NEW(state_offset + 4, 0); }
1098     else
1099     { ADD_NEW(state_offset, count); }
1100     }
1101     }
1102     break;
1103    
1104     /* ========================================================================== */
1105     /* These are virtual opcodes that are used when something like
1106 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical or
1107     horizontal space type (e.g. OP_VSPACE, OP_HSPACE) as its argument. It keeps
1108     the code above fast for the other cases. The argument is in the d variable. */
1109 nigel 77
1110 ph10 151 #ifdef SUPPORT_UCP
1111 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1112     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1113 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1114 nigel 77 count = current_state->count; /* Already matched */
1115 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1116 nigel 77 if (clen > 0)
1117     {
1118 nigel 87 BOOL OK;
1119 ph10 349 const ucd_record * prop = GET_UCD(c);
1120 nigel 87 switch(code[2])
1121     {
1122     case PT_ANY:
1123     OK = TRUE;
1124     break;
1125    
1126     case PT_LAMP:
1127 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1128 nigel 87 break;
1129    
1130     case PT_GC:
1131 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1132 nigel 87 break;
1133    
1134     case PT_PC:
1135 ph10 349 OK = prop->chartype == code[3];
1136 nigel 87 break;
1137    
1138     case PT_SC:
1139 ph10 349 OK = prop->script == code[3];
1140 nigel 87 break;
1141    
1142     /* Should never occur, but keep compilers from grumbling. */
1143    
1144     default:
1145     OK = codevalue != OP_PROP;
1146     break;
1147     }
1148    
1149 nigel 93 if (OK == (d == OP_PROP))
1150     {
1151     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1152     {
1153     active_count--; /* Remove non-match possibility */
1154     next_active_state--;
1155     }
1156     count++;
1157     ADD_NEW(state_offset, count);
1158     }
1159 nigel 77 }
1160     break;
1161    
1162     /*-----------------------------------------------------------------*/
1163     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1164     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1165 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1166 nigel 77 count = current_state->count; /* Already matched */
1167     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1168 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1169 nigel 77 {
1170     const uschar *nptr = ptr + clen;
1171     int ncount = 0;
1172 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1173     {
1174     active_count--; /* Remove non-match possibility */
1175     next_active_state--;
1176     }
1177 nigel 77 while (nptr < end_subject)
1178     {
1179     int nd;
1180     int ndlen = 1;
1181     GETCHARLEN(nd, nptr, ndlen);
1182 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1183 nigel 77 ncount++;
1184     nptr += ndlen;
1185     }
1186     count++;
1187     ADD_NEW_DATA(-state_offset, count, ncount);
1188     }
1189     break;
1190 ph10 151 #endif
1191 nigel 77
1192     /*-----------------------------------------------------------------*/
1193 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1194     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1195     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1196     count = current_state->count; /* Already matched */
1197     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1198     if (clen > 0)
1199     {
1200     int ncount = 0;
1201     switch (c)
1202     {
1203     case 0x000b:
1204     case 0x000c:
1205     case 0x0085:
1206     case 0x2028:
1207     case 0x2029:
1208 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1209     goto ANYNL01;
1210    
1211     case 0x000d:
1212     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1213     /* Fall through */
1214    
1215     ANYNL01:
1216     case 0x000a:
1217 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1218     {
1219     active_count--; /* Remove non-match possibility */
1220     next_active_state--;
1221     }
1222     count++;
1223     ADD_NEW_DATA(-state_offset, count, ncount);
1224     break;
1225 ph10 231
1226 nigel 93 default:
1227     break;
1228     }
1229     }
1230     break;
1231    
1232     /*-----------------------------------------------------------------*/
1233 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1234     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1235     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1236     count = current_state->count; /* Already matched */
1237     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1238     if (clen > 0)
1239     {
1240 ph10 182 BOOL OK;
1241 ph10 178 switch (c)
1242     {
1243     case 0x000a:
1244     case 0x000b:
1245     case 0x000c:
1246     case 0x000d:
1247     case 0x0085:
1248     case 0x2028:
1249     case 0x2029:
1250     OK = TRUE;
1251 ph10 182 break;
1252 ph10 178
1253     default:
1254     OK = FALSE;
1255 ph10 182 break;
1256 ph10 178 }
1257    
1258     if (OK == (d == OP_VSPACE))
1259 ph10 182 {
1260 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1261     {
1262     active_count--; /* Remove non-match possibility */
1263     next_active_state--;
1264     }
1265     count++;
1266     ADD_NEW_DATA(-state_offset, count, 0);
1267     }
1268     }
1269     break;
1270    
1271     /*-----------------------------------------------------------------*/
1272     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1273     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1274     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1275     count = current_state->count; /* Already matched */
1276     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1277     if (clen > 0)
1278     {
1279 ph10 182 BOOL OK;
1280 ph10 178 switch (c)
1281     {
1282     case 0x09: /* HT */
1283     case 0x20: /* SPACE */
1284     case 0xa0: /* NBSP */
1285     case 0x1680: /* OGHAM SPACE MARK */
1286     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1287     case 0x2000: /* EN QUAD */
1288     case 0x2001: /* EM QUAD */
1289     case 0x2002: /* EN SPACE */
1290     case 0x2003: /* EM SPACE */
1291     case 0x2004: /* THREE-PER-EM SPACE */
1292     case 0x2005: /* FOUR-PER-EM SPACE */
1293     case 0x2006: /* SIX-PER-EM SPACE */
1294     case 0x2007: /* FIGURE SPACE */
1295     case 0x2008: /* PUNCTUATION SPACE */
1296     case 0x2009: /* THIN SPACE */
1297     case 0x200A: /* HAIR SPACE */
1298     case 0x202f: /* NARROW NO-BREAK SPACE */
1299     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1300     case 0x3000: /* IDEOGRAPHIC SPACE */
1301     OK = TRUE;
1302     break;
1303 ph10 182
1304 ph10 178 default:
1305     OK = FALSE;
1306     break;
1307     }
1308 ph10 182
1309 ph10 178 if (OK == (d == OP_HSPACE))
1310 ph10 182 {
1311 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1312     {
1313     active_count--; /* Remove non-match possibility */
1314     next_active_state--;
1315     }
1316     count++;
1317     ADD_NEW_DATA(-state_offset, count, 0);
1318     }
1319     }
1320     break;
1321    
1322     /*-----------------------------------------------------------------*/
1323 ph10 151 #ifdef SUPPORT_UCP
1324 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1325     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1326 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1327 nigel 87 count = 4;
1328 nigel 77 goto QS1;
1329    
1330     case OP_PROP_EXTRA + OP_TYPESTAR:
1331     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1332 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1333 nigel 77 count = 0;
1334    
1335     QS1:
1336    
1337 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1338 nigel 77 if (clen > 0)
1339     {
1340 nigel 87 BOOL OK;
1341 ph10 349 const ucd_record * prop = GET_UCD(c);
1342 nigel 87 switch(code[2])
1343     {
1344     case PT_ANY:
1345     OK = TRUE;
1346     break;
1347    
1348     case PT_LAMP:
1349 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1350 nigel 87 break;
1351    
1352     case PT_GC:
1353 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1354 nigel 87 break;
1355    
1356     case PT_PC:
1357 ph10 349 OK = prop->chartype == code[3];
1358 nigel 87 break;
1359    
1360     case PT_SC:
1361 ph10 349 OK = prop->script == code[3];
1362 nigel 87 break;
1363    
1364     /* Should never occur, but keep compilers from grumbling. */
1365    
1366     default:
1367     OK = codevalue != OP_PROP;
1368     break;
1369     }
1370    
1371 nigel 93 if (OK == (d == OP_PROP))
1372     {
1373     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1374     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1375     {
1376     active_count--; /* Remove non-match possibility */
1377     next_active_state--;
1378     }
1379     ADD_NEW(state_offset + count, 0);
1380     }
1381 nigel 77 }
1382     break;
1383    
1384     /*-----------------------------------------------------------------*/
1385     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1386     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1387 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1388 nigel 77 count = 2;
1389     goto QS2;
1390    
1391     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1392     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1393 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1394 nigel 77 count = 0;
1395    
1396     QS2:
1397    
1398     ADD_ACTIVE(state_offset + 2, 0);
1399 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1400 nigel 77 {
1401     const uschar *nptr = ptr + clen;
1402     int ncount = 0;
1403 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1404     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1405     {
1406     active_count--; /* Remove non-match possibility */
1407     next_active_state--;
1408     }
1409 nigel 77 while (nptr < end_subject)
1410     {
1411     int nd;
1412     int ndlen = 1;
1413     GETCHARLEN(nd, nptr, ndlen);
1414 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1415 nigel 77 ncount++;
1416     nptr += ndlen;
1417     }
1418     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1419     }
1420     break;
1421 ph10 151 #endif
1422 nigel 77
1423     /*-----------------------------------------------------------------*/
1424 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1425     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1426     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1427     count = 2;
1428     goto QS3;
1429    
1430     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1431     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1432     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1433     count = 0;
1434    
1435     QS3:
1436     ADD_ACTIVE(state_offset + 2, 0);
1437     if (clen > 0)
1438     {
1439     int ncount = 0;
1440     switch (c)
1441     {
1442     case 0x000b:
1443     case 0x000c:
1444     case 0x0085:
1445     case 0x2028:
1446     case 0x2029:
1447 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1448     goto ANYNL02;
1449    
1450     case 0x000d:
1451     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1452     /* Fall through */
1453    
1454     ANYNL02:
1455     case 0x000a:
1456 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1457     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1458     {
1459     active_count--; /* Remove non-match possibility */
1460     next_active_state--;
1461     }
1462     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1463     break;
1464 ph10 231
1465 nigel 93 default:
1466     break;
1467     }
1468     }
1469     break;
1470    
1471     /*-----------------------------------------------------------------*/
1472 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1473     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1474     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1475     count = 2;
1476     goto QS4;
1477    
1478     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1479     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1480     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1481     count = 0;
1482    
1483     QS4:
1484     ADD_ACTIVE(state_offset + 2, 0);
1485     if (clen > 0)
1486     {
1487 ph10 182 BOOL OK;
1488 ph10 178 switch (c)
1489     {
1490     case 0x000a:
1491     case 0x000b:
1492     case 0x000c:
1493     case 0x000d:
1494     case 0x0085:
1495     case 0x2028:
1496     case 0x2029:
1497     OK = TRUE;
1498     break;
1499 ph10 182
1500 ph10 178 default:
1501     OK = FALSE;
1502     break;
1503     }
1504     if (OK == (d == OP_VSPACE))
1505 ph10 182 {
1506 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1507     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1508     {
1509     active_count--; /* Remove non-match possibility */
1510     next_active_state--;
1511     }
1512     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1513     }
1514     }
1515     break;
1516    
1517     /*-----------------------------------------------------------------*/
1518     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1519     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1520     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1521     count = 2;
1522     goto QS5;
1523    
1524     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1525     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1526     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1527     count = 0;
1528    
1529     QS5:
1530     ADD_ACTIVE(state_offset + 2, 0);
1531     if (clen > 0)
1532     {
1533 ph10 182 BOOL OK;
1534 ph10 178 switch (c)
1535     {
1536     case 0x09: /* HT */
1537     case 0x20: /* SPACE */
1538     case 0xa0: /* NBSP */
1539     case 0x1680: /* OGHAM SPACE MARK */
1540     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1541     case 0x2000: /* EN QUAD */
1542     case 0x2001: /* EM QUAD */
1543     case 0x2002: /* EN SPACE */
1544     case 0x2003: /* EM SPACE */
1545     case 0x2004: /* THREE-PER-EM SPACE */
1546     case 0x2005: /* FOUR-PER-EM SPACE */
1547     case 0x2006: /* SIX-PER-EM SPACE */
1548     case 0x2007: /* FIGURE SPACE */
1549     case 0x2008: /* PUNCTUATION SPACE */
1550     case 0x2009: /* THIN SPACE */
1551     case 0x200A: /* HAIR SPACE */
1552     case 0x202f: /* NARROW NO-BREAK SPACE */
1553     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1554     case 0x3000: /* IDEOGRAPHIC SPACE */
1555     OK = TRUE;
1556     break;
1557 ph10 182
1558 ph10 178 default:
1559     OK = FALSE;
1560     break;
1561     }
1562 ph10 182
1563 ph10 178 if (OK == (d == OP_HSPACE))
1564 ph10 182 {
1565 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1566     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1567     {
1568     active_count--; /* Remove non-match possibility */
1569     next_active_state--;
1570     }
1571     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1572     }
1573     }
1574     break;
1575    
1576     /*-----------------------------------------------------------------*/
1577 ph10 151 #ifdef SUPPORT_UCP
1578 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1579     case OP_PROP_EXTRA + OP_TYPEUPTO:
1580     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1581 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1582 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1583 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1584 nigel 77 count = current_state->count; /* Number already matched */
1585     if (clen > 0)
1586     {
1587 nigel 87 BOOL OK;
1588 ph10 349 const ucd_record * prop = GET_UCD(c);
1589 nigel 87 switch(code[4])
1590 nigel 77 {
1591 nigel 87 case PT_ANY:
1592     OK = TRUE;
1593     break;
1594    
1595     case PT_LAMP:
1596 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1597 nigel 87 break;
1598    
1599     case PT_GC:
1600 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1601 nigel 87 break;
1602    
1603     case PT_PC:
1604 ph10 349 OK = prop->chartype == code[5];
1605 nigel 87 break;
1606    
1607     case PT_SC:
1608 ph10 349 OK = prop->script == code[5];
1609 nigel 87 break;
1610    
1611     /* Should never occur, but keep compilers from grumbling. */
1612    
1613     default:
1614     OK = codevalue != OP_PROP;
1615     break;
1616     }
1617    
1618     if (OK == (d == OP_PROP))
1619     {
1620 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1621     {
1622     active_count--; /* Remove non-match possibility */
1623     next_active_state--;
1624     }
1625 nigel 77 if (++count >= GET2(code, 1))
1626 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1627 nigel 77 else
1628     { ADD_NEW(state_offset, count); }
1629     }
1630     }
1631     break;
1632    
1633     /*-----------------------------------------------------------------*/
1634     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1635     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1636     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1637 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1638 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1639     { ADD_ACTIVE(state_offset + 4, 0); }
1640     count = current_state->count; /* Number already matched */
1641 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1642 nigel 77 {
1643     const uschar *nptr = ptr + clen;
1644     int ncount = 0;
1645 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1646     {
1647     active_count--; /* Remove non-match possibility */
1648     next_active_state--;
1649     }
1650 nigel 77 while (nptr < end_subject)
1651     {
1652     int nd;
1653     int ndlen = 1;
1654     GETCHARLEN(nd, nptr, ndlen);
1655 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1656 nigel 77 ncount++;
1657     nptr += ndlen;
1658     }
1659     if (++count >= GET2(code, 1))
1660     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1661     else
1662     { ADD_NEW_DATA(-state_offset, count, ncount); }
1663     }
1664     break;
1665 ph10 151 #endif
1666 nigel 77
1667 nigel 93 /*-----------------------------------------------------------------*/
1668     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1669     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1670     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1671     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1672     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1673     { ADD_ACTIVE(state_offset + 4, 0); }
1674     count = current_state->count; /* Number already matched */
1675     if (clen > 0)
1676     {
1677     int ncount = 0;
1678     switch (c)
1679     {
1680     case 0x000b:
1681     case 0x000c:
1682     case 0x0085:
1683     case 0x2028:
1684     case 0x2029:
1685 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1686     goto ANYNL03;
1687    
1688     case 0x000d:
1689     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1690     /* Fall through */
1691    
1692     ANYNL03:
1693     case 0x000a:
1694 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1695     {
1696     active_count--; /* Remove non-match possibility */
1697     next_active_state--;
1698     }
1699     if (++count >= GET2(code, 1))
1700     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1701     else
1702     { ADD_NEW_DATA(-state_offset, count, ncount); }
1703     break;
1704 ph10 231
1705 nigel 93 default:
1706     break;
1707     }
1708     }
1709     break;
1710    
1711 ph10 178 /*-----------------------------------------------------------------*/
1712     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1713     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1714     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1715     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1716     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1717     { ADD_ACTIVE(state_offset + 4, 0); }
1718     count = current_state->count; /* Number already matched */
1719     if (clen > 0)
1720     {
1721 ph10 182 BOOL OK;
1722 ph10 178 switch (c)
1723     {
1724     case 0x000a:
1725     case 0x000b:
1726     case 0x000c:
1727     case 0x000d:
1728     case 0x0085:
1729     case 0x2028:
1730     case 0x2029:
1731     OK = TRUE;
1732     break;
1733 ph10 182
1734 ph10 178 default:
1735     OK = FALSE;
1736     }
1737 ph10 182
1738 ph10 178 if (OK == (d == OP_VSPACE))
1739 ph10 182 {
1740 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1741     {
1742     active_count--; /* Remove non-match possibility */
1743     next_active_state--;
1744     }
1745     if (++count >= GET2(code, 1))
1746     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1747     else
1748     { ADD_NEW_DATA(-state_offset, count, 0); }
1749     }
1750     }
1751     break;
1752    
1753     /*-----------------------------------------------------------------*/
1754     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1755     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1756     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1757     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1758     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1759     { ADD_ACTIVE(state_offset + 4, 0); }
1760     count = current_state->count; /* Number already matched */
1761     if (clen > 0)
1762     {
1763 ph10 182 BOOL OK;
1764 ph10 178 switch (c)
1765     {
1766     case 0x09: /* HT */
1767     case 0x20: /* SPACE */
1768     case 0xa0: /* NBSP */
1769     case 0x1680: /* OGHAM SPACE MARK */
1770     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1771     case 0x2000: /* EN QUAD */
1772     case 0x2001: /* EM QUAD */
1773     case 0x2002: /* EN SPACE */
1774     case 0x2003: /* EM SPACE */
1775     case 0x2004: /* THREE-PER-EM SPACE */
1776     case 0x2005: /* FOUR-PER-EM SPACE */
1777     case 0x2006: /* SIX-PER-EM SPACE */
1778     case 0x2007: /* FIGURE SPACE */
1779     case 0x2008: /* PUNCTUATION SPACE */
1780     case 0x2009: /* THIN SPACE */
1781     case 0x200A: /* HAIR SPACE */
1782     case 0x202f: /* NARROW NO-BREAK SPACE */
1783     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1784     case 0x3000: /* IDEOGRAPHIC SPACE */
1785     OK = TRUE;
1786     break;
1787 ph10 182
1788 ph10 178 default:
1789     OK = FALSE;
1790     break;
1791     }
1792 ph10 182
1793 ph10 178 if (OK == (d == OP_HSPACE))
1794 ph10 182 {
1795 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1796     {
1797     active_count--; /* Remove non-match possibility */
1798     next_active_state--;
1799     }
1800     if (++count >= GET2(code, 1))
1801     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1802     else
1803     { ADD_NEW_DATA(-state_offset, count, 0); }
1804     }
1805     }
1806     break;
1807    
1808 nigel 77 /* ========================================================================== */
1809     /* These opcodes are followed by a character that is usually compared
1810     to the current subject character; it is loaded into d. We still get
1811     here even if there is no subject character, because in some cases zero
1812     repetitions are permitted. */
1813    
1814     /*-----------------------------------------------------------------*/
1815     case OP_CHAR:
1816     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1817     break;
1818    
1819     /*-----------------------------------------------------------------*/
1820     case OP_CHARNC:
1821     if (clen == 0) break;
1822    
1823     #ifdef SUPPORT_UTF8
1824     if (utf8)
1825     {
1826     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1827     {
1828 nigel 93 unsigned int othercase;
1829 nigel 77 if (c < 128) othercase = fcc[c]; else
1830    
1831     /* If we have Unicode property support, we can use it to test the
1832 nigel 87 other case of the character. */
1833 nigel 77
1834     #ifdef SUPPORT_UCP
1835 ph10 349 othercase = UCD_OTHERCASE(c);
1836 nigel 87 #else
1837 nigel 93 othercase = NOTACHAR;
1838 nigel 77 #endif
1839    
1840     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1841     }
1842     }
1843     else
1844     #endif /* SUPPORT_UTF8 */
1845    
1846     /* Non-UTF-8 mode */
1847     {
1848     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1849     }
1850     break;
1851    
1852    
1853     #ifdef SUPPORT_UCP
1854     /*-----------------------------------------------------------------*/
1855     /* This is a tricky one because it can match more than one character.
1856     Find out how many characters to skip, and then set up a negative state
1857     to wait for them to pass before continuing. */
1858    
1859     case OP_EXTUNI:
1860 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1861 nigel 77 {
1862     const uschar *nptr = ptr + clen;
1863     int ncount = 0;
1864     while (nptr < end_subject)
1865     {
1866     int nclen = 1;
1867     GETCHARLEN(c, nptr, nclen);
1868 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1869 nigel 77 ncount++;
1870     nptr += nclen;
1871     }
1872     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1873     }
1874     break;
1875     #endif
1876    
1877     /*-----------------------------------------------------------------*/
1878 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
1879     character (when CR is followed by LF). In this case, set up a negative
1880     state to wait for one character to pass before continuing. */
1881    
1882     case OP_ANYNL:
1883     if (clen > 0) switch(c)
1884     {
1885     case 0x000b:
1886     case 0x000c:
1887     case 0x0085:
1888     case 0x2028:
1889     case 0x2029:
1890 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1891    
1892     case 0x000a:
1893 nigel 93 ADD_NEW(state_offset + 1, 0);
1894     break;
1895 ph10 231
1896 nigel 93 case 0x000d:
1897     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1898     {
1899     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1900     }
1901     else
1902     {
1903     ADD_NEW(state_offset + 1, 0);
1904     }
1905     break;
1906     }
1907     break;
1908    
1909     /*-----------------------------------------------------------------*/
1910 ph10 178 case OP_NOT_VSPACE:
1911     if (clen > 0) switch(c)
1912     {
1913     case 0x000a:
1914     case 0x000b:
1915     case 0x000c:
1916     case 0x000d:
1917     case 0x0085:
1918     case 0x2028:
1919     case 0x2029:
1920     break;
1921 ph10 182
1922     default:
1923 ph10 178 ADD_NEW(state_offset + 1, 0);
1924     break;
1925     }
1926     break;
1927    
1928     /*-----------------------------------------------------------------*/
1929     case OP_VSPACE:
1930     if (clen > 0) switch(c)
1931     {
1932     case 0x000a:
1933     case 0x000b:
1934     case 0x000c:
1935     case 0x000d:
1936     case 0x0085:
1937     case 0x2028:
1938     case 0x2029:
1939     ADD_NEW(state_offset + 1, 0);
1940     break;
1941 ph10 182
1942 ph10 178 default: break;
1943     }
1944     break;
1945    
1946     /*-----------------------------------------------------------------*/
1947     case OP_NOT_HSPACE:
1948     if (clen > 0) switch(c)
1949     {
1950     case 0x09: /* HT */
1951     case 0x20: /* SPACE */
1952     case 0xa0: /* NBSP */
1953     case 0x1680: /* OGHAM SPACE MARK */
1954     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1955     case 0x2000: /* EN QUAD */
1956     case 0x2001: /* EM QUAD */
1957     case 0x2002: /* EN SPACE */
1958     case 0x2003: /* EM SPACE */
1959     case 0x2004: /* THREE-PER-EM SPACE */
1960     case 0x2005: /* FOUR-PER-EM SPACE */
1961     case 0x2006: /* SIX-PER-EM SPACE */
1962     case 0x2007: /* FIGURE SPACE */
1963     case 0x2008: /* PUNCTUATION SPACE */
1964     case 0x2009: /* THIN SPACE */
1965     case 0x200A: /* HAIR SPACE */
1966     case 0x202f: /* NARROW NO-BREAK SPACE */
1967     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1968     case 0x3000: /* IDEOGRAPHIC SPACE */
1969     break;
1970 ph10 182
1971     default:
1972 ph10 178 ADD_NEW(state_offset + 1, 0);
1973     break;
1974     }
1975     break;
1976    
1977     /*-----------------------------------------------------------------*/
1978     case OP_HSPACE:
1979     if (clen > 0) switch(c)
1980     {
1981     case 0x09: /* HT */
1982     case 0x20: /* SPACE */
1983     case 0xa0: /* NBSP */
1984     case 0x1680: /* OGHAM SPACE MARK */
1985     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1986     case 0x2000: /* EN QUAD */
1987     case 0x2001: /* EM QUAD */
1988     case 0x2002: /* EN SPACE */
1989     case 0x2003: /* EM SPACE */
1990     case 0x2004: /* THREE-PER-EM SPACE */
1991     case 0x2005: /* FOUR-PER-EM SPACE */
1992     case 0x2006: /* SIX-PER-EM SPACE */
1993     case 0x2007: /* FIGURE SPACE */
1994     case 0x2008: /* PUNCTUATION SPACE */
1995     case 0x2009: /* THIN SPACE */
1996     case 0x200A: /* HAIR SPACE */
1997     case 0x202f: /* NARROW NO-BREAK SPACE */
1998     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1999     case 0x3000: /* IDEOGRAPHIC SPACE */
2000     ADD_NEW(state_offset + 1, 0);
2001     break;
2002     }
2003     break;
2004    
2005     /*-----------------------------------------------------------------*/
2006 nigel 77 /* Match a negated single character. This is only used for one-byte
2007     characters, that is, we know that d < 256. The character we are
2008     checking (c) can be multibyte. */
2009    
2010     case OP_NOT:
2011     if (clen > 0)
2012     {
2013 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2014 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2015     }
2016     break;
2017    
2018     /*-----------------------------------------------------------------*/
2019     case OP_PLUS:
2020     case OP_MINPLUS:
2021 nigel 93 case OP_POSPLUS:
2022 nigel 77 case OP_NOTPLUS:
2023     case OP_NOTMINPLUS:
2024 nigel 93 case OP_NOTPOSPLUS:
2025 nigel 77 count = current_state->count; /* Already matched */
2026     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2027     if (clen > 0)
2028     {
2029 nigel 93 unsigned int otherd = NOTACHAR;
2030 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2031     {
2032     #ifdef SUPPORT_UTF8
2033 nigel 87 if (utf8 && d >= 128)
2034 nigel 77 {
2035     #ifdef SUPPORT_UCP
2036 ph10 349 otherd = UCD_OTHERCASE(d);
2037 nigel 77 #endif /* SUPPORT_UCP */
2038     }
2039     else
2040     #endif /* SUPPORT_UTF8 */
2041     otherd = fcc[d];
2042     }
2043     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2044 nigel 93 {
2045     if (count > 0 &&
2046     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2047     {
2048     active_count--; /* Remove non-match possibility */
2049     next_active_state--;
2050     }
2051     count++;
2052     ADD_NEW(state_offset, count);
2053     }
2054 nigel 77 }
2055     break;
2056    
2057     /*-----------------------------------------------------------------*/
2058     case OP_QUERY:
2059     case OP_MINQUERY:
2060 nigel 93 case OP_POSQUERY:
2061 nigel 77 case OP_NOTQUERY:
2062     case OP_NOTMINQUERY:
2063 nigel 93 case OP_NOTPOSQUERY:
2064 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2065     if (clen > 0)
2066     {
2067 nigel 93 unsigned int otherd = NOTACHAR;
2068 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2069 nigel 77 {
2070     #ifdef SUPPORT_UTF8
2071 nigel 87 if (utf8 && d >= 128)
2072 nigel 77 {
2073     #ifdef SUPPORT_UCP
2074 ph10 349 otherd = UCD_OTHERCASE(d);
2075 nigel 77 #endif /* SUPPORT_UCP */
2076     }
2077     else
2078     #endif /* SUPPORT_UTF8 */
2079     otherd = fcc[d];
2080     }
2081     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2082 nigel 93 {
2083     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2084     {
2085     active_count--; /* Remove non-match possibility */
2086     next_active_state--;
2087     }
2088     ADD_NEW(state_offset + dlen + 1, 0);
2089     }
2090 nigel 77 }
2091     break;
2092    
2093     /*-----------------------------------------------------------------*/
2094     case OP_STAR:
2095     case OP_MINSTAR:
2096 nigel 93 case OP_POSSTAR:
2097 nigel 77 case OP_NOTSTAR:
2098     case OP_NOTMINSTAR:
2099 nigel 93 case OP_NOTPOSSTAR:
2100 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2101     if (clen > 0)
2102     {
2103 nigel 93 unsigned int otherd = NOTACHAR;
2104 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2105 nigel 77 {
2106     #ifdef SUPPORT_UTF8
2107 nigel 87 if (utf8 && d >= 128)
2108 nigel 77 {
2109     #ifdef SUPPORT_UCP
2110 ph10 349 otherd = UCD_OTHERCASE(d);
2111 nigel 77 #endif /* SUPPORT_UCP */
2112     }
2113     else
2114     #endif /* SUPPORT_UTF8 */
2115     otherd = fcc[d];
2116     }
2117     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2118 nigel 93 {
2119     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2120     {
2121     active_count--; /* Remove non-match possibility */
2122     next_active_state--;
2123     }
2124     ADD_NEW(state_offset, 0);
2125     }
2126 nigel 77 }
2127     break;
2128    
2129     /*-----------------------------------------------------------------*/
2130     case OP_EXACT:
2131 nigel 93 case OP_NOTEXACT:
2132     count = current_state->count; /* Number already matched */
2133     if (clen > 0)
2134     {
2135     unsigned int otherd = NOTACHAR;
2136     if ((ims & PCRE_CASELESS) != 0)
2137     {
2138     #ifdef SUPPORT_UTF8
2139     if (utf8 && d >= 128)
2140     {
2141     #ifdef SUPPORT_UCP
2142 ph10 349 otherd = UCD_OTHERCASE(d);
2143 nigel 93 #endif /* SUPPORT_UCP */
2144     }
2145     else
2146     #endif /* SUPPORT_UTF8 */
2147     otherd = fcc[d];
2148     }
2149     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2150     {
2151     if (++count >= GET2(code, 1))
2152     { ADD_NEW(state_offset + dlen + 3, 0); }
2153     else
2154     { ADD_NEW(state_offset, count); }
2155     }
2156     }
2157     break;
2158    
2159     /*-----------------------------------------------------------------*/
2160 nigel 77 case OP_UPTO:
2161     case OP_MINUPTO:
2162 nigel 93 case OP_POSUPTO:
2163 nigel 77 case OP_NOTUPTO:
2164     case OP_NOTMINUPTO:
2165 nigel 93 case OP_NOTPOSUPTO:
2166     ADD_ACTIVE(state_offset + dlen + 3, 0);
2167 nigel 77 count = current_state->count; /* Number already matched */
2168     if (clen > 0)
2169     {
2170 nigel 93 unsigned int otherd = NOTACHAR;
2171 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2172     {
2173     #ifdef SUPPORT_UTF8
2174 nigel 87 if (utf8 && d >= 128)
2175 nigel 77 {
2176     #ifdef SUPPORT_UCP
2177 ph10 349 otherd = UCD_OTHERCASE(d);
2178 nigel 77 #endif /* SUPPORT_UCP */
2179     }
2180     else
2181     #endif /* SUPPORT_UTF8 */
2182     otherd = fcc[d];
2183     }
2184     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2185     {
2186 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2187     {
2188     active_count--; /* Remove non-match possibility */
2189     next_active_state--;
2190     }
2191 nigel 77 if (++count >= GET2(code, 1))
2192     { ADD_NEW(state_offset + dlen + 3, 0); }
2193     else
2194     { ADD_NEW(state_offset, count); }
2195     }
2196     }
2197     break;
2198    
2199    
2200     /* ========================================================================== */
2201     /* These are the class-handling opcodes */
2202    
2203     case OP_CLASS:
2204     case OP_NCLASS:
2205     case OP_XCLASS:
2206     {
2207     BOOL isinclass = FALSE;
2208     int next_state_offset;
2209     const uschar *ecode;
2210    
2211     /* For a simple class, there is always just a 32-byte table, and we
2212     can set isinclass from it. */
2213    
2214     if (codevalue != OP_XCLASS)
2215     {
2216     ecode = code + 33;
2217     if (clen > 0)
2218     {
2219     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2220     ((code[1 + c/8] & (1 << (c&7))) != 0);
2221     }
2222     }
2223    
2224     /* An extended class may have a table or a list of single characters,
2225     ranges, or both, and it may be positive or negative. There's a
2226     function that sorts all this out. */
2227    
2228     else
2229     {
2230     ecode = code + GET(code, 1);
2231     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2232     }
2233    
2234     /* At this point, isinclass is set for all kinds of class, and ecode
2235     points to the byte after the end of the class. If there is a
2236     quantifier, this is where it will be. */
2237    
2238     next_state_offset = ecode - start_code;
2239    
2240     switch (*ecode)
2241     {
2242     case OP_CRSTAR:
2243     case OP_CRMINSTAR:
2244     ADD_ACTIVE(next_state_offset + 1, 0);
2245     if (isinclass) { ADD_NEW(state_offset, 0); }
2246     break;
2247    
2248     case OP_CRPLUS:
2249     case OP_CRMINPLUS:
2250     count = current_state->count; /* Already matched */
2251     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2252     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2253     break;
2254    
2255     case OP_CRQUERY:
2256     case OP_CRMINQUERY:
2257     ADD_ACTIVE(next_state_offset + 1, 0);
2258     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2259     break;
2260    
2261     case OP_CRRANGE:
2262     case OP_CRMINRANGE:
2263     count = current_state->count; /* Already matched */
2264     if (count >= GET2(ecode, 1))
2265     { ADD_ACTIVE(next_state_offset + 5, 0); }
2266     if (isinclass)
2267     {
2268 nigel 91 int max = GET2(ecode, 3);
2269     if (++count >= max && max != 0) /* Max 0 => no limit */
2270 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2271     else
2272     { ADD_NEW(state_offset, count); }
2273     }
2274     break;
2275    
2276     default:
2277     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2278     break;
2279     }
2280     }
2281     break;
2282    
2283     /* ========================================================================== */
2284     /* These are the opcodes for fancy brackets of various kinds. We have
2285 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2286     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2287 ph10 341 though the other "backtracking verbs" are not supported. */
2288 ph10 345
2289 ph10 341 case OP_FAIL:
2290 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2291 ph10 345 break;
2292 nigel 77
2293     case OP_ASSERT:
2294     case OP_ASSERT_NOT:
2295     case OP_ASSERTBACK:
2296     case OP_ASSERTBACK_NOT:
2297     {
2298     int rc;
2299     int local_offsets[2];
2300     int local_workspace[1000];
2301     const uschar *endasscode = code + GET(code, 1);
2302    
2303     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2304    
2305     rc = internal_dfa_exec(
2306     md, /* static match data */
2307     code, /* this subexpression's code */
2308     ptr, /* where we currently are */
2309     ptr - start_subject, /* start offset */
2310     local_offsets, /* offset vector */
2311     sizeof(local_offsets)/sizeof(int), /* size of same */
2312     local_workspace, /* workspace vector */
2313     sizeof(local_workspace)/sizeof(int), /* size of same */
2314     ims, /* the current ims flags */
2315     rlevel, /* function recursion level */
2316     recursing); /* pass on regex recursion */
2317 ph10 487
2318 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2319 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2320     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2321     }
2322     break;
2323    
2324     /*-----------------------------------------------------------------*/
2325     case OP_COND:
2326 nigel 93 case OP_SCOND:
2327 nigel 77 {
2328     int local_offsets[1000];
2329     int local_workspace[1000];
2330 ph10 406 int codelink = GET(code, 1);
2331 ph10 397 int condcode;
2332 ph10 406
2333 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2334 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2335 ph10 398 happen for the other conditions. */
2336 nigel 77
2337 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2338 ph10 406 {
2339     rrc = 0;
2340 ph10 397 if (pcre_callout != NULL)
2341     {
2342     pcre_callout_block cb;
2343     cb.version = 1; /* Version 1 of the callout block */
2344     cb.callout_number = code[LINK_SIZE+2];
2345     cb.offset_vector = offsets;
2346     cb.subject = (PCRE_SPTR)start_subject;
2347     cb.subject_length = end_subject - start_subject;
2348     cb.start_match = current_subject - start_subject;
2349     cb.current_position = ptr - start_subject;
2350     cb.pattern_position = GET(code, LINK_SIZE + 3);
2351     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2352     cb.capture_top = 1;
2353     cb.capture_last = -1;
2354     cb.callout_data = md->callout_data;
2355     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2356     }
2357 ph10 398 if (rrc > 0) break; /* Fail this thread */
2358     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2359 ph10 406 }
2360 ph10 398
2361 ph10 397 condcode = code[LINK_SIZE+1];
2362 ph10 406
2363 nigel 93 /* Back reference conditions are not supported */
2364 nigel 77
2365 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2366 ph10 459 return PCRE_ERROR_DFA_UCOND;
2367 nigel 93
2368     /* The DEFINE condition is always false */
2369    
2370     if (condcode == OP_DEF)
2371 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2372 nigel 93
2373     /* The only supported version of OP_RREF is for the value RREF_ANY,
2374     which means "test if in any recursion". We can't test for specifically
2375     recursed groups. */
2376    
2377 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2378 nigel 93 {
2379 nigel 77 int value = GET2(code, LINK_SIZE+2);
2380 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2381 ph10 406 if (recursing > 0)
2382 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2383     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2384 nigel 77 }
2385    
2386     /* Otherwise, the condition is an assertion */
2387    
2388     else
2389     {
2390     int rc;
2391     const uschar *asscode = code + LINK_SIZE + 1;
2392     const uschar *endasscode = asscode + GET(asscode, 1);
2393    
2394     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2395    
2396     rc = internal_dfa_exec(
2397     md, /* fixed match data */
2398     asscode, /* this subexpression's code */
2399     ptr, /* where we currently are */
2400     ptr - start_subject, /* start offset */
2401     local_offsets, /* offset vector */
2402     sizeof(local_offsets)/sizeof(int), /* size of same */
2403     local_workspace, /* workspace vector */
2404     sizeof(local_workspace)/sizeof(int), /* size of same */
2405     ims, /* the current ims flags */
2406     rlevel, /* function recursion level */
2407     recursing); /* pass on regex recursion */
2408    
2409 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2410 nigel 77 if ((rc >= 0) ==
2411     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2412 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2413 nigel 77 else
2414 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2415 nigel 77 }
2416     }
2417     break;
2418    
2419     /*-----------------------------------------------------------------*/
2420     case OP_RECURSE:
2421     {
2422     int local_offsets[1000];
2423     int local_workspace[1000];
2424     int rc;
2425    
2426     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2427     recursing + 1));
2428    
2429     rc = internal_dfa_exec(
2430     md, /* fixed match data */
2431     start_code + GET(code, 1), /* this subexpression's code */
2432     ptr, /* where we currently are */
2433     ptr - start_subject, /* start offset */
2434     local_offsets, /* offset vector */
2435     sizeof(local_offsets)/sizeof(int), /* size of same */
2436     local_workspace, /* workspace vector */
2437     sizeof(local_workspace)/sizeof(int), /* size of same */
2438     ims, /* the current ims flags */
2439     rlevel, /* function recursion level */
2440     recursing + 1); /* regex recurse level */
2441    
2442     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2443     recursing + 1, rc));
2444    
2445     /* Ran out of internal offsets */
2446    
2447     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2448    
2449     /* For each successful matched substring, set up the next state with a
2450     count of characters to skip before trying it. Note that the count is in
2451     characters, not bytes. */
2452    
2453     if (rc > 0)
2454     {
2455     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2456     {
2457     const uschar *p = start_subject + local_offsets[rc];
2458     const uschar *pp = start_subject + local_offsets[rc+1];
2459     int charcount = local_offsets[rc+1] - local_offsets[rc];
2460     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2461     if (charcount > 0)
2462     {
2463     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2464     }
2465     else
2466     {
2467     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2468     }
2469     }
2470     }
2471     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2472     }
2473     break;
2474    
2475     /*-----------------------------------------------------------------*/
2476     case OP_ONCE:
2477     {
2478     int local_offsets[2];
2479     int local_workspace[1000];
2480    
2481     int rc = internal_dfa_exec(
2482     md, /* fixed match data */
2483     code, /* this subexpression's code */
2484     ptr, /* where we currently are */
2485     ptr - start_subject, /* start offset */
2486     local_offsets, /* offset vector */
2487     sizeof(local_offsets)/sizeof(int), /* size of same */
2488     local_workspace, /* workspace vector */
2489     sizeof(local_workspace)/sizeof(int), /* size of same */
2490     ims, /* the current ims flags */
2491     rlevel, /* function recursion level */
2492     recursing); /* pass on regex recursion */
2493    
2494     if (rc >= 0)
2495     {
2496     const uschar *end_subpattern = code;
2497     int charcount = local_offsets[1] - local_offsets[0];
2498     int next_state_offset, repeat_state_offset;
2499    
2500     do { end_subpattern += GET(end_subpattern, 1); }
2501     while (*end_subpattern == OP_ALT);
2502     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2503    
2504     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2505     arrange for the repeat state also to be added to the relevant list.
2506     Calculate the offset, or set -1 for no repeat. */
2507    
2508     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2509     *end_subpattern == OP_KETRMIN)?
2510     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2511    
2512     /* If we have matched an empty string, add the next state at the
2513     current character pointer. This is important so that the duplicate
2514     checking kicks in, which is what breaks infinite loops that match an
2515     empty string. */
2516    
2517     if (charcount == 0)
2518     {
2519     ADD_ACTIVE(next_state_offset, 0);
2520     }
2521    
2522     /* Optimization: if there are no more active states, and there
2523     are no new states yet set up, then skip over the subject string
2524     right here, to save looping. Otherwise, set up the new state to swing
2525     into action when the end of the substring is reached. */
2526    
2527     else if (i + 1 >= active_count && new_count == 0)
2528     {
2529     ptr += charcount;
2530     clen = 0;
2531     ADD_NEW(next_state_offset, 0);
2532    
2533     /* If we are adding a repeat state at the new character position,
2534     we must fudge things so that it is the only current state.
2535     Otherwise, it might be a duplicate of one we processed before, and
2536     that would cause it to be skipped. */
2537    
2538     if (repeat_state_offset >= 0)
2539     {
2540     next_active_state = active_states;
2541     active_count = 0;
2542     i = -1;
2543     ADD_ACTIVE(repeat_state_offset, 0);
2544     }
2545     }
2546     else
2547     {
2548     const uschar *p = start_subject + local_offsets[0];
2549     const uschar *pp = start_subject + local_offsets[1];
2550     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2551     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2552     if (repeat_state_offset >= 0)
2553     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2554     }
2555    
2556     }
2557     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2558     }
2559     break;
2560    
2561    
2562     /* ========================================================================== */
2563     /* Handle callouts */
2564    
2565     case OP_CALLOUT:
2566 ph10 406 rrc = 0;
2567 nigel 77 if (pcre_callout != NULL)
2568     {
2569     pcre_callout_block cb;
2570     cb.version = 1; /* Version 1 of the callout block */
2571     cb.callout_number = code[1];
2572     cb.offset_vector = offsets;
2573 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2574 nigel 77 cb.subject_length = end_subject - start_subject;
2575     cb.start_match = current_subject - start_subject;
2576     cb.current_position = ptr - start_subject;
2577     cb.pattern_position = GET(code, 2);
2578     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2579     cb.capture_top = 1;
2580     cb.capture_last = -1;
2581     cb.callout_data = md->callout_data;
2582     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2583 ph10 406 }
2584     if (rrc == 0)
2585     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2586 nigel 77 break;
2587    
2588    
2589     /* ========================================================================== */
2590     default: /* Unsupported opcode */
2591     return PCRE_ERROR_DFA_UITEM;
2592     }
2593    
2594     NEXT_ACTIVE_STATE: continue;
2595    
2596     } /* End of loop scanning active states */
2597    
2598     /* We have finished the processing at the current subject character. If no
2599     new states have been set for the next character, we have found all the
2600     matches that we are going to find. If we are at the top level and partial
2601 ph10 463 matching has been requested, check for appropriate conditions.
2602    
2603 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2604     character. If it is equal to the original active_count (saved in
2605     workspace[1]) it means that (*F) was found on every active state. In this
2606 ph10 463 case we don't want to give a partial match.
2607 nigel 77
2608 ph10 463 The "could_continue" variable is true if a state could have continued but
2609     for the fact that the end of the subject was reached. */
2610    
2611 nigel 77 if (new_count <= 0)
2612     {
2613 ph10 427 if (rlevel == 1 && /* Top level, and */
2614 ph10 463 could_continue && /* Some could go on */
2615 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2616 ph10 427 ( /* either... */
2617     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2618     || /* or... */
2619     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2620     match_count < 0) /* no matches */
2621     ) && /* And... */
2622     ptr >= end_subject && /* Reached end of subject */
2623     ptr > current_subject) /* Matched non-empty string */
2624 nigel 77 {
2625     if (offsetcount >= 2)
2626     {
2627 ph10 435 offsets[0] = md->start_used_ptr - start_subject;
2628 nigel 77 offsets[1] = end_subject - start_subject;
2629     }
2630     match_count = PCRE_ERROR_PARTIAL;
2631     }
2632    
2633     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2634     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2635     rlevel*2-2, SP));
2636 nigel 91 break; /* In effect, "return", but see the comment below */
2637 nigel 77 }
2638    
2639     /* One or more states are active for the next character. */
2640    
2641     ptr += clen; /* Advance to next subject character */
2642     } /* Loop to move along the subject string */
2643    
2644 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2645     if we use "return" above, we have compiler trouble. Some compilers warn if
2646     there's nothing here because they think the function doesn't return a value. On
2647     the other hand, if we put a dummy statement here, some more clever compilers
2648     complain that it can't be reached. Sigh. */
2649 nigel 77
2650 nigel 91 return match_count;
2651 nigel 77 }
2652    
2653    
2654    
2655    
2656     /*************************************************
2657     * Execute a Regular Expression - DFA engine *
2658     *************************************************/
2659    
2660     /* This external function applies a compiled re to a subject string using a DFA
2661     engine. This function calls the internal function multiple times if the pattern
2662     is not anchored.
2663    
2664     Arguments:
2665     argument_re points to the compiled expression
2666 ph10 97 extra_data points to extra data or is NULL
2667 nigel 77 subject points to the subject string
2668     length length of subject string (may contain binary zeros)
2669     start_offset where to start in the subject string
2670     options option bits
2671     offsets vector of match offsets
2672     offsetcount size of same
2673     workspace workspace vector
2674     wscount size of same
2675    
2676     Returns: > 0 => number of match offset pairs placed in offsets
2677     = 0 => offsets overflowed; longest matches are present
2678     -1 => failed to match
2679     < -1 => some kind of unexpected problem
2680     */
2681    
/* pcre_dfa_exec: public entry point for DFA matching.

The function validates its arguments, picks up study data, callout data and
character tables from extra_data, copes with a pattern whose magic number does
not match by trying a byte-flipped copy, works out the newline convention, and
then calls internal_dfa_exec() at successive start positions in the subject
until that call returns something other than PCRE_ERROR_NOMATCH, the match is
anchored, or the start positions are exhausted.

Values returned directly by this function (anything else is whatever
internal_dfa_exec() returned):

  PCRE_ERROR_BADOPTION       options has bits outside PUBLIC_DFA_EXEC_OPTIONS
  PCRE_ERROR_NULL            argument_re, subject or workspace is NULL, or
                             offsets is NULL while offsetcount > 0
  PCRE_ERROR_BADCOUNT        offsetcount < 0
  PCRE_ERROR_DFA_WSSIZE      wscount < 20
  PCRE_ERROR_DFA_UMLIMIT     extra_data has PCRE_EXTRA_MATCH_LIMIT or
                             PCRE_EXTRA_MATCH_LIMIT_RECURSION set
  PCRE_ERROR_BADMAGIC        wrong magic number and _pcre_try_flipped()
                             returned NULL
  PCRE_ERROR_BADNEWLINE      the newline option bits are not a known setting
  PCRE_ERROR_BADUTF8         (SUPPORT_UTF8 only) _pcre_valid_utf8() rejected
                             the subject
  PCRE_ERROR_BADUTF8_OFFSET  (SUPPORT_UTF8 only) 0 < start_offset < length and
                             the byte there has the form 10xxxxxx
  PCRE_ERROR_NOMATCH         no start position produced a match
*/

2682 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2683 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2684     const char *subject, int length, int start_offset, int options, int *offsets,
2685     int offsetcount, int *workspace, int wscount)
2686     {
2687     real_pcre *re = (real_pcre *)argument_re;
2688     dfa_match_data match_block;
2689 nigel 91 dfa_match_data *md = &match_block;
2690 nigel 77 BOOL utf8, anchored, startline, firstline;
2691     const uschar *current_subject, *end_subject, *lcc;
2692    
2693     pcre_study_data internal_study;
2694     const pcre_study_data *study = NULL;
2695     real_pcre internal_re;
2696    
2697     const uschar *req_byte_ptr;
2698     const uschar *start_bits = NULL;
2699     BOOL first_byte_caseless = FALSE;
2700     BOOL req_byte_caseless = FALSE;
2701     int first_byte = -1;
2702     int req_byte = -1;
2703     int req_byte2 = -1;
2704 nigel 91 int newline;
2705 nigel 77
2706     /* Plausibility checks */
2707    
2708     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2709     if (re == NULL || subject == NULL || workspace == NULL ||
2710     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2711     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
/* NOTE(review): 20 is presumably the smallest workspace that
internal_dfa_exec() can operate with - confirm against that function. */
2712     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2713    
2714     /* We need to find the pointer to any study data before we test for byte
2715     flipping, so we scan the extra_data block first. This may set two fields in the
2716     match block, so we must initialize them beforehand. However, the other fields
2717     in the match block must not be set until after the byte flipping. */
2718    
2719 nigel 91 md->tables = re->tables;
2720     md->callout_data = NULL;
2721 nigel 77
2722     if (extra_data != NULL)
2723     {
2724     unsigned int flags = extra_data->flags;
2725     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2726     study = (const pcre_study_data *)extra_data->study_data;
2727     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2728 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2729     return PCRE_ERROR_DFA_UMLIMIT;
2730 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2731 nigel 91 md->callout_data = extra_data->callout_data;
2732 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2733 nigel 91 md->tables = extra_data->tables;
2734 nigel 77 }
2735 ph10 461
2736 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
2737     test for a regex that was compiled on a host of opposite endianness. If this is
2738     the case, flipped values are put in internal_re and internal_study if there was
2739     study data too. */
2740    
2741     if (re->magic_number != MAGIC_NUMBER)
2742     {
2743     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2744     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2745     if (study != NULL) study = &internal_study;
2746     }
2747    
2748     /* Set some local values */
2749    
2750     current_subject = (const unsigned char *)subject + start_offset;
2751     end_subject = (const unsigned char *)subject + length;
/* Starting req_byte_ptr one byte before the first start position guarantees
that the first "p > req_byte_ptr" test in the required-byte search below
succeeds, so the search is always performed at least once. */
2752     req_byte_ptr = current_subject - 1;
2753    
2754 nigel 91 #ifdef SUPPORT_UTF8
2755 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2756 nigel 91 #else
2757     utf8 = FALSE;
2758     #endif
2759 nigel 77
/* The match is treated as anchored if PCRE_ANCHORED was given at compile time
or at match time, and also whenever PCRE_DFA_RESTART is set in the match-time
options. */
2760 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2761     (re->options & PCRE_ANCHORED) != 0;
2762    
2763 nigel 77 /* The remaining fixed data for passing around. */
2764    
/* start_code is the address just past the name table. Note that the base
address is argument_re (the block supplied by the caller), whereas the offset
and size fields are read through re, which may have been replaced by the
result of _pcre_try_flipped() above. */
2765 nigel 91 md->start_code = (const uschar *)argument_re +
2766 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2767 nigel 91 md->start_subject = (const unsigned char *)subject;
2768     md->end_subject = end_subject;
2769 ph10 442 md->start_offset = start_offset;
2770 nigel 91 md->moptions = options;
2771     md->poptions = re->options;
2772 nigel 77
2773 ph10 231 /* If the BSR option is not set at match time, copy what was set
2774     at compile time. */
2775    
2776     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2777     {
2778     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2779     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2780     #ifdef BSR_ANYCRLF
2781     else md->moptions |= PCRE_BSR_ANYCRLF;
2782 ph10 243 #endif
2783     }
2784 ph10 231
2785 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2786     nothing is set at run time, whatever was used at compile time applies. */
2787 nigel 91
2788 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2789 nigel 93 PCRE_NEWLINE_BITS)
2790 nigel 91 {
2791 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2792 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2793     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2794 nigel 91 case PCRE_NEWLINE_CR+
2795 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2796 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2797 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2798 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2799 nigel 91 }
2800    
/* Translate the local newline code into the match block: -2 means ANYCRLF,
any other negative value means ANY, a value above 255 is a two-byte sequence
(high byte first), and anything else is a single byte. */
2801 ph10 149 if (newline == -2)
2802 nigel 91 {
2803 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2804     }
2805     else if (newline < 0)
2806     {
2807 nigel 93 md->nltype = NLTYPE_ANY;
2808 nigel 91 }
2809     else
2810     {
2811 nigel 93 md->nltype = NLTYPE_FIXED;
2812     if (newline > 255)
2813     {
2814     md->nllen = 2;
2815     md->nl[0] = (newline >> 8) & 255;
2816     md->nl[1] = newline & 255;
2817     }
2818     else
2819     {
2820     md->nllen = 1;
2821     md->nl[0] = newline;
2822     }
2823 nigel 91 }
2824    
2825 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2826     back the character offset. */
2827    
2828     #ifdef SUPPORT_UTF8
2829     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2830     {
2831     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2832     return PCRE_ERROR_BADUTF8;
2833     if (start_offset > 0 && start_offset < length)
2834     {
2835     int tb = ((uschar *)subject)[start_offset];
2836     if (tb > 127)
2837     {
2838     tb &= 0xc0;
2839     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2840     }
2841     }
2842     }
2843     #endif
2844    
2845     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2846     is a feature that makes it possible to save compiled regex and re-use them
2847     in other programs later. */
2848    
2849 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2850 nigel 77
2851     /* The lower casing table and the "must be at the start of a line" flag are
2852     used in a loop when finding where to start. */
2853    
2854 nigel 91 lcc = md->tables + lcc_offset;
2855 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2856 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2857    
2858     /* Set up the first character to match, if available. The first_byte value is
2859     never set for an anchored regular expression, but the anchoring may be forced
2860     at run time, so we have to test for anchoring. The first char may be unset for
2861     an unanchored pattern, of course. If there's no first char and the pattern was
2862     studied, there may be a bitmap of possible first characters. */
2863    
2864     if (!anchored)
2865     {
2866 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2867 nigel 77 {
2868     first_byte = re->first_byte & 255;
2869     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2870     first_byte = lcc[first_byte];
2871     }
2872     else
2873     {
2874 ph10 455 if (!startline && study != NULL &&
2875     (study->flags & PCRE_STUDY_MAPPED) != 0)
2876 nigel 77 start_bits = study->start_bits;
2877     }
2878     }
2879    
2880     /* For anchored or unanchored matches, there may be a "last known required
2881     character" set. */
2882    
2883 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2884 nigel 77 {
2885     req_byte = re->req_byte & 255;
2886     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2887 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2888 nigel 77 }
2889    
2890     /* Call the main matching function, looping for a non-anchored regex after a
2891 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2892     a match. */
2893 nigel 77
2894     for (;;)
2895     {
2896     int rc;
2897    
2898     if ((options & PCRE_DFA_RESTART) == 0)
2899     {
2900     const uschar *save_end_subject = end_subject;
2901    
2902 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2903     line of a multiline string. Implement this by temporarily adjusting
2904     end_subject so that we stop scanning at a newline. If the match fails at
2905     the newline, later code breaks this loop. */
2906 nigel 77
2907     if (firstline)
2908     {
2909 ph10 365 USPTR t = current_subject;
2910     #ifdef SUPPORT_UTF8
2911     if (utf8)
2912 ph10 371 {
2913     while (t < md->end_subject && !IS_NEWLINE(t))
2914 ph10 365 {
2915     t++;
2916     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2917 ph10 371 }
2918 ph10 365 }
2919     else
2920 ph10 371 #endif
2921 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2922 nigel 77 end_subject = t;
2923     }
2924 ph10 392
2925 ph10 389 /* There are some optimizations that avoid running the match if a known
2926 ph10 455 starting point is not found. However, there is an option that disables
2927     these, for testing and for ensuring that all callouts do actually occur. */
2928 nigel 77
2929 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2930 ph10 392 {
2931 ph10 389 /* Advance to a known first byte. */
2932 ph10 392
2933 ph10 389 if (first_byte >= 0)
2934 nigel 77 {
2935 ph10 389 if (first_byte_caseless)
2936     while (current_subject < end_subject &&
2937     lcc[*current_subject] != first_byte)
2938     current_subject++;
2939     else
2940 ph10 392 while (current_subject < end_subject &&
2941 ph10 389 *current_subject != first_byte)
2942     current_subject++;
2943     }
2944 ph10 392
2945 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2946 ph10 392
2947 ph10 389 else if (startline)
2948     {
2949     if (current_subject > md->start_subject + start_offset)
2950     {
2951 ph10 365 #ifdef SUPPORT_UTF8
2952 ph10 389 if (utf8)
2953 ph10 365 {
2954 ph10 392 while (current_subject < end_subject &&
2955 ph10 389 !WAS_NEWLINE(current_subject))
2956     {
2957 ph10 365 current_subject++;
2958 ph10 389 while(current_subject < end_subject &&
2959     (*current_subject & 0xc0) == 0x80)
2960     current_subject++;
2961     }
2962 ph10 371 }
2963 ph10 389 else
2964     #endif
2965     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2966     current_subject++;
2967 ph10 392
2968 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2969     ANYCRLF, and we are now at a LF, advance the match position by one
2970     more character. */
2971 ph10 392
2972 ph10 391 if (current_subject[-1] == CHAR_CR &&
2973 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2974     current_subject < end_subject &&
2975 ph10 391 *current_subject == CHAR_NL)
2976 ph10 389 current_subject++;
2977 ph10 365 }
2978 nigel 77 }
2979 ph10 392
2980 ph10 389 /* Or to a non-unique first char after study */
2981 ph10 392
2982 ph10 389 else if (start_bits != NULL)
2983 nigel 77 {
2984 ph10 389 while (current_subject < end_subject)
2985     {
2986     register unsigned int c = *current_subject;
/* start_bits is used as a bitmap indexed by byte value: byte c/8, bit c&7.
A clear bit means this byte is skipped as a possible start. */
2987     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2988     else break;
2989     }
2990 nigel 77 }
2991 ph10 392 }
2992 nigel 77
2993     /* Restore fudged end_subject */
2994    
2995     end_subject = save_end_subject;
2996    
2997 ph10 461 /* The following two optimizations are disabled for partial matching or if
2998     disabling is explicitly requested (and of course, by the test above, this
2999 ph10 455 code is not obeyed when restarting after a partial match). */
3000 ph10 461
3001 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3002     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3003 ph10 461 {
3004 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3005     is a lower bound; no actual string of that length may actually match the
3006     pattern. Although the value is, strictly, in characters, we treat it as
3007     bytes to avoid spending too much time in this optimization. */
3008 nigel 77
3009 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3010 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3011 ph10 455 return PCRE_ERROR_NOMATCH;
3012 ph10 461
3013 ph10 455 /* If req_byte is set, we know that that character must appear in the
3014     subject for the match to succeed. If the first character is set, req_byte
3015     must be later in the subject; otherwise the test starts at the match
3016     point. This optimization can save a huge amount of work in patterns with
3017     nested unlimited repeats that aren't going to match. Writing separate
3018     code for cased/caseless versions makes it go faster, as does using an
3019     autoincrement and backing off on a match.
3020 ph10 461
3021 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3022     can take a long time, and give bad performance on quite ordinary
3023     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3024     string... so we don't do this when the string is sufficiently long. */
3025 ph10 461
3026 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3027 nigel 77 {
3028 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3029 ph10 461
3030 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3031     place we found it at last time. */
3032 ph10 461
3033 ph10 455 if (p > req_byte_ptr)
3034 nigel 77 {
3035 ph10 455 if (req_byte_caseless)
3036     {
3037     while (p < end_subject)
3038     {
3039     register int pp = *p++;
3040     if (pp == req_byte || pp == req_byte2) { p--; break; }
3041     }
3042     }
3043     else
3044     {
3045     while (p < end_subject)
3046     {
3047     if (*p++ == req_byte) { p--; break; }
3048     }
3049     }
3050 ph10 461
3051 ph10 455 /* If we can't find the required character, break the matching loop,
3052     which will cause a return of PCRE_ERROR_NOMATCH. */
3053 ph10 461
3054 ph10 455 if (p >= end_subject) break;
3055 ph10 461
3056 ph10 455 /* If we have found the required character, save the point where we
3057     found it, so that we don't search again next time round the loop if
3058     the start hasn't passed this character yet. */
3059 ph10 461
3060 ph10 455 req_byte_ptr = p;
3061 nigel 77 }
3062 ph10 461 }
3063 nigel 77 }
3064 ph10 455 } /* End of optimizations that are done when not restarting */
3065 nigel 77
3066     /* OK, now we can do the business */
3067    
3068 ph10 435 md->start_used_ptr = current_subject;
3069 ph10 461
3070 nigel 77 rc = internal_dfa_exec(
3071 nigel 91 md, /* fixed match data */
3072     md->start_code, /* this subexpression's code */
3073     current_subject, /* where we currently are */
3074     start_offset, /* start offset in subject */
3075     offsets, /* offset vector */
3076     offsetcount, /* size of same */
3077     workspace, /* workspace vector */
3078     wscount, /* size of same */
3079 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3080 nigel 91 0, /* function recurse level */
3081     0); /* regex recurse level */
3082 nigel 77
3083     /* Anything other than "no match" means we are done, always; otherwise, carry
3084     on only if not anchored. */
3085    
3086     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3087    
3088     /* Advance to the next subject character unless we are at the end of a line
3089     and firstline is set. */
3090    
3091 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3092 nigel 77 current_subject++;
3093     if (utf8)
3094     {
3095     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3096     current_subject++;
3097     }
/* The test is ">" rather than ">=", so a start position equal to end_subject
still gets one more attempt before the loop is abandoned. */
3098     if (current_subject > end_subject) break;
3099    
3100 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3101 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3102     or ANY or ANYCRLF, advance the match position by one more character. */
3103 nigel 93
3104 ph10 391 if (current_subject[-1] == CHAR_CR &&
3105 ph10 226 current_subject < end_subject &&
3106 ph10 391 *current_subject == CHAR_NL &&
3107 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3108 ph10 226 (md->nltype == NLTYPE_ANY ||
3109     md->nltype == NLTYPE_ANYCRLF ||
3110     md->nllen == 2))
3111 nigel 93 current_subject++;
3112    
3113     } /* "Bumpalong" loop */
3114    
3115 nigel 77 return PCRE_ERROR_NOMATCH;
3116     }
3117    
3118     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12