/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 507 - (hide annotations) (download)
Wed Mar 10 16:08:01 2010 UTC (4 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 107277 byte(s)
Tidies for 8.02-RC1 release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 473 Copyright (c) 1997-2010 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300
102     #define OP_EXTUNI_EXTRA 320
103     #define OP_ANYNL_EXTRA 340
104     #define OP_HSPACE_EXTRA 360
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109     character that is to be tested in some way. This makes it possible to
110     centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
125     1, /* Char */
126     1, /* Charnc */
127     1, /* not */
128     /* Positive single-char repeats */
129     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130     3, 3, 3, /* upto, minupto, exact */
131 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
132 nigel 77 /* Negative single-char repeats - only for chars < 256 */
133     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
134     3, 3, 3, /* NOT upto, minupto, exact */
135 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
136 nigel 77 /* Positive type repeats */
137     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* Type upto, minupto, exact */
139 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
140 nigel 77 /* Character class & ref repeats */
141     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
142     0, 0, /* CRRANGE, CRMINRANGE */
143     0, /* CLASS */
144     0, /* NCLASS */
145     0, /* XCLASS - variable length */
146     0, /* REF */
147     0, /* RECURSE */
148     0, /* CALLOUT */
149     0, /* Alt */
150     0, /* Ket */
151     0, /* KetRmax */
152     0, /* KetRmin */
153     0, /* Assert */
154     0, /* Assert not */
155     0, /* Assert behind */
156     0, /* Assert behind not */
157     0, /* Reverse */
158 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
159     0, 0, 0, /* SBRA, SCBRA, SCOND */
160 ph10 498 0, 0, /* CREF, NCREF */
161     0, 0, /* RREF, NRREF */
162 nigel 93 0, /* DEF */
163 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
164     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
165 ph10 462 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
166 nigel 77 };
167    
168 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
169 ph10 462 remember the fact that a character could have been inspected when the end of
170 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
171     two tables that follow must also be modified. */
172 ph10 462
173     static const uschar poptable[] = {
174     0, /* End */
175 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
176 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
177     1, 1, 1, /* Any, AllAny, Anybyte */
178 ph10 498 1, 1, /* \P, \p */
179 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
180 ph10 498 1, /* \X */
181 ph10 462 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
182     1, /* Char */
183     1, /* Charnc */
184     1, /* not */
185     /* Positive single-char repeats */
186     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
187     1, 1, 1, /* upto, minupto, exact */
188     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
189     /* Negative single-char repeats - only for chars < 256 */
190     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
191     1, 1, 1, /* NOT upto, minupto, exact */
192     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
193     /* Positive type repeats */
194     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
195     1, 1, 1, /* Type upto, minupto, exact */
196     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
197     /* Character class & ref repeats */
198     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
199     1, 1, /* CRRANGE, CRMINRANGE */
200     1, /* CLASS */
201     1, /* NCLASS */
202     1, /* XCLASS - variable length */
203     0, /* REF */
204     0, /* RECURSE */
205     0, /* CALLOUT */
206     0, /* Alt */
207     0, /* Ket */
208     0, /* KetRmax */
209     0, /* KetRmin */
210     0, /* Assert */
211     0, /* Assert not */
212     0, /* Assert behind */
213     0, /* Assert behind not */
214     0, /* Reverse */
215     0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
216     0, 0, 0, /* SBRA, SCBRA, SCOND */
217 ph10 498 0, 0, /* CREF, NCREF */
218     0, 0, /* RREF, NRREF */
219 ph10 462 0, /* DEF */
220     0, 0, /* BRAZERO, BRAMINZERO */
221     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
222     0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
223     };
224    
225 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
226     and \w */
227    
228 ph10 327 static const uschar toptable1[] = {
229 ph10 168 0, 0, 0, 0, 0, 0,
230 nigel 77 ctype_digit, ctype_digit,
231     ctype_space, ctype_space,
232     ctype_word, ctype_word,
233 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
234 nigel 77 };
235    
236 ph10 327 static const uschar toptable2[] = {
237 ph10 168 0, 0, 0, 0, 0, 0,
238 nigel 77 ctype_digit, 0,
239     ctype_space, 0,
240     ctype_word, 0,
241 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
242 nigel 77 };
243    
244    
245     /* Structure for holding data about a particular state, which is in effect the
246     current data for an active path through the match tree. It must consist
247     entirely of ints because the working vector we are passed, and which we put
248     these structures in, is a vector of ints. */
249    
/* One entry in a state list. Every member is an int so that an array of these
can be overlaid on the caller-supplied int workspace vector. */

struct stateblock {
  int offset;                    /* Offset to opcode */
  int count;                     /* Count for repeats */
  int ims;                       /* ims flag bits */
  int data;                      /* Some use extra data */
};

typedef struct stateblock stateblock;

/* Number of workspace ints occupied by one stateblock. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
258    
259    
260 ph10 475 #ifdef PCRE_DEBUG
261 nigel 77 /*************************************************
262     * Print character string *
263     *************************************************/
264    
265     /* Character string printing function for debugging.
266    
267     Arguments:
268     p points to string
269     length number of bytes
270     f where to print
271    
272     Returns: nothing
273     */
274    
/* Write `length` bytes starting at p to the stream f. Bytes for which
isprint() is true are written as themselves; all others are written as a
backslash-x escape with two lower-case hex digits. A zero or negative length
writes nothing. */

static void
pchars(unsigned char *p, int length, FILE *f)
{
int remaining;

for (remaining = length; remaining > 0; remaining--, p++)
  {
  int ch = *p;
  if (isprint(ch))
    fprintf(f, "%c", ch);
  else
    fprintf(f, "\\x%02x", ch);
  }
}
287     #endif
288    
289    
290    
291     /*************************************************
292     * Execute a Regular Expression - DFA engine *
293     *************************************************/
294    
295     /* This internal function applies a compiled pattern to a subject string,
296     starting at a given point, using a DFA engine. This function is called from the
297     external one, possibly multiple times if the pattern is not anchored. The
298     function calls itself recursively for some kinds of subpattern.
299    
300     Arguments:
301     md the match_data block with fixed information
302     this_start_code the opening bracket of this subexpression's code
303     current_subject where we currently are in the subject string
304     start_offset start offset in the subject string
305     offsets vector to contain the matching string offsets
306     offsetcount size of same
307     workspace vector of workspace
308     wscount size of same
309     ims the current ims flags
310     rlevel function call recursion level
311     recursing regex recursive call level
312    
313 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
314 ph10 341 = 0 => offsets overflowed; longest matches are present
315 nigel 77 -1 => failed to match
316     < -1 => some kind of unexpected problem
317    
318     The following macros are used for adding states to the two state vectors (one
319     for the current character, one for the following character). */
320    
/* Append a state with opcode offset x and repeat count y to the active list,
taking the ims value from the local variable of that name. The data field is
not written. If the list already holds wscount entries, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. Must be used as a statement, followed by a
semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
331    
/* Append a state with opcode offset x, repeat count y and extra data z to the
active list, taking the ims value from the local variable of that name. If the
list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. Must be used as a statement, followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
343    
/* Append a state with opcode offset x and repeat count y to the new list (the
one for the following character), taking the ims value from the local variable
of that name. The data field is not written. If the list already holds wscount
entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Must be used as
a statement, followed by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
354    
/* Append a state with opcode offset x, repeat count y and extra data z to the
new list (the one for the following character), taking the ims value from the
local variable of that name. If the list already holds wscount entries, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. Must be used as a statement,
followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
366    
367     /* And now, here is the code */
368    
369     static int
370     internal_dfa_exec(
371     dfa_match_data *md,
372     const uschar *this_start_code,
373     const uschar *current_subject,
374     int start_offset,
375     int *offsets,
376     int offsetcount,
377     int *workspace,
378     int wscount,
379     int ims,
380     int rlevel,
381     int recursing)
382     {
383     stateblock *active_states, *new_states, *temp_states;
384     stateblock *next_active_state, *next_new_state;
385    
386     const uschar *ctypes, *lcc, *fcc;
387     const uschar *ptr;
388 nigel 93 const uschar *end_code, *first_op;
389 nigel 77
390     int active_count, new_count, match_count;
391    
392     /* Some fields in the md block are frequently referenced, so we load them into
393     independent variables in the hope that this will perform better. */
394    
395     const uschar *start_subject = md->start_subject;
396     const uschar *end_subject = md->end_subject;
397     const uschar *start_code = md->start_code;
398    
399 nigel 87 #ifdef SUPPORT_UTF8
400 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
401 nigel 93 #else
402     BOOL utf8 = FALSE;
403 nigel 87 #endif
404 nigel 77
405     rlevel++;
406     offsetcount &= (-2);
407    
408     wscount -= 2;
409     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
410     (2 * INTS_PER_STATEBLOCK);
411    
412     DPRINTF(("\n%.*s---------------------\n"
413     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
414     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
415    
416     ctypes = md->tables + ctypes_offset;
417     lcc = md->tables + lcc_offset;
418     fcc = md->tables + fcc_offset;
419    
420     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
421    
422     active_states = (stateblock *)(workspace + 2);
423     next_new_state = new_states = active_states + wscount;
424     new_count = 0;
425    
426 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
427     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
428    
429 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
430     the alternative states onto the list, and find out where the end is. This
431     makes it possible to use this function recursively, when we want to stop at a
432     matching internal ket rather than at the end.
433    
434     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
435     a backward assertion. In that case, we have to find out the maximum amount to
436     move back, and set up each alternative appropriately. */
437    
438 nigel 93 if (*first_op == OP_REVERSE)
439 nigel 77 {
440     int max_back = 0;
441     int gone_back;
442    
443     end_code = this_start_code;
444     do
445     {
446     int back = GET(end_code, 2+LINK_SIZE);
447     if (back > max_back) max_back = back;
448     end_code += GET(end_code, 1);
449     }
450     while (*end_code == OP_ALT);
451    
452     /* If we can't go back the amount required for the longest lookbehind
453     pattern, go back as far as we can; some alternatives may still be viable. */
454    
455     #ifdef SUPPORT_UTF8
456     /* In character mode we have to step back character by character */
457    
458     if (utf8)
459     {
460     for (gone_back = 0; gone_back < max_back; gone_back++)
461     {
462     if (current_subject <= start_subject) break;
463     current_subject--;
464     while (current_subject > start_subject &&
465     (*current_subject & 0xc0) == 0x80)
466     current_subject--;
467     }
468     }
469     else
470     #endif
471    
472     /* In byte-mode we can do this quickly. */
473    
474     {
475     gone_back = (current_subject - max_back < start_subject)?
476     current_subject - start_subject : max_back;
477     current_subject -= gone_back;
478     }
479 ph10 461
480 ph10 435 /* Save the earliest consulted character */
481 nigel 77
482 ph10 461 if (current_subject < md->start_used_ptr)
483     md->start_used_ptr = current_subject;
484    
485 nigel 77 /* Now we can process the individual branches. */
486    
487     end_code = this_start_code;
488     do
489     {
490     int back = GET(end_code, 2+LINK_SIZE);
491     if (back <= gone_back)
492     {
493     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
494     ADD_NEW_DATA(-bstate, 0, gone_back - back);
495     }
496     end_code += GET(end_code, 1);
497     }
498     while (*end_code == OP_ALT);
499     }
500    
501     /* This is the code for a "normal" subpattern (not a backward assertion). The
502     start of a whole pattern is always one of these. If we are at the top level,
503     we may be asked to restart matching from the same point that we reached for a
504     previous partial match. We still have to scan through the top-level branches to
505     find the end state. */
506    
507     else
508     {
509     end_code = this_start_code;
510    
511     /* Restarting */
512    
513     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
514     {
515     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
516     new_count = workspace[1];
517     if (!workspace[0])
518     memcpy(new_states, active_states, new_count * sizeof(stateblock));
519     }
520    
521     /* Not restarting */
522    
523     else
524     {
525 nigel 93 int length = 1 + LINK_SIZE +
526     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
527 nigel 77 do
528     {
529 nigel 93 ADD_NEW(end_code - start_code + length, 0);
530 nigel 77 end_code += GET(end_code, 1);
531 nigel 93 length = 1 + LINK_SIZE;
532 nigel 77 }
533     while (*end_code == OP_ALT);
534     }
535     }
536    
537     workspace[0] = 0; /* Bit indicating which vector is current */
538    
539     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
540    
541     /* Loop for scanning the subject */
542    
543     ptr = current_subject;
544     for (;;)
545     {
546     int i, j;
547 nigel 91 int clen, dlen;
548     unsigned int c, d;
549 ph10 428 int forced_fail = 0;
550 ph10 462 BOOL could_continue = FALSE;
551 nigel 77
552     /* Make the new state list into the active state list and empty the
553     new state list. */
554    
555     temp_states = active_states;
556     active_states = new_states;
557     new_states = temp_states;
558     active_count = new_count;
559     new_count = 0;
560    
561     workspace[0] ^= 1; /* Remember for the restarting feature */
562     workspace[1] = active_count;
563    
564 ph10 475 #ifdef PCRE_DEBUG
565 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
566     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
567     printf("\"\n");
568    
569     printf("%.*sActive states: ", rlevel*2-2, SP);
570     for (i = 0; i < active_count; i++)
571     printf("%d/%d ", active_states[i].offset, active_states[i].count);
572     printf("\n");
573     #endif
574    
575     /* Set the pointers for adding new states */
576    
577     next_active_state = active_states + active_count;
578     next_new_state = new_states;
579    
580     /* Load the current character from the subject outside the loop, as many
581     different states may want to look at it, and we assume that at least one
582     will. */
583    
584     if (ptr < end_subject)
585     {
586 nigel 93 clen = 1; /* Number of bytes in the character */
587 nigel 77 #ifdef SUPPORT_UTF8
588     if (utf8) { GETCHARLEN(c, ptr, clen); } else
589     #endif /* SUPPORT_UTF8 */
590     c = *ptr;
591     }
592     else
593     {
594 nigel 93 clen = 0; /* This indicates the end of the subject */
595     c = NOTACHAR; /* This value should never actually be used */
596 nigel 77 }
597    
598     /* Scan up the active states and act on each one. The result of an action
599     may be to add more states to the currently active list (e.g. on hitting a
600     parenthesis) or it may be to put states on the new list, for considering
601     when we move the character pointer on. */
602    
603     for (i = 0; i < active_count; i++)
604     {
605     stateblock *current_state = active_states + i;
606     const uschar *code;
607     int state_offset = current_state->offset;
608 ph10 397 int count, codevalue, rrc;
609 nigel 77
610 ph10 475 #ifdef PCRE_DEBUG
611 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
612 nigel 93 if (clen == 0) printf("EOL\n");
613 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
614     else printf("0x%02x\n", c);
615     #endif
616    
617     /* This variable is referred to implicitly in the ADD_xxx macros. */
618    
619     ims = current_state->ims;
620    
621     /* A negative offset is a special case meaning "hold off going to this
622     (negated) state until the number of characters in the data field have
623     been skipped". */
624    
625     if (state_offset < 0)
626     {
627     if (current_state->data > 0)
628     {
629     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
630     ADD_NEW_DATA(state_offset, current_state->count,
631     current_state->data - 1);
632     continue;
633     }
634     else
635     {
636     current_state->offset = state_offset = -state_offset;
637     }
638     }
639    
640 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
641 ph10 439 See the note at the head of this module about the possibility of improving
642     performance here. */
643 nigel 77
644     for (j = 0; j < i; j++)
645     {
646     if (active_states[j].offset == state_offset &&
647     active_states[j].count == current_state->count)
648     {
649     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
650     goto NEXT_ACTIVE_STATE;
651     }
652     }
653    
654     /* The state offset is the offset to the opcode */
655    
656     code = start_code + state_offset;
657     codevalue = *code;
658    
659 ph10 463 /* If this opcode inspects a character, but we are at the end of the
660     subject, remember the fact for use when testing for a partial match. */
661    
662 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
663 ph10 463 could_continue = TRUE;
664 ph10 462
665 nigel 77 /* If this opcode is followed by an inline character, load it. It is
666     tempting to test for the presence of a subject character here, but that
667     is wrong, because sometimes zero repetitions of the subject are
668     permitted.
669    
670     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
671 ph10 178 argument that is not a data character - but is always one byte long. We
672     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
673     this case. To keep the other cases fast, convert these ones to new opcodes.
674     */
675 nigel 77
676     if (coptable[codevalue] > 0)
677     {
678     dlen = 1;
679     #ifdef SUPPORT_UTF8
680     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
681     #endif /* SUPPORT_UTF8 */
682     d = code[coptable[codevalue]];
683     if (codevalue >= OP_TYPESTAR)
684     {
685 nigel 93 switch(d)
686     {
687     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
688     case OP_NOTPROP:
689     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
690     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
691     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
692 ph10 178 case OP_NOT_HSPACE:
693 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
694 ph10 178 case OP_NOT_VSPACE:
695 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
696 nigel 93 default: break;
697     }
698 nigel 77 }
699     }
700     else
701     {
702     dlen = 0; /* Not strictly necessary, but compilers moan */
703 nigel 93 d = NOTACHAR; /* if these variables are not set. */
704 nigel 77 }
705    
706    
707     /* Now process the individual opcodes */
708    
709     switch (codevalue)
710     {
711 ph10 498 /* ========================================================================== */
712     /* These cases are never obeyed. This is a fudge that causes a compile-
713     time error if the vectors coptable or poptable, which are indexed by
714     opcode, are not the correct length. It seems to be the only way to do
715     such a check at compile time, as the sizeof() operator does not work
716     in the C preprocessor. */
717 ph10 507
718 ph10 498 case OP_TABLE_LENGTH:
719 ph10 507 case OP_TABLE_LENGTH +
720 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
721     (sizeof(poptable) == OP_TABLE_LENGTH)):
722 ph10 507 break;
723 nigel 77
724     /* ========================================================================== */
725     /* Reached a closing bracket. If not at the end of the pattern, carry
726     on with the next opcode. Otherwise, unless we have an empty string and
727 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
728 ph10 442 start of the subject, save the match data, shifting up all previous
729 nigel 77 matches so we always have the longest first. */
730    
731     case OP_KET:
732     case OP_KETRMIN:
733     case OP_KETRMAX:
734     if (code != end_code)
735     {
736     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
737     if (codevalue != OP_KET)
738     {
739     ADD_ACTIVE(state_offset - GET(code, 1), 0);
740     }
741     }
742 ph10 461 else
743 nigel 77 {
744 ph10 461 if (ptr > current_subject ||
745 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
746     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
747     current_subject > start_subject + md->start_offset)))
748 nigel 77 {
749 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
750     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
751     match_count = 0;
752     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
753     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
754     if (offsetcount >= 2)
755     {
756     offsets[0] = current_subject - start_subject;
757     offsets[1] = ptr - start_subject;
758     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
759     offsets[1] - offsets[0], current_subject));
760     }
761     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
762     {
763     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
764     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
765     match_count, rlevel*2-2, SP));
766     return match_count;
767     }
768 ph10 461 }
769 nigel 77 }
770     break;
771    
772     /* ========================================================================== */
773     /* These opcodes add to the current list of states without looking
774     at the current character. */
775    
776     /*-----------------------------------------------------------------*/
777     case OP_ALT:
778     do { code += GET(code, 1); } while (*code == OP_ALT);
779     ADD_ACTIVE(code - start_code, 0);
780     break;
781    
782     /*-----------------------------------------------------------------*/
783     case OP_BRA:
784 nigel 93 case OP_SBRA:
785 nigel 77 do
786     {
787     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
788     code += GET(code, 1);
789     }
790     while (*code == OP_ALT);
791     break;
792    
793     /*-----------------------------------------------------------------*/
794 nigel 93 case OP_CBRA:
795     case OP_SCBRA:
796     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
797     code += GET(code, 1);
798     while (*code == OP_ALT)
799     {
800     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
801     code += GET(code, 1);
802     }
803     break;
804    
805     /*-----------------------------------------------------------------*/
806 nigel 77 case OP_BRAZERO:
807     case OP_BRAMINZERO:
808     ADD_ACTIVE(state_offset + 1, 0);
809     code += 1 + GET(code, 2);
810     while (*code == OP_ALT) code += GET(code, 1);
811     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
812     break;
813    
814     /*-----------------------------------------------------------------*/
815 ph10 335 case OP_SKIPZERO:
816     code += 1 + GET(code, 2);
817     while (*code == OP_ALT) code += GET(code, 1);
818     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
819     break;
820    
821     /*-----------------------------------------------------------------*/
822 nigel 77 case OP_CIRC:
823     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
824 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
825     ptr != end_subject &&
826 nigel 93 WAS_NEWLINE(ptr)))
827 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
828     break;
829    
830     /*-----------------------------------------------------------------*/
831     case OP_EOD:
832     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
833     break;
834    
835     /*-----------------------------------------------------------------*/
836     case OP_OPT:
837     ims = code[1];
838     ADD_ACTIVE(state_offset + 2, 0);
839     break;
840    
841     /*-----------------------------------------------------------------*/
842     case OP_SOD:
843     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
844     break;
845    
846     /*-----------------------------------------------------------------*/
847     case OP_SOM:
848     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
849     break;
850    
851    
852     /* ========================================================================== */
853     /* These opcodes inspect the next subject character, and sometimes
854     the previous one as well, but do not have an argument. The variable
855     clen contains the length of the current character and is zero if we are
856     at the end of the subject. */
857    
858     /*-----------------------------------------------------------------*/
859     case OP_ANY:
860 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
861 nigel 77 { ADD_NEW(state_offset + 1, 0); }
862     break;
863    
864     /*-----------------------------------------------------------------*/
865 ph10 341 case OP_ALLANY:
866     if (clen > 0)
867     { ADD_NEW(state_offset + 1, 0); }
868     break;
869    
870     /*-----------------------------------------------------------------*/
871 nigel 77 case OP_EODN:
872 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
873 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
874     break;
875    
876     /*-----------------------------------------------------------------*/
877     case OP_DOLL:
878     if ((md->moptions & PCRE_NOTEOL) == 0)
879     {
880 nigel 91 if (clen == 0 ||
881 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
882 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
883     ))
884 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
885     }
886 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
887 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
888     break;
889    
890     /*-----------------------------------------------------------------*/
891    
892     case OP_DIGIT:
893     case OP_WHITESPACE:
894     case OP_WORDCHAR:
895     if (clen > 0 && c < 256 &&
896     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
897     { ADD_NEW(state_offset + 1, 0); }
898     break;
899    
900     /*-----------------------------------------------------------------*/
901     case OP_NOT_DIGIT:
902     case OP_NOT_WHITESPACE:
903     case OP_NOT_WORDCHAR:
904     if (clen > 0 && (c >= 256 ||
905     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
906     { ADD_NEW(state_offset + 1, 0); }
907     break;
908    
909     /*-----------------------------------------------------------------*/
910     case OP_WORD_BOUNDARY:
911     case OP_NOT_WORD_BOUNDARY:
912     {
913     int left_word, right_word;
914    
915     if (ptr > start_subject)
916     {
917     const uschar *temp = ptr - 1;
918 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
919 nigel 77 #ifdef SUPPORT_UTF8
920     if (utf8) BACKCHAR(temp);
921     #endif
922     GETCHARTEST(d, temp);
923     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
924     }
925     else left_word = 0;
926    
927 ph10 461 if (clen > 0)
928 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
929 ph10 463 else right_word = 0;
930 nigel 77
931     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
932     { ADD_ACTIVE(state_offset + 1, 0); }
933     }
934     break;
935    
936    
937     /*-----------------------------------------------------------------*/
938     /* Check the next character by Unicode property. We will get here only
939     if the support is in the binary; otherwise a compile-time error occurs.
940     */
941    
942 ph10 151 #ifdef SUPPORT_UCP
943 nigel 77 case OP_PROP:
944     case OP_NOTPROP:
945     if (clen > 0)
946     {
947 nigel 87 BOOL OK;
948 ph10 349 const ucd_record * prop = GET_UCD(c);
949 nigel 87 switch(code[1])
950 nigel 77 {
951 nigel 87 case PT_ANY:
952     OK = TRUE;
953     break;
954    
955     case PT_LAMP:
956 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
957 nigel 87 break;
958    
959     case PT_GC:
960 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
961 nigel 87 break;
962    
963     case PT_PC:
964 ph10 349 OK = prop->chartype == code[2];
965 nigel 87 break;
966    
967     case PT_SC:
968 ph10 349 OK = prop->script == code[2];
969 nigel 87 break;
970    
971     /* Should never occur, but keep compilers from grumbling. */
972    
973     default:
974     OK = codevalue != OP_PROP;
975     break;
976 nigel 77 }
977 nigel 87
978     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
979 nigel 77 }
980     break;
981     #endif
982    
983    
984    
985     /* ========================================================================== */
986     /* These opcodes likewise inspect the subject character, but have an
987     argument that is not a data character. It is one of these opcodes:
988 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
989     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
990 nigel 77
991     case OP_TYPEPLUS:
992     case OP_TYPEMINPLUS:
993 nigel 93 case OP_TYPEPOSPLUS:
994 nigel 77 count = current_state->count; /* Already matched */
995     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
996     if (clen > 0)
997     {
998     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
999     (c < 256 &&
1000 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1001 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1002     {
1003 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1004     {
1005     active_count--; /* Remove non-match possibility */
1006     next_active_state--;
1007     }
1008 nigel 77 count++;
1009     ADD_NEW(state_offset, count);
1010     }
1011     }
1012     break;
1013    
1014     /*-----------------------------------------------------------------*/
1015     case OP_TYPEQUERY:
1016     case OP_TYPEMINQUERY:
1017 nigel 93 case OP_TYPEPOSQUERY:
1018 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1019     if (clen > 0)
1020     {
1021     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1022     (c < 256 &&
1023 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1024 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1025     {
1026 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1027     {
1028     active_count--; /* Remove non-match possibility */
1029     next_active_state--;
1030     }
1031 nigel 77 ADD_NEW(state_offset + 2, 0);
1032     }
1033     }
1034     break;
1035    
1036     /*-----------------------------------------------------------------*/
1037     case OP_TYPESTAR:
1038     case OP_TYPEMINSTAR:
1039 nigel 93 case OP_TYPEPOSSTAR:
1040 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1041     if (clen > 0)
1042     {
1043     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1044     (c < 256 &&
1045 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1046 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1047     {
1048 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1049     {
1050     active_count--; /* Remove non-match possibility */
1051     next_active_state--;
1052     }
1053 nigel 77 ADD_NEW(state_offset, 0);
1054     }
1055     }
1056     break;
1057    
1058     /*-----------------------------------------------------------------*/
1059     case OP_TYPEEXACT:
1060 nigel 93 count = current_state->count; /* Number already matched */
1061     if (clen > 0)
1062     {
1063     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1064     (c < 256 &&
1065 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1066 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1067     {
1068     if (++count >= GET2(code, 1))
1069     { ADD_NEW(state_offset + 4, 0); }
1070     else
1071     { ADD_NEW(state_offset, count); }
1072     }
1073     }
1074     break;
1075    
1076     /*-----------------------------------------------------------------*/
1077 nigel 77 case OP_TYPEUPTO:
1078     case OP_TYPEMINUPTO:
1079 nigel 93 case OP_TYPEPOSUPTO:
1080     ADD_ACTIVE(state_offset + 4, 0);
1081 nigel 77 count = current_state->count; /* Number already matched */
1082     if (clen > 0)
1083     {
1084     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1085     (c < 256 &&
1086 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1087 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1088     {
1089 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1090     {
1091     active_count--; /* Remove non-match possibility */
1092     next_active_state--;
1093     }
1094 nigel 77 if (++count >= GET2(code, 1))
1095     { ADD_NEW(state_offset + 4, 0); }
1096     else
1097     { ADD_NEW(state_offset, count); }
1098     }
1099     }
1100     break;
1101    
1102     /* ========================================================================== */
1103     /* These are virtual opcodes that are used when something like
1104 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a space type
1105     such as OP_HSPACE or OP_VSPACE as its argument. It keeps the code above
1106     fast for the other cases. The argument is in the d variable. */
1107 nigel 77
1108 ph10 151 #ifdef SUPPORT_UCP
1109 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1110     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1111 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1112 nigel 77 count = current_state->count; /* Already matched */
1113 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1114 nigel 77 if (clen > 0)
1115     {
1116 nigel 87 BOOL OK;
1117 ph10 349 const ucd_record * prop = GET_UCD(c);
1118 nigel 87 switch(code[2])
1119     {
1120     case PT_ANY:
1121     OK = TRUE;
1122     break;
1123    
1124     case PT_LAMP:
1125 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1126 nigel 87 break;
1127    
1128     case PT_GC:
1129 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1130 nigel 87 break;
1131    
1132     case PT_PC:
1133 ph10 349 OK = prop->chartype == code[3];
1134 nigel 87 break;
1135    
1136     case PT_SC:
1137 ph10 349 OK = prop->script == code[3];
1138 nigel 87 break;
1139    
1140     /* Should never occur, but keep compilers from grumbling. */
1141    
1142     default:
1143     OK = codevalue != OP_PROP;
1144     break;
1145     }
1146    
1147 nigel 93 if (OK == (d == OP_PROP))
1148     {
1149     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1150     {
1151     active_count--; /* Remove non-match possibility */
1152     next_active_state--;
1153     }
1154     count++;
1155     ADD_NEW(state_offset, count);
1156     }
1157 nigel 77 }
1158     break;
1159    
1160     /*-----------------------------------------------------------------*/
1161     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1162     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1163 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1164 nigel 77 count = current_state->count; /* Already matched */
1165     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1166 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1167 nigel 77 {
1168     const uschar *nptr = ptr + clen;
1169     int ncount = 0;
1170 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1171     {
1172     active_count--; /* Remove non-match possibility */
1173     next_active_state--;
1174     }
1175 nigel 77 while (nptr < end_subject)
1176     {
1177     int nd;
1178     int ndlen = 1;
1179     GETCHARLEN(nd, nptr, ndlen);
1180 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1181 nigel 77 ncount++;
1182     nptr += ndlen;
1183     }
1184     count++;
1185     ADD_NEW_DATA(-state_offset, count, ncount);
1186     }
1187     break;
1188 ph10 151 #endif
1189 nigel 77
1190     /*-----------------------------------------------------------------*/
1191 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1192     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1193     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1194     count = current_state->count; /* Already matched */
1195     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1196     if (clen > 0)
1197     {
1198     int ncount = 0;
1199     switch (c)
1200     {
1201     case 0x000b:
1202     case 0x000c:
1203     case 0x0085:
1204     case 0x2028:
1205     case 0x2029:
1206 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1207     goto ANYNL01;
1208    
1209     case 0x000d:
1210     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1211     /* Fall through */
1212    
1213     ANYNL01:
1214     case 0x000a:
1215 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1216     {
1217     active_count--; /* Remove non-match possibility */
1218     next_active_state--;
1219     }
1220     count++;
1221     ADD_NEW_DATA(-state_offset, count, ncount);
1222     break;
1223 ph10 231
1224 nigel 93 default:
1225     break;
1226     }
1227     }
1228     break;
1229    
1230     /*-----------------------------------------------------------------*/
1231 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1232     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1233     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1234     count = current_state->count; /* Already matched */
1235     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1236     if (clen > 0)
1237     {
1238 ph10 182 BOOL OK;
1239 ph10 178 switch (c)
1240     {
1241     case 0x000a:
1242     case 0x000b:
1243     case 0x000c:
1244     case 0x000d:
1245     case 0x0085:
1246     case 0x2028:
1247     case 0x2029:
1248     OK = TRUE;
1249 ph10 182 break;
1250 ph10 178
1251     default:
1252     OK = FALSE;
1253 ph10 182 break;
1254 ph10 178 }
1255    
1256     if (OK == (d == OP_VSPACE))
1257 ph10 182 {
1258 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1259     {
1260     active_count--; /* Remove non-match possibility */
1261     next_active_state--;
1262     }
1263     count++;
1264     ADD_NEW_DATA(-state_offset, count, 0);
1265     }
1266     }
1267     break;
1268    
1269     /*-----------------------------------------------------------------*/
1270     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1271     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1272     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1273     count = current_state->count; /* Already matched */
1274     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1275     if (clen > 0)
1276     {
1277 ph10 182 BOOL OK;
1278 ph10 178 switch (c)
1279     {
1280     case 0x09: /* HT */
1281     case 0x20: /* SPACE */
1282     case 0xa0: /* NBSP */
1283     case 0x1680: /* OGHAM SPACE MARK */
1284     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1285     case 0x2000: /* EN QUAD */
1286     case 0x2001: /* EM QUAD */
1287     case 0x2002: /* EN SPACE */
1288     case 0x2003: /* EM SPACE */
1289     case 0x2004: /* THREE-PER-EM SPACE */
1290     case 0x2005: /* FOUR-PER-EM SPACE */
1291     case 0x2006: /* SIX-PER-EM SPACE */
1292     case 0x2007: /* FIGURE SPACE */
1293     case 0x2008: /* PUNCTUATION SPACE */
1294     case 0x2009: /* THIN SPACE */
1295     case 0x200A: /* HAIR SPACE */
1296     case 0x202f: /* NARROW NO-BREAK SPACE */
1297     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1298     case 0x3000: /* IDEOGRAPHIC SPACE */
1299     OK = TRUE;
1300     break;
1301 ph10 182
1302 ph10 178 default:
1303     OK = FALSE;
1304     break;
1305     }
1306 ph10 182
1307 ph10 178 if (OK == (d == OP_HSPACE))
1308 ph10 182 {
1309 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1310     {
1311     active_count--; /* Remove non-match possibility */
1312     next_active_state--;
1313     }
1314     count++;
1315     ADD_NEW_DATA(-state_offset, count, 0);
1316     }
1317     }
1318     break;
1319    
1320     /*-----------------------------------------------------------------*/
1321 ph10 151 #ifdef SUPPORT_UCP
1322 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1323     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1324 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1325 nigel 87 count = 4;
1326 nigel 77 goto QS1;
1327    
1328     case OP_PROP_EXTRA + OP_TYPESTAR:
1329     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1330 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1331 nigel 77 count = 0;
1332    
1333     QS1:
1334    
1335 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1336 nigel 77 if (clen > 0)
1337     {
1338 nigel 87 BOOL OK;
1339 ph10 349 const ucd_record * prop = GET_UCD(c);
1340 nigel 87 switch(code[2])
1341     {
1342     case PT_ANY:
1343     OK = TRUE;
1344     break;
1345    
1346     case PT_LAMP:
1347 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1348 nigel 87 break;
1349    
1350     case PT_GC:
1351 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1352 nigel 87 break;
1353    
1354     case PT_PC:
1355 ph10 349 OK = prop->chartype == code[3];
1356 nigel 87 break;
1357    
1358     case PT_SC:
1359 ph10 349 OK = prop->script == code[3];
1360 nigel 87 break;
1361    
1362     /* Should never occur, but keep compilers from grumbling. */
1363    
1364     default:
1365     OK = codevalue != OP_PROP;
1366     break;
1367     }
1368    
1369 nigel 93 if (OK == (d == OP_PROP))
1370     {
1371     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1372     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1373     {
1374     active_count--; /* Remove non-match possibility */
1375     next_active_state--;
1376     }
1377     ADD_NEW(state_offset + count, 0);
1378     }
1379 nigel 77 }
1380     break;
1381    
1382     /*-----------------------------------------------------------------*/
1383     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1384     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1385 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1386 nigel 77 count = 2;
1387     goto QS2;
1388    
1389     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1390     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1391 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1392 nigel 77 count = 0;
1393    
1394     QS2:
1395    
1396     ADD_ACTIVE(state_offset + 2, 0);
1397 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1398 nigel 77 {
1399     const uschar *nptr = ptr + clen;
1400     int ncount = 0;
1401 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1402     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1403     {
1404     active_count--; /* Remove non-match possibility */
1405     next_active_state--;
1406     }
1407 nigel 77 while (nptr < end_subject)
1408     {
1409     int nd;
1410     int ndlen = 1;
1411     GETCHARLEN(nd, nptr, ndlen);
1412 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1413 nigel 77 ncount++;
1414     nptr += ndlen;
1415     }
1416     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1417     }
1418     break;
1419 ph10 151 #endif
1420 nigel 77
1421     /*-----------------------------------------------------------------*/
1422 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1423     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1424     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1425     count = 2;
1426     goto QS3;
1427    
1428     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1429     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1430     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1431     count = 0;
1432    
1433     QS3:
1434     ADD_ACTIVE(state_offset + 2, 0);
1435     if (clen > 0)
1436     {
1437     int ncount = 0;
1438     switch (c)
1439     {
1440     case 0x000b:
1441     case 0x000c:
1442     case 0x0085:
1443     case 0x2028:
1444     case 0x2029:
1445 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1446     goto ANYNL02;
1447    
1448     case 0x000d:
1449     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1450     /* Fall through */
1451    
1452     ANYNL02:
1453     case 0x000a:
1454 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1455     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1456     {
1457     active_count--; /* Remove non-match possibility */
1458     next_active_state--;
1459     }
1460     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1461     break;
1462 ph10 231
1463 nigel 93 default:
1464     break;
1465     }
1466     }
1467     break;
1468    
1469     /*-----------------------------------------------------------------*/
1470 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1471     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1472     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1473     count = 2;
1474     goto QS4;
1475    
1476     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1477     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1478     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1479     count = 0;
1480    
1481     QS4:
1482     ADD_ACTIVE(state_offset + 2, 0);
1483     if (clen > 0)
1484     {
1485 ph10 182 BOOL OK;
1486 ph10 178 switch (c)
1487     {
1488     case 0x000a:
1489     case 0x000b:
1490     case 0x000c:
1491     case 0x000d:
1492     case 0x0085:
1493     case 0x2028:
1494     case 0x2029:
1495     OK = TRUE;
1496     break;
1497 ph10 182
1498 ph10 178 default:
1499     OK = FALSE;
1500     break;
1501     }
1502     if (OK == (d == OP_VSPACE))
1503 ph10 182 {
1504 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1505     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1506     {
1507     active_count--; /* Remove non-match possibility */
1508     next_active_state--;
1509     }
1510     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1511     }
1512     }
1513     break;
1514    
1515     /*-----------------------------------------------------------------*/
1516     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1517     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1518     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1519     count = 2;
1520     goto QS5;
1521    
1522     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1523     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1524     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1525     count = 0;
1526    
1527     QS5:
1528     ADD_ACTIVE(state_offset + 2, 0);
1529     if (clen > 0)
1530     {
1531 ph10 182 BOOL OK;
1532 ph10 178 switch (c)
1533     {
1534     case 0x09: /* HT */
1535     case 0x20: /* SPACE */
1536     case 0xa0: /* NBSP */
1537     case 0x1680: /* OGHAM SPACE MARK */
1538     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1539     case 0x2000: /* EN QUAD */
1540     case 0x2001: /* EM QUAD */
1541     case 0x2002: /* EN SPACE */
1542     case 0x2003: /* EM SPACE */
1543     case 0x2004: /* THREE-PER-EM SPACE */
1544     case 0x2005: /* FOUR-PER-EM SPACE */
1545     case 0x2006: /* SIX-PER-EM SPACE */
1546     case 0x2007: /* FIGURE SPACE */
1547     case 0x2008: /* PUNCTUATION SPACE */
1548     case 0x2009: /* THIN SPACE */
1549     case 0x200A: /* HAIR SPACE */
1550     case 0x202f: /* NARROW NO-BREAK SPACE */
1551     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1552     case 0x3000: /* IDEOGRAPHIC SPACE */
1553     OK = TRUE;
1554     break;
1555 ph10 182
1556 ph10 178 default:
1557     OK = FALSE;
1558     break;
1559     }
1560 ph10 182
1561 ph10 178 if (OK == (d == OP_HSPACE))
1562 ph10 182 {
1563 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1564     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1565     {
1566     active_count--; /* Remove non-match possibility */
1567     next_active_state--;
1568     }
1569     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1570     }
1571     }
1572     break;
1573    
1574     /*-----------------------------------------------------------------*/
1575 ph10 151 #ifdef SUPPORT_UCP
1576 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1577     case OP_PROP_EXTRA + OP_TYPEUPTO:
1578     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1579 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1580 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1581 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1582 nigel 77 count = current_state->count; /* Number already matched */
1583     if (clen > 0)
1584     {
1585 nigel 87 BOOL OK;
1586 ph10 349 const ucd_record * prop = GET_UCD(c);
1587 nigel 87 switch(code[4])
1588 nigel 77 {
1589 nigel 87 case PT_ANY:
1590     OK = TRUE;
1591     break;
1592    
1593     case PT_LAMP:
1594 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1595 nigel 87 break;
1596    
1597     case PT_GC:
1598 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1599 nigel 87 break;
1600    
1601     case PT_PC:
1602 ph10 349 OK = prop->chartype == code[5];
1603 nigel 87 break;
1604    
1605     case PT_SC:
1606 ph10 349 OK = prop->script == code[5];
1607 nigel 87 break;
1608    
1609     /* Should never occur, but keep compilers from grumbling. */
1610    
1611     default:
1612     OK = codevalue != OP_PROP;
1613     break;
1614     }
1615    
1616     if (OK == (d == OP_PROP))
1617     {
1618 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1619     {
1620     active_count--; /* Remove non-match possibility */
1621     next_active_state--;
1622     }
1623 nigel 77 if (++count >= GET2(code, 1))
1624 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1625 nigel 77 else
1626     { ADD_NEW(state_offset, count); }
1627     }
1628     }
1629     break;
1630    
1631     /*-----------------------------------------------------------------*/
1632     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1633     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1634     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1635 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1636 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1637     { ADD_ACTIVE(state_offset + 4, 0); }
1638     count = current_state->count; /* Number already matched */
1639 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1640 nigel 77 {
1641     const uschar *nptr = ptr + clen;
1642     int ncount = 0;
1643 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1644     {
1645     active_count--; /* Remove non-match possibility */
1646     next_active_state--;
1647     }
1648 nigel 77 while (nptr < end_subject)
1649     {
1650     int nd;
1651     int ndlen = 1;
1652     GETCHARLEN(nd, nptr, ndlen);
1653 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1654 nigel 77 ncount++;
1655     nptr += ndlen;
1656     }
1657     if (++count >= GET2(code, 1))
1658     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1659     else
1660     { ADD_NEW_DATA(-state_offset, count, ncount); }
1661     }
1662     break;
1663 ph10 151 #endif
1664 nigel 77
1665 nigel 93 /*-----------------------------------------------------------------*/
1666     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1667     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1668     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1669     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1670     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1671     { ADD_ACTIVE(state_offset + 4, 0); }
1672     count = current_state->count; /* Number already matched */
1673     if (clen > 0)
1674     {
1675     int ncount = 0;
1676     switch (c)
1677     {
1678     case 0x000b:
1679     case 0x000c:
1680     case 0x0085:
1681     case 0x2028:
1682     case 0x2029:
1683 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1684     goto ANYNL03;
1685    
1686     case 0x000d:
1687     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1688     /* Fall through */
1689    
1690     ANYNL03:
1691     case 0x000a:
1692 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1693     {
1694     active_count--; /* Remove non-match possibility */
1695     next_active_state--;
1696     }
1697     if (++count >= GET2(code, 1))
1698     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1699     else
1700     { ADD_NEW_DATA(-state_offset, count, ncount); }
1701     break;
1702 ph10 231
1703 nigel 93 default:
1704     break;
1705     }
1706     }
1707     break;
1708    
1709 ph10 178 /*-----------------------------------------------------------------*/
1710     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1711     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1712     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1713     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1714     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1715     { ADD_ACTIVE(state_offset + 4, 0); }
1716     count = current_state->count; /* Number already matched */
1717     if (clen > 0)
1718     {
1719 ph10 182 BOOL OK;
1720 ph10 178 switch (c)
1721     {
1722     case 0x000a:
1723     case 0x000b:
1724     case 0x000c:
1725     case 0x000d:
1726     case 0x0085:
1727     case 0x2028:
1728     case 0x2029:
1729     OK = TRUE;
1730     break;
1731 ph10 182
1732 ph10 178 default:
1733     OK = FALSE;
1734     }
1735 ph10 182
1736 ph10 178 if (OK == (d == OP_VSPACE))
1737 ph10 182 {
1738 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1739     {
1740     active_count--; /* Remove non-match possibility */
1741     next_active_state--;
1742     }
1743     if (++count >= GET2(code, 1))
1744     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1745     else
1746     { ADD_NEW_DATA(-state_offset, count, 0); }
1747     }
1748     }
1749     break;
1750    
1751     /*-----------------------------------------------------------------*/
1752     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1753     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1754     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1755     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1756     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1757     { ADD_ACTIVE(state_offset + 4, 0); }
1758     count = current_state->count; /* Number already matched */
1759     if (clen > 0)
1760     {
1761 ph10 182 BOOL OK;
1762 ph10 178 switch (c)
1763     {
1764     case 0x09: /* HT */
1765     case 0x20: /* SPACE */
1766     case 0xa0: /* NBSP */
1767     case 0x1680: /* OGHAM SPACE MARK */
1768     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1769     case 0x2000: /* EN QUAD */
1770     case 0x2001: /* EM QUAD */
1771     case 0x2002: /* EN SPACE */
1772     case 0x2003: /* EM SPACE */
1773     case 0x2004: /* THREE-PER-EM SPACE */
1774     case 0x2005: /* FOUR-PER-EM SPACE */
1775     case 0x2006: /* SIX-PER-EM SPACE */
1776     case 0x2007: /* FIGURE SPACE */
1777     case 0x2008: /* PUNCTUATION SPACE */
1778     case 0x2009: /* THIN SPACE */
1779     case 0x200A: /* HAIR SPACE */
1780     case 0x202f: /* NARROW NO-BREAK SPACE */
1781     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1782     case 0x3000: /* IDEOGRAPHIC SPACE */
1783     OK = TRUE;
1784     break;
1785 ph10 182
1786 ph10 178 default:
1787     OK = FALSE;
1788     break;
1789     }
1790 ph10 182
1791 ph10 178 if (OK == (d == OP_HSPACE))
1792 ph10 182 {
1793 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1794     {
1795     active_count--; /* Remove non-match possibility */
1796     next_active_state--;
1797     }
1798     if (++count >= GET2(code, 1))
1799     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1800     else
1801     { ADD_NEW_DATA(-state_offset, count, 0); }
1802     }
1803     }
1804     break;
1805    
1806 nigel 77 /* ========================================================================== */
1807     /* These opcodes are followed by a character that is usually compared
1808     to the current subject character; it is loaded into d. We still get
1809     here even if there is no subject character, because in some cases zero
1810     repetitions are permitted. */
1811    
1812     /*-----------------------------------------------------------------*/
1813     case OP_CHAR:
1814     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1815     break;
1816    
1817     /*-----------------------------------------------------------------*/
1818     case OP_CHARNC:
1819     if (clen == 0) break;
1820    
1821     #ifdef SUPPORT_UTF8
1822     if (utf8)
1823     {
1824     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1825     {
1826 nigel 93 unsigned int othercase;
1827 nigel 77 if (c < 128) othercase = fcc[c]; else
1828    
1829     /* If we have Unicode property support, we can use it to test the
1830 nigel 87 other case of the character. */
1831 nigel 77
1832     #ifdef SUPPORT_UCP
1833 ph10 349 othercase = UCD_OTHERCASE(c);
1834 nigel 87 #else
1835 nigel 93 othercase = NOTACHAR;
1836 nigel 77 #endif
1837    
1838     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1839     }
1840     }
1841     else
1842     #endif /* SUPPORT_UTF8 */
1843    
1844     /* Non-UTF-8 mode */
1845     {
1846     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1847     }
1848     break;
1849    
1850    
1851     #ifdef SUPPORT_UCP
1852     /*-----------------------------------------------------------------*/
1853     /* This is a tricky one because it can match more than one character.
1854     Find out how many characters to skip, and then set up a negative state
1855     to wait for them to pass before continuing. */
1856    
1857     case OP_EXTUNI:
1858 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1859 nigel 77 {
1860     const uschar *nptr = ptr + clen;
1861     int ncount = 0;
1862     while (nptr < end_subject)
1863     {
1864     int nclen = 1;
1865     GETCHARLEN(c, nptr, nclen);
1866 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1867 nigel 77 ncount++;
1868     nptr += nclen;
1869     }
1870     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1871     }
1872     break;
1873     #endif
1874    
1875     /*-----------------------------------------------------------------*/
1876 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
1877     character (when CR is followed by LF). In this case, set up a negative
1878     state to wait for one character to pass before continuing. */
1879    
1880     case OP_ANYNL:
1881     if (clen > 0) switch(c)
1882     {
1883     case 0x000b:
1884     case 0x000c:
1885     case 0x0085:
1886     case 0x2028:
1887     case 0x2029:
1888 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1889    
1890     case 0x000a:
1891 nigel 93 ADD_NEW(state_offset + 1, 0);
1892     break;
1893 ph10 231
1894 nigel 93 case 0x000d:
1895     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1896     {
1897     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1898     }
1899     else
1900     {
1901     ADD_NEW(state_offset + 1, 0);
1902     }
1903     break;
1904     }
1905     break;
1906    
1907     /*-----------------------------------------------------------------*/
1908 ph10 178 case OP_NOT_VSPACE:
1909     if (clen > 0) switch(c)
1910     {
1911     case 0x000a:
1912     case 0x000b:
1913     case 0x000c:
1914     case 0x000d:
1915     case 0x0085:
1916     case 0x2028:
1917     case 0x2029:
1918     break;
1919 ph10 182
1920     default:
1921 ph10 178 ADD_NEW(state_offset + 1, 0);
1922     break;
1923     }
1924     break;
1925    
1926     /*-----------------------------------------------------------------*/
1927     case OP_VSPACE:
1928     if (clen > 0) switch(c)
1929     {
1930     case 0x000a:
1931     case 0x000b:
1932     case 0x000c:
1933     case 0x000d:
1934     case 0x0085:
1935     case 0x2028:
1936     case 0x2029:
1937     ADD_NEW(state_offset + 1, 0);
1938     break;
1939 ph10 182
1940 ph10 178 default: break;
1941     }
1942     break;
1943    
1944     /*-----------------------------------------------------------------*/
1945     case OP_NOT_HSPACE:
1946     if (clen > 0) switch(c)
1947     {
1948     case 0x09: /* HT */
1949     case 0x20: /* SPACE */
1950     case 0xa0: /* NBSP */
1951     case 0x1680: /* OGHAM SPACE MARK */
1952     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1953     case 0x2000: /* EN QUAD */
1954     case 0x2001: /* EM QUAD */
1955     case 0x2002: /* EN SPACE */
1956     case 0x2003: /* EM SPACE */
1957     case 0x2004: /* THREE-PER-EM SPACE */
1958     case 0x2005: /* FOUR-PER-EM SPACE */
1959     case 0x2006: /* SIX-PER-EM SPACE */
1960     case 0x2007: /* FIGURE SPACE */
1961     case 0x2008: /* PUNCTUATION SPACE */
1962     case 0x2009: /* THIN SPACE */
1963     case 0x200A: /* HAIR SPACE */
1964     case 0x202f: /* NARROW NO-BREAK SPACE */
1965     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1966     case 0x3000: /* IDEOGRAPHIC SPACE */
1967     break;
1968 ph10 182
1969     default:
1970 ph10 178 ADD_NEW(state_offset + 1, 0);
1971     break;
1972     }
1973     break;
1974    
1975     /*-----------------------------------------------------------------*/
1976     case OP_HSPACE:
1977     if (clen > 0) switch(c)
1978     {
1979     case 0x09: /* HT */
1980     case 0x20: /* SPACE */
1981     case 0xa0: /* NBSP */
1982     case 0x1680: /* OGHAM SPACE MARK */
1983     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1984     case 0x2000: /* EN QUAD */
1985     case 0x2001: /* EM QUAD */
1986     case 0x2002: /* EN SPACE */
1987     case 0x2003: /* EM SPACE */
1988     case 0x2004: /* THREE-PER-EM SPACE */
1989     case 0x2005: /* FOUR-PER-EM SPACE */
1990     case 0x2006: /* SIX-PER-EM SPACE */
1991     case 0x2007: /* FIGURE SPACE */
1992     case 0x2008: /* PUNCTUATION SPACE */
1993     case 0x2009: /* THIN SPACE */
1994     case 0x200A: /* HAIR SPACE */
1995     case 0x202f: /* NARROW NO-BREAK SPACE */
1996     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1997     case 0x3000: /* IDEOGRAPHIC SPACE */
1998     ADD_NEW(state_offset + 1, 0);
1999     break;
2000     }
2001     break;
2002    
2003     /*-----------------------------------------------------------------*/
2004 nigel 77 /* Match a negated single character. This is only used for one-byte
2005     characters, that is, we know that d < 256. The character we are
2006     checking (c) can be multibyte. */
2007    
2008     case OP_NOT:
2009     if (clen > 0)
2010     {
2011 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2012 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2013     }
2014     break;
2015    
2016     /*-----------------------------------------------------------------*/
2017     case OP_PLUS:
2018     case OP_MINPLUS:
2019 nigel 93 case OP_POSPLUS:
2020 nigel 77 case OP_NOTPLUS:
2021     case OP_NOTMINPLUS:
2022 nigel 93 case OP_NOTPOSPLUS:
2023 nigel 77 count = current_state->count; /* Already matched */
2024     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2025     if (clen > 0)
2026     {
2027 nigel 93 unsigned int otherd = NOTACHAR;
2028 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2029     {
2030     #ifdef SUPPORT_UTF8
2031 nigel 87 if (utf8 && d >= 128)
2032 nigel 77 {
2033     #ifdef SUPPORT_UCP
2034 ph10 349 otherd = UCD_OTHERCASE(d);
2035 nigel 77 #endif /* SUPPORT_UCP */
2036     }
2037     else
2038     #endif /* SUPPORT_UTF8 */
2039     otherd = fcc[d];
2040     }
2041     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2042 nigel 93 {
2043     if (count > 0 &&
2044     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2045     {
2046     active_count--; /* Remove non-match possibility */
2047     next_active_state--;
2048     }
2049     count++;
2050     ADD_NEW(state_offset, count);
2051     }
2052 nigel 77 }
2053     break;
2054    
2055     /*-----------------------------------------------------------------*/
2056     case OP_QUERY:
2057     case OP_MINQUERY:
2058 nigel 93 case OP_POSQUERY:
2059 nigel 77 case OP_NOTQUERY:
2060     case OP_NOTMINQUERY:
2061 nigel 93 case OP_NOTPOSQUERY:
2062 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2063     if (clen > 0)
2064     {
2065 nigel 93 unsigned int otherd = NOTACHAR;
2066 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2067 nigel 77 {
2068     #ifdef SUPPORT_UTF8
2069 nigel 87 if (utf8 && d >= 128)
2070 nigel 77 {
2071     #ifdef SUPPORT_UCP
2072 ph10 349 otherd = UCD_OTHERCASE(d);
2073 nigel 77 #endif /* SUPPORT_UCP */
2074     }
2075     else
2076     #endif /* SUPPORT_UTF8 */
2077     otherd = fcc[d];
2078     }
2079     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2080 nigel 93 {
2081     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2082     {
2083     active_count--; /* Remove non-match possibility */
2084     next_active_state--;
2085     }
2086     ADD_NEW(state_offset + dlen + 1, 0);
2087     }
2088 nigel 77 }
2089     break;
2090    
2091     /*-----------------------------------------------------------------*/
2092     case OP_STAR:
2093     case OP_MINSTAR:
2094 nigel 93 case OP_POSSTAR:
2095 nigel 77 case OP_NOTSTAR:
2096     case OP_NOTMINSTAR:
2097 nigel 93 case OP_NOTPOSSTAR:
2098 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2099     if (clen > 0)
2100     {
2101 nigel 93 unsigned int otherd = NOTACHAR;
2102 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2103 nigel 77 {
2104     #ifdef SUPPORT_UTF8
2105 nigel 87 if (utf8 && d >= 128)
2106 nigel 77 {
2107     #ifdef SUPPORT_UCP
2108 ph10 349 otherd = UCD_OTHERCASE(d);
2109 nigel 77 #endif /* SUPPORT_UCP */
2110     }
2111     else
2112     #endif /* SUPPORT_UTF8 */
2113     otherd = fcc[d];
2114     }
2115     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2116 nigel 93 {
2117     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2118     {
2119     active_count--; /* Remove non-match possibility */
2120     next_active_state--;
2121     }
2122     ADD_NEW(state_offset, 0);
2123     }
2124 nigel 77 }
2125     break;
2126    
2127     /*-----------------------------------------------------------------*/
2128     case OP_EXACT:
2129 nigel 93 case OP_NOTEXACT:
2130     count = current_state->count; /* Number already matched */
2131     if (clen > 0)
2132     {
2133     unsigned int otherd = NOTACHAR;
2134     if ((ims & PCRE_CASELESS) != 0)
2135     {
2136     #ifdef SUPPORT_UTF8
2137     if (utf8 && d >= 128)
2138     {
2139     #ifdef SUPPORT_UCP
2140 ph10 349 otherd = UCD_OTHERCASE(d);
2141 nigel 93 #endif /* SUPPORT_UCP */
2142     }
2143     else
2144     #endif /* SUPPORT_UTF8 */
2145     otherd = fcc[d];
2146     }
2147     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2148     {
2149     if (++count >= GET2(code, 1))
2150     { ADD_NEW(state_offset + dlen + 3, 0); }
2151     else
2152     { ADD_NEW(state_offset, count); }
2153     }
2154     }
2155     break;
2156    
2157     /*-----------------------------------------------------------------*/
2158 nigel 77 case OP_UPTO:
2159     case OP_MINUPTO:
2160 nigel 93 case OP_POSUPTO:
2161 nigel 77 case OP_NOTUPTO:
2162     case OP_NOTMINUPTO:
2163 nigel 93 case OP_NOTPOSUPTO:
2164     ADD_ACTIVE(state_offset + dlen + 3, 0);
2165 nigel 77 count = current_state->count; /* Number already matched */
2166     if (clen > 0)
2167     {
2168 nigel 93 unsigned int otherd = NOTACHAR;
2169 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2170     {
2171     #ifdef SUPPORT_UTF8
2172 nigel 87 if (utf8 && d >= 128)
2173 nigel 77 {
2174     #ifdef SUPPORT_UCP
2175 ph10 349 otherd = UCD_OTHERCASE(d);
2176 nigel 77 #endif /* SUPPORT_UCP */
2177     }
2178     else
2179     #endif /* SUPPORT_UTF8 */
2180     otherd = fcc[d];
2181     }
2182     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2183     {
2184 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2185     {
2186     active_count--; /* Remove non-match possibility */
2187     next_active_state--;
2188     }
2189 nigel 77 if (++count >= GET2(code, 1))
2190     { ADD_NEW(state_offset + dlen + 3, 0); }
2191     else
2192     { ADD_NEW(state_offset, count); }
2193     }
2194     }
2195     break;
2196    
2197    
2198     /* ========================================================================== */
2199     /* These are the class-handling opcodes */
2200    
2201     case OP_CLASS:
2202     case OP_NCLASS:
2203     case OP_XCLASS:
2204     {
2205     BOOL isinclass = FALSE;
2206     int next_state_offset;
2207     const uschar *ecode;
2208    
2209     /* For a simple class, there is always just a 32-byte table, and we
2210     can set isinclass from it. */
2211    
2212     if (codevalue != OP_XCLASS)
2213     {
2214     ecode = code + 33;
2215     if (clen > 0)
2216     {
2217     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2218     ((code[1 + c/8] & (1 << (c&7))) != 0);
2219     }
2220     }
2221    
2222     /* An extended class may have a table or a list of single characters,
2223     ranges, or both, and it may be positive or negative. There's a
2224     function that sorts all this out. */
2225    
2226     else
2227     {
2228     ecode = code + GET(code, 1);
2229     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2230     }
2231    
2232     /* At this point, isinclass is set for all kinds of class, and ecode
2233     points to the byte after the end of the class. If there is a
2234     quantifier, this is where it will be. */
2235    
2236     next_state_offset = ecode - start_code;
2237    
2238     switch (*ecode)
2239     {
2240     case OP_CRSTAR:
2241     case OP_CRMINSTAR:
2242     ADD_ACTIVE(next_state_offset + 1, 0);
2243     if (isinclass) { ADD_NEW(state_offset, 0); }
2244     break;
2245    
2246     case OP_CRPLUS:
2247     case OP_CRMINPLUS:
2248     count = current_state->count; /* Already matched */
2249     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2250     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2251     break;
2252    
2253     case OP_CRQUERY:
2254     case OP_CRMINQUERY:
2255     ADD_ACTIVE(next_state_offset + 1, 0);
2256     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2257     break;
2258    
2259     case OP_CRRANGE:
2260     case OP_CRMINRANGE:
2261     count = current_state->count; /* Already matched */
2262     if (count >= GET2(ecode, 1))
2263     { ADD_ACTIVE(next_state_offset + 5, 0); }
2264     if (isinclass)
2265     {
2266 nigel 91 int max = GET2(ecode, 3);
2267     if (++count >= max && max != 0) /* Max 0 => no limit */
2268 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2269     else
2270     { ADD_NEW(state_offset, count); }
2271     }
2272     break;
2273    
2274     default:
2275     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2276     break;
2277     }
2278     }
2279     break;
2280    
2281     /* ========================================================================== */
2282     /* These are the opcodes for fancy brackets of various kinds. We have
2283 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2284     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2285 ph10 341 though the other "backtracking verbs" are not supported. */
2286 ph10 345
2287 ph10 341 case OP_FAIL:
2288 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2289 ph10 345 break;
2290 nigel 77
2291     case OP_ASSERT:
2292     case OP_ASSERT_NOT:
2293     case OP_ASSERTBACK:
2294     case OP_ASSERTBACK_NOT:
2295     {
2296     int rc;
2297     int local_offsets[2];
2298     int local_workspace[1000];
2299     const uschar *endasscode = code + GET(code, 1);
2300    
2301     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2302    
2303     rc = internal_dfa_exec(
2304     md, /* static match data */
2305     code, /* this subexpression's code */
2306     ptr, /* where we currently are */
2307     ptr - start_subject, /* start offset */
2308     local_offsets, /* offset vector */
2309     sizeof(local_offsets)/sizeof(int), /* size of same */
2310     local_workspace, /* workspace vector */
2311     sizeof(local_workspace)/sizeof(int), /* size of same */
2312     ims, /* the current ims flags */
2313     rlevel, /* function recursion level */
2314     recursing); /* pass on regex recursion */
2315 ph10 487
2316 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2317 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2318     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2319     }
2320     break;
2321    
2322     /*-----------------------------------------------------------------*/
2323     case OP_COND:
2324 nigel 93 case OP_SCOND:
2325 nigel 77 {
2326     int local_offsets[1000];
2327     int local_workspace[1000];
2328 ph10 406 int codelink = GET(code, 1);
2329 ph10 397 int condcode;
2330 ph10 406
2331 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2332 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2333 ph10 398 happen for the other conditions. */
2334 nigel 77
2335 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2336 ph10 406 {
2337     rrc = 0;
2338 ph10 397 if (pcre_callout != NULL)
2339     {
2340     pcre_callout_block cb;
2341     cb.version = 1; /* Version 1 of the callout block */
2342     cb.callout_number = code[LINK_SIZE+2];
2343     cb.offset_vector = offsets;
2344     cb.subject = (PCRE_SPTR)start_subject;
2345     cb.subject_length = end_subject - start_subject;
2346     cb.start_match = current_subject - start_subject;
2347     cb.current_position = ptr - start_subject;
2348     cb.pattern_position = GET(code, LINK_SIZE + 3);
2349     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2350     cb.capture_top = 1;
2351     cb.capture_last = -1;
2352     cb.callout_data = md->callout_data;
2353     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2354     }
2355 ph10 398 if (rrc > 0) break; /* Fail this thread */
2356     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2357 ph10 406 }
2358 ph10 398
2359 ph10 397 condcode = code[LINK_SIZE+1];
2360 ph10 406
2361 nigel 93 /* Back reference conditions are not supported */
2362 nigel 77
2363 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2364 ph10 459 return PCRE_ERROR_DFA_UCOND;
2365 nigel 93
2366     /* The DEFINE condition is always false */
2367    
2368     if (condcode == OP_DEF)
2369 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2370 nigel 93
2371     /* The only supported version of OP_RREF is for the value RREF_ANY,
2372     which means "test if in any recursion". We can't test for specifically
2373     recursed groups. */
2374    
2375 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2376 nigel 93 {
2377 nigel 77 int value = GET2(code, LINK_SIZE+2);
2378 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2379 ph10 406 if (recursing > 0)
2380 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2381     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2382 nigel 77 }
2383    
2384     /* Otherwise, the condition is an assertion */
2385    
2386     else
2387     {
2388     int rc;
2389     const uschar *asscode = code + LINK_SIZE + 1;
2390     const uschar *endasscode = asscode + GET(asscode, 1);
2391    
2392     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2393    
2394     rc = internal_dfa_exec(
2395     md, /* fixed match data */
2396     asscode, /* this subexpression's code */
2397     ptr, /* where we currently are */
2398     ptr - start_subject, /* start offset */
2399     local_offsets, /* offset vector */
2400     sizeof(local_offsets)/sizeof(int), /* size of same */
2401     local_workspace, /* workspace vector */
2402     sizeof(local_workspace)/sizeof(int), /* size of same */
2403     ims, /* the current ims flags */
2404     rlevel, /* function recursion level */
2405     recursing); /* pass on regex recursion */
2406    
2407 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2408 nigel 77 if ((rc >= 0) ==
2409     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2410 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2411 nigel 77 else
2412 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2413 nigel 77 }
2414     }
2415     break;
2416    
2417     /*-----------------------------------------------------------------*/
2418     case OP_RECURSE:
2419     {
2420     int local_offsets[1000];
2421     int local_workspace[1000];
2422     int rc;
2423    
2424     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2425     recursing + 1));
2426    
2427     rc = internal_dfa_exec(
2428     md, /* fixed match data */
2429     start_code + GET(code, 1), /* this subexpression's code */
2430     ptr, /* where we currently are */
2431     ptr - start_subject, /* start offset */
2432     local_offsets, /* offset vector */
2433     sizeof(local_offsets)/sizeof(int), /* size of same */
2434     local_workspace, /* workspace vector */
2435     sizeof(local_workspace)/sizeof(int), /* size of same */
2436     ims, /* the current ims flags */
2437     rlevel, /* function recursion level */
2438     recursing + 1); /* regex recurse level */
2439    
2440     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2441     recursing + 1, rc));
2442    
2443     /* Ran out of internal offsets */
2444    
2445     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2446    
2447     /* For each successful matched substring, set up the next state with a
2448     count of characters to skip before trying it. Note that the count is in
2449     characters, not bytes. */
2450    
2451     if (rc > 0)
2452     {
2453     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2454     {
2455     const uschar *p = start_subject + local_offsets[rc];
2456     const uschar *pp = start_subject + local_offsets[rc+1];
2457     int charcount = local_offsets[rc+1] - local_offsets[rc];
2458     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2459     if (charcount > 0)
2460     {
2461     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2462     }
2463     else
2464     {
2465     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2466     }
2467     }
2468     }
2469     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2470     }
2471     break;
2472    
2473     /*-----------------------------------------------------------------*/
2474     case OP_ONCE:
2475     {
2476     int local_offsets[2];
2477     int local_workspace[1000];
2478    
2479     int rc = internal_dfa_exec(
2480     md, /* fixed match data */
2481     code, /* this subexpression's code */
2482     ptr, /* where we currently are */
2483     ptr - start_subject, /* start offset */
2484     local_offsets, /* offset vector */
2485     sizeof(local_offsets)/sizeof(int), /* size of same */
2486     local_workspace, /* workspace vector */
2487     sizeof(local_workspace)/sizeof(int), /* size of same */
2488     ims, /* the current ims flags */
2489     rlevel, /* function recursion level */
2490     recursing); /* pass on regex recursion */
2491    
2492     if (rc >= 0)
2493     {
2494     const uschar *end_subpattern = code;
2495     int charcount = local_offsets[1] - local_offsets[0];
2496     int next_state_offset, repeat_state_offset;
2497    
2498     do { end_subpattern += GET(end_subpattern, 1); }
2499     while (*end_subpattern == OP_ALT);
2500     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2501    
2502     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2503     arrange for the repeat state also to be added to the relevant list.
2504     Calculate the offset, or set -1 for no repeat. */
2505    
2506     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2507     *end_subpattern == OP_KETRMIN)?
2508     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2509    
2510     /* If we have matched an empty string, add the next state at the
2511     current character pointer. This is important so that the duplicate
2512     checking kicks in, which is what breaks infinite loops that match an
2513     empty string. */
2514    
2515     if (charcount == 0)
2516     {
2517     ADD_ACTIVE(next_state_offset, 0);
2518     }
2519    
2520     /* Optimization: if there are no more active states, and there
2521     are no new states yet set up, then skip over the subject string
2522     right here, to save looping. Otherwise, set up the new state to swing
2523     into action when the end of the substring is reached. */
2524    
2525     else if (i + 1 >= active_count && new_count == 0)
2526     {
2527     ptr += charcount;
2528     clen = 0;
2529     ADD_NEW(next_state_offset, 0);
2530    
2531     /* If we are adding a repeat state at the new character position,
2532     we must fudge things so that it is the only current state.
2533     Otherwise, it might be a duplicate of one we processed before, and
2534     that would cause it to be skipped. */
2535    
2536     if (repeat_state_offset >= 0)
2537     {
2538     next_active_state = active_states;
2539     active_count = 0;
2540     i = -1;
2541     ADD_ACTIVE(repeat_state_offset, 0);
2542     }
2543     }
2544     else
2545     {
2546     const uschar *p = start_subject + local_offsets[0];
2547     const uschar *pp = start_subject + local_offsets[1];
2548     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2549     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2550     if (repeat_state_offset >= 0)
2551     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2552     }
2553    
2554     }
2555     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2556     }
2557     break;
2558    
2559    
2560     /* ========================================================================== */
2561     /* Handle callouts */
2562    
2563     case OP_CALLOUT:
2564 ph10 406 rrc = 0;
2565 nigel 77 if (pcre_callout != NULL)
2566     {
2567     pcre_callout_block cb;
2568     cb.version = 1; /* Version 1 of the callout block */
2569     cb.callout_number = code[1];
2570     cb.offset_vector = offsets;
2571 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2572 nigel 77 cb.subject_length = end_subject - start_subject;
2573     cb.start_match = current_subject - start_subject;
2574     cb.current_position = ptr - start_subject;
2575     cb.pattern_position = GET(code, 2);
2576     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2577     cb.capture_top = 1;
2578     cb.capture_last = -1;
2579     cb.callout_data = md->callout_data;
2580     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2581 ph10 406 }
2582     if (rrc == 0)
2583     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2584 nigel 77 break;
2585    
2586    
2587     /* ========================================================================== */
2588     default: /* Unsupported opcode */
2589     return PCRE_ERROR_DFA_UITEM;
2590     }
2591    
2592     NEXT_ACTIVE_STATE: continue;
2593    
2594     } /* End of loop scanning active states */
2595    
2596     /* We have finished the processing at the current subject character. If no
2597     new states have been set for the next character, we have found all the
2598     matches that we are going to find. If we are at the top level and partial
2599 ph10 463 matching has been requested, check for appropriate conditions.
2600    
2601 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2602     character. If it is equal to the original active_count (saved in
2603     workspace[1]) it means that (*F) was found on every active state. In this
2604 ph10 463 case we don't want to give a partial match.
2605 nigel 77
2606 ph10 463 The "could_continue" variable is true if a state could have continued but
2607     for the fact that the end of the subject was reached. */
2608    
2609 nigel 77 if (new_count <= 0)
2610     {
2611 ph10 427 if (rlevel == 1 && /* Top level, and */
2612 ph10 463 could_continue && /* Some could go on */
2613 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2614 ph10 427 ( /* either... */
2615     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2616     || /* or... */
2617     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2618     match_count < 0) /* no matches */
2619     ) && /* And... */
2620     ptr >= end_subject && /* Reached end of subject */
2621     ptr > current_subject) /* Matched non-empty string */
2622 nigel 77 {
2623     if (offsetcount >= 2)
2624     {
2625 ph10 435 offsets[0] = md->start_used_ptr - start_subject;
2626 nigel 77 offsets[1] = end_subject - start_subject;
2627     }
2628     match_count = PCRE_ERROR_PARTIAL;
2629     }
2630    
2631     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2632     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2633     rlevel*2-2, SP));
2634 nigel 91 break; /* In effect, "return", but see the comment below */
2635 nigel 77 }
2636    
2637     /* One or more states are active for the next character. */
2638    
2639     ptr += clen; /* Advance to next subject character */
2640     } /* Loop to move along the subject string */
2641    
2642 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2643     if we use "return" above, we have compiler trouble. Some compilers warn if
2644     there's nothing here because they think the function doesn't return a value. On
2645     the other hand, if we put a dummy statement here, some more clever compilers
2646     complain that it can't be reached. Sigh. */
2647 nigel 77
2648 nigel 91 return match_count;
2649 nigel 77 }
2650    
2651    
2652    
2653    
2654     /*************************************************
2655     * Execute a Regular Expression - DFA engine *
2656     *************************************************/
2657    
2658     /* This external function applies a compiled re to a subject string using a DFA
2659     engine. The internal function internal_dfa_exec() is called once for each
2660     starting position that is tried: exactly once when the match is anchored (a
restart with PCRE_DFA_RESTART is treated as anchored), otherwise repeatedly,
moving the starting position forward, until a call returns something other
than PCRE_ERROR_NOMATCH or the subject is exhausted.
2661    
2662     Arguments:
2663     argument_re points to the compiled expression
2664 ph10 97 extra_data points to extra data or is NULL
2665 nigel 77 subject points to the subject string
2666     length length of subject string (may contain binary zeros)
2667     start_offset where to start in the subject string
2668     options option bits
2669     offsets vector of match offsets
2670     offsetcount size of same
2671     workspace workspace vector
2672     wscount size of same
2673    
2674     Returns: > 0 => number of match offset pairs placed in offsets
2675     = 0 => offsets overflowed; longest matches are present
2676     -1 => failed to match
2677     < -1 => some kind of unexpected problem

The negative values returned directly by this function are
PCRE_ERROR_NOMATCH, PCRE_ERROR_BADOPTION, PCRE_ERROR_NULL,
PCRE_ERROR_BADCOUNT, PCRE_ERROR_DFA_WSSIZE, PCRE_ERROR_DFA_UMLIMIT,
PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADNEWLINE, PCRE_ERROR_BADUTF8 and
PCRE_ERROR_BADUTF8_OFFSET. Any other value is whatever internal_dfa_exec()
returned.

NOTE(review): nothing in this function checks that length is non-negative or
that 0 <= start_offset <= length, so the caller appears to be responsible for
passing consistent values - confirm.
2678     */
2679    
2680 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2681 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2682     const char *subject, int length, int start_offset, int options, int *offsets,
2683     int offsetcount, int *workspace, int wscount)
2684     {
2685     real_pcre *re = (real_pcre *)argument_re;
2686     dfa_match_data match_block;
2687 nigel 91 dfa_match_data *md = &match_block;
2688 nigel 77 BOOL utf8, anchored, startline, firstline;
2689     const uschar *current_subject, *end_subject, *lcc;
2690    
2691     pcre_study_data internal_study;
2692     const pcre_study_data *study = NULL;
2693     real_pcre internal_re;
2694    
2695     const uschar *req_byte_ptr;
2696     const uschar *start_bits = NULL;
2697     BOOL first_byte_caseless = FALSE;
2698     BOOL req_byte_caseless = FALSE;
/* A value of -1 in the next three variables means "not set". */
2699     int first_byte = -1;
2700     int req_byte = -1;
2701     int req_byte2 = -1;
2702 nigel 91 int newline;
2703 nigel 77
2704     /* Plausibility checks */
2705    
2706     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
/* A NULL offsets vector is accepted only when offsetcount is not positive. */
2707     if (re == NULL || subject == NULL || workspace == NULL ||
2708     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2709     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
/* NOTE(review): the minimum of 20 workspace ints is presumably what the
internal matcher needs to get started - confirm against internal_dfa_exec(). */
2710     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2711    
2712     /* We need to find the pointer to any study data before we test for byte
2713     flipping, so we scan the extra_data block first. This may set two fields in the
2714     match block, so we must initialize them beforehand. However, the other fields
2715     in the match block must not be set until after the byte flipping. */
2716    
2717 nigel 91 md->tables = re->tables;
2718     md->callout_data = NULL;
2719 nigel 77
2720     if (extra_data != NULL)
2721     {
2722     unsigned int flags = extra_data->flags;
2723     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2724     study = (const pcre_study_data *)extra_data->study_data;
/* A match limit or a recursion limit in extra_data is rejected outright. */
2725     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2726 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2727     return PCRE_ERROR_DFA_UMLIMIT;
2728 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2729 nigel 91 md->callout_data = extra_data->callout_data;
/* Tables supplied in extra_data override those recorded in the compiled
pattern. */
2730 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2731 nigel 91 md->tables = extra_data->tables;
2732 nigel 77 }
2733 ph10 461
2734 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
2735     test for a regex that was compiled on a host of opposite endianness. If this is
2736     the case, flipped values are put in internal_re and internal_study if there was
2737     study data too. */
2738    
2739     if (re->magic_number != MAGIC_NUMBER)
2740     {
2741     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2742     if (re == NULL) return PCRE_ERROR_BADMAGIC;
/* From here on, use the local flipped copy of the study data. */
2743     if (study != NULL) study = &internal_study;
2744     }
2745    
2746     /* Set some local values */
2747    
2748     current_subject = (const unsigned char *)subject + start_offset;
2749     end_subject = (const unsigned char *)subject + length;
/* Starting one byte before the first match position guarantees that the first
"p > req_byte_ptr" test in the main loop succeeds, so the first search for the
required byte is always carried out. */
2750     req_byte_ptr = current_subject - 1;
2751    
2752 nigel 91 #ifdef SUPPORT_UTF8
2753 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2754 nigel 91 #else
2755     utf8 = FALSE;
2756     #endif
2757 nigel 77
/* The match is anchored if either the caller or the pattern says so. A restart
(PCRE_DFA_RESTART) is also treated as anchored, so the main loop below makes
only one call to internal_dfa_exec() in that case. */
2758 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2759     (re->options & PCRE_ANCHORED) != 0;
2760    
2761 nigel 77 /* The remaining fixed data for passing around. */
2762    
/* The start of the code is computed as an offset from argument_re (the block
the caller passed in) rather than from re, which by now may point at a
different block if byte flipping took place. The offset skips the name table:
name_count entries of name_entry_size bytes starting at name_table_offset. */
2763 nigel 91 md->start_code = (const uschar *)argument_re +
2764 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2765 nigel 91 md->start_subject = (const unsigned char *)subject;
2766     md->end_subject = end_subject;
2767 ph10 442 md->start_offset = start_offset;
2768 nigel 91 md->moptions = options;
2769     md->poptions = re->options;
2770 nigel 77
2771 ph10 231 /* If the BSR option is not set at match time, copy what was set
2772     at compile time. If nothing was set at compile time either, and BSR_ANYCRLF is
defined for this build, PCRE_BSR_ANYCRLF is used. */
2773    
2774     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2775     {
2776     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2777     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2778     #ifdef BSR_ANYCRLF
2779     else md->moptions |= PCRE_BSR_ANYCRLF;
2780 ph10 243 #endif
2781     }
2782 ph10 231
2783 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2784     nothing is set at run time, whatever was used at compile time applies. The
local variable newline is set to a single character, or to a two-byte value
(CR in the upper byte, NL in the lower) for CRLF, or to -1 for "any", or to -2
for "anycrlf". Any other bit combination is an error. */
2785 nigel 91
2786 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2787 nigel 93 PCRE_NEWLINE_BITS)
2788 nigel 91 {
2789 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2790 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2791     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2792 nigel 91 case PCRE_NEWLINE_CR+
2793 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2794 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2795 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2796 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2797 nigel 91 }
2798    
/* Translate the value of newline into the nltype, nllen and nl fields of the
match block. Note that nllen and nl are set here only for the fixed type. */
2799 ph10 149 if (newline == -2)
2800 nigel 91 {
2801 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2802     }
2803     else if (newline < 0)
2804     {
2805 nigel 93 md->nltype = NLTYPE_ANY;
2806 nigel 91 }
2807     else
2808     {
2809 nigel 93 md->nltype = NLTYPE_FIXED;
/* A value above 255 is the two-byte sequence packed by the switch above. */
2810     if (newline > 255)
2811     {
2812     md->nllen = 2;
2813     md->nl[0] = (newline >> 8) & 255;
2814     md->nl[1] = newline & 255;
2815     }
2816     else
2817     {
2818     md->nllen = 1;
2819     md->nl[0] = newline;
2820     }
2821 nigel 91 }
2822    
2823 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2824     back the character offset. A non-negative result from _pcre_valid_utf8() is
treated as "invalid". The starting offset is also checked: a byte there whose
top two bits are 10 (a UTF-8 continuation byte) means that the offset points
into the middle of a character. */
2825    
2826     #ifdef SUPPORT_UTF8
2827     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2828     {
2829     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2830     return PCRE_ERROR_BADUTF8;
2831     if (start_offset > 0 && start_offset < length)
2832     {
2833     int tb = ((uschar *)subject)[start_offset];
2834     if (tb > 127)
2835     {
2836     tb &= 0xc0;
2837     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2838     }
2839     }
2840     }
2841     #endif
2842    
2843     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2844     is a feature that makes it possible to save compiled regex and re-use them
2845     in other programs later. */
2846    
2847 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2848 nigel 77
2849     /* The lower casing table and the "must be at the start of a line" flag are
2850     used in a loop when finding where to start. */
2851    
2852 nigel 91 lcc = md->tables + lcc_offset;
2853 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2854 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2855    
2856     /* Set up the first character to match, if available. The first_byte value is
2857     never set for an anchored regular expression, but the anchoring may be forced
2858     at run time, so we have to test for anchoring. The first char may be unset for
2859     an unanchored pattern, of course. If there's no first char and the pattern was
2860     studied, there may be a bitmap of possible first characters. */
2861    
2862     if (!anchored)
2863     {
2864 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2865 nigel 77 {
2866     first_byte = re->first_byte & 255;
/* For a caseless first byte, keep the value that the lcc table maps it to; the
search loop below compares it with lcc[] of each subject byte. */
2867     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2868     first_byte = lcc[first_byte];
2869     }
2870     else
2871     {
2872 ph10 455 if (!startline && study != NULL &&
2873     (study->flags & PCRE_STUDY_MAPPED) != 0)
2874 nigel 77 start_bits = study->start_bits;
2875     }
2876     }
2877    
2878     /* For anchored or unanchored matches, there may be a "last known required
2879     character" set. */
2880    
2881 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2882 nigel 77 {
2883     req_byte = re->req_byte & 255;
2884     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2885 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2886 nigel 77 }
2887    
2888     /* Call the main matching function, looping for a non-anchored regex after a
2889 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2890     a match. */
2891 nigel 77
2892     for (;;)
2893     {
2894     int rc;
2895    
2896     if ((options & PCRE_DFA_RESTART) == 0)
2897     {
2898     const uschar *save_end_subject = end_subject;
2899    
2900 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2901     line of a multiline string. Implement this by temporarily adjusting
2902     end_subject so that we stop scanning at a newline. If the match fails at
2903     the newline, later code breaks this loop. In UTF-8 mode the scan steps over
whole characters by skipping any following bytes of the form 10xxxxxx. */
2904 nigel 77
2905     if (firstline)
2906     {
2907 ph10 365 USPTR t = current_subject;
2908     #ifdef SUPPORT_UTF8
2909     if (utf8)
2910 ph10 371 {
2911     while (t < md->end_subject && !IS_NEWLINE(t))
2912 ph10 365 {
2913     t++;
2914     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2915 ph10 371 }
2916 ph10 365 }
2917     else
2918 ph10 371 #endif
2919 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2920 nigel 77 end_subject = t;
2921     }
2922 ph10 392
2923 ph10 389 /* There are some optimizations that avoid running the match if a known
2924 ph10 455 starting point is not found. However, there is an option that disables
2925     these, for testing and for ensuring that all callouts do actually occur. */
2926 nigel 77
2927 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2928 ph10 392 {
2929 ph10 389 /* Advance to a known first byte. */
2930 ph10 392
2931 ph10 389 if (first_byte >= 0)
2932 nigel 77 {
2933 ph10 389 if (first_byte_caseless)
2934     while (current_subject < end_subject &&
2935     lcc[*current_subject] != first_byte)
2936     current_subject++;
2937     else
2938 ph10 392 while (current_subject < end_subject &&
2939 ph10 389 *current_subject != first_byte)
2940     current_subject++;
2941     }
2942 ph10 392
2943 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2944 ph10 392
2945 ph10 389 else if (startline)
2946     {
/* No scan is done while we are still at the original starting offset, so that
position itself is always tried. This test also ensures that the reference to
current_subject[-1] below stays within the subject. */
2947     if (current_subject > md->start_subject + start_offset)
2948     {
2949 ph10 365 #ifdef SUPPORT_UTF8
2950 ph10 389 if (utf8)
2951 ph10 365 {
2952 ph10 392 while (current_subject < end_subject &&
2953 ph10 389 !WAS_NEWLINE(current_subject))
2954     {
2955 ph10 365 current_subject++;
2956 ph10 389 while(current_subject < end_subject &&
2957     (*current_subject & 0xc0) == 0x80)
2958     current_subject++;
2959     }
2960 ph10 371 }
2961 ph10 389 else
2962     #endif
2963     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2964     current_subject++;
2965 ph10 392
2966 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2967     ANYCRLF, and we are now at a LF, advance the match position by one
2968     more character. */
2969 ph10 392
2970 ph10 391 if (current_subject[-1] == CHAR_CR &&
2971 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2972     current_subject < end_subject &&
2973 ph10 391 *current_subject == CHAR_NL)
2974 ph10 389 current_subject++;
2975 ph10 365 }
2976 nigel 77 }
2977 ph10 392
2978 ph10 389 /* Or to a non-unique first char after study. The start_bits vector is
used as a bit map indexed by byte value: byte c/8, bit c&7. Bytes whose bit is
not set are skipped. */
2979 ph10 392
2980 ph10 389 else if (start_bits != NULL)
2981 nigel 77 {
2982 ph10 389 while (current_subject < end_subject)
2983     {
2984     register unsigned int c = *current_subject;
2985     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2986     else break;
2987     }
2988 nigel 77 }
2989 ph10 392 }
2990 nigel 77
2991     /* Restore fudged end_subject */
2992    
2993     end_subject = save_end_subject;
2994    
2995 ph10 461 /* The following two optimizations are disabled for partial matching or if
2996     disabling is explicitly requested (and of course, by the test above, this
2997 ph10 455 code is not obeyed when restarting after a partial match). */
2998 ph10 461
2999 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3000     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3001 ph10 461 {
3002 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3003     is a lower bound; no actual string of that length may actually match the
3004     pattern. Although the value is, strictly, in characters, we treat it as
3005     bytes to avoid spending too much time in this optimization. */
3006 nigel 77
3007 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3008 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3009 ph10 455 return PCRE_ERROR_NOMATCH;
3010 ph10 461
3011 ph10 455 /* If req_byte is set, we know that that character must appear in the
3012     subject for the match to succeed. If the first character is set, req_byte
3013     must be later in the subject; otherwise the test starts at the match
3014     point. This optimization can save a huge amount of work in patterns with
3015     nested unlimited repeats that aren't going to match. Writing separate
3016     code for cased/caseless versions makes it go faster, as does using an
3017     autoincrement and backing off on a match.
3018 ph10 461
3019 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3020     can take a long time, and give bad performance on quite ordinary
3021     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3022     string... so we don't do this when the string is sufficiently long. */
3023 ph10 461
3024 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3025 nigel 77 {
3026 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3027 ph10 461
3028 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3029     place we found it at last time. */
3030 ph10 461
3031 ph10 455 if (p > req_byte_ptr)
3032 nigel 77 {
3033 ph10 455 if (req_byte_caseless)
3034     {
3035     while (p < end_subject)
3036     {
3037     register int pp = *p++;
3038     if (pp == req_byte || pp == req_byte2) { p--; break; }
3039     }
3040     }
3041     else
3042     {
3043     while (p < end_subject)
3044     {
3045     if (*p++ == req_byte) { p--; break; }
3046     }
3047     }
3048 ph10 461
3049 ph10 455 /* If we can't find the required character, break the matching loop,
3050     which will cause a return of PCRE_ERROR_NOMATCH. */
3051 ph10 461
3052 ph10 455 if (p >= end_subject) break;
3053 ph10 461
3054 ph10 455 /* If we have found the required character, save the point where we
3055     found it, so that we don't search again next time round the loop if
3056     the start hasn't passed this character yet. */
3057 ph10 461
3058 ph10 455 req_byte_ptr = p;
3059 nigel 77 }
3060 ph10 461 }
3061 nigel 77 }
3062 ph10 455 } /* End of optimizations that are done when not restarting */
3063 nigel 77
3064     /* OK, now we can do the business */
3065    
/* Record the starting position for this attempt in the match block.
NOTE(review): presumably the internal matcher updates start_used_ptr if it
inspects characters before this point - confirm in internal_dfa_exec(). */
3066 ph10 435 md->start_used_ptr = current_subject;
3067 ph10 461
3068 nigel 77 rc = internal_dfa_exec(
3069 nigel 91 md, /* fixed match data */
3070     md->start_code, /* this subexpression's code */
3071     current_subject, /* where we currently are */
3072     start_offset, /* start offset in subject */
3073     offsets, /* offset vector */
3074     offsetcount, /* size of same */
3075     workspace, /* workspace vector */
3076     wscount, /* size of same */
3077 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3078 nigel 91 0, /* function recurse level */
3079     0); /* regex recurse level */
3080 nigel 77
3081     /* Anything other than "no match" means we are done, always; otherwise, carry
3082     on only if not anchored. */
3083    
3084     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3085    
3086     /* Advance to the next subject character unless we are at the end of a line
3087     and firstline is set. In UTF-8 mode, bytes of the form 10xxxxxx that follow
are skipped as well, so that the new position is at the start of a character.
The loop ends once the position has moved beyond the end of the subject; a
position exactly at the end is still tried. */
3088    
3089 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3090 nigel 77 current_subject++;
3091     if (utf8)
3092     {
3093     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3094     current_subject++;
3095     }
3096     if (current_subject > end_subject) break;
3097    
3098 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3099 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3100     or ANY or ANYCRLF, advance the match position by one more character. */
3101 nigel 93
3102 ph10 391 if (current_subject[-1] == CHAR_CR &&
3103 ph10 226 current_subject < end_subject &&
3104 ph10 391 *current_subject == CHAR_NL &&
3105 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3106 ph10 226 (md->nltype == NLTYPE_ANY ||
3107     md->nltype == NLTYPE_ANYCRLF ||
3108     md->nllen == 2))
3109 nigel 93 current_subject++;
3110    
3111     } /* "Bumpalong" loop */
3112    
/* The loop was left by a break: no starting position gave a match. */
3113 nigel 77 return PCRE_ERROR_NOMATCH;
3114     }
3115    
3116     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12