/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 530 - (hide annotations) (download)
Tue Jun 1 13:42:06 2010 UTC (3 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 111967 byte(s)
Added a lot of (int) casts to avoid compiler warnings in systems where      
size_t is 64-bit.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 473 Copyright (c) 1997-2010 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300
102     #define OP_EXTUNI_EXTRA 320
103     #define OP_ANYNL_EXTRA 340
104     #define OP_HSPACE_EXTRA 360
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
/* coptable: per-opcode offset of the inline character (0 = the opcode has no
inline character). It is indexed by opcode value, and the OP_TABLE_LENGTH case
labels in internal_dfa_exec() turn a wrong table length into a compile-time
error. NOTE(review): the entries of 3 presumably skip a 2-byte repeat count
that precedes the character - confirm against the opcode layout in
pcre_internal.h. */
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
125     1, /* Char */
126     1, /* Charnc */
127     1, /* not */
128     /* Positive single-char repeats */
129     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130     3, 3, 3, /* upto, minupto, exact */
131 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
132 nigel 77 /* Negative single-char repeats - only for chars < 256 */
133     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
134     3, 3, 3, /* NOT upto, minupto, exact */
135 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
136 nigel 77 /* Positive type repeats */
137     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* Type upto, minupto, exact */
139 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
140 nigel 77 /* Character class & ref repeats */
141     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
142     0, 0, /* CRRANGE, CRMINRANGE */
143     0, /* CLASS */
144     0, /* NCLASS */
145     0, /* XCLASS - variable length */
146     0, /* REF */
147     0, /* RECURSE */
148     0, /* CALLOUT */
149     0, /* Alt */
150     0, /* Ket */
151     0, /* KetRmax */
152     0, /* KetRmin */
153     0, /* Assert */
154     0, /* Assert not */
155     0, /* Assert behind */
156     0, /* Assert behind not */
157     0, /* Reverse */
158 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
159     0, 0, 0, /* SBRA, SCBRA, SCOND */
160 ph10 498 0, 0, /* CREF, NCREF */
161     0, 0, /* RREF, NRREF */
162 nigel 93 0, /* DEF */
163 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
164 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
165     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
166     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
167 nigel 77 };
168    
169 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
170 ph10 462 remember the fact that a character could have been inspected when the end of
171 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
172     two tables that follow must also be modified. */
173 ph10 462
/* poptable: per-opcode flag, 1 if the opcode inspects a subject character and
0 otherwise. It is indexed by opcode value in the same order as coptable.
internal_dfa_exec() tests poptable[codevalue] when clen is 0 (end of subject)
and, if the entry is non-zero, sets could_continue. The table length is checked
at compile time by the OP_TABLE_LENGTH case labels in that function. */
174     static const uschar poptable[] = {
175     0, /* End */
176 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
177 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
178     1, 1, 1, /* Any, AllAny, Anybyte */
179 ph10 498 1, 1, /* \P, \p */
180 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
181 ph10 498 1, /* \X */
182 ph10 462 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
183     1, /* Char */
184     1, /* Charnc */
185     1, /* not */
186     /* Positive single-char repeats */
187     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
188     1, 1, 1, /* upto, minupto, exact */
189     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
190     /* Negative single-char repeats - only for chars < 256 */
191     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
192     1, 1, 1, /* NOT upto, minupto, exact */
193     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
194     /* Positive type repeats */
195     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
196     1, 1, 1, /* Type upto, minupto, exact */
197     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
198     /* Character class & ref repeats */
199     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200     1, 1, /* CRRANGE, CRMINRANGE */
201     1, /* CLASS */
202     1, /* NCLASS */
203     1, /* XCLASS - variable length */
204     0, /* REF */
205     0, /* RECURSE */
206     0, /* CALLOUT */
207     0, /* Alt */
208     0, /* Ket */
209     0, /* KetRmax */
210     0, /* KetRmin */
211     0, /* Assert */
212     0, /* Assert not */
213     0, /* Assert behind */
214     0, /* Assert behind not */
215     0, /* Reverse */
216     0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
217     0, 0, 0, /* SBRA, SCBRA, SCOND */
218 ph10 498 0, 0, /* CREF, NCREF */
219     0, 0, /* RREF, NRREF */
220 ph10 462 0, /* DEF */
221     0, 0, /* BRAZERO, BRAMINZERO */
222 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
223     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
224     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
225 ph10 462 };
226    
227 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
228     and \w */
229    
/* toptable1: ctype bit associated with each of the first opcodes, in the same
opcode order as the start of coptable. Both members of each \D/\d, \S/\s and
\W/\w pair carry the same ctype_xxx bit; every other entry is 0.
NOTE(review): the expression that consumes this table is outside this excerpt -
confirm how it is combined with toptable2 at the use site. */
230 ph10 327 static const uschar toptable1[] = {
231 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
232 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
233     ctype_space, ctype_space, /* \S, \s */
234     ctype_word, ctype_word, /* \W, \w */
235 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
236 nigel 77 };
237    
/* toptable2: companion to toptable1, in the same opcode order. The negated
opcode of each pair (\D, \S, \W) carries its ctype_xxx bit, the positive one
(\d, \s, \w) carries 0, and OP_ANY and OP_ALLANY carry 1; every other entry is
0. NOTE(review): presumably this supplies the "invert" value applied to the
toptable1 result - the use site is outside this excerpt, so confirm there. */
238 ph10 327 static const uschar toptable2[] = {
239 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
240 nigel 77 ctype_digit, 0, /* \D, \d */
241     ctype_space, 0, /* \S, \s */
242     ctype_word, 0, /* \W, \w */
243 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
244 nigel 77 };
245    
246    
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The value is cast to int because
it is combined with the int workspace size: left as a size_t it would force
that arithmetic to be done unsigned (turning a negative size into a huge
positive one) and then be narrowed back to int. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
260    
261    
262 ph10 475 #ifdef PCRE_DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Each byte for which
isprint() is true is written as itself; every other byte is written as a
backslash-x escape with two lower-case hex digits.

Arguments:
  p           points to the first byte of the string (never written through)
  length      number of bytes to print; nothing is printed if it is <= 0
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
const unsigned char *end;

if (length <= 0) return;

for (end = p + length; p < end; p++)
  {
  int c = *p;                     /* 0..255, so safe to hand to isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
289     #endif
290    
291    
292    
293     /*************************************************
294     * Execute a Regular Expression - DFA engine *
295     *************************************************/
296    
297     /* This internal function applies a compiled pattern to a subject string,
298     starting at a given point, using a DFA engine. This function is called from the
299     external one, possibly multiple times if the pattern is not anchored. The
300     function calls itself recursively for some kinds of subpattern.
301    
302     Arguments:
303     md the match_data block with fixed information
304     this_start_code the opening bracket of this subexpression's code
305     current_subject where we currently are in the subject string
306     start_offset start offset in the subject string
307     offsets vector to contain the matching string offsets
308     offsetcount size of same
309     workspace vector of workspace
310     wscount size of same
311     ims the current ims flags
312     rlevel function call recursion level
313     recursing regex recursive call level
314    
315 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
316 ph10 341 = 0 => offsets overflowed; longest matches are present
317 nigel 77 -1 => failed to match
318     < -1 => some kind of unexpected problem
319    
320     The following macros are used for adding states to the two state vectors (one
321     for the current character, one for the following character). */
322    
/* Append a state to the active list (the list for the current character). x is
stored as the opcode offset and y as the repeat count, together with the
current value of ims; the data field of the new entry is not written. If
active_count has already reached wscount, nothing is stored and the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. active_count is incremented in both
cases.

The macro uses the enclosing function's variables active_count, wscount,
next_active_state, ims and rlevel. It expands to a single do { } while (0)
statement, so it must be followed by a semicolon and nothing that follows it
can be absorbed into the return expression. x and y are also passed to DPRINTF,
so avoid arguments with side effects. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
333    
/* Append a state to the active list (the list for the current character),
including its extra data value. x is stored as the opcode offset, y as the
repeat count and z as the data field, together with the current value of ims.
If active_count has already reached wscount, nothing is stored and the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. active_count is incremented
in both cases.

The macro uses the enclosing function's variables active_count, wscount,
next_active_state, ims and rlevel. It expands to a single do { } while (0)
statement, so it must be followed by a semicolon and nothing that follows it
can be absorbed into the return expression. x, y and z are also passed to
DPRINTF, so avoid arguments with side effects. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state->data = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
345    
/* Append a state to the new list (the list for the following character). x is
stored as the opcode offset and y as the repeat count, together with the
current value of ims; the data field of the new entry is not written. If
new_count has already reached wscount, nothing is stored and the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. new_count is incremented in both cases.

The macro uses the enclosing function's variables new_count, wscount,
next_new_state, ims and rlevel. It expands to a single do { } while (0)
statement, so it must be followed by a semicolon and nothing that follows it
can be absorbed into the return expression. x and y are also passed to DPRINTF,
so avoid arguments with side effects. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
356    
/* Append a state to the new list (the list for the following character),
including its extra data value. x is stored as the opcode offset, y as the
repeat count and z as the data field, together with the current value of ims.
If new_count has already reached wscount, nothing is stored and the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. new_count is incremented in both cases.

The macro uses the enclosing function's variables new_count, wscount,
next_new_state, ims and rlevel. It expands to a single do { } while (0)
statement, so it must be followed by a semicolon and nothing that follows it
can be absorbed into the return expression. x, y and z are also passed to
DPRINTF, so avoid arguments with side effects. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state->data = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
368    
369     /* And now, here is the code */
370    
371     static int
372     internal_dfa_exec(
373     dfa_match_data *md,
374     const uschar *this_start_code,
375     const uschar *current_subject,
376     int start_offset,
377     int *offsets,
378     int offsetcount,
379     int *workspace,
380     int wscount,
381     int ims,
382     int rlevel,
383     int recursing)
384     {
385     stateblock *active_states, *new_states, *temp_states;
386     stateblock *next_active_state, *next_new_state;
387    
388     const uschar *ctypes, *lcc, *fcc;
389     const uschar *ptr;
390 nigel 93 const uschar *end_code, *first_op;
391 nigel 77
392     int active_count, new_count, match_count;
393    
394     /* Some fields in the md block are frequently referenced, so we load them into
395     independent variables in the hope that this will perform better. */
396    
397     const uschar *start_subject = md->start_subject;
398     const uschar *end_subject = md->end_subject;
399     const uschar *start_code = md->start_code;
400    
401 nigel 87 #ifdef SUPPORT_UTF8
402 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
403 nigel 93 #else
404     BOOL utf8 = FALSE;
405 nigel 87 #endif
406 nigel 77
407     rlevel++;
408     offsetcount &= (-2);
409    
410     wscount -= 2;
411     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
412     (2 * INTS_PER_STATEBLOCK);
413    
414     DPRINTF(("\n%.*s---------------------\n"
415     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
416     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
417    
418     ctypes = md->tables + ctypes_offset;
419     lcc = md->tables + lcc_offset;
420     fcc = md->tables + fcc_offset;
421    
422     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
423    
424     active_states = (stateblock *)(workspace + 2);
425     next_new_state = new_states = active_states + wscount;
426     new_count = 0;
427    
428 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
429     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
430    
431 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
432     the alternative states onto the list, and find out where the end is. This
433     makes it possible to use this function recursively, when we want to stop at a
434     matching internal ket rather than at the end.
435    
436     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
437     a backward assertion. In that case, we have to find out the maximum amount to
438     move back, and set up each alternative appropriately. */
439    
440 nigel 93 if (*first_op == OP_REVERSE)
441 nigel 77 {
442     int max_back = 0;
443     int gone_back;
444    
445     end_code = this_start_code;
446     do
447     {
448     int back = GET(end_code, 2+LINK_SIZE);
449     if (back > max_back) max_back = back;
450     end_code += GET(end_code, 1);
451     }
452     while (*end_code == OP_ALT);
453    
454     /* If we can't go back the amount required for the longest lookbehind
455     pattern, go back as far as we can; some alternatives may still be viable. */
456    
457     #ifdef SUPPORT_UTF8
458     /* In character mode we have to step back character by character */
459    
460     if (utf8)
461     {
462     for (gone_back = 0; gone_back < max_back; gone_back++)
463     {
464     if (current_subject <= start_subject) break;
465     current_subject--;
466     while (current_subject > start_subject &&
467     (*current_subject & 0xc0) == 0x80)
468     current_subject--;
469     }
470     }
471     else
472     #endif
473    
474     /* In byte-mode we can do this quickly. */
475    
476     {
477     gone_back = (current_subject - max_back < start_subject)?
478 ph10 530 (int)(current_subject - start_subject) : max_back;
479 nigel 77 current_subject -= gone_back;
480     }
481 ph10 461
482 ph10 435 /* Save the earliest consulted character */
483 nigel 77
484 ph10 461 if (current_subject < md->start_used_ptr)
485     md->start_used_ptr = current_subject;
486    
487 nigel 77 /* Now we can process the individual branches. */
488    
489     end_code = this_start_code;
490     do
491     {
492     int back = GET(end_code, 2+LINK_SIZE);
493     if (back <= gone_back)
494     {
495 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
496 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
497     }
498     end_code += GET(end_code, 1);
499     }
500     while (*end_code == OP_ALT);
501     }
502    
503     /* This is the code for a "normal" subpattern (not a backward assertion). The
504     start of a whole pattern is always one of these. If we are at the top level,
505     we may be asked to restart matching from the same point that we reached for a
506     previous partial match. We still have to scan through the top-level branches to
507     find the end state. */
508    
509     else
510     {
511     end_code = this_start_code;
512    
513     /* Restarting */
514    
515     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
516     {
517     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
518     new_count = workspace[1];
519     if (!workspace[0])
520     memcpy(new_states, active_states, new_count * sizeof(stateblock));
521     }
522    
523     /* Not restarting */
524    
525     else
526     {
527 nigel 93 int length = 1 + LINK_SIZE +
528     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
529 nigel 77 do
530     {
531 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
532 nigel 77 end_code += GET(end_code, 1);
533 nigel 93 length = 1 + LINK_SIZE;
534 nigel 77 }
535     while (*end_code == OP_ALT);
536     }
537     }
538    
539     workspace[0] = 0; /* Bit indicating which vector is current */
540    
541     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
542    
543     /* Loop for scanning the subject */
544    
545     ptr = current_subject;
546     for (;;)
547     {
548     int i, j;
549 nigel 91 int clen, dlen;
550     unsigned int c, d;
551 ph10 428 int forced_fail = 0;
552 ph10 462 BOOL could_continue = FALSE;
553 nigel 77
554     /* Make the new state list into the active state list and empty the
555     new state list. */
556    
557     temp_states = active_states;
558     active_states = new_states;
559     new_states = temp_states;
560     active_count = new_count;
561     new_count = 0;
562    
563     workspace[0] ^= 1; /* Remember for the restarting feature */
564     workspace[1] = active_count;
565    
566 ph10 475 #ifdef PCRE_DEBUG
567 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
568     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
569     printf("\"\n");
570    
571     printf("%.*sActive states: ", rlevel*2-2, SP);
572     for (i = 0; i < active_count; i++)
573     printf("%d/%d ", active_states[i].offset, active_states[i].count);
574     printf("\n");
575     #endif
576    
577     /* Set the pointers for adding new states */
578    
579     next_active_state = active_states + active_count;
580     next_new_state = new_states;
581    
582     /* Load the current character from the subject outside the loop, as many
583     different states may want to look at it, and we assume that at least one
584     will. */
585    
586     if (ptr < end_subject)
587     {
588 nigel 93 clen = 1; /* Number of bytes in the character */
589 nigel 77 #ifdef SUPPORT_UTF8
590     if (utf8) { GETCHARLEN(c, ptr, clen); } else
591     #endif /* SUPPORT_UTF8 */
592     c = *ptr;
593     }
594     else
595     {
596 nigel 93 clen = 0; /* This indicates the end of the subject */
597     c = NOTACHAR; /* This value should never actually be used */
598 nigel 77 }
599    
600     /* Scan up the active states and act on each one. The result of an action
601     may be to add more states to the currently active list (e.g. on hitting a
602     parenthesis) or it may be to put states on the new list, for considering
603     when we move the character pointer on. */
604    
605     for (i = 0; i < active_count; i++)
606     {
607     stateblock *current_state = active_states + i;
608     const uschar *code;
609     int state_offset = current_state->offset;
610 ph10 397 int count, codevalue, rrc;
611 nigel 77
612 ph10 475 #ifdef PCRE_DEBUG
613 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
614 nigel 93 if (clen == 0) printf("EOL\n");
615 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
616     else printf("0x%02x\n", c);
617     #endif
618    
619     /* This variable is referred to implicitly in the ADD_xxx macros. */
620    
621     ims = current_state->ims;
622    
623     /* A negative offset is a special case meaning "hold off going to this
624     (negated) state until the number of characters in the data field have
625     been skipped". */
626    
627     if (state_offset < 0)
628     {
629     if (current_state->data > 0)
630     {
631     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
632     ADD_NEW_DATA(state_offset, current_state->count,
633     current_state->data - 1);
634     continue;
635     }
636     else
637     {
638     current_state->offset = state_offset = -state_offset;
639     }
640     }
641    
642 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
643 ph10 439 See the note at the head of this module about the possibility of improving
644     performance here. */
645 nigel 77
646     for (j = 0; j < i; j++)
647     {
648     if (active_states[j].offset == state_offset &&
649     active_states[j].count == current_state->count)
650     {
651     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
652     goto NEXT_ACTIVE_STATE;
653     }
654     }
655    
656     /* The state offset is the offset to the opcode */
657    
658     code = start_code + state_offset;
659     codevalue = *code;
660    
661 ph10 463 /* If this opcode inspects a character, but we are at the end of the
662     subject, remember the fact for use when testing for a partial match. */
663    
664 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
665 ph10 463 could_continue = TRUE;
666 ph10 462
667 nigel 77 /* If this opcode is followed by an inline character, load it. It is
668     tempting to test for the presence of a subject character here, but that
669     is wrong, because sometimes zero repetitions of the subject are
670     permitted.
671    
672     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
673 ph10 178 argument that is not a data character - but is always one byte long. We
674     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
675     this case. To keep the other cases fast, convert these ones to new opcodes.
676     */
677 nigel 77
678     if (coptable[codevalue] > 0)
679     {
680     dlen = 1;
681     #ifdef SUPPORT_UTF8
682     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
683     #endif /* SUPPORT_UTF8 */
684     d = code[coptable[codevalue]];
685     if (codevalue >= OP_TYPESTAR)
686     {
687 nigel 93 switch(d)
688     {
689     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
690     case OP_NOTPROP:
691     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
692     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
693     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
694 ph10 178 case OP_NOT_HSPACE:
695 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
696 ph10 178 case OP_NOT_VSPACE:
697 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
698 nigel 93 default: break;
699     }
700 nigel 77 }
701     }
702     else
703     {
704     dlen = 0; /* Not strictly necessary, but compilers moan */
705 nigel 93 d = NOTACHAR; /* if these variables are not set. */
706 nigel 77 }
707    
708    
709     /* Now process the individual opcodes */
710    
711     switch (codevalue)
712     {
713 ph10 498 /* ========================================================================== */
714     /* These cases are never obeyed. This is a fudge that causes a compile-
715     time error if the vectors coptable or poptable, which are indexed by
716     opcode, are not the correct length. It seems to be the only way to do
717     such a check at compile time, as the sizeof() operator does not work
718     in the C preprocessor. */
719 ph10 507
720 ph10 498 case OP_TABLE_LENGTH:
721 ph10 507 case OP_TABLE_LENGTH +
722 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
723     (sizeof(poptable) == OP_TABLE_LENGTH)):
724 ph10 507 break;
725 nigel 77
726     /* ========================================================================== */
727     /* Reached a closing bracket. If not at the end of the pattern, carry
728     on with the next opcode. Otherwise, unless we have an empty string and
729 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
730 ph10 442 start of the subject, save the match data, shifting up all previous
731 nigel 77 matches so we always have the longest first. */
732    
733     case OP_KET:
734     case OP_KETRMIN:
735     case OP_KETRMAX:
736     if (code != end_code)
737     {
738     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
739     if (codevalue != OP_KET)
740     {
741     ADD_ACTIVE(state_offset - GET(code, 1), 0);
742     }
743     }
744 ph10 461 else
745 nigel 77 {
746 ph10 461 if (ptr > current_subject ||
747 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
748     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
749     current_subject > start_subject + md->start_offset)))
750 nigel 77 {
751 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
752     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
753     match_count = 0;
754     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
755     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
756     if (offsetcount >= 2)
757     {
758 ph10 530 offsets[0] = (int)(current_subject - start_subject);
759     offsets[1] = (int)(ptr - start_subject);
760 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
761     offsets[1] - offsets[0], current_subject));
762     }
763     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
764     {
765     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
766     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
767     match_count, rlevel*2-2, SP));
768     return match_count;
769     }
770 ph10 461 }
771 nigel 77 }
772     break;
773    
774     /* ========================================================================== */
775     /* These opcodes add to the current list of states without looking
776     at the current character. */
777    
778     /*-----------------------------------------------------------------*/
779     case OP_ALT:
780     do { code += GET(code, 1); } while (*code == OP_ALT);
781 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
782 nigel 77 break;
783    
784     /*-----------------------------------------------------------------*/
785     case OP_BRA:
786 nigel 93 case OP_SBRA:
787 nigel 77 do
788     {
789 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
790 nigel 77 code += GET(code, 1);
791     }
792     while (*code == OP_ALT);
793     break;
794    
795     /*-----------------------------------------------------------------*/
796 nigel 93 case OP_CBRA:
797     case OP_SCBRA:
798 ph10 530 ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
799 nigel 93 code += GET(code, 1);
800     while (*code == OP_ALT)
801     {
802 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
803 nigel 93 code += GET(code, 1);
804     }
805     break;
806    
807     /*-----------------------------------------------------------------*/
808 nigel 77 case OP_BRAZERO:
809     case OP_BRAMINZERO:
810     ADD_ACTIVE(state_offset + 1, 0);
811     code += 1 + GET(code, 2);
812     while (*code == OP_ALT) code += GET(code, 1);
813 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
814 nigel 77 break;
815    
816     /*-----------------------------------------------------------------*/
817 ph10 335 case OP_SKIPZERO:
818     code += 1 + GET(code, 2);
819     while (*code == OP_ALT) code += GET(code, 1);
820 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821 ph10 335 break;
822    
823     /*-----------------------------------------------------------------*/
824 nigel 77 case OP_CIRC:
825     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
826 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
827     ptr != end_subject &&
828 nigel 93 WAS_NEWLINE(ptr)))
829 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
830     break;
831    
832     /*-----------------------------------------------------------------*/
833     case OP_EOD:
834     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
835     break;
836    
837     /*-----------------------------------------------------------------*/
838     case OP_OPT:
839     ims = code[1];
840     ADD_ACTIVE(state_offset + 2, 0);
841     break;
842    
843     /*-----------------------------------------------------------------*/
844     case OP_SOD:
845     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
846     break;
847    
848     /*-----------------------------------------------------------------*/
849     case OP_SOM:
850     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
851     break;
852    
853    
854     /* ========================================================================== */
855     /* These opcodes inspect the next subject character, and sometimes
856     the previous one as well, but do not have an argument. The variable
857     clen contains the length of the current character and is zero if we are
858     at the end of the subject. */
859    
860     /*-----------------------------------------------------------------*/
861     case OP_ANY:
862 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
863 nigel 77 { ADD_NEW(state_offset + 1, 0); }
864     break;
865    
866     /*-----------------------------------------------------------------*/
867 ph10 341 case OP_ALLANY:
868     if (clen > 0)
869     { ADD_NEW(state_offset + 1, 0); }
870     break;
871    
872     /*-----------------------------------------------------------------*/
873 nigel 77 case OP_EODN:
874 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
875 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
876     break;
877    
878     /*-----------------------------------------------------------------*/
879     case OP_DOLL:
880     if ((md->moptions & PCRE_NOTEOL) == 0)
881     {
882 nigel 91 if (clen == 0 ||
883 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
884 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
885     ))
886 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
887     }
888 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
889 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
890     break;
891    
892     /*-----------------------------------------------------------------*/
893    
894     case OP_DIGIT:
895     case OP_WHITESPACE:
896     case OP_WORDCHAR:
897     if (clen > 0 && c < 256 &&
898     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
899     { ADD_NEW(state_offset + 1, 0); }
900     break;
901    
902     /*-----------------------------------------------------------------*/
903     case OP_NOT_DIGIT:
904     case OP_NOT_WHITESPACE:
905     case OP_NOT_WORDCHAR:
906     if (clen > 0 && (c >= 256 ||
907     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
908     { ADD_NEW(state_offset + 1, 0); }
909     break;
910    
911     /*-----------------------------------------------------------------*/
912     case OP_WORD_BOUNDARY:
913     case OP_NOT_WORD_BOUNDARY:
914     {
915     int left_word, right_word;
916    
917     if (ptr > start_subject)
918     {
919     const uschar *temp = ptr - 1;
920 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
921 nigel 77 #ifdef SUPPORT_UTF8
922     if (utf8) BACKCHAR(temp);
923     #endif
924     GETCHARTEST(d, temp);
925 ph10 518 #ifdef SUPPORT_UCP
926     if ((md->poptions & PCRE_UCP) != 0)
927     {
928     if (d == '_') left_word = TRUE; else
929     {
930     int cat = UCD_CATEGORY(d);
931     left_word = (cat == ucp_L || cat == ucp_N);
932     }
933     }
934     else
935     #endif
936 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
937     }
938 ph10 518 else left_word = FALSE;
939 nigel 77
940 ph10 461 if (clen > 0)
941 ph10 518 {
942     #ifdef SUPPORT_UCP
943     if ((md->poptions & PCRE_UCP) != 0)
944     {
945     if (c == '_') right_word = TRUE; else
946     {
947     int cat = UCD_CATEGORY(c);
948     right_word = (cat == ucp_L || cat == ucp_N);
949     }
950     }
951     else
952     #endif
953 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
954 ph10 518 }
955     else right_word = FALSE;
956 nigel 77
957     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
958     { ADD_ACTIVE(state_offset + 1, 0); }
959     }
960     break;
961    
962    
963     /*-----------------------------------------------------------------*/
964     /* Check the next character by Unicode property. We will get here only
965     if the support is in the binary; otherwise a compile-time error occurs.
966     */
967    
968 ph10 151 #ifdef SUPPORT_UCP
969 nigel 77 case OP_PROP:
970     case OP_NOTPROP:
971     if (clen > 0)
972     {
973 nigel 87 BOOL OK;
974 ph10 349 const ucd_record * prop = GET_UCD(c);
975 nigel 87 switch(code[1])
976 nigel 77 {
977 nigel 87 case PT_ANY:
978     OK = TRUE;
979     break;
980    
981     case PT_LAMP:
982 ph10 517 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
983     prop->chartype == ucp_Lt;
984 nigel 87 break;
985    
986     case PT_GC:
987 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
988 nigel 87 break;
989    
990     case PT_PC:
991 ph10 349 OK = prop->chartype == code[2];
992 nigel 87 break;
993    
994     case PT_SC:
995 ph10 349 OK = prop->script == code[2];
996 nigel 87 break;
997 ph10 517
998     /* These are specials for combination cases. */
999    
1000     case PT_ALNUM:
1001     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1002     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1003     break;
1004    
1005     case PT_SPACE: /* Perl space */
1006     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1007     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1008     break;
1009    
1010     case PT_PXSPACE: /* POSIX space */
1011     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1012     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1013     c == CHAR_FF || c == CHAR_CR;
1014     break;
1015    
1016     case PT_WORD:
1017     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1018     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1019     c == CHAR_UNDERSCORE;
1020     break;
1021 nigel 87
1022     /* Should never occur, but keep compilers from grumbling. */
1023    
1024     default:
1025     OK = codevalue != OP_PROP;
1026     break;
1027 nigel 77 }
1028 nigel 87
1029     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1030 nigel 77 }
1031     break;
1032     #endif
1033    
1034    
1035    
1036     /* ========================================================================== */
1037     /* These opcodes likewise inspect the subject character, but have an
1038     argument that is not a data character. It is one of these opcodes:
1039 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1040     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1041 nigel 77
1042     case OP_TYPEPLUS:
1043     case OP_TYPEMINPLUS:
1044 nigel 93 case OP_TYPEPOSPLUS:
1045 nigel 77 count = current_state->count; /* Already matched */
1046     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1047     if (clen > 0)
1048     {
1049     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1050     (c < 256 &&
1051 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1052 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1053     {
1054 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1055     {
1056     active_count--; /* Remove non-match possibility */
1057     next_active_state--;
1058     }
1059 nigel 77 count++;
1060     ADD_NEW(state_offset, count);
1061     }
1062     }
1063     break;
1064    
1065     /*-----------------------------------------------------------------*/
1066     case OP_TYPEQUERY:
1067     case OP_TYPEMINQUERY:
1068 nigel 93 case OP_TYPEPOSQUERY:
1069 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1070     if (clen > 0)
1071     {
1072     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1073     (c < 256 &&
1074 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1075 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1076     {
1077 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1078     {
1079     active_count--; /* Remove non-match possibility */
1080     next_active_state--;
1081     }
1082 nigel 77 ADD_NEW(state_offset + 2, 0);
1083     }
1084     }
1085     break;
1086    
1087     /*-----------------------------------------------------------------*/
1088     case OP_TYPESTAR:
1089     case OP_TYPEMINSTAR:
1090 nigel 93 case OP_TYPEPOSSTAR:
1091 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1092     if (clen > 0)
1093     {
1094     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1095     (c < 256 &&
1096 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1097 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1098     {
1099 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1100     {
1101     active_count--; /* Remove non-match possibility */
1102     next_active_state--;
1103     }
1104 nigel 77 ADD_NEW(state_offset, 0);
1105     }
1106     }
1107     break;
1108    
1109     /*-----------------------------------------------------------------*/
1110     case OP_TYPEEXACT:
1111 nigel 93 count = current_state->count; /* Number already matched */
1112     if (clen > 0)
1113     {
1114     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1115     (c < 256 &&
1116 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1117 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1118     {
1119     if (++count >= GET2(code, 1))
1120     { ADD_NEW(state_offset + 4, 0); }
1121     else
1122     { ADD_NEW(state_offset, count); }
1123     }
1124     }
1125     break;
1126    
1127     /*-----------------------------------------------------------------*/
1128 nigel 77 case OP_TYPEUPTO:
1129     case OP_TYPEMINUPTO:
1130 nigel 93 case OP_TYPEPOSUPTO:
1131     ADD_ACTIVE(state_offset + 4, 0);
1132 nigel 77 count = current_state->count; /* Number already matched */
1133     if (clen > 0)
1134     {
1135     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1136     (c < 256 &&
1137 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1138 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1139     {
1140 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1141     {
1142     active_count--; /* Remove non-match possibility */
1143     next_active_state--;
1144     }
1145 nigel 77 if (++count >= GET2(code, 1))
1146     { ADD_NEW(state_offset + 4, 0); }
1147     else
1148     { ADD_NEW(state_offset, count); }
1149     }
1150     }
1151     break;
1152    
1153     /* ========================================================================== */
1154     /* These are virtual opcodes that are used when something like
1155 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1156     argument. It keeps the code above fast for the other cases. The argument
1157     is in the d variable. */
1158 nigel 77
1159 ph10 151 #ifdef SUPPORT_UCP
1160 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1161     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1162 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1163 nigel 77 count = current_state->count; /* Already matched */
1164 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1165 nigel 77 if (clen > 0)
1166     {
1167 nigel 87 BOOL OK;
1168 ph10 349 const ucd_record * prop = GET_UCD(c);
1169 nigel 87 switch(code[2])
1170     {
1171     case PT_ANY:
1172     OK = TRUE;
1173     break;
1174    
1175     case PT_LAMP:
1176 ph10 517 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1177     prop->chartype == ucp_Lt;
1178 nigel 87 break;
1179    
1180     case PT_GC:
1181 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1182 nigel 87 break;
1183    
1184     case PT_PC:
1185 ph10 349 OK = prop->chartype == code[3];
1186 nigel 87 break;
1187    
1188     case PT_SC:
1189 ph10 349 OK = prop->script == code[3];
1190 nigel 87 break;
1191    
1192 ph10 517 /* These are specials for combination cases. */
1193    
1194     case PT_ALNUM:
1195     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1196     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1197     break;
1198    
1199     case PT_SPACE: /* Perl space */
1200     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1201     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1202     break;
1203    
1204     case PT_PXSPACE: /* POSIX space */
1205     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1206     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1207     c == CHAR_FF || c == CHAR_CR;
1208     break;
1209    
1210     case PT_WORD:
1211     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1212     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1213     c == CHAR_UNDERSCORE;
1214     break;
1215    
1216 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1217    
1218     default:
1219     OK = codevalue != OP_PROP;
1220     break;
1221     }
1222    
1223 nigel 93 if (OK == (d == OP_PROP))
1224     {
1225     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1226     {
1227     active_count--; /* Remove non-match possibility */
1228     next_active_state--;
1229     }
1230     count++;
1231     ADD_NEW(state_offset, count);
1232     }
1233 nigel 77 }
1234     break;
1235    
1236     /*-----------------------------------------------------------------*/
1237     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1238     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1239 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1240 nigel 77 count = current_state->count; /* Already matched */
1241     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1242 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1243 nigel 77 {
1244     const uschar *nptr = ptr + clen;
1245     int ncount = 0;
1246 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1247     {
1248     active_count--; /* Remove non-match possibility */
1249     next_active_state--;
1250     }
1251 nigel 77 while (nptr < end_subject)
1252     {
1253     int nd;
1254     int ndlen = 1;
1255     GETCHARLEN(nd, nptr, ndlen);
1256 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1257 nigel 77 ncount++;
1258     nptr += ndlen;
1259     }
1260     count++;
1261     ADD_NEW_DATA(-state_offset, count, ncount);
1262     }
1263     break;
1264 ph10 151 #endif
1265 nigel 77
1266     /*-----------------------------------------------------------------*/
1267 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1268     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1269     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1270     count = current_state->count; /* Already matched */
1271     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1272     if (clen > 0)
1273     {
1274     int ncount = 0;
1275     switch (c)
1276     {
1277     case 0x000b:
1278     case 0x000c:
1279     case 0x0085:
1280     case 0x2028:
1281     case 0x2029:
1282 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1283     goto ANYNL01;
1284    
1285     case 0x000d:
1286     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1287     /* Fall through */
1288    
1289     ANYNL01:
1290     case 0x000a:
1291 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1292     {
1293     active_count--; /* Remove non-match possibility */
1294     next_active_state--;
1295     }
1296     count++;
1297     ADD_NEW_DATA(-state_offset, count, ncount);
1298     break;
1299 ph10 231
1300 nigel 93 default:
1301     break;
1302     }
1303     }
1304     break;
1305    
1306     /*-----------------------------------------------------------------*/
1307 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1308     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1309     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1310     count = current_state->count; /* Already matched */
1311     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1312     if (clen > 0)
1313     {
1314 ph10 182 BOOL OK;
1315 ph10 178 switch (c)
1316     {
1317     case 0x000a:
1318     case 0x000b:
1319     case 0x000c:
1320     case 0x000d:
1321     case 0x0085:
1322     case 0x2028:
1323     case 0x2029:
1324     OK = TRUE;
1325 ph10 182 break;
1326 ph10 178
1327     default:
1328     OK = FALSE;
1329 ph10 182 break;
1330 ph10 178 }
1331    
1332     if (OK == (d == OP_VSPACE))
1333 ph10 182 {
1334 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1335     {
1336     active_count--; /* Remove non-match possibility */
1337     next_active_state--;
1338     }
1339     count++;
1340     ADD_NEW_DATA(-state_offset, count, 0);
1341     }
1342     }
1343     break;
1344    
1345     /*-----------------------------------------------------------------*/
1346     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1347     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1348     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1349     count = current_state->count; /* Already matched */
1350     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1351     if (clen > 0)
1352     {
1353 ph10 182 BOOL OK;
1354 ph10 178 switch (c)
1355     {
1356     case 0x09: /* HT */
1357     case 0x20: /* SPACE */
1358     case 0xa0: /* NBSP */
1359     case 0x1680: /* OGHAM SPACE MARK */
1360     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1361     case 0x2000: /* EN QUAD */
1362     case 0x2001: /* EM QUAD */
1363     case 0x2002: /* EN SPACE */
1364     case 0x2003: /* EM SPACE */
1365     case 0x2004: /* THREE-PER-EM SPACE */
1366     case 0x2005: /* FOUR-PER-EM SPACE */
1367     case 0x2006: /* SIX-PER-EM SPACE */
1368     case 0x2007: /* FIGURE SPACE */
1369     case 0x2008: /* PUNCTUATION SPACE */
1370     case 0x2009: /* THIN SPACE */
1371     case 0x200A: /* HAIR SPACE */
1372     case 0x202f: /* NARROW NO-BREAK SPACE */
1373     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1374     case 0x3000: /* IDEOGRAPHIC SPACE */
1375     OK = TRUE;
1376     break;
1377 ph10 182
1378 ph10 178 default:
1379     OK = FALSE;
1380     break;
1381     }
1382 ph10 182
1383 ph10 178 if (OK == (d == OP_HSPACE))
1384 ph10 182 {
1385 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1386     {
1387     active_count--; /* Remove non-match possibility */
1388     next_active_state--;
1389     }
1390     count++;
1391     ADD_NEW_DATA(-state_offset, count, 0);
1392     }
1393     }
1394     break;
1395    
1396     /*-----------------------------------------------------------------*/
1397 ph10 151 #ifdef SUPPORT_UCP
1398 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1399     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1400 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1401 nigel 87 count = 4;
1402 nigel 77 goto QS1;
1403    
1404     case OP_PROP_EXTRA + OP_TYPESTAR:
1405     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1406 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1407 nigel 77 count = 0;
1408    
1409     QS1:
1410    
1411 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1412 nigel 77 if (clen > 0)
1413     {
1414 nigel 87 BOOL OK;
1415 ph10 349 const ucd_record * prop = GET_UCD(c);
1416 nigel 87 switch(code[2])
1417     {
1418     case PT_ANY:
1419     OK = TRUE;
1420     break;
1421    
1422     case PT_LAMP:
1423 ph10 517 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1424     prop->chartype == ucp_Lt;
1425 nigel 87 break;
1426    
1427     case PT_GC:
1428 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1429 nigel 87 break;
1430    
1431     case PT_PC:
1432 ph10 349 OK = prop->chartype == code[3];
1433 nigel 87 break;
1434    
1435     case PT_SC:
1436 ph10 349 OK = prop->script == code[3];
1437 nigel 87 break;
1438 ph10 517
1439     /* These are specials for combination cases. */
1440    
1441     case PT_ALNUM:
1442     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1443     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1444     break;
1445    
1446     case PT_SPACE: /* Perl space */
1447     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1448     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1449     break;
1450    
1451     case PT_PXSPACE: /* POSIX space */
1452     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1453     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1454     c == CHAR_FF || c == CHAR_CR;
1455     break;
1456    
1457     case PT_WORD:
1458     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1459     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1460     c == CHAR_UNDERSCORE;
1461     break;
1462 nigel 87
1463     /* Should never occur, but keep compilers from grumbling. */
1464    
1465     default:
1466     OK = codevalue != OP_PROP;
1467     break;
1468     }
1469    
1470 nigel 93 if (OK == (d == OP_PROP))
1471     {
1472     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1473     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1474     {
1475     active_count--; /* Remove non-match possibility */
1476     next_active_state--;
1477     }
1478     ADD_NEW(state_offset + count, 0);
1479     }
1480 nigel 77 }
1481     break;
1482    
1483     /*-----------------------------------------------------------------*/
1484     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1485     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1486 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1487 nigel 77 count = 2;
1488     goto QS2;
1489    
1490     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1491     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1492 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1493 nigel 77 count = 0;
1494    
1495     QS2:
1496    
1497     ADD_ACTIVE(state_offset + 2, 0);
1498 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1499 nigel 77 {
1500     const uschar *nptr = ptr + clen;
1501     int ncount = 0;
1502 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1503     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1504     {
1505     active_count--; /* Remove non-match possibility */
1506     next_active_state--;
1507     }
1508 nigel 77 while (nptr < end_subject)
1509     {
1510     int nd;
1511     int ndlen = 1;
1512     GETCHARLEN(nd, nptr, ndlen);
1513 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1514 nigel 77 ncount++;
1515     nptr += ndlen;
1516     }
1517     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1518     }
1519     break;
1520 ph10 151 #endif
1521 nigel 77
1522     /*-----------------------------------------------------------------*/
1523 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1524     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1525     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1526     count = 2;
1527     goto QS3;
1528    
1529     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1530     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1531     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1532     count = 0;
1533    
1534     QS3:
1535     ADD_ACTIVE(state_offset + 2, 0);
1536     if (clen > 0)
1537     {
1538     int ncount = 0;
1539     switch (c)
1540     {
1541     case 0x000b:
1542     case 0x000c:
1543     case 0x0085:
1544     case 0x2028:
1545     case 0x2029:
1546 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1547     goto ANYNL02;
1548    
1549     case 0x000d:
1550     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1551     /* Fall through */
1552    
1553     ANYNL02:
1554     case 0x000a:
1555 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1556     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1557     {
1558     active_count--; /* Remove non-match possibility */
1559     next_active_state--;
1560     }
1561     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1562     break;
1563 ph10 231
1564 nigel 93 default:
1565     break;
1566     }
1567     }
1568     break;
1569    
1570     /*-----------------------------------------------------------------*/
1571 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1572     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1573     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1574     count = 2;
1575     goto QS4;
1576    
1577     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1578     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1579     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1580     count = 0;
1581    
1582     QS4:
1583     ADD_ACTIVE(state_offset + 2, 0);
1584     if (clen > 0)
1585     {
1586 ph10 182 BOOL OK;
1587 ph10 178 switch (c)
1588     {
1589     case 0x000a:
1590     case 0x000b:
1591     case 0x000c:
1592     case 0x000d:
1593     case 0x0085:
1594     case 0x2028:
1595     case 0x2029:
1596     OK = TRUE;
1597     break;
1598 ph10 182
1599 ph10 178 default:
1600     OK = FALSE;
1601     break;
1602     }
1603     if (OK == (d == OP_VSPACE))
1604 ph10 182 {
1605 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1606     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1607     {
1608     active_count--; /* Remove non-match possibility */
1609     next_active_state--;
1610     }
1611     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1612     }
1613     }
1614     break;
1615    
1616     /*-----------------------------------------------------------------*/
1617     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1618     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1619     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1620     count = 2;
1621     goto QS5;
1622    
1623     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1624     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1625     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1626     count = 0;
1627    
1628     QS5:
1629     ADD_ACTIVE(state_offset + 2, 0);
1630     if (clen > 0)
1631     {
1632 ph10 182 BOOL OK;
1633 ph10 178 switch (c)
1634     {
1635     case 0x09: /* HT */
1636     case 0x20: /* SPACE */
1637     case 0xa0: /* NBSP */
1638     case 0x1680: /* OGHAM SPACE MARK */
1639     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1640     case 0x2000: /* EN QUAD */
1641     case 0x2001: /* EM QUAD */
1642     case 0x2002: /* EN SPACE */
1643     case 0x2003: /* EM SPACE */
1644     case 0x2004: /* THREE-PER-EM SPACE */
1645     case 0x2005: /* FOUR-PER-EM SPACE */
1646     case 0x2006: /* SIX-PER-EM SPACE */
1647     case 0x2007: /* FIGURE SPACE */
1648     case 0x2008: /* PUNCTUATION SPACE */
1649     case 0x2009: /* THIN SPACE */
1650     case 0x200A: /* HAIR SPACE */
1651     case 0x202f: /* NARROW NO-BREAK SPACE */
1652     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1653     case 0x3000: /* IDEOGRAPHIC SPACE */
1654     OK = TRUE;
1655     break;
1656 ph10 182
1657 ph10 178 default:
1658     OK = FALSE;
1659     break;
1660     }
1661 ph10 182
1662 ph10 178 if (OK == (d == OP_HSPACE))
1663 ph10 182 {
1664 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1665     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1666     {
1667     active_count--; /* Remove non-match possibility */
1668     next_active_state--;
1669     }
1670     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1671     }
1672     }
1673     break;
1674    
1675     /*-----------------------------------------------------------------*/
1676 ph10 151 #ifdef SUPPORT_UCP
1677 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1678     case OP_PROP_EXTRA + OP_TYPEUPTO:
1679     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1680 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1681 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1682 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1683 nigel 77 count = current_state->count; /* Number already matched */
1684     if (clen > 0)
1685     {
1686 nigel 87 BOOL OK;
1687 ph10 349 const ucd_record * prop = GET_UCD(c);
1688 nigel 87 switch(code[4])
1689 nigel 77 {
1690 nigel 87 case PT_ANY:
1691     OK = TRUE;
1692     break;
1693    
1694     case PT_LAMP:
1695 ph10 517 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1696     prop->chartype == ucp_Lt;
1697 nigel 87 break;
1698    
1699     case PT_GC:
1700 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1701 nigel 87 break;
1702    
1703     case PT_PC:
1704 ph10 349 OK = prop->chartype == code[5];
1705 nigel 87 break;
1706    
1707     case PT_SC:
1708 ph10 349 OK = prop->script == code[5];
1709 nigel 87 break;
1710 ph10 517
1711     /* These are specials for combination cases. */
1712    
1713     case PT_ALNUM:
1714     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1715     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1716     break;
1717    
1718     case PT_SPACE: /* Perl space */
1719     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1720     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1721     break;
1722    
1723     case PT_PXSPACE: /* POSIX space */
1724     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1725     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1726     c == CHAR_FF || c == CHAR_CR;
1727     break;
1728    
1729     case PT_WORD:
1730     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1731     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1732     c == CHAR_UNDERSCORE;
1733     break;
1734 nigel 87
1735     /* Should never occur, but keep compilers from grumbling. */
1736    
1737     default:
1738     OK = codevalue != OP_PROP;
1739     break;
1740     }
1741    
1742     if (OK == (d == OP_PROP))
1743     {
1744 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1745     {
1746     active_count--; /* Remove non-match possibility */
1747     next_active_state--;
1748     }
1749 nigel 77 if (++count >= GET2(code, 1))
1750 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1751 nigel 77 else
1752     { ADD_NEW(state_offset, count); }
1753     }
1754     }
1755     break;
1756    
1757     /*-----------------------------------------------------------------*/
1758     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1759     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1760     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1761 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1762 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1763     { ADD_ACTIVE(state_offset + 4, 0); }
1764     count = current_state->count; /* Number already matched */
1765 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1766 nigel 77 {
1767     const uschar *nptr = ptr + clen;
1768     int ncount = 0;
1769 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1770     {
1771     active_count--; /* Remove non-match possibility */
1772     next_active_state--;
1773     }
1774 nigel 77 while (nptr < end_subject)
1775     {
1776     int nd;
1777     int ndlen = 1;
1778     GETCHARLEN(nd, nptr, ndlen);
1779 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1780 nigel 77 ncount++;
1781     nptr += ndlen;
1782     }
1783     if (++count >= GET2(code, 1))
1784     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1785     else
1786     { ADD_NEW_DATA(-state_offset, count, ncount); }
1787     }
1788     break;
1789 ph10 151 #endif
1790 nigel 77
1791 nigel 93 /*-----------------------------------------------------------------*/
1792     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1793     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1794     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1795     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1796     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1797     { ADD_ACTIVE(state_offset + 4, 0); }
1798     count = current_state->count; /* Number already matched */
1799     if (clen > 0)
1800     {
1801     int ncount = 0;
1802     switch (c)
1803     {
1804     case 0x000b:
1805     case 0x000c:
1806     case 0x0085:
1807     case 0x2028:
1808     case 0x2029:
1809 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1810     goto ANYNL03;
1811    
1812     case 0x000d:
1813     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1814     /* Fall through */
1815    
1816     ANYNL03:
1817     case 0x000a:
1818 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1819     {
1820     active_count--; /* Remove non-match possibility */
1821     next_active_state--;
1822     }
1823     if (++count >= GET2(code, 1))
1824     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1825     else
1826     { ADD_NEW_DATA(-state_offset, count, ncount); }
1827     break;
1828 ph10 231
1829 nigel 93 default:
1830     break;
1831     }
1832     }
1833     break;
1834    
1835 ph10 178 /*-----------------------------------------------------------------*/
1836     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1837     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1838     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1839     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1840     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1841     { ADD_ACTIVE(state_offset + 4, 0); }
1842     count = current_state->count; /* Number already matched */
1843     if (clen > 0)
1844     {
1845 ph10 182 BOOL OK;
1846 ph10 178 switch (c)
1847     {
1848     case 0x000a:
1849     case 0x000b:
1850     case 0x000c:
1851     case 0x000d:
1852     case 0x0085:
1853     case 0x2028:
1854     case 0x2029:
1855     OK = TRUE;
1856     break;
1857 ph10 182
1858 ph10 178 default:
1859     OK = FALSE;
1860     }
1861 ph10 182
1862 ph10 178 if (OK == (d == OP_VSPACE))
1863 ph10 182 {
1864 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1865     {
1866     active_count--; /* Remove non-match possibility */
1867     next_active_state--;
1868     }
1869     if (++count >= GET2(code, 1))
1870     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1871     else
1872     { ADD_NEW_DATA(-state_offset, count, 0); }
1873     }
1874     }
1875     break;
1876    
1877     /*-----------------------------------------------------------------*/
1878     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1879     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1880     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1881     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1882     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1883     { ADD_ACTIVE(state_offset + 4, 0); }
1884     count = current_state->count; /* Number already matched */
1885     if (clen > 0)
1886     {
1887 ph10 182 BOOL OK;
1888 ph10 178 switch (c)
1889     {
1890     case 0x09: /* HT */
1891     case 0x20: /* SPACE */
1892     case 0xa0: /* NBSP */
1893     case 0x1680: /* OGHAM SPACE MARK */
1894     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1895     case 0x2000: /* EN QUAD */
1896     case 0x2001: /* EM QUAD */
1897     case 0x2002: /* EN SPACE */
1898     case 0x2003: /* EM SPACE */
1899     case 0x2004: /* THREE-PER-EM SPACE */
1900     case 0x2005: /* FOUR-PER-EM SPACE */
1901     case 0x2006: /* SIX-PER-EM SPACE */
1902     case 0x2007: /* FIGURE SPACE */
1903     case 0x2008: /* PUNCTUATION SPACE */
1904     case 0x2009: /* THIN SPACE */
1905     case 0x200A: /* HAIR SPACE */
1906     case 0x202f: /* NARROW NO-BREAK SPACE */
1907     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1908     case 0x3000: /* IDEOGRAPHIC SPACE */
1909     OK = TRUE;
1910     break;
1911 ph10 182
1912 ph10 178 default:
1913     OK = FALSE;
1914     break;
1915     }
1916 ph10 182
1917 ph10 178 if (OK == (d == OP_HSPACE))
1918 ph10 182 {
1919 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1920     {
1921     active_count--; /* Remove non-match possibility */
1922     next_active_state--;
1923     }
1924     if (++count >= GET2(code, 1))
1925     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1926     else
1927     { ADD_NEW_DATA(-state_offset, count, 0); }
1928     }
1929     }
1930     break;
1931    
1932 nigel 77 /* ========================================================================== */
1933     /* These opcodes are followed by a character that is usually compared
1934     to the current subject character; it is loaded into d. We still get
1935     here even if there is no subject character, because in some cases zero
1936     repetitions are permitted. */
1937    
1938     /*-----------------------------------------------------------------*/
1939     case OP_CHAR:
1940     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1941     break;
1942    
1943     /*-----------------------------------------------------------------*/
1944     case OP_CHARNC:
1945     if (clen == 0) break;
1946    
1947     #ifdef SUPPORT_UTF8
1948     if (utf8)
1949     {
1950     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1951     {
1952 nigel 93 unsigned int othercase;
1953 nigel 77 if (c < 128) othercase = fcc[c]; else
1954    
1955     /* If we have Unicode property support, we can use it to test the
1956 nigel 87 other case of the character. */
1957 nigel 77
1958     #ifdef SUPPORT_UCP
1959 ph10 349 othercase = UCD_OTHERCASE(c);
1960 nigel 87 #else
1961 nigel 93 othercase = NOTACHAR;
1962 nigel 77 #endif
1963    
1964     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1965     }
1966     }
1967     else
1968     #endif /* SUPPORT_UTF8 */
1969    
1970     /* Non-UTF-8 mode */
1971     {
1972     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1973     }
1974     break;
1975    
1976    
1977     #ifdef SUPPORT_UCP
1978     /*-----------------------------------------------------------------*/
1979     /* This is a tricky one because it can match more than one character.
1980     Find out how many characters to skip, and then set up a negative state
1981     to wait for them to pass before continuing. */
1982    
1983     case OP_EXTUNI:
1984 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1985 nigel 77 {
1986     const uschar *nptr = ptr + clen;
1987     int ncount = 0;
1988     while (nptr < end_subject)
1989     {
1990     int nclen = 1;
1991     GETCHARLEN(c, nptr, nclen);
1992 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1993 nigel 77 ncount++;
1994     nptr += nclen;
1995     }
1996     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1997     }
1998     break;
1999     #endif
2000    
2001     /*-----------------------------------------------------------------*/
2002 nigel 93 /* This is a tricky one, like EXTUNI, because it too can match more than one
2003     character (when CR is followed by LF). In this case, set up a negative
2004     state to wait for one character to pass before continuing. */
2005    
2006     case OP_ANYNL:
2007     if (clen > 0) switch(c)
2008     {
2009     case 0x000b:
2010     case 0x000c:
2011     case 0x0085:
2012     case 0x2028:
2013     case 0x2029:
2014 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2015    
2016     case 0x000a:
2017 nigel 93 ADD_NEW(state_offset + 1, 0);
2018     break;
2019 ph10 231
2020 nigel 93 case 0x000d:
2021     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2022     {
2023     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2024     }
2025     else
2026     {
2027     ADD_NEW(state_offset + 1, 0);
2028     }
2029     break;
2030     }
2031     break;
2032    
2033     /*-----------------------------------------------------------------*/
2034 ph10 178 case OP_NOT_VSPACE:
2035     if (clen > 0) switch(c)
2036     {
2037     case 0x000a:
2038     case 0x000b:
2039     case 0x000c:
2040     case 0x000d:
2041     case 0x0085:
2042     case 0x2028:
2043     case 0x2029:
2044     break;
2045 ph10 182
2046     default:
2047 ph10 178 ADD_NEW(state_offset + 1, 0);
2048     break;
2049     }
2050     break;
2051    
2052     /*-----------------------------------------------------------------*/
2053     case OP_VSPACE:
2054     if (clen > 0) switch(c)
2055     {
2056     case 0x000a:
2057     case 0x000b:
2058     case 0x000c:
2059     case 0x000d:
2060     case 0x0085:
2061     case 0x2028:
2062     case 0x2029:
2063     ADD_NEW(state_offset + 1, 0);
2064     break;
2065 ph10 182
2066 ph10 178 default: break;
2067     }
2068     break;
2069    
2070     /*-----------------------------------------------------------------*/
2071     case OP_NOT_HSPACE:
2072     if (clen > 0) switch(c)
2073     {
2074     case 0x09: /* HT */
2075     case 0x20: /* SPACE */
2076     case 0xa0: /* NBSP */
2077     case 0x1680: /* OGHAM SPACE MARK */
2078     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2079     case 0x2000: /* EN QUAD */
2080     case 0x2001: /* EM QUAD */
2081     case 0x2002: /* EN SPACE */
2082     case 0x2003: /* EM SPACE */
2083     case 0x2004: /* THREE-PER-EM SPACE */
2084     case 0x2005: /* FOUR-PER-EM SPACE */
2085     case 0x2006: /* SIX-PER-EM SPACE */
2086     case 0x2007: /* FIGURE SPACE */
2087     case 0x2008: /* PUNCTUATION SPACE */
2088     case 0x2009: /* THIN SPACE */
2089     case 0x200A: /* HAIR SPACE */
2090     case 0x202f: /* NARROW NO-BREAK SPACE */
2091     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2092     case 0x3000: /* IDEOGRAPHIC SPACE */
2093     break;
2094 ph10 182
2095     default:
2096 ph10 178 ADD_NEW(state_offset + 1, 0);
2097     break;
2098     }
2099     break;
2100    
2101     /*-----------------------------------------------------------------*/
2102     case OP_HSPACE:
2103     if (clen > 0) switch(c)
2104     {
2105     case 0x09: /* HT */
2106     case 0x20: /* SPACE */
2107     case 0xa0: /* NBSP */
2108     case 0x1680: /* OGHAM SPACE MARK */
2109     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2110     case 0x2000: /* EN QUAD */
2111     case 0x2001: /* EM QUAD */
2112     case 0x2002: /* EN SPACE */
2113     case 0x2003: /* EM SPACE */
2114     case 0x2004: /* THREE-PER-EM SPACE */
2115     case 0x2005: /* FOUR-PER-EM SPACE */
2116     case 0x2006: /* SIX-PER-EM SPACE */
2117     case 0x2007: /* FIGURE SPACE */
2118     case 0x2008: /* PUNCTUATION SPACE */
2119     case 0x2009: /* THIN SPACE */
2120     case 0x200A: /* HAIR SPACE */
2121     case 0x202f: /* NARROW NO-BREAK SPACE */
2122     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2123     case 0x3000: /* IDEOGRAPHIC SPACE */
2124     ADD_NEW(state_offset + 1, 0);
2125     break;
2126     }
2127     break;
2128    
2129     /*-----------------------------------------------------------------*/
2130 nigel 77 /* Match a negated single character. This is only used for one-byte
2131     characters, that is, we know that d < 256. The character we are
2132     checking (c) can be multibyte. */
2133    
2134     case OP_NOT:
2135     if (clen > 0)
2136     {
2137 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2138 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2139     }
2140     break;
2141    
2142     /*-----------------------------------------------------------------*/
2143     case OP_PLUS:
2144     case OP_MINPLUS:
2145 nigel 93 case OP_POSPLUS:
2146 nigel 77 case OP_NOTPLUS:
2147     case OP_NOTMINPLUS:
2148 nigel 93 case OP_NOTPOSPLUS:
2149 nigel 77 count = current_state->count; /* Already matched */
2150     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2151     if (clen > 0)
2152     {
2153 nigel 93 unsigned int otherd = NOTACHAR;
2154 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2155     {
2156     #ifdef SUPPORT_UTF8
2157 nigel 87 if (utf8 && d >= 128)
2158 nigel 77 {
2159     #ifdef SUPPORT_UCP
2160 ph10 349 otherd = UCD_OTHERCASE(d);
2161 nigel 77 #endif /* SUPPORT_UCP */
2162     }
2163     else
2164     #endif /* SUPPORT_UTF8 */
2165     otherd = fcc[d];
2166     }
2167     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2168 nigel 93 {
2169     if (count > 0 &&
2170     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2171     {
2172     active_count--; /* Remove non-match possibility */
2173     next_active_state--;
2174     }
2175     count++;
2176     ADD_NEW(state_offset, count);
2177     }
2178 nigel 77 }
2179     break;
2180    
2181     /*-----------------------------------------------------------------*/
2182     case OP_QUERY:
2183     case OP_MINQUERY:
2184 nigel 93 case OP_POSQUERY:
2185 nigel 77 case OP_NOTQUERY:
2186     case OP_NOTMINQUERY:
2187 nigel 93 case OP_NOTPOSQUERY:
2188 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2189     if (clen > 0)
2190     {
2191 nigel 93 unsigned int otherd = NOTACHAR;
2192 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2193 nigel 77 {
2194     #ifdef SUPPORT_UTF8
2195 nigel 87 if (utf8 && d >= 128)
2196 nigel 77 {
2197     #ifdef SUPPORT_UCP
2198 ph10 349 otherd = UCD_OTHERCASE(d);
2199 nigel 77 #endif /* SUPPORT_UCP */
2200     }
2201     else
2202     #endif /* SUPPORT_UTF8 */
2203     otherd = fcc[d];
2204     }
2205     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2206 nigel 93 {
2207     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2208     {
2209     active_count--; /* Remove non-match possibility */
2210     next_active_state--;
2211     }
2212     ADD_NEW(state_offset + dlen + 1, 0);
2213     }
2214 nigel 77 }
2215     break;
2216    
2217     /*-----------------------------------------------------------------*/
2218     case OP_STAR:
2219     case OP_MINSTAR:
2220 nigel 93 case OP_POSSTAR:
2221 nigel 77 case OP_NOTSTAR:
2222     case OP_NOTMINSTAR:
2223 nigel 93 case OP_NOTPOSSTAR:
2224 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2225     if (clen > 0)
2226     {
2227 nigel 93 unsigned int otherd = NOTACHAR;
2228 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2229 nigel 77 {
2230     #ifdef SUPPORT_UTF8
2231 nigel 87 if (utf8 && d >= 128)
2232 nigel 77 {
2233     #ifdef SUPPORT_UCP
2234 ph10 349 otherd = UCD_OTHERCASE(d);
2235 nigel 77 #endif /* SUPPORT_UCP */
2236     }
2237     else
2238     #endif /* SUPPORT_UTF8 */
2239     otherd = fcc[d];
2240     }
2241     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2242 nigel 93 {
2243     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2244     {
2245     active_count--; /* Remove non-match possibility */
2246     next_active_state--;
2247     }
2248     ADD_NEW(state_offset, 0);
2249     }
2250 nigel 77 }
2251     break;
2252    
2253     /*-----------------------------------------------------------------*/
2254     case OP_EXACT:
2255 nigel 93 case OP_NOTEXACT:
2256     count = current_state->count; /* Number already matched */
2257     if (clen > 0)
2258     {
2259     unsigned int otherd = NOTACHAR;
2260     if ((ims & PCRE_CASELESS) != 0)
2261     {
2262     #ifdef SUPPORT_UTF8
2263     if (utf8 && d >= 128)
2264     {
2265     #ifdef SUPPORT_UCP
2266 ph10 349 otherd = UCD_OTHERCASE(d);
2267 nigel 93 #endif /* SUPPORT_UCP */
2268     }
2269     else
2270     #endif /* SUPPORT_UTF8 */
2271     otherd = fcc[d];
2272     }
2273     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2274     {
2275     if (++count >= GET2(code, 1))
2276     { ADD_NEW(state_offset + dlen + 3, 0); }
2277     else
2278     { ADD_NEW(state_offset, count); }
2279     }
2280     }
2281     break;
2282    
2283     /*-----------------------------------------------------------------*/
2284 nigel 77 case OP_UPTO:
2285     case OP_MINUPTO:
2286 nigel 93 case OP_POSUPTO:
2287 nigel 77 case OP_NOTUPTO:
2288     case OP_NOTMINUPTO:
2289 nigel 93 case OP_NOTPOSUPTO:
2290     ADD_ACTIVE(state_offset + dlen + 3, 0);
2291 nigel 77 count = current_state->count; /* Number already matched */
2292     if (clen > 0)
2293     {
2294 nigel 93 unsigned int otherd = NOTACHAR;
2295 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2296     {
2297     #ifdef SUPPORT_UTF8
2298 nigel 87 if (utf8 && d >= 128)
2299 nigel 77 {
2300     #ifdef SUPPORT_UCP
2301 ph10 349 otherd = UCD_OTHERCASE(d);
2302 nigel 77 #endif /* SUPPORT_UCP */
2303     }
2304     else
2305     #endif /* SUPPORT_UTF8 */
2306     otherd = fcc[d];
2307     }
2308     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2309     {
2310 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2311     {
2312     active_count--; /* Remove non-match possibility */
2313     next_active_state--;
2314     }
2315 nigel 77 if (++count >= GET2(code, 1))
2316     { ADD_NEW(state_offset + dlen + 3, 0); }
2317     else
2318     { ADD_NEW(state_offset, count); }
2319     }
2320     }
2321     break;
2322    
2323    
2324     /* ========================================================================== */
2325     /* These are the class-handling opcodes */
2326    
2327     case OP_CLASS:
2328     case OP_NCLASS:
2329     case OP_XCLASS:
2330     {
2331     BOOL isinclass = FALSE;
2332     int next_state_offset;
2333     const uschar *ecode;
2334    
2335     /* For a simple class, there is always just a 32-byte table, and we
2336     can set isinclass from it. */
2337    
2338     if (codevalue != OP_XCLASS)
2339     {
2340     ecode = code + 33;
2341     if (clen > 0)
2342     {
2343     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2344     ((code[1 + c/8] & (1 << (c&7))) != 0);
2345     }
2346     }
2347    
2348     /* An extended class may have a table or a list of single characters,
2349     ranges, or both, and it may be positive or negative. There's a
2350     function that sorts all this out. */
2351    
2352     else
2353     {
2354     ecode = code + GET(code, 1);
2355     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2356     }
2357    
2358     /* At this point, isinclass is set for all kinds of class, and ecode
2359     points to the byte after the end of the class. If there is a
2360     quantifier, this is where it will be. */
2361    
2362 ph10 530 next_state_offset = (int)(ecode - start_code);
2363 nigel 77
2364     switch (*ecode)
2365     {
2366     case OP_CRSTAR:
2367     case OP_CRMINSTAR:
2368     ADD_ACTIVE(next_state_offset + 1, 0);
2369     if (isinclass) { ADD_NEW(state_offset, 0); }
2370     break;
2371    
2372     case OP_CRPLUS:
2373     case OP_CRMINPLUS:
2374     count = current_state->count; /* Already matched */
2375     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2376     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2377     break;
2378    
2379     case OP_CRQUERY:
2380     case OP_CRMINQUERY:
2381     ADD_ACTIVE(next_state_offset + 1, 0);
2382     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2383     break;
2384    
2385     case OP_CRRANGE:
2386     case OP_CRMINRANGE:
2387     count = current_state->count; /* Already matched */
2388     if (count >= GET2(ecode, 1))
2389     { ADD_ACTIVE(next_state_offset + 5, 0); }
2390     if (isinclass)
2391     {
2392 nigel 91 int max = GET2(ecode, 3);
2393     if (++count >= max && max != 0) /* Max 0 => no limit */
2394 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2395     else
2396     { ADD_NEW(state_offset, count); }
2397     }
2398     break;
2399    
2400     default:
2401     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2402     break;
2403     }
2404     }
2405     break;
2406    
2407     /* ========================================================================== */
2408     /* These are the opcodes for fancy brackets of various kinds. We have
2409 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2410     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2411 ph10 341 though the other "backtracking verbs" are not supported. */
2412 ph10 345
2413 ph10 341 case OP_FAIL:
2414 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2415 ph10 345 break;
2416 nigel 77
2417     case OP_ASSERT:
2418     case OP_ASSERT_NOT:
2419     case OP_ASSERTBACK:
2420     case OP_ASSERTBACK_NOT:
2421     {
2422     int rc;
2423     int local_offsets[2];
2424     int local_workspace[1000];
2425     const uschar *endasscode = code + GET(code, 1);
2426    
2427     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2428    
2429     rc = internal_dfa_exec(
2430     md, /* static match data */
2431     code, /* this subexpression's code */
2432     ptr, /* where we currently are */
2433 ph10 530 (int)(ptr - start_subject), /* start offset */
2434 nigel 77 local_offsets, /* offset vector */
2435     sizeof(local_offsets)/sizeof(int), /* size of same */
2436     local_workspace, /* workspace vector */
2437     sizeof(local_workspace)/sizeof(int), /* size of same */
2438     ims, /* the current ims flags */
2439     rlevel, /* function recursion level */
2440     recursing); /* pass on regex recursion */
2441 ph10 487
2442 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2443 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2444 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2445 nigel 77 }
2446     break;
2447    
2448     /*-----------------------------------------------------------------*/
2449     case OP_COND:
2450 nigel 93 case OP_SCOND:
2451 nigel 77 {
2452     int local_offsets[1000];
2453     int local_workspace[1000];
2454 ph10 406 int codelink = GET(code, 1);
2455 ph10 397 int condcode;
2456 ph10 406
2457 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2458 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2459 ph10 398 happen for the other conditions. */
2460 nigel 77
2461 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2462 ph10 406 {
2463     rrc = 0;
2464 ph10 397 if (pcre_callout != NULL)
2465     {
2466     pcre_callout_block cb;
2467     cb.version = 1; /* Version 1 of the callout block */
2468     cb.callout_number = code[LINK_SIZE+2];
2469     cb.offset_vector = offsets;
2470     cb.subject = (PCRE_SPTR)start_subject;
2471 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2472     cb.start_match = (int)(current_subject - start_subject);
2473     cb.current_position = (int)(ptr - start_subject);
2474 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2475     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2476     cb.capture_top = 1;
2477     cb.capture_last = -1;
2478     cb.callout_data = md->callout_data;
2479     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2480     }
2481 ph10 398 if (rrc > 0) break; /* Fail this thread */
2482     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2483 ph10 406 }
2484 ph10 398
2485 ph10 397 condcode = code[LINK_SIZE+1];
2486 ph10 406
2487 nigel 93 /* Back reference conditions are not supported */
2488 nigel 77
2489 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2490 ph10 459 return PCRE_ERROR_DFA_UCOND;
2491 nigel 93
2492     /* The DEFINE condition is always false */
2493    
2494     if (condcode == OP_DEF)
2495 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2496 nigel 93
2497     /* The only supported version of OP_RREF is for the value RREF_ANY,
2498     which means "test if in any recursion". We can't test for specifically
2499     recursed groups. */
2500    
2501 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2502 nigel 93 {
2503 nigel 77 int value = GET2(code, LINK_SIZE+2);
2504 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2505 ph10 406 if (recursing > 0)
2506 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2507     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2508 nigel 77 }
2509    
2510     /* Otherwise, the condition is an assertion */
2511    
2512     else
2513     {
2514     int rc;
2515     const uschar *asscode = code + LINK_SIZE + 1;
2516     const uschar *endasscode = asscode + GET(asscode, 1);
2517    
2518     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2519    
2520     rc = internal_dfa_exec(
2521     md, /* fixed match data */
2522     asscode, /* this subexpression's code */
2523     ptr, /* where we currently are */
2524 ph10 530 (int)(ptr - start_subject), /* start offset */
2525 nigel 77 local_offsets, /* offset vector */
2526     sizeof(local_offsets)/sizeof(int), /* size of same */
2527     local_workspace, /* workspace vector */
2528     sizeof(local_workspace)/sizeof(int), /* size of same */
2529     ims, /* the current ims flags */
2530     rlevel, /* function recursion level */
2531     recursing); /* pass on regex recursion */
2532    
2533 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2534 nigel 77 if ((rc >= 0) ==
2535     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2536 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2537 nigel 77 else
2538 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2539 nigel 77 }
2540     }
2541     break;
2542    
2543     /*-----------------------------------------------------------------*/
2544     case OP_RECURSE:
2545     {
2546     int local_offsets[1000];
2547     int local_workspace[1000];
2548     int rc;
2549    
2550     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2551     recursing + 1));
2552    
2553     rc = internal_dfa_exec(
2554     md, /* fixed match data */
2555     start_code + GET(code, 1), /* this subexpression's code */
2556     ptr, /* where we currently are */
2557 ph10 530 (int)(ptr - start_subject), /* start offset */
2558 nigel 77 local_offsets, /* offset vector */
2559     sizeof(local_offsets)/sizeof(int), /* size of same */
2560     local_workspace, /* workspace vector */
2561     sizeof(local_workspace)/sizeof(int), /* size of same */
2562     ims, /* the current ims flags */
2563     rlevel, /* function recursion level */
2564     recursing + 1); /* regex recurse level */
2565    
2566     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2567     recursing + 1, rc));
2568    
2569     /* Ran out of internal offsets */
2570    
2571     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2572    
2573     /* For each successful matched substring, set up the next state with a
2574     count of characters to skip before trying it. Note that the count is in
2575     characters, not bytes. */
2576    
2577     if (rc > 0)
2578     {
2579     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2580     {
2581     const uschar *p = start_subject + local_offsets[rc];
2582     const uschar *pp = start_subject + local_offsets[rc+1];
2583     int charcount = local_offsets[rc+1] - local_offsets[rc];
2584     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2585     if (charcount > 0)
2586     {
2587     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2588     }
2589     else
2590     {
2591     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2592     }
2593     }
2594     }
2595     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2596     }
2597     break;
2598    
2599     /*-----------------------------------------------------------------*/
2600     case OP_ONCE:
2601     {
2602     int local_offsets[2];
2603     int local_workspace[1000];
2604    
2605     int rc = internal_dfa_exec(
2606     md, /* fixed match data */
2607     code, /* this subexpression's code */
2608     ptr, /* where we currently are */
2609 ph10 530 (int)(ptr - start_subject), /* start offset */
2610 nigel 77 local_offsets, /* offset vector */
2611     sizeof(local_offsets)/sizeof(int), /* size of same */
2612     local_workspace, /* workspace vector */
2613     sizeof(local_workspace)/sizeof(int), /* size of same */
2614     ims, /* the current ims flags */
2615     rlevel, /* function recursion level */
2616     recursing); /* pass on regex recursion */
2617    
2618     if (rc >= 0)
2619     {
2620     const uschar *end_subpattern = code;
2621     int charcount = local_offsets[1] - local_offsets[0];
2622     int next_state_offset, repeat_state_offset;
2623    
2624     do { end_subpattern += GET(end_subpattern, 1); }
2625     while (*end_subpattern == OP_ALT);
2626 ph10 530 next_state_offset =
2627     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2628 nigel 77
2629     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2630     arrange for the repeat state also to be added to the relevant list.
2631     Calculate the offset, or set -1 for no repeat. */
2632    
2633     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2634     *end_subpattern == OP_KETRMIN)?
2635 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2636 nigel 77
2637     /* If we have matched an empty string, add the next state at the
2638     current character pointer. This is important so that the duplicate
2639     checking kicks in, which is what breaks infinite loops that match an
2640     empty string. */
2641    
2642     if (charcount == 0)
2643     {
2644     ADD_ACTIVE(next_state_offset, 0);
2645     }
2646    
2647     /* Optimization: if there are no more active states, and there
2648     are no new states yet set up, then skip over the subject string
2649     right here, to save looping. Otherwise, set up the new state to swing
2650     into action when the end of the substring is reached. */
2651    
2652     else if (i + 1 >= active_count && new_count == 0)
2653     {
2654     ptr += charcount;
2655     clen = 0;
2656     ADD_NEW(next_state_offset, 0);
2657    
2658     /* If we are adding a repeat state at the new character position,
2659     we must fudge things so that it is the only current state.
2660     Otherwise, it might be a duplicate of one we processed before, and
2661     that would cause it to be skipped. */
2662    
2663     if (repeat_state_offset >= 0)
2664     {
2665     next_active_state = active_states;
2666     active_count = 0;
2667     i = -1;
2668     ADD_ACTIVE(repeat_state_offset, 0);
2669     }
2670     }
2671     else
2672     {
2673     const uschar *p = start_subject + local_offsets[0];
2674     const uschar *pp = start_subject + local_offsets[1];
2675     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2676     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2677     if (repeat_state_offset >= 0)
2678     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2679     }
2680    
2681     }
2682     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2683     }
2684     break;
2685    
2686    
2687     /* ========================================================================== */
2688     /* Handle callouts */
2689    
2690     case OP_CALLOUT:
2691 ph10 406 rrc = 0;
2692 nigel 77 if (pcre_callout != NULL)
2693     {
2694     pcre_callout_block cb;
2695     cb.version = 1; /* Version 1 of the callout block */
2696     cb.callout_number = code[1];
2697     cb.offset_vector = offsets;
2698 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2699 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2700     cb.start_match = (int)(current_subject - start_subject);
2701     cb.current_position = (int)(ptr - start_subject);
2702 nigel 77 cb.pattern_position = GET(code, 2);
2703     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2704     cb.capture_top = 1;
2705     cb.capture_last = -1;
2706     cb.callout_data = md->callout_data;
2707     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2708 ph10 406 }
2709     if (rrc == 0)
2710     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2711 nigel 77 break;
2712    
2713    
2714     /* ========================================================================== */
2715     default: /* Unsupported opcode */
2716     return PCRE_ERROR_DFA_UITEM;
2717     }
2718    
2719     NEXT_ACTIVE_STATE: continue;
2720    
2721     } /* End of loop scanning active states */
2722    
2723     /* We have finished the processing at the current subject character. If no
2724     new states have been set for the next character, we have found all the
2725     matches that we are going to find. If we are at the top level and partial
2726 ph10 463 matching has been requested, check for appropriate conditions.
2727    
2728 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2729     character. If it is equal to the original active_count (saved in
2730     workspace[1]) it means that (*F) was found on every active state. In this
2731 ph10 463 case we don't want to give a partial match.
2732 nigel 77
2733 ph10 463 The "could_continue" variable is true if a state could have continued but
2734     for the fact that the end of the subject was reached. */
2735    
2736 nigel 77 if (new_count <= 0)
2737     {
2738 ph10 427 if (rlevel == 1 && /* Top level, and */
2739 ph10 463 could_continue && /* Some could go on */
2740 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2741 ph10 427 ( /* either... */
2742     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2743     || /* or... */
2744     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2745     match_count < 0) /* no matches */
2746     ) && /* And... */
2747     ptr >= end_subject && /* Reached end of subject */
2748     ptr > current_subject) /* Matched non-empty string */
2749 nigel 77 {
2750     if (offsetcount >= 2)
2751     {
2752 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
2753     offsets[1] = (int)(end_subject - start_subject);
2754 nigel 77 }
2755     match_count = PCRE_ERROR_PARTIAL;
2756     }
2757    
2758     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2759     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2760     rlevel*2-2, SP));
2761 nigel 91 break; /* In effect, "return", but see the comment below */
2762 nigel 77 }
2763    
2764     /* One or more states are active for the next character. */
2765    
2766     ptr += clen; /* Advance to next subject character */
2767     } /* Loop to move along the subject string */
2768    
2769 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2770     if we use "return" above, we have compiler trouble. Some compilers warn if
2771     there's nothing here because they think the function doesn't return a value. On
2772     the other hand, if we put a dummy statement here, some more clever compilers
2773     complain that it can't be reached. Sigh. */
2774 nigel 77
2775 nigel 91 return match_count;
2776 nigel 77 }
2777    
2778    
2779    
2780    
2781     /*************************************************
2782     * Execute a Regular Expression - DFA engine *
2783     *************************************************/
2784    
2785     /* This external function applies a compiled re to a subject string using a DFA
2786     engine. This function calls the internal function multiple times if the pattern
2787     is not anchored: each time the internal function reports PCRE_ERROR_NOMATCH the
         start position is moved along the subject and the match is tried again.
2788    
2789     Arguments:
2790     argument_re points to the compiled expression
2791 ph10 97 extra_data points to extra data or is NULL
2792 nigel 77 subject points to the subject string
2793     length length of subject string (may contain binary zeros)
2794     start_offset where to start in the subject string
2795     options option bits
2796     offsets vector of match offsets
2797     offsetcount size of same
2798     workspace workspace vector
2799     wscount size of same (fewer than 20 gives PCRE_ERROR_DFA_WSSIZE)
2800    
2801     Returns: > 0 => number of match offset pairs placed in offsets
2802     = 0 => offsets overflowed; longest matches are present
2803     -1 => failed to match
2804     < -1 => some kind of unexpected problem

         The result is whatever internal_dfa_exec() returns for the first start
         position that gives anything other than PCRE_ERROR_NOMATCH (for an anchored
         match, whatever it returns for the single attempt). That includes
         PCRE_ERROR_PARTIAL when the internal function reports a partial match.

         Errors returned directly by this function before any matching is attempted:
         PCRE_ERROR_BADOPTION, PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT,
         PCRE_ERROR_DFA_WSSIZE, PCRE_ERROR_DFA_UMLIMIT, PCRE_ERROR_BADMAGIC,
         PCRE_ERROR_BADNEWLINE, PCRE_ERROR_BADUTF8 and PCRE_ERROR_BADUTF8_OFFSET.
2805     */
2806    
2807 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2808 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2809     const char *subject, int length, int start_offset, int options, int *offsets,
2810     int offsetcount, int *workspace, int wscount)
2811     {
2812     real_pcre *re = (real_pcre *)argument_re;
2813     dfa_match_data match_block;
2814 nigel 91 dfa_match_data *md = &match_block;
2815 nigel 77 BOOL utf8, anchored, startline, firstline;
2816     const uschar *current_subject, *end_subject, *lcc;
2817    
2818     pcre_study_data internal_study;
2819     const pcre_study_data *study = NULL;
2820     real_pcre internal_re;
2821    
2822     const uschar *req_byte_ptr;
2823     const uschar *start_bits = NULL;
2824     BOOL first_byte_caseless = FALSE;
2825     BOOL req_byte_caseless = FALSE;
2826     int first_byte = -1;
2827     int req_byte = -1;
2828     int req_byte2 = -1;
2829 nigel 91 int newline;
2830 nigel 77
2831     /* Plausibility checks */
2832    
2833     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2834     if (re == NULL || subject == NULL || workspace == NULL ||
2835     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2836     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2837     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
/* NOTE(review): neither start_offset nor length is range-checked in this
function; the pointer arithmetic further down presumably relies on the caller
passing 0 <= start_offset <= length - confirm. */
2838    
2839     /* We need to find the pointer to any study data before we test for byte
2840     flipping, so we scan the extra_data block first. This may set two fields in the
2841     match block, so we must initialize them beforehand. However, the other fields
2842     in the match block must not be set until after the byte flipping. */
2843    
2844 nigel 91 md->tables = re->tables;
2845     md->callout_data = NULL;
2846 nigel 77
2847     if (extra_data != NULL)
2848     {
2849     unsigned int flags = extra_data->flags;
2850     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2851     study = (const pcre_study_data *)extra_data->study_data;
/* Either match-limit flag in the extra block is rejected outright with
PCRE_ERROR_DFA_UMLIMIT. */
2852     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2853 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2854     return PCRE_ERROR_DFA_UMLIMIT;
2855 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2856 nigel 91 md->callout_data = extra_data->callout_data;
2857 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2858 nigel 91 md->tables = extra_data->tables;
2859 nigel 77 }
2860 ph10 461
2861 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
2862     test for a regex that was compiled on a host of opposite endianness. If this is
2863     the case, flipped values are put in internal_re and internal_study if there was
2864     study data too. */
2865    
2866     if (re->magic_number != MAGIC_NUMBER)
2867     {
2868     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2869     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2870     if (study != NULL) study = &internal_study;
2871     }
2872    
2873     /* Set some local values */
2874    
2875     current_subject = (const unsigned char *)subject + start_offset;
2876     end_subject = (const unsigned char *)subject + length;
/* Start one byte before the first start position, so that the first
"p > req_byte_ptr" test in the required-byte optimization below is always
true and a search is actually done. */
2877     req_byte_ptr = current_subject - 1;
2878    
2879 nigel 91 #ifdef SUPPORT_UTF8
2880 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2881 nigel 91 #else
2882     utf8 = FALSE;
2883     #endif
2884 nigel 77
/* PCRE_DFA_RESTART counts as anchoring here: with it set, the matching loop
below returns after its first attempt instead of bumping along. */
2885 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2886     (re->options & PCRE_ANCHORED) != 0;
2887    
2888 nigel 77 /* The remaining fixed data for passing around. */
2889    
/* The start of the code is computed from the caller's block (argument_re), not
from re, which may by now point at the byte-flipped copy in internal_re; the
offset and sizes are read from re. */
2890 nigel 91 md->start_code = (const uschar *)argument_re +
2891 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2892 nigel 91 md->start_subject = (const unsigned char *)subject;
2893     md->end_subject = end_subject;
2894 ph10 442 md->start_offset = start_offset;
2895 nigel 91 md->moptions = options;
2896     md->poptions = re->options;
2897 nigel 77
2898 ph10 231 /* If the BSR option is not set at match time, copy what was set
2899     at compile time. */
2900    
2901     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2902     {
2903     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2904     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2905     #ifdef BSR_ANYCRLF
2906     else md->moptions |= PCRE_BSR_ANYCRLF;
2907 ph10 243 #endif
2908     }
2909 ph10 231
2910 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2911     nothing is set at run time, whatever was used at compile time applies. */
2912 nigel 91
2913 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2914 nigel 93 PCRE_NEWLINE_BITS)
2915 nigel 91 {
2916 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2917 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2918     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2919 nigel 91 case PCRE_NEWLINE_CR+
2920 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2921 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2922 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2923 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2924 nigel 91 }
2925    
/* Decode the value chosen above: -2 means ANYCRLF, any other negative value
means ANY, a value above 255 is a two-byte sequence packed as
(first << 8) | second, and anything else is a single newline byte. */
2926 ph10 149 if (newline == -2)
2927 nigel 91 {
2928 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2929     }
2930     else if (newline < 0)
2931     {
2932 nigel 93 md->nltype = NLTYPE_ANY;
2933 nigel 91 }
2934     else
2935     {
2936 nigel 93 md->nltype = NLTYPE_FIXED;
2937     if (newline > 255)
2938     {
2939     md->nllen = 2;
2940     md->nl[0] = (newline >> 8) & 255;
2941     md->nl[1] = newline & 255;
2942     }
2943     else
2944     {
2945     md->nllen = 1;
2946     md->nl[0] = newline;
2947     }
2948 nigel 91 }
2949    
2950 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2951     back the character offset. */
2952    
2953     #ifdef SUPPORT_UTF8
2954     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2955     {
/* A non-negative result from _pcre_valid_utf8() is treated as a failure. For a
start_offset strictly inside the subject, the byte at that offset must not have
the bit pattern 10xxxxxx (a UTF-8 continuation byte). */
2956     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2957     return PCRE_ERROR_BADUTF8;
2958     if (start_offset > 0 && start_offset < length)
2959     {
2960     int tb = ((uschar *)subject)[start_offset];
2961     if (tb > 127)
2962     {
2963     tb &= 0xc0;
2964     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2965     }
2966     }
2967     }
2968     #endif
2969    
2970     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2971     is a feature that makes it possible to save compiled regex and re-use them
2972     in other programs later. */
2973    
2974 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2975 nigel 77
2976     /* The lower casing table and the "must be at the start of a line" flag are
2977     used in a loop when finding where to start. */
2978    
2979 nigel 91 lcc = md->tables + lcc_offset;
2980 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2981 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2982    
2983     /* Set up the first character to match, if available. The first_byte value is
2984     never set for an anchored regular expression, but the anchoring may be forced
2985     at run time, so we have to test for anchoring. The first char may be unset for
2986     an unanchored pattern, of course. If there's no first char and the pattern was
2987     studied, there may be a bitmap of possible first characters. */
2988    
2989     if (!anchored)
2990     {
2991 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2992 nigel 77 {
/* The low 8 bits of re->first_byte hold the byte itself. When the
REQ_CASELESS bit is set, first_byte is replaced by its lcc[] mapping so that
the scan below can compare it with lcc[*current_subject]. */
2993     first_byte = re->first_byte & 255;
2994     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2995     first_byte = lcc[first_byte];
2996     }
2997     else
2998     {
2999 ph10 455 if (!startline && study != NULL &&
3000     (study->flags & PCRE_STUDY_MAPPED) != 0)
3001 nigel 77 start_bits = study->start_bits;
3002     }
3003     }
3004    
3005     /* For anchored or unanchored matches, there may be a "last known required
3006     character" set. */
3007    
3008 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3009 nigel 77 {
3010     req_byte = re->req_byte & 255;
3011     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3012 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
3013 nigel 77 }
3014    
3015     /* Call the main matching function, looping for a non-anchored regex after a
3016 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3017     a match. */
3018 nigel 77
3019     for (;;)
3020     {
3021     int rc;
3022    
3023     if ((options & PCRE_DFA_RESTART) == 0)
3024     {
3025     const uschar *save_end_subject = end_subject;
3026    
3027 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3028     line of a multiline string. Implement this by temporarily adjusting
3029     end_subject so that we stop scanning at a newline. If the match fails at
3030     the newline, later code breaks this loop. */
3031 nigel 77
3032     if (firstline)
3033     {
3034 ph10 365 USPTR t = current_subject;
3035     #ifdef SUPPORT_UTF8
3036     if (utf8)
3037 ph10 371 {
3038     while (t < md->end_subject && !IS_NEWLINE(t))
3039 ph10 365 {
3040     t++;
3041     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3042 ph10 371 }
3043 ph10 365 }
3044     else
3045 ph10 371 #endif
3046 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3047 nigel 77 end_subject = t;
3048     }
3049 ph10 392
3050 ph10 389 /* There are some optimizations that avoid running the match if a known
3051 ph10 455 starting point is not found. However, there is an option that disables
3052     these, for testing and for ensuring that all callouts do actually occur. */
3053 nigel 77
3054 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
3055 ph10 392 {
3056 ph10 389 /* Advance to a known first byte. */
3057 ph10 392
3058 ph10 389 if (first_byte >= 0)
3059 nigel 77 {
3060 ph10 389 if (first_byte_caseless)
3061     while (current_subject < end_subject &&
3062     lcc[*current_subject] != first_byte)
3063     current_subject++;
3064     else
3065 ph10 392 while (current_subject < end_subject &&
3066 ph10 389 *current_subject != first_byte)
3067     current_subject++;
3068     }
3069 ph10 392
3070 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3071 ph10 392
3072 ph10 389 else if (startline)
3073     {
3074     if (current_subject > md->start_subject + start_offset)
3075     {
3076 ph10 365 #ifdef SUPPORT_UTF8
3077 ph10 389 if (utf8)
3078 ph10 365 {
3079 ph10 392 while (current_subject < end_subject &&
3080 ph10 389 !WAS_NEWLINE(current_subject))
3081     {
3082 ph10 365 current_subject++;
3083 ph10 389 while(current_subject < end_subject &&
3084     (*current_subject & 0xc0) == 0x80)
3085     current_subject++;
3086     }
3087 ph10 371 }
3088 ph10 389 else
3089     #endif
3090     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3091     current_subject++;
3092 ph10 392
3093 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3094     ANYCRLF, and we are now at a LF, advance the match position by one
3095     more character. */
3096 ph10 392
3097 ph10 391 if (current_subject[-1] == CHAR_CR &&
3098 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3099     current_subject < end_subject &&
3100 ph10 391 *current_subject == CHAR_NL)
3101 ph10 389 current_subject++;
3102 ph10 365 }
3103 nigel 77 }
3104 ph10 392
3105 ph10 389 /* Or to a non-unique first char after study */
3106 ph10 392
3107 ph10 389 else if (start_bits != NULL)
3108 nigel 77 {
3109 ph10 389 while (current_subject < end_subject)
3110     {
3111     register unsigned int c = *current_subject;
3112     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
3113     else break;
3114     }
3115 nigel 77 }
3116 ph10 392 }
3117 nigel 77
3118     /* Restore fudged end_subject */
3119    
3120     end_subject = save_end_subject;
3121    
3122 ph10 461 /* The following two optimizations are disabled for partial matching or if
3123     disabling is explicitly requested (and of course, by the test above, this
3124 ph10 455 code is not obeyed when restarting after a partial match). */
3125 ph10 461
3126 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3127     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3128 ph10 461 {
3129 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3130     is a lower bound; no actual string of that length may actually match the
3131     pattern. Although the value is, strictly, in characters, we treat it as
3132     bytes to avoid spending too much time in this optimization. */
3133 nigel 77
3134 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3135 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3136 ph10 455 return PCRE_ERROR_NOMATCH;
3137 ph10 461
3138 ph10 455 /* If req_byte is set, we know that that character must appear in the
3139     subject for the match to succeed. If the first character is set, req_byte
3140     must be later in the subject; otherwise the test starts at the match
3141     point. This optimization can save a huge amount of work in patterns with
3142     nested unlimited repeats that aren't going to match. Writing separate
3143     code for cased/caseless versions makes it go faster, as does using an
3144     autoincrement and backing off on a match.
3145 ph10 461
3146 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3147     can take a long time, and give bad performance on quite ordinary
3148     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3149     string... so we don't do this when the string is sufficiently long. */
3150 ph10 461
3151 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3152 nigel 77 {
3153 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3154 ph10 461
3155 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3156     place we found it at last time. */
3157 ph10 461
3158 ph10 455 if (p > req_byte_ptr)
3159 nigel 77 {
3160 ph10 455 if (req_byte_caseless)
3161     {
3162     while (p < end_subject)
3163     {
3164     register int pp = *p++;
3165     if (pp == req_byte || pp == req_byte2) { p--; break; }
3166     }
3167     }
3168     else
3169     {
3170     while (p < end_subject)
3171     {
3172     if (*p++ == req_byte) { p--; break; }
3173     }
3174     }
3175 ph10 461
3176 ph10 455 /* If we can't find the required character, break the matching loop,
3177     which will cause a return of PCRE_ERROR_NOMATCH. */
3178 ph10 461
3179 ph10 455 if (p >= end_subject) break;
3180 ph10 461
3181 ph10 455 /* If we have found the required character, save the point where we
3182     found it, so that we don't search again next time round the loop if
3183     the start hasn't passed this character yet. */
3184 ph10 461
3185 ph10 455 req_byte_ptr = p;
3186 nigel 77 }
3187 ph10 461 }
3188 nigel 77 }
3189 ph10 455 } /* End of optimizations that are done when not restarting */
3190 nigel 77
3191     /* OK, now we can do the business */
3192    
3193 ph10 435 md->start_used_ptr = current_subject;
3194 ph10 461
3195 nigel 77 rc = internal_dfa_exec(
3196 nigel 91 md, /* fixed match data */
3197     md->start_code, /* this subexpression's code */
3198     current_subject, /* where we currently are */
3199     start_offset, /* start offset in subject */
3200     offsets, /* offset vector */
3201     offsetcount, /* size of same */
3202     workspace, /* workspace vector */
3203     wscount, /* size of same */
3204 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3205 nigel 91 0, /* function recurse level */
3206     0); /* regex recurse level */
3207 nigel 77
3208     /* Anything other than "no match" means we are done, always; otherwise, carry
3209     on only if not anchored. */
3210    
3211     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3212    
3213     /* Advance to the next subject character unless we are at the end of a line
3214     and firstline is set. */
3215    
3216 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3217 nigel 77 current_subject++;
3218     if (utf8)
3219     {
3220     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3221     current_subject++;
3222     }
/* The test is strictly "greater than", so a start position equal to
end_subject still gets one more attempt before the loop gives up. */
3223     if (current_subject > end_subject) break;
3224    
3225 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3226 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3227     or ANY or ANYCRLF, advance the match position by one more character. */
3228 nigel 93
3229 ph10 391 if (current_subject[-1] == CHAR_CR &&
3230 ph10 226 current_subject < end_subject &&
3231 ph10 391 *current_subject == CHAR_NL &&
3232 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3233 ph10 226 (md->nltype == NLTYPE_ANY ||
3234     md->nltype == NLTYPE_ANYCRLF ||
3235     md->nllen == 2))
3236 nigel 93 current_subject++;
3237    
3238     } /* "Bumpalong" loop */
3239    
3240 nigel 77 return PCRE_ERROR_NOMATCH;
3241     }
3242    
3243     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12