/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 567 - (hide annotations) (download)
Sat Nov 6 17:10:00 2010 UTC (3 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 112266 byte(s)
Test for ridiculous values of starting offsets; tidy UTF-8 code.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 473 Copyright (c) 1997-2010 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
/* Padding text for debug output: the DPRINTF calls in this file print it with
"%.*s" and a precision of rlevel*2-2, so deeper recursion levels are indented
further. */
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
/* Each value below is added to codevalue by the "switch(d)" in
internal_dfa_exec() when an opcode >= OP_TYPESTAR carries the named item as its
inline argument. */
101 ph10 178 #define OP_PROP_EXTRA 300 /* argument is OP_PROP or OP_NOTPROP */
102     #define OP_EXTUNI_EXTRA 320 /* argument is OP_EXTUNI */
103     #define OP_ANYNL_EXTRA 340 /* argument is OP_ANYNL */
104     #define OP_HSPACE_EXTRA 360 /* argument is OP_HSPACE or OP_NOT_HSPACE */
105     #define OP_VSPACE_EXTRA 380 /* argument is OP_VSPACE or OP_NOT_VSPACE */
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
/* coptable[op] is the offset from opcode "op" to the item that follows it and
has to be loaded (a data character, or a \d-style type opcode for the Type
repeats); zero means the opcode has no such item. The vector must contain
exactly OP_TABLE_LENGTH entries: a pair of case labels in internal_dfa_exec()
turns a wrong length into a compile-time error. */
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
125     1, /* Char */
126     1, /* Charnc */
127     1, /* not */
128     /* Positive single-char repeats */
129     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
/* NOTE(review): the offset-3 entries presumably step over a 2-byte repeat
count that precedes the character - confirm against pcre_internal.h. */
130     3, 3, 3, /* upto, minupto, exact */
131 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
132 nigel 77 /* Negative single-char repeats - only for chars < 256 */
133     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
134     3, 3, 3, /* NOT upto, minupto, exact */
135 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
136 nigel 77 /* Positive type repeats */
137     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* Type upto, minupto, exact */
139 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
140 nigel 77 /* Character class & ref repeats */
141     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
142     0, 0, /* CRRANGE, CRMINRANGE */
143     0, /* CLASS */
144     0, /* NCLASS */
145     0, /* XCLASS - variable length */
146     0, /* REF */
147     0, /* RECURSE */
148     0, /* CALLOUT */
149     0, /* Alt */
150     0, /* Ket */
151     0, /* KetRmax */
152     0, /* KetRmin */
153     0, /* Assert */
154     0, /* Assert not */
155     0, /* Assert behind */
156     0, /* Assert behind not */
157     0, /* Reverse */
158 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
159     0, 0, 0, /* SBRA, SCBRA, SCOND */
160 ph10 498 0, 0, /* CREF, NCREF */
161     0, 0, /* RREF, NRREF */
162 nigel 93 0, /* DEF */
163 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
164 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
165     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
166     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
167 nigel 77 };
168    
169 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
170 ph10 462 remember the fact that a character could have been inspected when the end of
171 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
172     two tables that follow must also be modified. */
173 ph10 462
/* poptable[op] is non-zero when opcode "op" inspects a subject character.
internal_dfa_exec() consults it when clen is zero (end of subject) and, if the
entry is non-zero, sets could_continue for the later partial-match test. The
vector must contain exactly OP_TABLE_LENGTH entries: a pair of case labels in
internal_dfa_exec() turns a wrong length into a compile-time error. */
174     static const uschar poptable[] = {
175     0, /* End */
176 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
177 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
178     1, 1, 1, /* Any, AllAny, Anybyte */
179 ph10 498 1, 1, /* \P, \p */
180 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
181 ph10 498 1, /* \X */
182 ph10 462 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
183     1, /* Char */
184     1, /* Charnc */
185     1, /* not */
186     /* Positive single-char repeats */
187     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
188     1, 1, 1, /* upto, minupto, exact */
189     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
190     /* Negative single-char repeats - only for chars < 256 */
191     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
192     1, 1, 1, /* NOT upto, minupto, exact */
193     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
194     /* Positive type repeats */
195     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
196     1, 1, 1, /* Type upto, minupto, exact */
197     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
198     /* Character class & ref repeats */
199     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200     1, 1, /* CRRANGE, CRMINRANGE */
201     1, /* CLASS */
202     1, /* NCLASS */
203     1, /* XCLASS - variable length */
204     0, /* REF */
205     0, /* RECURSE */
206     0, /* CALLOUT */
207     0, /* Alt */
208     0, /* Ket */
209     0, /* KetRmax */
210     0, /* KetRmin */
211     0, /* Assert */
212     0, /* Assert not */
213     0, /* Assert behind */
214     0, /* Assert behind not */
215     0, /* Reverse */
216     0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
217     0, 0, 0, /* SBRA, SCBRA, SCOND */
218 ph10 498 0, 0, /* CREF, NCREF */
219     0, 0, /* RREF, NRREF */
220 ph10 462 0, /* DEF */
221     0, 0, /* BRAZERO, BRAMINZERO */
222 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
223     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
224     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
225 ph10 462 };
226    
227 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
228     and \w */
229    
/* NOTE(review): judging by the layout (six zeros, then one entry each for \D,
\d, \S, \s, \W, \w in coptable order, then OP_ANY and OP_ALLANY) this vector is
indexed by opcode and gives the ctype bit that the opcode examines - confirm at
the point of use, which is outside this part of the file. */
230 ph10 327 static const uschar toptable1[] = {
231 ph10 168 0, 0, 0, 0, 0, 0,
232 nigel 77 ctype_digit, ctype_digit,
233     ctype_space, ctype_space,
234     ctype_word, ctype_word,
235 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
236 nigel 77 };
237    
/* NOTE(review): same presumed opcode indexing as toptable1. Only the first
entry of each \D/\d, \S/\s, \W/\w pair is non-zero, and OP_ANY and OP_ALLANY
hold 1; presumably the value is combined with the masked ctype bit so that the
negated classes invert the result - confirm at the point of use, which is
outside this part of the file. */
238 ph10 327 static const uschar toptable2[] = {
239 ph10 168 0, 0, 0, 0, 0, 0,
240 nigel 77 ctype_digit, 0,
241     ctype_space, 0,
242     ctype_word, 0,
243 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
244 nigel 77 };
245    
246    
247     /* Structure for holding data about a particular state, which is in effect the
248     current data for an active path through the match tree. It must consist
249     entirely of ints because the working vector we are passed, and which we put
250     these structures in, is a vector of ints. */
251    
/* One entry in a state list. Every member has to be an int because the lists
live inside the int workspace vector supplied by the caller. A negative offset
marks a state that is being held back until "data" further characters have been
skipped. ADD_ACTIVE and ADD_NEW leave "data" unset; only the _DATA variants
store it. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. sizeof yields a size_t, so the
value is cast to int: internal_dfa_exec() uses it in arithmetic with the int
wscount (after subtracting 2 from it), and an unsigned operand would silently
convert a negative wscount to a huge positive number. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
260    
261    
262 ph10 475 #ifdef PCRE_DEBUG
263 nigel 77 /*************************************************
264     * Print character string *
265     *************************************************/
266    
267     /* Character string printing function for debugging.
268    
269     Arguments:
270     p points to string
271     length number of bytes
272     f where to print
273    
274     Returns: nothing
275     */
276    
/* Debug helper: writes "length" bytes starting at p to f. Bytes for which
isprint() is true are written as they are; every other byte is written as a
\x escape with two hex digits. A length of zero or less writes nothing. */

static void
pchars(unsigned char *p, int length, FILE *f)
{
int remaining;
for (remaining = length; remaining > 0; remaining--, p++)
  {
  int ch = *p;
  if (isprint(ch))
    fprintf(f, "%c", ch);
  else
    fprintf(f, "\\x%02x", ch);
  }
}
289     #endif
290    
291    
292    
293     /*************************************************
294     * Execute a Regular Expression - DFA engine *
295     *************************************************/
296    
297     /* This internal function applies a compiled pattern to a subject string,
298     starting at a given point, using a DFA engine. This function is called from the
299     external one, possibly multiple times if the pattern is not anchored. The
300     function calls itself recursively for some kinds of subpattern.
301    
302     Arguments:
303     md the match_data block with fixed information
304     this_start_code the opening bracket of this subexpression's code
305     current_subject where we currently are in the subject string
306     start_offset start offset in the subject string
307     offsets vector to contain the matching string offsets
308     offsetcount size of same
309     workspace vector of workspace
310     wscount size of same
311     ims the current ims flags
312     rlevel function call recursion level
313     recursing regex recursive call level
314    
315 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
316 ph10 341 = 0 => offsets overflowed; longest matches are present
317 nigel 77 -1 => failed to match
318     < -1 => some kind of unexpected problem
319    
320     The following macros are used for adding states to the two state vectors (one
321     for the current character, one for the following character). */
322    
/* Append a state (opcode offset x, repeat count y, current ims) to the active
list. The enclosing function returns PCRE_ERROR_DFA_WSSIZE if the list is
already full. The "data" member is not written. Relies on active_count,
wscount, next_active_state and ims (plus rlevel for the debug trace) being in
scope where the macro is used. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
333    
/* As ADD_ACTIVE, but also stores z in the "data" member of the new state. The
enclosing function returns PCRE_ERROR_DFA_WSSIZE if the active list is already
full. Relies on active_count, wscount, next_active_state and ims (plus rlevel
for the debug trace) being in scope where the macro is used. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
345    
/* Append a state (opcode offset x, repeat count y, current ims) to the list
for the following character. The enclosing function returns
PCRE_ERROR_DFA_WSSIZE if that list is already full. The "data" member is not
written. Relies on new_count, wscount, next_new_state and ims (plus rlevel for
the debug trace) being in scope where the macro is used. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
356    
/* As ADD_NEW, but also stores z in the "data" member of the new state. The
enclosing function returns PCRE_ERROR_DFA_WSSIZE if the list for the following
character is already full. Relies on new_count, wscount, next_new_state and ims
(plus rlevel for the debug trace) being in scope where the macro is used. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
368    
369     /* And now, here is the code */
370    
371     static int
372     internal_dfa_exec(
373     dfa_match_data *md,
374     const uschar *this_start_code,
375     const uschar *current_subject,
376     int start_offset,
377     int *offsets,
378     int offsetcount,
379     int *workspace,
380     int wscount,
381     int ims,
382     int rlevel,
383     int recursing)
384     {
385     stateblock *active_states, *new_states, *temp_states;
386     stateblock *next_active_state, *next_new_state;
387    
388     const uschar *ctypes, *lcc, *fcc;
389     const uschar *ptr;
390 nigel 93 const uschar *end_code, *first_op;
391 nigel 77
392     int active_count, new_count, match_count;
393    
394     /* Some fields in the md block are frequently referenced, so we load them into
395     independent variables in the hope that this will perform better. */
396    
397     const uschar *start_subject = md->start_subject;
398     const uschar *end_subject = md->end_subject;
399     const uschar *start_code = md->start_code;
400    
401 nigel 87 #ifdef SUPPORT_UTF8
402 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
403 nigel 93 #else
404     BOOL utf8 = FALSE;
405 nigel 87 #endif
406 nigel 77
407     rlevel++;
408     offsetcount &= (-2);
409    
410     wscount -= 2;
411     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
412     (2 * INTS_PER_STATEBLOCK);
413    
414     DPRINTF(("\n%.*s---------------------\n"
415     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
416     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
417    
418     ctypes = md->tables + ctypes_offset;
419     lcc = md->tables + lcc_offset;
420     fcc = md->tables + fcc_offset;
421    
422     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
423    
424     active_states = (stateblock *)(workspace + 2);
425     next_new_state = new_states = active_states + wscount;
426     new_count = 0;
427    
428 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
429     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
430    
431 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
432     the alternative states onto the list, and find out where the end is. This
433     makes it possible to use this function recursively, when we want to stop at a
434     matching internal ket rather than at the end.
435    
436     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
437     a backward assertion. In that case, we have to find out the maximum amount to
438     move back, and set up each alternative appropriately. */
439    
440 nigel 93 if (*first_op == OP_REVERSE)
441 nigel 77 {
442     int max_back = 0;
443     int gone_back;
444    
445     end_code = this_start_code;
446     do
447     {
448     int back = GET(end_code, 2+LINK_SIZE);
449     if (back > max_back) max_back = back;
450     end_code += GET(end_code, 1);
451     }
452     while (*end_code == OP_ALT);
453    
454     /* If we can't go back the amount required for the longest lookbehind
455     pattern, go back as far as we can; some alternatives may still be viable. */
456    
457     #ifdef SUPPORT_UTF8
458     /* In character mode we have to step back character by character */
459    
460     if (utf8)
461     {
462     for (gone_back = 0; gone_back < max_back; gone_back++)
463     {
464     if (current_subject <= start_subject) break;
465     current_subject--;
466     while (current_subject > start_subject &&
467     (*current_subject & 0xc0) == 0x80)
468     current_subject--;
469     }
470     }
471     else
472     #endif
473    
474     /* In byte-mode we can do this quickly. */
475    
476     {
477     gone_back = (current_subject - max_back < start_subject)?
478 ph10 530 (int)(current_subject - start_subject) : max_back;
479 nigel 77 current_subject -= gone_back;
480     }
481 ph10 461
482 ph10 435 /* Save the earliest consulted character */
483 nigel 77
484 ph10 461 if (current_subject < md->start_used_ptr)
485     md->start_used_ptr = current_subject;
486    
487 nigel 77 /* Now we can process the individual branches. */
488    
489     end_code = this_start_code;
490     do
491     {
492     int back = GET(end_code, 2+LINK_SIZE);
493     if (back <= gone_back)
494     {
495 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
496 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
497     }
498     end_code += GET(end_code, 1);
499     }
500     while (*end_code == OP_ALT);
501     }
502    
503     /* This is the code for a "normal" subpattern (not a backward assertion). The
504     start of a whole pattern is always one of these. If we are at the top level,
505     we may be asked to restart matching from the same point that we reached for a
506     previous partial match. We still have to scan through the top-level branches to
507     find the end state. */
508    
509     else
510     {
511     end_code = this_start_code;
512    
513     /* Restarting */
514    
515     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
516     {
517     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
518     new_count = workspace[1];
519     if (!workspace[0])
520     memcpy(new_states, active_states, new_count * sizeof(stateblock));
521     }
522    
523     /* Not restarting */
524    
525     else
526     {
527 nigel 93 int length = 1 + LINK_SIZE +
528     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
529 nigel 77 do
530     {
531 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
532 nigel 77 end_code += GET(end_code, 1);
533 nigel 93 length = 1 + LINK_SIZE;
534 nigel 77 }
535     while (*end_code == OP_ALT);
536     }
537     }
538    
539     workspace[0] = 0; /* Bit indicating which vector is current */
540    
541     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
542    
543     /* Loop for scanning the subject */
544    
545     ptr = current_subject;
546     for (;;)
547     {
548     int i, j;
549 nigel 91 int clen, dlen;
550     unsigned int c, d;
551 ph10 428 int forced_fail = 0;
552 ph10 462 BOOL could_continue = FALSE;
553 nigel 77
554     /* Make the new state list into the active state list and empty the
555     new state list. */
556    
557     temp_states = active_states;
558     active_states = new_states;
559     new_states = temp_states;
560     active_count = new_count;
561     new_count = 0;
562    
563     workspace[0] ^= 1; /* Remember for the restarting feature */
564     workspace[1] = active_count;
565    
566 ph10 475 #ifdef PCRE_DEBUG
567 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
568     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
569     printf("\"\n");
570    
571     printf("%.*sActive states: ", rlevel*2-2, SP);
572     for (i = 0; i < active_count; i++)
573     printf("%d/%d ", active_states[i].offset, active_states[i].count);
574     printf("\n");
575     #endif
576    
577     /* Set the pointers for adding new states */
578    
579     next_active_state = active_states + active_count;
580     next_new_state = new_states;
581    
582     /* Load the current character from the subject outside the loop, as many
583     different states may want to look at it, and we assume that at least one
584     will. */
585    
586     if (ptr < end_subject)
587     {
588 nigel 93 clen = 1; /* Number of bytes in the character */
589 nigel 77 #ifdef SUPPORT_UTF8
590     if (utf8) { GETCHARLEN(c, ptr, clen); } else
591     #endif /* SUPPORT_UTF8 */
592     c = *ptr;
593     }
594     else
595     {
596 nigel 93 clen = 0; /* This indicates the end of the subject */
597     c = NOTACHAR; /* This value should never actually be used */
598 nigel 77 }
599    
600     /* Scan up the active states and act on each one. The result of an action
601     may be to add more states to the currently active list (e.g. on hitting a
602     parenthesis) or it may be to put states on the new list, for considering
603     when we move the character pointer on. */
604    
605     for (i = 0; i < active_count; i++)
606     {
607     stateblock *current_state = active_states + i;
608     const uschar *code;
609     int state_offset = current_state->offset;
610 ph10 397 int count, codevalue, rrc;
611 nigel 77
612 ph10 475 #ifdef PCRE_DEBUG
613 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
614 nigel 93 if (clen == 0) printf("EOL\n");
615 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
616     else printf("0x%02x\n", c);
617     #endif
618    
619     /* This variable is referred to implicity in the ADD_xxx macros. */
620    
621     ims = current_state->ims;
622    
623     /* A negative offset is a special case meaning "hold off going to this
624     (negated) state until the number of characters in the data field have
625     been skipped". */
626    
627     if (state_offset < 0)
628     {
629     if (current_state->data > 0)
630     {
631     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
632     ADD_NEW_DATA(state_offset, current_state->count,
633     current_state->data - 1);
634     continue;
635     }
636     else
637     {
638     current_state->offset = state_offset = -state_offset;
639     }
640     }
641    
642 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
643 ph10 439 See the note at the head of this module about the possibility of improving
644     performance here. */
645 nigel 77
646     for (j = 0; j < i; j++)
647     {
648     if (active_states[j].offset == state_offset &&
649     active_states[j].count == current_state->count)
650     {
651     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
652     goto NEXT_ACTIVE_STATE;
653     }
654     }
655    
656     /* The state offset is the offset to the opcode */
657    
658     code = start_code + state_offset;
659     codevalue = *code;
660    
661 ph10 463 /* If this opcode inspects a character, but we are at the end of the
662     subject, remember the fact for use when testing for a partial match. */
663    
664 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
665 ph10 463 could_continue = TRUE;
666 ph10 462
667 nigel 77 /* If this opcode is followed by an inline character, load it. It is
668     tempting to test for the presence of a subject character here, but that
669     is wrong, because sometimes zero repetitions of the subject are
670     permitted.
671    
672     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
673 ph10 178 argument that is not a data character - but is always one byte long. We
674     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
675     this case. To keep the other cases fast, convert these ones to new opcodes.
676     */
677 nigel 77
678     if (coptable[codevalue] > 0)
679     {
680     dlen = 1;
681     #ifdef SUPPORT_UTF8
682     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
683     #endif /* SUPPORT_UTF8 */
684     d = code[coptable[codevalue]];
685     if (codevalue >= OP_TYPESTAR)
686     {
687 nigel 93 switch(d)
688     {
689     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
690     case OP_NOTPROP:
691     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
692     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
693     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
694 ph10 178 case OP_NOT_HSPACE:
695 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
696 ph10 178 case OP_NOT_VSPACE:
697 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
698 nigel 93 default: break;
699     }
700 nigel 77 }
701     }
702     else
703     {
704     dlen = 0; /* Not strictly necessary, but compilers moan */
705 nigel 93 d = NOTACHAR; /* if these variables are not set. */
706 nigel 77 }
707    
708    
709     /* Now process the individual opcodes */
710    
711     switch (codevalue)
712     {
713 ph10 498 /* ========================================================================== */
714     /* These cases are never obeyed. This is a fudge that causes a compile-
715     time error if the vectors coptable or poptable, which are indexed by
716     opcode, are not the correct length. It seems to be the only way to do
717     such a check at compile time, as the sizeof() operator does not work
718     in the C preprocessor. */
719 ph10 507
720 ph10 498 case OP_TABLE_LENGTH:
721 ph10 507 case OP_TABLE_LENGTH +
722 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
723     (sizeof(poptable) == OP_TABLE_LENGTH)):
724 ph10 507 break;
725 nigel 77
726     /* ========================================================================== */
727     /* Reached a closing bracket. If not at the end of the pattern, carry
728     on with the next opcode. Otherwise, unless we have an empty string and
729 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
730 ph10 442 start of the subject, save the match data, shifting up all previous
731 nigel 77 matches so we always have the longest first. */
732    
733     case OP_KET:
734     case OP_KETRMIN:
735     case OP_KETRMAX:
736     if (code != end_code)
737     {
738     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
739     if (codevalue != OP_KET)
740     {
741     ADD_ACTIVE(state_offset - GET(code, 1), 0);
742     }
743     }
744 ph10 461 else
745 nigel 77 {
746 ph10 461 if (ptr > current_subject ||
747 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
748     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
749     current_subject > start_subject + md->start_offset)))
750 nigel 77 {
751 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
752     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
753     match_count = 0;
754     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
755     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
756     if (offsetcount >= 2)
757     {
758 ph10 530 offsets[0] = (int)(current_subject - start_subject);
759     offsets[1] = (int)(ptr - start_subject);
760 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
761     offsets[1] - offsets[0], current_subject));
762     }
763     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
764     {
765     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
766     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
767     match_count, rlevel*2-2, SP));
768     return match_count;
769     }
770 ph10 461 }
771 nigel 77 }
772     break;
773    
774     /* ========================================================================== */
775     /* These opcodes add to the current list of states without looking
776     at the current character. */
777    
778     /*-----------------------------------------------------------------*/
779     case OP_ALT:
780     do { code += GET(code, 1); } while (*code == OP_ALT);
781 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
782 nigel 77 break;
783    
784     /*-----------------------------------------------------------------*/
785     case OP_BRA:
786 nigel 93 case OP_SBRA:
787 nigel 77 do
788     {
789 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
790 nigel 77 code += GET(code, 1);
791     }
792     while (*code == OP_ALT);
793     break;
794    
795     /*-----------------------------------------------------------------*/
796 nigel 93 case OP_CBRA:
797     case OP_SCBRA:
798 ph10 530 ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
799 nigel 93 code += GET(code, 1);
800     while (*code == OP_ALT)
801     {
802 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
803 nigel 93 code += GET(code, 1);
804     }
805     break;
806    
807     /*-----------------------------------------------------------------*/
808 nigel 77 case OP_BRAZERO:
809     case OP_BRAMINZERO:
810     ADD_ACTIVE(state_offset + 1, 0);
811     code += 1 + GET(code, 2);
812     while (*code == OP_ALT) code += GET(code, 1);
813 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
814 nigel 77 break;
815    
816     /*-----------------------------------------------------------------*/
817 ph10 335 case OP_SKIPZERO:
818     code += 1 + GET(code, 2);
819     while (*code == OP_ALT) code += GET(code, 1);
820 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821 ph10 335 break;
822    
823     /*-----------------------------------------------------------------*/
824 nigel 77 case OP_CIRC:
825     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
826 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
827     ptr != end_subject &&
828 nigel 93 WAS_NEWLINE(ptr)))
829 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
830     break;
831    
832     /*-----------------------------------------------------------------*/
833     case OP_EOD:
834 ph10 553 if (ptr >= end_subject)
835     {
836     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
837     could_continue = TRUE;
838     else { ADD_ACTIVE(state_offset + 1, 0); }
839     }
840 nigel 77 break;
841    
842     /*-----------------------------------------------------------------*/
843     case OP_OPT:
844     ims = code[1];
845     ADD_ACTIVE(state_offset + 2, 0);
846     break;
847    
848     /*-----------------------------------------------------------------*/
849     case OP_SOD:
850     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
851     break;
852    
853     /*-----------------------------------------------------------------*/
854     case OP_SOM:
855     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
856     break;
857    
858    
859     /* ========================================================================== */
860     /* These opcodes inspect the next subject character, and sometimes
861     the previous one as well, but do not have an argument. The variable
862     clen contains the length of the current character and is zero if we are
863     at the end of the subject. */
864    
865     /*-----------------------------------------------------------------*/
866     case OP_ANY:
867 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
868 nigel 77 { ADD_NEW(state_offset + 1, 0); }
869     break;
870    
871     /*-----------------------------------------------------------------*/
872 ph10 341 case OP_ALLANY:
873     if (clen > 0)
874     { ADD_NEW(state_offset + 1, 0); }
875     break;
876    
877     /*-----------------------------------------------------------------*/
878 nigel 77 case OP_EODN:
879 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
880     could_continue = TRUE;
881     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
882 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
883     break;
884    
885     /*-----------------------------------------------------------------*/
886     case OP_DOLL:
887     if ((md->moptions & PCRE_NOTEOL) == 0)
888     {
889 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
890     could_continue = TRUE;
891     else if (clen == 0 ||
892 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
893 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
894     ))
895 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
896     }
897 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
898 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
899     break;
900    
901     /*-----------------------------------------------------------------*/
902    
903     case OP_DIGIT:
904     case OP_WHITESPACE:
905     case OP_WORDCHAR:
906     if (clen > 0 && c < 256 &&
907     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
908     { ADD_NEW(state_offset + 1, 0); }
909     break;
910    
911     /*-----------------------------------------------------------------*/
912     case OP_NOT_DIGIT:
913     case OP_NOT_WHITESPACE:
914     case OP_NOT_WORDCHAR:
915     if (clen > 0 && (c >= 256 ||
916     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
917     { ADD_NEW(state_offset + 1, 0); }
918     break;
919    
920     /*-----------------------------------------------------------------*/
921     case OP_WORD_BOUNDARY:
922     case OP_NOT_WORD_BOUNDARY:
923     {
924     int left_word, right_word;
925    
926     if (ptr > start_subject)
927     {
928     const uschar *temp = ptr - 1;
929 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
930 nigel 77 #ifdef SUPPORT_UTF8
931     if (utf8) BACKCHAR(temp);
932     #endif
933     GETCHARTEST(d, temp);
934 ph10 535 #ifdef SUPPORT_UCP
935 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
936     {
937     if (d == '_') left_word = TRUE; else
938 ph10 535 {
939 ph10 518 int cat = UCD_CATEGORY(d);
940     left_word = (cat == ucp_L || cat == ucp_N);
941 ph10 535 }
942     }
943     else
944     #endif
945 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
946     }
947 ph10 518 else left_word = FALSE;
948 nigel 77
949 ph10 461 if (clen > 0)
950 ph10 535 {
951     #ifdef SUPPORT_UCP
952 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
953     {
954     if (c == '_') right_word = TRUE; else
955 ph10 535 {
956 ph10 518 int cat = UCD_CATEGORY(c);
957     right_word = (cat == ucp_L || cat == ucp_N);
958 ph10 535 }
959     }
960     else
961     #endif
962 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
963 ph10 535 }
964 ph10 518 else right_word = FALSE;
965 nigel 77
966     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
967     { ADD_ACTIVE(state_offset + 1, 0); }
968     }
969     break;
970    
971    
972     /*-----------------------------------------------------------------*/
973     /* Check the next character by Unicode property. We will get here only
974     if the support is in the binary; otherwise a compile-time error occurs.
975     */
976    
977 ph10 151 #ifdef SUPPORT_UCP
978 nigel 77 case OP_PROP:
979     case OP_NOTPROP:
980     if (clen > 0)
981     {
982 nigel 87 BOOL OK;
983 ph10 349 const ucd_record * prop = GET_UCD(c);
984 nigel 87 switch(code[1])
985 nigel 77 {
986 nigel 87 case PT_ANY:
987     OK = TRUE;
988     break;
989    
990     case PT_LAMP:
991 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
992 ph10 517 prop->chartype == ucp_Lt;
993 nigel 87 break;
994    
995     case PT_GC:
996 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
997 nigel 87 break;
998    
999     case PT_PC:
1000 ph10 349 OK = prop->chartype == code[2];
1001 nigel 87 break;
1002    
1003     case PT_SC:
1004 ph10 349 OK = prop->script == code[2];
1005 nigel 87 break;
1006 ph10 535
1007 ph10 517 /* These are specials for combination cases. */
1008 ph10 535
1009 ph10 517 case PT_ALNUM:
1010     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1011     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1012 ph10 535 break;
1013    
1014 ph10 517 case PT_SPACE: /* Perl space */
1015     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1016     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1017 ph10 535 break;
1018    
1019 ph10 517 case PT_PXSPACE: /* POSIX space */
1020     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1021     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1022     c == CHAR_FF || c == CHAR_CR;
1023 ph10 535 break;
1024    
1025 ph10 517 case PT_WORD:
1026     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1027     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1028     c == CHAR_UNDERSCORE;
1029 ph10 535 break;
1030 nigel 87
1031     /* Should never occur, but keep compilers from grumbling. */
1032    
1033     default:
1034     OK = codevalue != OP_PROP;
1035     break;
1036 nigel 77 }
1037 nigel 87
1038     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1039 nigel 77 }
1040     break;
1041     #endif
1042    
1043    
1044    
1045     /* ========================================================================== */
1046     /* These opcodes likewise inspect the subject character, but have an
1047     argument that is not a data character. It is one of these opcodes:
1048 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1049     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1050 nigel 77
1051     case OP_TYPEPLUS:
1052     case OP_TYPEMINPLUS:
1053 nigel 93 case OP_TYPEPOSPLUS:
1054 nigel 77 count = current_state->count; /* Already matched */
1055     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1056     if (clen > 0)
1057     {
1058     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1059     (c < 256 &&
1060 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1061 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1062     {
1063 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1064     {
1065     active_count--; /* Remove non-match possibility */
1066     next_active_state--;
1067     }
1068 nigel 77 count++;
1069     ADD_NEW(state_offset, count);
1070     }
1071     }
1072     break;
1073    
1074     /*-----------------------------------------------------------------*/
1075     case OP_TYPEQUERY:
1076     case OP_TYPEMINQUERY:
1077 nigel 93 case OP_TYPEPOSQUERY:
1078 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1079     if (clen > 0)
1080     {
1081     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1082     (c < 256 &&
1083 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1084 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1085     {
1086 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1087     {
1088     active_count--; /* Remove non-match possibility */
1089     next_active_state--;
1090     }
1091 nigel 77 ADD_NEW(state_offset + 2, 0);
1092     }
1093     }
1094     break;
1095    
1096     /*-----------------------------------------------------------------*/
1097     case OP_TYPESTAR:
1098     case OP_TYPEMINSTAR:
1099 nigel 93 case OP_TYPEPOSSTAR:
1100 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1101     if (clen > 0)
1102     {
1103     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1104     (c < 256 &&
1105 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1106 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1107     {
1108 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1109     {
1110     active_count--; /* Remove non-match possibility */
1111     next_active_state--;
1112     }
1113 nigel 77 ADD_NEW(state_offset, 0);
1114     }
1115     }
1116     break;
1117    
1118     /*-----------------------------------------------------------------*/
1119     case OP_TYPEEXACT:
1120 nigel 93 count = current_state->count; /* Number already matched */
1121     if (clen > 0)
1122     {
1123     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1124     (c < 256 &&
1125 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1126 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1127     {
1128     if (++count >= GET2(code, 1))
1129     { ADD_NEW(state_offset + 4, 0); }
1130     else
1131     { ADD_NEW(state_offset, count); }
1132     }
1133     }
1134     break;
1135    
1136     /*-----------------------------------------------------------------*/
1137 nigel 77 case OP_TYPEUPTO:
1138     case OP_TYPEMINUPTO:
1139 nigel 93 case OP_TYPEPOSUPTO:
1140     ADD_ACTIVE(state_offset + 4, 0);
1141 nigel 77 count = current_state->count; /* Number already matched */
1142     if (clen > 0)
1143     {
1144     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1145     (c < 256 &&
1146 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1147 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1148     {
1149 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1150     {
1151     active_count--; /* Remove non-match possibility */
1152     next_active_state--;
1153     }
1154 nigel 77 if (++count >= GET2(code, 1))
1155     { ADD_NEW(state_offset + 4, 0); }
1156     else
1157     { ADD_NEW(state_offset, count); }
1158     }
1159     }
1160     break;
1161    
1162     /* ========================================================================== */
1163     /* These are virtual opcodes that are used when something like
1164 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or OP_HSPACE as its
1165     argument. It keeps the code above fast for the other cases. The argument
1166     is in the d variable. */
1167 nigel 77
1168 ph10 151 #ifdef SUPPORT_UCP
1169 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1170     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1171 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1172 nigel 77 count = current_state->count; /* Already matched */
1173 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1174 nigel 77 if (clen > 0)
1175     {
1176 nigel 87 BOOL OK;
1177 ph10 349 const ucd_record * prop = GET_UCD(c);
1178 nigel 87 switch(code[2])
1179     {
1180     case PT_ANY:
1181     OK = TRUE;
1182     break;
1183    
1184     case PT_LAMP:
1185 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1186 ph10 517 prop->chartype == ucp_Lt;
1187 nigel 87 break;
1188    
1189     case PT_GC:
1190 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1191 nigel 87 break;
1192    
1193     case PT_PC:
1194 ph10 349 OK = prop->chartype == code[3];
1195 nigel 87 break;
1196    
1197     case PT_SC:
1198 ph10 349 OK = prop->script == code[3];
1199 nigel 87 break;
1200    
1201 ph10 517 /* These are specials for combination cases. */
1202 ph10 535
1203 ph10 517 case PT_ALNUM:
1204     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1205     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1206 ph10 535 break;
1207    
1208 ph10 517 case PT_SPACE: /* Perl space */
1209     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1210     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1211 ph10 535 break;
1212    
1213 ph10 517 case PT_PXSPACE: /* POSIX space */
1214     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1215     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1216     c == CHAR_FF || c == CHAR_CR;
1217 ph10 535 break;
1218    
1219 ph10 517 case PT_WORD:
1220     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1221     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1222     c == CHAR_UNDERSCORE;
1223 ph10 535 break;
1224 ph10 517
1225 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1226    
1227     default:
1228     OK = codevalue != OP_PROP;
1229     break;
1230     }
1231    
1232 nigel 93 if (OK == (d == OP_PROP))
1233     {
1234     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1235     {
1236     active_count--; /* Remove non-match possibility */
1237     next_active_state--;
1238     }
1239     count++;
1240     ADD_NEW(state_offset, count);
1241     }
1242 nigel 77 }
1243     break;
1244    
1245     /*-----------------------------------------------------------------*/
1246     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1247     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1248 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1249 nigel 77 count = current_state->count; /* Already matched */
1250     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1251 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1252 nigel 77 {
1253     const uschar *nptr = ptr + clen;
1254     int ncount = 0;
1255 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1256     {
1257     active_count--; /* Remove non-match possibility */
1258     next_active_state--;
1259     }
1260 nigel 77 while (nptr < end_subject)
1261     {
1262     int nd;
1263     int ndlen = 1;
1264     GETCHARLEN(nd, nptr, ndlen);
1265 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1266 nigel 77 ncount++;
1267     nptr += ndlen;
1268     }
1269     count++;
1270     ADD_NEW_DATA(-state_offset, count, ncount);
1271     }
1272     break;
1273 ph10 151 #endif
1274 nigel 77
1275     /*-----------------------------------------------------------------*/
1276 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1277     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1278     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1279     count = current_state->count; /* Already matched */
1280     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1281     if (clen > 0)
1282     {
1283     int ncount = 0;
1284     switch (c)
1285     {
1286     case 0x000b:
1287     case 0x000c:
1288     case 0x0085:
1289     case 0x2028:
1290     case 0x2029:
1291 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1292     goto ANYNL01;
1293    
1294     case 0x000d:
1295     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1296     /* Fall through */
1297    
1298     ANYNL01:
1299     case 0x000a:
1300 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1301     {
1302     active_count--; /* Remove non-match possibility */
1303     next_active_state--;
1304     }
1305     count++;
1306     ADD_NEW_DATA(-state_offset, count, ncount);
1307     break;
1308 ph10 231
1309 nigel 93 default:
1310     break;
1311     }
1312     }
1313     break;
1314    
1315     /*-----------------------------------------------------------------*/
1316 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1317     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1318     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1319     count = current_state->count; /* Already matched */
1320     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1321     if (clen > 0)
1322     {
1323 ph10 182 BOOL OK;
1324 ph10 178 switch (c)
1325     {
1326     case 0x000a:
1327     case 0x000b:
1328     case 0x000c:
1329     case 0x000d:
1330     case 0x0085:
1331     case 0x2028:
1332     case 0x2029:
1333     OK = TRUE;
1334 ph10 182 break;
1335 ph10 178
1336     default:
1337     OK = FALSE;
1338 ph10 182 break;
1339 ph10 178 }
1340    
1341     if (OK == (d == OP_VSPACE))
1342 ph10 182 {
1343 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1344     {
1345     active_count--; /* Remove non-match possibility */
1346     next_active_state--;
1347     }
1348     count++;
1349     ADD_NEW_DATA(-state_offset, count, 0);
1350     }
1351     }
1352     break;
1353    
1354     /*-----------------------------------------------------------------*/
1355     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1356     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1357     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1358     count = current_state->count; /* Already matched */
1359     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1360     if (clen > 0)
1361     {
1362 ph10 182 BOOL OK;
1363 ph10 178 switch (c)
1364     {
1365     case 0x09: /* HT */
1366     case 0x20: /* SPACE */
1367     case 0xa0: /* NBSP */
1368     case 0x1680: /* OGHAM SPACE MARK */
1369     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1370     case 0x2000: /* EN QUAD */
1371     case 0x2001: /* EM QUAD */
1372     case 0x2002: /* EN SPACE */
1373     case 0x2003: /* EM SPACE */
1374     case 0x2004: /* THREE-PER-EM SPACE */
1375     case 0x2005: /* FOUR-PER-EM SPACE */
1376     case 0x2006: /* SIX-PER-EM SPACE */
1377     case 0x2007: /* FIGURE SPACE */
1378     case 0x2008: /* PUNCTUATION SPACE */
1379     case 0x2009: /* THIN SPACE */
1380     case 0x200A: /* HAIR SPACE */
1381     case 0x202f: /* NARROW NO-BREAK SPACE */
1382     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1383     case 0x3000: /* IDEOGRAPHIC SPACE */
1384     OK = TRUE;
1385     break;
1386 ph10 182
1387 ph10 178 default:
1388     OK = FALSE;
1389     break;
1390     }
1391 ph10 182
1392 ph10 178 if (OK == (d == OP_HSPACE))
1393 ph10 182 {
1394 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1395     {
1396     active_count--; /* Remove non-match possibility */
1397     next_active_state--;
1398     }
1399     count++;
1400     ADD_NEW_DATA(-state_offset, count, 0);
1401     }
1402     }
1403     break;
1404    
1405     /*-----------------------------------------------------------------*/
1406 ph10 151 #ifdef SUPPORT_UCP
1407 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1408     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1409 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1410 nigel 87 count = 4;
1411 nigel 77 goto QS1;
1412    
1413     case OP_PROP_EXTRA + OP_TYPESTAR:
1414     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1415 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1416 nigel 77 count = 0;
1417    
1418     QS1:
1419    
1420 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1421 nigel 77 if (clen > 0)
1422     {
1423 nigel 87 BOOL OK;
1424 ph10 349 const ucd_record * prop = GET_UCD(c);
1425 nigel 87 switch(code[2])
1426     {
1427     case PT_ANY:
1428     OK = TRUE;
1429     break;
1430    
1431     case PT_LAMP:
1432 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1433 ph10 517 prop->chartype == ucp_Lt;
1434 nigel 87 break;
1435    
1436     case PT_GC:
1437 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1438 nigel 87 break;
1439    
1440     case PT_PC:
1441 ph10 349 OK = prop->chartype == code[3];
1442 nigel 87 break;
1443    
1444     case PT_SC:
1445 ph10 349 OK = prop->script == code[3];
1446 nigel 87 break;
1447 ph10 535
1448 ph10 517 /* These are specials for combination cases. */
1449 ph10 535
1450 ph10 517 case PT_ALNUM:
1451     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1452     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1453 ph10 535 break;
1454    
1455 ph10 517 case PT_SPACE: /* Perl space */
1456     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1457     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1458 ph10 535 break;
1459    
1460 ph10 517 case PT_PXSPACE: /* POSIX space */
1461     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1462     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1463     c == CHAR_FF || c == CHAR_CR;
1464 ph10 535 break;
1465    
1466 ph10 517 case PT_WORD:
1467     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1468     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1469     c == CHAR_UNDERSCORE;
1470 ph10 535 break;
1471 nigel 87
1472     /* Should never occur, but keep compilers from grumbling. */
1473    
1474     default:
1475     OK = codevalue != OP_PROP;
1476     break;
1477     }
1478    
1479 nigel 93 if (OK == (d == OP_PROP))
1480     {
1481     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1482     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1483     {
1484     active_count--; /* Remove non-match possibility */
1485     next_active_state--;
1486     }
1487     ADD_NEW(state_offset + count, 0);
1488     }
1489 nigel 77 }
1490     break;
1491    
1492     /*-----------------------------------------------------------------*/
1493     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1494     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1495 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1496 nigel 77 count = 2;
1497     goto QS2;
1498    
1499     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1500     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1501 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1502 nigel 77 count = 0;
1503    
1504     QS2:
1505    
1506     ADD_ACTIVE(state_offset + 2, 0);
1507 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1508 nigel 77 {
1509     const uschar *nptr = ptr + clen;
1510     int ncount = 0;
1511 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1512     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1513     {
1514     active_count--; /* Remove non-match possibility */
1515     next_active_state--;
1516     }
1517 nigel 77 while (nptr < end_subject)
1518     {
1519     int nd;
1520     int ndlen = 1;
1521     GETCHARLEN(nd, nptr, ndlen);
1522 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1523 nigel 77 ncount++;
1524     nptr += ndlen;
1525     }
1526     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1527     }
1528     break;
1529 ph10 151 #endif
1530 nigel 77
1531     /*-----------------------------------------------------------------*/
1532 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1533     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1534     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1535     count = 2;
1536     goto QS3;
1537    
1538     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1539     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1540     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1541     count = 0;
1542    
1543     QS3:
1544     ADD_ACTIVE(state_offset + 2, 0);
1545     if (clen > 0)
1546     {
1547     int ncount = 0;
1548     switch (c)
1549     {
1550     case 0x000b:
1551     case 0x000c:
1552     case 0x0085:
1553     case 0x2028:
1554     case 0x2029:
1555 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1556     goto ANYNL02;
1557    
1558     case 0x000d:
1559     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1560     /* Fall through */
1561    
1562     ANYNL02:
1563     case 0x000a:
1564 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1565     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1566     {
1567     active_count--; /* Remove non-match possibility */
1568     next_active_state--;
1569     }
1570     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1571     break;
1572 ph10 231
1573 nigel 93 default:
1574     break;
1575     }
1576     }
1577     break;
1578    
1579     /*-----------------------------------------------------------------*/
1580 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1581     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1582     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1583     count = 2;
1584     goto QS4;
1585    
1586     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1587     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1588     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1589     count = 0;
1590    
1591     QS4:
1592     ADD_ACTIVE(state_offset + 2, 0);
1593     if (clen > 0)
1594     {
1595 ph10 182 BOOL OK;
1596 ph10 178 switch (c)
1597     {
1598     case 0x000a:
1599     case 0x000b:
1600     case 0x000c:
1601     case 0x000d:
1602     case 0x0085:
1603     case 0x2028:
1604     case 0x2029:
1605     OK = TRUE;
1606     break;
1607 ph10 182
1608 ph10 178 default:
1609     OK = FALSE;
1610     break;
1611     }
1612     if (OK == (d == OP_VSPACE))
1613 ph10 182 {
1614 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1615     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1616     {
1617     active_count--; /* Remove non-match possibility */
1618     next_active_state--;
1619     }
1620     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1621     }
1622     }
1623     break;
1624    
1625     /*-----------------------------------------------------------------*/
1626     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1627     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1628     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1629     count = 2;
1630     goto QS5;
1631    
1632     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1633     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1634     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1635     count = 0;
1636    
1637     QS5:
1638     ADD_ACTIVE(state_offset + 2, 0);
1639     if (clen > 0)
1640     {
1641 ph10 182 BOOL OK;
1642 ph10 178 switch (c)
1643     {
1644     case 0x09: /* HT */
1645     case 0x20: /* SPACE */
1646     case 0xa0: /* NBSP */
1647     case 0x1680: /* OGHAM SPACE MARK */
1648     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1649     case 0x2000: /* EN QUAD */
1650     case 0x2001: /* EM QUAD */
1651     case 0x2002: /* EN SPACE */
1652     case 0x2003: /* EM SPACE */
1653     case 0x2004: /* THREE-PER-EM SPACE */
1654     case 0x2005: /* FOUR-PER-EM SPACE */
1655     case 0x2006: /* SIX-PER-EM SPACE */
1656     case 0x2007: /* FIGURE SPACE */
1657     case 0x2008: /* PUNCTUATION SPACE */
1658     case 0x2009: /* THIN SPACE */
1659     case 0x200A: /* HAIR SPACE */
1660     case 0x202f: /* NARROW NO-BREAK SPACE */
1661     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1662     case 0x3000: /* IDEOGRAPHIC SPACE */
1663     OK = TRUE;
1664     break;
1665 ph10 182
1666 ph10 178 default:
1667     OK = FALSE;
1668     break;
1669     }
1670 ph10 182
1671 ph10 178 if (OK == (d == OP_HSPACE))
1672 ph10 182 {
1673 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1674     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1675     {
1676     active_count--; /* Remove non-match possibility */
1677     next_active_state--;
1678     }
1679     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1680     }
1681     }
1682     break;
1683    
1684     /*-----------------------------------------------------------------*/
1685 ph10 151 #ifdef SUPPORT_UCP
1686 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1687     case OP_PROP_EXTRA + OP_TYPEUPTO:
1688     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1689 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1690 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1691 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1692 nigel 77 count = current_state->count; /* Number already matched */
1693     if (clen > 0)
1694     {
1695 nigel 87 BOOL OK;
1696 ph10 349 const ucd_record * prop = GET_UCD(c);
1697 nigel 87 switch(code[4])
1698 nigel 77 {
1699 nigel 87 case PT_ANY:
1700     OK = TRUE;
1701     break;
1702    
1703     case PT_LAMP:
1704 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1705 ph10 517 prop->chartype == ucp_Lt;
1706 nigel 87 break;
1707    
1708     case PT_GC:
1709 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1710 nigel 87 break;
1711    
1712     case PT_PC:
1713 ph10 349 OK = prop->chartype == code[5];
1714 nigel 87 break;
1715    
1716     case PT_SC:
1717 ph10 349 OK = prop->script == code[5];
1718 nigel 87 break;
1719 ph10 535
1720 ph10 517 /* These are specials for combination cases. */
1721 ph10 535
1722 ph10 517 case PT_ALNUM:
1723     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1724     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1725 ph10 535 break;
1726    
1727 ph10 517 case PT_SPACE: /* Perl space */
1728     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1729     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1730 ph10 535 break;
1731    
1732 ph10 517 case PT_PXSPACE: /* POSIX space */
1733     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1734     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1735     c == CHAR_FF || c == CHAR_CR;
1736 ph10 535 break;
1737    
1738 ph10 517 case PT_WORD:
1739     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1740     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1741     c == CHAR_UNDERSCORE;
1742 ph10 535 break;
1743 nigel 87
1744     /* Should never occur, but keep compilers from grumbling. */
1745    
1746     default:
1747     OK = codevalue != OP_PROP;
1748     break;
1749     }
1750    
1751     if (OK == (d == OP_PROP))
1752     {
1753 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1754     {
1755     active_count--; /* Remove non-match possibility */
1756     next_active_state--;
1757     }
1758 nigel 77 if (++count >= GET2(code, 1))
1759 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1760 nigel 77 else
1761     { ADD_NEW(state_offset, count); }
1762     }
1763     }
1764     break;
1765    
1766     /*-----------------------------------------------------------------*/
1767     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1768     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1769     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1770 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1771 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1772     { ADD_ACTIVE(state_offset + 4, 0); }
1773     count = current_state->count; /* Number already matched */
1774 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1775 nigel 77 {
1776     const uschar *nptr = ptr + clen;
1777     int ncount = 0;
1778 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1779     {
1780     active_count--; /* Remove non-match possibility */
1781     next_active_state--;
1782     }
1783 nigel 77 while (nptr < end_subject)
1784     {
1785     int nd;
1786     int ndlen = 1;
1787     GETCHARLEN(nd, nptr, ndlen);
1788 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1789 nigel 77 ncount++;
1790     nptr += ndlen;
1791     }
1792     if (++count >= GET2(code, 1))
1793     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1794     else
1795     { ADD_NEW_DATA(-state_offset, count, ncount); }
1796     }
1797     break;
1798 ph10 151 #endif
1799 nigel 77
1800 nigel 93 /*-----------------------------------------------------------------*/
1801     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1802     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1803     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1804     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1805     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1806     { ADD_ACTIVE(state_offset + 4, 0); }
1807     count = current_state->count; /* Number already matched */
1808     if (clen > 0)
1809     {
1810     int ncount = 0;
1811     switch (c)
1812     {
1813     case 0x000b:
1814     case 0x000c:
1815     case 0x0085:
1816     case 0x2028:
1817     case 0x2029:
1818 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1819     goto ANYNL03;
1820    
1821     case 0x000d:
1822     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1823     /* Fall through */
1824    
1825     ANYNL03:
1826     case 0x000a:
1827 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1828     {
1829     active_count--; /* Remove non-match possibility */
1830     next_active_state--;
1831     }
1832     if (++count >= GET2(code, 1))
1833     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1834     else
1835     { ADD_NEW_DATA(-state_offset, count, ncount); }
1836     break;
1837 ph10 231
1838 nigel 93 default:
1839     break;
1840     }
1841     }
1842     break;
1843    
1844 ph10 178 /*-----------------------------------------------------------------*/
1845     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1846     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1847     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1848     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1849     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1850     { ADD_ACTIVE(state_offset + 4, 0); }
1851     count = current_state->count; /* Number already matched */
1852     if (clen > 0)
1853     {
1854 ph10 182 BOOL OK;
1855 ph10 178 switch (c)
1856     {
1857     case 0x000a:
1858     case 0x000b:
1859     case 0x000c:
1860     case 0x000d:
1861     case 0x0085:
1862     case 0x2028:
1863     case 0x2029:
1864     OK = TRUE;
1865     break;
1866 ph10 182
1867 ph10 178 default:
1868     OK = FALSE;
1869     }
1870 ph10 182
1871 ph10 178 if (OK == (d == OP_VSPACE))
1872 ph10 182 {
1873 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1874     {
1875     active_count--; /* Remove non-match possibility */
1876     next_active_state--;
1877     }
1878     if (++count >= GET2(code, 1))
1879     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1880     else
1881     { ADD_NEW_DATA(-state_offset, count, 0); }
1882     }
1883     }
1884     break;
1885    
1886     /*-----------------------------------------------------------------*/
1887     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1888     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1889     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1890     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1891     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1892     { ADD_ACTIVE(state_offset + 4, 0); }
1893     count = current_state->count; /* Number already matched */
1894     if (clen > 0)
1895     {
1896 ph10 182 BOOL OK;
1897 ph10 178 switch (c)
1898     {
1899     case 0x09: /* HT */
1900     case 0x20: /* SPACE */
1901     case 0xa0: /* NBSP */
1902     case 0x1680: /* OGHAM SPACE MARK */
1903     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1904     case 0x2000: /* EN QUAD */
1905     case 0x2001: /* EM QUAD */
1906     case 0x2002: /* EN SPACE */
1907     case 0x2003: /* EM SPACE */
1908     case 0x2004: /* THREE-PER-EM SPACE */
1909     case 0x2005: /* FOUR-PER-EM SPACE */
1910     case 0x2006: /* SIX-PER-EM SPACE */
1911     case 0x2007: /* FIGURE SPACE */
1912     case 0x2008: /* PUNCTUATION SPACE */
1913     case 0x2009: /* THIN SPACE */
1914     case 0x200A: /* HAIR SPACE */
1915     case 0x202f: /* NARROW NO-BREAK SPACE */
1916     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1917     case 0x3000: /* IDEOGRAPHIC SPACE */
1918     OK = TRUE;
1919     break;
1920 ph10 182
1921 ph10 178 default:
1922     OK = FALSE;
1923     break;
1924     }
1925 ph10 182
1926 ph10 178 if (OK == (d == OP_HSPACE))
1927 ph10 182 {
1928 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1929     {
1930     active_count--; /* Remove non-match possibility */
1931     next_active_state--;
1932     }
1933     if (++count >= GET2(code, 1))
1934     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1935     else
1936     { ADD_NEW_DATA(-state_offset, count, 0); }
1937     }
1938     }
1939     break;
1940    
1941 nigel 77 /* ========================================================================== */
1942     /* These opcodes are followed by a character that is usually compared
1943     to the current subject character; it is loaded into d. We still get
1944     here even if there is no subject character, because in some cases zero
1945     repetitions are permitted. */
1946    
1947     /*-----------------------------------------------------------------*/
1948     case OP_CHAR:
1949     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1950     break;
1951    
1952     /*-----------------------------------------------------------------*/
1953     case OP_CHARNC:
1954     if (clen == 0) break;
1955    
1956     #ifdef SUPPORT_UTF8
1957     if (utf8)
1958     {
1959     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1960     {
1961 nigel 93 unsigned int othercase;
1962 nigel 77 if (c < 128) othercase = fcc[c]; else
1963    
1964     /* If we have Unicode property support, we can use it to test the
1965 nigel 87 other case of the character. */
1966 nigel 77
1967     #ifdef SUPPORT_UCP
1968 ph10 349 othercase = UCD_OTHERCASE(c);
1969 nigel 87 #else
1970 nigel 93 othercase = NOTACHAR;
1971 nigel 77 #endif
1972    
1973     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1974     }
1975     }
1976     else
1977     #endif /* SUPPORT_UTF8 */
1978    
1979     /* Non-UTF-8 mode */
1980     {
1981     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1982     }
1983     break;
1984    
1985    
1986     #ifdef SUPPORT_UCP
1987     /*-----------------------------------------------------------------*/
1988     /* This is a tricky one because it can match more than one character.
1989     Find out how many characters to skip, and then set up a negative state
1990     to wait for them to pass before continuing. */
1991    
1992     case OP_EXTUNI:
1993 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1994 nigel 77 {
1995     const uschar *nptr = ptr + clen;
1996     int ncount = 0;
1997     while (nptr < end_subject)
1998     {
1999     int nclen = 1;
2000     GETCHARLEN(c, nptr, nclen);
2001 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
2002 nigel 77 ncount++;
2003     nptr += nclen;
2004     }
2005     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2006     }
2007     break;
2008     #endif
2009    
2010     /*-----------------------------------------------------------------*/
2011 nigel 93 /* This is a tricky one like EXTUNI because it too can match more than one
2012     character (when CR is followed by LF). In this case, set up a negative
2013     state to wait for one character to pass before continuing. */
2014    
2015     case OP_ANYNL:
2016     if (clen > 0) switch(c)
2017     {
2018     case 0x000b:
2019     case 0x000c:
2020     case 0x0085:
2021     case 0x2028:
2022     case 0x2029:
2023 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2024    
2025     case 0x000a:
2026 nigel 93 ADD_NEW(state_offset + 1, 0);
2027     break;
2028 ph10 231
2029 nigel 93 case 0x000d:
2030     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2031     {
2032     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2033     }
2034     else
2035     {
2036     ADD_NEW(state_offset + 1, 0);
2037     }
2038     break;
2039     }
2040     break;
2041    
2042     /*-----------------------------------------------------------------*/
2043 ph10 178 case OP_NOT_VSPACE:
2044     if (clen > 0) switch(c)
2045     {
2046     case 0x000a:
2047     case 0x000b:
2048     case 0x000c:
2049     case 0x000d:
2050     case 0x0085:
2051     case 0x2028:
2052     case 0x2029:
2053     break;
2054 ph10 182
2055     default:
2056 ph10 178 ADD_NEW(state_offset + 1, 0);
2057     break;
2058     }
2059     break;
2060    
2061     /*-----------------------------------------------------------------*/
2062     case OP_VSPACE:
2063     if (clen > 0) switch(c)
2064     {
2065     case 0x000a:
2066     case 0x000b:
2067     case 0x000c:
2068     case 0x000d:
2069     case 0x0085:
2070     case 0x2028:
2071     case 0x2029:
2072     ADD_NEW(state_offset + 1, 0);
2073     break;
2074 ph10 182
2075 ph10 178 default: break;
2076     }
2077     break;
2078    
2079     /*-----------------------------------------------------------------*/
2080     case OP_NOT_HSPACE:
2081     if (clen > 0) switch(c)
2082     {
2083     case 0x09: /* HT */
2084     case 0x20: /* SPACE */
2085     case 0xa0: /* NBSP */
2086     case 0x1680: /* OGHAM SPACE MARK */
2087     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2088     case 0x2000: /* EN QUAD */
2089     case 0x2001: /* EM QUAD */
2090     case 0x2002: /* EN SPACE */
2091     case 0x2003: /* EM SPACE */
2092     case 0x2004: /* THREE-PER-EM SPACE */
2093     case 0x2005: /* FOUR-PER-EM SPACE */
2094     case 0x2006: /* SIX-PER-EM SPACE */
2095     case 0x2007: /* FIGURE SPACE */
2096     case 0x2008: /* PUNCTUATION SPACE */
2097     case 0x2009: /* THIN SPACE */
2098     case 0x200A: /* HAIR SPACE */
2099     case 0x202f: /* NARROW NO-BREAK SPACE */
2100     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2101     case 0x3000: /* IDEOGRAPHIC SPACE */
2102     break;
2103 ph10 182
2104     default:
2105 ph10 178 ADD_NEW(state_offset + 1, 0);
2106     break;
2107     }
2108     break;
2109    
2110     /*-----------------------------------------------------------------*/
2111     case OP_HSPACE:
2112     if (clen > 0) switch(c)
2113     {
2114     case 0x09: /* HT */
2115     case 0x20: /* SPACE */
2116     case 0xa0: /* NBSP */
2117     case 0x1680: /* OGHAM SPACE MARK */
2118     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2119     case 0x2000: /* EN QUAD */
2120     case 0x2001: /* EM QUAD */
2121     case 0x2002: /* EN SPACE */
2122     case 0x2003: /* EM SPACE */
2123     case 0x2004: /* THREE-PER-EM SPACE */
2124     case 0x2005: /* FOUR-PER-EM SPACE */
2125     case 0x2006: /* SIX-PER-EM SPACE */
2126     case 0x2007: /* FIGURE SPACE */
2127     case 0x2008: /* PUNCTUATION SPACE */
2128     case 0x2009: /* THIN SPACE */
2129     case 0x200A: /* HAIR SPACE */
2130     case 0x202f: /* NARROW NO-BREAK SPACE */
2131     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2132     case 0x3000: /* IDEOGRAPHIC SPACE */
2133     ADD_NEW(state_offset + 1, 0);
2134     break;
2135     }
2136     break;
2137    
2138     /*-----------------------------------------------------------------*/
2139 nigel 77 /* Match a negated single character. This is only used for one-byte
2140     characters, that is, we know that d < 256. The character we are
2141     checking (c) can be multibyte. */
2142    
2143     case OP_NOT:
2144     if (clen > 0)
2145     {
2146 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2147 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2148     }
2149     break;
2150    
2151     /*-----------------------------------------------------------------*/
2152     case OP_PLUS:
2153     case OP_MINPLUS:
2154 nigel 93 case OP_POSPLUS:
2155 nigel 77 case OP_NOTPLUS:
2156     case OP_NOTMINPLUS:
2157 nigel 93 case OP_NOTPOSPLUS:
2158 nigel 77 count = current_state->count; /* Already matched */
2159     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2160     if (clen > 0)
2161     {
2162 nigel 93 unsigned int otherd = NOTACHAR;
2163 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2164     {
2165     #ifdef SUPPORT_UTF8
2166 nigel 87 if (utf8 && d >= 128)
2167 nigel 77 {
2168     #ifdef SUPPORT_UCP
2169 ph10 349 otherd = UCD_OTHERCASE(d);
2170 nigel 77 #endif /* SUPPORT_UCP */
2171     }
2172     else
2173     #endif /* SUPPORT_UTF8 */
2174     otherd = fcc[d];
2175     }
2176     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2177 nigel 93 {
2178     if (count > 0 &&
2179     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2180     {
2181     active_count--; /* Remove non-match possibility */
2182     next_active_state--;
2183     }
2184     count++;
2185     ADD_NEW(state_offset, count);
2186     }
2187 nigel 77 }
2188     break;
2189    
2190     /*-----------------------------------------------------------------*/
2191     case OP_QUERY:
2192     case OP_MINQUERY:
2193 nigel 93 case OP_POSQUERY:
2194 nigel 77 case OP_NOTQUERY:
2195     case OP_NOTMINQUERY:
2196 nigel 93 case OP_NOTPOSQUERY:
2197 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2198     if (clen > 0)
2199     {
2200 nigel 93 unsigned int otherd = NOTACHAR;
2201 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2202 nigel 77 {
2203     #ifdef SUPPORT_UTF8
2204 nigel 87 if (utf8 && d >= 128)
2205 nigel 77 {
2206     #ifdef SUPPORT_UCP
2207 ph10 349 otherd = UCD_OTHERCASE(d);
2208 nigel 77 #endif /* SUPPORT_UCP */
2209     }
2210     else
2211     #endif /* SUPPORT_UTF8 */
2212     otherd = fcc[d];
2213     }
2214     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2215 nigel 93 {
2216     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2217     {
2218     active_count--; /* Remove non-match possibility */
2219     next_active_state--;
2220     }
2221     ADD_NEW(state_offset + dlen + 1, 0);
2222     }
2223 nigel 77 }
2224     break;
2225    
2226     /*-----------------------------------------------------------------*/
2227     case OP_STAR:
2228     case OP_MINSTAR:
2229 nigel 93 case OP_POSSTAR:
2230 nigel 77 case OP_NOTSTAR:
2231     case OP_NOTMINSTAR:
2232 nigel 93 case OP_NOTPOSSTAR:
2233 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2234     if (clen > 0)
2235     {
2236 nigel 93 unsigned int otherd = NOTACHAR;
2237 nigel 91 if ((ims & PCRE_CASELESS) != 0)
2238 nigel 77 {
2239     #ifdef SUPPORT_UTF8
2240 nigel 87 if (utf8 && d >= 128)
2241 nigel 77 {
2242     #ifdef SUPPORT_UCP
2243 ph10 349 otherd = UCD_OTHERCASE(d);
2244 nigel 77 #endif /* SUPPORT_UCP */
2245     }
2246     else
2247     #endif /* SUPPORT_UTF8 */
2248     otherd = fcc[d];
2249     }
2250     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2251 nigel 93 {
2252     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2253     {
2254     active_count--; /* Remove non-match possibility */
2255     next_active_state--;
2256     }
2257     ADD_NEW(state_offset, 0);
2258     }
2259 nigel 77 }
2260     break;
2261    
2262     /*-----------------------------------------------------------------*/
2263     case OP_EXACT:
2264 nigel 93 case OP_NOTEXACT:
2265     count = current_state->count; /* Number already matched */
2266     if (clen > 0)
2267     {
2268     unsigned int otherd = NOTACHAR;
2269     if ((ims & PCRE_CASELESS) != 0)
2270     {
2271     #ifdef SUPPORT_UTF8
2272     if (utf8 && d >= 128)
2273     {
2274     #ifdef SUPPORT_UCP
2275 ph10 349 otherd = UCD_OTHERCASE(d);
2276 nigel 93 #endif /* SUPPORT_UCP */
2277     }
2278     else
2279     #endif /* SUPPORT_UTF8 */
2280     otherd = fcc[d];
2281     }
2282     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2283     {
2284     if (++count >= GET2(code, 1))
2285     { ADD_NEW(state_offset + dlen + 3, 0); }
2286     else
2287     { ADD_NEW(state_offset, count); }
2288     }
2289     }
2290     break;
2291    
2292     /*-----------------------------------------------------------------*/
2293 nigel 77 case OP_UPTO:
2294     case OP_MINUPTO:
2295 nigel 93 case OP_POSUPTO:
2296 nigel 77 case OP_NOTUPTO:
2297     case OP_NOTMINUPTO:
2298 nigel 93 case OP_NOTPOSUPTO:
2299     ADD_ACTIVE(state_offset + dlen + 3, 0);
2300 nigel 77 count = current_state->count; /* Number already matched */
2301     if (clen > 0)
2302     {
2303 nigel 93 unsigned int otherd = NOTACHAR;
2304 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2305     {
2306     #ifdef SUPPORT_UTF8
2307 nigel 87 if (utf8 && d >= 128)
2308 nigel 77 {
2309     #ifdef SUPPORT_UCP
2310 ph10 349 otherd = UCD_OTHERCASE(d);
2311 nigel 77 #endif /* SUPPORT_UCP */
2312     }
2313     else
2314     #endif /* SUPPORT_UTF8 */
2315     otherd = fcc[d];
2316     }
2317     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2318     {
2319 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2320     {
2321     active_count--; /* Remove non-match possibility */
2322     next_active_state--;
2323     }
2324 nigel 77 if (++count >= GET2(code, 1))
2325     { ADD_NEW(state_offset + dlen + 3, 0); }
2326     else
2327     { ADD_NEW(state_offset, count); }
2328     }
2329     }
2330     break;
2331    
2332    
2333     /* ========================================================================== */
2334     /* These are the class-handling opcodes */
2335    
2336     case OP_CLASS:
2337     case OP_NCLASS:
2338     case OP_XCLASS:
2339     {
2340     BOOL isinclass = FALSE;
2341     int next_state_offset;
2342     const uschar *ecode;
2343    
2344     /* For a simple class, there is always just a 32-byte table, and we
2345     can set isinclass from it. */
2346    
2347     if (codevalue != OP_XCLASS)
2348     {
2349     ecode = code + 33;
2350     if (clen > 0)
2351     {
2352     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2353     ((code[1 + c/8] & (1 << (c&7))) != 0);
2354     }
2355     }
2356    
2357     /* An extended class may have a table or a list of single characters,
2358     ranges, or both, and it may be positive or negative. There's a
2359     function that sorts all this out. */
2360    
2361     else
2362     {
2363     ecode = code + GET(code, 1);
2364     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2365     }
2366    
2367     /* At this point, isinclass is set for all kinds of class, and ecode
2368     points to the byte after the end of the class. If there is a
2369     quantifier, this is where it will be. */
2370    
2371 ph10 530 next_state_offset = (int)(ecode - start_code);
2372 nigel 77
2373     switch (*ecode)
2374     {
2375     case OP_CRSTAR:
2376     case OP_CRMINSTAR:
2377     ADD_ACTIVE(next_state_offset + 1, 0);
2378     if (isinclass) { ADD_NEW(state_offset, 0); }
2379     break;
2380    
2381     case OP_CRPLUS:
2382     case OP_CRMINPLUS:
2383     count = current_state->count; /* Already matched */
2384     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2385     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2386     break;
2387    
2388     case OP_CRQUERY:
2389     case OP_CRMINQUERY:
2390     ADD_ACTIVE(next_state_offset + 1, 0);
2391     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2392     break;
2393    
2394     case OP_CRRANGE:
2395     case OP_CRMINRANGE:
2396     count = current_state->count; /* Already matched */
2397     if (count >= GET2(ecode, 1))
2398     { ADD_ACTIVE(next_state_offset + 5, 0); }
2399     if (isinclass)
2400     {
2401 nigel 91 int max = GET2(ecode, 3);
2402     if (++count >= max && max != 0) /* Max 0 => no limit */
2403 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2404     else
2405     { ADD_NEW(state_offset, count); }
2406     }
2407     break;
2408    
2409     default:
2410     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2411     break;
2412     }
2413     }
2414     break;
2415    
2416     /* ========================================================================== */
2417     /* These are the opcodes for fancy brackets of various kinds. We have
2418 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2419     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2420 ph10 341 though the other "backtracking verbs" are not supported. */
2421 ph10 345
2422 ph10 341 case OP_FAIL:
2423 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2424 ph10 345 break;
2425 nigel 77
2426     case OP_ASSERT:
2427     case OP_ASSERT_NOT:
2428     case OP_ASSERTBACK:
2429     case OP_ASSERTBACK_NOT:
2430     {
2431     int rc;
2432     int local_offsets[2];
2433     int local_workspace[1000];
2434     const uschar *endasscode = code + GET(code, 1);
2435    
2436     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2437    
2438     rc = internal_dfa_exec(
2439     md, /* static match data */
2440     code, /* this subexpression's code */
2441     ptr, /* where we currently are */
2442 ph10 530 (int)(ptr - start_subject), /* start offset */
2443 nigel 77 local_offsets, /* offset vector */
2444     sizeof(local_offsets)/sizeof(int), /* size of same */
2445     local_workspace, /* workspace vector */
2446     sizeof(local_workspace)/sizeof(int), /* size of same */
2447     ims, /* the current ims flags */
2448     rlevel, /* function recursion level */
2449     recursing); /* pass on regex recursion */
2450 ph10 487
2451 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2452 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2453 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2454 nigel 77 }
2455     break;
2456    
2457     /*-----------------------------------------------------------------*/
2458     case OP_COND:
2459 nigel 93 case OP_SCOND:
2460 nigel 77 {
2461     int local_offsets[1000];
2462     int local_workspace[1000];
2463 ph10 406 int codelink = GET(code, 1);
2464 ph10 397 int condcode;
2465 ph10 406
2466 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2467 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2468 ph10 398 happen for the other conditions. */
2469 nigel 77
2470 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2471 ph10 406 {
2472     rrc = 0;
2473 ph10 397 if (pcre_callout != NULL)
2474     {
2475     pcre_callout_block cb;
2476     cb.version = 1; /* Version 1 of the callout block */
2477     cb.callout_number = code[LINK_SIZE+2];
2478     cb.offset_vector = offsets;
2479     cb.subject = (PCRE_SPTR)start_subject;
2480 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2481     cb.start_match = (int)(current_subject - start_subject);
2482     cb.current_position = (int)(ptr - start_subject);
2483 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2484     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2485     cb.capture_top = 1;
2486     cb.capture_last = -1;
2487     cb.callout_data = md->callout_data;
2488     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2489     }
2490 ph10 398 if (rrc > 0) break; /* Fail this thread */
2491     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2492 ph10 406 }
2493 ph10 398
2494 ph10 397 condcode = code[LINK_SIZE+1];
2495 ph10 406
2496 nigel 93 /* Back reference conditions are not supported */
2497 nigel 77
2498 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2499 ph10 459 return PCRE_ERROR_DFA_UCOND;
2500 nigel 93
2501     /* The DEFINE condition is always false */
2502    
2503     if (condcode == OP_DEF)
2504 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2505 nigel 93
2506     /* The only supported version of OP_RREF is for the value RREF_ANY,
2507     which means "test if in any recursion". We can't test for specifically
2508     recursed groups. */
2509    
2510 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2511 nigel 93 {
2512 nigel 77 int value = GET2(code, LINK_SIZE+2);
2513 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2514 ph10 406 if (recursing > 0)
2515 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2516     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2517 nigel 77 }
2518    
2519     /* Otherwise, the condition is an assertion */
2520    
2521     else
2522     {
2523     int rc;
2524     const uschar *asscode = code + LINK_SIZE + 1;
2525     const uschar *endasscode = asscode + GET(asscode, 1);
2526    
2527     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2528    
2529     rc = internal_dfa_exec(
2530     md, /* fixed match data */
2531     asscode, /* this subexpression's code */
2532     ptr, /* where we currently are */
2533 ph10 530 (int)(ptr - start_subject), /* start offset */
2534 nigel 77 local_offsets, /* offset vector */
2535     sizeof(local_offsets)/sizeof(int), /* size of same */
2536     local_workspace, /* workspace vector */
2537     sizeof(local_workspace)/sizeof(int), /* size of same */
2538     ims, /* the current ims flags */
2539     rlevel, /* function recursion level */
2540     recursing); /* pass on regex recursion */
2541    
2542 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2543 nigel 77 if ((rc >= 0) ==
2544     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2545 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2546 nigel 77 else
2547 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2548 nigel 77 }
2549     }
2550     break;
2551    
2552     /*-----------------------------------------------------------------*/
2553     case OP_RECURSE:
2554     {
2555     int local_offsets[1000];
2556     int local_workspace[1000];
2557     int rc;
2558    
2559     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2560     recursing + 1));
2561    
2562     rc = internal_dfa_exec(
2563     md, /* fixed match data */
2564     start_code + GET(code, 1), /* this subexpression's code */
2565     ptr, /* where we currently are */
2566 ph10 530 (int)(ptr - start_subject), /* start offset */
2567 nigel 77 local_offsets, /* offset vector */
2568     sizeof(local_offsets)/sizeof(int), /* size of same */
2569     local_workspace, /* workspace vector */
2570     sizeof(local_workspace)/sizeof(int), /* size of same */
2571     ims, /* the current ims flags */
2572     rlevel, /* function recursion level */
2573     recursing + 1); /* regex recurse level */
2574    
2575     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2576     recursing + 1, rc));
2577    
2578     /* Ran out of internal offsets */
2579    
2580     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2581    
2582     /* For each successful matched substring, set up the next state with a
2583     count of characters to skip before trying it. Note that the count is in
2584     characters, not bytes. */
2585    
2586     if (rc > 0)
2587     {
2588     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2589     {
2590     const uschar *p = start_subject + local_offsets[rc];
2591     const uschar *pp = start_subject + local_offsets[rc+1];
2592     int charcount = local_offsets[rc+1] - local_offsets[rc];
2593     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2594     if (charcount > 0)
2595     {
2596     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2597     }
2598     else
2599     {
2600     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2601     }
2602     }
2603     }
2604     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2605     }
2606     break;
2607    
2608     /*-----------------------------------------------------------------*/
2609     case OP_ONCE:
2610     {
2611     int local_offsets[2];
2612     int local_workspace[1000];
2613    
2614     int rc = internal_dfa_exec(
2615     md, /* fixed match data */
2616     code, /* this subexpression's code */
2617     ptr, /* where we currently are */
2618 ph10 530 (int)(ptr - start_subject), /* start offset */
2619 nigel 77 local_offsets, /* offset vector */
2620     sizeof(local_offsets)/sizeof(int), /* size of same */
2621     local_workspace, /* workspace vector */
2622     sizeof(local_workspace)/sizeof(int), /* size of same */
2623     ims, /* the current ims flags */
2624     rlevel, /* function recursion level */
2625     recursing); /* pass on regex recursion */
2626    
2627     if (rc >= 0)
2628     {
2629     const uschar *end_subpattern = code;
2630     int charcount = local_offsets[1] - local_offsets[0];
2631     int next_state_offset, repeat_state_offset;
2632    
2633     do { end_subpattern += GET(end_subpattern, 1); }
2634     while (*end_subpattern == OP_ALT);
2635 ph10 535 next_state_offset =
2636 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2637 nigel 77
2638     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2639     arrange for the repeat state also to be added to the relevant list.
2640     Calculate the offset, or set -1 for no repeat. */
2641    
2642     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2643     *end_subpattern == OP_KETRMIN)?
2644 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2645 nigel 77
2646     /* If we have matched an empty string, add the next state at the
2647     current character pointer. This is important so that the duplicate
2648     checking kicks in, which is what breaks infinite loops that match an
2649     empty string. */
2650    
2651     if (charcount == 0)
2652     {
2653     ADD_ACTIVE(next_state_offset, 0);
2654     }
2655    
2656     /* Optimization: if there are no more active states, and there
2657     are no new states yet set up, then skip over the subject string
2658     right here, to save looping. Otherwise, set up the new state to swing
2659     into action when the end of the substring is reached. */
2660    
2661     else if (i + 1 >= active_count && new_count == 0)
2662     {
2663     ptr += charcount;
2664     clen = 0;
2665     ADD_NEW(next_state_offset, 0);
2666    
2667     /* If we are adding a repeat state at the new character position,
2668     we must fudge things so that it is the only current state.
2669     Otherwise, it might be a duplicate of one we processed before, and
2670     that would cause it to be skipped. */
2671    
2672     if (repeat_state_offset >= 0)
2673     {
2674     next_active_state = active_states;
2675     active_count = 0;
2676     i = -1;
2677     ADD_ACTIVE(repeat_state_offset, 0);
2678     }
2679     }
2680     else
2681     {
2682     const uschar *p = start_subject + local_offsets[0];
2683     const uschar *pp = start_subject + local_offsets[1];
2684     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2685     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2686     if (repeat_state_offset >= 0)
2687     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2688     }
2689    
2690     }
2691     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2692     }
2693     break;
2694    
2695    
2696     /* ========================================================================== */
2697     /* Handle callouts */
2698    
2699     case OP_CALLOUT:
2700 ph10 406 rrc = 0;
2701 nigel 77 if (pcre_callout != NULL)
2702     {
2703     pcre_callout_block cb;
2704     cb.version = 1; /* Version 1 of the callout block */
2705     cb.callout_number = code[1];
2706     cb.offset_vector = offsets;
2707 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2708 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2709     cb.start_match = (int)(current_subject - start_subject);
2710     cb.current_position = (int)(ptr - start_subject);
2711 nigel 77 cb.pattern_position = GET(code, 2);
2712     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2713     cb.capture_top = 1;
2714     cb.capture_last = -1;
2715     cb.callout_data = md->callout_data;
2716     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2717 ph10 406 }
2718     if (rrc == 0)
2719     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2720 nigel 77 break;
2721    
2722    
2723     /* ========================================================================== */
2724     default: /* Unsupported opcode */
2725     return PCRE_ERROR_DFA_UITEM;
2726     }
2727    
2728     NEXT_ACTIVE_STATE: continue;
2729    
2730     } /* End of loop scanning active states */
2731    
2732     /* We have finished the processing at the current subject character. If no
2733     new states have been set for the next character, we have found all the
2734     matches that we are going to find. If we are at the top level and partial
2735 ph10 463 matching has been requested, check for appropriate conditions.
2736    
2737 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2738     character. If it is equal to the original active_count (saved in
2739     workspace[1]) it means that (*F) was found on every active state. In this
2740 ph10 463 case we don't want to give a partial match.
2741 nigel 77
2742 ph10 463 The "could_continue" variable is true if a state could have continued but
2743     for the fact that the end of the subject was reached. */
2744    
2745 nigel 77 if (new_count <= 0)
2746     {
2747 ph10 427 if (rlevel == 1 && /* Top level, and */
2748 ph10 463 could_continue && /* Some could go on */
2749 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2750 ph10 427 ( /* either... */
2751     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2752     || /* or... */
2753     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2754     match_count < 0) /* no matches */
2755     ) && /* And... */
2756 ph10 553 ptr >= end_subject && /* Reached end of subject */
2757     ptr > md->start_used_ptr) /* Inspected non-empty string */
2758 nigel 77 {
2759     if (offsetcount >= 2)
2760     {
2761 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
2762     offsets[1] = (int)(end_subject - start_subject);
2763 nigel 77 }
2764     match_count = PCRE_ERROR_PARTIAL;
2765     }
2766    
2767     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2768     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2769     rlevel*2-2, SP));
2770 nigel 91 break; /* In effect, "return", but see the comment below */
2771 nigel 77 }
2772    
2773     /* One or more states are active for the next character. */
2774    
2775     ptr += clen; /* Advance to next subject character */
2776     } /* Loop to move along the subject string */
2777    
2778 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2779     if we use "return" above, we have compiler trouble. Some compilers warn if
2780     there's nothing here because they think the function doesn't return a value. On
2781     the other hand, if we put a dummy statement here, some more clever compilers
2782     complain that it can't be reached. Sigh. */
2783 nigel 77
2784 nigel 91 return match_count;
2785 nigel 77 }
2786    
2787    
2788    
2789    
2790     /*************************************************
2791     * Execute a Regular Expression - DFA engine *
2792     *************************************************/
2793    
2794     /* This external function applies a compiled re to a subject string using a DFA
2795     engine. This function calls the internal function multiple times if the pattern
2796     is not anchored.
2797    
2798     Arguments:
2799     argument_re points to the compiled expression
2800 ph10 97 extra_data points to extra data or is NULL
2801 nigel 77 subject points to the subject string
2802     length length of subject string (may contain binary zeros)
2803     start_offset where to start in the subject string
2804     options option bits
2805     offsets vector of match offsets
2806     offsetcount size of same
2807     workspace workspace vector
2808     wscount size of same
2809    
2810     Returns: > 0 => number of match offset pairs placed in offsets
2811     = 0 => offsets overflowed; longest matches are present
2812     -1 => failed to match
2813     < -1 => some kind of unexpected problem
2814     */
2815    
2816 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2817 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2818     const char *subject, int length, int start_offset, int options, int *offsets,
2819     int offsetcount, int *workspace, int wscount)
2820     {
         /* Overview: validate the arguments, fill in the fixed match block, then
         call internal_dfa_exec() at successive start positions ("bumpalong")
         until it returns something other than PCRE_ERROR_NOMATCH, the match is
         anchored, or the subject is exhausted. */
2821     real_pcre *re = (real_pcre *)argument_re;
2822     dfa_match_data match_block;
2823 nigel 91 dfa_match_data *md = &match_block;
2824 nigel 77 BOOL utf8, anchored, startline, firstline;
2825     const uschar *current_subject, *end_subject, *lcc;
2826    
2827     pcre_study_data internal_study;
2828     const pcre_study_data *study = NULL;
2829     real_pcre internal_re;
2830    
2831     const uschar *req_byte_ptr;
2832     const uschar *start_bits = NULL;
2833     BOOL first_byte_caseless = FALSE;
2834     BOOL req_byte_caseless = FALSE;
2835     int first_byte = -1;
2836     int req_byte = -1;
2837     int req_byte2 = -1;
2838 nigel 91 int newline;
2839 nigel 77
2840     /* Plausibility checks */
2841    
2842     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2843     if (re == NULL || subject == NULL || workspace == NULL ||
2844     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2845     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2846     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2847 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
2848 nigel 77
2849     /* We need to find the pointer to any study data before we test for byte
2850     flipping, so we scan the extra_data block first. This may set two fields in the
2851     match block, so we must initialize them beforehand. However, the other fields
2852     in the match block must not be set until after the byte flipping. */
2853    
2854 nigel 91 md->tables = re->tables;
2855     md->callout_data = NULL;
2856 nigel 77
2857     if (extra_data != NULL)
2858     {
2859     unsigned int flags = extra_data->flags;
2860     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2861     study = (const pcre_study_data *)extra_data->study_data;
         /* Match limits are rejected by this function: either limit flag in the
         extra data causes an immediate PCRE_ERROR_DFA_UMLIMIT return. */
2862     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2863 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2864     return PCRE_ERROR_DFA_UMLIMIT;
2865 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2866 nigel 91 md->callout_data = extra_data->callout_data;
2867 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2868 nigel 91 md->tables = extra_data->tables;
2869 nigel 77 }
2870 ph10 461
2871 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
2872     test for a regex that was compiled on a host of opposite endianness. If this is
2873     the case, flipped values are put in internal_re and internal_study if there was
2874     study data too. */
2875    
2876     if (re->magic_number != MAGIC_NUMBER)
2877     {
2878     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2879     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2880     if (study != NULL) study = &internal_study;
2881     }
2882    
2883     /* Set some local values */
2884    
2885     current_subject = (const unsigned char *)subject + start_offset;
2886     end_subject = (const unsigned char *)subject + length;
         /* req_byte_ptr starts one byte before the first match position so that the
         "p > req_byte_ptr" test in the required-byte optimization below is true the
         first time round, forcing an initial search. */
2887     req_byte_ptr = current_subject - 1;
2888    
2889 nigel 91 #ifdef SUPPORT_UTF8
2890 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2891 nigel 91 #else
2892     utf8 = FALSE;
2893     #endif
2894 nigel 77
         /* A restart (PCRE_DFA_RESTART) is treated as anchored, as is anchoring
         requested at either match time or compile time. When anchored, the loop
         below returns after the first call of internal_dfa_exec(). */
2895 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2896     (re->options & PCRE_ANCHORED) != 0;
2897    
2898 nigel 77 /* The remaining fixed data for passing around. */
2899    
2900 nigel 91 md->start_code = (const uschar *)argument_re +
2901 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2902 nigel 91 md->start_subject = (const unsigned char *)subject;
2903     md->end_subject = end_subject;
2904 ph10 442 md->start_offset = start_offset;
2905 nigel 91 md->moptions = options;
2906     md->poptions = re->options;
2907 nigel 77
2908 ph10 231 /* If the BSR option is not set at match time, copy what was set
2909     at compile time. */
2910    
2911     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2912     {
2913     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2914     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2915     #ifdef BSR_ANYCRLF
2916     else md->moptions |= PCRE_BSR_ANYCRLF;
2917 ph10 243 #endif
2918     }
2919 ph10 231
2920 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2921     nothing is set at run time, whatever was used at compile time applies. */
2922 nigel 91
2923 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2924 nigel 93 PCRE_NEWLINE_BITS)
2925 nigel 91 {
2926 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2927 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2928     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2929 nigel 91 case PCRE_NEWLINE_CR+
2930 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2931 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2932 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2933 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2934 nigel 91 }
2935    
         /* Decode the value chosen above: -2 selects ANYCRLF, any other negative
         value selects ANY, a value above 255 is a two-byte newline packed as
         (first << 8) | second, and anything else is a single-byte newline. */
2936 ph10 149 if (newline == -2)
2937 nigel 91 {
2938 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2939     }
2940     else if (newline < 0)
2941     {
2942 nigel 93 md->nltype = NLTYPE_ANY;
2943 nigel 91 }
2944     else
2945     {
2946 nigel 93 md->nltype = NLTYPE_FIXED;
2947     if (newline > 255)
2948     {
2949     md->nllen = 2;
2950     md->nl[0] = (newline >> 8) & 255;
2951     md->nl[1] = newline & 255;
2952     }
2953     else
2954     {
2955     md->nllen = 1;
2956     md->nl[0] = newline;
2957     }
2958 nigel 91 }
2959    
2960 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2961     back the character offset. */
2962    
2963     #ifdef SUPPORT_UTF8
2964     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2965     {
2966     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2967     return PCRE_ERROR_BADUTF8;
         /* A byte of the form 10xxxxxx is a UTF-8 continuation byte, so a start
         offset that lands on one is in the middle of a character and is rejected.
         An offset equal to length is not inspected (there is no byte there). */
2968     if (start_offset > 0 && start_offset < length)
2969     {
2970 ph10 567 int tb = ((USPTR)subject)[start_offset] & 0xc0;
2971     if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
2972 nigel 77 }
2973     }
2974     #endif
2975    
2976     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2977     is a feature that makes it possible to save compiled regex and re-use them
2978     in other programs later. */
2979    
2980 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2981 nigel 77
2982     /* The lower casing table and the "must be at the start of a line" flag are
2983     used in a loop when finding where to start. */
2984    
2985 nigel 91 lcc = md->tables + lcc_offset;
2986 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2987 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2988    
2989     /* Set up the first character to match, if available. The first_byte value is
2990     never set for an anchored regular expression, but the anchoring may be forced
2991     at run time, so we have to test for anchoring. The first char may be unset for
2992     an unanchored pattern, of course. If there's no first char and the pattern was
2993     studied, there may be a bitmap of possible first characters. */
2994    
2995     if (!anchored)
2996     {
2997 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2998 nigel 77 {
2999     first_byte = re->first_byte & 255;
3000     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3001     first_byte = lcc[first_byte];
3002     }
3003     else
3004     {
3005 ph10 455 if (!startline && study != NULL &&
3006     (study->flags & PCRE_STUDY_MAPPED) != 0)
3007 nigel 77 start_bits = study->start_bits;
3008     }
3009     }
3010    
3011     /* For anchored or unanchored matches, there may be a "last known required
3012     character" set. */
3013    
3014 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3015 nigel 77 {
3016     req_byte = re->req_byte & 255;
3017     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3018 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
3019 nigel 77 }
3020    
3021     /* Call the main matching function, looping for a non-anchored regex after a
3022 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3023     a match. */
3024 nigel 77
3025     for (;;)
3026     {
3027     int rc;
3028    
3029     if ((options & PCRE_DFA_RESTART) == 0)
3030     {
3031     const uschar *save_end_subject = end_subject;
3032    
3033 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3034     line of a multiline string. Implement this by temporarily adjusting
3035     end_subject so that we stop scanning at a newline. If the match fails at
3036     the newline, later code breaks this loop. */
3037 nigel 77
3038     if (firstline)
3039     {
3040 ph10 365 USPTR t = current_subject;
3041     #ifdef SUPPORT_UTF8
3042     if (utf8)
3043 ph10 371 {
3044     while (t < md->end_subject && !IS_NEWLINE(t))
3045 ph10 365 {
3046     t++;
3047     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3048 ph10 371 }
3049 ph10 365 }
3050     else
3051 ph10 371 #endif
3052 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3053 nigel 77 end_subject = t;
3054     }
3055 ph10 392
3056 ph10 389 /* There are some optimizations that avoid running the match if a known
3057 ph10 455 starting point is not found. However, there is an option that disables
3058     these, for testing and for ensuring that all callouts do actually occur. */
3059 nigel 77
3060 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
3061 ph10 392 {
3062 ph10 389 /* Advance to a known first byte. */
3063 ph10 392
3064 ph10 389 if (first_byte >= 0)
3065 nigel 77 {
3066 ph10 389 if (first_byte_caseless)
3067     while (current_subject < end_subject &&
3068     lcc[*current_subject] != first_byte)
3069     current_subject++;
3070     else
3071 ph10 392 while (current_subject < end_subject &&
3072 ph10 389 *current_subject != first_byte)
3073     current_subject++;
3074     }
3075 ph10 392
3076 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3077 ph10 392
3078 ph10 389 else if (startline)
3079     {
3080     if (current_subject > md->start_subject + start_offset)
3081     {
3082 ph10 365 #ifdef SUPPORT_UTF8
3083 ph10 389 if (utf8)
3084 ph10 365 {
3085 ph10 392 while (current_subject < end_subject &&
3086 ph10 389 !WAS_NEWLINE(current_subject))
3087     {
3088 ph10 365 current_subject++;
3089 ph10 389 while(current_subject < end_subject &&
3090     (*current_subject & 0xc0) == 0x80)
3091     current_subject++;
3092     }
3093 ph10 371 }
3094 ph10 389 else
3095     #endif
3096     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3097     current_subject++;
3098 ph10 392
3099 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3100     ANYCRLF, and we are now at a LF, advance the match position by one
3101     more character. */
3102 ph10 392
3103 ph10 391 if (current_subject[-1] == CHAR_CR &&
3104 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3105     current_subject < end_subject &&
3106 ph10 391 *current_subject == CHAR_NL)
3107 ph10 389 current_subject++;
3108 ph10 365 }
3109 nigel 77 }
3110 ph10 392
3111 ph10 389 /* Or to a non-unique first char after study */
3112 ph10 392
3113 ph10 389 else if (start_bits != NULL)
3114 nigel 77 {
3115 ph10 389 while (current_subject < end_subject)
3116     {
3117     register unsigned int c = *current_subject;
3118 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3119 ph10 538 {
3120     current_subject++;
3121     #ifdef SUPPORT_UTF8
3122     if (utf8)
3123 ph10 545 while(current_subject < end_subject &&
3124 ph10 538 (*current_subject & 0xc0) == 0x80) current_subject++;
3125 ph10 545 #endif
3126 ph10 538 }
3127     else break;
3128 ph10 389 }
3129 nigel 77 }
3130 ph10 392 }
3131 nigel 77
3132     /* Restore fudged end_subject */
3133    
3134     end_subject = save_end_subject;
3135    
3136 ph10 461 /* The following two optimizations are disabled for partial matching or if
3137     disabling is explicitly requested (and of course, by the test above, this
3138 ph10 455 code is not obeyed when restarting after a partial match). */
3139 ph10 461
3140 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3141     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3142 ph10 461 {
3143 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3144     is a lower bound; no actual string of that length may actually match the
3145     pattern. Although the value is, strictly, in characters, we treat it as
3146     bytes to avoid spending too much time in this optimization. */
3147 nigel 77
3148 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3149 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3150 ph10 455 return PCRE_ERROR_NOMATCH;
3151 ph10 461
3152 ph10 455 /* If req_byte is set, we know that that character must appear in the
3153     subject for the match to succeed. If the first character is set, req_byte
3154     must be later in the subject; otherwise the test starts at the match
3155     point. This optimization can save a huge amount of work in patterns with
3156     nested unlimited repeats that aren't going to match. Writing separate
3157     code for cased/caseless versions makes it go faster, as does using an
3158     autoincrement and backing off on a match.
3159 ph10 461
3160 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3161     can take a long time, and give bad performance on quite ordinary
3162     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3163     string... so we don't do this when the string is sufficiently long. */
3164 ph10 461
3165 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3166 nigel 77 {
3167 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3168 ph10 461
3169 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3170     place we found it at last time. */
3171 ph10 461
3172 ph10 455 if (p > req_byte_ptr)
3173 nigel 77 {
3174 ph10 455 if (req_byte_caseless)
3175     {
3176     while (p < end_subject)
3177     {
3178     register int pp = *p++;
3179     if (pp == req_byte || pp == req_byte2) { p--; break; }
3180     }
3181     }
3182     else
3183     {
3184     while (p < end_subject)
3185     {
3186     if (*p++ == req_byte) { p--; break; }
3187     }
3188     }
3189 ph10 461
3190 ph10 455 /* If we can't find the required character, break the matching loop,
3191     which will cause a return of PCRE_ERROR_NOMATCH. */
3192 ph10 461
3193 ph10 455 if (p >= end_subject) break;
3194 ph10 461
3195 ph10 455 /* If we have found the required character, save the point where we
3196     found it, so that we don't search again next time round the loop if
3197     the start hasn't passed this character yet. */
3198 ph10 461
3199 ph10 455 req_byte_ptr = p;
3200 nigel 77 }
3201 ph10 461 }
3202 nigel 77 }
3203 ph10 455 } /* End of optimizations that are done when not restarting */
3204 nigel 77
3205     /* OK, now we can do the business */
3206    
3207 ph10 435 md->start_used_ptr = current_subject;
3208 ph10 461
3209 nigel 77 rc = internal_dfa_exec(
3210 nigel 91 md, /* fixed match data */
3211     md->start_code, /* this subexpression's code */
3212     current_subject, /* where we currently are */
3213     start_offset, /* start offset in subject */
3214     offsets, /* offset vector */
3215     offsetcount, /* size of same */
3216     workspace, /* workspace vector */
3217     wscount, /* size of same */
3218 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3219 nigel 91 0, /* function recurse level */
3220     0); /* regex recurse level */
3221 nigel 77
3222     /* Anything other than "no match" means we are done, always; otherwise, carry
3223     on only if not anchored. */
3224    
3225     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3226    
3227     /* Advance to the next subject character unless we are at the end of a line
3228     and firstline is set. */
3229    
3230 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3231 nigel 77 current_subject++;
3232     if (utf8)
3233     {
3234     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3235     current_subject++;
3236     }
3237     if (current_subject > end_subject) break;
3238    
3239 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3240 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3241     or ANY or ANYCRLF, advance the match position by one more character. */
3242 nigel 93
3243 ph10 391 if (current_subject[-1] == CHAR_CR &&
3244 ph10 226 current_subject < end_subject &&
3245 ph10 391 *current_subject == CHAR_NL &&
3246 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3247 ph10 226 (md->nltype == NLTYPE_ANY ||
3248     md->nltype == NLTYPE_ANYCRLF ||
3249     md->nllen == 2))
3250 nigel 93 current_subject++;
3251    
3252     } /* "Bumpalong" loop */
3253    
         /* Control arrives here only via a "break" out of the loop above: the
         required byte was not found, a newline was reached with firstline set, or
         the start position moved past the end of the subject. */
3254 nigel 77 return PCRE_ERROR_NOMATCH;
3255     }
3256    
3257     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12