/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 733 - (hide annotations) (download)
Tue Oct 11 10:29:36 2011 UTC (2 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 119553 byte(s)
Source tidies for 8.20-RC3.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 598 Copyright (c) 1997-2011 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
/* Padding string used to indent debugging output. The DPRINTF calls in this
file pass it to a "%.*s" conversion with a precision of rlevel*2-2, so only
its leading characters are ever printed.

NOTE(review): a "%.*s" conversion never prints more than strlen(SP)
characters, so with a one-character literal the indentation cannot grow past
one column however deep the recursion is. Confirm that the literal has not
been shortened by whitespace collapsing before relying on the indentation. */

#define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
/* Offsets that are added to the OP_TYPESTAR family of opcodes to turn them
into private pseudo-opcodes when the repeated item is \P/\p, \X, \R, \H/\h or
\V/\v (see the switch on the item type in internal_dfa_exec()). Each block is
20 apart, which should be enough room for one block of type-repeat opcodes.
The resulting values are never stored in compiled code, so they do not have
to fit in a byte; they are deliberately pushed well clear of the real
opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 nigel 77 1, /* Char */
126 ph10 602 1, /* Chari */
127 nigel 77 1, /* not */
128 ph10 602 1, /* noti */
129 nigel 77 /* Positive single-char repeats */
130     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131     3, 3, 3, /* upto, minupto, exact */
132 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134     3, 3, 3, /* upto I, minupto I, exact I */
135     1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 nigel 77 /* Negative single-char repeats - only for chars < 256 */
137     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* NOT upto, minupto, exact */
139 ph10 602 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140     1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141     3, 3, 3, /* NOT upto I, minupto I, exact I */
142     1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 nigel 77 /* Positive type repeats */
144     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145     3, 3, 3, /* Type upto, minupto, exact */
146 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 nigel 77 /* Character class & ref repeats */
148     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149     0, 0, /* CRRANGE, CRMINRANGE */
150     0, /* CLASS */
151     0, /* NCLASS */
152     0, /* XCLASS - variable length */
153     0, /* REF */
154 ph10 602 0, /* REFI */
155 nigel 77 0, /* RECURSE */
156     0, /* CALLOUT */
157     0, /* Alt */
158     0, /* Ket */
159     0, /* KetRmax */
160     0, /* KetRmin */
161 ph10 604 0, /* KetRpos */
162 ph10 637 0, /* Reverse */
163 nigel 77 0, /* Assert */
164     0, /* Assert not */
165     0, /* Assert behind */
166     0, /* Assert behind not */
167 ph10 723 0, 0, /* ONCE, ONCE_NC */
168     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
169 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
170 ph10 498 0, 0, /* CREF, NCREF */
171     0, 0, /* RREF, NRREF */
172 nigel 93 0, /* DEF */
173 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
174 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
175     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
176     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
177     0, 0 /* CLOSE, SKIPZERO */
178 nigel 77 };
179    
180 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
181 ph10 462 remember the fact that a character could have been inspected when the end of
182 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
183     two tables that follow must also be modified. */
184 ph10 462
185     static const uschar poptable[] = {
186     0, /* End */
187 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
188 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
189     1, 1, 1, /* Any, AllAny, Anybyte */
190 ph10 498 1, 1, /* \P, \p */
191 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
192 ph10 498 1, /* \X */
193 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
194 ph10 462 1, /* Char */
195 ph10 602 1, /* Chari */
196 ph10 462 1, /* not */
197 ph10 602 1, /* noti */
198 ph10 462 /* Positive single-char repeats */
199     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200     1, 1, 1, /* upto, minupto, exact */
201     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
202 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
203     1, 1, 1, /* upto I, minupto I, exact I */
204     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
205 ph10 462 /* Negative single-char repeats - only for chars < 256 */
206     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
207     1, 1, 1, /* NOT upto, minupto, exact */
208     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
209 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
210     1, 1, 1, /* NOT upto I, minupto I, exact I */
211     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
212 ph10 462 /* Positive type repeats */
213     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
214     1, 1, 1, /* Type upto, minupto, exact */
215     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
216     /* Character class & ref repeats */
217     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
218     1, 1, /* CRRANGE, CRMINRANGE */
219     1, /* CLASS */
220     1, /* NCLASS */
221     1, /* XCLASS - variable length */
222     0, /* REF */
223 ph10 602 0, /* REFI */
224 ph10 462 0, /* RECURSE */
225     0, /* CALLOUT */
226     0, /* Alt */
227     0, /* Ket */
228     0, /* KetRmax */
229     0, /* KetRmin */
230 ph10 604 0, /* KetRpos */
231 ph10 637 0, /* Reverse */
232 ph10 462 0, /* Assert */
233     0, /* Assert not */
234     0, /* Assert behind */
235     0, /* Assert behind not */
236 ph10 723 0, 0, /* ONCE, ONCE_NC */
237     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
238 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
239 ph10 498 0, 0, /* CREF, NCREF */
240     0, 0, /* RREF, NRREF */
241 ph10 462 0, /* DEF */
242 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
243 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
244     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
245     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
246     0, 0 /* CLOSE, SKIPZERO */
247 ph10 462 };
248    
249 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250     and \w */
251    
252 ph10 327 static const uschar toptable1[] = {
253 ph10 168 0, 0, 0, 0, 0, 0,
254 nigel 77 ctype_digit, ctype_digit,
255     ctype_space, ctype_space,
256     ctype_word, ctype_word,
257 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
258 nigel 77 };
259    
260 ph10 327 static const uschar toptable2[] = {
261 ph10 168 0, 0, 0, 0, 0, 0,
262 nigel 77 ctype_digit, 0,
263     ctype_space, 0,
264     ctype_word, 0,
265 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
266 nigel 77 };
267    
268    
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means
                                     "hold off entering state -offset" */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data; for a negative
                                     offset, characters still to be skipped */
} stateblock;

/* Number of workspace ints occupied by one stateblock. The value is cast to
int so that the workspace-size arithmetic in internal_dfa_exec(), which is
done on a signed int, is not silently converted to unsigned arithmetic by a
size_t operand. The expansion is fully parenthesized so that it can be used
safely inside larger expressions. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
281    
282    
283 ph10 475 #ifdef PCRE_DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Each byte for which
isprint() is true is written unchanged; every other byte is written as a
backslash, an 'x', and two lower-case hexadecimal digits.

The string is only read, so the pointer is const-qualified; callers that hold
a const subject pointer do not need to cast the qualifier away.

Arguments:
  p           points to string
  length      number of bytes; nothing is written if this is <= 0
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;     /* unsigned char value, so always valid for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
310     #endif
311    
312    
313    
314     /*************************************************
315     * Execute a Regular Expression - DFA engine *
316     *************************************************/
317    
318     /* This internal function applies a compiled pattern to a subject string,
319     starting at a given point, using a DFA engine. This function is called from the
320     external one, possibly multiple times if the pattern is not anchored. The
321     function calls itself recursively for some kinds of subpattern.
322    
323     Arguments:
324     md the match_data block with fixed information
325     this_start_code the opening bracket of this subexpression's code
326     current_subject where we currently are in the subject string
327     start_offset start offset in the subject string
328     offsets vector to contain the matching string offsets
329     offsetcount size of same
330     workspace vector of workspace
331     wscount size of same
332     rlevel function call recursion level
333    
334 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
335 ph10 341 = 0 => offsets overflowed; longest matches are present
336 nigel 77 -1 => failed to match
337     < -1 => some kind of unexpected problem
338    
339     The following macros are used for adding states to the two state vectors (one
340     for the current character, one for the following character). */
341    
/* Macros for adding states to the two state vectors used by
internal_dfa_exec(): ADD_ACTIVE* append to the list for the current
character, ADD_NEW* append to the list for the following character. The
_DATA variants also set the data field; the plain variants leave it unset.

Each macro relies on the locals of internal_dfa_exec() (active_count or
new_count, wscount, next_active_state or next_new_state, rlevel). If the
relevant count has already reached wscount, the macro makes the ENCLOSING
FUNCTION return PCRE_ERROR_DFA_WSSIZE; the count is incremented either way.

The arguments also appear in the DPRINTF call, so they may be evaluated more
than once: pass only expressions without side effects.

Each expansion is wrapped in do { ... } while (0) so that it is exactly one
statement that must be terminated by the caller's semicolon; stray tokens
after an invocation can no longer become part of the return expression. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
383    
384     /* And now, here is the code */
385    
386     static int
387     internal_dfa_exec(
388     dfa_match_data *md,
389     const uschar *this_start_code,
390     const uschar *current_subject,
391     int start_offset,
392     int *offsets,
393     int offsetcount,
394     int *workspace,
395     int wscount,
396 ph10 642 int rlevel)
397 nigel 77 {
398     stateblock *active_states, *new_states, *temp_states;
399     stateblock *next_active_state, *next_new_state;
400    
401     const uschar *ctypes, *lcc, *fcc;
402     const uschar *ptr;
403 nigel 93 const uschar *end_code, *first_op;
404 nigel 77
405 ph10 642 dfa_recursion_info new_recursive;
406    
407 nigel 77 int active_count, new_count, match_count;
408    
409     /* Some fields in the md block are frequently referenced, so we load them into
410     independent variables in the hope that this will perform better. */
411    
412     const uschar *start_subject = md->start_subject;
413     const uschar *end_subject = md->end_subject;
414     const uschar *start_code = md->start_code;
415    
416 nigel 87 #ifdef SUPPORT_UTF8
417 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
418 nigel 93 #else
419     BOOL utf8 = FALSE;
420 nigel 87 #endif
421 nigel 77
422     rlevel++;
423     offsetcount &= (-2);
424    
425     wscount -= 2;
426     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
427     (2 * INTS_PER_STATEBLOCK);
428    
429     DPRINTF(("\n%.*s---------------------\n"
430 ph10 642 "%.*sCall to internal_dfa_exec f=%d\n",
431     rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432 nigel 77
433     ctypes = md->tables + ctypes_offset;
434     lcc = md->tables + lcc_offset;
435     fcc = md->tables + fcc_offset;
436    
437     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
438    
439     active_states = (stateblock *)(workspace + 2);
440     next_new_state = new_states = active_states + wscount;
441     new_count = 0;
442    
443 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
444 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
446 nigel 93
447 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
448     the alternative states onto the list, and find out where the end is. This
449     makes is possible to use this function recursively, when we want to stop at a
450     matching internal ket rather than at the end.
451    
452     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
453     a backward assertion. In that case, we have to find out the maximum amount to
454     move back, and set up each alternative appropriately. */
455    
456 nigel 93 if (*first_op == OP_REVERSE)
457 nigel 77 {
458     int max_back = 0;
459     int gone_back;
460    
461     end_code = this_start_code;
462     do
463     {
464     int back = GET(end_code, 2+LINK_SIZE);
465     if (back > max_back) max_back = back;
466     end_code += GET(end_code, 1);
467     }
468     while (*end_code == OP_ALT);
469    
470     /* If we can't go back the amount required for the longest lookbehind
471     pattern, go back as far as we can; some alternatives may still be viable. */
472    
473     #ifdef SUPPORT_UTF8
474     /* In character mode we have to step back character by character */
475    
476     if (utf8)
477     {
478     for (gone_back = 0; gone_back < max_back; gone_back++)
479     {
480     if (current_subject <= start_subject) break;
481     current_subject--;
482     while (current_subject > start_subject &&
483     (*current_subject & 0xc0) == 0x80)
484     current_subject--;
485     }
486     }
487     else
488     #endif
489    
490     /* In byte-mode we can do this quickly. */
491    
492     {
493     gone_back = (current_subject - max_back < start_subject)?
494 ph10 530 (int)(current_subject - start_subject) : max_back;
495 nigel 77 current_subject -= gone_back;
496     }
497 ph10 461
498 ph10 435 /* Save the earliest consulted character */
499 nigel 77
500 ph10 461 if (current_subject < md->start_used_ptr)
501     md->start_used_ptr = current_subject;
502    
503 nigel 77 /* Now we can process the individual branches. */
504    
505     end_code = this_start_code;
506     do
507     {
508     int back = GET(end_code, 2+LINK_SIZE);
509     if (back <= gone_back)
510     {
511 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
512 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
513     }
514     end_code += GET(end_code, 1);
515     }
516     while (*end_code == OP_ALT);
517     }
518    
519     /* This is the code for a "normal" subpattern (not a backward assertion). The
520     start of a whole pattern is always one of these. If we are at the top level,
521     we may be asked to restart matching from the same point that we reached for a
522     previous partial match. We still have to scan through the top-level branches to
523     find the end state. */
524    
525     else
526     {
527     end_code = this_start_code;
528    
529     /* Restarting */
530    
531     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
532     {
533     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
534     new_count = workspace[1];
535     if (!workspace[0])
536     memcpy(new_states, active_states, new_count * sizeof(stateblock));
537     }
538    
539     /* Not restarting */
540    
541     else
542     {
543 nigel 93 int length = 1 + LINK_SIZE +
544 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
545 ph10 654 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
546 ph10 604 2:0);
547 nigel 77 do
548     {
549 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
550 nigel 77 end_code += GET(end_code, 1);
551 nigel 93 length = 1 + LINK_SIZE;
552 nigel 77 }
553     while (*end_code == OP_ALT);
554     }
555     }
556    
557     workspace[0] = 0; /* Bit indicating which vector is current */
558    
559     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
560    
561     /* Loop for scanning the subject */
562    
563     ptr = current_subject;
564     for (;;)
565     {
566     int i, j;
567 nigel 91 int clen, dlen;
568     unsigned int c, d;
569 ph10 428 int forced_fail = 0;
570 ph10 462 BOOL could_continue = FALSE;
571 nigel 77
572     /* Make the new state list into the active state list and empty the
573     new state list. */
574    
575     temp_states = active_states;
576     active_states = new_states;
577     new_states = temp_states;
578     active_count = new_count;
579     new_count = 0;
580    
581     workspace[0] ^= 1; /* Remember for the restarting feature */
582     workspace[1] = active_count;
583    
584 ph10 475 #ifdef PCRE_DEBUG
585 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
586     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
587     printf("\"\n");
588    
589     printf("%.*sActive states: ", rlevel*2-2, SP);
590     for (i = 0; i < active_count; i++)
591     printf("%d/%d ", active_states[i].offset, active_states[i].count);
592     printf("\n");
593     #endif
594    
595     /* Set the pointers for adding new states */
596    
597     next_active_state = active_states + active_count;
598     next_new_state = new_states;
599    
600     /* Load the current character from the subject outside the loop, as many
601     different states may want to look at it, and we assume that at least one
602     will. */
603    
604     if (ptr < end_subject)
605     {
606 nigel 93 clen = 1; /* Number of bytes in the character */
607 nigel 77 #ifdef SUPPORT_UTF8
608     if (utf8) { GETCHARLEN(c, ptr, clen); } else
609     #endif /* SUPPORT_UTF8 */
610     c = *ptr;
611     }
612     else
613     {
614 nigel 93 clen = 0; /* This indicates the end of the subject */
615     c = NOTACHAR; /* This value should never actually be used */
616 nigel 77 }
617    
618     /* Scan up the active states and act on each one. The result of an action
619     may be to add more states to the currently active list (e.g. on hitting a
620     parenthesis) or it may be to put states on the new list, for considering
621     when we move the character pointer on. */
622    
623     for (i = 0; i < active_count; i++)
624     {
625     stateblock *current_state = active_states + i;
626 ph10 654 BOOL caseless = FALSE;
627 nigel 77 const uschar *code;
628     int state_offset = current_state->offset;
629 ph10 397 int count, codevalue, rrc;
630 nigel 77
631 ph10 475 #ifdef PCRE_DEBUG
632 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
633 nigel 93 if (clen == 0) printf("EOL\n");
634 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
635     else printf("0x%02x\n", c);
636     #endif
637    
638     /* A negative offset is a special case meaning "hold off going to this
639     (negated) state until the number of characters in the data field have
640     been skipped". */
641    
642     if (state_offset < 0)
643     {
644     if (current_state->data > 0)
645     {
646     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
647     ADD_NEW_DATA(state_offset, current_state->count,
648     current_state->data - 1);
649     continue;
650     }
651     else
652     {
653     current_state->offset = state_offset = -state_offset;
654     }
655     }
656    
657 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
658 ph10 439 See the note at the head of this module about the possibility of improving
659     performance here. */
660 nigel 77
661     for (j = 0; j < i; j++)
662     {
663     if (active_states[j].offset == state_offset &&
664     active_states[j].count == current_state->count)
665     {
666     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
667     goto NEXT_ACTIVE_STATE;
668     }
669     }
670    
671     /* The state offset is the offset to the opcode */
672    
673     code = start_code + state_offset;
674     codevalue = *code;
675    
676 ph10 463 /* If this opcode inspects a character, but we are at the end of the
677     subject, remember the fact for use when testing for a partial match. */
678    
679 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
680 ph10 463 could_continue = TRUE;
681 ph10 462
682 nigel 77 /* If this opcode is followed by an inline character, load it. It is
683     tempting to test for the presence of a subject character here, but that
684     is wrong, because sometimes zero repetitions of the subject are
685     permitted.
686    
687     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
688 ph10 178 argument that is not a data character - but is always one byte long. We
689     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
690     this case. To keep the other cases fast, convert these ones to new opcodes.
691     */
692 nigel 77
693     if (coptable[codevalue] > 0)
694     {
695     dlen = 1;
696     #ifdef SUPPORT_UTF8
697     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
698     #endif /* SUPPORT_UTF8 */
699     d = code[coptable[codevalue]];
700     if (codevalue >= OP_TYPESTAR)
701     {
702 nigel 93 switch(d)
703     {
704     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
705     case OP_NOTPROP:
706     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
707     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
708     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
709 ph10 178 case OP_NOT_HSPACE:
710 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
711 ph10 178 case OP_NOT_VSPACE:
712 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
713 nigel 93 default: break;
714     }
715 nigel 77 }
716     }
717     else
718     {
719     dlen = 0; /* Not strictly necessary, but compilers moan */
720 nigel 93 d = NOTACHAR; /* if these variables are not set. */
721 nigel 77 }
722    
723    
724     /* Now process the individual opcodes */
725    
726     switch (codevalue)
727     {
728 ph10 498 /* ========================================================================== */
729     /* These cases are never obeyed. This is a fudge that causes a compile-
730     time error if the vectors coptable or poptable, which are indexed by
731     opcode, are not the correct length. It seems to be the only way to do
732     such a check at compile time, as the sizeof() operator does not work
733     in the C preprocessor. */
734 ph10 507
735 ph10 498 case OP_TABLE_LENGTH:
736 ph10 507 case OP_TABLE_LENGTH +
737 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
738     (sizeof(poptable) == OP_TABLE_LENGTH)):
739 ph10 507 break;
740 nigel 77
741     /* ========================================================================== */
742     /* Reached a closing bracket. If not at the end of the pattern, carry
743 ph10 654 on with the next opcode. For repeating opcodes, also add the repeat
744     state. Note that KETRPOS will always be encountered at the end of the
745     subpattern, because the possessive subpattern repeats are always handled
746 ph10 604 using recursive calls. Thus, it never adds any new states.
747 ph10 654
748 ph10 604 At the end of the (sub)pattern, unless we have an empty string and
749 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
750 ph10 442 start of the subject, save the match data, shifting up all previous
751 nigel 77 matches so we always have the longest first. */
752    
753     case OP_KET:
754     case OP_KETRMIN:
755     case OP_KETRMAX:
756 ph10 654 case OP_KETRPOS:
757 nigel 77 if (code != end_code)
758     {
759     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
760     if (codevalue != OP_KET)
761     {
762     ADD_ACTIVE(state_offset - GET(code, 1), 0);
763     }
764     }
765 ph10 461 else
766 nigel 77 {
767 ph10 461 if (ptr > current_subject ||
768 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
769     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
770     current_subject > start_subject + md->start_offset)))
771 nigel 77 {
772 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
773 ph10 680 else if (match_count > 0 && ++match_count * 2 > offsetcount)
774 ph10 428 match_count = 0;
775     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
776     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
777     if (offsetcount >= 2)
778     {
779 ph10 530 offsets[0] = (int)(current_subject - start_subject);
780     offsets[1] = (int)(ptr - start_subject);
781 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
782     offsets[1] - offsets[0], current_subject));
783     }
784     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
785     {
786     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
787     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
788     match_count, rlevel*2-2, SP));
789     return match_count;
790     }
791 ph10 461 }
792 nigel 77 }
793     break;
794    
795     /* ========================================================================== */
796     /* These opcodes add to the current list of states without looking
797     at the current character. */
798    
799     /*-----------------------------------------------------------------*/
800     case OP_ALT:
801     do { code += GET(code, 1); } while (*code == OP_ALT);
802 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
803 nigel 77 break;
804    
805     /*-----------------------------------------------------------------*/
806     case OP_BRA:
807 nigel 93 case OP_SBRA:
808 nigel 77 do
809     {
810 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
811 nigel 77 code += GET(code, 1);
812     }
813     while (*code == OP_ALT);
814     break;
815    
816     /*-----------------------------------------------------------------*/
817 nigel 93 case OP_CBRA:
818     case OP_SCBRA:
819 ph10 530 ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
820 nigel 93 code += GET(code, 1);
821     while (*code == OP_ALT)
822     {
823 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
824 nigel 93 code += GET(code, 1);
825     }
826     break;
827    
828     /*-----------------------------------------------------------------*/
829 nigel 77 case OP_BRAZERO:
830     case OP_BRAMINZERO:
831     ADD_ACTIVE(state_offset + 1, 0);
832     code += 1 + GET(code, 2);
833     while (*code == OP_ALT) code += GET(code, 1);
834 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
835 nigel 77 break;
836    
837     /*-----------------------------------------------------------------*/
838 ph10 335 case OP_SKIPZERO:
839     code += 1 + GET(code, 2);
840     while (*code == OP_ALT) code += GET(code, 1);
841 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
842 ph10 335 break;
843    
844     /*-----------------------------------------------------------------*/
845 nigel 77 case OP_CIRC:
846 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
847     { ADD_ACTIVE(state_offset + 1, 0); }
848     break;
849    
850     /*-----------------------------------------------------------------*/
851     case OP_CIRCM:
852 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
853 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
854 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
855     break;
856    
857     /*-----------------------------------------------------------------*/
858     case OP_EOD:
859 ph10 579 if (ptr >= end_subject)
860     {
861 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
862     could_continue = TRUE;
863     else { ADD_ACTIVE(state_offset + 1, 0); }
864     }
865 nigel 77 break;
866    
867     /*-----------------------------------------------------------------*/
868     case OP_SOD:
869     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
870     break;
871    
872     /*-----------------------------------------------------------------*/
873     case OP_SOM:
874     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
875     break;
876    
877    
878     /* ========================================================================== */
879     /* These opcodes inspect the next subject character, and sometimes
880     the previous one as well, but do not have an argument. The variable
881     clen contains the length of the current character and is zero if we are
882     at the end of the subject. */
883    
884     /*-----------------------------------------------------------------*/
885     case OP_ANY:
886 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
887 nigel 77 { ADD_NEW(state_offset + 1, 0); }
888     break;
889    
890     /*-----------------------------------------------------------------*/
891 ph10 341 case OP_ALLANY:
892     if (clen > 0)
893     { ADD_NEW(state_offset + 1, 0); }
894     break;
895    
896     /*-----------------------------------------------------------------*/
897 nigel 77 case OP_EODN:
898 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
899     could_continue = TRUE;
900     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
901 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
902     break;
903    
904     /*-----------------------------------------------------------------*/
905     case OP_DOLL:
906     if ((md->moptions & PCRE_NOTEOL) == 0)
907     {
908 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
909     could_continue = TRUE;
910     else if (clen == 0 ||
911 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
912 ph10 602 (ptr == end_subject - md->nllen)
913 nigel 91 ))
914 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
915     }
916 ph10 602 break;
917    
918     /*-----------------------------------------------------------------*/
919     case OP_DOLLM:
920     if ((md->moptions & PCRE_NOTEOL) == 0)
921     {
922     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
923     could_continue = TRUE;
924     else if (clen == 0 ||
925     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
926     { ADD_ACTIVE(state_offset + 1, 0); }
927     }
928     else if (IS_NEWLINE(ptr))
929 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
930     break;
931    
932     /*-----------------------------------------------------------------*/
933    
934     case OP_DIGIT:
935     case OP_WHITESPACE:
936     case OP_WORDCHAR:
937     if (clen > 0 && c < 256 &&
938     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
939     { ADD_NEW(state_offset + 1, 0); }
940     break;
941    
942     /*-----------------------------------------------------------------*/
943     case OP_NOT_DIGIT:
944     case OP_NOT_WHITESPACE:
945     case OP_NOT_WORDCHAR:
946     if (clen > 0 && (c >= 256 ||
947     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
948     { ADD_NEW(state_offset + 1, 0); }
949     break;
950    
951     /*-----------------------------------------------------------------*/
952     case OP_WORD_BOUNDARY:
953     case OP_NOT_WORD_BOUNDARY:
954     {
955     int left_word, right_word;
956    
957     if (ptr > start_subject)
958     {
959     const uschar *temp = ptr - 1;
960 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
961 nigel 77 #ifdef SUPPORT_UTF8
962     if (utf8) BACKCHAR(temp);
963     #endif
964     GETCHARTEST(d, temp);
965 ph10 535 #ifdef SUPPORT_UCP
966 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
967     {
968     if (d == '_') left_word = TRUE; else
969 ph10 535 {
970 ph10 518 int cat = UCD_CATEGORY(d);
971     left_word = (cat == ucp_L || cat == ucp_N);
972 ph10 535 }
973     }
974     else
975     #endif
976 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
977     }
978 ph10 518 else left_word = FALSE;
979 nigel 77
980 ph10 461 if (clen > 0)
981 ph10 535 {
982     #ifdef SUPPORT_UCP
983 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
984     {
985     if (c == '_') right_word = TRUE; else
986 ph10 535 {
987 ph10 518 int cat = UCD_CATEGORY(c);
988     right_word = (cat == ucp_L || cat == ucp_N);
989 ph10 535 }
990     }
991     else
992     #endif
993 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
994 ph10 535 }
995 ph10 518 else right_word = FALSE;
996 nigel 77
997     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
998     { ADD_ACTIVE(state_offset + 1, 0); }
999     }
1000     break;
1001    
1002    
1003     /*-----------------------------------------------------------------*/
1004     /* Check the next character by Unicode property. We will get here only
1005     if the support is in the binary; otherwise a compile-time error occurs.
1006     */
1007    
1008 ph10 151 #ifdef SUPPORT_UCP
1009 nigel 77 case OP_PROP:
1010     case OP_NOTPROP:
1011     if (clen > 0)
1012     {
1013 nigel 87 BOOL OK;
1014 ph10 349 const ucd_record * prop = GET_UCD(c);
1015 nigel 87 switch(code[1])
1016 nigel 77 {
1017 nigel 87 case PT_ANY:
1018     OK = TRUE;
1019     break;
1020    
1021     case PT_LAMP:
1022 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1023 ph10 517 prop->chartype == ucp_Lt;
1024 nigel 87 break;
1025    
1026     case PT_GC:
1027 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1028 nigel 87 break;
1029    
1030     case PT_PC:
1031 ph10 349 OK = prop->chartype == code[2];
1032 nigel 87 break;
1033    
1034     case PT_SC:
1035 ph10 349 OK = prop->script == code[2];
1036 nigel 87 break;
1037 ph10 535
1038 ph10 517 /* These are specials for combination cases. */
1039 ph10 535
1040 ph10 517 case PT_ALNUM:
1041     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1042     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1043 ph10 535 break;
1044    
1045 ph10 517 case PT_SPACE: /* Perl space */
1046     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1047     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1048 ph10 535 break;
1049    
1050 ph10 517 case PT_PXSPACE: /* POSIX space */
1051     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1052     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1053     c == CHAR_FF || c == CHAR_CR;
1054 ph10 535 break;
1055    
1056 ph10 517 case PT_WORD:
1057     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1058     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1059     c == CHAR_UNDERSCORE;
1060 ph10 535 break;
1061 nigel 87
1062     /* Should never occur, but keep compilers from grumbling. */
1063    
1064     default:
1065     OK = codevalue != OP_PROP;
1066     break;
1067 nigel 77 }
1068 nigel 87
1069     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1070 nigel 77 }
1071     break;
1072     #endif
1073    
1074    
1075    
1076     /* ========================================================================== */
1077     /* These opcodes likewise inspect the subject character, but have an
1078     argument that is not a data character. It is one of these opcodes:
1079 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1080     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1081 nigel 77
1082     case OP_TYPEPLUS:
1083     case OP_TYPEMINPLUS:
1084 nigel 93 case OP_TYPEPOSPLUS:
1085 nigel 77 count = current_state->count; /* Already matched */
1086     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1087     if (clen > 0)
1088     {
1089     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1090     (c < 256 &&
1091 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1092 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1093     {
1094 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1095     {
1096     active_count--; /* Remove non-match possibility */
1097     next_active_state--;
1098     }
1099 nigel 77 count++;
1100     ADD_NEW(state_offset, count);
1101     }
1102     }
1103     break;
1104    
1105     /*-----------------------------------------------------------------*/
1106     case OP_TYPEQUERY:
1107     case OP_TYPEMINQUERY:
1108 nigel 93 case OP_TYPEPOSQUERY:
1109 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1110     if (clen > 0)
1111     {
1112     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1113     (c < 256 &&
1114 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1115 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1116     {
1117 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1118     {
1119     active_count--; /* Remove non-match possibility */
1120     next_active_state--;
1121     }
1122 nigel 77 ADD_NEW(state_offset + 2, 0);
1123     }
1124     }
1125     break;
1126    
1127     /*-----------------------------------------------------------------*/
1128     case OP_TYPESTAR:
1129     case OP_TYPEMINSTAR:
1130 nigel 93 case OP_TYPEPOSSTAR:
1131 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1132     if (clen > 0)
1133     {
1134     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1135     (c < 256 &&
1136 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1137 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1138     {
1139 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1140     {
1141     active_count--; /* Remove non-match possibility */
1142     next_active_state--;
1143     }
1144 nigel 77 ADD_NEW(state_offset, 0);
1145     }
1146     }
1147     break;
1148    
1149     /*-----------------------------------------------------------------*/
1150     case OP_TYPEEXACT:
1151 nigel 93 count = current_state->count; /* Number already matched */
1152     if (clen > 0)
1153     {
1154     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1155     (c < 256 &&
1156 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1157 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1158     {
1159     if (++count >= GET2(code, 1))
1160     { ADD_NEW(state_offset + 4, 0); }
1161     else
1162     { ADD_NEW(state_offset, count); }
1163     }
1164     }
1165     break;
1166    
1167     /*-----------------------------------------------------------------*/
1168 nigel 77 case OP_TYPEUPTO:
1169     case OP_TYPEMINUPTO:
1170 nigel 93 case OP_TYPEPOSUPTO:
1171     ADD_ACTIVE(state_offset + 4, 0);
1172 nigel 77 count = current_state->count; /* Number already matched */
1173     if (clen > 0)
1174     {
1175     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1176     (c < 256 &&
1177 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1178 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1179     {
1180 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1181     {
1182     active_count--; /* Remove non-match possibility */
1183     next_active_state--;
1184     }
1185 nigel 77 if (++count >= GET2(code, 1))
1186     { ADD_NEW(state_offset + 4, 0); }
1187     else
1188     { ADD_NEW(state_offset, count); }
1189     }
1190     }
1191     break;
1192    
1193     /* ========================================================================== */
1194     /* These are virtual opcodes that are used when something like
1195 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or OP_HSPACE as its
1196     argument. It keeps the code above fast for the other cases. The argument
1197     is in the d variable. */
1198 nigel 77
1199 ph10 151 #ifdef SUPPORT_UCP
1200 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1201     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1202 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1203 nigel 77 count = current_state->count; /* Already matched */
1204 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1205 nigel 77 if (clen > 0)
1206     {
1207 nigel 87 BOOL OK;
1208 ph10 349 const ucd_record * prop = GET_UCD(c);
1209 nigel 87 switch(code[2])
1210     {
1211     case PT_ANY:
1212     OK = TRUE;
1213     break;
1214    
1215     case PT_LAMP:
1216 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1217 ph10 517 prop->chartype == ucp_Lt;
1218 nigel 87 break;
1219    
1220     case PT_GC:
1221 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1222 nigel 87 break;
1223    
1224     case PT_PC:
1225 ph10 349 OK = prop->chartype == code[3];
1226 nigel 87 break;
1227    
1228     case PT_SC:
1229 ph10 349 OK = prop->script == code[3];
1230 nigel 87 break;
1231    
1232 ph10 517 /* These are specials for combination cases. */
1233 ph10 535
1234 ph10 517 case PT_ALNUM:
1235     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1236     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1237 ph10 535 break;
1238    
1239 ph10 517 case PT_SPACE: /* Perl space */
1240     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1241     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1242 ph10 535 break;
1243    
1244 ph10 517 case PT_PXSPACE: /* POSIX space */
1245     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1246     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1247     c == CHAR_FF || c == CHAR_CR;
1248 ph10 535 break;
1249    
1250 ph10 517 case PT_WORD:
1251     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1252     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1253     c == CHAR_UNDERSCORE;
1254 ph10 535 break;
1255 ph10 517
1256 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1257    
1258     default:
1259     OK = codevalue != OP_PROP;
1260     break;
1261     }
1262    
1263 nigel 93 if (OK == (d == OP_PROP))
1264     {
1265     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1266     {
1267     active_count--; /* Remove non-match possibility */
1268     next_active_state--;
1269     }
1270     count++;
1271     ADD_NEW(state_offset, count);
1272     }
1273 nigel 77 }
1274     break;
1275    
1276     /*-----------------------------------------------------------------*/
1277     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1278     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1279 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1280 nigel 77 count = current_state->count; /* Already matched */
1281     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1282 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1283 nigel 77 {
1284     const uschar *nptr = ptr + clen;
1285     int ncount = 0;
1286 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1287     {
1288     active_count--; /* Remove non-match possibility */
1289     next_active_state--;
1290     }
1291 nigel 77 while (nptr < end_subject)
1292     {
1293     int nd;
1294     int ndlen = 1;
1295     GETCHARLEN(nd, nptr, ndlen);
1296 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1297 nigel 77 ncount++;
1298     nptr += ndlen;
1299     }
1300     count++;
1301     ADD_NEW_DATA(-state_offset, count, ncount);
1302     }
1303     break;
1304 ph10 151 #endif
1305 nigel 77
1306     /*-----------------------------------------------------------------*/
1307 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1308     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1309     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1310     count = current_state->count; /* Already matched */
1311     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1312     if (clen > 0)
1313     {
1314     int ncount = 0;
1315     switch (c)
1316     {
1317     case 0x000b:
1318     case 0x000c:
1319     case 0x0085:
1320     case 0x2028:
1321     case 0x2029:
1322 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323     goto ANYNL01;
1324    
1325     case 0x000d:
1326     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327     /* Fall through */
1328    
1329     ANYNL01:
1330     case 0x000a:
1331 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1332     {
1333     active_count--; /* Remove non-match possibility */
1334     next_active_state--;
1335     }
1336     count++;
1337     ADD_NEW_DATA(-state_offset, count, ncount);
1338     break;
1339 ph10 231
1340 nigel 93 default:
1341     break;
1342     }
1343     }
1344     break;
1345    
1346     /*-----------------------------------------------------------------*/
1347 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1348     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1349     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1350     count = current_state->count; /* Already matched */
1351     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1352     if (clen > 0)
1353     {
1354 ph10 182 BOOL OK;
1355 ph10 178 switch (c)
1356     {
1357     case 0x000a:
1358     case 0x000b:
1359     case 0x000c:
1360     case 0x000d:
1361     case 0x0085:
1362     case 0x2028:
1363     case 0x2029:
1364     OK = TRUE;
1365 ph10 182 break;
1366 ph10 178
1367     default:
1368     OK = FALSE;
1369 ph10 182 break;
1370 ph10 178 }
1371    
1372     if (OK == (d == OP_VSPACE))
1373 ph10 182 {
1374 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1375     {
1376     active_count--; /* Remove non-match possibility */
1377     next_active_state--;
1378     }
1379     count++;
1380     ADD_NEW_DATA(-state_offset, count, 0);
1381     }
1382     }
1383     break;
1384    
1385     /*-----------------------------------------------------------------*/
1386     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1387     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1388     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1389     count = current_state->count; /* Already matched */
1390     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1391     if (clen > 0)
1392     {
1393 ph10 182 BOOL OK;
1394 ph10 178 switch (c)
1395     {
1396     case 0x09: /* HT */
1397     case 0x20: /* SPACE */
1398     case 0xa0: /* NBSP */
1399     case 0x1680: /* OGHAM SPACE MARK */
1400     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1401     case 0x2000: /* EN QUAD */
1402     case 0x2001: /* EM QUAD */
1403     case 0x2002: /* EN SPACE */
1404     case 0x2003: /* EM SPACE */
1405     case 0x2004: /* THREE-PER-EM SPACE */
1406     case 0x2005: /* FOUR-PER-EM SPACE */
1407     case 0x2006: /* SIX-PER-EM SPACE */
1408     case 0x2007: /* FIGURE SPACE */
1409     case 0x2008: /* PUNCTUATION SPACE */
1410     case 0x2009: /* THIN SPACE */
1411     case 0x200A: /* HAIR SPACE */
1412     case 0x202f: /* NARROW NO-BREAK SPACE */
1413     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1414     case 0x3000: /* IDEOGRAPHIC SPACE */
1415     OK = TRUE;
1416     break;
1417 ph10 182
1418 ph10 178 default:
1419     OK = FALSE;
1420     break;
1421     }
1422 ph10 182
1423 ph10 178 if (OK == (d == OP_HSPACE))
1424 ph10 182 {
1425 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1426     {
1427     active_count--; /* Remove non-match possibility */
1428     next_active_state--;
1429     }
1430     count++;
1431     ADD_NEW_DATA(-state_offset, count, 0);
1432     }
1433     }
1434     break;
1435    
1436     /*-----------------------------------------------------------------*/
1437 ph10 151 #ifdef SUPPORT_UCP
1438 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1439     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1440 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1441 nigel 87 count = 4;
1442 nigel 77 goto QS1;
1443    
1444     case OP_PROP_EXTRA + OP_TYPESTAR:
1445     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1446 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1447 nigel 77 count = 0;
1448    
1449     QS1:
1450    
1451 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1452 nigel 77 if (clen > 0)
1453     {
1454 nigel 87 BOOL OK;
1455 ph10 349 const ucd_record * prop = GET_UCD(c);
1456 nigel 87 switch(code[2])
1457     {
1458     case PT_ANY:
1459     OK = TRUE;
1460     break;
1461    
1462     case PT_LAMP:
1463 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1464 ph10 517 prop->chartype == ucp_Lt;
1465 nigel 87 break;
1466    
1467     case PT_GC:
1468 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1469 nigel 87 break;
1470    
1471     case PT_PC:
1472 ph10 349 OK = prop->chartype == code[3];
1473 nigel 87 break;
1474    
1475     case PT_SC:
1476 ph10 349 OK = prop->script == code[3];
1477 nigel 87 break;
1478 ph10 535
1479 ph10 517 /* These are specials for combination cases. */
1480 ph10 535
1481 ph10 517 case PT_ALNUM:
1482     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1483     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1484 ph10 535 break;
1485    
1486 ph10 517 case PT_SPACE: /* Perl space */
1487     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1488     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1489 ph10 535 break;
1490    
1491 ph10 517 case PT_PXSPACE: /* POSIX space */
1492     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1493     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1494     c == CHAR_FF || c == CHAR_CR;
1495 ph10 535 break;
1496    
1497 ph10 517 case PT_WORD:
1498     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1499     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1500     c == CHAR_UNDERSCORE;
1501 ph10 535 break;
1502 nigel 87
1503     /* Should never occur, but keep compilers from grumbling. */
1504    
1505     default:
1506     OK = codevalue != OP_PROP;
1507     break;
1508     }
1509    
1510 nigel 93 if (OK == (d == OP_PROP))
1511     {
1512     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1513     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1514     {
1515     active_count--; /* Remove non-match possibility */
1516     next_active_state--;
1517     }
1518     ADD_NEW(state_offset + count, 0);
1519     }
1520 nigel 77 }
1521     break;
1522    
1523     /*-----------------------------------------------------------------*/
1524     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1525     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1526 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1527 nigel 77 count = 2;
1528     goto QS2;
1529    
1530     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1531     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1532 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1533 nigel 77 count = 0;
1534    
1535     QS2:
1536    
1537     ADD_ACTIVE(state_offset + 2, 0);
1538 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1539 nigel 77 {
1540     const uschar *nptr = ptr + clen;
1541     int ncount = 0;
1542 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1543     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1544     {
1545     active_count--; /* Remove non-match possibility */
1546     next_active_state--;
1547     }
1548 nigel 77 while (nptr < end_subject)
1549     {
1550     int nd;
1551     int ndlen = 1;
1552     GETCHARLEN(nd, nptr, ndlen);
1553 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1554 nigel 77 ncount++;
1555     nptr += ndlen;
1556     }
1557     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1558     }
1559     break;
1560 ph10 151 #endif
1561 nigel 77
1562     /*-----------------------------------------------------------------*/
1563 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1564     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1565     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1566     count = 2;
1567     goto QS3;
1568    
1569     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1570     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1571     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1572     count = 0;
1573    
1574     QS3:
1575     ADD_ACTIVE(state_offset + 2, 0);
1576     if (clen > 0)
1577     {
1578     int ncount = 0;
1579     switch (c)
1580     {
1581     case 0x000b:
1582     case 0x000c:
1583     case 0x0085:
1584     case 0x2028:
1585     case 0x2029:
1586 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1587     goto ANYNL02;
1588    
1589     case 0x000d:
1590     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1591     /* Fall through */
1592    
1593     ANYNL02:
1594     case 0x000a:
1595 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1596     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1597     {
1598     active_count--; /* Remove non-match possibility */
1599     next_active_state--;
1600     }
1601     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1602     break;
1603 ph10 231
1604 nigel 93 default:
1605     break;
1606     }
1607     }
1608     break;
1609    
1610     /*-----------------------------------------------------------------*/
1611 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1612     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1613     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1614     count = 2;
1615     goto QS4;
1616    
1617     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1618     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1619     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1620     count = 0;
1621    
1622     QS4:
1623     ADD_ACTIVE(state_offset + 2, 0);
1624     if (clen > 0)
1625     {
1626 ph10 182 BOOL OK;
1627 ph10 178 switch (c)
1628     {
1629     case 0x000a:
1630     case 0x000b:
1631     case 0x000c:
1632     case 0x000d:
1633     case 0x0085:
1634     case 0x2028:
1635     case 0x2029:
1636     OK = TRUE;
1637     break;
1638 ph10 182
1639 ph10 178 default:
1640     OK = FALSE;
1641     break;
1642     }
1643     if (OK == (d == OP_VSPACE))
1644 ph10 182 {
1645 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1646     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1647     {
1648     active_count--; /* Remove non-match possibility */
1649     next_active_state--;
1650     }
1651     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1652     }
1653     }
1654     break;
1655    
1656     /*-----------------------------------------------------------------*/
1657     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1658     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1659     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1660     count = 2;
1661     goto QS5;
1662    
1663     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1664     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1665     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1666     count = 0;
1667    
1668     QS5:
1669     ADD_ACTIVE(state_offset + 2, 0);
1670     if (clen > 0)
1671     {
1672 ph10 182 BOOL OK;
1673 ph10 178 switch (c)
1674     {
1675     case 0x09: /* HT */
1676     case 0x20: /* SPACE */
1677     case 0xa0: /* NBSP */
1678     case 0x1680: /* OGHAM SPACE MARK */
1679     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1680     case 0x2000: /* EN QUAD */
1681     case 0x2001: /* EM QUAD */
1682     case 0x2002: /* EN SPACE */
1683     case 0x2003: /* EM SPACE */
1684     case 0x2004: /* THREE-PER-EM SPACE */
1685     case 0x2005: /* FOUR-PER-EM SPACE */
1686     case 0x2006: /* SIX-PER-EM SPACE */
1687     case 0x2007: /* FIGURE SPACE */
1688     case 0x2008: /* PUNCTUATION SPACE */
1689     case 0x2009: /* THIN SPACE */
1690     case 0x200A: /* HAIR SPACE */
1691     case 0x202f: /* NARROW NO-BREAK SPACE */
1692     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1693     case 0x3000: /* IDEOGRAPHIC SPACE */
1694     OK = TRUE;
1695     break;
1696 ph10 182
1697 ph10 178 default:
1698     OK = FALSE;
1699     break;
1700     }
1701 ph10 182
1702 ph10 178 if (OK == (d == OP_HSPACE))
1703 ph10 182 {
1704 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1705     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1706     {
1707     active_count--; /* Remove non-match possibility */
1708     next_active_state--;
1709     }
1710     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1711     }
1712     }
1713     break;
1714    
1715     /*-----------------------------------------------------------------*/
1716 ph10 151 #ifdef SUPPORT_UCP
1717 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1718     case OP_PROP_EXTRA + OP_TYPEUPTO:
1719     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1720 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1721 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1722 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1723 nigel 77 count = current_state->count; /* Number already matched */
1724     if (clen > 0)
1725     {
1726 nigel 87 BOOL OK;
1727 ph10 349 const ucd_record * prop = GET_UCD(c);
1728 nigel 87 switch(code[4])
1729 nigel 77 {
1730 nigel 87 case PT_ANY:
1731     OK = TRUE;
1732     break;
1733    
1734     case PT_LAMP:
1735 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1736 ph10 517 prop->chartype == ucp_Lt;
1737 nigel 87 break;
1738    
1739     case PT_GC:
1740 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1741 nigel 87 break;
1742    
1743     case PT_PC:
1744 ph10 349 OK = prop->chartype == code[5];
1745 nigel 87 break;
1746    
1747     case PT_SC:
1748 ph10 349 OK = prop->script == code[5];
1749 nigel 87 break;
1750 ph10 535
1751 ph10 517 /* These are specials for combination cases. */
1752 ph10 535
1753 ph10 517 case PT_ALNUM:
1754     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1755     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1756 ph10 535 break;
1757    
1758 ph10 517 case PT_SPACE: /* Perl space */
1759     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1760     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1761 ph10 535 break;
1762    
1763 ph10 517 case PT_PXSPACE: /* POSIX space */
1764     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1765     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1766     c == CHAR_FF || c == CHAR_CR;
1767 ph10 535 break;
1768    
1769 ph10 517 case PT_WORD:
1770     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1771     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1772     c == CHAR_UNDERSCORE;
1773 ph10 535 break;
1774 nigel 87
1775     /* Should never occur, but keep compilers from grumbling. */
1776    
1777     default:
1778     OK = codevalue != OP_PROP;
1779     break;
1780     }
1781    
1782     if (OK == (d == OP_PROP))
1783     {
1784 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1785     {
1786     active_count--; /* Remove non-match possibility */
1787     next_active_state--;
1788     }
1789 nigel 77 if (++count >= GET2(code, 1))
1790 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1791 nigel 77 else
1792     { ADD_NEW(state_offset, count); }
1793     }
1794     }
1795     break;
1796    
1797     /*-----------------------------------------------------------------*/
1798     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1799     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1800     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1801 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1802 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1803     { ADD_ACTIVE(state_offset + 4, 0); }
1804     count = current_state->count; /* Number already matched */
1805 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1806 nigel 77 {
1807     const uschar *nptr = ptr + clen;
1808     int ncount = 0;
1809 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1810     {
1811     active_count--; /* Remove non-match possibility */
1812     next_active_state--;
1813     }
1814 nigel 77 while (nptr < end_subject)
1815     {
1816     int nd;
1817     int ndlen = 1;
1818     GETCHARLEN(nd, nptr, ndlen);
1819 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1820 nigel 77 ncount++;
1821     nptr += ndlen;
1822     }
1823     if (++count >= GET2(code, 1))
1824     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1825     else
1826     { ADD_NEW_DATA(-state_offset, count, ncount); }
1827     }
1828     break;
1829 ph10 151 #endif
1830 nigel 77
1831 nigel 93 /*-----------------------------------------------------------------*/
1832     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1833     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1834     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1835     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1836     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1837     { ADD_ACTIVE(state_offset + 4, 0); }
1838     count = current_state->count; /* Number already matched */
1839     if (clen > 0)
1840     {
1841     int ncount = 0;
1842     switch (c)
1843     {
1844     case 0x000b:
1845     case 0x000c:
1846     case 0x0085:
1847     case 0x2028:
1848     case 0x2029:
1849 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1850     goto ANYNL03;
1851    
1852     case 0x000d:
1853     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1854     /* Fall through */
1855    
1856     ANYNL03:
1857     case 0x000a:
1858 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1859     {
1860     active_count--; /* Remove non-match possibility */
1861     next_active_state--;
1862     }
1863     if (++count >= GET2(code, 1))
1864     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1865     else
1866     { ADD_NEW_DATA(-state_offset, count, ncount); }
1867     break;
1868 ph10 231
1869 nigel 93 default:
1870     break;
1871     }
1872     }
1873     break;
1874    
1875 ph10 178 /*-----------------------------------------------------------------*/
1876     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1877     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1878     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1879     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1880     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1881     { ADD_ACTIVE(state_offset + 4, 0); }
1882     count = current_state->count; /* Number already matched */
1883     if (clen > 0)
1884     {
1885 ph10 182 BOOL OK;
1886 ph10 178 switch (c)
1887     {
1888     case 0x000a:
1889     case 0x000b:
1890     case 0x000c:
1891     case 0x000d:
1892     case 0x0085:
1893     case 0x2028:
1894     case 0x2029:
1895     OK = TRUE;
1896     break;
1897 ph10 182
1898 ph10 178 default:
1899     OK = FALSE;
1900     }
1901 ph10 182
1902 ph10 178 if (OK == (d == OP_VSPACE))
1903 ph10 182 {
1904 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1905     {
1906     active_count--; /* Remove non-match possibility */
1907     next_active_state--;
1908     }
1909     if (++count >= GET2(code, 1))
1910     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1911     else
1912     { ADD_NEW_DATA(-state_offset, count, 0); }
1913     }
1914     }
1915     break;
1916    
1917     /*-----------------------------------------------------------------*/
1918     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1919     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1920     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1921     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1922     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1923     { ADD_ACTIVE(state_offset + 4, 0); }
1924     count = current_state->count; /* Number already matched */
1925     if (clen > 0)
1926     {
1927 ph10 182 BOOL OK;
1928 ph10 178 switch (c)
1929     {
1930     case 0x09: /* HT */
1931     case 0x20: /* SPACE */
1932     case 0xa0: /* NBSP */
1933     case 0x1680: /* OGHAM SPACE MARK */
1934     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1935     case 0x2000: /* EN QUAD */
1936     case 0x2001: /* EM QUAD */
1937     case 0x2002: /* EN SPACE */
1938     case 0x2003: /* EM SPACE */
1939     case 0x2004: /* THREE-PER-EM SPACE */
1940     case 0x2005: /* FOUR-PER-EM SPACE */
1941     case 0x2006: /* SIX-PER-EM SPACE */
1942     case 0x2007: /* FIGURE SPACE */
1943     case 0x2008: /* PUNCTUATION SPACE */
1944     case 0x2009: /* THIN SPACE */
1945     case 0x200A: /* HAIR SPACE */
1946     case 0x202f: /* NARROW NO-BREAK SPACE */
1947     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1948     case 0x3000: /* IDEOGRAPHIC SPACE */
1949     OK = TRUE;
1950     break;
1951 ph10 182
1952 ph10 178 default:
1953     OK = FALSE;
1954     break;
1955     }
1956 ph10 182
1957 ph10 178 if (OK == (d == OP_HSPACE))
1958 ph10 182 {
1959 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1960     {
1961     active_count--; /* Remove non-match possibility */
1962     next_active_state--;
1963     }
1964     if (++count >= GET2(code, 1))
1965     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1966     else
1967     { ADD_NEW_DATA(-state_offset, count, 0); }
1968     }
1969     }
1970     break;
1971    
1972 nigel 77 /* ========================================================================== */
1973     /* These opcodes are followed by a character that is usually compared
1974     to the current subject character; it is loaded into d. We still get
1975     here even if there is no subject character, because in some cases zero
1976     repetitions are permitted. */
1977    
1978     /*-----------------------------------------------------------------*/
1979     case OP_CHAR:
1980     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1981     break;
1982    
1983     /*-----------------------------------------------------------------*/
1984 ph10 602 case OP_CHARI:
1985 nigel 77 if (clen == 0) break;
1986    
1987     #ifdef SUPPORT_UTF8
1988     if (utf8)
1989     {
1990     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1991     {
1992 nigel 93 unsigned int othercase;
1993 nigel 77 if (c < 128) othercase = fcc[c]; else
1994    
1995     /* If we have Unicode property support, we can use it to test the
1996 nigel 87 other case of the character. */
1997 nigel 77
1998     #ifdef SUPPORT_UCP
1999 ph10 349 othercase = UCD_OTHERCASE(c);
2000 nigel 87 #else
2001 nigel 93 othercase = NOTACHAR;
2002 nigel 77 #endif
2003    
2004     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2005     }
2006     }
2007     else
2008     #endif /* SUPPORT_UTF8 */
2009    
2010     /* Non-UTF-8 mode */
2011     {
2012     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2013     }
2014     break;
2015    
2016    
2017     #ifdef SUPPORT_UCP
2018     /*-----------------------------------------------------------------*/
2019     /* This is a tricky one because it can match more than one character.
2020     Find out how many characters to skip, and then set up a negative state
2021     to wait for them to pass before continuing. */
2022    
2023     case OP_EXTUNI:
2024 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2025 nigel 77 {
2026     const uschar *nptr = ptr + clen;
2027     int ncount = 0;
2028     while (nptr < end_subject)
2029     {
2030     int nclen = 1;
2031     GETCHARLEN(c, nptr, nclen);
2032 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
2033 nigel 77 ncount++;
2034     nptr += nclen;
2035     }
2036     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2037     }
2038     break;
2039     #endif
2040    
2041     /*-----------------------------------------------------------------*/
2042 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
2043     character (when CR is followed by LF). In this case, set up a negative
2044     state to wait for one character to pass before continuing. */
2045    
2046     case OP_ANYNL:
2047     if (clen > 0) switch(c)
2048     {
2049     case 0x000b:
2050     case 0x000c:
2051     case 0x0085:
2052     case 0x2028:
2053     case 0x2029:
2054 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2055    
2056     case 0x000a:
2057 nigel 93 ADD_NEW(state_offset + 1, 0);
2058     break;
2059 ph10 231
2060 nigel 93 case 0x000d:
2061     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2062     {
2063     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2064     }
2065     else
2066     {
2067     ADD_NEW(state_offset + 1, 0);
2068     }
2069     break;
2070     }
2071     break;
2072    
2073     /*-----------------------------------------------------------------*/
2074 ph10 178 case OP_NOT_VSPACE:
2075     if (clen > 0) switch(c)
2076     {
2077     case 0x000a:
2078     case 0x000b:
2079     case 0x000c:
2080     case 0x000d:
2081     case 0x0085:
2082     case 0x2028:
2083     case 0x2029:
2084     break;
2085 ph10 182
2086     default:
2087 ph10 178 ADD_NEW(state_offset + 1, 0);
2088     break;
2089     }
2090     break;
2091    
2092     /*-----------------------------------------------------------------*/
2093     case OP_VSPACE:
2094     if (clen > 0) switch(c)
2095     {
2096     case 0x000a:
2097     case 0x000b:
2098     case 0x000c:
2099     case 0x000d:
2100     case 0x0085:
2101     case 0x2028:
2102     case 0x2029:
2103     ADD_NEW(state_offset + 1, 0);
2104     break;
2105 ph10 182
2106 ph10 178 default: break;
2107     }
2108     break;
2109    
2110     /*-----------------------------------------------------------------*/
2111     case OP_NOT_HSPACE:
2112     if (clen > 0) switch(c)
2113     {
2114     case 0x09: /* HT */
2115     case 0x20: /* SPACE */
2116     case 0xa0: /* NBSP */
2117     case 0x1680: /* OGHAM SPACE MARK */
2118     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2119     case 0x2000: /* EN QUAD */
2120     case 0x2001: /* EM QUAD */
2121     case 0x2002: /* EN SPACE */
2122     case 0x2003: /* EM SPACE */
2123     case 0x2004: /* THREE-PER-EM SPACE */
2124     case 0x2005: /* FOUR-PER-EM SPACE */
2125     case 0x2006: /* SIX-PER-EM SPACE */
2126     case 0x2007: /* FIGURE SPACE */
2127     case 0x2008: /* PUNCTUATION SPACE */
2128     case 0x2009: /* THIN SPACE */
2129     case 0x200A: /* HAIR SPACE */
2130     case 0x202f: /* NARROW NO-BREAK SPACE */
2131     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2132     case 0x3000: /* IDEOGRAPHIC SPACE */
2133     break;
2134 ph10 182
2135     default:
2136 ph10 178 ADD_NEW(state_offset + 1, 0);
2137     break;
2138     }
2139     break;
2140    
2141     /*-----------------------------------------------------------------*/
2142     case OP_HSPACE:
2143     if (clen > 0) switch(c)
2144     {
2145     case 0x09: /* HT */
2146     case 0x20: /* SPACE */
2147     case 0xa0: /* NBSP */
2148     case 0x1680: /* OGHAM SPACE MARK */
2149     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2150     case 0x2000: /* EN QUAD */
2151     case 0x2001: /* EM QUAD */
2152     case 0x2002: /* EN SPACE */
2153     case 0x2003: /* EM SPACE */
2154     case 0x2004: /* THREE-PER-EM SPACE */
2155     case 0x2005: /* FOUR-PER-EM SPACE */
2156     case 0x2006: /* SIX-PER-EM SPACE */
2157     case 0x2007: /* FIGURE SPACE */
2158     case 0x2008: /* PUNCTUATION SPACE */
2159     case 0x2009: /* THIN SPACE */
2160     case 0x200A: /* HAIR SPACE */
2161     case 0x202f: /* NARROW NO-BREAK SPACE */
2162     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2163     case 0x3000: /* IDEOGRAPHIC SPACE */
2164     ADD_NEW(state_offset + 1, 0);
2165     break;
2166     }
2167     break;
2168    
2169     /*-----------------------------------------------------------------*/
2170 ph10 602 /* Match a negated single character casefully. This is only used for
2171     one-byte characters, that is, we know that d < 256. The character we are
2172 nigel 77 checking (c) can be multibyte. */
2173    
2174     case OP_NOT:
2175 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2176 nigel 77 break;
2177    
2178     /*-----------------------------------------------------------------*/
2179 ph10 602 /* Match a negated single character caselessly. This is only used for
2180     one-byte characters, that is, we know that d < 256. The character we are
2181     checking (c) can be multibyte. */
2182    
2183     case OP_NOTI:
2184 ph10 654 if (clen > 0 && c != d && c != fcc[d])
2185 ph10 602 { ADD_NEW(state_offset + dlen + 1, 0); }
2186     break;
2187    
2188     /*-----------------------------------------------------------------*/
2189     case OP_PLUSI:
2190     case OP_MINPLUSI:
2191     case OP_POSPLUSI:
2192     case OP_NOTPLUSI:
2193     case OP_NOTMINPLUSI:
2194     case OP_NOTPOSPLUSI:
2195     caseless = TRUE;
2196     codevalue -= OP_STARI - OP_STAR;
2197 ph10 654
2198 ph10 602 /* Fall through */
2199 nigel 77 case OP_PLUS:
2200     case OP_MINPLUS:
2201 nigel 93 case OP_POSPLUS:
2202 nigel 77 case OP_NOTPLUS:
2203     case OP_NOTMINPLUS:
2204 nigel 93 case OP_NOTPOSPLUS:
2205 nigel 77 count = current_state->count; /* Already matched */
2206     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2207     if (clen > 0)
2208     {
2209 nigel 93 unsigned int otherd = NOTACHAR;
2210 ph10 602 if (caseless)
2211 nigel 77 {
2212     #ifdef SUPPORT_UTF8
2213 nigel 87 if (utf8 && d >= 128)
2214 nigel 77 {
2215     #ifdef SUPPORT_UCP
2216 ph10 349 otherd = UCD_OTHERCASE(d);
2217 nigel 77 #endif /* SUPPORT_UCP */
2218     }
2219     else
2220     #endif /* SUPPORT_UTF8 */
2221     otherd = fcc[d];
2222     }
2223     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2224 nigel 93 {
2225     if (count > 0 &&
2226     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2227     {
2228     active_count--; /* Remove non-match possibility */
2229     next_active_state--;
2230     }
2231     count++;
2232     ADD_NEW(state_offset, count);
2233     }
2234 nigel 77 }
2235     break;
2236    
2237     /*-----------------------------------------------------------------*/
2238 ph10 602 case OP_QUERYI:
2239     case OP_MINQUERYI:
2240     case OP_POSQUERYI:
2241     case OP_NOTQUERYI:
2242     case OP_NOTMINQUERYI:
2243     case OP_NOTPOSQUERYI:
2244     caseless = TRUE;
2245     codevalue -= OP_STARI - OP_STAR;
2246     /* Fall through */
2247 nigel 77 case OP_QUERY:
2248     case OP_MINQUERY:
2249 nigel 93 case OP_POSQUERY:
2250 nigel 77 case OP_NOTQUERY:
2251     case OP_NOTMINQUERY:
2252 nigel 93 case OP_NOTPOSQUERY:
2253 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2254     if (clen > 0)
2255     {
2256 nigel 93 unsigned int otherd = NOTACHAR;
2257 ph10 602 if (caseless)
2258 nigel 77 {
2259     #ifdef SUPPORT_UTF8
2260 nigel 87 if (utf8 && d >= 128)
2261 nigel 77 {
2262     #ifdef SUPPORT_UCP
2263 ph10 349 otherd = UCD_OTHERCASE(d);
2264 nigel 77 #endif /* SUPPORT_UCP */
2265     }
2266     else
2267     #endif /* SUPPORT_UTF8 */
2268     otherd = fcc[d];
2269     }
2270     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2271 nigel 93 {
2272     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2273     {
2274     active_count--; /* Remove non-match possibility */
2275     next_active_state--;
2276     }
2277     ADD_NEW(state_offset + dlen + 1, 0);
2278     }
2279 nigel 77 }
2280     break;
2281    
2282     /*-----------------------------------------------------------------*/
2283 ph10 602 case OP_STARI:
2284     case OP_MINSTARI:
2285     case OP_POSSTARI:
2286     case OP_NOTSTARI:
2287     case OP_NOTMINSTARI:
2288     case OP_NOTPOSSTARI:
2289     caseless = TRUE;
2290     codevalue -= OP_STARI - OP_STAR;
2291     /* Fall through */
2292 nigel 77 case OP_STAR:
2293     case OP_MINSTAR:
2294 nigel 93 case OP_POSSTAR:
2295 nigel 77 case OP_NOTSTAR:
2296     case OP_NOTMINSTAR:
2297 nigel 93 case OP_NOTPOSSTAR:
2298 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2299     if (clen > 0)
2300     {
2301 nigel 93 unsigned int otherd = NOTACHAR;
2302 ph10 602 if (caseless)
2303 nigel 77 {
2304     #ifdef SUPPORT_UTF8
2305 nigel 87 if (utf8 && d >= 128)
2306 nigel 77 {
2307     #ifdef SUPPORT_UCP
2308 ph10 349 otherd = UCD_OTHERCASE(d);
2309 nigel 77 #endif /* SUPPORT_UCP */
2310     }
2311     else
2312     #endif /* SUPPORT_UTF8 */
2313     otherd = fcc[d];
2314     }
2315     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2316 nigel 93 {
2317     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2318     {
2319     active_count--; /* Remove non-match possibility */
2320     next_active_state--;
2321     }
2322     ADD_NEW(state_offset, 0);
2323     }
2324 nigel 77 }
2325     break;
2326    
2327     /*-----------------------------------------------------------------*/
2328 ph10 602 case OP_EXACTI:
2329     case OP_NOTEXACTI:
2330     caseless = TRUE;
2331     codevalue -= OP_STARI - OP_STAR;
2332     /* Fall through */
2333 nigel 77 case OP_EXACT:
2334 nigel 93 case OP_NOTEXACT:
2335     count = current_state->count; /* Number already matched */
2336     if (clen > 0)
2337     {
2338     unsigned int otherd = NOTACHAR;
2339 ph10 602 if (caseless)
2340 nigel 93 {
2341     #ifdef SUPPORT_UTF8
2342     if (utf8 && d >= 128)
2343     {
2344     #ifdef SUPPORT_UCP
2345 ph10 349 otherd = UCD_OTHERCASE(d);
2346 nigel 93 #endif /* SUPPORT_UCP */
2347     }
2348     else
2349     #endif /* SUPPORT_UTF8 */
2350     otherd = fcc[d];
2351     }
2352     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2353     {
2354     if (++count >= GET2(code, 1))
2355     { ADD_NEW(state_offset + dlen + 3, 0); }
2356     else
2357     { ADD_NEW(state_offset, count); }
2358     }
2359     }
2360     break;
2361    
2362     /*-----------------------------------------------------------------*/
2363 ph10 602 case OP_UPTOI:
2364     case OP_MINUPTOI:
2365     case OP_POSUPTOI:
2366     case OP_NOTUPTOI:
2367     case OP_NOTMINUPTOI:
2368     case OP_NOTPOSUPTOI:
2369     caseless = TRUE;
2370     codevalue -= OP_STARI - OP_STAR;
2371     /* Fall through */
2372 nigel 77 case OP_UPTO:
2373     case OP_MINUPTO:
2374 nigel 93 case OP_POSUPTO:
2375 nigel 77 case OP_NOTUPTO:
2376     case OP_NOTMINUPTO:
2377 nigel 93 case OP_NOTPOSUPTO:
2378     ADD_ACTIVE(state_offset + dlen + 3, 0);
2379 nigel 77 count = current_state->count; /* Number already matched */
2380     if (clen > 0)
2381     {
2382 nigel 93 unsigned int otherd = NOTACHAR;
2383 ph10 602 if (caseless)
2384 nigel 77 {
2385     #ifdef SUPPORT_UTF8
2386 nigel 87 if (utf8 && d >= 128)
2387 nigel 77 {
2388     #ifdef SUPPORT_UCP
2389 ph10 349 otherd = UCD_OTHERCASE(d);
2390 nigel 77 #endif /* SUPPORT_UCP */
2391     }
2392     else
2393     #endif /* SUPPORT_UTF8 */
2394     otherd = fcc[d];
2395     }
2396     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2397     {
2398 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2399     {
2400     active_count--; /* Remove non-match possibility */
2401     next_active_state--;
2402     }
2403 nigel 77 if (++count >= GET2(code, 1))
2404     { ADD_NEW(state_offset + dlen + 3, 0); }
2405     else
2406     { ADD_NEW(state_offset, count); }
2407     }
2408     }
2409     break;
2410    
2411    
2412     /* ========================================================================== */
2413     /* These are the class-handling opcodes */
2414    
2415     case OP_CLASS:
2416     case OP_NCLASS:
2417     case OP_XCLASS:
2418     {
2419     BOOL isinclass = FALSE;
2420     int next_state_offset;
2421     const uschar *ecode;
2422    
2423     /* For a simple class, there is always just a 32-byte table, and we
2424     can set isinclass from it. */
2425    
2426     if (codevalue != OP_XCLASS)
2427     {
2428     ecode = code + 33;
2429     if (clen > 0)
2430     {
2431     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2432     ((code[1 + c/8] & (1 << (c&7))) != 0);
2433     }
2434     }
2435    
2436     /* An extended class may have a table or a list of single characters,
2437     ranges, or both, and it may be positive or negative. There's a
2438     function that sorts all this out. */
2439    
2440     else
2441     {
2442     ecode = code + GET(code, 1);
2443     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2444     }
2445    
2446     /* At this point, isinclass is set for all kinds of class, and ecode
2447     points to the byte after the end of the class. If there is a
2448     quantifier, this is where it will be. */
2449    
2450 ph10 530 next_state_offset = (int)(ecode - start_code);
2451 nigel 77
2452     switch (*ecode)
2453     {
2454     case OP_CRSTAR:
2455     case OP_CRMINSTAR:
2456     ADD_ACTIVE(next_state_offset + 1, 0);
2457     if (isinclass) { ADD_NEW(state_offset, 0); }
2458     break;
2459    
2460     case OP_CRPLUS:
2461     case OP_CRMINPLUS:
2462     count = current_state->count; /* Already matched */
2463     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2464     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2465     break;
2466    
2467     case OP_CRQUERY:
2468     case OP_CRMINQUERY:
2469     ADD_ACTIVE(next_state_offset + 1, 0);
2470     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2471     break;
2472    
2473     case OP_CRRANGE:
2474     case OP_CRMINRANGE:
2475     count = current_state->count; /* Already matched */
2476     if (count >= GET2(ecode, 1))
2477     { ADD_ACTIVE(next_state_offset + 5, 0); }
2478     if (isinclass)
2479     {
2480 nigel 91 int max = GET2(ecode, 3);
2481     if (++count >= max && max != 0) /* Max 0 => no limit */
2482 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2483     else
2484     { ADD_NEW(state_offset, count); }
2485     }
2486     break;
2487    
2488     default:
2489     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2490     break;
2491     }
2492     }
2493     break;
2494    
2495     /* ========================================================================== */
2496     /* These are the opcodes for fancy brackets of various kinds. We have
2497 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2498     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2499 ph10 341 though the other "backtracking verbs" are not supported. */
2500 ph10 345
2501 ph10 341 case OP_FAIL:
2502 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2503 ph10 345 break;
2504 nigel 77
2505     case OP_ASSERT:
2506     case OP_ASSERT_NOT:
2507     case OP_ASSERTBACK:
2508     case OP_ASSERTBACK_NOT:
2509     {
2510     int rc;
2511     int local_offsets[2];
2512     int local_workspace[1000];
2513     const uschar *endasscode = code + GET(code, 1);
2514    
2515     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2516    
2517     rc = internal_dfa_exec(
2518     md, /* static match data */
2519     code, /* this subexpression's code */
2520     ptr, /* where we currently are */
2521 ph10 530 (int)(ptr - start_subject), /* start offset */
2522 nigel 77 local_offsets, /* offset vector */
2523     sizeof(local_offsets)/sizeof(int), /* size of same */
2524     local_workspace, /* workspace vector */
2525     sizeof(local_workspace)/sizeof(int), /* size of same */
2526 ph10 642 rlevel); /* function recursion level */
2527 ph10 487
2528 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2529 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2530 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2531 nigel 77 }
2532     break;
2533    
2534     /*-----------------------------------------------------------------*/
2535     case OP_COND:
2536 nigel 93 case OP_SCOND:
2537 nigel 77 {
2538     int local_offsets[1000];
2539     int local_workspace[1000];
2540 ph10 406 int codelink = GET(code, 1);
2541 ph10 397 int condcode;
2542 ph10 406
2543 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2544 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2545 ph10 398 happen for the other conditions. */
2546 nigel 77
2547 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2548 ph10 406 {
2549     rrc = 0;
2550 ph10 397 if (pcre_callout != NULL)
2551     {
2552     pcre_callout_block cb;
2553     cb.version = 1; /* Version 1 of the callout block */
2554     cb.callout_number = code[LINK_SIZE+2];
2555     cb.offset_vector = offsets;
2556     cb.subject = (PCRE_SPTR)start_subject;
2557 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2558     cb.start_match = (int)(current_subject - start_subject);
2559     cb.current_position = (int)(ptr - start_subject);
2560 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2561     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2562     cb.capture_top = 1;
2563     cb.capture_last = -1;
2564     cb.callout_data = md->callout_data;
2565 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2566 ph10 397 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2567     }
2568 ph10 398 if (rrc > 0) break; /* Fail this thread */
2569     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2570 ph10 406 }
2571 ph10 398
2572 ph10 397 condcode = code[LINK_SIZE+1];
2573 ph10 406
2574 nigel 93 /* Back reference conditions are not supported */
2575 nigel 77
2576 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2577 ph10 459 return PCRE_ERROR_DFA_UCOND;
2578 nigel 93
2579     /* The DEFINE condition is always false */
2580    
2581     if (condcode == OP_DEF)
2582 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2583 nigel 93
2584     /* The only supported version of OP_RREF is for the value RREF_ANY,
2585     which means "test if in any recursion". We can't test for specifically
2586     recursed groups. */
2587    
2588 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2589 nigel 93 {
2590 nigel 77 int value = GET2(code, LINK_SIZE+2);
2591 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2592 ph10 654 if (md->recursive != NULL)
2593 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2594     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2595 nigel 77 }
2596    
2597     /* Otherwise, the condition is an assertion */
2598    
2599     else
2600     {
2601     int rc;
2602     const uschar *asscode = code + LINK_SIZE + 1;
2603     const uschar *endasscode = asscode + GET(asscode, 1);
2604    
2605     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2606    
2607     rc = internal_dfa_exec(
2608     md, /* fixed match data */
2609     asscode, /* this subexpression's code */
2610     ptr, /* where we currently are */
2611 ph10 530 (int)(ptr - start_subject), /* start offset */
2612 nigel 77 local_offsets, /* offset vector */
2613     sizeof(local_offsets)/sizeof(int), /* size of same */
2614     local_workspace, /* workspace vector */
2615     sizeof(local_workspace)/sizeof(int), /* size of same */
2616 ph10 642 rlevel); /* function recursion level */
2617 nigel 77
2618 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2619 nigel 77 if ((rc >= 0) ==
2620     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2621 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2622 nigel 77 else
2623 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2624 nigel 77 }
2625     }
2626     break;
2627    
2628     /*-----------------------------------------------------------------*/
2629     case OP_RECURSE:
2630     {
2631 ph10 654 dfa_recursion_info *ri;
2632 nigel 77 int local_offsets[1000];
2633     int local_workspace[1000];
2634 ph10 642 const uschar *callpat = start_code + GET(code, 1);
2635 ph10 654 int recno = (callpat == md->start_code)? 0 :
2636     GET2(callpat, 1 + LINK_SIZE);
2637 nigel 77 int rc;
2638    
2639 ph10 642 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2640 ph10 654
2641 ph10 642 /* Check for repeating a recursion without advancing the subject
2642     pointer. This should catch convoluted mutual recursions. (Some simple
2643     cases are caught at compile time.) */
2644 nigel 77
2645 ph10 654 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2646     if (recno == ri->group_num && ptr == ri->subject_position)
2647     return PCRE_ERROR_RECURSELOOP;
2648    
2649     /* Remember this recursion and where we started it so as to
2650 ph10 642 catch infinite loops. */
2651 ph10 654
2652 ph10 642 new_recursive.group_num = recno;
2653     new_recursive.subject_position = ptr;
2654     new_recursive.prevrec = md->recursive;
2655 ph10 654 md->recursive = &new_recursive;
2656 ph10 642
2657 nigel 77 rc = internal_dfa_exec(
2658     md, /* fixed match data */
2659 ph10 642 callpat, /* this subexpression's code */
2660 nigel 77 ptr, /* where we currently are */
2661 ph10 530 (int)(ptr - start_subject), /* start offset */
2662 nigel 77 local_offsets, /* offset vector */
2663     sizeof(local_offsets)/sizeof(int), /* size of same */
2664     local_workspace, /* workspace vector */
2665     sizeof(local_workspace)/sizeof(int), /* size of same */
2666 ph10 642 rlevel); /* function recursion level */
2667 nigel 77
2668 ph10 642 md->recursive = new_recursive.prevrec; /* Done this recursion */
2669 nigel 77
2670 ph10 654 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2671 ph10 642 rc));
2672    
2673 nigel 77 /* Ran out of internal offsets */
2674    
2675     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2676    
2677     /* For each successful matched substring, set up the next state with a
2678     count of characters to skip before trying it. Note that the count is in
2679     characters, not bytes. */
2680    
2681     if (rc > 0)
2682     {
2683     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2684     {
2685     const uschar *p = start_subject + local_offsets[rc];
2686     const uschar *pp = start_subject + local_offsets[rc+1];
2687     int charcount = local_offsets[rc+1] - local_offsets[rc];
2688     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2689     if (charcount > 0)
2690     {
2691     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2692     }
2693     else
2694     {
2695     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2696     }
2697     }
2698     }
2699     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2700     }
2701     break;
2702    
2703     /*-----------------------------------------------------------------*/
2704 ph10 604 case OP_BRAPOS:
2705     case OP_SBRAPOS:
2706     case OP_CBRAPOS:
2707     case OP_SCBRAPOS:
2708 ph10 654 case OP_BRAPOSZERO:
2709 ph10 604 {
2710     int charcount, matched_count;
2711     const uschar *local_ptr = ptr;
2712     BOOL allow_zero;
2713 ph10 654
2714 ph10 604 if (codevalue == OP_BRAPOSZERO)
2715     {
2716     allow_zero = TRUE;
2717     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2718     }
2719 ph10 654 else allow_zero = FALSE;
2720    
2721     /* Loop to match the subpattern as many times as possible as if it were
2722     a complete pattern. */
2723    
2724 ph10 604 for (matched_count = 0;; matched_count++)
2725     {
2726     int local_offsets[2];
2727     int local_workspace[1000];
2728 ph10 654
2729 ph10 604 int rc = internal_dfa_exec(
2730     md, /* fixed match data */
2731     code, /* this subexpression's code */
2732     local_ptr, /* where we currently are */
2733     (int)(ptr - start_subject), /* start offset */
2734     local_offsets, /* offset vector */
2735     sizeof(local_offsets)/sizeof(int), /* size of same */
2736     local_workspace, /* workspace vector */
2737     sizeof(local_workspace)/sizeof(int), /* size of same */
2738 ph10 642 rlevel); /* function recursion level */
2739 ph10 654
2740 ph10 604 /* Failed to match */
2741 ph10 654
2742     if (rc < 0)
2743 ph10 604 {
2744     if (rc != PCRE_ERROR_NOMATCH) return rc;
2745     break;
2746 ph10 654 }
2747    
2748 ph10 604 /* Matched: break the loop if zero characters matched. */
2749 ph10 654
2750 ph10 604 charcount = local_offsets[1] - local_offsets[0];
2751 ph10 654 if (charcount == 0) break;
2752 ph10 604 local_ptr += charcount; /* Advance temporary position ptr */
2753 ph10 654 }
2754 ph10 604
2755     /* At this point we have matched the subpattern matched_count
2756 ph10 654 times, and local_ptr is pointing to the character after the end of the
2757     last match. */
2758 ph10 604
2759     if (matched_count > 0 || allow_zero)
2760 ph10 654 {
2761 ph10 604 const uschar *end_subpattern = code;
2762     int next_state_offset;
2763 ph10 654
2764 ph10 604 do { end_subpattern += GET(end_subpattern, 1); }
2765     while (*end_subpattern == OP_ALT);
2766     next_state_offset =
2767     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2768    
2769     /* Optimization: if there are no more active states, and there
2770     are no new states yet set up, then skip over the subject string
2771     right here, to save looping. Otherwise, set up the new state to swing
2772     into action when the end of the matched substring is reached. */
2773    
2774     if (i + 1 >= active_count && new_count == 0)
2775     {
2776     ptr = local_ptr;
2777     clen = 0;
2778     ADD_NEW(next_state_offset, 0);
2779     }
2780     else
2781     {
2782     const uschar *p = ptr;
2783     const uschar *pp = local_ptr;
2784 ph10 654 charcount = pp - p;
2785 ph10 604 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2786     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2787     }
2788 ph10 654 }
2789     }
2790 ph10 604 break;
2791 ph10 654
2792 ph10 604 /*-----------------------------------------------------------------*/
2793 nigel 77 case OP_ONCE:
2794 ph10 733 case OP_ONCE_NC:
2795 nigel 77 {
2796     int local_offsets[2];
2797     int local_workspace[1000];
2798    
2799     int rc = internal_dfa_exec(
2800     md, /* fixed match data */
2801     code, /* this subexpression's code */
2802     ptr, /* where we currently are */
2803 ph10 530 (int)(ptr - start_subject), /* start offset */
2804 nigel 77 local_offsets, /* offset vector */
2805     sizeof(local_offsets)/sizeof(int), /* size of same */
2806     local_workspace, /* workspace vector */
2807     sizeof(local_workspace)/sizeof(int), /* size of same */
2808 ph10 642 rlevel); /* function recursion level */
2809 nigel 77
2810     if (rc >= 0)
2811     {
2812     const uschar *end_subpattern = code;
2813     int charcount = local_offsets[1] - local_offsets[0];
2814     int next_state_offset, repeat_state_offset;
2815    
2816     do { end_subpattern += GET(end_subpattern, 1); }
2817     while (*end_subpattern == OP_ALT);
2818 ph10 535 next_state_offset =
2819 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2820 nigel 77
2821     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2822     arrange for the repeat state also to be added to the relevant list.
2823     Calculate the offset, or set -1 for no repeat. */
2824    
2825     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2826     *end_subpattern == OP_KETRMIN)?
2827 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2828 nigel 77
2829     /* If we have matched an empty string, add the next state at the
2830     current character pointer. This is important so that the duplicate
2831     checking kicks in, which is what breaks infinite loops that match an
2832     empty string. */
2833    
2834     if (charcount == 0)
2835     {
2836     ADD_ACTIVE(next_state_offset, 0);
2837     }
2838    
2839     /* Optimization: if there are no more active states, and there
2840     are no new states yet set up, then skip over the subject string
2841     right here, to save looping. Otherwise, set up the new state to swing
2842 ph10 604 into action when the end of the matched substring is reached. */
2843 nigel 77
2844     else if (i + 1 >= active_count && new_count == 0)
2845     {
2846     ptr += charcount;
2847     clen = 0;
2848     ADD_NEW(next_state_offset, 0);
2849    
2850     /* If we are adding a repeat state at the new character position,
2851     we must fudge things so that it is the only current state.
2852     Otherwise, it might be a duplicate of one we processed before, and
2853     that would cause it to be skipped. */
2854    
2855     if (repeat_state_offset >= 0)
2856     {
2857     next_active_state = active_states;
2858     active_count = 0;
2859     i = -1;
2860     ADD_ACTIVE(repeat_state_offset, 0);
2861     }
2862     }
2863     else
2864     {
2865     const uschar *p = start_subject + local_offsets[0];
2866     const uschar *pp = start_subject + local_offsets[1];
2867     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2868     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2869     if (repeat_state_offset >= 0)
2870     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2871     }
2872     }
2873     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2874     }
2875     break;
2876    
2877    
2878     /* ========================================================================== */
2879     /* Handle callouts */
2880    
2881     case OP_CALLOUT:
2882 ph10 406 rrc = 0;
2883 nigel 77 if (pcre_callout != NULL)
2884     {
2885     pcre_callout_block cb;
2886     cb.version = 1; /* Version 1 of the callout block */
2887     cb.callout_number = code[1];
2888     cb.offset_vector = offsets;
2889 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2890 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2891     cb.start_match = (int)(current_subject - start_subject);
2892     cb.current_position = (int)(ptr - start_subject);
2893 nigel 77 cb.pattern_position = GET(code, 2);
2894     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2895     cb.capture_top = 1;
2896     cb.capture_last = -1;
2897     cb.callout_data = md->callout_data;
2898 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2899 nigel 77 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2900 ph10 406 }
2901     if (rrc == 0)
2902     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2903 nigel 77 break;
2904    
2905    
2906     /* ========================================================================== */
2907     default: /* Unsupported opcode */
2908     return PCRE_ERROR_DFA_UITEM;
2909     }
2910    
2911     NEXT_ACTIVE_STATE: continue;
2912    
2913     } /* End of loop scanning active states */
2914    
2915     /* We have finished the processing at the current subject character. If no
2916     new states have been set for the next character, we have found all the
2917     matches that we are going to find. If we are at the top level and partial
2918 ph10 463 matching has been requested, check for appropriate conditions.
2919    
2920 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2921     character. If it is equal to the original active_count (saved in
2922     workspace[1]) it means that (*F) was found on every active state. In this
2923 ph10 463 case we don't want to give a partial match.
2924 nigel 77
2925 ph10 463 The "could_continue" variable is true if a state could have continued but
2926     for the fact that the end of the subject was reached. */
2927    
2928 nigel 77 if (new_count <= 0)
2929     {
2930 ph10 427 if (rlevel == 1 && /* Top level, and */
2931 ph10 463 could_continue && /* Some could go on */
2932 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2933 ph10 427 ( /* either... */
2934     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2935     || /* or... */
2936     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2937     match_count < 0) /* no matches */
2938     ) && /* And... */
2939 ph10 553 ptr >= end_subject && /* Reached end of subject */
2940     ptr > md->start_used_ptr) /* Inspected non-empty string */
2941 nigel 77 {
2942     if (offsetcount >= 2)
2943     {
2944 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
2945     offsets[1] = (int)(end_subject - start_subject);
2946 nigel 77 }
2947     match_count = PCRE_ERROR_PARTIAL;
2948     }
2949    
2950     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2951     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2952     rlevel*2-2, SP));
2953 nigel 91 break; /* In effect, "return", but see the comment below */
2954 nigel 77 }
2955    
2956     /* One or more states are active for the next character. */
2957    
2958     ptr += clen; /* Advance to next subject character */
2959     } /* Loop to move along the subject string */
2960    
2961 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2962     if we use "return" above, we have compiler trouble. Some compilers warn if
2963     there's nothing here because they think the function doesn't return a value. On
2964     the other hand, if we put a dummy statement here, some more clever compilers
2965     complain that it can't be reached. Sigh. */
2966 nigel 77
2967 nigel 91 return match_count;
2968 nigel 77 }
2969    
2970    
2971    
2972    
2973     /*************************************************
2974     * Execute a Regular Expression - DFA engine *
2975     *************************************************/
2976    
2977     /* This external function applies a compiled re to a subject string using a DFA
2978     engine. This function calls the internal function multiple times if the pattern
2979     is not anchored, advancing the starting position by one character after each
failed attempt (the "bumpalong" loop at the end of the function). Before each
attempt, unless PCRE_DFA_RESTART is set, a number of start-of-match
optimizations (known first byte, start of line, studied start bits, minimum
length, required byte) are tried in order to skip hopeless starting positions.
2980    
2981     Arguments:
2982     argument_re points to the compiled expression
2983 ph10 97 extra_data points to extra data or is NULL
2984 nigel 77 subject points to the subject string
2985     length length of subject string (may contain binary zeros)
2986     start_offset where to start in the subject string
2987     options option bits
2988     offsets vector of match offsets
2989     offsetcount size of same
2990     workspace workspace vector
2991     wscount size of same (must be at least 20)
2992    
2993     Returns: > 0 => number of match offset pairs placed in offsets
2994     = 0 => offsets overflowed; longest matches are present
2995     -1 => failed to match
2996     < -1 => some kind of unexpected problem

The negative values generated directly by this function (as opposed to those
passed back unchanged from the internal matching function) are:

  PCRE_ERROR_BADOPTION       an option bit outside PUBLIC_DFA_EXEC_OPTIONS is set
  PCRE_ERROR_NULL            argument_re, subject, or workspace is NULL, or
                               offsets is NULL while offsetcount > 0
  PCRE_ERROR_BADCOUNT        offsetcount is negative
  PCRE_ERROR_DFA_WSSIZE      wscount is less than 20
  PCRE_ERROR_BADOFFSET       start_offset is negative or greater than length
  PCRE_ERROR_DFA_UMLIMIT     extra_data sets a match limit or a match recursion
                               limit
  PCRE_ERROR_BADMAGIC        the magic number is wrong and the byte-flipping
                               attempt did not yield a usable pattern
  PCRE_ERROR_BADNEWLINE      the newline option bits are not a recognized value
  PCRE_ERROR_BADUTF8         the subject failed the UTF-8 validity check
  PCRE_ERROR_SHORTUTF8       ditto, for error codes up to PCRE_UTF8_ERR5 when
                               PCRE_PARTIAL_HARD is set
  PCRE_ERROR_BADUTF8_OFFSET  start_offset points at a UTF-8 continuation byte
  PCRE_ERROR_NOMATCH         a start-of-match optimization ruled out a match, or
                               the bumpalong loop ended without a match
2997     */
2998    
2999 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3000 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3001     const char *subject, int length, int start_offset, int options, int *offsets,
3002     int offsetcount, int *workspace, int wscount)
3003     {
3004     real_pcre *re = (real_pcre *)argument_re;
3005     dfa_match_data match_block;
3006 nigel 91 dfa_match_data *md = &match_block;
3007 nigel 77 BOOL utf8, anchored, startline, firstline;
3008     const uschar *current_subject, *end_subject, *lcc;
3009    
3010     pcre_study_data internal_study;
3011     const pcre_study_data *study = NULL;
3012     real_pcre internal_re;
3013    
3014     const uschar *req_byte_ptr;
3015     const uschar *start_bits = NULL;
3016     BOOL first_byte_caseless = FALSE;
3017     BOOL req_byte_caseless = FALSE;
3018     int first_byte = -1;
3019     int req_byte = -1;
3020     int req_byte2 = -1;
3021 nigel 91 int newline;
3022 nigel 77
3023     /* Plausibility checks on the arguments; each failure has its own error code. */
3024    
3025     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3026     if (re == NULL || subject == NULL || workspace == NULL ||
3027     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3028     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3029     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3030 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3031 nigel 77
3032     /* We need to find the pointer to any study data before we test for byte
3033     flipping, so we scan the extra_data block first. This may set two fields in the
3034     match block, so we must initialize them beforehand. However, the other fields
3035     in the match block must not be set until after the byte flipping. */
3036    
3037 nigel 91 md->tables = re->tables;
3038     md->callout_data = NULL;
3039 nigel 77
3040     if (extra_data != NULL)
3041     {
3042     unsigned int flags = extra_data->flags;
3043     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3044     study = (const pcre_study_data *)extra_data->study_data;
3045     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3046 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3047     return PCRE_ERROR_DFA_UMLIMIT;
3048 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3049 nigel 91 md->callout_data = extra_data->callout_data;
3050 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3051 nigel 91 md->tables = extra_data->tables;
3052 nigel 77 }
3053 ph10 461
3054 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
3055     test for a regex that was compiled on a host of opposite endianness. If this is
3056     the case, flipped values are put in internal_re and internal_study if there was
3057     study data too. */
3058    
3059     if (re->magic_number != MAGIC_NUMBER)
3060     {
3061     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3062     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3063     if (study != NULL) study = &internal_study;
3064     }
3065    
3066     /* Set some local values */
3067    
3068     current_subject = (const unsigned char *)subject + start_offset;
3069     end_subject = (const unsigned char *)subject + length;
3070     req_byte_ptr = current_subject - 1; /* Before any search start, so the first req_byte search always runs */
3071    
3072 nigel 91 #ifdef SUPPORT_UTF8
3073 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
3074 nigel 91 #else
3075     utf8 = FALSE;
3076     #endif
3077 nigel 77
/* The match is treated as anchored if that is requested at match time, if this
call is a restart (PCRE_DFA_RESTART), or if the pattern itself is anchored. */
3078 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3079     (re->options & PCRE_ANCHORED) != 0;
3080    
3081 nigel 77 /* The remaining fixed data for passing around. */
3082    
3083 nigel 91 md->start_code = (const uschar *)argument_re +
3084 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3085 nigel 91 md->start_subject = (const unsigned char *)subject;
3086     md->end_subject = end_subject;
3087 ph10 442 md->start_offset = start_offset;
3088 nigel 91 md->moptions = options;
3089     md->poptions = re->options;
3090 nigel 77
3091 ph10 231 /* If the BSR option is not set at match time, copy what was set
3092     at compile time. */
3093    
3094     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3095     {
3096     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3097     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3098     #ifdef BSR_ANYCRLF
3099     else md->moptions |= PCRE_BSR_ANYCRLF;
3100 ph10 243 #endif
3101     }
3102 ph10 231
3103 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3104     nothing is set at run time, whatever was used at compile time applies. */
3105 nigel 91
3106 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3107 nigel 93 PCRE_NEWLINE_BITS)
3108 nigel 91 {
3109 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3110 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3111     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3112 nigel 91 case PCRE_NEWLINE_CR+
3113 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3114 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3115 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3116 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3117 nigel 91 }
3118    
3119 ph10 149 if (newline == -2)
3120 nigel 91 {
3121 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3122     }
3123     else if (newline < 0)
3124     {
3125 nigel 93 md->nltype = NLTYPE_ANY;
3126 nigel 91 }
3127     else
3128     {
3129 nigel 93 md->nltype = NLTYPE_FIXED;
3130     if (newline > 255)
3131     {
3132     md->nllen = 2;
3133     md->nl[0] = (newline >> 8) & 255;
3134     md->nl[1] = newline & 255;
3135     }
3136     else
3137     {
3138     md->nllen = 1;
3139     md->nl[0] = newline;
3140     }
3141 nigel 91 }
3142    
3143 nigel 77 /* Check a UTF-8 string if required. If the check fails and offsetcount is at
3144     least 2, the error offset is passed back in offsets[0] and the error code in
offsets[1]. A start_offset that lies strictly inside the subject and points at
a continuation byte (top two bits 10) is also rejected. */
3145    
3146     #ifdef SUPPORT_UTF8
3147     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3148     {
3149 ph10 654 int erroroffset;
3150 ph10 606 int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
3151     if (errorcode != 0)
3152 ph10 598 {
3153     if (offsetcount >= 2)
3154     {
3155 ph10 606 offsets[0] = erroroffset;
3156 ph10 598 offsets[1] = errorcode;
3157 ph10 654 }
3158 ph10 598 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3159 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3160 ph10 654 }
3161 ph10 606 if (start_offset > 0 && start_offset < length &&
3162 ph10 654 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
3163 ph10 606 return PCRE_ERROR_BADUTF8_OFFSET;
3164 nigel 77 }
3165     #endif
3166    
3167     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3168     is a feature that makes it possible to save compiled regex and re-use them
3169     in other programs later. */
3170    
3171 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
3172 nigel 77
3173     /* The lower casing table and the "must be at the start of a line" flag are
3174     used in a loop when finding where to start. */
3175    
3176 nigel 91 lcc = md->tables + lcc_offset;
3177 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3178 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3179    
3180     /* Set up the first character to match, if available. The first_byte value is
3181     never set for an anchored regular expression, but the anchoring may be forced
3182     at run time, so we have to test for anchoring. The first char may be unset for
3183     an unanchored pattern, of course. If there's no first char and the pattern was
3184     studied, there may be a bitmap of possible first characters. */
3185    
3186     if (!anchored)
3187     {
3188 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3189 nigel 77 {
3190     first_byte = re->first_byte & 255;
3191     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3192     first_byte = lcc[first_byte];
3193     }
3194     else
3195     {
3196 ph10 455 if (!startline && study != NULL &&
3197     (study->flags & PCRE_STUDY_MAPPED) != 0)
3198 nigel 77 start_bits = study->start_bits;
3199     }
3200     }
3201    
3202     /* For anchored or unanchored matches, there may be a "last known required
3203     character" set. */
3204    
3205 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3206 nigel 77 {
3207     req_byte = re->req_byte & 255;
3208     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3209 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
3210 nigel 77 }
3211    
3212     /* Call the main matching function, looping for a non-anchored regex after a
3213 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3214     a match. */
3215 nigel 77
3216     for (;;)
3217     {
3218     int rc;
3219    
3220     if ((options & PCRE_DFA_RESTART) == 0)
3221     {
3222     const uschar *save_end_subject = end_subject;
3223    
3224 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3225     line of a multiline string. Implement this by temporarily adjusting
3226     end_subject so that we stop scanning at a newline. If the match fails at
3227     the newline, later code breaks this loop. */
3228 nigel 77
3229     if (firstline)
3230     {
3231 ph10 365 USPTR t = current_subject;
3232     #ifdef SUPPORT_UTF8
3233     if (utf8)
3234 ph10 371 {
3235     while (t < md->end_subject && !IS_NEWLINE(t))
3236 ph10 365 {
3237     t++;
3238     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3239 ph10 371 }
3240 ph10 365 }
3241     else
3242 ph10 371 #endif
3243 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3244 nigel 77 end_subject = t;
3245     }
3246 ph10 392
3247 ph10 389 /* There are some optimizations that avoid running the match if a known
3248 ph10 455 starting point is not found. However, there is an option that disables
3249 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3250 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3251     match-time options. */
3252 nigel 77
3253 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3254 ph10 392 {
3255 ph10 389 /* Advance to a known first byte. */
3256 ph10 392
3257 ph10 389 if (first_byte >= 0)
3258 nigel 77 {
3259 ph10 389 if (first_byte_caseless)
3260     while (current_subject < end_subject &&
3261     lcc[*current_subject] != first_byte)
3262     current_subject++;
3263     else
3264 ph10 392 while (current_subject < end_subject &&
3265 ph10 389 *current_subject != first_byte)
3266     current_subject++;
3267     }
3268 ph10 392
3269 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3270 ph10 392
3271 ph10 389 else if (startline)
3272     {
3273     if (current_subject > md->start_subject + start_offset)
3274     {
3275 ph10 365 #ifdef SUPPORT_UTF8
3276 ph10 389 if (utf8)
3277 ph10 365 {
3278 ph10 392 while (current_subject < end_subject &&
3279 ph10 389 !WAS_NEWLINE(current_subject))
3280     {
3281 ph10 365 current_subject++;
3282 ph10 389 while(current_subject < end_subject &&
3283     (*current_subject & 0xc0) == 0x80)
3284     current_subject++;
3285     }
3286 ph10 371 }
3287 ph10 389 else
3288     #endif
3289     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3290     current_subject++;
3291 ph10 392
3292 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3293     ANYCRLF, and we are now at a LF, advance the match position by one
3294     more character. */
3295 ph10 392
3296 ph10 391 if (current_subject[-1] == CHAR_CR &&
3297 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3298     current_subject < end_subject &&
3299 ph10 391 *current_subject == CHAR_NL)
3300 ph10 389 current_subject++;
3301 ph10 365 }
3302 nigel 77 }
3303 ph10 392
3304 ph10 389 /* Or to a non-unique first char after study */
3305 ph10 392
3306 ph10 389 else if (start_bits != NULL)
3307 nigel 77 {
3308 ph10 389 while (current_subject < end_subject)
3309     {
3310     register unsigned int c = *current_subject;
3311 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3312 ph10 538 {
3313     current_subject++;
3314     #ifdef SUPPORT_UTF8
3315     if (utf8)
3316 ph10 545 while(current_subject < end_subject &&
3317 ph10 538 (*current_subject & 0xc0) == 0x80) current_subject++;
3318 ph10 545 #endif
3319 ph10 538 }
3320     else break;
3321 ph10 389 }
3322 nigel 77 }
3323 ph10 392 }
3324 nigel 77
3325     /* Restore fudged end_subject */
3326    
3327     end_subject = save_end_subject;
3328    
3329 ph10 461 /* The following two optimizations are disabled for partial matching or if
3330     disabling is explicitly requested (and of course, by the test above, this
3331 ph10 455 code is not obeyed when restarting after a partial match). */
3332 ph10 461
3333 ph10 728 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3334 ph10 455 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3335 ph10 461 {
3336 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3337     is a lower bound; no actual string of that length may actually match the
3338     pattern. Although the value is, strictly, in characters, we treat it as
3339     bytes to avoid spending too much time in this optimization. */
3340 nigel 77
3341 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3342 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3343 ph10 455 return PCRE_ERROR_NOMATCH;
3344 ph10 461
3345 ph10 455 /* If req_byte is set, we know that that character must appear in the
3346     subject for the match to succeed. If the first character is set, req_byte
3347     must be later in the subject; otherwise the test starts at the match
3348     point. This optimization can save a huge amount of work in patterns with
3349     nested unlimited repeats that aren't going to match. Writing separate
3350     code for cased/caseless versions makes it go faster, as does using an
3351     autoincrement and backing off on a match.
3352 ph10 461
3353 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3354     can take a long time, and give bad performance on quite ordinary
3355     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3356     string... so we don't do this when the string is sufficiently long. */
3357 ph10 461
3358 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3359 nigel 77 {
3360 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3361 ph10 461
3362 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3363     place we found it at last time. */
3364 ph10 461
3365 ph10 455 if (p > req_byte_ptr)
3366 nigel 77 {
3367 ph10 455 if (req_byte_caseless)
3368     {
3369     while (p < end_subject)
3370     {
3371     register int pp = *p++;
3372     if (pp == req_byte || pp == req_byte2) { p--; break; }
3373     }
3374     }
3375     else
3376     {
3377     while (p < end_subject)
3378     {
3379     if (*p++ == req_byte) { p--; break; }
3380     }
3381     }
3382 ph10 461
3383 ph10 455 /* If we can't find the required character, break the matching loop,
3384     which will cause a return of PCRE_ERROR_NOMATCH. */
3385 ph10 461
3386 ph10 455 if (p >= end_subject) break;
3387 ph10 461
3388 ph10 455 /* If we have found the required character, save the point where we
3389     found it, so that we don't search again next time round the loop if
3390     the start hasn't passed this character yet. */
3391 ph10 461
3392 ph10 455 req_byte_ptr = p;
3393 nigel 77 }
3394 ph10 461 }
3395 nigel 77 }
3396 ph10 455 } /* End of optimizations that are done when not restarting */
3397 nigel 77
3398     /* OK, now we can do the business */
3399    
3400 ph10 435 md->start_used_ptr = current_subject;
3401 ph10 654 md->recursive = NULL; /* No pattern recursion is active at the start of an attempt */
3402 ph10 461
3403 nigel 77 rc = internal_dfa_exec(
3404 nigel 91 md, /* fixed match data */
3405     md->start_code, /* this subexpression's code */
3406     current_subject, /* where we currently are */
3407     start_offset, /* start offset in subject */
3408     offsets, /* offset vector */
3409     offsetcount, /* size of same */
3410     workspace, /* workspace vector */
3411     wscount, /* size of same */
3412 ph10 642 0); /* function recurse level */
3413 nigel 77
3414     /* Anything other than "no match" means we are done, always; otherwise, carry
3415     on only if not anchored. */
3416    
3417     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3418    
3419     /* Advance to the next subject character unless we are at the end of a line
3420     and firstline is set. */
3421    
3422 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3423 nigel 77 current_subject++;
3424     if (utf8)
3425     {
3426     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3427     current_subject++;
3428     }
3429     if (current_subject > end_subject) break;
3430    
3431 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3432 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3433     or ANY or ANYCRLF, advance the match position by one more character. */
3434 nigel 93
3435 ph10 391 if (current_subject[-1] == CHAR_CR &&
3436 ph10 226 current_subject < end_subject &&
3437 ph10 391 *current_subject == CHAR_NL &&
3438 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3439 ph10 226 (md->nltype == NLTYPE_ANY ||
3440     md->nltype == NLTYPE_ANYCRLF ||
3441     md->nllen == 2))
3442 nigel 93 current_subject++;
3443    
3444     } /* "Bumpalong" loop */
3445    
3446 nigel 77 return PCRE_ERROR_NOMATCH; /* The bumpalong loop was broken without a match being found */
3447     }
3448    
3449     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12