/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 604 - (hide annotations) (download)
Thu Jun 2 19:04:54 2011 UTC (2 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 118826 byte(s)
Refactoring to reduce stack usage for possessively quantified subpatterns. Also 
fixed a number of bugs related to repeated subpatterns. Some further tidies 
consequent on the removal of OP_OPT are also in this patch.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 598 Copyright (c) 1997-2011 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
/* Each offset below is added to the repeat opcode by internal_dfa_exec() when
the one-byte operand that follows the opcode is one of the listed items, giving
a distinct pseudo-opcode value to dispatch on. */

/* Operand is OP_PROP or OP_NOTPROP */
101 ph10 178 #define OP_PROP_EXTRA 300
/* Operand is OP_EXTUNI */
102     #define OP_EXTUNI_EXTRA 320
/* Operand is OP_ANYNL */
103     #define OP_ANYNL_EXTRA 340
/* Operand is OP_HSPACE or OP_NOT_HSPACE */
104     #define OP_HSPACE_EXTRA 360
/* Operand is OP_VSPACE or OP_NOT_VSPACE */
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
/* coptable is indexed by opcode value; the comment on each row names the
opcodes it covers, in order. An entry of 0 means there is nothing to preload.
A non-zero entry is the byte offset from the opcode at which
internal_dfa_exec() loads the operand "d" before it dispatches on the opcode.
NOTE(review): the entries of 3 all belong to upto/minupto/exact forms, so the
operand presumably follows a 2-byte repeat count there - confirm against the
opcode layout in pcre_internal.h. The table size is compared with
OP_TABLE_LENGTH by a compile-time check inside internal_dfa_exec(), so a new
opcode needs a new entry here. */
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 nigel 77 1, /* Char */
126 ph10 602 1, /* Chari */
127 nigel 77 1, /* not */
128 ph10 602 1, /* noti */
129 nigel 77 /* Positive single-char repeats */
130     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131     3, 3, 3, /* upto, minupto, exact */
132 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134     3, 3, 3, /* upto I, minupto I, exact I */
135     1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 nigel 77 /* Negative single-char repeats - only for chars < 256 */
137     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* NOT upto, minupto, exact */
139 ph10 602 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140     1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141     3, 3, 3, /* NOT upto I, minupto I, exact I */
142     1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 nigel 77 /* Positive type repeats */
144     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145     3, 3, 3, /* Type upto, minupto, exact */
146 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 nigel 77 /* Character class & ref repeats */
148     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149     0, 0, /* CRRANGE, CRMINRANGE */
150     0, /* CLASS */
151     0, /* NCLASS */
152     0, /* XCLASS - variable length */
153     0, /* REF */
154 ph10 602 0, /* REFI */
155 nigel 77 0, /* RECURSE */
156     0, /* CALLOUT */
157     0, /* Alt */
158     0, /* Ket */
159     0, /* KetRmax */
160     0, /* KetRmin */
161 ph10 604 0, /* KetRpos */
162 nigel 77 0, /* Assert */
163     0, /* Assert not */
164     0, /* Assert behind */
165     0, /* Assert behind not */
166     0, /* Reverse */
167 ph10 604 0, 0, 0, 0, 0, 0, /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, COND */
168     0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
169 ph10 498 0, 0, /* CREF, NCREF */
170     0, 0, /* RREF, NRREF */
171 nigel 93 0, /* DEF */
172 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
173 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
174     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
175     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
176 nigel 77 };
177    
178 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
179 ph10 462 remember the fact that a character could have been inspected when the end of
180 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
181     two tables that follow must also be modified. */
182 ph10 462
/* poptable is indexed by opcode value, row for row in the same order as
coptable. A non-zero entry marks an opcode that inspects a subject character.
internal_dfa_exec() consults it only when no character is left (clen == 0): if
the current opcode has a non-zero entry, could_continue is set. The table size
is compared with OP_TABLE_LENGTH by the same compile-time check that covers
coptable. */
183     static const uschar poptable[] = {
184     0, /* End */
185 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
186 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
187     1, 1, 1, /* Any, AllAny, Anybyte */
188 ph10 498 1, 1, /* \P, \p */
189 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
190 ph10 498 1, /* \X */
191 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
192 ph10 462 1, /* Char */
193 ph10 602 1, /* Chari */
194 ph10 462 1, /* not */
195 ph10 602 1, /* noti */
196 ph10 462 /* Positive single-char repeats */
197     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
198     1, 1, 1, /* upto, minupto, exact */
199     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
200 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
201     1, 1, 1, /* upto I, minupto I, exact I */
202     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
203 ph10 462 /* Negative single-char repeats - only for chars < 256 */
204     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
205     1, 1, 1, /* NOT upto, minupto, exact */
206     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
207 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
208     1, 1, 1, /* NOT upto I, minupto I, exact I */
209     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
210 ph10 462 /* Positive type repeats */
211     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
212     1, 1, 1, /* Type upto, minupto, exact */
213     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
214     /* Character class & ref repeats */
215     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
216     1, 1, /* CRRANGE, CRMINRANGE */
217     1, /* CLASS */
218     1, /* NCLASS */
219     1, /* XCLASS - variable length */
220     0, /* REF */
221 ph10 602 0, /* REFI */
222 ph10 462 0, /* RECURSE */
223     0, /* CALLOUT */
224     0, /* Alt */
225     0, /* Ket */
226     0, /* KetRmax */
227     0, /* KetRmin */
228 ph10 604 0, /* KetRpos */
229 ph10 462 0, /* Assert */
230     0, /* Assert not */
231     0, /* Assert behind */
232     0, /* Assert behind not */
233     0, /* Reverse */
234 ph10 604 0, 0, 0, 0, 0, 0, /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, COND */
235     0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
236 ph10 498 0, 0, /* CREF, NCREF */
237     0, 0, /* RREF, NRREF */
238 ph10 462 0, /* DEF */
239 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
240 ph10 510 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG, */
241     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG, */
242     0, 0, 0, 0, 0 /* COMMIT, FAIL, ACCEPT, CLOSE, SKIPZERO */
243 ph10 462 };
244    
245 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
246     and \w */
247    
/* toptable1 is a positional table with 14 entries. The first six are zero
padding; the next six come in pairs holding the same ctype_xxx bit twice
(digit, space, word); the last two are zero for OP_ANY and OP_ALLANY.
NOTE(review): presumably indexed by the small opcode values for \D, \d, \S,
\s, \W, \w, with the entry used as a mask on a character's ctypes flags - the
code that reads this table is outside this view, so confirm there. */
248 ph10 327 static const uschar toptable1[] = {
249 ph10 168 0, 0, 0, 0, 0, 0,
250 nigel 77 ctype_digit, ctype_digit,
251     ctype_space, ctype_space,
252     ctype_word, ctype_word,
253 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
254 nigel 77 };
255    
/* toptable2 is the positional companion of toptable1 and has the same 14-entry
layout. In each digit/space/word pair only the first entry carries the
ctype_xxx bit and the second is zero; the last two entries, for OP_ANY and
OP_ALLANY, are 1.
NOTE(review): presumably the first entry of each pair is the negated form
(\D, \S, \W) and this value is combined with the toptable1 mask result to flip
the sense of the test - the code that reads this table is outside this view,
so confirm there. */
256 ph10 327 static const uschar toptable2[] = {
257 ph10 168 0, 0, 0, 0, 0, 0,
258 nigel 77 ctype_digit, 0,
259     ctype_space, 0,
260     ctype_word, 0,
261 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
262 nigel 77 };
263    
264    
265     /* Structure for holding data about a particular state, which is in effect the
266     current data for an active path through the match tree. It must consist
267     entirely of ints because the working vector we are passed, and which we put
268     these structures in, is a vector of ints. */
269    
typedef struct stateblock {
  /* Offset to the opcode this state is positioned at */
  int offset;
  /* Repeat count for the state */
  int count;
  /* Extra data, used only by some states */
  int data;
} stateblock;

/* Number of ints that one stateblock occupies in the int workspace vector */
#define INTS_PER_STATEBLOCK (sizeof(stateblock) / sizeof(int))
277    
278    
279 ph10 475 #ifdef PCRE_DEBUG
280 nigel 77 /*************************************************
281     * Print character string *
282     *************************************************/
283    
284     /* Character string printing function for debugging.
285    
286     Arguments:
287     p points to string
288     length number of bytes
289     f where to print
290    
291     Returns: nothing
292     */
293    
static void
pchars(unsigned char *p, int length, FILE *f)
{
  int i;

  /* A zero or negative length writes nothing. */
  for (i = 0; i < length; i++)
    {
    int ch = p[i];

    /* Printable bytes are written as themselves; every other byte is written
    as a two-digit lower-case hex escape. */
    if (!isprint(ch))
      fprintf(f, "\\x%02x", ch);
    else
      fprintf(f, "%c", ch);
    }
}
306     #endif
307    
308    
309    
310     /*************************************************
311     * Execute a Regular Expression - DFA engine *
312     *************************************************/
313    
314     /* This internal function applies a compiled pattern to a subject string,
315     starting at a given point, using a DFA engine. This function is called from the
316     external one, possibly multiple times if the pattern is not anchored. The
317     function calls itself recursively for some kinds of subpattern.
318    
319     Arguments:
320     md the match_data block with fixed information
321     this_start_code the opening bracket of this subexpression's code
322     current_subject where we currently are in the subject string
323     start_offset start offset in the subject string
324     offsets vector to contain the matching string offsets
325     offsetcount size of same
326     workspace vector of workspace
327     wscount size of same
328     rlevel function call recursion level
329     recursing regex recursive call level
330    
331 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
332 ph10 341 = 0 => offsets overflowed; longest matches are present
333 nigel 77 -1 => failed to match
334     < -1 => some kind of unexpected problem
335    
336     The following macros are used for adding states to the two state vectors (one
337     for the current character, one for the following character). */
338    
/* ADD_ACTIVE(x,y): append a state with offset x and count y at
next_active_state and advance that pointer. The data field of the slot is not
written. active_count is incremented whether or not there is room; if it had
already reached wscount, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The expansion is one complete if/else statement lacking
only its final semicolon, so each use must be followed by ';'. Keep this exact
shape: call sites depend on it parsing as a single statement. x and y are also
passed to DPRINTF(), so they appear twice in the expansion. */
339     #define ADD_ACTIVE(x,y) \
340     if (active_count++ < wscount) \
341     { \
342     next_active_state->offset = (x); \
343     next_active_state->count = (y); \
344     next_active_state++; \
345     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
346     } \
347     else return PCRE_ERROR_DFA_WSSIZE
348    
/* ADD_ACTIVE_DATA(x,y,z): append a state with offset x, count y and data z at
next_active_state and advance that pointer. active_count is incremented
whether or not there is room; if it had already reached wscount, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. The expansion is one complete if/else
statement lacking only its final semicolon, so each use must be followed by
';'. x, y and z are also passed to DPRINTF(), so they appear twice in the
expansion. */
349     #define ADD_ACTIVE_DATA(x,y,z) \
350     if (active_count++ < wscount) \
351     { \
352     next_active_state->offset = (x); \
353     next_active_state->count = (y); \
354     next_active_state->data = (z); \
355     next_active_state++; \
356     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
357     } \
358     else return PCRE_ERROR_DFA_WSSIZE
359    
/* ADD_NEW(x,y): append a state with offset x and count y at next_new_state
and advance that pointer. The data field of the slot is not written. new_count
is incremented whether or not there is room; if it had already reached
wscount, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The expansion is
one complete if/else statement lacking only its final semicolon, so each use
must be followed by ';'. x and y are also passed to DPRINTF(), so they appear
twice in the expansion. */
360     #define ADD_NEW(x,y) \
361     if (new_count++ < wscount) \
362     { \
363     next_new_state->offset = (x); \
364     next_new_state->count = (y); \
365     next_new_state++; \
366     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
367     } \
368     else return PCRE_ERROR_DFA_WSSIZE
369    
/* ADD_NEW_DATA(x,y,z): append a state with offset x, count y and data z at
next_new_state and advance that pointer. new_count is incremented whether or
not there is room; if it had already reached wscount, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The expansion is one complete if/else statement
lacking only its final semicolon, so each use must be followed by ';'. x, y
and z are also passed to DPRINTF(), so they appear twice in the expansion. */
370     #define ADD_NEW_DATA(x,y,z) \
371     if (new_count++ < wscount) \
372     { \
373     next_new_state->offset = (x); \
374     next_new_state->count = (y); \
375     next_new_state->data = (z); \
376     next_new_state++; \
377     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
378     } \
379     else return PCRE_ERROR_DFA_WSSIZE
380    
381     /* And now, here is the code */
382    
383     static int
384     internal_dfa_exec(
385     dfa_match_data *md,
386     const uschar *this_start_code,
387     const uschar *current_subject,
388     int start_offset,
389     int *offsets,
390     int offsetcount,
391     int *workspace,
392     int wscount,
393     int rlevel,
394     int recursing)
395     {
396     stateblock *active_states, *new_states, *temp_states;
397     stateblock *next_active_state, *next_new_state;
398    
399     const uschar *ctypes, *lcc, *fcc;
400     const uschar *ptr;
401 nigel 93 const uschar *end_code, *first_op;
402 nigel 77
403     int active_count, new_count, match_count;
404    
405     /* Some fields in the md block are frequently referenced, so we load them into
406     independent variables in the hope that this will perform better. */
407    
408     const uschar *start_subject = md->start_subject;
409     const uschar *end_subject = md->end_subject;
410     const uschar *start_code = md->start_code;
411    
412 nigel 87 #ifdef SUPPORT_UTF8
413 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
414 nigel 93 #else
415     BOOL utf8 = FALSE;
416 nigel 87 #endif
417 nigel 77
418     rlevel++;
419     offsetcount &= (-2);
420    
421     wscount -= 2;
422     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
423     (2 * INTS_PER_STATEBLOCK);
424    
425     DPRINTF(("\n%.*s---------------------\n"
426     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
427     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
428    
429     ctypes = md->tables + ctypes_offset;
430     lcc = md->tables + lcc_offset;
431     fcc = md->tables + fcc_offset;
432    
433     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
434    
435     active_states = (stateblock *)(workspace + 2);
436     next_new_state = new_states = active_states + wscount;
437     new_count = 0;
438    
439 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
440 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
441     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
442 nigel 93
443 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
444     the alternative states onto the list, and find out where the end is. This
445     makes it possible to use this function recursively, when we want to stop at a
446     matching internal ket rather than at the end.
447    
448     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
449     a backward assertion. In that case, we have to find out the maximum amount to
450     move back, and set up each alternative appropriately. */
451    
452 nigel 93 if (*first_op == OP_REVERSE)
453 nigel 77 {
454     int max_back = 0;
455     int gone_back;
456    
457     end_code = this_start_code;
458     do
459     {
460     int back = GET(end_code, 2+LINK_SIZE);
461     if (back > max_back) max_back = back;
462     end_code += GET(end_code, 1);
463     }
464     while (*end_code == OP_ALT);
465    
466     /* If we can't go back the amount required for the longest lookbehind
467     pattern, go back as far as we can; some alternatives may still be viable. */
468    
469     #ifdef SUPPORT_UTF8
470     /* In character mode we have to step back character by character */
471    
472     if (utf8)
473     {
474     for (gone_back = 0; gone_back < max_back; gone_back++)
475     {
476     if (current_subject <= start_subject) break;
477     current_subject--;
478     while (current_subject > start_subject &&
479     (*current_subject & 0xc0) == 0x80)
480     current_subject--;
481     }
482     }
483     else
484     #endif
485    
486     /* In byte-mode we can do this quickly. */
487    
488     {
489     gone_back = (current_subject - max_back < start_subject)?
490 ph10 530 (int)(current_subject - start_subject) : max_back;
491 nigel 77 current_subject -= gone_back;
492     }
493 ph10 461
494 ph10 435 /* Save the earliest consulted character */
495 nigel 77
496 ph10 461 if (current_subject < md->start_used_ptr)
497     md->start_used_ptr = current_subject;
498    
499 nigel 77 /* Now we can process the individual branches. */
500    
501     end_code = this_start_code;
502     do
503     {
504     int back = GET(end_code, 2+LINK_SIZE);
505     if (back <= gone_back)
506     {
507 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
508 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
509     }
510     end_code += GET(end_code, 1);
511     }
512     while (*end_code == OP_ALT);
513     }
514    
515     /* This is the code for a "normal" subpattern (not a backward assertion). The
516     start of a whole pattern is always one of these. If we are at the top level,
517     we may be asked to restart matching from the same point that we reached for a
518     previous partial match. We still have to scan through the top-level branches to
519     find the end state. */
520    
521     else
522     {
523     end_code = this_start_code;
524    
525     /* Restarting */
526    
527     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
528     {
529     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
530     new_count = workspace[1];
531     if (!workspace[0])
532     memcpy(new_states, active_states, new_count * sizeof(stateblock));
533     }
534    
535     /* Not restarting */
536    
537     else
538     {
539 nigel 93 int length = 1 + LINK_SIZE +
540 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
541     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
542     2:0);
543 nigel 77 do
544     {
545 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
546 nigel 77 end_code += GET(end_code, 1);
547 nigel 93 length = 1 + LINK_SIZE;
548 nigel 77 }
549     while (*end_code == OP_ALT);
550     }
551     }
552    
553     workspace[0] = 0; /* Bit indicating which vector is current */
554    
555     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
556    
557     /* Loop for scanning the subject */
558    
559     ptr = current_subject;
560     for (;;)
561     {
562     int i, j;
563 nigel 91 int clen, dlen;
564     unsigned int c, d;
565 ph10 428 int forced_fail = 0;
566 ph10 462 BOOL could_continue = FALSE;
567 nigel 77
568     /* Make the new state list into the active state list and empty the
569     new state list. */
570    
571     temp_states = active_states;
572     active_states = new_states;
573     new_states = temp_states;
574     active_count = new_count;
575     new_count = 0;
576    
577     workspace[0] ^= 1; /* Remember for the restarting feature */
578     workspace[1] = active_count;
579    
580 ph10 475 #ifdef PCRE_DEBUG
581 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
582     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
583     printf("\"\n");
584    
585     printf("%.*sActive states: ", rlevel*2-2, SP);
586     for (i = 0; i < active_count; i++)
587     printf("%d/%d ", active_states[i].offset, active_states[i].count);
588     printf("\n");
589     #endif
590    
591     /* Set the pointers for adding new states */
592    
593     next_active_state = active_states + active_count;
594     next_new_state = new_states;
595    
596     /* Load the current character from the subject outside the loop, as many
597     different states may want to look at it, and we assume that at least one
598     will. */
599    
600     if (ptr < end_subject)
601     {
602 nigel 93 clen = 1; /* Number of bytes in the character */
603 nigel 77 #ifdef SUPPORT_UTF8
604     if (utf8) { GETCHARLEN(c, ptr, clen); } else
605     #endif /* SUPPORT_UTF8 */
606     c = *ptr;
607     }
608     else
609     {
610 nigel 93 clen = 0; /* This indicates the end of the subject */
611     c = NOTACHAR; /* This value should never actually be used */
612 nigel 77 }
613    
614     /* Scan up the active states and act on each one. The result of an action
615     may be to add more states to the currently active list (e.g. on hitting a
616     parenthesis) or it may be to put states on the new list, for considering
617     when we move the character pointer on. */
618    
619     for (i = 0; i < active_count; i++)
620     {
621     stateblock *current_state = active_states + i;
622 ph10 602 BOOL caseless = FALSE;
623 nigel 77 const uschar *code;
624     int state_offset = current_state->offset;
625 ph10 397 int count, codevalue, rrc;
626 nigel 77
627 ph10 475 #ifdef PCRE_DEBUG
628 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
629 nigel 93 if (clen == 0) printf("EOL\n");
630 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
631     else printf("0x%02x\n", c);
632     #endif
633    
634     /* A negative offset is a special case meaning "hold off going to this
635     (negated) state until the number of characters in the data field have
636     been skipped". */
637    
638     if (state_offset < 0)
639     {
640     if (current_state->data > 0)
641     {
642     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
643     ADD_NEW_DATA(state_offset, current_state->count,
644     current_state->data - 1);
645     continue;
646     }
647     else
648     {
649     current_state->offset = state_offset = -state_offset;
650     }
651     }
652    
653 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
654 ph10 439 See the note at the head of this module about the possibility of improving
655     performance here. */
656 nigel 77
657     for (j = 0; j < i; j++)
658     {
659     if (active_states[j].offset == state_offset &&
660     active_states[j].count == current_state->count)
661     {
662     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
663     goto NEXT_ACTIVE_STATE;
664     }
665     }
666    
667     /* The state offset is the offset to the opcode */
668    
669     code = start_code + state_offset;
670     codevalue = *code;
671    
672 ph10 463 /* If this opcode inspects a character, but we are at the end of the
673     subject, remember the fact for use when testing for a partial match. */
674    
675 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
676 ph10 463 could_continue = TRUE;
677 ph10 462
678 nigel 77 /* If this opcode is followed by an inline character, load it. It is
679     tempting to test for the presence of a subject character here, but that
680     is wrong, because sometimes zero repetitions of the subject are
681     permitted.
682    
683     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
684 ph10 178 argument that is not a data character - but is always one byte long. We
685     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
686     this case. To keep the other cases fast, convert these ones to new opcodes.
687     */
688 nigel 77
689     if (coptable[codevalue] > 0)
690     {
691     dlen = 1;
692     #ifdef SUPPORT_UTF8
693     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
694     #endif /* SUPPORT_UTF8 */
695     d = code[coptable[codevalue]];
696     if (codevalue >= OP_TYPESTAR)
697     {
698 nigel 93 switch(d)
699     {
700     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
701     case OP_NOTPROP:
702     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
703     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
704     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
705 ph10 178 case OP_NOT_HSPACE:
706 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
707 ph10 178 case OP_NOT_VSPACE:
708 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
709 nigel 93 default: break;
710     }
711 nigel 77 }
712     }
713     else
714     {
715     dlen = 0; /* Not strictly necessary, but compilers moan */
716 nigel 93 d = NOTACHAR; /* if these variables are not set. */
717 nigel 77 }
718    
719    
720     /* Now process the individual opcodes */
721    
722     switch (codevalue)
723     {
724 ph10 498 /* ========================================================================== */
725     /* These cases are never obeyed. This is a fudge that causes a compile-
726     time error if the vectors coptable or poptable, which are indexed by
727     opcode, are not the correct length. It seems to be the only way to do
728     such a check at compile time, as the sizeof() operator does not work
729     in the C preprocessor. */
730 ph10 507
731 ph10 498 case OP_TABLE_LENGTH:
732 ph10 507 case OP_TABLE_LENGTH +
733 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
734     (sizeof(poptable) == OP_TABLE_LENGTH)):
735 ph10 507 break;
736 nigel 77
737     /* ========================================================================== */
738     /* Reached a closing bracket. If not at the end of the pattern, carry
739 ph10 604 on with the next opcode. For repeating opcodes, also add the repeat
740     state. Note that KETRPOS will always be encountered at the end of the
741     subpattern, because the possessive subpattern repeats are always handled
742     using recursive calls. Thus, it never adds any new states.
743    
744     At the end of the (sub)pattern, unless we have an empty string and
745 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
746 ph10 442 start of the subject, save the match data, shifting up all previous
747 nigel 77 matches so we always have the longest first. */
748    
749     case OP_KET:
750     case OP_KETRMIN:
751     case OP_KETRMAX:
752 ph10 604 case OP_KETRPOS:
753 nigel 77 if (code != end_code)
754     {
755     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
756     if (codevalue != OP_KET)
757     {
758     ADD_ACTIVE(state_offset - GET(code, 1), 0);
759     }
760     }
761 ph10 461 else
762 nigel 77 {
763 ph10 461 if (ptr > current_subject ||
764 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
765     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
766     current_subject > start_subject + md->start_offset)))
767 nigel 77 {
768 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
769     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
770     match_count = 0;
771     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
772     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
773     if (offsetcount >= 2)
774     {
775 ph10 530 offsets[0] = (int)(current_subject - start_subject);
776     offsets[1] = (int)(ptr - start_subject);
777 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
778     offsets[1] - offsets[0], current_subject));
779     }
780     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
781     {
782     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
783     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
784     match_count, rlevel*2-2, SP));
785     return match_count;
786     }
787 ph10 461 }
788 nigel 77 }
789     break;
790    
791     /* ========================================================================== */
792     /* These opcodes add to the current list of states without looking
793     at the current character. */
794    
795     /*-----------------------------------------------------------------*/
796     case OP_ALT:
797     do { code += GET(code, 1); } while (*code == OP_ALT);
798 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
799 nigel 77 break;
800    
801     /*-----------------------------------------------------------------*/
802     case OP_BRA:
803 nigel 93 case OP_SBRA:
804 nigel 77 do
805     {
806 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
807 nigel 77 code += GET(code, 1);
808     }
809     while (*code == OP_ALT);
810     break;
811    
812     /*-----------------------------------------------------------------*/
813 nigel 93 case OP_CBRA:
814     case OP_SCBRA:
815 ph10 530 ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
816 nigel 93 code += GET(code, 1);
817     while (*code == OP_ALT)
818     {
819 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
820 nigel 93 code += GET(code, 1);
821     }
822     break;
823    
824     /*-----------------------------------------------------------------*/
825 nigel 77 case OP_BRAZERO:
826     case OP_BRAMINZERO:
827     ADD_ACTIVE(state_offset + 1, 0);
828     code += 1 + GET(code, 2);
829     while (*code == OP_ALT) code += GET(code, 1);
830 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
831 nigel 77 break;
832    
833     /*-----------------------------------------------------------------*/
834 ph10 335 case OP_SKIPZERO:
835     code += 1 + GET(code, 2);
836     while (*code == OP_ALT) code += GET(code, 1);
837 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
838 ph10 335 break;
839    
840     /*-----------------------------------------------------------------*/
841 nigel 77 case OP_CIRC:
842 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
843     { ADD_ACTIVE(state_offset + 1, 0); }
844     break;
845    
846     /*-----------------------------------------------------------------*/
847     case OP_CIRCM:
848 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
849 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
850 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
851     break;
852    
853     /*-----------------------------------------------------------------*/
854     case OP_EOD:
855 ph10 579 if (ptr >= end_subject)
856     {
857 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
858     could_continue = TRUE;
859     else { ADD_ACTIVE(state_offset + 1, 0); }
860     }
861 nigel 77 break;
862    
863     /*-----------------------------------------------------------------*/
864     case OP_SOD:
865     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
866     break;
867    
868     /*-----------------------------------------------------------------*/
869     case OP_SOM:
870     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
871     break;
872    
873    
874     /* ========================================================================== */
875     /* These opcodes inspect the next subject character, and sometimes
876     the previous one as well, but do not have an argument. The variable
877     clen contains the length of the current character and is zero if we are
878     at the end of the subject. */
879    
880     /*-----------------------------------------------------------------*/
881     case OP_ANY:
882 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
883 nigel 77 { ADD_NEW(state_offset + 1, 0); }
884     break;
885    
886     /*-----------------------------------------------------------------*/
887 ph10 341 case OP_ALLANY:
888     if (clen > 0)
889     { ADD_NEW(state_offset + 1, 0); }
890     break;
891    
892     /*-----------------------------------------------------------------*/
893 nigel 77 case OP_EODN:
894 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
895     could_continue = TRUE;
896     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
897 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
898     break;
899    
900     /*-----------------------------------------------------------------*/
901     case OP_DOLL:
902     if ((md->moptions & PCRE_NOTEOL) == 0)
903     {
904 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
905     could_continue = TRUE;
906     else if (clen == 0 ||
907 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
908 ph10 602 (ptr == end_subject - md->nllen)
909 nigel 91 ))
910 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
911     }
912 ph10 602 break;
913    
914     /*-----------------------------------------------------------------*/
915     case OP_DOLLM:
916     if ((md->moptions & PCRE_NOTEOL) == 0)
917     {
918     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
919     could_continue = TRUE;
920     else if (clen == 0 ||
921     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
922     { ADD_ACTIVE(state_offset + 1, 0); }
923     }
924     else if (IS_NEWLINE(ptr))
925 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
926     break;
927    
928     /*-----------------------------------------------------------------*/
929    
930     case OP_DIGIT:
931     case OP_WHITESPACE:
932     case OP_WORDCHAR:
933     if (clen > 0 && c < 256 &&
934     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
935     { ADD_NEW(state_offset + 1, 0); }
936     break;
937    
938     /*-----------------------------------------------------------------*/
939     case OP_NOT_DIGIT:
940     case OP_NOT_WHITESPACE:
941     case OP_NOT_WORDCHAR:
942     if (clen > 0 && (c >= 256 ||
943     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
944     { ADD_NEW(state_offset + 1, 0); }
945     break;
946    
947     /*-----------------------------------------------------------------*/
948     case OP_WORD_BOUNDARY:
949     case OP_NOT_WORD_BOUNDARY:
950     {
951     int left_word, right_word;
952    
953     if (ptr > start_subject)
954     {
955     const uschar *temp = ptr - 1;
956 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
957 nigel 77 #ifdef SUPPORT_UTF8
958     if (utf8) BACKCHAR(temp);
959     #endif
960     GETCHARTEST(d, temp);
961 ph10 535 #ifdef SUPPORT_UCP
962 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
963     {
964     if (d == '_') left_word = TRUE; else
965 ph10 535 {
966 ph10 518 int cat = UCD_CATEGORY(d);
967     left_word = (cat == ucp_L || cat == ucp_N);
968 ph10 535 }
969     }
970     else
971     #endif
972 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
973     }
974 ph10 518 else left_word = FALSE;
975 nigel 77
976 ph10 461 if (clen > 0)
977 ph10 535 {
978     #ifdef SUPPORT_UCP
979 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
980     {
981     if (c == '_') right_word = TRUE; else
982 ph10 535 {
983 ph10 518 int cat = UCD_CATEGORY(c);
984     right_word = (cat == ucp_L || cat == ucp_N);
985 ph10 535 }
986     }
987     else
988     #endif
989 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
990 ph10 535 }
991 ph10 518 else right_word = FALSE;
992 nigel 77
993     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
994     { ADD_ACTIVE(state_offset + 1, 0); }
995     }
996     break;
997    
998    
999     /*-----------------------------------------------------------------*/
1000     /* Check the next character by Unicode property. We will get here only
1001     if the support is in the binary; otherwise a compile-time error occurs.
1002     */
1003    
1004 ph10 151 #ifdef SUPPORT_UCP
1005 nigel 77 case OP_PROP:
1006     case OP_NOTPROP:
1007     if (clen > 0)
1008     {
1009 nigel 87 BOOL OK;
1010 ph10 349 const ucd_record * prop = GET_UCD(c);
1011 nigel 87 switch(code[1])
1012 nigel 77 {
1013 nigel 87 case PT_ANY:
1014     OK = TRUE;
1015     break;
1016    
1017     case PT_LAMP:
1018 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1019 ph10 517 prop->chartype == ucp_Lt;
1020 nigel 87 break;
1021    
1022     case PT_GC:
1023 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1024 nigel 87 break;
1025    
1026     case PT_PC:
1027 ph10 349 OK = prop->chartype == code[2];
1028 nigel 87 break;
1029    
1030     case PT_SC:
1031 ph10 349 OK = prop->script == code[2];
1032 nigel 87 break;
1033 ph10 535
1034 ph10 517 /* These are specials for combination cases. */
1035 ph10 535
1036 ph10 517 case PT_ALNUM:
1037     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1038     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1039 ph10 535 break;
1040    
1041 ph10 517 case PT_SPACE: /* Perl space */
1042     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1043     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1044 ph10 535 break;
1045    
1046 ph10 517 case PT_PXSPACE: /* POSIX space */
1047     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1048     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1049     c == CHAR_FF || c == CHAR_CR;
1050 ph10 535 break;
1051    
1052 ph10 517 case PT_WORD:
1053     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1054     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1055     c == CHAR_UNDERSCORE;
1056 ph10 535 break;
1057 nigel 87
1058     /* Should never occur, but keep compilers from grumbling. */
1059    
1060     default:
1061     OK = codevalue != OP_PROP;
1062     break;
1063 nigel 77 }
1064 nigel 87
1065     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1066 nigel 77 }
1067     break;
1068     #endif
1069    
1070    
1071    
1072     /* ========================================================================== */
1073     /* These opcodes likewise inspect the subject character, but have an
1074     argument that is not a data character. It is one of these opcodes:
1075 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1076     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1077 nigel 77
1078     case OP_TYPEPLUS:
1079     case OP_TYPEMINPLUS:
1080 nigel 93 case OP_TYPEPOSPLUS:
1081 nigel 77 count = current_state->count; /* Already matched */
1082     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1083     if (clen > 0)
1084     {
1085     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1086     (c < 256 &&
1087 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1088 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1089     {
1090 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1091     {
1092     active_count--; /* Remove non-match possibility */
1093     next_active_state--;
1094     }
1095 nigel 77 count++;
1096     ADD_NEW(state_offset, count);
1097     }
1098     }
1099     break;
1100    
1101     /*-----------------------------------------------------------------*/
1102     case OP_TYPEQUERY:
1103     case OP_TYPEMINQUERY:
1104 nigel 93 case OP_TYPEPOSQUERY:
1105 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1106     if (clen > 0)
1107     {
1108     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1109     (c < 256 &&
1110 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1111 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1112     {
1113 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1114     {
1115     active_count--; /* Remove non-match possibility */
1116     next_active_state--;
1117     }
1118 nigel 77 ADD_NEW(state_offset + 2, 0);
1119     }
1120     }
1121     break;
1122    
1123     /*-----------------------------------------------------------------*/
1124     case OP_TYPESTAR:
1125     case OP_TYPEMINSTAR:
1126 nigel 93 case OP_TYPEPOSSTAR:
1127 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1128     if (clen > 0)
1129     {
1130     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1131     (c < 256 &&
1132 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1133 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1134     {
1135 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1136     {
1137     active_count--; /* Remove non-match possibility */
1138     next_active_state--;
1139     }
1140 nigel 77 ADD_NEW(state_offset, 0);
1141     }
1142     }
1143     break;
1144    
1145     /*-----------------------------------------------------------------*/
1146     case OP_TYPEEXACT:
1147 nigel 93 count = current_state->count; /* Number already matched */
1148     if (clen > 0)
1149     {
1150     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1151     (c < 256 &&
1152 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1153 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1154     {
1155     if (++count >= GET2(code, 1))
1156     { ADD_NEW(state_offset + 4, 0); }
1157     else
1158     { ADD_NEW(state_offset, count); }
1159     }
1160     }
1161     break;
1162    
1163     /*-----------------------------------------------------------------*/
1164 nigel 77 case OP_TYPEUPTO:
1165     case OP_TYPEMINUPTO:
1166 nigel 93 case OP_TYPEPOSUPTO:
1167     ADD_ACTIVE(state_offset + 4, 0);
1168 nigel 77 count = current_state->count; /* Number already matched */
1169     if (clen > 0)
1170     {
1171     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1172     (c < 256 &&
1173 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1174 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1175     {
1176 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1177     {
1178     active_count--; /* Remove non-match possibility */
1179     next_active_state--;
1180     }
1181 nigel 77 if (++count >= GET2(code, 1))
1182     { ADD_NEW(state_offset + 4, 0); }
1183     else
1184     { ADD_NEW(state_offset, count); }
1185     }
1186     }
1187     break;
1188    
1189     /* ========================================================================== */
1190     /* These are virtual opcodes that are used when something like
1191 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a horizontal
1192     or vertical space type (e.g. OP_HSPACE, OP_VSPACE) as its argument. This keeps
1193     the code above fast for the other cases. The argument is in the d variable. */
1194 nigel 77
1195 ph10 151 #ifdef SUPPORT_UCP
1196 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1197     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1198 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1199 nigel 77 count = current_state->count; /* Already matched */
1200 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1201 nigel 77 if (clen > 0)
1202     {
1203 nigel 87 BOOL OK;
1204 ph10 349 const ucd_record * prop = GET_UCD(c);
1205 nigel 87 switch(code[2])
1206     {
1207     case PT_ANY:
1208     OK = TRUE;
1209     break;
1210    
1211     case PT_LAMP:
1212 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1213 ph10 517 prop->chartype == ucp_Lt;
1214 nigel 87 break;
1215    
1216     case PT_GC:
1217 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1218 nigel 87 break;
1219    
1220     case PT_PC:
1221 ph10 349 OK = prop->chartype == code[3];
1222 nigel 87 break;
1223    
1224     case PT_SC:
1225 ph10 349 OK = prop->script == code[3];
1226 nigel 87 break;
1227    
1228 ph10 517 /* These are specials for combination cases. */
1229 ph10 535
1230 ph10 517 case PT_ALNUM:
1231     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1232     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1233 ph10 535 break;
1234    
1235 ph10 517 case PT_SPACE: /* Perl space */
1236     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1237     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1238 ph10 535 break;
1239    
1240 ph10 517 case PT_PXSPACE: /* POSIX space */
1241     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1242     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1243     c == CHAR_FF || c == CHAR_CR;
1244 ph10 535 break;
1245    
1246 ph10 517 case PT_WORD:
1247     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1248     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1249     c == CHAR_UNDERSCORE;
1250 ph10 535 break;
1251 ph10 517
1252 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1253    
1254     default:
1255     OK = codevalue != OP_PROP;
1256     break;
1257     }
1258    
1259 nigel 93 if (OK == (d == OP_PROP))
1260     {
1261     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1262     {
1263     active_count--; /* Remove non-match possibility */
1264     next_active_state--;
1265     }
1266     count++;
1267     ADD_NEW(state_offset, count);
1268     }
1269 nigel 77 }
1270     break;
1271    
1272     /*-----------------------------------------------------------------*/
1273     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1274     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1275 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1276 nigel 77 count = current_state->count; /* Already matched */
1277     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1278 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1279 nigel 77 {
1280     const uschar *nptr = ptr + clen;
1281     int ncount = 0;
1282 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1283     {
1284     active_count--; /* Remove non-match possibility */
1285     next_active_state--;
1286     }
1287 nigel 77 while (nptr < end_subject)
1288     {
1289     int nd;
1290     int ndlen = 1;
1291     GETCHARLEN(nd, nptr, ndlen);
1292 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1293 nigel 77 ncount++;
1294     nptr += ndlen;
1295     }
1296     count++;
1297     ADD_NEW_DATA(-state_offset, count, ncount);
1298     }
1299     break;
1300 ph10 151 #endif
1301 nigel 77
1302     /*-----------------------------------------------------------------*/
1303 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1304     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1305     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1306     count = current_state->count; /* Already matched */
1307     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1308     if (clen > 0)
1309     {
1310     int ncount = 0;
1311     switch (c)
1312     {
1313     case 0x000b:
1314     case 0x000c:
1315     case 0x0085:
1316     case 0x2028:
1317     case 0x2029:
1318 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1319     goto ANYNL01;
1320    
1321     case 0x000d:
1322     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1323     /* Fall through */
1324    
1325     ANYNL01:
1326     case 0x000a:
1327 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1328     {
1329     active_count--; /* Remove non-match possibility */
1330     next_active_state--;
1331     }
1332     count++;
1333     ADD_NEW_DATA(-state_offset, count, ncount);
1334     break;
1335 ph10 231
1336 nigel 93 default:
1337     break;
1338     }
1339     }
1340     break;
1341    
1342     /*-----------------------------------------------------------------*/
1343 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1344     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1345     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1346     count = current_state->count; /* Already matched */
1347     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1348     if (clen > 0)
1349     {
1350 ph10 182 BOOL OK;
1351 ph10 178 switch (c)
1352     {
1353     case 0x000a:
1354     case 0x000b:
1355     case 0x000c:
1356     case 0x000d:
1357     case 0x0085:
1358     case 0x2028:
1359     case 0x2029:
1360     OK = TRUE;
1361 ph10 182 break;
1362 ph10 178
1363     default:
1364     OK = FALSE;
1365 ph10 182 break;
1366 ph10 178 }
1367    
1368     if (OK == (d == OP_VSPACE))
1369 ph10 182 {
1370 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1371     {
1372     active_count--; /* Remove non-match possibility */
1373     next_active_state--;
1374     }
1375     count++;
1376     ADD_NEW_DATA(-state_offset, count, 0);
1377     }
1378     }
1379     break;
1380    
1381     /*-----------------------------------------------------------------*/
1382     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1383     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1384     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1385     count = current_state->count; /* Already matched */
1386     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1387     if (clen > 0)
1388     {
1389 ph10 182 BOOL OK;
1390 ph10 178 switch (c)
1391     {
1392     case 0x09: /* HT */
1393     case 0x20: /* SPACE */
1394     case 0xa0: /* NBSP */
1395     case 0x1680: /* OGHAM SPACE MARK */
1396     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1397     case 0x2000: /* EN QUAD */
1398     case 0x2001: /* EM QUAD */
1399     case 0x2002: /* EN SPACE */
1400     case 0x2003: /* EM SPACE */
1401     case 0x2004: /* THREE-PER-EM SPACE */
1402     case 0x2005: /* FOUR-PER-EM SPACE */
1403     case 0x2006: /* SIX-PER-EM SPACE */
1404     case 0x2007: /* FIGURE SPACE */
1405     case 0x2008: /* PUNCTUATION SPACE */
1406     case 0x2009: /* THIN SPACE */
1407     case 0x200A: /* HAIR SPACE */
1408     case 0x202f: /* NARROW NO-BREAK SPACE */
1409     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1410     case 0x3000: /* IDEOGRAPHIC SPACE */
1411     OK = TRUE;
1412     break;
1413 ph10 182
1414 ph10 178 default:
1415     OK = FALSE;
1416     break;
1417     }
1418 ph10 182
1419 ph10 178 if (OK == (d == OP_HSPACE))
1420 ph10 182 {
1421 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1422     {
1423     active_count--; /* Remove non-match possibility */
1424     next_active_state--;
1425     }
1426     count++;
1427     ADD_NEW_DATA(-state_offset, count, 0);
1428     }
1429     }
1430     break;
1431    
1432     /*-----------------------------------------------------------------*/
1433 ph10 151 #ifdef SUPPORT_UCP
1434 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1435     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1436 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1437 nigel 87 count = 4;
1438 nigel 77 goto QS1;
1439    
1440     case OP_PROP_EXTRA + OP_TYPESTAR:
1441     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1442 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1443 nigel 77 count = 0;
1444    
1445     QS1:
1446    
1447 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1448 nigel 77 if (clen > 0)
1449     {
1450 nigel 87 BOOL OK;
1451 ph10 349 const ucd_record * prop = GET_UCD(c);
1452 nigel 87 switch(code[2])
1453     {
1454     case PT_ANY:
1455     OK = TRUE;
1456     break;
1457    
1458     case PT_LAMP:
1459 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1460 ph10 517 prop->chartype == ucp_Lt;
1461 nigel 87 break;
1462    
1463     case PT_GC:
1464 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1465 nigel 87 break;
1466    
1467     case PT_PC:
1468 ph10 349 OK = prop->chartype == code[3];
1469 nigel 87 break;
1470    
1471     case PT_SC:
1472 ph10 349 OK = prop->script == code[3];
1473 nigel 87 break;
1474 ph10 535
1475 ph10 517 /* These are specials for combination cases. */
1476 ph10 535
1477 ph10 517 case PT_ALNUM:
1478     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1479     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1480 ph10 535 break;
1481    
1482 ph10 517 case PT_SPACE: /* Perl space */
1483     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1484     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1485 ph10 535 break;
1486    
1487 ph10 517 case PT_PXSPACE: /* POSIX space */
1488     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1489     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1490     c == CHAR_FF || c == CHAR_CR;
1491 ph10 535 break;
1492    
1493 ph10 517 case PT_WORD:
1494     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1495     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1496     c == CHAR_UNDERSCORE;
1497 ph10 535 break;
1498 nigel 87
1499     /* Should never occur, but keep compilers from grumbling. */
1500    
1501     default:
1502     OK = codevalue != OP_PROP;
1503     break;
1504     }
1505    
1506 nigel 93 if (OK == (d == OP_PROP))
1507     {
1508     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1509     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1510     {
1511     active_count--; /* Remove non-match possibility */
1512     next_active_state--;
1513     }
1514     ADD_NEW(state_offset + count, 0);
1515     }
1516 nigel 77 }
1517     break;
1518    
1519     /*-----------------------------------------------------------------*/
1520     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1521     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1522 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1523 nigel 77 count = 2;
1524     goto QS2;
1525    
1526     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1527     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1528 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1529 nigel 77 count = 0;
1530    
1531     QS2:
1532    
1533     ADD_ACTIVE(state_offset + 2, 0);
1534 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1535 nigel 77 {
1536     const uschar *nptr = ptr + clen;
1537     int ncount = 0;
1538 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1539     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1540     {
1541     active_count--; /* Remove non-match possibility */
1542     next_active_state--;
1543     }
1544 nigel 77 while (nptr < end_subject)
1545     {
1546     int nd;
1547     int ndlen = 1;
1548     GETCHARLEN(nd, nptr, ndlen);
1549 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1550 nigel 77 ncount++;
1551     nptr += ndlen;
1552     }
1553     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1554     }
1555     break;
1556 ph10 151 #endif
1557 nigel 77
1558     /*-----------------------------------------------------------------*/
1559 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1560     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1561     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1562     count = 2;
1563     goto QS3;
1564    
1565     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1566     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1567     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1568     count = 0;
1569    
1570     QS3:
1571     ADD_ACTIVE(state_offset + 2, 0);
1572     if (clen > 0)
1573     {
1574     int ncount = 0;
1575     switch (c)
1576     {
1577     case 0x000b:
1578     case 0x000c:
1579     case 0x0085:
1580     case 0x2028:
1581     case 0x2029:
1582 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1583     goto ANYNL02;
1584    
1585     case 0x000d:
1586     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1587     /* Fall through */
1588    
1589     ANYNL02:
1590     case 0x000a:
1591 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1592     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1593     {
1594     active_count--; /* Remove non-match possibility */
1595     next_active_state--;
1596     }
1597     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1598     break;
1599 ph10 231
1600 nigel 93 default:
1601     break;
1602     }
1603     }
1604     break;
1605    
1606     /*-----------------------------------------------------------------*/
1607 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1608     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1609     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1610     count = 2;
1611     goto QS4;
1612    
1613     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1614     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1615     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1616     count = 0;
1617    
1618     QS4:
1619     ADD_ACTIVE(state_offset + 2, 0);
1620     if (clen > 0)
1621     {
1622 ph10 182 BOOL OK;
1623 ph10 178 switch (c)
1624     {
1625     case 0x000a:
1626     case 0x000b:
1627     case 0x000c:
1628     case 0x000d:
1629     case 0x0085:
1630     case 0x2028:
1631     case 0x2029:
1632     OK = TRUE;
1633     break;
1634 ph10 182
1635 ph10 178 default:
1636     OK = FALSE;
1637     break;
1638     }
1639     if (OK == (d == OP_VSPACE))
1640 ph10 182 {
1641 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1642     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1643     {
1644     active_count--; /* Remove non-match possibility */
1645     next_active_state--;
1646     }
1647     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1648     }
1649     }
1650     break;
1651    
1652     /*-----------------------------------------------------------------*/
1653     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1654     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1655     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1656     count = 2;
1657     goto QS5;
1658    
1659     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1660     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1661     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1662     count = 0;
1663    
1664     QS5:
1665     ADD_ACTIVE(state_offset + 2, 0);
1666     if (clen > 0)
1667     {
1668 ph10 182 BOOL OK;
1669 ph10 178 switch (c)
1670     {
1671     case 0x09: /* HT */
1672     case 0x20: /* SPACE */
1673     case 0xa0: /* NBSP */
1674     case 0x1680: /* OGHAM SPACE MARK */
1675     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1676     case 0x2000: /* EN QUAD */
1677     case 0x2001: /* EM QUAD */
1678     case 0x2002: /* EN SPACE */
1679     case 0x2003: /* EM SPACE */
1680     case 0x2004: /* THREE-PER-EM SPACE */
1681     case 0x2005: /* FOUR-PER-EM SPACE */
1682     case 0x2006: /* SIX-PER-EM SPACE */
1683     case 0x2007: /* FIGURE SPACE */
1684     case 0x2008: /* PUNCTUATION SPACE */
1685     case 0x2009: /* THIN SPACE */
1686     case 0x200A: /* HAIR SPACE */
1687     case 0x202f: /* NARROW NO-BREAK SPACE */
1688     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1689     case 0x3000: /* IDEOGRAPHIC SPACE */
1690     OK = TRUE;
1691     break;
1692 ph10 182
1693 ph10 178 default:
1694     OK = FALSE;
1695     break;
1696     }
1697 ph10 182
1698 ph10 178 if (OK == (d == OP_HSPACE))
1699 ph10 182 {
1700 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1701     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1702     {
1703     active_count--; /* Remove non-match possibility */
1704     next_active_state--;
1705     }
1706     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1707     }
1708     }
1709     break;
1710    
1711     /*-----------------------------------------------------------------*/
1712 ph10 151 #ifdef SUPPORT_UCP
1713 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1714     case OP_PROP_EXTRA + OP_TYPEUPTO:
1715     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1716 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1717 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1718 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1719 nigel 77 count = current_state->count; /* Number already matched */
1720     if (clen > 0)
1721     {
1722 nigel 87 BOOL OK;
1723 ph10 349 const ucd_record * prop = GET_UCD(c);
1724 nigel 87 switch(code[4])
1725 nigel 77 {
1726 nigel 87 case PT_ANY:
1727     OK = TRUE;
1728     break;
1729    
1730     case PT_LAMP:
1731 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1732 ph10 517 prop->chartype == ucp_Lt;
1733 nigel 87 break;
1734    
1735     case PT_GC:
1736 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1737 nigel 87 break;
1738    
1739     case PT_PC:
1740 ph10 349 OK = prop->chartype == code[5];
1741 nigel 87 break;
1742    
1743     case PT_SC:
1744 ph10 349 OK = prop->script == code[5];
1745 nigel 87 break;
1746 ph10 535
1747 ph10 517 /* These are specials for combination cases. */
1748 ph10 535
1749 ph10 517 case PT_ALNUM:
1750     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1751     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1752 ph10 535 break;
1753    
1754 ph10 517 case PT_SPACE: /* Perl space */
1755     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1756     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1757 ph10 535 break;
1758    
1759 ph10 517 case PT_PXSPACE: /* POSIX space */
1760     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1761     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1762     c == CHAR_FF || c == CHAR_CR;
1763 ph10 535 break;
1764    
1765 ph10 517 case PT_WORD:
1766     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1767     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1768     c == CHAR_UNDERSCORE;
1769 ph10 535 break;
1770 nigel 87
1771     /* Should never occur, but keep compilers from grumbling. */
1772    
1773     default:
1774     OK = codevalue != OP_PROP;
1775     break;
1776     }
1777    
1778     if (OK == (d == OP_PROP))
1779     {
1780 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1781     {
1782     active_count--; /* Remove non-match possibility */
1783     next_active_state--;
1784     }
1785 nigel 77 if (++count >= GET2(code, 1))
1786 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1787 nigel 77 else
1788     { ADD_NEW(state_offset, count); }
1789     }
1790     }
1791     break;
1792    
1793     /*-----------------------------------------------------------------*/
1794     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1795     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1796     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1797 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1798 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1799     { ADD_ACTIVE(state_offset + 4, 0); }
1800     count = current_state->count; /* Number already matched */
1801 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1802 nigel 77 {
1803     const uschar *nptr = ptr + clen;
1804     int ncount = 0;
1805 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1806     {
1807     active_count--; /* Remove non-match possibility */
1808     next_active_state--;
1809     }
1810 nigel 77 while (nptr < end_subject)
1811     {
1812     int nd;
1813     int ndlen = 1;
1814     GETCHARLEN(nd, nptr, ndlen);
1815 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1816 nigel 77 ncount++;
1817     nptr += ndlen;
1818     }
1819     if (++count >= GET2(code, 1))
1820     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1821     else
1822     { ADD_NEW_DATA(-state_offset, count, ncount); }
1823     }
1824     break;
1825 ph10 151 #endif
1826 nigel 77
1827 nigel 93 /*-----------------------------------------------------------------*/
1828     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1829     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1830     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1831     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1832     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1833     { ADD_ACTIVE(state_offset + 4, 0); }
1834     count = current_state->count; /* Number already matched */
1835     if (clen > 0)
1836     {
1837     int ncount = 0;
1838     switch (c)
1839     {
1840     case 0x000b:
1841     case 0x000c:
1842     case 0x0085:
1843     case 0x2028:
1844     case 0x2029:
1845 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1846     goto ANYNL03;
1847    
1848     case 0x000d:
1849     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1850     /* Fall through */
1851    
1852     ANYNL03:
1853     case 0x000a:
1854 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1855     {
1856     active_count--; /* Remove non-match possibility */
1857     next_active_state--;
1858     }
1859     if (++count >= GET2(code, 1))
1860     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1861     else
1862     { ADD_NEW_DATA(-state_offset, count, ncount); }
1863     break;
1864 ph10 231
1865 nigel 93 default:
1866     break;
1867     }
1868     }
1869     break;
1870    
1871 ph10 178 /*-----------------------------------------------------------------*/
1872     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1873     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1874     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1875     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1876     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1877     { ADD_ACTIVE(state_offset + 4, 0); }
1878     count = current_state->count; /* Number already matched */
1879     if (clen > 0)
1880     {
1881 ph10 182 BOOL OK;
1882 ph10 178 switch (c)
1883     {
1884     case 0x000a:
1885     case 0x000b:
1886     case 0x000c:
1887     case 0x000d:
1888     case 0x0085:
1889     case 0x2028:
1890     case 0x2029:
1891     OK = TRUE;
1892     break;
1893 ph10 182
1894 ph10 178 default:
1895     OK = FALSE;
1896     }
1897 ph10 182
1898 ph10 178 if (OK == (d == OP_VSPACE))
1899 ph10 182 {
1900 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1901     {
1902     active_count--; /* Remove non-match possibility */
1903     next_active_state--;
1904     }
1905     if (++count >= GET2(code, 1))
1906     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1907     else
1908     { ADD_NEW_DATA(-state_offset, count, 0); }
1909     }
1910     }
1911     break;
1912    
1913     /*-----------------------------------------------------------------*/
1914     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1915     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1916     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1917     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1918     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1919     { ADD_ACTIVE(state_offset + 4, 0); }
1920     count = current_state->count; /* Number already matched */
1921     if (clen > 0)
1922     {
1923 ph10 182 BOOL OK;
1924 ph10 178 switch (c)
1925     {
1926     case 0x09: /* HT */
1927     case 0x20: /* SPACE */
1928     case 0xa0: /* NBSP */
1929     case 0x1680: /* OGHAM SPACE MARK */
1930     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1931     case 0x2000: /* EN QUAD */
1932     case 0x2001: /* EM QUAD */
1933     case 0x2002: /* EN SPACE */
1934     case 0x2003: /* EM SPACE */
1935     case 0x2004: /* THREE-PER-EM SPACE */
1936     case 0x2005: /* FOUR-PER-EM SPACE */
1937     case 0x2006: /* SIX-PER-EM SPACE */
1938     case 0x2007: /* FIGURE SPACE */
1939     case 0x2008: /* PUNCTUATION SPACE */
1940     case 0x2009: /* THIN SPACE */
1941     case 0x200A: /* HAIR SPACE */
1942     case 0x202f: /* NARROW NO-BREAK SPACE */
1943     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1944     case 0x3000: /* IDEOGRAPHIC SPACE */
1945     OK = TRUE;
1946     break;
1947 ph10 182
1948 ph10 178 default:
1949     OK = FALSE;
1950     break;
1951     }
1952 ph10 182
1953 ph10 178 if (OK == (d == OP_HSPACE))
1954 ph10 182 {
1955 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1956     {
1957     active_count--; /* Remove non-match possibility */
1958     next_active_state--;
1959     }
1960     if (++count >= GET2(code, 1))
1961     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1962     else
1963     { ADD_NEW_DATA(-state_offset, count, 0); }
1964     }
1965     }
1966     break;
1967    
1968 nigel 77 /* ========================================================================== */
1969     /* These opcodes are followed by a character that is usually compared
1970     to the current subject character; it is loaded into d. We still get
1971     here even if there is no subject character, because in some cases zero
1972     repetitions are permitted. */
1973    
1974     /*-----------------------------------------------------------------*/
1975     case OP_CHAR:
1976     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1977     break;
1978    
1979     /*-----------------------------------------------------------------*/
1980 ph10 602 case OP_CHARI:
1981 nigel 77 if (clen == 0) break;
1982    
1983     #ifdef SUPPORT_UTF8
1984     if (utf8)
1985     {
1986     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1987     {
1988 nigel 93 unsigned int othercase;
1989 nigel 77 if (c < 128) othercase = fcc[c]; else
1990    
1991     /* If we have Unicode property support, we can use it to test the
1992 nigel 87 other case of the character. */
1993 nigel 77
1994     #ifdef SUPPORT_UCP
1995 ph10 349 othercase = UCD_OTHERCASE(c);
1996 nigel 87 #else
1997 nigel 93 othercase = NOTACHAR;
1998 nigel 77 #endif
1999    
2000     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2001     }
2002     }
2003     else
2004     #endif /* SUPPORT_UTF8 */
2005    
2006     /* Non-UTF-8 mode */
2007     {
2008     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2009     }
2010     break;
2011    
2012    
2013     #ifdef SUPPORT_UCP
2014     /*-----------------------------------------------------------------*/
2015     /* This is a tricky one because it can match more than one character.
2016     Find out how many characters to skip, and then set up a negative state
2017     to wait for them to pass before continuing. */
2018    
2019     case OP_EXTUNI:
2020 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2021 nigel 77 {
2022     const uschar *nptr = ptr + clen;
2023     int ncount = 0;
2024     while (nptr < end_subject)
2025     {
2026     int nclen = 1;
2027     GETCHARLEN(c, nptr, nclen);
2028 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
2029 nigel 77 ncount++;
2030     nptr += nclen;
2031     }
2032     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2033     }
2034     break;
2035     #endif
2036    
2037     /*-----------------------------------------------------------------*/
2038 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
2039     character (when CR is followed by LF). In this case, set up a negative
2040     state to wait for one character to pass before continuing. */
2041    
2042     case OP_ANYNL:
2043     if (clen > 0) switch(c)
2044     {
2045     case 0x000b:
2046     case 0x000c:
2047     case 0x0085:
2048     case 0x2028:
2049     case 0x2029:
2050 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2051    
2052     case 0x000a:
2053 nigel 93 ADD_NEW(state_offset + 1, 0);
2054     break;
2055 ph10 231
2056 nigel 93 case 0x000d:
2057     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2058     {
2059     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2060     }
2061     else
2062     {
2063     ADD_NEW(state_offset + 1, 0);
2064     }
2065     break;
2066     }
2067     break;
2068    
2069     /*-----------------------------------------------------------------*/
2070 ph10 178 case OP_NOT_VSPACE:
2071     if (clen > 0) switch(c)
2072     {
2073     case 0x000a:
2074     case 0x000b:
2075     case 0x000c:
2076     case 0x000d:
2077     case 0x0085:
2078     case 0x2028:
2079     case 0x2029:
2080     break;
2081 ph10 182
2082     default:
2083 ph10 178 ADD_NEW(state_offset + 1, 0);
2084     break;
2085     }
2086     break;
2087    
2088     /*-----------------------------------------------------------------*/
2089     case OP_VSPACE:
2090     if (clen > 0) switch(c)
2091     {
2092     case 0x000a:
2093     case 0x000b:
2094     case 0x000c:
2095     case 0x000d:
2096     case 0x0085:
2097     case 0x2028:
2098     case 0x2029:
2099     ADD_NEW(state_offset + 1, 0);
2100     break;
2101 ph10 182
2102 ph10 178 default: break;
2103     }
2104     break;
2105    
2106     /*-----------------------------------------------------------------*/
2107     case OP_NOT_HSPACE:
2108     if (clen > 0) switch(c)
2109     {
2110     case 0x09: /* HT */
2111     case 0x20: /* SPACE */
2112     case 0xa0: /* NBSP */
2113     case 0x1680: /* OGHAM SPACE MARK */
2114     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2115     case 0x2000: /* EN QUAD */
2116     case 0x2001: /* EM QUAD */
2117     case 0x2002: /* EN SPACE */
2118     case 0x2003: /* EM SPACE */
2119     case 0x2004: /* THREE-PER-EM SPACE */
2120     case 0x2005: /* FOUR-PER-EM SPACE */
2121     case 0x2006: /* SIX-PER-EM SPACE */
2122     case 0x2007: /* FIGURE SPACE */
2123     case 0x2008: /* PUNCTUATION SPACE */
2124     case 0x2009: /* THIN SPACE */
2125     case 0x200A: /* HAIR SPACE */
2126     case 0x202f: /* NARROW NO-BREAK SPACE */
2127     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2128     case 0x3000: /* IDEOGRAPHIC SPACE */
2129     break;
2130 ph10 182
2131     default:
2132 ph10 178 ADD_NEW(state_offset + 1, 0);
2133     break;
2134     }
2135     break;
2136    
2137     /*-----------------------------------------------------------------*/
2138     case OP_HSPACE:
2139     if (clen > 0) switch(c)
2140     {
2141     case 0x09: /* HT */
2142     case 0x20: /* SPACE */
2143     case 0xa0: /* NBSP */
2144     case 0x1680: /* OGHAM SPACE MARK */
2145     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2146     case 0x2000: /* EN QUAD */
2147     case 0x2001: /* EM QUAD */
2148     case 0x2002: /* EN SPACE */
2149     case 0x2003: /* EM SPACE */
2150     case 0x2004: /* THREE-PER-EM SPACE */
2151     case 0x2005: /* FOUR-PER-EM SPACE */
2152     case 0x2006: /* SIX-PER-EM SPACE */
2153     case 0x2007: /* FIGURE SPACE */
2154     case 0x2008: /* PUNCTUATION SPACE */
2155     case 0x2009: /* THIN SPACE */
2156     case 0x200A: /* HAIR SPACE */
2157     case 0x202f: /* NARROW NO-BREAK SPACE */
2158     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2159     case 0x3000: /* IDEOGRAPHIC SPACE */
2160     ADD_NEW(state_offset + 1, 0);
2161     break;
2162     }
2163     break;
2164    
2165     /*-----------------------------------------------------------------*/
2166 ph10 602 /* Match a negated single character casefully. This is only used for
2167     one-byte characters, that is, we know that d < 256. The character we are
2168 nigel 77 checking (c) can be multibyte. */
2169    
2170     case OP_NOT:
2171 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2172 nigel 77 break;
2173    
2174     /*-----------------------------------------------------------------*/
2175 ph10 602 /* Match a negated single character caselessly. This is only used for
2176     one-byte characters, that is, we know that d < 256. The character we are
2177     checking (c) can be multibyte. */
2178    
2179     case OP_NOTI:
2180     if (clen > 0 && c != d && c != fcc[d])
2181     { ADD_NEW(state_offset + dlen + 1, 0); }
2182     break;
2183    
2184     /*-----------------------------------------------------------------*/
2185     case OP_PLUSI:
2186     case OP_MINPLUSI:
2187     case OP_POSPLUSI:
2188     case OP_NOTPLUSI:
2189     case OP_NOTMINPLUSI:
2190     case OP_NOTPOSPLUSI:
2191     caseless = TRUE;
2192     codevalue -= OP_STARI - OP_STAR;
2193    
2194     /* Fall through */
2195 nigel 77 case OP_PLUS:
2196     case OP_MINPLUS:
2197 nigel 93 case OP_POSPLUS:
2198 nigel 77 case OP_NOTPLUS:
2199     case OP_NOTMINPLUS:
2200 nigel 93 case OP_NOTPOSPLUS:
2201 nigel 77 count = current_state->count; /* Already matched */
2202     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2203     if (clen > 0)
2204     {
2205 nigel 93 unsigned int otherd = NOTACHAR;
2206 ph10 602 if (caseless)
2207 nigel 77 {
2208     #ifdef SUPPORT_UTF8
2209 nigel 87 if (utf8 && d >= 128)
2210 nigel 77 {
2211     #ifdef SUPPORT_UCP
2212 ph10 349 otherd = UCD_OTHERCASE(d);
2213 nigel 77 #endif /* SUPPORT_UCP */
2214     }
2215     else
2216     #endif /* SUPPORT_UTF8 */
2217     otherd = fcc[d];
2218     }
2219     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2220 nigel 93 {
2221     if (count > 0 &&
2222     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2223     {
2224     active_count--; /* Remove non-match possibility */
2225     next_active_state--;
2226     }
2227     count++;
2228     ADD_NEW(state_offset, count);
2229     }
2230 nigel 77 }
2231     break;
2232    
2233     /*-----------------------------------------------------------------*/
2234 ph10 602 case OP_QUERYI:
2235     case OP_MINQUERYI:
2236     case OP_POSQUERYI:
2237     case OP_NOTQUERYI:
2238     case OP_NOTMINQUERYI:
2239     case OP_NOTPOSQUERYI:
2240     caseless = TRUE;
2241     codevalue -= OP_STARI - OP_STAR;
2242     /* Fall through */
2243 nigel 77 case OP_QUERY:
2244     case OP_MINQUERY:
2245 nigel 93 case OP_POSQUERY:
2246 nigel 77 case OP_NOTQUERY:
2247     case OP_NOTMINQUERY:
2248 nigel 93 case OP_NOTPOSQUERY:
2249 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2250     if (clen > 0)
2251     {
2252 nigel 93 unsigned int otherd = NOTACHAR;
2253 ph10 602 if (caseless)
2254 nigel 77 {
2255     #ifdef SUPPORT_UTF8
2256 nigel 87 if (utf8 && d >= 128)
2257 nigel 77 {
2258     #ifdef SUPPORT_UCP
2259 ph10 349 otherd = UCD_OTHERCASE(d);
2260 nigel 77 #endif /* SUPPORT_UCP */
2261     }
2262     else
2263     #endif /* SUPPORT_UTF8 */
2264     otherd = fcc[d];
2265     }
2266     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2267 nigel 93 {
2268     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2269     {
2270     active_count--; /* Remove non-match possibility */
2271     next_active_state--;
2272     }
2273     ADD_NEW(state_offset + dlen + 1, 0);
2274     }
2275 nigel 77 }
2276     break;
2277    
2278     /*-----------------------------------------------------------------*/
2279 ph10 602 case OP_STARI:
2280     case OP_MINSTARI:
2281     case OP_POSSTARI:
2282     case OP_NOTSTARI:
2283     case OP_NOTMINSTARI:
2284     case OP_NOTPOSSTARI:
2285     caseless = TRUE;
2286     codevalue -= OP_STARI - OP_STAR;
2287     /* Fall through */
2288 nigel 77 case OP_STAR:
2289     case OP_MINSTAR:
2290 nigel 93 case OP_POSSTAR:
2291 nigel 77 case OP_NOTSTAR:
2292     case OP_NOTMINSTAR:
2293 nigel 93 case OP_NOTPOSSTAR:
2294 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2295     if (clen > 0)
2296     {
2297 nigel 93 unsigned int otherd = NOTACHAR;
2298 ph10 602 if (caseless)
2299 nigel 77 {
2300     #ifdef SUPPORT_UTF8
2301 nigel 87 if (utf8 && d >= 128)
2302 nigel 77 {
2303     #ifdef SUPPORT_UCP
2304 ph10 349 otherd = UCD_OTHERCASE(d);
2305 nigel 77 #endif /* SUPPORT_UCP */
2306     }
2307     else
2308     #endif /* SUPPORT_UTF8 */
2309     otherd = fcc[d];
2310     }
2311     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2312 nigel 93 {
2313     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2314     {
2315     active_count--; /* Remove non-match possibility */
2316     next_active_state--;
2317     }
2318     ADD_NEW(state_offset, 0);
2319     }
2320 nigel 77 }
2321     break;
2322    
2323     /*-----------------------------------------------------------------*/
2324 ph10 602 case OP_EXACTI:
2325     case OP_NOTEXACTI:
2326     caseless = TRUE;
2327     codevalue -= OP_STARI - OP_STAR;
2328     /* Fall through */
2329 nigel 77 case OP_EXACT:
2330 nigel 93 case OP_NOTEXACT:
2331     count = current_state->count; /* Number already matched */
2332     if (clen > 0)
2333     {
2334     unsigned int otherd = NOTACHAR;
2335 ph10 602 if (caseless)
2336 nigel 93 {
2337     #ifdef SUPPORT_UTF8
2338     if (utf8 && d >= 128)
2339     {
2340     #ifdef SUPPORT_UCP
2341 ph10 349 otherd = UCD_OTHERCASE(d);
2342 nigel 93 #endif /* SUPPORT_UCP */
2343     }
2344     else
2345     #endif /* SUPPORT_UTF8 */
2346     otherd = fcc[d];
2347     }
2348     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2349     {
2350     if (++count >= GET2(code, 1))
2351     { ADD_NEW(state_offset + dlen + 3, 0); }
2352     else
2353     { ADD_NEW(state_offset, count); }
2354     }
2355     }
2356     break;
2357    
2358     /*-----------------------------------------------------------------*/
2359 ph10 602 case OP_UPTOI:
2360     case OP_MINUPTOI:
2361     case OP_POSUPTOI:
2362     case OP_NOTUPTOI:
2363     case OP_NOTMINUPTOI:
2364     case OP_NOTPOSUPTOI:
2365     caseless = TRUE;
2366     codevalue -= OP_STARI - OP_STAR;
2367     /* Fall through */
2368 nigel 77 case OP_UPTO:
2369     case OP_MINUPTO:
2370 nigel 93 case OP_POSUPTO:
2371 nigel 77 case OP_NOTUPTO:
2372     case OP_NOTMINUPTO:
2373 nigel 93 case OP_NOTPOSUPTO:
2374     ADD_ACTIVE(state_offset + dlen + 3, 0);
2375 nigel 77 count = current_state->count; /* Number already matched */
2376     if (clen > 0)
2377     {
2378 nigel 93 unsigned int otherd = NOTACHAR;
2379 ph10 602 if (caseless)
2380 nigel 77 {
2381     #ifdef SUPPORT_UTF8
2382 nigel 87 if (utf8 && d >= 128)
2383 nigel 77 {
2384     #ifdef SUPPORT_UCP
2385 ph10 349 otherd = UCD_OTHERCASE(d);
2386 nigel 77 #endif /* SUPPORT_UCP */
2387     }
2388     else
2389     #endif /* SUPPORT_UTF8 */
2390     otherd = fcc[d];
2391     }
2392     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2393     {
2394 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2395     {
2396     active_count--; /* Remove non-match possibility */
2397     next_active_state--;
2398     }
2399 nigel 77 if (++count >= GET2(code, 1))
2400     { ADD_NEW(state_offset + dlen + 3, 0); }
2401     else
2402     { ADD_NEW(state_offset, count); }
2403     }
2404     }
2405     break;
2406    
2407    
2408     /* ========================================================================== */
2409     /* These are the class-handling opcodes */
2410    
2411     case OP_CLASS:
2412     case OP_NCLASS:
2413     case OP_XCLASS:
2414     {
2415     BOOL isinclass = FALSE;
2416     int next_state_offset;
2417     const uschar *ecode;
2418    
2419     /* For a simple class, there is always just a 32-byte table, and we
2420     can set isinclass from it. */
2421    
2422     if (codevalue != OP_XCLASS)
2423     {
2424     ecode = code + 33;
2425     if (clen > 0)
2426     {
2427     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2428     ((code[1 + c/8] & (1 << (c&7))) != 0);
2429     }
2430     }
2431    
2432     /* An extended class may have a table or a list of single characters,
2433     ranges, or both, and it may be positive or negative. There's a
2434     function that sorts all this out. */
2435    
2436     else
2437     {
2438     ecode = code + GET(code, 1);
2439     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2440     }
2441    
2442     /* At this point, isinclass is set for all kinds of class, and ecode
2443     points to the byte after the end of the class. If there is a
2444     quantifier, this is where it will be. */
2445    
2446 ph10 530 next_state_offset = (int)(ecode - start_code);
2447 nigel 77
2448     switch (*ecode)
2449     {
2450     case OP_CRSTAR:
2451     case OP_CRMINSTAR:
2452     ADD_ACTIVE(next_state_offset + 1, 0);
2453     if (isinclass) { ADD_NEW(state_offset, 0); }
2454     break;
2455    
2456     case OP_CRPLUS:
2457     case OP_CRMINPLUS:
2458     count = current_state->count; /* Already matched */
2459     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2460     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2461     break;
2462    
2463     case OP_CRQUERY:
2464     case OP_CRMINQUERY:
2465     ADD_ACTIVE(next_state_offset + 1, 0);
2466     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2467     break;
2468    
2469     case OP_CRRANGE:
2470     case OP_CRMINRANGE:
2471     count = current_state->count; /* Already matched */
2472     if (count >= GET2(ecode, 1))
2473     { ADD_ACTIVE(next_state_offset + 5, 0); }
2474     if (isinclass)
2475     {
2476 nigel 91 int max = GET2(ecode, 3);
2477     if (++count >= max && max != 0) /* Max 0 => no limit */
2478 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2479     else
2480     { ADD_NEW(state_offset, count); }
2481     }
2482     break;
2483    
2484     default:
2485     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2486     break;
2487     }
2488     }
2489     break;
2490    
2491     /* ========================================================================== */
2492     /* These are the opcodes for fancy brackets of various kinds. We have
2493 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2494     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2495 ph10 341 though the other "backtracking verbs" are not supported. */
2496 ph10 345
2497 ph10 341 case OP_FAIL:
2498 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2499 ph10 345 break;
2500 nigel 77
2501     case OP_ASSERT:
2502     case OP_ASSERT_NOT:
2503     case OP_ASSERTBACK:
2504     case OP_ASSERTBACK_NOT:
2505     {
2506     int rc;
2507     int local_offsets[2];
2508     int local_workspace[1000];
2509     const uschar *endasscode = code + GET(code, 1);
2510    
2511     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2512    
2513     rc = internal_dfa_exec(
2514     md, /* static match data */
2515     code, /* this subexpression's code */
2516     ptr, /* where we currently are */
2517 ph10 530 (int)(ptr - start_subject), /* start offset */
2518 nigel 77 local_offsets, /* offset vector */
2519     sizeof(local_offsets)/sizeof(int), /* size of same */
2520     local_workspace, /* workspace vector */
2521     sizeof(local_workspace)/sizeof(int), /* size of same */
2522     rlevel, /* function recursion level */
2523     recursing); /* pass on regex recursion */
2524 ph10 487
2525 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2526 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2527 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2528 nigel 77 }
2529     break;
2530    
2531     /*-----------------------------------------------------------------*/
2532     case OP_COND:
2533 nigel 93 case OP_SCOND:
2534 nigel 77 {
2535     int local_offsets[1000];
2536     int local_workspace[1000];
2537 ph10 406 int codelink = GET(code, 1);
2538 ph10 397 int condcode;
2539 ph10 406
2540 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2541 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2542 ph10 398 happen for the other conditions. */
2543 nigel 77
2544 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2545 ph10 406 {
2546     rrc = 0;
2547 ph10 397 if (pcre_callout != NULL)
2548     {
2549     pcre_callout_block cb;
2550     cb.version = 1; /* Version 1 of the callout block */
2551     cb.callout_number = code[LINK_SIZE+2];
2552     cb.offset_vector = offsets;
2553     cb.subject = (PCRE_SPTR)start_subject;
2554 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2555     cb.start_match = (int)(current_subject - start_subject);
2556     cb.current_position = (int)(ptr - start_subject);
2557 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2558     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2559     cb.capture_top = 1;
2560     cb.capture_last = -1;
2561     cb.callout_data = md->callout_data;
2562     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2563     }
2564 ph10 398 if (rrc > 0) break; /* Fail this thread */
2565     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2566 ph10 406 }
2567 ph10 398
2568 ph10 397 condcode = code[LINK_SIZE+1];
2569 ph10 406
2570 nigel 93 /* Back reference conditions are not supported */
2571 nigel 77
2572 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2573 ph10 459 return PCRE_ERROR_DFA_UCOND;
2574 nigel 93
2575     /* The DEFINE condition is always false */
2576    
2577     if (condcode == OP_DEF)
2578 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2579 nigel 93
2580     /* The only supported version of OP_RREF is for the value RREF_ANY,
2581     which means "test if in any recursion". We can't test for specifically
2582     recursed groups. */
2583    
2584 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2585 nigel 93 {
2586 nigel 77 int value = GET2(code, LINK_SIZE+2);
2587 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2588 ph10 406 if (recursing > 0)
2589 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2590     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2591 nigel 77 }
2592    
2593     /* Otherwise, the condition is an assertion */
2594    
2595     else
2596     {
2597     int rc;
2598     const uschar *asscode = code + LINK_SIZE + 1;
2599     const uschar *endasscode = asscode + GET(asscode, 1);
2600    
2601     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2602    
2603     rc = internal_dfa_exec(
2604     md, /* fixed match data */
2605     asscode, /* this subexpression's code */
2606     ptr, /* where we currently are */
2607 ph10 530 (int)(ptr - start_subject), /* start offset */
2608 nigel 77 local_offsets, /* offset vector */
2609     sizeof(local_offsets)/sizeof(int), /* size of same */
2610     local_workspace, /* workspace vector */
2611     sizeof(local_workspace)/sizeof(int), /* size of same */
2612     rlevel, /* function recursion level */
2613     recursing); /* pass on regex recursion */
2614    
2615 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2616 nigel 77 if ((rc >= 0) ==
2617     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2618 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2619 nigel 77 else
2620 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2621 nigel 77 }
2622     }
2623     break;
2624    
2625     /*-----------------------------------------------------------------*/
2626     case OP_RECURSE:
2627     {
2628     int local_offsets[1000];
2629     int local_workspace[1000];
2630     int rc;
2631    
2632     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2633     recursing + 1));
2634    
2635     rc = internal_dfa_exec(
2636     md, /* fixed match data */
2637     start_code + GET(code, 1), /* this subexpression's code */
2638     ptr, /* where we currently are */
2639 ph10 530 (int)(ptr - start_subject), /* start offset */
2640 nigel 77 local_offsets, /* offset vector */
2641     sizeof(local_offsets)/sizeof(int), /* size of same */
2642     local_workspace, /* workspace vector */
2643     sizeof(local_workspace)/sizeof(int), /* size of same */
2644     rlevel, /* function recursion level */
2645     recursing + 1); /* regex recurse level */
2646    
2647     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2648     recursing + 1, rc));
2649    
2650     /* Ran out of internal offsets */
2651    
2652     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2653    
2654     /* For each successfully matched substring, set up the next state with a
2655     count of characters to skip before trying it. Note that the count is in
2656     characters, not bytes. */
2657    
2658     if (rc > 0)
2659     {
2660     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2661     {
2662     const uschar *p = start_subject + local_offsets[rc];
2663     const uschar *pp = start_subject + local_offsets[rc+1];
2664     int charcount = local_offsets[rc+1] - local_offsets[rc];
2665     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2666     if (charcount > 0)
2667     {
2668     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2669     }
2670     else
2671     {
2672     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2673     }
2674     }
2675     }
2676     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2677     }
2678     break;
2679    
2680     /*-----------------------------------------------------------------*/
2681 ph10 604 case OP_BRAPOS:
2682     case OP_SBRAPOS:
2683     case OP_CBRAPOS:
2684     case OP_SCBRAPOS:
2685     case OP_BRAPOSZERO:
2686     {
2687     int charcount, matched_count;
2688     const uschar *local_ptr = ptr;
2689     BOOL allow_zero;
2690    
2691     if (codevalue == OP_BRAPOSZERO)
2692     {
2693     allow_zero = TRUE;
2694     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2695     }
2696     else allow_zero = FALSE;
2697    
2698     /* Loop to match the subpattern as many times as possible as if it were
2699     a complete pattern. */
2700    
2701     for (matched_count = 0;; matched_count++)
2702     {
2703     int local_offsets[2];
2704     int local_workspace[1000];
2705    
2706     int rc = internal_dfa_exec(
2707     md, /* fixed match data */
2708     code, /* this subexpression's code */
2709     local_ptr, /* where we currently are */
2710     (int)(ptr - start_subject), /* start offset */
2711     local_offsets, /* offset vector */
2712     sizeof(local_offsets)/sizeof(int), /* size of same */
2713     local_workspace, /* workspace vector */
2714     sizeof(local_workspace)/sizeof(int), /* size of same */
2715     rlevel, /* function recursion level */
2716     recursing); /* pass on regex recursion */
2717    
2718     /* Failed to match */
2719    
2720     if (rc < 0)
2721     {
2722     if (rc != PCRE_ERROR_NOMATCH) return rc;
2723     break;
2724     }
2725    
2726     /* Matched: break the loop if zero characters matched. */
2727    
2728     charcount = local_offsets[1] - local_offsets[0];
2729     if (charcount == 0) break;
2730     local_ptr += charcount; /* Advance temporary position ptr */
2731     }
2732    
2733     /* At this point we have matched the subpattern matched_count
2734     times, and local_ptr is pointing to the character after the end of the
2735     last match. */
2736    
2737     if (matched_count > 0 || allow_zero)
2738     {
2739     const uschar *end_subpattern = code;
2740     int next_state_offset;
2741    
2742     do { end_subpattern += GET(end_subpattern, 1); }
2743     while (*end_subpattern == OP_ALT);
2744     next_state_offset =
2745     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2746    
2747     /* Optimization: if there are no more active states, and there
2748     are no new states yet set up, then skip over the subject string
2749     right here, to save looping. Otherwise, set up the new state to swing
2750     into action when the end of the matched substring is reached. */
2751    
2752     if (i + 1 >= active_count && new_count == 0)
2753     {
2754     ptr = local_ptr;
2755     clen = 0;
2756     ADD_NEW(next_state_offset, 0);
2757     }
2758     else
2759     {
2760     const uschar *p = ptr;
2761     const uschar *pp = local_ptr;
2762     charcount = pp - p;
2763     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2764     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2765     }
2766     }
2767     }
2768     break;
2769    
2770     /*-----------------------------------------------------------------*/
2771 nigel 77 case OP_ONCE:
2772     {
2773     int local_offsets[2];
2774     int local_workspace[1000];
2775    
2776     int rc = internal_dfa_exec(
2777     md, /* fixed match data */
2778     code, /* this subexpression's code */
2779     ptr, /* where we currently are */
2780 ph10 530 (int)(ptr - start_subject), /* start offset */
2781 nigel 77 local_offsets, /* offset vector */
2782     sizeof(local_offsets)/sizeof(int), /* size of same */
2783     local_workspace, /* workspace vector */
2784     sizeof(local_workspace)/sizeof(int), /* size of same */
2785     rlevel, /* function recursion level */
2786     recursing); /* pass on regex recursion */
2787    
2788     if (rc >= 0)
2789     {
2790     const uschar *end_subpattern = code;
2791     int charcount = local_offsets[1] - local_offsets[0];
2792     int next_state_offset, repeat_state_offset;
2793    
2794     do { end_subpattern += GET(end_subpattern, 1); }
2795     while (*end_subpattern == OP_ALT);
2796 ph10 535 next_state_offset =
2797 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2798 nigel 77
2799     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2800     arrange for the repeat state also to be added to the relevant list.
2801     Calculate the offset, or set -1 for no repeat. */
2802    
2803     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2804     *end_subpattern == OP_KETRMIN)?
2805 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2806 nigel 77
2807     /* If we have matched an empty string, add the next state at the
2808     current character pointer. This is important so that the duplicate
2809     checking kicks in, which is what breaks infinite loops that match an
2810     empty string. */
2811    
2812     if (charcount == 0)
2813     {
2814     ADD_ACTIVE(next_state_offset, 0);
2815     }
2816    
2817     /* Optimization: if there are no more active states, and there
2818     are no new states yet set up, then skip over the subject string
2819     right here, to save looping. Otherwise, set up the new state to swing
2820 ph10 604 into action when the end of the matched substring is reached. */
2821 nigel 77
2822     else if (i + 1 >= active_count && new_count == 0)
2823     {
2824     ptr += charcount;
2825     clen = 0;
2826     ADD_NEW(next_state_offset, 0);
2827    
2828     /* If we are adding a repeat state at the new character position,
2829     we must fudge things so that it is the only current state.
2830     Otherwise, it might be a duplicate of one we processed before, and
2831     that would cause it to be skipped. */
2832    
2833     if (repeat_state_offset >= 0)
2834     {
2835     next_active_state = active_states;
2836     active_count = 0;
2837     i = -1;
2838     ADD_ACTIVE(repeat_state_offset, 0);
2839     }
2840     }
2841     else
2842     {
2843     const uschar *p = start_subject + local_offsets[0];
2844     const uschar *pp = start_subject + local_offsets[1];
2845     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2846     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2847     if (repeat_state_offset >= 0)
2848     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2849     }
2850     }
2851     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2852     }
2853     break;
2854    
2855    
2856     /* ========================================================================== */
2857     /* Handle callouts */
2858    
2859     case OP_CALLOUT:
2860 ph10 406 rrc = 0;
2861 nigel 77 if (pcre_callout != NULL)
2862     {
2863     pcre_callout_block cb;
2864     cb.version = 1; /* Version 1 of the callout block */
2865     cb.callout_number = code[1];
2866     cb.offset_vector = offsets;
2867 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2868 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2869     cb.start_match = (int)(current_subject - start_subject);
2870     cb.current_position = (int)(ptr - start_subject);
2871 nigel 77 cb.pattern_position = GET(code, 2);
2872     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2873     cb.capture_top = 1;
2874     cb.capture_last = -1;
2875     cb.callout_data = md->callout_data;
2876     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2877 ph10 406 }
2878     if (rrc == 0)
2879     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2880 nigel 77 break;
2881    
2882    
2883     /* ========================================================================== */
2884     default: /* Unsupported opcode */
2885     return PCRE_ERROR_DFA_UITEM;
2886     }
2887    
2888     NEXT_ACTIVE_STATE: continue;
2889    
2890     } /* End of loop scanning active states */
2891    
2892     /* We have finished the processing at the current subject character. If no
2893     new states have been set for the next character, we have found all the
2894     matches that we are going to find. If we are at the top level and partial
2895 ph10 463 matching has been requested, check for appropriate conditions.
2896    
2897 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2898     character. If it is equal to the original active_count (saved in
2899     workspace[1]) it means that (*F) was found on every active state. In this
2900 ph10 463 case we don't want to give a partial match.
2901 nigel 77
2902 ph10 463 The "could_continue" variable is true if a state could have continued but
2903     for the fact that the end of the subject was reached. */
2904    
2905 nigel 77 if (new_count <= 0)
2906     {
2907 ph10 427 if (rlevel == 1 && /* Top level, and */
2908 ph10 463 could_continue && /* Some could go on */
2909 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2910 ph10 427 ( /* either... */
2911     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2912     || /* or... */
2913     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2914     match_count < 0) /* no matches */
2915     ) && /* And... */
2916 ph10 553 ptr >= end_subject && /* Reached end of subject */
2917     ptr > md->start_used_ptr) /* Inspected non-empty string */
2918 nigel 77 {
2919     if (offsetcount >= 2)
2920     {
2921 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
2922     offsets[1] = (int)(end_subject - start_subject);
2923 nigel 77 }
2924     match_count = PCRE_ERROR_PARTIAL;
2925     }
2926    
2927     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2928     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2929     rlevel*2-2, SP));
2930 nigel 91 break; /* In effect, "return", but see the comment below */
2931 nigel 77 }
2932    
2933     /* One or more states are active for the next character. */
2934    
2935     ptr += clen; /* Advance to next subject character */
2936     } /* Loop to move along the subject string */
2937    
2938 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2939     if we use "return" above, we have compiler trouble. Some compilers warn if
2940     there's nothing here because they think the function doesn't return a value. On
2941     the other hand, if we put a dummy statement here, some more clever compilers
2942     complain that it can't be reached. Sigh. */
2943 nigel 77
2944 nigel 91 return match_count;
2945 nigel 77 }
2946    
2947    
2948    
2949    
2950     /*************************************************
2951     * Execute a Regular Expression - DFA engine *
2952     *************************************************/
2953    
2954     /* This external function applies a compiled re to a subject string using a DFA
2955     engine. This function calls the internal function multiple times if the pattern
2956     is not anchored.
2957    
2958     Arguments:
2959     argument_re points to the compiled expression
2960 ph10 97 extra_data points to extra data or is NULL
2961 nigel 77 subject points to the subject string
2962     length length of subject string (may contain binary zeros)
2963     start_offset where to start in the subject string
2964     options option bits
2965     offsets vector of match offsets
2966     offsetcount size of same
2967     workspace workspace vector
2968     wscount size of same
2969    
2970     Returns: > 0 => number of match offset pairs placed in offsets
2971     = 0 => offsets overflowed; longest matches are present
2972     -1 => failed to match
2973     < -1 => some kind of unexpected problem
2974     */
2975    
2976 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2977 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2978     const char *subject, int length, int start_offset, int options, int *offsets,
2979     int offsetcount, int *workspace, int wscount)
2980     {
2981     real_pcre *re = (real_pcre *)argument_re;
2982     dfa_match_data match_block;
2983 nigel 91 dfa_match_data *md = &match_block;
2984 nigel 77 BOOL utf8, anchored, startline, firstline;
2985     const uschar *current_subject, *end_subject, *lcc;
2986    
2987     pcre_study_data internal_study;
2988     const pcre_study_data *study = NULL;
2989     real_pcre internal_re;
2990    
2991     const uschar *req_byte_ptr;
2992     const uschar *start_bits = NULL;
2993     BOOL first_byte_caseless = FALSE;
2994     BOOL req_byte_caseless = FALSE;
2995     int first_byte = -1;
2996     int req_byte = -1;
2997     int req_byte2 = -1;
2998 nigel 91 int newline;
2999 nigel 77
3000     /* Plausibility checks */
3001    
3002     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3003     if (re == NULL || subject == NULL || workspace == NULL ||
3004     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3005     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3006     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3007 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3008 nigel 77
3009     /* We need to find the pointer to any study data before we test for byte
3010     flipping, so we scan the extra_data block first. This may set two fields in the
3011     match block, so we must initialize them beforehand. However, the other fields
3012     in the match block must not be set until after the byte flipping. */
3013    
3014 nigel 91 md->tables = re->tables;
3015     md->callout_data = NULL;
3016 nigel 77
3017     if (extra_data != NULL)
3018     {
3019     unsigned int flags = extra_data->flags;
3020     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3021     study = (const pcre_study_data *)extra_data->study_data;
3022     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3023 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3024     return PCRE_ERROR_DFA_UMLIMIT;
3025 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3026 nigel 91 md->callout_data = extra_data->callout_data;
3027 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3028 nigel 91 md->tables = extra_data->tables;
3029 nigel 77 }
3030 ph10 461
3031 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
3032     test for a regex that was compiled on a host of opposite endianness. If this is
3033     the case, flipped values are put in internal_re and internal_study if there was
3034     study data too. */
3035    
3036     if (re->magic_number != MAGIC_NUMBER)
3037     {
3038     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3039     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3040     if (study != NULL) study = &internal_study;
3041     }
3042    
3043     /* Set some local values */
3044    
3045     current_subject = (const unsigned char *)subject + start_offset;
3046     end_subject = (const unsigned char *)subject + length;
3047     req_byte_ptr = current_subject - 1;
3048    
3049 nigel 91 #ifdef SUPPORT_UTF8
3050 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
3051 nigel 91 #else
3052     utf8 = FALSE;
3053     #endif
3054 nigel 77
3055 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3056     (re->options & PCRE_ANCHORED) != 0;
3057    
3058 nigel 77 /* The remaining fixed data for passing around. */
3059    
3060 nigel 91 md->start_code = (const uschar *)argument_re +
3061 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3062 nigel 91 md->start_subject = (const unsigned char *)subject;
3063     md->end_subject = end_subject;
3064 ph10 442 md->start_offset = start_offset;
3065 nigel 91 md->moptions = options;
3066     md->poptions = re->options;
3067 nigel 77
3068 ph10 231 /* If the BSR option is not set at match time, copy what was set
3069     at compile time. */
3070    
3071     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3072     {
3073     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3074     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3075     #ifdef BSR_ANYCRLF
3076     else md->moptions |= PCRE_BSR_ANYCRLF;
3077 ph10 243 #endif
3078     }
3079 ph10 231
3080 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3081     nothing is set at run time, whatever was used at compile time applies. */
3082 nigel 91
3083 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3084 nigel 93 PCRE_NEWLINE_BITS)
3085 nigel 91 {
3086 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3087 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3088     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3089 nigel 91 case PCRE_NEWLINE_CR+
3090 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3091 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3092 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3093 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3094 nigel 91 }
3095    
3096 ph10 149 if (newline == -2)
3097 nigel 91 {
3098 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3099     }
3100     else if (newline < 0)
3101     {
3102 nigel 93 md->nltype = NLTYPE_ANY;
3103 nigel 91 }
3104     else
3105     {
3106 nigel 93 md->nltype = NLTYPE_FIXED;
3107     if (newline > 255)
3108     {
3109     md->nllen = 2;
3110     md->nl[0] = (newline >> 8) & 255;
3111     md->nl[1] = newline & 255;
3112     }
3113     else
3114     {
3115     md->nllen = 1;
3116     md->nl[0] = newline;
3117     }
3118 nigel 91 }
3119    
3120 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3121     back the character offset. */
3122    
3123     #ifdef SUPPORT_UTF8
3124     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3125     {
3126 ph10 598 int errorcode;
3127     int tb = _pcre_valid_utf8((uschar *)subject, length, &errorcode);
3128     if (tb >= 0)
3129     {
3130     if (offsetcount >= 2)
3131     {
3132     offsets[0] = tb;
3133     offsets[1] = errorcode;
3134     }
3135     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3136 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3137 ph10 598 }
3138 nigel 77 if (start_offset > 0 && start_offset < length)
3139     {
3140 ph10 569 tb = ((USPTR)subject)[start_offset] & 0xc0;
3141 ph10 567 if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
3142 nigel 77 }
3143     }
3144     #endif
3145    
3146     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3147     is a feature that makes it possible to save compiled regex and re-use them
3148     in other programs later. */
3149    
3150 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
3151 nigel 77
3152     /* The lower casing table and the "must be at the start of a line" flag are
3153     used in a loop when finding where to start. */
3154    
3155 nigel 91 lcc = md->tables + lcc_offset;
3156 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3157 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3158    
3159     /* Set up the first character to match, if available. The first_byte value is
3160     never set for an anchored regular expression, but the anchoring may be forced
3161     at run time, so we have to test for anchoring. The first char may be unset for
3162     an unanchored pattern, of course. If there's no first char and the pattern was
3163     studied, there may be a bitmap of possible first characters. */
3164    
3165     if (!anchored)
3166     {
3167 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3168 nigel 77 {
3169     first_byte = re->first_byte & 255;
3170     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3171     first_byte = lcc[first_byte];
3172     }
3173     else
3174     {
3175 ph10 455 if (!startline && study != NULL &&
3176     (study->flags & PCRE_STUDY_MAPPED) != 0)
3177 nigel 77 start_bits = study->start_bits;
3178     }
3179     }
3180    
3181     /* For anchored or unanchored matches, there may be a "last known required
3182     character" set. */
3183    
3184 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3185 nigel 77 {
3186     req_byte = re->req_byte & 255;
3187     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3188 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
3189 nigel 77 }
3190    
3191     /* Call the main matching function, looping for a non-anchored regex after a
3192 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3193     a match. */
3194 nigel 77
3195     for (;;)
3196     {
3197     int rc;
3198    
3199     if ((options & PCRE_DFA_RESTART) == 0)
3200     {
3201     const uschar *save_end_subject = end_subject;
3202    
3203 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3204     line of a multiline string. Implement this by temporarily adjusting
3205     end_subject so that we stop scanning at a newline. If the match fails at
3206     the newline, later code breaks this loop. */
3207 nigel 77
3208     if (firstline)
3209     {
3210 ph10 365 USPTR t = current_subject;
3211     #ifdef SUPPORT_UTF8
3212     if (utf8)
3213 ph10 371 {
3214     while (t < md->end_subject && !IS_NEWLINE(t))
3215 ph10 365 {
3216     t++;
3217     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3218 ph10 371 }
3219 ph10 365 }
3220     else
3221 ph10 371 #endif
3222 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3223 nigel 77 end_subject = t;
3224     }
3225 ph10 392
3226 ph10 389 /* There are some optimizations that avoid running the match if a known
3227 ph10 455 starting point is not found. However, there is an option that disables
3228 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3229 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3230     match-time options. */
3231 nigel 77
3232 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3233 ph10 392 {
3234 ph10 389 /* Advance to a known first byte. */
3235 ph10 392
3236 ph10 389 if (first_byte >= 0)
3237 nigel 77 {
3238 ph10 389 if (first_byte_caseless)
3239     while (current_subject < end_subject &&
3240     lcc[*current_subject] != first_byte)
3241     current_subject++;
3242     else
3243 ph10 392 while (current_subject < end_subject &&
3244 ph10 389 *current_subject != first_byte)
3245     current_subject++;
3246     }
3247 ph10 392
3248 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3249 ph10 392
3250 ph10 389 else if (startline)
3251     {
3252     if (current_subject > md->start_subject + start_offset)
3253     {
3254 ph10 365 #ifdef SUPPORT_UTF8
3255 ph10 389 if (utf8)
3256 ph10 365 {
3257 ph10 392 while (current_subject < end_subject &&
3258 ph10 389 !WAS_NEWLINE(current_subject))
3259     {
3260 ph10 365 current_subject++;
3261 ph10 389 while(current_subject < end_subject &&
3262     (*current_subject & 0xc0) == 0x80)
3263     current_subject++;
3264     }
3265 ph10 371 }
3266 ph10 389 else
3267     #endif
3268     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3269     current_subject++;
3270 ph10 392
3271 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3272     ANYCRLF, and we are now at a LF, advance the match position by one
3273     more character. */
3274 ph10 392
3275 ph10 391 if (current_subject[-1] == CHAR_CR &&
3276 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3277     current_subject < end_subject &&
3278 ph10 391 *current_subject == CHAR_NL)
3279 ph10 389 current_subject++;
3280 ph10 365 }
3281 nigel 77 }
3282 ph10 392
3283 ph10 389 /* Or to a non-unique first char after study */
3284 ph10 392
3285 ph10 389 else if (start_bits != NULL)
3286 nigel 77 {
3287 ph10 389 while (current_subject < end_subject)
3288     {
3289     register unsigned int c = *current_subject;
3290 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3291 ph10 538 {
3292     current_subject++;
3293     #ifdef SUPPORT_UTF8
3294     if (utf8)
3295 ph10 545 while(current_subject < end_subject &&
3296 ph10 538 (*current_subject & 0xc0) == 0x80) current_subject++;
3297 ph10 545 #endif
3298 ph10 538 }
3299     else break;
3300 ph10 389 }
3301 nigel 77 }
3302 ph10 392 }
3303 nigel 77
3304     /* Restore fudged end_subject */
3305    
3306     end_subject = save_end_subject;
3307    
3308 ph10 461 /* The following two optimizations are disabled for partial matching or if
3309     disabling is explicitly requested (and of course, by the test above, this
3310 ph10 455 code is not obeyed when restarting after a partial match). */
3311 ph10 461
3312 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3313     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3314 ph10 461 {
3315 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3316     is a lower bound; no actual string of that length may actually match the
3317     pattern. Although the value is, strictly, in characters, we treat it as
3318     bytes to avoid spending too much time in this optimization. */
3319 nigel 77
3320 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3321 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3322 ph10 455 return PCRE_ERROR_NOMATCH;
3323 ph10 461
3324 ph10 455 /* If req_byte is set, we know that that character must appear in the
3325     subject for the match to succeed. If the first character is set, req_byte
3326     must be later in the subject; otherwise the test starts at the match
3327     point. This optimization can save a huge amount of work in patterns with
3328     nested unlimited repeats that aren't going to match. Writing separate
3329     code for cased/caseless versions makes it go faster, as does using an
3330     autoincrement and backing off on a match.
3331 ph10 461
3332 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3333     can take a long time, and give bad performance on quite ordinary
3334     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3335     string... so we don't do this when the string is sufficiently long. */
3336 ph10 461
3337 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3338 nigel 77 {
3339 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3340 ph10 461
3341 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3342     place we found it at last time. */
3343 ph10 461
3344 ph10 455 if (p > req_byte_ptr)
3345 nigel 77 {
3346 ph10 455 if (req_byte_caseless)
3347     {
3348     while (p < end_subject)
3349     {
3350     register int pp = *p++;
3351     if (pp == req_byte || pp == req_byte2) { p--; break; }
3352     }
3353     }
3354     else
3355     {
3356     while (p < end_subject)
3357     {
3358     if (*p++ == req_byte) { p--; break; }
3359     }
3360     }
3361 ph10 461
3362 ph10 455 /* If we can't find the required character, break the matching loop,
3363     which will cause a return or PCRE_ERROR_NOMATCH. */
3364 ph10 461
3365 ph10 455 if (p >= end_subject) break;
3366 ph10 461
3367 ph10 455 /* If we have found the required character, save the point where we
3368     found it, so that we don't search again next time round the loop if
3369     the start hasn't passed this character yet. */
3370 ph10 461
3371 ph10 455 req_byte_ptr = p;
3372 nigel 77 }
3373 ph10 461 }
3374 nigel 77 }
3375 ph10 455 } /* End of optimizations that are done when not restarting */
3376 nigel 77
3377     /* OK, now we can do the business */
3378    
3379 ph10 435 md->start_used_ptr = current_subject;
3380 ph10 461
3381 nigel 77 rc = internal_dfa_exec(
3382 nigel 91 md, /* fixed match data */
3383     md->start_code, /* this subexpression's code */
3384     current_subject, /* where we currently are */
3385     start_offset, /* start offset in subject */
3386     offsets, /* offset vector */
3387     offsetcount, /* size of same */
3388     workspace, /* workspace vector */
3389     wscount, /* size of same */
3390     0, /* function recurse level */
3391     0); /* regex recurse level */
3392 nigel 77
3393     /* Anything other than "no match" means we are done, always; otherwise, carry
3394     on only if not anchored. */
3395    
3396     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3397    
3398     /* Advance to the next subject character unless we are at the end of a line
3399     and firstline is set. */
3400    
3401 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3402 nigel 77 current_subject++;
3403     if (utf8)
3404     {
3405     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3406     current_subject++;
3407     }
3408     if (current_subject > end_subject) break;
3409    
3410 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3411 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3412     or ANY or ANYCRLF, advance the match position by one more character. */
3413 nigel 93
3414 ph10 391 if (current_subject[-1] == CHAR_CR &&
3415 ph10 226 current_subject < end_subject &&
3416 ph10 391 *current_subject == CHAR_NL &&
3417 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3418 ph10 226 (md->nltype == NLTYPE_ANY ||
3419     md->nltype == NLTYPE_ANYCRLF ||
3420     md->nllen == 2))
3421 nigel 93 current_subject++;
3422    
3423     } /* "Bumpalong" loop */
3424    
3425 nigel 77 return PCRE_ERROR_NOMATCH;
3426     }
3427    
3428     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12