/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 680 - (hide annotations) (download)
Tue Sep 6 09:15:54 2011 UTC (3 years, 1 month ago) by ph10
Original Path: code/trunk/pcre_dfa_exec.c
File MIME type: text/plain
File size: 119358 byte(s)
Fix small return value bug.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 598 Copyright (c) 1997-2011 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300
102     #define OP_EXTUNI_EXTRA 320
103     #define OP_ANYNL_EXTRA 340
104     #define OP_HSPACE_EXTRA 360
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
/* Usage notes: the table is indexed directly by opcode value
(coptable[codevalue] in internal_dfa_exec()), so it must hold exactly one entry
per opcode, in opcode order. The OP_TABLE_LENGTH case labels in
internal_dfa_exec() turn a wrong table length into a compile-time error. A zero
entry means there is no inline character to load. For a non-zero entry n the
matcher reads the character at code[n], decoding it with GETCHARLEN when UTF-8
mode is on. NOTE(review): the entries of 3 belong to the upto/minupto/exact
forms; presumably the two intervening bytes hold the repeat count - confirm
against the opcode layout in pcre_internal.h. */
115 nigel 77
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 nigel 77 1, /* Char */
126 ph10 602 1, /* Chari */
127 nigel 77 1, /* not */
128 ph10 602 1, /* noti */
129 nigel 77 /* Positive single-char repeats */
130     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131     3, 3, 3, /* upto, minupto, exact */
132 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134     3, 3, 3, /* upto I, minupto I, exact I */
135     1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 nigel 77 /* Negative single-char repeats - only for chars < 256 */
137     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* NOT upto, minupto, exact */
139 ph10 602 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140     1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141     3, 3, 3, /* NOT upto I, minupto I, exact I */
142     1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 nigel 77 /* Positive type repeats */
144     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145     3, 3, 3, /* Type upto, minupto, exact */
146 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 nigel 77 /* Character class & ref repeats */
148     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149     0, 0, /* CRRANGE, CRMINRANGE */
150     0, /* CLASS */
151     0, /* NCLASS */
152     0, /* XCLASS - variable length */
153     0, /* REF */
154 ph10 602 0, /* REFI */
155 nigel 77 0, /* RECURSE */
156     0, /* CALLOUT */
157     0, /* Alt */
158     0, /* Ket */
159     0, /* KetRmax */
160     0, /* KetRmin */
161 ph10 604 0, /* KetRpos */
162 ph10 637 0, /* Reverse */
163 nigel 77 0, /* Assert */
164     0, /* Assert not */
165     0, /* Assert behind */
166     0, /* Assert behind not */
167 ph10 604 0, 0, 0, 0, 0, 0, /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, COND */
168     0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
169 ph10 498 0, 0, /* CREF, NCREF */
170     0, 0, /* RREF, NRREF */
171 nigel 93 0, /* DEF */
172 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
173 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
174     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
175     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
176     0, 0 /* CLOSE, SKIPZERO */
177 nigel 77 };
178    
179 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
180 ph10 462 remember the fact that a character could have been inspected when the end of
181 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
182     two tables that follow must also be modified. */
/* Usage notes: the table is indexed directly by opcode value, one entry per
opcode in opcode order, and its length is checked at compile time by the
OP_TABLE_LENGTH case labels in internal_dfa_exec(). An entry is 1 if the opcode
inspects a subject character and 0 otherwise. internal_dfa_exec() consults it
only when there is no current character (clen == 0): a non-zero entry for the
state's opcode sets could_continue to TRUE. */
183 ph10 462
184     static const uschar poptable[] = {
185     0, /* End */
186 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
187 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
188     1, 1, 1, /* Any, AllAny, Anybyte */
189 ph10 498 1, 1, /* \P, \p */
190 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
191 ph10 498 1, /* \X */
192 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
193 ph10 462 1, /* Char */
194 ph10 602 1, /* Chari */
195 ph10 462 1, /* not */
196 ph10 602 1, /* noti */
197 ph10 462 /* Positive single-char repeats */
198     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
199     1, 1, 1, /* upto, minupto, exact */
200     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
201 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
202     1, 1, 1, /* upto I, minupto I, exact I */
203     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
204 ph10 462 /* Negative single-char repeats - only for chars < 256 */
205     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
206     1, 1, 1, /* NOT upto, minupto, exact */
207     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
208 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
209     1, 1, 1, /* NOT upto I, minupto I, exact I */
210     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
211 ph10 462 /* Positive type repeats */
212     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
213     1, 1, 1, /* Type upto, minupto, exact */
214     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
215     /* Character class & ref repeats */
216     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
217     1, 1, /* CRRANGE, CRMINRANGE */
218     1, /* CLASS */
219     1, /* NCLASS */
220     1, /* XCLASS - variable length */
221     0, /* REF */
222 ph10 602 0, /* REFI */
223 ph10 462 0, /* RECURSE */
224     0, /* CALLOUT */
225     0, /* Alt */
226     0, /* Ket */
227     0, /* KetRmax */
228     0, /* KetRmin */
229 ph10 604 0, /* KetRpos */
230 ph10 637 0, /* Reverse */
231 ph10 462 0, /* Assert */
232     0, /* Assert not */
233     0, /* Assert behind */
234     0, /* Assert behind not */
235 ph10 604 0, 0, 0, 0, 0, 0, /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, COND */
236     0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
237 ph10 498 0, 0, /* CREF, NCREF */
238     0, 0, /* RREF, NRREF */
239 ph10 462 0, /* DEF */
240 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
241 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
242     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
243     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
244     0, 0 /* CLOSE, SKIPZERO */
245 ph10 462 };
246    
247 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
248     and \w */
/* Layout note: the 14 entries follow the same order as the first 14 entries of
coptable: six zero entries (End, \A, \G, \K, \B, \b), then one entry each for
\D, \d, \S, \s, \W, \w, then OP_ANY and OP_ALLANY. In toptable1 both members of
each \D/\d, \S/\s and \W/\w pair carry the same ctype_* bit, and the OP_ANY and
OP_ALLANY entries are 0. NOTE(review): the code that consumes these tables is
not in this part of the file; presumably toptable1 supplies the bit to select
from the character's ctypes entry - confirm at the point of use. */
249    
250 ph10 327 static const uschar toptable1[] = {
251 ph10 168 0, 0, 0, 0, 0, 0,
252 nigel 77 ctype_digit, ctype_digit,
253     ctype_space, ctype_space,
254     ctype_word, ctype_word,
255 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
256 nigel 77 };
257    
/* Second of the two tables used for testing \D, \d, \S, \s, \W and \w. It has
14 entries in the same order as toptable1. Here only the first member of each
pair (the \D, \S and \W positions) carries the ctype_* bit, the second member
is 0, and the OP_ANY and OP_ALLANY entries are 1. NOTE(review): presumably this
value is combined with the bit selected through toptable1 so that a non-zero
result means the character matches - the consuming code is outside this part of
the file; confirm there. */
258 ph10 327 static const uschar toptable2[] = {
259 ph10 168 0, 0, 0, 0, 0, 0,
260 nigel 77 ctype_digit, 0,
261     ctype_space, 0,
262     ctype_word, 0,
263 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
264 nigel 77 };
265    
266    
267     /* Structure for holding data about a particular state, which is in effect the
268     current data for an active path through the match tree. It must consist
269     entirely of ints because the working vector we are passed, and which we put
270     these structures in, is a vector of ints. */
271    
272     typedef struct stateblock {
273     int offset; /* Offset to opcode from start_code; negated while the state is held off */
274     int count; /* Count for repeats */
275     int data; /* Some use extra data, e.g. characters still to skip when offset < 0 */
276     } stateblock;
277    
/* Number of ints occupied by one stateblock. internal_dfa_exec() uses it to
convert the workspace size, given in ints, into a capacity counted in states. */
278     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
279    
280    
281 ph10 475 #ifdef PCRE_DEBUG
282 nigel 77 /*************************************************
283     * Print character string *
284     *************************************************/
285    
286     /* Character string printing function for debugging. It is compiled only when
PCRE_DEBUG is defined. Bytes for which isprint() is true are written as
themselves; every other byte is written as a \x escape followed by at least two
lower-case hexadecimal digits.
287    
288     Arguments:
289     p points to string (read byte by byte as unsigned char)
290     length number of bytes to print; nothing is printed if it is <= 0
291     f where to print
292    
293     Returns: nothing
294     */
295    
296     static void
297     pchars(unsigned char *p, int length, FILE *f)
298     {
299     int c;
300     while (length-- > 0)
301     {
302     if (isprint(c = *(p++))) /* fetch the next byte into c and advance p */
303     fprintf(f, "%c", c);
304     else
305     fprintf(f, "\\x%02x", c);
306     }
307     }
308     #endif
309    
310    
311    
312     /*************************************************
313     * Execute a Regular Expression - DFA engine *
314     *************************************************/
315    
316     /* This internal function applies a compiled pattern to a subject string,
317     starting at a given point, using a DFA engine. This function is called from the
318     external one, possibly multiple times if the pattern is not anchored. The
319     function calls itself recursively for some kinds of subpattern.
320    
321     Arguments:
322     md the match_data block with fixed information
323     this_start_code the opening bracket of this subexpression's code
324     current_subject where we currently are in the subject string
325     start_offset start offset in the subject string
326     offsets vector to contain the matching string offsets
327     offsetcount size of same
328     workspace vector of workspace
329     wscount size of same
330     rlevel function call recursion level
331    
332 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
333 ph10 341 = 0 => offsets overflowed; longest matches are present
334 nigel 77 -1 => failed to match
335     < -1 => some kind of unexpected problem
336    
337     The following macros are used for adding states to the two state vectors (one
338     for the current character, one for the following character). */
339    
/* State-list helper macros. Each one expands to an if/else statement with no
trailing semicolon, so it must be written as a statement followed by ';'. The
expansions refer to active_count, new_count, wscount, rlevel and the
next_active_state / next_new_state pointers, which must be in scope where the
macro is used (they are locals and parameters of internal_dfa_exec()).

  ADD_ACTIVE(x,y)         append a state with offset x and count y to the
                          active list
  ADD_ACTIVE_DATA(x,y,z)  the same, and also set the state's data field to z
  ADD_NEW(x,y)            append a state with offset x and count y to the new
                          list
  ADD_NEW_DATA(x,y,z)     the same, and also set the state's data field to z

The relevant counter (active_count or new_count) is incremented on every use.
If it had already reached wscount, nothing is stored and the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The two-argument forms do not write the data
field, so it keeps whatever value was already in that slot. Each argument
appears more than once in the expansion (in the store and again in the DPRINTF
call), so arguments should be free of side effects. NOTE(review): whether the
DPRINTF expansion actually evaluates them depends on how DPRINTF is defined
elsewhere. */
340     #define ADD_ACTIVE(x,y) \
341     if (active_count++ < wscount) \
342     { \
343     next_active_state->offset = (x); \
344     next_active_state->count = (y); \
345     next_active_state++; \
346     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
347     } \
348     else return PCRE_ERROR_DFA_WSSIZE
349    
350     #define ADD_ACTIVE_DATA(x,y,z) \
351     if (active_count++ < wscount) \
352     { \
353     next_active_state->offset = (x); \
354     next_active_state->count = (y); \
355     next_active_state->data = (z); \
356     next_active_state++; \
357     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
358     } \
359     else return PCRE_ERROR_DFA_WSSIZE
360    
361     #define ADD_NEW(x,y) \
362     if (new_count++ < wscount) \
363     { \
364     next_new_state->offset = (x); \
365     next_new_state->count = (y); \
366     next_new_state++; \
367     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
368     } \
369     else return PCRE_ERROR_DFA_WSSIZE
370    
371     #define ADD_NEW_DATA(x,y,z) \
372     if (new_count++ < wscount) \
373     { \
374     next_new_state->offset = (x); \
375     next_new_state->count = (y); \
376     next_new_state->data = (z); \
377     next_new_state++; \
378     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
379     } \
380     else return PCRE_ERROR_DFA_WSSIZE
381    
382     /* And now, here is the code */
383    
384     static int
385     internal_dfa_exec(
386     dfa_match_data *md,
387     const uschar *this_start_code,
388     const uschar *current_subject,
389     int start_offset,
390     int *offsets,
391     int offsetcount,
392     int *workspace,
393     int wscount,
394 ph10 642 int rlevel)
395 nigel 77 {
396     stateblock *active_states, *new_states, *temp_states;
397     stateblock *next_active_state, *next_new_state;
398    
399     const uschar *ctypes, *lcc, *fcc;
400     const uschar *ptr;
401 nigel 93 const uschar *end_code, *first_op;
402 nigel 77
403 ph10 642 dfa_recursion_info new_recursive;
404    
405 nigel 77 int active_count, new_count, match_count;
406    
407     /* Some fields in the md block are frequently referenced, so we load them into
408     independent variables in the hope that this will perform better. */
409    
410     const uschar *start_subject = md->start_subject;
411     const uschar *end_subject = md->end_subject;
412     const uschar *start_code = md->start_code;
413    
414 nigel 87 #ifdef SUPPORT_UTF8
415 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
416 nigel 93 #else
417     BOOL utf8 = FALSE;
418 nigel 87 #endif
419 nigel 77
420     rlevel++;
421     offsetcount &= (-2);
422    
423     wscount -= 2;
424     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
425     (2 * INTS_PER_STATEBLOCK);
426    
427     DPRINTF(("\n%.*s---------------------\n"
428 ph10 642 "%.*sCall to internal_dfa_exec f=%d\n",
429     rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
430 nigel 77
431     ctypes = md->tables + ctypes_offset;
432     lcc = md->tables + lcc_offset;
433     fcc = md->tables + fcc_offset;
434    
435     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
436    
437     active_states = (stateblock *)(workspace + 2);
438     next_new_state = new_states = active_states + wscount;
439     new_count = 0;
440    
441 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
442 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
443     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
444 nigel 93
445 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
446     the alternative states onto the list, and find out where the end is. This
447     makes it possible to use this function recursively, when we want to stop at a
448     matching internal ket rather than at the end.
449    
450     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
451     a backward assertion. In that case, we have to find out the maximum amount to
452     move back, and set up each alternative appropriately. */
453    
454 nigel 93 if (*first_op == OP_REVERSE)
455 nigel 77 {
456     int max_back = 0;
457     int gone_back;
458    
459     end_code = this_start_code;
460     do
461     {
462     int back = GET(end_code, 2+LINK_SIZE);
463     if (back > max_back) max_back = back;
464     end_code += GET(end_code, 1);
465     }
466     while (*end_code == OP_ALT);
467    
468     /* If we can't go back the amount required for the longest lookbehind
469     pattern, go back as far as we can; some alternatives may still be viable. */
470    
471     #ifdef SUPPORT_UTF8
472     /* In character mode we have to step back character by character */
473    
474     if (utf8)
475     {
476     for (gone_back = 0; gone_back < max_back; gone_back++)
477     {
478     if (current_subject <= start_subject) break;
479     current_subject--;
480     while (current_subject > start_subject &&
481     (*current_subject & 0xc0) == 0x80)
482     current_subject--;
483     }
484     }
485     else
486     #endif
487    
488     /* In byte-mode we can do this quickly. */
489    
490     {
491     gone_back = (current_subject - max_back < start_subject)?
492 ph10 530 (int)(current_subject - start_subject) : max_back;
493 nigel 77 current_subject -= gone_back;
494     }
495 ph10 461
496 ph10 435 /* Save the earliest consulted character */
497 nigel 77
498 ph10 461 if (current_subject < md->start_used_ptr)
499     md->start_used_ptr = current_subject;
500    
501 nigel 77 /* Now we can process the individual branches. */
502    
503     end_code = this_start_code;
504     do
505     {
506     int back = GET(end_code, 2+LINK_SIZE);
507     if (back <= gone_back)
508     {
509 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
510 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
511     }
512     end_code += GET(end_code, 1);
513     }
514     while (*end_code == OP_ALT);
515     }
516    
517     /* This is the code for a "normal" subpattern (not a backward assertion). The
518     start of a whole pattern is always one of these. If we are at the top level,
519     we may be asked to restart matching from the same point that we reached for a
520     previous partial match. We still have to scan through the top-level branches to
521     find the end state. */
522    
523     else
524     {
525     end_code = this_start_code;
526    
527     /* Restarting */
528    
529     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
530     {
531     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
532     new_count = workspace[1];
533     if (!workspace[0])
534     memcpy(new_states, active_states, new_count * sizeof(stateblock));
535     }
536    
537     /* Not restarting */
538    
539     else
540     {
541 nigel 93 int length = 1 + LINK_SIZE +
542 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
543 ph10 654 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
544 ph10 604 2:0);
545 nigel 77 do
546     {
547 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
548 nigel 77 end_code += GET(end_code, 1);
549 nigel 93 length = 1 + LINK_SIZE;
550 nigel 77 }
551     while (*end_code == OP_ALT);
552     }
553     }
554    
555     workspace[0] = 0; /* Bit indicating which vector is current */
556    
557     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
558    
559     /* Loop for scanning the subject */
560    
561     ptr = current_subject;
562     for (;;)
563     {
564     int i, j;
565 nigel 91 int clen, dlen;
566     unsigned int c, d;
567 ph10 428 int forced_fail = 0;
568 ph10 462 BOOL could_continue = FALSE;
569 nigel 77
570     /* Make the new state list into the active state list and empty the
571     new state list. */
572    
573     temp_states = active_states;
574     active_states = new_states;
575     new_states = temp_states;
576     active_count = new_count;
577     new_count = 0;
578    
579     workspace[0] ^= 1; /* Remember for the restarting feature */
580     workspace[1] = active_count;
581    
582 ph10 475 #ifdef PCRE_DEBUG
583 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
584     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
585     printf("\"\n");
586    
587     printf("%.*sActive states: ", rlevel*2-2, SP);
588     for (i = 0; i < active_count; i++)
589     printf("%d/%d ", active_states[i].offset, active_states[i].count);
590     printf("\n");
591     #endif
592    
593     /* Set the pointers for adding new states */
594    
595     next_active_state = active_states + active_count;
596     next_new_state = new_states;
597    
598     /* Load the current character from the subject outside the loop, as many
599     different states may want to look at it, and we assume that at least one
600     will. */
601    
602     if (ptr < end_subject)
603     {
604 nigel 93 clen = 1; /* Number of bytes in the character */
605 nigel 77 #ifdef SUPPORT_UTF8
606     if (utf8) { GETCHARLEN(c, ptr, clen); } else
607     #endif /* SUPPORT_UTF8 */
608     c = *ptr;
609     }
610     else
611     {
612 nigel 93 clen = 0; /* This indicates the end of the subject */
613     c = NOTACHAR; /* This value should never actually be used */
614 nigel 77 }
615    
616     /* Scan up the active states and act on each one. The result of an action
617     may be to add more states to the currently active list (e.g. on hitting a
618     parenthesis) or it may be to put states on the new list, for considering
619     when we move the character pointer on. */
620    
621     for (i = 0; i < active_count; i++)
622     {
623     stateblock *current_state = active_states + i;
624 ph10 654 BOOL caseless = FALSE;
625 nigel 77 const uschar *code;
626     int state_offset = current_state->offset;
627 ph10 397 int count, codevalue, rrc;
628 nigel 77
629 ph10 475 #ifdef PCRE_DEBUG
630 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
631 nigel 93 if (clen == 0) printf("EOL\n");
632 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
633     else printf("0x%02x\n", c);
634     #endif
635    
636     /* A negative offset is a special case meaning "hold off going to this
637     (negated) state until the number of characters in the data field have
638     been skipped". */
639    
640     if (state_offset < 0)
641     {
642     if (current_state->data > 0)
643     {
644     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
645     ADD_NEW_DATA(state_offset, current_state->count,
646     current_state->data - 1);
647     continue;
648     }
649     else
650     {
651     current_state->offset = state_offset = -state_offset;
652     }
653     }
654    
655 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
656 ph10 439 See the note at the head of this module about the possibility of improving
657     performance here. */
658 nigel 77
659     for (j = 0; j < i; j++)
660     {
661     if (active_states[j].offset == state_offset &&
662     active_states[j].count == current_state->count)
663     {
664     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
665     goto NEXT_ACTIVE_STATE;
666     }
667     }
668    
669     /* The state offset is the offset to the opcode */
670    
671     code = start_code + state_offset;
672     codevalue = *code;
673    
674 ph10 463 /* If this opcode inspects a character, but we are at the end of the
675     subject, remember the fact for use when testing for a partial match. */
676    
677 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
678 ph10 463 could_continue = TRUE;
679 ph10 462
680 nigel 77 /* If this opcode is followed by an inline character, load it. It is
681     tempting to test for the presence of a subject character here, but that
682     is wrong, because sometimes zero repetitions of the subject are
683     permitted.
684    
685     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
686 ph10 178 argument that is not a data character - but is always one byte long. We
687     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
688     this case. To keep the other cases fast, convert these ones to new opcodes.
689     */
690 nigel 77
691     if (coptable[codevalue] > 0)
692     {
693     dlen = 1;
694     #ifdef SUPPORT_UTF8
695     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
696     #endif /* SUPPORT_UTF8 */
697     d = code[coptable[codevalue]];
698     if (codevalue >= OP_TYPESTAR)
699     {
700 nigel 93 switch(d)
701     {
702     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
703     case OP_NOTPROP:
704     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
705     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
706     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
707 ph10 178 case OP_NOT_HSPACE:
708 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
709 ph10 178 case OP_NOT_VSPACE:
710 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
711 nigel 93 default: break;
712     }
713 nigel 77 }
714     }
715     else
716     {
717     dlen = 0; /* Not strictly necessary, but compilers moan */
718 nigel 93 d = NOTACHAR; /* if these variables are not set. */
719 nigel 77 }
720    
721    
722     /* Now process the individual opcodes */
723    
724     switch (codevalue)
725     {
726 ph10 498 /* ========================================================================== */
727     /* These cases are never obeyed. This is a fudge that causes a compile-
728     time error if the vectors coptable or poptable, which are indexed by
729     opcode, are not the correct length. It seems to be the only way to do
730     such a check at compile time, as the sizeof() operator does not work
731     in the C preprocessor. */
732 ph10 507
733 ph10 498 case OP_TABLE_LENGTH:
734 ph10 507 case OP_TABLE_LENGTH +
735 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
736     (sizeof(poptable) == OP_TABLE_LENGTH)):
737 ph10 507 break;
738 nigel 77
739     /* ========================================================================== */
740     /* Reached a closing bracket. If not at the end of the pattern, carry
741 ph10 654 on with the next opcode. For repeating opcodes, also add the repeat
742     state. Note that KETRPOS will always be encountered at the end of the
743     subpattern, because the possessive subpattern repeats are always handled
744 ph10 604 using recursive calls. Thus, it never adds any new states.
745 ph10 654
746 ph10 604 At the end of the (sub)pattern, unless we have an empty string and
747 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
748 ph10 442 start of the subject, save the match data, shifting up all previous
749 nigel 77 matches so we always have the longest first. */
750    
751     case OP_KET:
752     case OP_KETRMIN:
753     case OP_KETRMAX:
754 ph10 654 case OP_KETRPOS:
755 nigel 77 if (code != end_code)
756     {
757     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
758     if (codevalue != OP_KET)
759     {
760     ADD_ACTIVE(state_offset - GET(code, 1), 0);
761     }
762     }
763 ph10 461 else
764 nigel 77 {
765 ph10 461 if (ptr > current_subject ||
766 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
767     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
768     current_subject > start_subject + md->start_offset)))
769 nigel 77 {
770 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
771 ph10 680 else if (match_count > 0 && ++match_count * 2 > offsetcount)
772 ph10 428 match_count = 0;
773     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
774     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
775     if (offsetcount >= 2)
776     {
777 ph10 530 offsets[0] = (int)(current_subject - start_subject);
778     offsets[1] = (int)(ptr - start_subject);
779 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
780     offsets[1] - offsets[0], current_subject));
781     }
782     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
783     {
784     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
785     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
786     match_count, rlevel*2-2, SP));
787     return match_count;
788     }
789 ph10 461 }
790 nigel 77 }
791     break;
792    
793     /* ========================================================================== */
794     /* These opcodes add to the current list of states without looking
795     at the current character. */
796    
797     /*-----------------------------------------------------------------*/
798     case OP_ALT:
799     do { code += GET(code, 1); } while (*code == OP_ALT);
800 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
801 nigel 77 break;
802    
803     /*-----------------------------------------------------------------*/
804     case OP_BRA:
805 nigel 93 case OP_SBRA:
806 nigel 77 do
807     {
808 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
809 nigel 77 code += GET(code, 1);
810     }
811     while (*code == OP_ALT);
812     break;
813    
814     /*-----------------------------------------------------------------*/
815 nigel 93 case OP_CBRA:
816     case OP_SCBRA:
817 ph10 530 ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
818 nigel 93 code += GET(code, 1);
819     while (*code == OP_ALT)
820     {
821 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
822 nigel 93 code += GET(code, 1);
823     }
824     break;
825    
826     /*-----------------------------------------------------------------*/
827 nigel 77 case OP_BRAZERO:
828     case OP_BRAMINZERO:
829     ADD_ACTIVE(state_offset + 1, 0);
830     code += 1 + GET(code, 2);
831     while (*code == OP_ALT) code += GET(code, 1);
832 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
833 nigel 77 break;
834    
835     /*-----------------------------------------------------------------*/
836 ph10 335 case OP_SKIPZERO:
837     code += 1 + GET(code, 2);
838     while (*code == OP_ALT) code += GET(code, 1);
839 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
840 ph10 335 break;
841    
842     /*-----------------------------------------------------------------*/
843 nigel 77 case OP_CIRC:
844 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
845     { ADD_ACTIVE(state_offset + 1, 0); }
846     break;
847    
848     /*-----------------------------------------------------------------*/
849     case OP_CIRCM:
850 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
851 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
852 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
853     break;
854    
855     /*-----------------------------------------------------------------*/
856     case OP_EOD:
857 ph10 579 if (ptr >= end_subject)
858     {
859 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
860     could_continue = TRUE;
861     else { ADD_ACTIVE(state_offset + 1, 0); }
862     }
863 nigel 77 break;
864    
865     /*-----------------------------------------------------------------*/
866     case OP_SOD:
867     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
868     break;
869    
870     /*-----------------------------------------------------------------*/
871     case OP_SOM:
872     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
873     break;
874    
875    
876     /* ========================================================================== */
877     /* These opcodes inspect the next subject character, and sometimes
878     the previous one as well, but do not have an argument. The variable
879     clen contains the length of the current character and is zero if we are
880     at the end of the subject. */
881    
882     /*-----------------------------------------------------------------*/
883     case OP_ANY:
884 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
885 nigel 77 { ADD_NEW(state_offset + 1, 0); }
886     break;
887    
888     /*-----------------------------------------------------------------*/
889 ph10 341 case OP_ALLANY:
890     if (clen > 0)
891     { ADD_NEW(state_offset + 1, 0); }
892     break;
893    
894     /*-----------------------------------------------------------------*/
895 nigel 77 case OP_EODN:
896 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
897     could_continue = TRUE;
898     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
899 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
900     break;
901    
902     /*-----------------------------------------------------------------*/
903     case OP_DOLL:
904     if ((md->moptions & PCRE_NOTEOL) == 0)
905     {
906 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
907     could_continue = TRUE;
908     else if (clen == 0 ||
909 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
910 ph10 602 (ptr == end_subject - md->nllen)
911 nigel 91 ))
912 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
913     }
914 ph10 602 break;
915    
916     /*-----------------------------------------------------------------*/
917     case OP_DOLLM:
918     if ((md->moptions & PCRE_NOTEOL) == 0)
919     {
920     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
921     could_continue = TRUE;
922     else if (clen == 0 ||
923     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
924     { ADD_ACTIVE(state_offset + 1, 0); }
925     }
926     else if (IS_NEWLINE(ptr))
927 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
928     break;
929    
930     /*-----------------------------------------------------------------*/
931    
932     case OP_DIGIT:
933     case OP_WHITESPACE:
934     case OP_WORDCHAR:
935     if (clen > 0 && c < 256 &&
936     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
937     { ADD_NEW(state_offset + 1, 0); }
938     break;
939    
940     /*-----------------------------------------------------------------*/
941     case OP_NOT_DIGIT:
942     case OP_NOT_WHITESPACE:
943     case OP_NOT_WORDCHAR:
944     if (clen > 0 && (c >= 256 ||
945     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
946     { ADD_NEW(state_offset + 1, 0); }
947     break;
948    
949     /*-----------------------------------------------------------------*/
950     case OP_WORD_BOUNDARY:
951     case OP_NOT_WORD_BOUNDARY:
952     {
953     int left_word, right_word;
954    
955     if (ptr > start_subject)
956     {
957     const uschar *temp = ptr - 1;
958 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
959 nigel 77 #ifdef SUPPORT_UTF8
960     if (utf8) BACKCHAR(temp);
961     #endif
962     GETCHARTEST(d, temp);
963 ph10 535 #ifdef SUPPORT_UCP
964 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
965     {
966     if (d == '_') left_word = TRUE; else
967 ph10 535 {
968 ph10 518 int cat = UCD_CATEGORY(d);
969     left_word = (cat == ucp_L || cat == ucp_N);
970 ph10 535 }
971     }
972     else
973     #endif
974 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
975     }
976 ph10 518 else left_word = FALSE;
977 nigel 77
978 ph10 461 if (clen > 0)
979 ph10 535 {
980     #ifdef SUPPORT_UCP
981 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
982     {
983     if (c == '_') right_word = TRUE; else
984 ph10 535 {
985 ph10 518 int cat = UCD_CATEGORY(c);
986     right_word = (cat == ucp_L || cat == ucp_N);
987 ph10 535 }
988     }
989     else
990     #endif
991 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
992 ph10 535 }
993 ph10 518 else right_word = FALSE;
994 nigel 77
995     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
996     { ADD_ACTIVE(state_offset + 1, 0); }
997     }
998     break;
999    
1000    
1001     /*-----------------------------------------------------------------*/
1002     /* Check the next character by Unicode property. We will get here only
1003     if the support is in the binary; otherwise a compile-time error occurs.
1004     */
1005    
1006 ph10 151 #ifdef SUPPORT_UCP
1007 nigel 77 case OP_PROP:
1008     case OP_NOTPROP:
1009     if (clen > 0)
1010     {
1011 nigel 87 BOOL OK;
1012 ph10 349 const ucd_record * prop = GET_UCD(c);
1013 nigel 87 switch(code[1])
1014 nigel 77 {
1015 nigel 87 case PT_ANY:
1016     OK = TRUE;
1017     break;
1018    
1019     case PT_LAMP:
1020 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1021 ph10 517 prop->chartype == ucp_Lt;
1022 nigel 87 break;
1023    
1024     case PT_GC:
1025 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1026 nigel 87 break;
1027    
1028     case PT_PC:
1029 ph10 349 OK = prop->chartype == code[2];
1030 nigel 87 break;
1031    
1032     case PT_SC:
1033 ph10 349 OK = prop->script == code[2];
1034 nigel 87 break;
1035 ph10 535
1036 ph10 517 /* These are specials for combination cases. */
1037 ph10 535
1038 ph10 517 case PT_ALNUM:
1039     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1040     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1041 ph10 535 break;
1042    
1043 ph10 517 case PT_SPACE: /* Perl space */
1044     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1045     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1046 ph10 535 break;
1047    
1048 ph10 517 case PT_PXSPACE: /* POSIX space */
1049     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1050     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1051     c == CHAR_FF || c == CHAR_CR;
1052 ph10 535 break;
1053    
1054 ph10 517 case PT_WORD:
1055     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1056     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1057     c == CHAR_UNDERSCORE;
1058 ph10 535 break;
1059 nigel 87
1060     /* Should never occur, but keep compilers from grumbling. */
1061    
1062     default:
1063     OK = codevalue != OP_PROP;
1064     break;
1065 nigel 77 }
1066 nigel 87
1067     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1068 nigel 77 }
1069     break;
1070     #endif
1071    
1072    
1073    
1074     /* ========================================================================== */
1075     /* These opcodes likewise inspect the subject character, but have an
1076     argument that is not a data character. It is one of these opcodes:
1077 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1078     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1079 nigel 77
1080     case OP_TYPEPLUS:
1081     case OP_TYPEMINPLUS:
1082 nigel 93 case OP_TYPEPOSPLUS:
1083 nigel 77 count = current_state->count; /* Already matched */
1084     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1085     if (clen > 0)
1086     {
1087     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1088     (c < 256 &&
1089 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1090 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1091     {
1092 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1093     {
1094     active_count--; /* Remove non-match possibility */
1095     next_active_state--;
1096     }
1097 nigel 77 count++;
1098     ADD_NEW(state_offset, count);
1099     }
1100     }
1101     break;
1102    
1103     /*-----------------------------------------------------------------*/
1104     case OP_TYPEQUERY:
1105     case OP_TYPEMINQUERY:
1106 nigel 93 case OP_TYPEPOSQUERY:
1107 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1108     if (clen > 0)
1109     {
1110     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1111     (c < 256 &&
1112 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1113 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1114     {
1115 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1116     {
1117     active_count--; /* Remove non-match possibility */
1118     next_active_state--;
1119     }
1120 nigel 77 ADD_NEW(state_offset + 2, 0);
1121     }
1122     }
1123     break;
1124    
1125     /*-----------------------------------------------------------------*/
1126     case OP_TYPESTAR:
1127     case OP_TYPEMINSTAR:
1128 nigel 93 case OP_TYPEPOSSTAR:
1129 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1130     if (clen > 0)
1131     {
1132     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1133     (c < 256 &&
1134 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1135 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1136     {
1137 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1138     {
1139     active_count--; /* Remove non-match possibility */
1140     next_active_state--;
1141     }
1142 nigel 77 ADD_NEW(state_offset, 0);
1143     }
1144     }
1145     break;
1146    
1147     /*-----------------------------------------------------------------*/
1148     case OP_TYPEEXACT:
1149 nigel 93 count = current_state->count; /* Number already matched */
1150     if (clen > 0)
1151     {
1152     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1153     (c < 256 &&
1154 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1155 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1156     {
1157     if (++count >= GET2(code, 1))
1158     { ADD_NEW(state_offset + 4, 0); }
1159     else
1160     { ADD_NEW(state_offset, count); }
1161     }
1162     }
1163     break;
1164    
1165     /*-----------------------------------------------------------------*/
1166 nigel 77 case OP_TYPEUPTO:
1167     case OP_TYPEMINUPTO:
1168 nigel 93 case OP_TYPEPOSUPTO:
1169     ADD_ACTIVE(state_offset + 4, 0);
1170 nigel 77 count = current_state->count; /* Number already matched */
1171     if (clen > 0)
1172     {
1173     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1174     (c < 256 &&
1175 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1176 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1177     {
1178 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1179     {
1180     active_count--; /* Remove non-match possibility */
1181     next_active_state--;
1182     }
1183 nigel 77 if (++count >= GET2(code, 1))
1184     { ADD_NEW(state_offset + 4, 0); }
1185     else
1186     { ADD_NEW(state_offset, count); }
1187     }
1188     }
1189     break;
1190    
1191     /* ========================================================================== */
1192     /* These are virtual opcodes that are used when something like
1193 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical/horizontal whitespace type (OP_VSPACE, OP_HSPACE, or a negation of one) as its
1194     argument. It keeps the code above fast for the other cases. The argument
1195     is in the d variable. */
1196 nigel 77
1197 ph10 151 #ifdef SUPPORT_UCP
1198 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1199     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1200 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1201 nigel 77 count = current_state->count; /* Already matched */
1202 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1203 nigel 77 if (clen > 0)
1204     {
1205 nigel 87 BOOL OK;
1206 ph10 349 const ucd_record * prop = GET_UCD(c);
1207 nigel 87 switch(code[2])
1208     {
1209     case PT_ANY:
1210     OK = TRUE;
1211     break;
1212    
1213     case PT_LAMP:
1214 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1215 ph10 517 prop->chartype == ucp_Lt;
1216 nigel 87 break;
1217    
1218     case PT_GC:
1219 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1220 nigel 87 break;
1221    
1222     case PT_PC:
1223 ph10 349 OK = prop->chartype == code[3];
1224 nigel 87 break;
1225    
1226     case PT_SC:
1227 ph10 349 OK = prop->script == code[3];
1228 nigel 87 break;
1229    
1230 ph10 517 /* These are specials for combination cases. */
1231 ph10 535
1232 ph10 517 case PT_ALNUM:
1233     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1234     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1235 ph10 535 break;
1236    
1237 ph10 517 case PT_SPACE: /* Perl space */
1238     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1239     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1240 ph10 535 break;
1241    
1242 ph10 517 case PT_PXSPACE: /* POSIX space */
1243     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1244     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1245     c == CHAR_FF || c == CHAR_CR;
1246 ph10 535 break;
1247    
1248 ph10 517 case PT_WORD:
1249     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1250     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1251     c == CHAR_UNDERSCORE;
1252 ph10 535 break;
1253 ph10 517
1254 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1255    
1256     default:
1257     OK = codevalue != OP_PROP;
1258     break;
1259     }
1260    
1261 nigel 93 if (OK == (d == OP_PROP))
1262     {
1263     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1264     {
1265     active_count--; /* Remove non-match possibility */
1266     next_active_state--;
1267     }
1268     count++;
1269     ADD_NEW(state_offset, count);
1270     }
1271 nigel 77 }
1272     break;
1273    
1274     /*-----------------------------------------------------------------*/
1275     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1276     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1277 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1278 nigel 77 count = current_state->count; /* Already matched */
1279     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1280 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1281 nigel 77 {
1282     const uschar *nptr = ptr + clen;
1283     int ncount = 0;
1284 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1285     {
1286     active_count--; /* Remove non-match possibility */
1287     next_active_state--;
1288     }
1289 nigel 77 while (nptr < end_subject)
1290     {
1291     int nd;
1292     int ndlen = 1;
1293     GETCHARLEN(nd, nptr, ndlen);
1294 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1295 nigel 77 ncount++;
1296     nptr += ndlen;
1297     }
1298     count++;
1299     ADD_NEW_DATA(-state_offset, count, ncount);
1300     }
1301     break;
1302 ph10 151 #endif
1303 nigel 77
1304     /*-----------------------------------------------------------------*/
1305 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1306     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1307     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1308     count = current_state->count; /* Already matched */
1309     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1310     if (clen > 0)
1311     {
1312     int ncount = 0;
1313     switch (c)
1314     {
1315     case 0x000b:
1316     case 0x000c:
1317     case 0x0085:
1318     case 0x2028:
1319     case 0x2029:
1320 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1321     goto ANYNL01;
1322    
1323     case 0x000d:
1324     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1325     /* Fall through */
1326    
1327     ANYNL01:
1328     case 0x000a:
1329 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1330     {
1331     active_count--; /* Remove non-match possibility */
1332     next_active_state--;
1333     }
1334     count++;
1335     ADD_NEW_DATA(-state_offset, count, ncount);
1336     break;
1337 ph10 231
1338 nigel 93 default:
1339     break;
1340     }
1341     }
1342     break;
1343    
1344     /*-----------------------------------------------------------------*/
1345 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1346     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1347     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1348     count = current_state->count; /* Already matched */
1349     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1350     if (clen > 0)
1351     {
1352 ph10 182 BOOL OK;
1353 ph10 178 switch (c)
1354     {
1355     case 0x000a:
1356     case 0x000b:
1357     case 0x000c:
1358     case 0x000d:
1359     case 0x0085:
1360     case 0x2028:
1361     case 0x2029:
1362     OK = TRUE;
1363 ph10 182 break;
1364 ph10 178
1365     default:
1366     OK = FALSE;
1367 ph10 182 break;
1368 ph10 178 }
1369    
1370     if (OK == (d == OP_VSPACE))
1371 ph10 182 {
1372 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1373     {
1374     active_count--; /* Remove non-match possibility */
1375     next_active_state--;
1376     }
1377     count++;
1378     ADD_NEW_DATA(-state_offset, count, 0);
1379     }
1380     }
1381     break;
1382    
1383     /*-----------------------------------------------------------------*/
1384     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1385     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1386     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1387     count = current_state->count; /* Already matched */
1388     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1389     if (clen > 0)
1390     {
1391 ph10 182 BOOL OK;
1392 ph10 178 switch (c)
1393     {
1394     case 0x09: /* HT */
1395     case 0x20: /* SPACE */
1396     case 0xa0: /* NBSP */
1397     case 0x1680: /* OGHAM SPACE MARK */
1398     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1399     case 0x2000: /* EN QUAD */
1400     case 0x2001: /* EM QUAD */
1401     case 0x2002: /* EN SPACE */
1402     case 0x2003: /* EM SPACE */
1403     case 0x2004: /* THREE-PER-EM SPACE */
1404     case 0x2005: /* FOUR-PER-EM SPACE */
1405     case 0x2006: /* SIX-PER-EM SPACE */
1406     case 0x2007: /* FIGURE SPACE */
1407     case 0x2008: /* PUNCTUATION SPACE */
1408     case 0x2009: /* THIN SPACE */
1409     case 0x200A: /* HAIR SPACE */
1410     case 0x202f: /* NARROW NO-BREAK SPACE */
1411     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1412     case 0x3000: /* IDEOGRAPHIC SPACE */
1413     OK = TRUE;
1414     break;
1415 ph10 182
1416 ph10 178 default:
1417     OK = FALSE;
1418     break;
1419     }
1420 ph10 182
1421 ph10 178 if (OK == (d == OP_HSPACE))
1422 ph10 182 {
1423 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1424     {
1425     active_count--; /* Remove non-match possibility */
1426     next_active_state--;
1427     }
1428     count++;
1429     ADD_NEW_DATA(-state_offset, count, 0);
1430     }
1431     }
1432     break;
1433    
1434     /*-----------------------------------------------------------------*/
1435 ph10 151 #ifdef SUPPORT_UCP
1436 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1437     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1438 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1439 nigel 87 count = 4;
1440 nigel 77 goto QS1;
1441    
1442     case OP_PROP_EXTRA + OP_TYPESTAR:
1443     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1444 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1445 nigel 77 count = 0;
1446    
1447     QS1:
1448    
1449 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1450 nigel 77 if (clen > 0)
1451     {
1452 nigel 87 BOOL OK;
1453 ph10 349 const ucd_record * prop = GET_UCD(c);
1454 nigel 87 switch(code[2])
1455     {
1456     case PT_ANY:
1457     OK = TRUE;
1458     break;
1459    
1460     case PT_LAMP:
1461 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1462 ph10 517 prop->chartype == ucp_Lt;
1463 nigel 87 break;
1464    
1465     case PT_GC:
1466 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1467 nigel 87 break;
1468    
1469     case PT_PC:
1470 ph10 349 OK = prop->chartype == code[3];
1471 nigel 87 break;
1472    
1473     case PT_SC:
1474 ph10 349 OK = prop->script == code[3];
1475 nigel 87 break;
1476 ph10 535
1477 ph10 517 /* These are specials for combination cases. */
1478 ph10 535
1479 ph10 517 case PT_ALNUM:
1480     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1481     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1482 ph10 535 break;
1483    
1484 ph10 517 case PT_SPACE: /* Perl space */
1485     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1486     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1487 ph10 535 break;
1488    
1489 ph10 517 case PT_PXSPACE: /* POSIX space */
1490     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1491     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1492     c == CHAR_FF || c == CHAR_CR;
1493 ph10 535 break;
1494    
1495 ph10 517 case PT_WORD:
1496     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1497     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1498     c == CHAR_UNDERSCORE;
1499 ph10 535 break;
1500 nigel 87
1501     /* Should never occur, but keep compilers from grumbling. */
1502    
1503     default:
1504     OK = codevalue != OP_PROP;
1505     break;
1506     }
1507    
1508 nigel 93 if (OK == (d == OP_PROP))
1509     {
1510     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1511     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1512     {
1513     active_count--; /* Remove non-match possibility */
1514     next_active_state--;
1515     }
1516     ADD_NEW(state_offset + count, 0);
1517     }
1518 nigel 77 }
1519     break;
1520    
1521     /*-----------------------------------------------------------------*/
1522     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1523     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1524 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1525 nigel 77 count = 2;
1526     goto QS2;
1527    
1528     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1529     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1530 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1531 nigel 77 count = 0;
1532    
1533     QS2:
1534    
1535     ADD_ACTIVE(state_offset + 2, 0);
1536 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1537 nigel 77 {
1538     const uschar *nptr = ptr + clen;
1539     int ncount = 0;
1540 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1541     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1542     {
1543     active_count--; /* Remove non-match possibility */
1544     next_active_state--;
1545     }
1546 nigel 77 while (nptr < end_subject)
1547     {
1548     int nd;
1549     int ndlen = 1;
1550     GETCHARLEN(nd, nptr, ndlen);
1551 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1552 nigel 77 ncount++;
1553     nptr += ndlen;
1554     }
1555     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1556     }
1557     break;
1558 ph10 151 #endif
1559 nigel 77
1560     /*-----------------------------------------------------------------*/
1561 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1562     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1563     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1564     count = 2;
1565     goto QS3;
1566    
1567     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1568     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1569     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1570     count = 0;
1571    
1572     QS3:
1573     ADD_ACTIVE(state_offset + 2, 0);
1574     if (clen > 0)
1575     {
1576     int ncount = 0;
1577     switch (c)
1578     {
1579     case 0x000b:
1580     case 0x000c:
1581     case 0x0085:
1582     case 0x2028:
1583     case 0x2029:
1584 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1585     goto ANYNL02;
1586    
1587     case 0x000d:
1588     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1589     /* Fall through */
1590    
1591     ANYNL02:
1592     case 0x000a:
1593 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1594     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1595     {
1596     active_count--; /* Remove non-match possibility */
1597     next_active_state--;
1598     }
1599     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1600     break;
1601 ph10 231
1602 nigel 93 default:
1603     break;
1604     }
1605     }
1606     break;
1607    
1608     /*-----------------------------------------------------------------*/
1609 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1610     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1611     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1612     count = 2;
1613     goto QS4;
1614    
1615     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1616     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1617     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1618     count = 0;
1619    
1620     QS4:
1621     ADD_ACTIVE(state_offset + 2, 0);
1622     if (clen > 0)
1623     {
1624 ph10 182 BOOL OK;
1625 ph10 178 switch (c)
1626     {
1627     case 0x000a:
1628     case 0x000b:
1629     case 0x000c:
1630     case 0x000d:
1631     case 0x0085:
1632     case 0x2028:
1633     case 0x2029:
1634     OK = TRUE;
1635     break;
1636 ph10 182
1637 ph10 178 default:
1638     OK = FALSE;
1639     break;
1640     }
1641     if (OK == (d == OP_VSPACE))
1642 ph10 182 {
1643 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1644     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1645     {
1646     active_count--; /* Remove non-match possibility */
1647     next_active_state--;
1648     }
1649     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1650     }
1651     }
1652     break;
1653    
1654     /*-----------------------------------------------------------------*/
1655     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1656     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1657     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1658     count = 2;
1659     goto QS5;
1660    
1661     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1662     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1663     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1664     count = 0;
1665    
1666     QS5:
1667     ADD_ACTIVE(state_offset + 2, 0);
1668     if (clen > 0)
1669     {
1670 ph10 182 BOOL OK;
1671 ph10 178 switch (c)
1672     {
1673     case 0x09: /* HT */
1674     case 0x20: /* SPACE */
1675     case 0xa0: /* NBSP */
1676     case 0x1680: /* OGHAM SPACE MARK */
1677     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1678     case 0x2000: /* EN QUAD */
1679     case 0x2001: /* EM QUAD */
1680     case 0x2002: /* EN SPACE */
1681     case 0x2003: /* EM SPACE */
1682     case 0x2004: /* THREE-PER-EM SPACE */
1683     case 0x2005: /* FOUR-PER-EM SPACE */
1684     case 0x2006: /* SIX-PER-EM SPACE */
1685     case 0x2007: /* FIGURE SPACE */
1686     case 0x2008: /* PUNCTUATION SPACE */
1687     case 0x2009: /* THIN SPACE */
1688     case 0x200A: /* HAIR SPACE */
1689     case 0x202f: /* NARROW NO-BREAK SPACE */
1690     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1691     case 0x3000: /* IDEOGRAPHIC SPACE */
1692     OK = TRUE;
1693     break;
1694 ph10 182
1695 ph10 178 default:
1696     OK = FALSE;
1697     break;
1698     }
1699 ph10 182
1700 ph10 178 if (OK == (d == OP_HSPACE))
1701 ph10 182 {
1702 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1703     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1704     {
1705     active_count--; /* Remove non-match possibility */
1706     next_active_state--;
1707     }
1708     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1709     }
1710     }
1711     break;
1712    
1713     /*-----------------------------------------------------------------*/
1714 ph10 151 #ifdef SUPPORT_UCP
1715 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1716     case OP_PROP_EXTRA + OP_TYPEUPTO:
1717     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1718 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1719 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1720 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1721 nigel 77 count = current_state->count; /* Number already matched */
1722     if (clen > 0)
1723     {
1724 nigel 87 BOOL OK;
1725 ph10 349 const ucd_record * prop = GET_UCD(c);
1726 nigel 87 switch(code[4])
1727 nigel 77 {
1728 nigel 87 case PT_ANY:
1729     OK = TRUE;
1730     break;
1731    
1732     case PT_LAMP:
1733 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1734 ph10 517 prop->chartype == ucp_Lt;
1735 nigel 87 break;
1736    
1737     case PT_GC:
1738 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1739 nigel 87 break;
1740    
1741     case PT_PC:
1742 ph10 349 OK = prop->chartype == code[5];
1743 nigel 87 break;
1744    
1745     case PT_SC:
1746 ph10 349 OK = prop->script == code[5];
1747 nigel 87 break;
1748 ph10 535
1749 ph10 517 /* These are specials for combination cases. */
1750 ph10 535
1751 ph10 517 case PT_ALNUM:
1752     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1753     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1754 ph10 535 break;
1755    
1756 ph10 517 case PT_SPACE: /* Perl space */
1757     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1758     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1759 ph10 535 break;
1760    
1761 ph10 517 case PT_PXSPACE: /* POSIX space */
1762     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1763     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1764     c == CHAR_FF || c == CHAR_CR;
1765 ph10 535 break;
1766    
1767 ph10 517 case PT_WORD:
1768     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1769     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1770     c == CHAR_UNDERSCORE;
1771 ph10 535 break;
1772 nigel 87
1773     /* Should never occur, but keep compilers from grumbling. */
1774    
1775     default:
1776     OK = codevalue != OP_PROP;
1777     break;
1778     }
1779    
1780     if (OK == (d == OP_PROP))
1781     {
1782 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1783     {
1784     active_count--; /* Remove non-match possibility */
1785     next_active_state--;
1786     }
1787 nigel 77 if (++count >= GET2(code, 1))
1788 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1789 nigel 77 else
1790     { ADD_NEW(state_offset, count); }
1791     }
1792     }
1793     break;
1794    
1795     /*-----------------------------------------------------------------*/
1796     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1797     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1798     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1799 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1800 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1801     { ADD_ACTIVE(state_offset + 4, 0); }
1802     count = current_state->count; /* Number already matched */
1803 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1804 nigel 77 {
1805     const uschar *nptr = ptr + clen;
1806     int ncount = 0;
1807 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1808     {
1809     active_count--; /* Remove non-match possibility */
1810     next_active_state--;
1811     }
1812 nigel 77 while (nptr < end_subject)
1813     {
1814     int nd;
1815     int ndlen = 1;
1816     GETCHARLEN(nd, nptr, ndlen);
1817 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1818 nigel 77 ncount++;
1819     nptr += ndlen;
1820     }
1821     if (++count >= GET2(code, 1))
1822     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1823     else
1824     { ADD_NEW_DATA(-state_offset, count, ncount); }
1825     }
1826     break;
1827 ph10 151 #endif
1828 nigel 77
1829 nigel 93 /*-----------------------------------------------------------------*/
1830     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1831     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1832     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1833     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1834     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1835     { ADD_ACTIVE(state_offset + 4, 0); }
1836     count = current_state->count; /* Number already matched */
1837     if (clen > 0)
1838     {
1839     int ncount = 0;
1840     switch (c)
1841     {
1842     case 0x000b:
1843     case 0x000c:
1844     case 0x0085:
1845     case 0x2028:
1846     case 0x2029:
1847 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1848     goto ANYNL03;
1849    
1850     case 0x000d:
1851     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1852     /* Fall through */
1853    
1854     ANYNL03:
1855     case 0x000a:
1856 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1857     {
1858     active_count--; /* Remove non-match possibility */
1859     next_active_state--;
1860     }
1861     if (++count >= GET2(code, 1))
1862     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1863     else
1864     { ADD_NEW_DATA(-state_offset, count, ncount); }
1865     break;
1866 ph10 231
1867 nigel 93 default:
1868     break;
1869     }
1870     }
1871     break;
1872    
1873 ph10 178 /*-----------------------------------------------------------------*/
1874     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1875     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1876     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1877     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1878     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1879     { ADD_ACTIVE(state_offset + 4, 0); }
1880     count = current_state->count; /* Number already matched */
1881     if (clen > 0)
1882     {
1883 ph10 182 BOOL OK;
1884 ph10 178 switch (c)
1885     {
1886     case 0x000a:
1887     case 0x000b:
1888     case 0x000c:
1889     case 0x000d:
1890     case 0x0085:
1891     case 0x2028:
1892     case 0x2029:
1893     OK = TRUE;
1894     break;
1895 ph10 182
1896 ph10 178 default:
1897     OK = FALSE;
1898     }
1899 ph10 182
1900 ph10 178 if (OK == (d == OP_VSPACE))
1901 ph10 182 {
1902 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1903     {
1904     active_count--; /* Remove non-match possibility */
1905     next_active_state--;
1906     }
1907     if (++count >= GET2(code, 1))
1908     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1909     else
1910     { ADD_NEW_DATA(-state_offset, count, 0); }
1911     }
1912     }
1913     break;
1914    
1915     /*-----------------------------------------------------------------*/
1916     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1917     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1918     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1919     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1920     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1921     { ADD_ACTIVE(state_offset + 4, 0); }
1922     count = current_state->count; /* Number already matched */
1923     if (clen > 0)
1924     {
1925 ph10 182 BOOL OK;
1926 ph10 178 switch (c)
1927     {
1928     case 0x09: /* HT */
1929     case 0x20: /* SPACE */
1930     case 0xa0: /* NBSP */
1931     case 0x1680: /* OGHAM SPACE MARK */
1932     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1933     case 0x2000: /* EN QUAD */
1934     case 0x2001: /* EM QUAD */
1935     case 0x2002: /* EN SPACE */
1936     case 0x2003: /* EM SPACE */
1937     case 0x2004: /* THREE-PER-EM SPACE */
1938     case 0x2005: /* FOUR-PER-EM SPACE */
1939     case 0x2006: /* SIX-PER-EM SPACE */
1940     case 0x2007: /* FIGURE SPACE */
1941     case 0x2008: /* PUNCTUATION SPACE */
1942     case 0x2009: /* THIN SPACE */
1943     case 0x200A: /* HAIR SPACE */
1944     case 0x202f: /* NARROW NO-BREAK SPACE */
1945     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1946     case 0x3000: /* IDEOGRAPHIC SPACE */
1947     OK = TRUE;
1948     break;
1949 ph10 182
1950 ph10 178 default:
1951     OK = FALSE;
1952     break;
1953     }
1954 ph10 182
1955 ph10 178 if (OK == (d == OP_HSPACE))
1956 ph10 182 {
1957 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1958     {
1959     active_count--; /* Remove non-match possibility */
1960     next_active_state--;
1961     }
1962     if (++count >= GET2(code, 1))
1963     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1964     else
1965     { ADD_NEW_DATA(-state_offset, count, 0); }
1966     }
1967     }
1968     break;
1969    
1970 nigel 77 /* ========================================================================== */
1971     /* These opcodes are followed by a character that is usually compared
1972     to the current subject character; it is loaded into d. We still get
1973     here even if there is no subject character, because in some cases zero
1974     repetitions are permitted. */
1975    
1976     /*-----------------------------------------------------------------*/
1977     case OP_CHAR:
1978     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1979     break;
1980    
1981     /*-----------------------------------------------------------------*/
1982 ph10 602 case OP_CHARI:
1983 nigel 77 if (clen == 0) break;
1984    
1985     #ifdef SUPPORT_UTF8
1986     if (utf8)
1987     {
1988     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1989     {
1990 nigel 93 unsigned int othercase;
1991 nigel 77 if (c < 128) othercase = fcc[c]; else
1992    
1993     /* If we have Unicode property support, we can use it to test the
1994 nigel 87 other case of the character. */
1995 nigel 77
1996     #ifdef SUPPORT_UCP
1997 ph10 349 othercase = UCD_OTHERCASE(c);
1998 nigel 87 #else
1999 nigel 93 othercase = NOTACHAR;
2000 nigel 77 #endif
2001    
2002     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2003     }
2004     }
2005     else
2006     #endif /* SUPPORT_UTF8 */
2007    
2008     /* Non-UTF-8 mode */
2009     {
2010     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2011     }
2012     break;
2013    
2014    
2015     #ifdef SUPPORT_UCP
2016     /*-----------------------------------------------------------------*/
2017     /* This is a tricky one because it can match more than one character.
2018     Find out how many characters to skip, and then set up a negative state
2019     to wait for them to pass before continuing. */
2020    
2021     case OP_EXTUNI:
2022 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2023 nigel 77 {
2024     const uschar *nptr = ptr + clen;
2025     int ncount = 0;
2026     while (nptr < end_subject)
2027     {
2028     int nclen = 1;
2029     GETCHARLEN(c, nptr, nclen);
2030 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
2031 nigel 77 ncount++;
2032     nptr += nclen;
2033     }
2034     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2035     }
2036     break;
2037     #endif
2038    
2039     /*-----------------------------------------------------------------*/
2040 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
2041     character (when CR is followed by LF). In this case, set up a negative
2042     state to wait for one character to pass before continuing. */
2043    
2044     case OP_ANYNL:
2045     if (clen > 0) switch(c)
2046     {
2047     case 0x000b:
2048     case 0x000c:
2049     case 0x0085:
2050     case 0x2028:
2051     case 0x2029:
2052 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2053    
2054     case 0x000a:
2055 nigel 93 ADD_NEW(state_offset + 1, 0);
2056     break;
2057 ph10 231
2058 nigel 93 case 0x000d:
2059     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2060     {
2061     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2062     }
2063     else
2064     {
2065     ADD_NEW(state_offset + 1, 0);
2066     }
2067     break;
2068     }
2069     break;
2070    
2071     /*-----------------------------------------------------------------*/
2072 ph10 178 case OP_NOT_VSPACE:
2073     if (clen > 0) switch(c)
2074     {
2075     case 0x000a:
2076     case 0x000b:
2077     case 0x000c:
2078     case 0x000d:
2079     case 0x0085:
2080     case 0x2028:
2081     case 0x2029:
2082     break;
2083 ph10 182
2084     default:
2085 ph10 178 ADD_NEW(state_offset + 1, 0);
2086     break;
2087     }
2088     break;
2089    
2090     /*-----------------------------------------------------------------*/
2091     case OP_VSPACE:
2092     if (clen > 0) switch(c)
2093     {
2094     case 0x000a:
2095     case 0x000b:
2096     case 0x000c:
2097     case 0x000d:
2098     case 0x0085:
2099     case 0x2028:
2100     case 0x2029:
2101     ADD_NEW(state_offset + 1, 0);
2102     break;
2103 ph10 182
2104 ph10 178 default: break;
2105     }
2106     break;
2107    
2108     /*-----------------------------------------------------------------*/
2109     case OP_NOT_HSPACE:
2110     if (clen > 0) switch(c)
2111     {
2112     case 0x09: /* HT */
2113     case 0x20: /* SPACE */
2114     case 0xa0: /* NBSP */
2115     case 0x1680: /* OGHAM SPACE MARK */
2116     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2117     case 0x2000: /* EN QUAD */
2118     case 0x2001: /* EM QUAD */
2119     case 0x2002: /* EN SPACE */
2120     case 0x2003: /* EM SPACE */
2121     case 0x2004: /* THREE-PER-EM SPACE */
2122     case 0x2005: /* FOUR-PER-EM SPACE */
2123     case 0x2006: /* SIX-PER-EM SPACE */
2124     case 0x2007: /* FIGURE SPACE */
2125     case 0x2008: /* PUNCTUATION SPACE */
2126     case 0x2009: /* THIN SPACE */
2127     case 0x200A: /* HAIR SPACE */
2128     case 0x202f: /* NARROW NO-BREAK SPACE */
2129     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2130     case 0x3000: /* IDEOGRAPHIC SPACE */
2131     break;
2132 ph10 182
2133     default:
2134 ph10 178 ADD_NEW(state_offset + 1, 0);
2135     break;
2136     }
2137     break;
2138    
2139     /*-----------------------------------------------------------------*/
2140     case OP_HSPACE:
2141     if (clen > 0) switch(c)
2142     {
2143     case 0x09: /* HT */
2144     case 0x20: /* SPACE */
2145     case 0xa0: /* NBSP */
2146     case 0x1680: /* OGHAM SPACE MARK */
2147     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2148     case 0x2000: /* EN QUAD */
2149     case 0x2001: /* EM QUAD */
2150     case 0x2002: /* EN SPACE */
2151     case 0x2003: /* EM SPACE */
2152     case 0x2004: /* THREE-PER-EM SPACE */
2153     case 0x2005: /* FOUR-PER-EM SPACE */
2154     case 0x2006: /* SIX-PER-EM SPACE */
2155     case 0x2007: /* FIGURE SPACE */
2156     case 0x2008: /* PUNCTUATION SPACE */
2157     case 0x2009: /* THIN SPACE */
2158     case 0x200A: /* HAIR SPACE */
2159     case 0x202f: /* NARROW NO-BREAK SPACE */
2160     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2161     case 0x3000: /* IDEOGRAPHIC SPACE */
2162     ADD_NEW(state_offset + 1, 0);
2163     break;
2164     }
2165     break;
2166    
2167     /*-----------------------------------------------------------------*/
2168 ph10 602 /* Match a negated single character casefully. This is only used for
2169     one-byte characters, that is, we know that d < 256. The character we are
2170 nigel 77 checking (c) can be multibyte. */
2171    
2172     case OP_NOT:
2173 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2174 nigel 77 break;
2175    
2176     /*-----------------------------------------------------------------*/
2177 ph10 602 /* Match a negated single character caselessly. This is only used for
2178     one-byte characters, that is, we know that d < 256. The character we are
2179     checking (c) can be multibyte. */
2180    
2181     case OP_NOTI:
2182 ph10 654 if (clen > 0 && c != d && c != fcc[d])
2183 ph10 602 { ADD_NEW(state_offset + dlen + 1, 0); }
2184     break;
2185    
2186     /*-----------------------------------------------------------------*/
2187     case OP_PLUSI:
2188     case OP_MINPLUSI:
2189     case OP_POSPLUSI:
2190     case OP_NOTPLUSI:
2191     case OP_NOTMINPLUSI:
2192     case OP_NOTPOSPLUSI:
2193     caseless = TRUE;
2194     codevalue -= OP_STARI - OP_STAR;
2195 ph10 654
2196 ph10 602 /* Fall through */
2197 nigel 77 case OP_PLUS:
2198     case OP_MINPLUS:
2199 nigel 93 case OP_POSPLUS:
2200 nigel 77 case OP_NOTPLUS:
2201     case OP_NOTMINPLUS:
2202 nigel 93 case OP_NOTPOSPLUS:
2203 nigel 77 count = current_state->count; /* Already matched */
2204     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2205     if (clen > 0)
2206     {
2207 nigel 93 unsigned int otherd = NOTACHAR;
2208 ph10 602 if (caseless)
2209 nigel 77 {
2210     #ifdef SUPPORT_UTF8
2211 nigel 87 if (utf8 && d >= 128)
2212 nigel 77 {
2213     #ifdef SUPPORT_UCP
2214 ph10 349 otherd = UCD_OTHERCASE(d);
2215 nigel 77 #endif /* SUPPORT_UCP */
2216     }
2217     else
2218     #endif /* SUPPORT_UTF8 */
2219     otherd = fcc[d];
2220     }
2221     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2222 nigel 93 {
2223     if (count > 0 &&
2224     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2225     {
2226     active_count--; /* Remove non-match possibility */
2227     next_active_state--;
2228     }
2229     count++;
2230     ADD_NEW(state_offset, count);
2231     }
2232 nigel 77 }
2233     break;
2234    
2235     /*-----------------------------------------------------------------*/
2236 ph10 602 case OP_QUERYI:
2237     case OP_MINQUERYI:
2238     case OP_POSQUERYI:
2239     case OP_NOTQUERYI:
2240     case OP_NOTMINQUERYI:
2241     case OP_NOTPOSQUERYI:
2242     caseless = TRUE;
2243     codevalue -= OP_STARI - OP_STAR;
2244     /* Fall through */
2245 nigel 77 case OP_QUERY:
2246     case OP_MINQUERY:
2247 nigel 93 case OP_POSQUERY:
2248 nigel 77 case OP_NOTQUERY:
2249     case OP_NOTMINQUERY:
2250 nigel 93 case OP_NOTPOSQUERY:
2251 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2252     if (clen > 0)
2253     {
2254 nigel 93 unsigned int otherd = NOTACHAR;
2255 ph10 602 if (caseless)
2256 nigel 77 {
2257     #ifdef SUPPORT_UTF8
2258 nigel 87 if (utf8 && d >= 128)
2259 nigel 77 {
2260     #ifdef SUPPORT_UCP
2261 ph10 349 otherd = UCD_OTHERCASE(d);
2262 nigel 77 #endif /* SUPPORT_UCP */
2263     }
2264     else
2265     #endif /* SUPPORT_UTF8 */
2266     otherd = fcc[d];
2267     }
2268     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2269 nigel 93 {
2270     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2271     {
2272     active_count--; /* Remove non-match possibility */
2273     next_active_state--;
2274     }
2275     ADD_NEW(state_offset + dlen + 1, 0);
2276     }
2277 nigel 77 }
2278     break;
2279    
2280     /*-----------------------------------------------------------------*/
2281 ph10 602 case OP_STARI:
2282     case OP_MINSTARI:
2283     case OP_POSSTARI:
2284     case OP_NOTSTARI:
2285     case OP_NOTMINSTARI:
2286     case OP_NOTPOSSTARI:
2287     caseless = TRUE;
2288     codevalue -= OP_STARI - OP_STAR;
2289     /* Fall through */
2290 nigel 77 case OP_STAR:
2291     case OP_MINSTAR:
2292 nigel 93 case OP_POSSTAR:
2293 nigel 77 case OP_NOTSTAR:
2294     case OP_NOTMINSTAR:
2295 nigel 93 case OP_NOTPOSSTAR:
2296 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2297     if (clen > 0)
2298     {
2299 nigel 93 unsigned int otherd = NOTACHAR;
2300 ph10 602 if (caseless)
2301 nigel 77 {
2302     #ifdef SUPPORT_UTF8
2303 nigel 87 if (utf8 && d >= 128)
2304 nigel 77 {
2305     #ifdef SUPPORT_UCP
2306 ph10 349 otherd = UCD_OTHERCASE(d);
2307 nigel 77 #endif /* SUPPORT_UCP */
2308     }
2309     else
2310     #endif /* SUPPORT_UTF8 */
2311     otherd = fcc[d];
2312     }
2313     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2314 nigel 93 {
2315     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2316     {
2317     active_count--; /* Remove non-match possibility */
2318     next_active_state--;
2319     }
2320     ADD_NEW(state_offset, 0);
2321     }
2322 nigel 77 }
2323     break;
2324    
2325     /*-----------------------------------------------------------------*/
2326 ph10 602 case OP_EXACTI:
2327     case OP_NOTEXACTI:
2328     caseless = TRUE;
2329     codevalue -= OP_STARI - OP_STAR;
2330     /* Fall through */
2331 nigel 77 case OP_EXACT:
2332 nigel 93 case OP_NOTEXACT:
2333     count = current_state->count; /* Number already matched */
2334     if (clen > 0)
2335     {
2336     unsigned int otherd = NOTACHAR;
2337 ph10 602 if (caseless)
2338 nigel 93 {
2339     #ifdef SUPPORT_UTF8
2340     if (utf8 && d >= 128)
2341     {
2342     #ifdef SUPPORT_UCP
2343 ph10 349 otherd = UCD_OTHERCASE(d);
2344 nigel 93 #endif /* SUPPORT_UCP */
2345     }
2346     else
2347     #endif /* SUPPORT_UTF8 */
2348     otherd = fcc[d];
2349     }
2350     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2351     {
2352     if (++count >= GET2(code, 1))
2353     { ADD_NEW(state_offset + dlen + 3, 0); }
2354     else
2355     { ADD_NEW(state_offset, count); }
2356     }
2357     }
2358     break;
2359    
2360     /*-----------------------------------------------------------------*/
2361 ph10 602 case OP_UPTOI:
2362     case OP_MINUPTOI:
2363     case OP_POSUPTOI:
2364     case OP_NOTUPTOI:
2365     case OP_NOTMINUPTOI:
2366     case OP_NOTPOSUPTOI:
2367     caseless = TRUE;
2368     codevalue -= OP_STARI - OP_STAR;
2369     /* Fall through */
2370 nigel 77 case OP_UPTO:
2371     case OP_MINUPTO:
2372 nigel 93 case OP_POSUPTO:
2373 nigel 77 case OP_NOTUPTO:
2374     case OP_NOTMINUPTO:
2375 nigel 93 case OP_NOTPOSUPTO:
2376     ADD_ACTIVE(state_offset + dlen + 3, 0);
2377 nigel 77 count = current_state->count; /* Number already matched */
2378     if (clen > 0)
2379     {
2380 nigel 93 unsigned int otherd = NOTACHAR;
2381 ph10 602 if (caseless)
2382 nigel 77 {
2383     #ifdef SUPPORT_UTF8
2384 nigel 87 if (utf8 && d >= 128)
2385 nigel 77 {
2386     #ifdef SUPPORT_UCP
2387 ph10 349 otherd = UCD_OTHERCASE(d);
2388 nigel 77 #endif /* SUPPORT_UCP */
2389     }
2390     else
2391     #endif /* SUPPORT_UTF8 */
2392     otherd = fcc[d];
2393     }
2394     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2395     {
2396 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2397     {
2398     active_count--; /* Remove non-match possibility */
2399     next_active_state--;
2400     }
2401 nigel 77 if (++count >= GET2(code, 1))
2402     { ADD_NEW(state_offset + dlen + 3, 0); }
2403     else
2404     { ADD_NEW(state_offset, count); }
2405     }
2406     }
2407     break;
2408    
2409    
2410     /* ========================================================================== */
2411     /* These are the class-handling opcodes */
2412    
2413     case OP_CLASS:
2414     case OP_NCLASS:
2415     case OP_XCLASS:
2416     {
2417     BOOL isinclass = FALSE;
2418     int next_state_offset;
2419     const uschar *ecode;
2420    
2421     /* For a simple class, there is always just a 32-byte table, and we
2422     can set isinclass from it. */
2423    
2424     if (codevalue != OP_XCLASS)
2425     {
2426     ecode = code + 33;
2427     if (clen > 0)
2428     {
2429     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2430     ((code[1 + c/8] & (1 << (c&7))) != 0);
2431     }
2432     }
2433    
2434     /* An extended class may have a table or a list of single characters,
2435     ranges, or both, and it may be positive or negative. There's a
2436     function that sorts all this out. */
2437    
2438     else
2439     {
2440     ecode = code + GET(code, 1);
2441     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2442     }
2443    
2444     /* At this point, isinclass is set for all kinds of class, and ecode
2445     points to the byte after the end of the class. If there is a
2446     quantifier, this is where it will be. */
2447    
2448 ph10 530 next_state_offset = (int)(ecode - start_code);
2449 nigel 77
2450     switch (*ecode)
2451     {
2452     case OP_CRSTAR:
2453     case OP_CRMINSTAR:
2454     ADD_ACTIVE(next_state_offset + 1, 0);
2455     if (isinclass) { ADD_NEW(state_offset, 0); }
2456     break;
2457    
2458     case OP_CRPLUS:
2459     case OP_CRMINPLUS:
2460     count = current_state->count; /* Already matched */
2461     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2462     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2463     break;
2464    
2465     case OP_CRQUERY:
2466     case OP_CRMINQUERY:
2467     ADD_ACTIVE(next_state_offset + 1, 0);
2468     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2469     break;
2470    
2471     case OP_CRRANGE:
2472     case OP_CRMINRANGE:
2473     count = current_state->count; /* Already matched */
2474     if (count >= GET2(ecode, 1))
2475     { ADD_ACTIVE(next_state_offset + 5, 0); }
2476     if (isinclass)
2477     {
2478 nigel 91 int max = GET2(ecode, 3);
2479     if (++count >= max && max != 0) /* Max 0 => no limit */
2480 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2481     else
2482     { ADD_NEW(state_offset, count); }
2483     }
2484     break;
2485    
2486     default:
2487     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2488     break;
2489     }
2490     }
2491     break;
2492    
2493     /* ========================================================================== */
2494     /* These are the opcodes for fancy brackets of various kinds. We have
2495 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2496     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2497 ph10 341 though the other "backtracking verbs" are not supported. */
2498 ph10 345
2499 ph10 341 case OP_FAIL:
2500 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2501 ph10 345 break;
2502 nigel 77
2503     case OP_ASSERT:
2504     case OP_ASSERT_NOT:
2505     case OP_ASSERTBACK:
2506     case OP_ASSERTBACK_NOT:
2507     {
2508     int rc;
2509     int local_offsets[2];
2510     int local_workspace[1000];
2511     const uschar *endasscode = code + GET(code, 1);
2512    
2513     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2514    
2515     rc = internal_dfa_exec(
2516     md, /* static match data */
2517     code, /* this subexpression's code */
2518     ptr, /* where we currently are */
2519 ph10 530 (int)(ptr - start_subject), /* start offset */
2520 nigel 77 local_offsets, /* offset vector */
2521     sizeof(local_offsets)/sizeof(int), /* size of same */
2522     local_workspace, /* workspace vector */
2523     sizeof(local_workspace)/sizeof(int), /* size of same */
2524 ph10 642 rlevel); /* function recursion level */
2525 ph10 487
2526 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2527 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2528 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2529 nigel 77 }
2530     break;
2531    
2532     /*-----------------------------------------------------------------*/
2533     case OP_COND:
2534 nigel 93 case OP_SCOND:
2535 nigel 77 {
2536     int local_offsets[1000];
2537     int local_workspace[1000];
2538 ph10 406 int codelink = GET(code, 1);
2539 ph10 397 int condcode;
2540 ph10 406
2541 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2542 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2543 ph10 398 happen for the other conditions. */
2544 nigel 77
2545 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2546 ph10 406 {
2547     rrc = 0;
2548 ph10 397 if (pcre_callout != NULL)
2549     {
2550     pcre_callout_block cb;
2551     cb.version = 1; /* Version 1 of the callout block */
2552     cb.callout_number = code[LINK_SIZE+2];
2553     cb.offset_vector = offsets;
2554     cb.subject = (PCRE_SPTR)start_subject;
2555 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2556     cb.start_match = (int)(current_subject - start_subject);
2557     cb.current_position = (int)(ptr - start_subject);
2558 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2559     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2560     cb.capture_top = 1;
2561     cb.capture_last = -1;
2562     cb.callout_data = md->callout_data;
2563 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2564 ph10 397 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2565     }
2566 ph10 398 if (rrc > 0) break; /* Fail this thread */
2567     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2568 ph10 406 }
2569 ph10 398
2570 ph10 397 condcode = code[LINK_SIZE+1];
2571 ph10 406
2572 nigel 93 /* Back reference conditions are not supported */
2573 nigel 77
2574 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2575 ph10 459 return PCRE_ERROR_DFA_UCOND;
2576 nigel 93
2577     /* The DEFINE condition is always false */
2578    
2579     if (condcode == OP_DEF)
2580 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2581 nigel 93
2582     /* The only supported version of OP_RREF is for the value RREF_ANY,
2583     which means "test if in any recursion". We can't test for specifically
2584     recursed groups. */
2585    
2586 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2587 nigel 93 {
2588 nigel 77 int value = GET2(code, LINK_SIZE+2);
2589 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2590 ph10 654 if (md->recursive != NULL)
2591 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2592     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2593 nigel 77 }
2594    
2595     /* Otherwise, the condition is an assertion */
2596    
2597     else
2598     {
2599     int rc;
2600     const uschar *asscode = code + LINK_SIZE + 1;
2601     const uschar *endasscode = asscode + GET(asscode, 1);
2602    
2603     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2604    
2605     rc = internal_dfa_exec(
2606     md, /* fixed match data */
2607     asscode, /* this subexpression's code */
2608     ptr, /* where we currently are */
2609 ph10 530 (int)(ptr - start_subject), /* start offset */
2610 nigel 77 local_offsets, /* offset vector */
2611     sizeof(local_offsets)/sizeof(int), /* size of same */
2612     local_workspace, /* workspace vector */
2613     sizeof(local_workspace)/sizeof(int), /* size of same */
2614 ph10 642 rlevel); /* function recursion level */
2615 nigel 77
2616 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2617 nigel 77 if ((rc >= 0) ==
2618     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2619 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2620 nigel 77 else
2621 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2622 nigel 77 }
2623     }
2624     break;
2625    
2626     /*-----------------------------------------------------------------*/
2627     case OP_RECURSE:
2628     {
2629 ph10 654 dfa_recursion_info *ri;
2630 nigel 77 int local_offsets[1000];
2631     int local_workspace[1000];
2632 ph10 642 const uschar *callpat = start_code + GET(code, 1);
2633 ph10 654 int recno = (callpat == md->start_code)? 0 :
2634     GET2(callpat, 1 + LINK_SIZE);
2635 nigel 77 int rc;
2636    
2637 ph10 642 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2638 ph10 654
2639 ph10 642 /* Check for repeating a recursion without advancing the subject
2640     pointer. This should catch convoluted mutual recursions. (Some simple
2641     cases are caught at compile time.) */
2642 nigel 77
2643 ph10 654 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2644     if (recno == ri->group_num && ptr == ri->subject_position)
2645     return PCRE_ERROR_RECURSELOOP;
2646    
2647     /* Remember this recursion and where we started it so as to
2648 ph10 642 catch infinite loops. */
2649 ph10 654
2650 ph10 642 new_recursive.group_num = recno;
2651     new_recursive.subject_position = ptr;
2652     new_recursive.prevrec = md->recursive;
2653 ph10 654 md->recursive = &new_recursive;
2654 ph10 642
2655 nigel 77 rc = internal_dfa_exec(
2656     md, /* fixed match data */
2657 ph10 642 callpat, /* this subexpression's code */
2658 nigel 77 ptr, /* where we currently are */
2659 ph10 530 (int)(ptr - start_subject), /* start offset */
2660 nigel 77 local_offsets, /* offset vector */
2661     sizeof(local_offsets)/sizeof(int), /* size of same */
2662     local_workspace, /* workspace vector */
2663     sizeof(local_workspace)/sizeof(int), /* size of same */
2664 ph10 642 rlevel); /* function recursion level */
2665 nigel 77
2666 ph10 642 md->recursive = new_recursive.prevrec; /* Done this recursion */
2667 nigel 77
2668 ph10 654 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2669 ph10 642 rc));
2670    
2671 nigel 77 /* Ran out of internal offsets */
2672    
2673     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2674    
2675     /* For each successful matched substring, set up the next state with a
2676     count of characters to skip before trying it. Note that the count is in
2677     characters, not bytes. */
2678    
2679     if (rc > 0)
2680     {
2681     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2682     {
2683     const uschar *p = start_subject + local_offsets[rc];
2684     const uschar *pp = start_subject + local_offsets[rc+1];
2685     int charcount = local_offsets[rc+1] - local_offsets[rc];
2686     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2687     if (charcount > 0)
2688     {
2689     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2690     }
2691     else
2692     {
2693     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2694     }
2695     }
2696     }
2697     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2698     }
2699     break;
2700    
2701     /*-----------------------------------------------------------------*/
2702 ph10 604 case OP_BRAPOS:
2703     case OP_SBRAPOS:
2704     case OP_CBRAPOS:
2705     case OP_SCBRAPOS:
2706 ph10 654 case OP_BRAPOSZERO:
2707 ph10 604 {
2708     int charcount, matched_count;
2709     const uschar *local_ptr = ptr;
2710     BOOL allow_zero;
2711 ph10 654
2712 ph10 604 if (codevalue == OP_BRAPOSZERO)
2713     {
2714     allow_zero = TRUE;
2715     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2716     }
2717 ph10 654 else allow_zero = FALSE;
2718    
2719     /* Loop to match the subpattern as many times as possible as if it were
2720     a complete pattern. */
2721    
2722 ph10 604 for (matched_count = 0;; matched_count++)
2723     {
2724     int local_offsets[2];
2725     int local_workspace[1000];
2726 ph10 654
2727 ph10 604 int rc = internal_dfa_exec(
2728     md, /* fixed match data */
2729     code, /* this subexpression's code */
2730     local_ptr, /* where we currently are */
2731     (int)(ptr - start_subject), /* start offset */
2732     local_offsets, /* offset vector */
2733     sizeof(local_offsets)/sizeof(int), /* size of same */
2734     local_workspace, /* workspace vector */
2735     sizeof(local_workspace)/sizeof(int), /* size of same */
2736 ph10 642 rlevel); /* function recursion level */
2737 ph10 654
2738 ph10 604 /* Failed to match */
2739 ph10 654
2740     if (rc < 0)
2741 ph10 604 {
2742     if (rc != PCRE_ERROR_NOMATCH) return rc;
2743     break;
2744 ph10 654 }
2745    
2746 ph10 604 /* Matched: break the loop if zero characters matched. */
2747 ph10 654
2748 ph10 604 charcount = local_offsets[1] - local_offsets[0];
2749 ph10 654 if (charcount == 0) break;
2750 ph10 604 local_ptr += charcount; /* Advance temporary position ptr */
2751 ph10 654 }
2752 ph10 604
2753     /* At this point we have matched the subpattern matched_count
2754 ph10 654 times, and local_ptr is pointing to the character after the end of the
2755     last match. */
2756 ph10 604
2757     if (matched_count > 0 || allow_zero)
2758 ph10 654 {
2759 ph10 604 const uschar *end_subpattern = code;
2760     int next_state_offset;
2761 ph10 654
2762 ph10 604 do { end_subpattern += GET(end_subpattern, 1); }
2763     while (*end_subpattern == OP_ALT);
2764     next_state_offset =
2765     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2766    
2767     /* Optimization: if there are no more active states, and there
2768     are no new states yet set up, then skip over the subject string
2769     right here, to save looping. Otherwise, set up the new state to swing
2770     into action when the end of the matched substring is reached. */
2771    
2772     if (i + 1 >= active_count && new_count == 0)
2773     {
2774     ptr = local_ptr;
2775     clen = 0;
2776     ADD_NEW(next_state_offset, 0);
2777     }
2778     else
2779     {
2780     const uschar *p = ptr;
2781     const uschar *pp = local_ptr;
2782 ph10 654 charcount = pp - p;
2783 ph10 604 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2784     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2785     }
2786 ph10 654 }
2787     }
2788 ph10 604 break;
2789 ph10 654
2790 ph10 604 /*-----------------------------------------------------------------*/
2791 nigel 77 case OP_ONCE:
2792     {
2793     int local_offsets[2];
2794     int local_workspace[1000];
2795    
2796     int rc = internal_dfa_exec(
2797     md, /* fixed match data */
2798     code, /* this subexpression's code */
2799     ptr, /* where we currently are */
2800 ph10 530 (int)(ptr - start_subject), /* start offset */
2801 nigel 77 local_offsets, /* offset vector */
2802     sizeof(local_offsets)/sizeof(int), /* size of same */
2803     local_workspace, /* workspace vector */
2804     sizeof(local_workspace)/sizeof(int), /* size of same */
2805 ph10 642 rlevel); /* function recursion level */
2806 nigel 77
2807     if (rc >= 0)
2808     {
2809     const uschar *end_subpattern = code;
2810     int charcount = local_offsets[1] - local_offsets[0];
2811     int next_state_offset, repeat_state_offset;
2812    
2813     do { end_subpattern += GET(end_subpattern, 1); }
2814     while (*end_subpattern == OP_ALT);
2815 ph10 535 next_state_offset =
2816 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2817 nigel 77
2818     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2819     arrange for the repeat state also to be added to the relevant list.
2820     Calculate the offset, or set -1 for no repeat. */
2821    
2822     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2823     *end_subpattern == OP_KETRMIN)?
2824 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2825 nigel 77
2826     /* If we have matched an empty string, add the next state at the
2827     current character pointer. This is important so that the duplicate
2828     checking kicks in, which is what breaks infinite loops that match an
2829     empty string. */
2830    
2831     if (charcount == 0)
2832     {
2833     ADD_ACTIVE(next_state_offset, 0);
2834     }
2835    
2836     /* Optimization: if there are no more active states, and there
2837     are no new states yet set up, then skip over the subject string
2838     right here, to save looping. Otherwise, set up the new state to swing
2839 ph10 604 into action when the end of the matched substring is reached. */
2840 nigel 77
2841     else if (i + 1 >= active_count && new_count == 0)
2842     {
2843     ptr += charcount;
2844     clen = 0;
2845     ADD_NEW(next_state_offset, 0);
2846    
2847     /* If we are adding a repeat state at the new character position,
2848     we must fudge things so that it is the only current state.
2849     Otherwise, it might be a duplicate of one we processed before, and
2850     that would cause it to be skipped. */
2851    
2852     if (repeat_state_offset >= 0)
2853     {
2854     next_active_state = active_states;
2855     active_count = 0;
2856     i = -1;
2857     ADD_ACTIVE(repeat_state_offset, 0);
2858     }
2859     }
2860     else
2861     {
2862     const uschar *p = start_subject + local_offsets[0];
2863     const uschar *pp = start_subject + local_offsets[1];
2864     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2865     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2866     if (repeat_state_offset >= 0)
2867     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2868     }
2869     }
2870     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2871     }
2872     break;
2873    
2874    
2875     /* ========================================================================== */
2876     /* Handle callouts */
2877    
2878     case OP_CALLOUT:
2879 ph10 406 rrc = 0;
2880 nigel 77 if (pcre_callout != NULL)
2881     {
2882     pcre_callout_block cb;
2883     cb.version = 1; /* Version 1 of the callout block */
2884     cb.callout_number = code[1];
2885     cb.offset_vector = offsets;
2886 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2887 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2888     cb.start_match = (int)(current_subject - start_subject);
2889     cb.current_position = (int)(ptr - start_subject);
2890 nigel 77 cb.pattern_position = GET(code, 2);
2891     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2892     cb.capture_top = 1;
2893     cb.capture_last = -1;
2894     cb.callout_data = md->callout_data;
2895 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2896 nigel 77 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2897 ph10 406 }
2898     if (rrc == 0)
2899     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2900 nigel 77 break;
2901    
2902    
2903     /* ========================================================================== */
2904     default: /* Unsupported opcode */
2905     return PCRE_ERROR_DFA_UITEM;
2906     }
2907    
2908     NEXT_ACTIVE_STATE: continue;
2909    
2910     } /* End of loop scanning active states */
2911    
2912     /* We have finished the processing at the current subject character. If no
2913     new states have been set for the next character, we have found all the
2914     matches that we are going to find. If we are at the top level and partial
2915 ph10 463 matching has been requested, check for appropriate conditions.
2916    
2917 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2918     character. If it is equal to the original active_count (saved in
2919     workspace[1]) it means that (*F) was found on every active state. In this
2920 ph10 463 case we don't want to give a partial match.
2921 nigel 77
2922 ph10 463 The "could_continue" variable is true if a state could have continued but
2923     for the fact that the end of the subject was reached. */
2924    
2925 nigel 77 if (new_count <= 0)
2926     {
2927 ph10 427 if (rlevel == 1 && /* Top level, and */
2928 ph10 463 could_continue && /* Some could go on */
2929 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2930 ph10 427 ( /* either... */
2931     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2932     || /* or... */
2933     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2934     match_count < 0) /* no matches */
2935     ) && /* And... */
2936 ph10 553 ptr >= end_subject && /* Reached end of subject */
2937     ptr > md->start_used_ptr) /* Inspected non-empty string */
2938 nigel 77 {
2939     if (offsetcount >= 2)
2940     {
2941 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
2942     offsets[1] = (int)(end_subject - start_subject);
2943 nigel 77 }
2944     match_count = PCRE_ERROR_PARTIAL;
2945     }
2946    
2947     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2948     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2949     rlevel*2-2, SP));
2950 nigel 91 break; /* In effect, "return", but see the comment below */
2951 nigel 77 }
2952    
2953     /* One or more states are active for the next character. */
2954    
2955     ptr += clen; /* Advance to next subject character */
2956     } /* Loop to move along the subject string */
2957    
2958 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2959     if we use "return" above, we have compiler trouble. Some compilers warn if
2960     there's nothing here because they think the function doesn't return a value. On
2961     the other hand, if we put a dummy statement here, some more clever compilers
2962     complain that it can't be reached. Sigh. */
2963 nigel 77
2964 nigel 91 return match_count;
2965 nigel 77 }
2966    
2967    
2968    
2969    
2970     /*************************************************
2971     * Execute a Regular Expression - DFA engine *
2972     *************************************************/
2973    
2974     /* This external function applies a compiled re to a subject string using a DFA
2975     engine. This function calls the internal function multiple times if the pattern
2976     is not anchored.
2977    
2978     Arguments:
2979     argument_re points to the compiled expression
2980 ph10 97 extra_data points to extra data or is NULL
2981 nigel 77 subject points to the subject string
2982     length length of subject string (may contain binary zeros)
2983     start_offset where to start in the subject string
2984     options option bits
2985     offsets vector of match offsets
2986     offsetcount size of same
2987     workspace workspace vector
2988     wscount size of same
2989    
2990     Returns: > 0 => number of match offset pairs placed in offsets
2991     = 0 => offsets overflowed; longest matches are present
2992     -1 => failed to match
2993     < -1 => some kind of unexpected problem
2994     */
2995    
2996 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2997 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2998     const char *subject, int length, int start_offset, int options, int *offsets,
2999     int offsetcount, int *workspace, int wscount)
3000     {
3001     real_pcre *re = (real_pcre *)argument_re;
3002     dfa_match_data match_block;
3003 nigel 91 dfa_match_data *md = &match_block;
3004 nigel 77 BOOL utf8, anchored, startline, firstline;
3005     const uschar *current_subject, *end_subject, *lcc;
3006    
3007     pcre_study_data internal_study;
3008     const pcre_study_data *study = NULL;
3009     real_pcre internal_re;
3010    
3011     const uschar *req_byte_ptr;
3012     const uschar *start_bits = NULL;
3013     BOOL first_byte_caseless = FALSE;
3014     BOOL req_byte_caseless = FALSE;
3015     int first_byte = -1;
3016     int req_byte = -1;
3017     int req_byte2 = -1;
3018 nigel 91 int newline;
3019 nigel 77
3020     /* Plausibility checks */
3021    
3022     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3023     if (re == NULL || subject == NULL || workspace == NULL ||
3024     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3025     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3026     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3027 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3028 nigel 77
3029     /* We need to find the pointer to any study data before we test for byte
3030     flipping, so we scan the extra_data block first. This may set two fields in the
3031     match block, so we must initialize them beforehand. However, the other fields
3032     in the match block must not be set until after the byte flipping. */
3033    
3034 nigel 91 md->tables = re->tables;
3035     md->callout_data = NULL;
3036 nigel 77
3037     if (extra_data != NULL)
3038     {
3039     unsigned int flags = extra_data->flags;
3040     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3041     study = (const pcre_study_data *)extra_data->study_data;
3042     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3043 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3044     return PCRE_ERROR_DFA_UMLIMIT;
3045 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3046 nigel 91 md->callout_data = extra_data->callout_data;
3047 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3048 nigel 91 md->tables = extra_data->tables;
3049 nigel 77 }
3050 ph10 461
3051 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
3052     test for a regex that was compiled on a host of opposite endianness. If this is
3053     the case, flipped values are put in internal_re and internal_study if there was
3054     study data too. */
3055    
3056     if (re->magic_number != MAGIC_NUMBER)
3057     {
3058     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3059     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3060     if (study != NULL) study = &internal_study;
3061     }
3062    
3063     /* Set some local values */
3064    
3065     current_subject = (const unsigned char *)subject + start_offset;
3066     end_subject = (const unsigned char *)subject + length;
3067     req_byte_ptr = current_subject - 1;
3068    
3069 nigel 91 #ifdef SUPPORT_UTF8
3070 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
3071 nigel 91 #else
3072     utf8 = FALSE;
3073     #endif
3074 nigel 77
3075 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3076     (re->options & PCRE_ANCHORED) != 0;
3077    
3078 nigel 77 /* The remaining fixed data for passing around. */
3079    
3080 nigel 91 md->start_code = (const uschar *)argument_re +
3081 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3082 nigel 91 md->start_subject = (const unsigned char *)subject;
3083     md->end_subject = end_subject;
3084 ph10 442 md->start_offset = start_offset;
3085 nigel 91 md->moptions = options;
3086     md->poptions = re->options;
3087 nigel 77
3088 ph10 231 /* If the BSR option is not set at match time, copy what was set
3089     at compile time. */
3090    
3091     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3092     {
3093     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3094     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3095     #ifdef BSR_ANYCRLF
3096     else md->moptions |= PCRE_BSR_ANYCRLF;
3097 ph10 243 #endif
3098     }
3099 ph10 231
3100 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3101     nothing is set at run time, whatever was used at compile time applies. */
3102 nigel 91
3103 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3104 nigel 93 PCRE_NEWLINE_BITS)
3105 nigel 91 {
3106 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3107 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3108     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3109 nigel 91 case PCRE_NEWLINE_CR+
3110 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3111 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3112 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3113 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3114 nigel 91 }
3115    
3116 ph10 149 if (newline == -2)
3117 nigel 91 {
3118 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3119     }
3120     else if (newline < 0)
3121     {
3122 nigel 93 md->nltype = NLTYPE_ANY;
3123 nigel 91 }
3124     else
3125     {
3126 nigel 93 md->nltype = NLTYPE_FIXED;
3127     if (newline > 255)
3128     {
3129     md->nllen = 2;
3130     md->nl[0] = (newline >> 8) & 255;
3131     md->nl[1] = newline & 255;
3132     }
3133     else
3134     {
3135     md->nllen = 1;
3136     md->nl[0] = newline;
3137     }
3138 nigel 91 }
3139    
3140 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3141     back the character offset. */
3142    
3143     #ifdef SUPPORT_UTF8
3144     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3145     {
3146 ph10 654 int erroroffset;
3147 ph10 606 int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
3148     if (errorcode != 0)
3149 ph10 598 {
3150     if (offsetcount >= 2)
3151     {
3152 ph10 606 offsets[0] = erroroffset;
3153 ph10 598 offsets[1] = errorcode;
3154 ph10 654 }
3155 ph10 598 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3156 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3157 ph10 654 }
3158 ph10 606 if (start_offset > 0 && start_offset < length &&
3159 ph10 654 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
3160 ph10 606 return PCRE_ERROR_BADUTF8_OFFSET;
3161 nigel 77 }
3162     #endif
3163    
3164     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3165     is a feature that makes it possible to save compiled regex and re-use them
3166     in other programs later. */
3167    
3168 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
3169 nigel 77
3170     /* The lower casing table and the "must be at the start of a line" flag are
3171     used in a loop when finding where to start. */
3172    
3173 nigel 91 lcc = md->tables + lcc_offset;
3174 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3175 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3176    
3177     /* Set up the first character to match, if available. The first_byte value is
3178     never set for an anchored regular expression, but the anchoring may be forced
3179     at run time, so we have to test for anchoring. The first char may be unset for
3180     an unanchored pattern, of course. If there's no first char and the pattern was
3181     studied, there may be a bitmap of possible first characters. */
3182    
3183     if (!anchored)
3184     {
3185 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3186 nigel 77 {
3187     first_byte = re->first_byte & 255;
3188     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3189     first_byte = lcc[first_byte];
3190     }
3191     else
3192     {
3193 ph10 455 if (!startline && study != NULL &&
3194     (study->flags & PCRE_STUDY_MAPPED) != 0)
3195 nigel 77 start_bits = study->start_bits;
3196     }
3197     }
3198    
3199     /* For anchored or unanchored matches, there may be a "last known required
3200     character" set. */
3201    
3202 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3203 nigel 77 {
3204     req_byte = re->req_byte & 255;
3205     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3206 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
3207 nigel 77 }
3208    
3209     /* Call the main matching function, looping for a non-anchored regex after a
3210 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3211     a match. */
3212 nigel 77
3213     for (;;)
3214     {
3215     int rc;
3216    
3217     if ((options & PCRE_DFA_RESTART) == 0)
3218     {
3219     const uschar *save_end_subject = end_subject;
3220    
3221 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3222     line of a multiline string. Implement this by temporarily adjusting
3223     end_subject so that we stop scanning at a newline. If the match fails at
3224     the newline, later code breaks this loop. */
3225 nigel 77
3226     if (firstline)
3227     {
3228 ph10 365 USPTR t = current_subject;
3229     #ifdef SUPPORT_UTF8
3230     if (utf8)
3231 ph10 371 {
3232     while (t < md->end_subject && !IS_NEWLINE(t))
3233 ph10 365 {
3234     t++;
3235     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3236 ph10 371 }
3237 ph10 365 }
3238     else
3239 ph10 371 #endif
3240 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3241 nigel 77 end_subject = t;
3242     }
3243 ph10 392
3244 ph10 389 /* There are some optimizations that avoid running the match if a known
3245 ph10 455 starting point is not found. However, there is an option that disables
3246 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3247 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3248     match-time options. */
3249 nigel 77
3250 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3251 ph10 392 {
3252 ph10 389 /* Advance to a known first byte. */
3253 ph10 392
3254 ph10 389 if (first_byte >= 0)
3255 nigel 77 {
3256 ph10 389 if (first_byte_caseless)
3257     while (current_subject < end_subject &&
3258     lcc[*current_subject] != first_byte)
3259     current_subject++;
3260     else
3261 ph10 392 while (current_subject < end_subject &&
3262 ph10 389 *current_subject != first_byte)
3263     current_subject++;
3264     }
3265 ph10 392
3266 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3267 ph10 392
3268 ph10 389 else if (startline)
3269     {
3270     if (current_subject > md->start_subject + start_offset)
3271     {
3272 ph10 365 #ifdef SUPPORT_UTF8
3273 ph10 389 if (utf8)
3274 ph10 365 {
3275 ph10 392 while (current_subject < end_subject &&
3276 ph10 389 !WAS_NEWLINE(current_subject))
3277     {
3278 ph10 365 current_subject++;
3279 ph10 389 while(current_subject < end_subject &&
3280     (*current_subject & 0xc0) == 0x80)
3281     current_subject++;
3282     }
3283 ph10 371 }
3284 ph10 389 else
3285     #endif
3286     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3287     current_subject++;
3288 ph10 392
3289 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3290     ANYCRLF, and we are now at a LF, advance the match position by one
3291     more character. */
3292 ph10 392
3293 ph10 391 if (current_subject[-1] == CHAR_CR &&
3294 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3295     current_subject < end_subject &&
3296 ph10 391 *current_subject == CHAR_NL)
3297 ph10 389 current_subject++;
3298 ph10 365 }
3299 nigel 77 }
3300 ph10 392
3301 ph10 389 /* Or to a non-unique first char after study */
3302 ph10 392
3303 ph10 389 else if (start_bits != NULL)
3304 nigel 77 {
3305 ph10 389 while (current_subject < end_subject)
3306     {
3307     register unsigned int c = *current_subject;
3308 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3309 ph10 538 {
3310     current_subject++;
3311     #ifdef SUPPORT_UTF8
3312     if (utf8)
3313 ph10 545 while(current_subject < end_subject &&
3314 ph10 538 (*current_subject & 0xc0) == 0x80) current_subject++;
3315 ph10 545 #endif
3316 ph10 538 }
3317     else break;
3318 ph10 389 }
3319 nigel 77 }
3320 ph10 392 }
3321 nigel 77
3322     /* Restore fudged end_subject */
3323    
3324     end_subject = save_end_subject;
3325    
3326 ph10 461 /* The following two optimizations are disabled for partial matching or if
3327     disabling is explicitly requested (and of course, by the test above, this
3328 ph10 455 code is not obeyed when restarting after a partial match). */
3329 ph10 461
3330 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3331     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3332 ph10 461 {
3333 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3334     is a lower bound; no actual string of that length may actually match the
3335     pattern. Although the value is, strictly, in characters, we treat it as
3336     bytes to avoid spending too much time in this optimization. */
3337 nigel 77
3338 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3339 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3340 ph10 455 return PCRE_ERROR_NOMATCH;
3341 ph10 461
3342 ph10 455 /* If req_byte is set, we know that that character must appear in the
3343     subject for the match to succeed. If the first character is set, req_byte
3344     must be later in the subject; otherwise the test starts at the match
3345     point. This optimization can save a huge amount of work in patterns with
3346     nested unlimited repeats that aren't going to match. Writing separate
3347     code for cased/caseless versions makes it go faster, as does using an
3348     autoincrement and backing off on a match.
3349 ph10 461
3350 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3351     can take a long time, and give bad performance on quite ordinary
3352     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3353     string... so we don't do this when the string is sufficiently long. */
3354 ph10 461
3355 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3356 nigel 77 {
3357 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3358 ph10 461
3359 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3360     place we found it at last time. */
3361 ph10 461
3362 ph10 455 if (p > req_byte_ptr)
3363 nigel 77 {
3364 ph10 455 if (req_byte_caseless)
3365     {
3366     while (p < end_subject)
3367     {
3368     register int pp = *p++;
3369     if (pp == req_byte || pp == req_byte2) { p--; break; }
3370     }
3371     }
3372     else
3373     {
3374     while (p < end_subject)
3375     {
3376     if (*p++ == req_byte) { p--; break; }
3377     }
3378     }
3379 ph10 461
3380 ph10 455 /* If we can't find the required character, break the matching loop,
3381     which will cause a return or PCRE_ERROR_NOMATCH. */
3382 ph10 461
3383 ph10 455 if (p >= end_subject) break;
3384 ph10 461
3385 ph10 455 /* If we have found the required character, save the point where we
3386     found it, so that we don't search again next time round the loop if
3387     the start hasn't passed this character yet. */
3388 ph10 461
3389 ph10 455 req_byte_ptr = p;
3390 nigel 77 }
3391 ph10 461 }
3392 nigel 77 }
3393 ph10 455 } /* End of optimizations that are done when not restarting */
3394 nigel 77
3395     /* OK, now we can do the business */
3396    
3397 ph10 435 md->start_used_ptr = current_subject;
3398 ph10 654 md->recursive = NULL;
3399 ph10 461
3400 nigel 77 rc = internal_dfa_exec(
3401 nigel 91 md, /* fixed match data */
3402     md->start_code, /* this subexpression's code */
3403     current_subject, /* where we currently are */
3404     start_offset, /* start offset in subject */
3405     offsets, /* offset vector */
3406     offsetcount, /* size of same */
3407     workspace, /* workspace vector */
3408     wscount, /* size of same */
3409 ph10 642 0); /* function recurse level */
3410 nigel 77
3411     /* Anything other than "no match" means we are done, always; otherwise, carry
3412     on only if not anchored. */
3413    
3414     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3415    
3416     /* Advance to the next subject character unless we are at the end of a line
3417     and firstline is set. */
3418    
3419 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3420 nigel 77 current_subject++;
3421     if (utf8)
3422     {
3423     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3424     current_subject++;
3425     }
3426     if (current_subject > end_subject) break;
3427    
3428 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3429 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3430     or ANY or ANYCRLF, advance the match position by one more character. */
3431 nigel 93
3432 ph10 391 if (current_subject[-1] == CHAR_CR &&
3433 ph10 226 current_subject < end_subject &&
3434 ph10 391 *current_subject == CHAR_NL &&
3435 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3436 ph10 226 (md->nltype == NLTYPE_ANY ||
3437     md->nltype == NLTYPE_ANYCRLF ||
3438     md->nllen == 2))
3439 nigel 93 current_subject++;
3440    
3441     } /* "Bumpalong" loop */
3442    
3443 nigel 77 return PCRE_ERROR_NOMATCH;
3444     }
3445    
3446     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12