/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 613 - (hide annotations) (download)
Sat Jul 2 16:59:52 2011 UTC (3 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 118971 byte(s)
Fix problem with the interaction of (*ACCEPT) in an assertion with 
PCRE_NOTEMPTY.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 598 Copyright (c) 1997-2011 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
/* For use to indent debugging output. SP is always printed with "%.*s" and a
precision of rlevel*2-2. A precision can only shorten a string, never pad it,
so SP must hold at least as many spaces as the widest indent wanted. The 20
spaces below give full indentation up to rlevel 11; deeper levels are capped at
this width. */

#define SP "                    "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
/* In internal_dfa_exec(), when codevalue >= OP_TYPESTAR one of these offsets is
added to codevalue according to the type opcode d that follows the repeat: */
101 ph10 178 #define OP_PROP_EXTRA 300 /* d is OP_PROP or OP_NOTPROP */
102     #define OP_EXTUNI_EXTRA 320 /* d is OP_EXTUNI */
103     #define OP_ANYNL_EXTRA 340 /* d is OP_ANYNL */
104     #define OP_HSPACE_EXTRA 360 /* d is OP_HSPACE or OP_NOT_HSPACE */
105     #define OP_VSPACE_EXTRA 380 /* d is OP_VSPACE or OP_NOT_VSPACE */
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
/* coptable is indexed by opcode value. internal_dfa_exec() reads
coptable[codevalue]: when the entry is non-zero, the item to be tested (d) is
loaded from code + coptable[codevalue]; when it is zero, d is set to NOTACHAR.
The entries are 1 for most opcodes and 3 for the upto/minupto/exact/upto+ forms
(presumably to step over a repeat count that follows the opcode - confirm in
pcre_internal.h). The case labels at the top of the opcode switch in
internal_dfa_exec() fail to compile if sizeof(coptable) is not OP_TABLE_LENGTH,
which equals the entry count only while the elements are one byte each. */
116 ph10 327 static const uschar coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 nigel 77 1, /* Char */
126 ph10 602 1, /* Chari */
127 nigel 77 1, /* not */
128 ph10 602 1, /* noti */
129 nigel 77 /* Positive single-char repeats */
130     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131     3, 3, 3, /* upto, minupto, exact */
132 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134     3, 3, 3, /* upto I, minupto I, exact I */
135     1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 nigel 77 /* Negative single-char repeats - only for chars < 256 */
137     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* NOT upto, minupto, exact */
139 ph10 602 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140     1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141     3, 3, 3, /* NOT upto I, minupto I, exact I */
142     1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 nigel 77 /* Positive type repeats */
144     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145     3, 3, 3, /* Type upto, minupto, exact */
146 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 nigel 77 /* Character class & ref repeats */
148     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149     0, 0, /* CRRANGE, CRMINRANGE */
150     0, /* CLASS */
151     0, /* NCLASS */
152     0, /* XCLASS - variable length */
153     0, /* REF */
154 ph10 602 0, /* REFI */
155 nigel 77 0, /* RECURSE */
156     0, /* CALLOUT */
157     0, /* Alt */
158     0, /* Ket */
159     0, /* KetRmax */
160     0, /* KetRmin */
161 ph10 604 0, /* KetRpos */
162 nigel 77 0, /* Assert */
163     0, /* Assert not */
164     0, /* Assert behind */
165     0, /* Assert behind not */
166     0, /* Reverse */
167 ph10 604 0, 0, 0, 0, 0, 0, /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, COND */
168     0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
169 ph10 498 0, 0, /* CREF, NCREF */
170     0, 0, /* RREF, NRREF */
171 nigel 93 0, /* DEF */
172 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
173 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
174     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
175     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
176     0, 0 /* CLOSE, SKIPZERO */
177 nigel 77 };
178    
179 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
180 ph10 462 remember the fact that a character could have been inspected when the end of
181 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
182     two tables that follow must also be modified. */
183 ph10 462
/* poptable is indexed by opcode value; an entry is 1 when the opcode inspects a
subject character and 0 otherwise. internal_dfa_exec() consults it only when
clen == 0 (end of subject): a non-zero poptable[codevalue] sets could_continue
to TRUE. The case labels at the top of the opcode switch in internal_dfa_exec()
fail to compile if sizeof(poptable) is not OP_TABLE_LENGTH. */
184     static const uschar poptable[] = {
185     0, /* End */
186 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
187 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
188     1, 1, 1, /* Any, AllAny, Anybyte */
189 ph10 498 1, 1, /* \P, \p */
190 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
191 ph10 498 1, /* \X */
192 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
193 ph10 462 1, /* Char */
194 ph10 602 1, /* Chari */
195 ph10 462 1, /* not */
196 ph10 602 1, /* noti */
197 ph10 462 /* Positive single-char repeats */
198     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
199     1, 1, 1, /* upto, minupto, exact */
200     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
201 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
202     1, 1, 1, /* upto I, minupto I, exact I */
203     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
204 ph10 462 /* Negative single-char repeats - only for chars < 256 */
205     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
206     1, 1, 1, /* NOT upto, minupto, exact */
207     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
208 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
209     1, 1, 1, /* NOT upto I, minupto I, exact I */
210     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
211 ph10 462 /* Positive type repeats */
212     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
213     1, 1, 1, /* Type upto, minupto, exact */
214     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
215     /* Character class & ref repeats */
216     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
217     1, 1, /* CRRANGE, CRMINRANGE */
218     1, /* CLASS */
219     1, /* NCLASS */
220     1, /* XCLASS - variable length */
221     0, /* REF */
222 ph10 602 0, /* REFI */
223 ph10 462 0, /* RECURSE */
224     0, /* CALLOUT */
225     0, /* Alt */
226     0, /* Ket */
227     0, /* KetRmax */
228     0, /* KetRmin */
229 ph10 604 0, /* KetRpos */
230 ph10 462 0, /* Assert */
231     0, /* Assert not */
232     0, /* Assert behind */
233     0, /* Assert behind not */
234     0, /* Reverse */
235 ph10 604 0, 0, 0, 0, 0, 0, /* ONCE, BRA, BRAPOS, CBRA, CBRAPOS, COND */
236     0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
237 ph10 498 0, 0, /* CREF, NCREF */
238     0, 0, /* RREF, NRREF */
239 ph10 462 0, /* DEF */
240 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
241 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
242     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
243     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
244     0, 0 /* CLOSE, SKIPZERO */
245 ph10 462 };
246    
247 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
248     and \w */
249    
/* toptable1: one entry per opcode from End up to AllAny, in the same order as
the first rows of coptable. Each of the six \D..\w opcodes carries the ctype_*
flag for its character type (the same flag for the negated and the positive
form); all other entries are 0. The code that reads this table is outside the
part of the file visible here. */
250 ph10 327 static const uschar toptable1[] = {
251 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
252 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
253     ctype_space, ctype_space, /* \S, \s */
254     ctype_word, ctype_word, /* \W, \w */
255 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
256 nigel 77 };
257    
/* toptable2: same indexing as toptable1. Only the negated opcodes (\D, \S, \W)
carry their ctype_* flag here; the positive ones are 0, and OP_ANY/OP_ALLANY
are 1. NOTE(review): presumably the two tables are combined so that this entry
inverts the sense of the toptable1 test for the negated types - confirm at the
use site, which is outside the part of the file visible here. */
258 ph10 327 static const uschar toptable2[] = {
259 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
260 nigel 77 ctype_digit, 0, /* \D, \d */
261     ctype_space, 0, /* \S, \s */
262     ctype_word, 0, /* \W, \w */
263 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
264 nigel 77 };
265    
266    
267     /* Structure for holding data about a particular state, which is in effect the
268     current data for an active path through the match tree. It must consist
269     entirely of ints because the working vector we are passed, and which we put
270     these structures in, is a vector of ints. */
271    
/* One entry in a state list. internal_dfa_exec() keeps two arrays of these in
the caller's int workspace and swaps them at each subject character. */
272     typedef struct stateblock {
273     int offset; /* Offset to opcode, added to start_code to find it. A
                   negative value is a held-off state: it is carried forward
                   until data reaches 0 and is then negated and processed. */
274     int count; /* Count for repeats; compared together with offset when
                  duplicate states are looked for. */
275     int data; /* Some use extra data. Written only by the ADD_*_DATA macros;
                 for a negative offset it is the number of characters still to
                 be skipped. */
276     } stateblock;
277    
/* Number of ints occupied by one stateblock. internal_dfa_exec() reserves two
ints of the workspace and divides the rest by twice this value to get the
capacity of each of its two state lists. The result has the type of sizeof. */
278     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
279    
280    
281 ph10 475 #ifdef PCRE_DEBUG
/*************************************************
*           Print character string               *
*************************************************/

/* Character string printing function for debugging. Each byte for which
isprint() is true is written as itself; every other byte is written as a
backslash, an 'x' and two lower-case hex digits. Nothing is written when length
is zero or negative. The string is only read, so a pointer to const data can be
passed without a cast.

Arguments:
  p           points to string (need not be zero-terminated)
  length      number of bytes
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;      /* unsigned char value, so always valid for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
308     #endif
309    
310    
311    
312     /*************************************************
313     * Execute a Regular Expression - DFA engine *
314     *************************************************/
315    
316     /* This internal function applies a compiled pattern to a subject string,
317     starting at a given point, using a DFA engine. This function is called from the
318     external one, possibly multiple times if the pattern is not anchored. The
319     function calls itself recursively for some kinds of subpattern.
320    
321     Arguments:
322     md the match_data block with fixed information
323     this_start_code the opening bracket of this subexpression's code
324     current_subject where we currently are in the subject string
325     start_offset start offset in the subject string
326     offsets vector to contain the matching string offsets
327     offsetcount size of same
328     workspace vector of workspace
329     wscount size of same
330     rlevel function call recursion level
331     recursing regex recursive call level
332    
333 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
334 ph10 341 = 0 => offsets overflowed; longest matches are present
335 nigel 77 -1 => failed to match
336     < -1 => some kind of unexpected problem
337    
338     The following macros are used for adding states to the two state vectors (one
339     for the current character, one for the following character). */
340    
/* ADD_ACTIVE(x,y): append a state with opcode offset x and repeat count y to
the list for the current character. The data field of the new entry is not
written. If the list already holds wscount states, the function in which the
macro is expanded returns PCRE_ERROR_DFA_WSSIZE. The macro uses that function's
active_count, next_active_state, wscount and rlevel. It expands to a single
statement and must be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
350    
/* ADD_ACTIVE_DATA(x,y,z): append a state with opcode offset x, repeat count y
and data value z to the list for the current character. If the list already
holds wscount states, the function in which the macro is expanded returns
PCRE_ERROR_DFA_WSSIZE. The macro uses that function's active_count,
next_active_state, wscount and rlevel. It expands to a single statement and
must be followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
361    
/* ADD_NEW(x,y): append a state with opcode offset x and repeat count y to the
list for the following character. The data field of the new entry is not
written. If the list already holds wscount states, the function in which the
macro is expanded returns PCRE_ERROR_DFA_WSSIZE. The macro uses that function's
new_count, next_new_state, wscount and rlevel. It expands to a single statement
and must be followed by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
371    
/* ADD_NEW_DATA(x,y,z): append a state with opcode offset x, repeat count y and
data value z to the list for the following character. If the list already
holds wscount states, the function in which the macro is expanded returns
PCRE_ERROR_DFA_WSSIZE. The macro uses that function's new_count,
next_new_state, wscount and rlevel. It expands to a single statement and must
be followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
382    
383     /* And now, here is the code */
384    
385     static int
386     internal_dfa_exec(
387     dfa_match_data *md,
388     const uschar *this_start_code,
389     const uschar *current_subject,
390     int start_offset,
391     int *offsets,
392     int offsetcount,
393     int *workspace,
394     int wscount,
395     int rlevel,
396     int recursing)
397     {
398     stateblock *active_states, *new_states, *temp_states;
399     stateblock *next_active_state, *next_new_state;
400    
401     const uschar *ctypes, *lcc, *fcc;
402     const uschar *ptr;
403 nigel 93 const uschar *end_code, *first_op;
404 nigel 77
405     int active_count, new_count, match_count;
406    
407     /* Some fields in the md block are frequently referenced, so we load them into
408     independent variables in the hope that this will perform better. */
409    
410     const uschar *start_subject = md->start_subject;
411     const uschar *end_subject = md->end_subject;
412     const uschar *start_code = md->start_code;
413    
414 nigel 87 #ifdef SUPPORT_UTF8
415 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
416 nigel 93 #else
417     BOOL utf8 = FALSE;
418 nigel 87 #endif
419 nigel 77
420     rlevel++;
421     offsetcount &= (-2);
422    
423     wscount -= 2;
424     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
425     (2 * INTS_PER_STATEBLOCK);
426    
427     DPRINTF(("\n%.*s---------------------\n"
428     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
429     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
430    
431     ctypes = md->tables + ctypes_offset;
432     lcc = md->tables + lcc_offset;
433     fcc = md->tables + fcc_offset;
434    
435     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
436    
437     active_states = (stateblock *)(workspace + 2);
438     next_new_state = new_states = active_states + wscount;
439     new_count = 0;
440    
441 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
442 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
443     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)? 2:0);
444 nigel 93
445 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
446     the alternative states onto the list, and find out where the end is. This
447     makes it possible to use this function recursively, when we want to stop at a
448     matching internal ket rather than at the end.
449    
450     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
451     a backward assertion. In that case, we have to find out the maximum amount to
452     move back, and set up each alternative appropriately. */
453    
454 nigel 93 if (*first_op == OP_REVERSE)
455 nigel 77 {
456     int max_back = 0;
457     int gone_back;
458    
459     end_code = this_start_code;
460     do
461     {
462     int back = GET(end_code, 2+LINK_SIZE);
463     if (back > max_back) max_back = back;
464     end_code += GET(end_code, 1);
465     }
466     while (*end_code == OP_ALT);
467    
468     /* If we can't go back the amount required for the longest lookbehind
469     pattern, go back as far as we can; some alternatives may still be viable. */
470    
471     #ifdef SUPPORT_UTF8
472     /* In character mode we have to step back character by character */
473    
474     if (utf8)
475     {
476     for (gone_back = 0; gone_back < max_back; gone_back++)
477     {
478     if (current_subject <= start_subject) break;
479     current_subject--;
480     while (current_subject > start_subject &&
481     (*current_subject & 0xc0) == 0x80)
482     current_subject--;
483     }
484     }
485     else
486     #endif
487    
488     /* In byte-mode we can do this quickly. */
489    
490     {
491     gone_back = (current_subject - max_back < start_subject)?
492 ph10 530 (int)(current_subject - start_subject) : max_back;
493 nigel 77 current_subject -= gone_back;
494     }
495 ph10 461
496 ph10 435 /* Save the earliest consulted character */
497 nigel 77
498 ph10 461 if (current_subject < md->start_used_ptr)
499     md->start_used_ptr = current_subject;
500    
501 nigel 77 /* Now we can process the individual branches. */
502    
503     end_code = this_start_code;
504     do
505     {
506     int back = GET(end_code, 2+LINK_SIZE);
507     if (back <= gone_back)
508     {
509 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
510 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
511     }
512     end_code += GET(end_code, 1);
513     }
514     while (*end_code == OP_ALT);
515     }
516    
517     /* This is the code for a "normal" subpattern (not a backward assertion). The
518     start of a whole pattern is always one of these. If we are at the top level,
519     we may be asked to restart matching from the same point that we reached for a
520     previous partial match. We still have to scan through the top-level branches to
521     find the end state. */
522    
523     else
524     {
525     end_code = this_start_code;
526    
527     /* Restarting */
528    
529     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
530     {
531     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
532     new_count = workspace[1];
533     if (!workspace[0])
534     memcpy(new_states, active_states, new_count * sizeof(stateblock));
535     }
536    
537     /* Not restarting */
538    
539     else
540     {
541 nigel 93 int length = 1 + LINK_SIZE +
542 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
543     *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)?
544     2:0);
545 nigel 77 do
546     {
547 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
548 nigel 77 end_code += GET(end_code, 1);
549 nigel 93 length = 1 + LINK_SIZE;
550 nigel 77 }
551     while (*end_code == OP_ALT);
552     }
553     }
554    
555     workspace[0] = 0; /* Bit indicating which vector is current */
556    
557     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
558    
559     /* Loop for scanning the subject */
560    
561     ptr = current_subject;
562     for (;;)
563     {
564     int i, j;
565 nigel 91 int clen, dlen;
566     unsigned int c, d;
567 ph10 428 int forced_fail = 0;
568 ph10 462 BOOL could_continue = FALSE;
569 nigel 77
570     /* Make the new state list into the active state list and empty the
571     new state list. */
572    
573     temp_states = active_states;
574     active_states = new_states;
575     new_states = temp_states;
576     active_count = new_count;
577     new_count = 0;
578    
579     workspace[0] ^= 1; /* Remember for the restarting feature */
580     workspace[1] = active_count;
581    
582 ph10 475 #ifdef PCRE_DEBUG
583 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
584     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
585     printf("\"\n");
586    
587     printf("%.*sActive states: ", rlevel*2-2, SP);
588     for (i = 0; i < active_count; i++)
589     printf("%d/%d ", active_states[i].offset, active_states[i].count);
590     printf("\n");
591     #endif
592    
593     /* Set the pointers for adding new states */
594    
595     next_active_state = active_states + active_count;
596     next_new_state = new_states;
597    
598     /* Load the current character from the subject outside the loop, as many
599     different states may want to look at it, and we assume that at least one
600     will. */
601    
602     if (ptr < end_subject)
603     {
604 nigel 93 clen = 1; /* Number of bytes in the character */
605 nigel 77 #ifdef SUPPORT_UTF8
606     if (utf8) { GETCHARLEN(c, ptr, clen); } else
607     #endif /* SUPPORT_UTF8 */
608     c = *ptr;
609     }
610     else
611     {
612 nigel 93 clen = 0; /* This indicates the end of the subject */
613     c = NOTACHAR; /* This value should never actually be used */
614 nigel 77 }
615    
616     /* Scan up the active states and act on each one. The result of an action
617     may be to add more states to the currently active list (e.g. on hitting a
618     parenthesis) or it may be to put states on the new list, for considering
619     when we move the character pointer on. */
620    
621     for (i = 0; i < active_count; i++)
622     {
623     stateblock *current_state = active_states + i;
624 ph10 602 BOOL caseless = FALSE;
625 nigel 77 const uschar *code;
626     int state_offset = current_state->offset;
627 ph10 397 int count, codevalue, rrc;
628 nigel 77
629 ph10 475 #ifdef PCRE_DEBUG
630 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
631 nigel 93 if (clen == 0) printf("EOL\n");
632 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
633     else printf("0x%02x\n", c);
634     #endif
635    
636     /* A negative offset is a special case meaning "hold off going to this
637     (negated) state until the number of characters in the data field have
638     been skipped". */
639    
640     if (state_offset < 0)
641     {
642     if (current_state->data > 0)
643     {
644     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
645     ADD_NEW_DATA(state_offset, current_state->count,
646     current_state->data - 1);
647     continue;
648     }
649     else
650     {
651     current_state->offset = state_offset = -state_offset;
652     }
653     }
654    
655 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
656 ph10 439 See the note at the head of this module about the possibility of improving
657     performance here. */
658 nigel 77
659     for (j = 0; j < i; j++)
660     {
661     if (active_states[j].offset == state_offset &&
662     active_states[j].count == current_state->count)
663     {
664     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
665     goto NEXT_ACTIVE_STATE;
666     }
667     }
668    
669     /* The state offset is the offset to the opcode */
670    
671     code = start_code + state_offset;
672     codevalue = *code;
673    
674 ph10 463 /* If this opcode inspects a character, but we are at the end of the
675     subject, remember the fact for use when testing for a partial match. */
676    
677 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
678 ph10 463 could_continue = TRUE;
679 ph10 462
680 nigel 77 /* If this opcode is followed by an inline character, load it. It is
681     tempting to test for the presence of a subject character here, but that
682     is wrong, because sometimes zero repetitions of the subject are
683     permitted.
684    
685     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
686 ph10 178 argument that is not a data character - but is always one byte long. We
687     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
688     this case. To keep the other cases fast, convert these ones to new opcodes.
689     */
690 nigel 77
691     if (coptable[codevalue] > 0)
692     {
693     dlen = 1;
694     #ifdef SUPPORT_UTF8
695     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
696     #endif /* SUPPORT_UTF8 */
697     d = code[coptable[codevalue]];
698     if (codevalue >= OP_TYPESTAR)
699     {
700 nigel 93 switch(d)
701     {
702     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
703     case OP_NOTPROP:
704     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
705     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
706     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
707 ph10 178 case OP_NOT_HSPACE:
708 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
709 ph10 178 case OP_NOT_VSPACE:
710 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
711 nigel 93 default: break;
712     }
713 nigel 77 }
714     }
715     else
716     {
717     dlen = 0; /* Not strictly necessary, but compilers moan */
718 nigel 93 d = NOTACHAR; /* if these variables are not set. */
719 nigel 77 }
720    
721    
722     /* Now process the individual opcodes */
723    
724     switch (codevalue)
725     {
726 ph10 498 /* ========================================================================== */
727     /* These cases are never obeyed. This is a fudge that causes a compile-
728     time error if the vectors coptable or poptable, which are indexed by
729     opcode, are not the correct length. It seems to be the only way to do
730     such a check at compile time, as the sizeof() operator does not work
731     in the C preprocessor. */
732 ph10 507
733 ph10 498 case OP_TABLE_LENGTH:
734 ph10 507 case OP_TABLE_LENGTH +
735 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
736     (sizeof(poptable) == OP_TABLE_LENGTH)):
737 ph10 507 break;
738 nigel 77
739     /* ========================================================================== */
740     /* Reached a closing bracket. If not at the end of the pattern, carry
741 ph10 604 on with the next opcode. For repeating opcodes, also add the repeat
742     state. Note that KETRPOS will always be encountered at the end of the
743     subpattern, because the possessive subpattern repeats are always handled
744     using recursive calls. Thus, it never adds any new states.
745    
746     At the end of the (sub)pattern, unless we have an empty string and
747 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
748 ph10 442 start of the subject, save the match data, shifting up all previous
749 nigel 77 matches so we always have the longest first. */
750    
751     case OP_KET:
752     case OP_KETRMIN:
753     case OP_KETRMAX:
754 ph10 604 case OP_KETRPOS:
755 nigel 77 if (code != end_code)
756     {
757     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
758     if (codevalue != OP_KET)
759     {
760     ADD_ACTIVE(state_offset - GET(code, 1), 0);
761     }
762     }
763 ph10 461 else
764 nigel 77 {
765 ph10 461 if (ptr > current_subject ||
766 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
767     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
768     current_subject > start_subject + md->start_offset)))
769 nigel 77 {
770 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
771     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
772     match_count = 0;
773     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
774     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
775     if (offsetcount >= 2)
776     {
777 ph10 530 offsets[0] = (int)(current_subject - start_subject);
778     offsets[1] = (int)(ptr - start_subject);
779 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
780     offsets[1] - offsets[0], current_subject));
781     }
782     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
783     {
784     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
785     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
786     match_count, rlevel*2-2, SP));
787     return match_count;
788     }
789 ph10 461 }
790 nigel 77 }
791     break;
792    
793     /* ========================================================================== */
794     /* These opcodes add to the current list of states without looking
795     at the current character. */
796    
797     /*-----------------------------------------------------------------*/
798     case OP_ALT:
799     do { code += GET(code, 1); } while (*code == OP_ALT);
800 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
801 nigel 77 break;
802    
803     /*-----------------------------------------------------------------*/
804     case OP_BRA:
805 nigel 93 case OP_SBRA:
806 nigel 77 do
807     {
808 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
809 nigel 77 code += GET(code, 1);
810     }
811     while (*code == OP_ALT);
812     break;
813    
814     /*-----------------------------------------------------------------*/
815 nigel 93 case OP_CBRA:
816     case OP_SCBRA:
817 ph10 530 ADD_ACTIVE((int)(code - start_code + 3 + LINK_SIZE), 0);
818 nigel 93 code += GET(code, 1);
819     while (*code == OP_ALT)
820     {
821 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
822 nigel 93 code += GET(code, 1);
823     }
824     break;
825    
826     /*-----------------------------------------------------------------*/
827 nigel 77 case OP_BRAZERO:
828     case OP_BRAMINZERO:
829     ADD_ACTIVE(state_offset + 1, 0);
830     code += 1 + GET(code, 2);
831     while (*code == OP_ALT) code += GET(code, 1);
832 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
833 nigel 77 break;
834    
835     /*-----------------------------------------------------------------*/
836 ph10 335 case OP_SKIPZERO:
837     code += 1 + GET(code, 2);
838     while (*code == OP_ALT) code += GET(code, 1);
839 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
840 ph10 335 break;
841    
842     /*-----------------------------------------------------------------*/
843 nigel 77 case OP_CIRC:
844 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
845     { ADD_ACTIVE(state_offset + 1, 0); }
846     break;
847    
848     /*-----------------------------------------------------------------*/
849     case OP_CIRCM:
850 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
851 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
852 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
853     break;
854    
855     /*-----------------------------------------------------------------*/
856     case OP_EOD:
857 ph10 579 if (ptr >= end_subject)
858     {
859 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
860     could_continue = TRUE;
861     else { ADD_ACTIVE(state_offset + 1, 0); }
862     }
863 nigel 77 break;
864    
865     /*-----------------------------------------------------------------*/
866     case OP_SOD:
867     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
868     break;
869    
870     /*-----------------------------------------------------------------*/
871     case OP_SOM:
872     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
873     break;
874    
875    
876     /* ========================================================================== */
877     /* These opcodes inspect the next subject character, and sometimes
878     the previous one as well, but do not have an argument. The variable
879     clen contains the length of the current character and is zero if we are
880     at the end of the subject. */
881    
882     /*-----------------------------------------------------------------*/
883     case OP_ANY:
884 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
885 nigel 77 { ADD_NEW(state_offset + 1, 0); }
886     break;
887    
888     /*-----------------------------------------------------------------*/
889 ph10 341 case OP_ALLANY:
890     if (clen > 0)
891     { ADD_NEW(state_offset + 1, 0); }
892     break;
893    
894     /*-----------------------------------------------------------------*/
895 nigel 77 case OP_EODN:
896 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
897     could_continue = TRUE;
898     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
899 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
900     break;
901    
902     /*-----------------------------------------------------------------*/
903     case OP_DOLL:
904     if ((md->moptions & PCRE_NOTEOL) == 0)
905     {
906 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
907     could_continue = TRUE;
908     else if (clen == 0 ||
909 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
910 ph10 602 (ptr == end_subject - md->nllen)
911 nigel 91 ))
912 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
913     }
914 ph10 602 break;
915    
916     /*-----------------------------------------------------------------*/
917     case OP_DOLLM:
918     if ((md->moptions & PCRE_NOTEOL) == 0)
919     {
920     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
921     could_continue = TRUE;
922     else if (clen == 0 ||
923     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
924     { ADD_ACTIVE(state_offset + 1, 0); }
925     }
926     else if (IS_NEWLINE(ptr))
927 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
928     break;
929    
930     /*-----------------------------------------------------------------*/
931    
932     case OP_DIGIT:
933     case OP_WHITESPACE:
934     case OP_WORDCHAR:
935     if (clen > 0 && c < 256 &&
936     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
937     { ADD_NEW(state_offset + 1, 0); }
938     break;
939    
940     /*-----------------------------------------------------------------*/
941     case OP_NOT_DIGIT:
942     case OP_NOT_WHITESPACE:
943     case OP_NOT_WORDCHAR:
944     if (clen > 0 && (c >= 256 ||
945     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
946     { ADD_NEW(state_offset + 1, 0); }
947     break;
948    
949     /*-----------------------------------------------------------------*/
950     case OP_WORD_BOUNDARY:
951     case OP_NOT_WORD_BOUNDARY:
952     {
953     int left_word, right_word;
954    
955     if (ptr > start_subject)
956     {
957     const uschar *temp = ptr - 1;
958 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
959 nigel 77 #ifdef SUPPORT_UTF8
960     if (utf8) BACKCHAR(temp);
961     #endif
962     GETCHARTEST(d, temp);
963 ph10 535 #ifdef SUPPORT_UCP
964 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
965     {
966     if (d == '_') left_word = TRUE; else
967 ph10 535 {
968 ph10 518 int cat = UCD_CATEGORY(d);
969     left_word = (cat == ucp_L || cat == ucp_N);
970 ph10 535 }
971     }
972     else
973     #endif
974 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
975     }
976 ph10 518 else left_word = FALSE;
977 nigel 77
978 ph10 461 if (clen > 0)
979 ph10 535 {
980     #ifdef SUPPORT_UCP
981 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
982     {
983     if (c == '_') right_word = TRUE; else
984 ph10 535 {
985 ph10 518 int cat = UCD_CATEGORY(c);
986     right_word = (cat == ucp_L || cat == ucp_N);
987 ph10 535 }
988     }
989     else
990     #endif
991 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
992 ph10 535 }
993 ph10 518 else right_word = FALSE;
994 nigel 77
995     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
996     { ADD_ACTIVE(state_offset + 1, 0); }
997     }
998     break;
999    
1000    
1001     /*-----------------------------------------------------------------*/
1002     /* Check the next character by Unicode property. We will get here only
1003     if the support is in the binary; otherwise a compile-time error occurs.
1004     */
1005    
1006 ph10 151 #ifdef SUPPORT_UCP
1007 nigel 77 case OP_PROP:
1008     case OP_NOTPROP:
1009     if (clen > 0)
1010     {
1011 nigel 87 BOOL OK;
1012 ph10 349 const ucd_record * prop = GET_UCD(c);
1013 nigel 87 switch(code[1])
1014 nigel 77 {
1015 nigel 87 case PT_ANY:
1016     OK = TRUE;
1017     break;
1018    
1019     case PT_LAMP:
1020 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1021 ph10 517 prop->chartype == ucp_Lt;
1022 nigel 87 break;
1023    
1024     case PT_GC:
1025 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1026 nigel 87 break;
1027    
1028     case PT_PC:
1029 ph10 349 OK = prop->chartype == code[2];
1030 nigel 87 break;
1031    
1032     case PT_SC:
1033 ph10 349 OK = prop->script == code[2];
1034 nigel 87 break;
1035 ph10 535
1036 ph10 517 /* These are specials for combination cases. */
1037 ph10 535
1038 ph10 517 case PT_ALNUM:
1039     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1040     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1041 ph10 535 break;
1042    
1043 ph10 517 case PT_SPACE: /* Perl space */
1044     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1045     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1046 ph10 535 break;
1047    
1048 ph10 517 case PT_PXSPACE: /* POSIX space */
1049     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1050     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1051     c == CHAR_FF || c == CHAR_CR;
1052 ph10 535 break;
1053    
1054 ph10 517 case PT_WORD:
1055     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1056     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1057     c == CHAR_UNDERSCORE;
1058 ph10 535 break;
1059 nigel 87
1060     /* Should never occur, but keep compilers from grumbling. */
1061    
1062     default:
1063     OK = codevalue != OP_PROP;
1064     break;
1065 nigel 77 }
1066 nigel 87
1067     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1068 nigel 77 }
1069     break;
1070     #endif
1071    
1072    
1073    
1074     /* ========================================================================== */
1075     /* These opcodes likewise inspect the subject character, but have an
1076     argument that is not a data character. It is one of these opcodes:
1077 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1078     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1079 nigel 77
1080     case OP_TYPEPLUS:
1081     case OP_TYPEMINPLUS:
1082 nigel 93 case OP_TYPEPOSPLUS:
1083 nigel 77 count = current_state->count; /* Already matched */
1084     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1085     if (clen > 0)
1086     {
1087     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1088     (c < 256 &&
1089 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1090 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1091     {
1092 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1093     {
1094     active_count--; /* Remove non-match possibility */
1095     next_active_state--;
1096     }
1097 nigel 77 count++;
1098     ADD_NEW(state_offset, count);
1099     }
1100     }
1101     break;
1102    
1103     /*-----------------------------------------------------------------*/
1104     case OP_TYPEQUERY:
1105     case OP_TYPEMINQUERY:
1106 nigel 93 case OP_TYPEPOSQUERY:
1107 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1108     if (clen > 0)
1109     {
1110     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1111     (c < 256 &&
1112 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1113 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1114     {
1115 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1116     {
1117     active_count--; /* Remove non-match possibility */
1118     next_active_state--;
1119     }
1120 nigel 77 ADD_NEW(state_offset + 2, 0);
1121     }
1122     }
1123     break;
1124    
1125     /*-----------------------------------------------------------------*/
1126     case OP_TYPESTAR:
1127     case OP_TYPEMINSTAR:
1128 nigel 93 case OP_TYPEPOSSTAR:
1129 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1130     if (clen > 0)
1131     {
1132     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1133     (c < 256 &&
1134 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1135 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1136     {
1137 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1138     {
1139     active_count--; /* Remove non-match possibility */
1140     next_active_state--;
1141     }
1142 nigel 77 ADD_NEW(state_offset, 0);
1143     }
1144     }
1145     break;
1146    
1147     /*-----------------------------------------------------------------*/
1148     case OP_TYPEEXACT:
1149 nigel 93 count = current_state->count; /* Number already matched */
1150     if (clen > 0)
1151     {
1152     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1153     (c < 256 &&
1154 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1155 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1156     {
1157     if (++count >= GET2(code, 1))
1158     { ADD_NEW(state_offset + 4, 0); }
1159     else
1160     { ADD_NEW(state_offset, count); }
1161     }
1162     }
1163     break;
1164    
1165     /*-----------------------------------------------------------------*/
1166 nigel 77 case OP_TYPEUPTO:
1167     case OP_TYPEMINUPTO:
1168 nigel 93 case OP_TYPEPOSUPTO:
1169     ADD_ACTIVE(state_offset + 4, 0);
1170 nigel 77 count = current_state->count; /* Number already matched */
1171     if (clen > 0)
1172     {
1173     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1174     (c < 256 &&
1175 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1176 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1177     {
1178 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1179     {
1180     active_count--; /* Remove non-match possibility */
1181     next_active_state--;
1182     }
1183 nigel 77 if (++count >= GET2(code, 1))
1184     { ADD_NEW(state_offset + 4, 0); }
1185     else
1186     { ADD_NEW(state_offset, count); }
1187     }
1188     }
1189     break;
1190    
1191     /* ========================================================================== */
1192     /* These are virtual opcodes that are used when something like
1193 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, or OP_HSPACE as its
1194     argument. It keeps the code above fast for the other cases. The argument
1195     is in the d variable. */
1196 nigel 77
1197 ph10 151 #ifdef SUPPORT_UCP
1198 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1199     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1200 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1201 nigel 77 count = current_state->count; /* Already matched */
1202 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1203 nigel 77 if (clen > 0)
1204     {
1205 nigel 87 BOOL OK;
1206 ph10 349 const ucd_record * prop = GET_UCD(c);
1207 nigel 87 switch(code[2])
1208     {
1209     case PT_ANY:
1210     OK = TRUE;
1211     break;
1212    
1213     case PT_LAMP:
1214 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1215 ph10 517 prop->chartype == ucp_Lt;
1216 nigel 87 break;
1217    
1218     case PT_GC:
1219 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1220 nigel 87 break;
1221    
1222     case PT_PC:
1223 ph10 349 OK = prop->chartype == code[3];
1224 nigel 87 break;
1225    
1226     case PT_SC:
1227 ph10 349 OK = prop->script == code[3];
1228 nigel 87 break;
1229    
1230 ph10 517 /* These are specials for combination cases. */
1231 ph10 535
1232 ph10 517 case PT_ALNUM:
1233     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1234     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1235 ph10 535 break;
1236    
1237 ph10 517 case PT_SPACE: /* Perl space */
1238     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1239     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1240 ph10 535 break;
1241    
1242 ph10 517 case PT_PXSPACE: /* POSIX space */
1243     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1244     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1245     c == CHAR_FF || c == CHAR_CR;
1246 ph10 535 break;
1247    
1248 ph10 517 case PT_WORD:
1249     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1250     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1251     c == CHAR_UNDERSCORE;
1252 ph10 535 break;
1253 ph10 517
1254 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1255    
1256     default:
1257     OK = codevalue != OP_PROP;
1258     break;
1259     }
1260    
1261 nigel 93 if (OK == (d == OP_PROP))
1262     {
1263     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1264     {
1265     active_count--; /* Remove non-match possibility */
1266     next_active_state--;
1267     }
1268     count++;
1269     ADD_NEW(state_offset, count);
1270     }
1271 nigel 77 }
1272     break;
1273    
1274     /*-----------------------------------------------------------------*/
1275     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1276     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1277 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1278 nigel 77 count = current_state->count; /* Already matched */
1279     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1280 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1281 nigel 77 {
1282     const uschar *nptr = ptr + clen;
1283     int ncount = 0;
1284 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1285     {
1286     active_count--; /* Remove non-match possibility */
1287     next_active_state--;
1288     }
1289 nigel 77 while (nptr < end_subject)
1290     {
1291     int nd;
1292     int ndlen = 1;
1293     GETCHARLEN(nd, nptr, ndlen);
1294 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1295 nigel 77 ncount++;
1296     nptr += ndlen;
1297     }
1298     count++;
1299     ADD_NEW_DATA(-state_offset, count, ncount);
1300     }
1301     break;
1302 ph10 151 #endif
1303 nigel 77
1304     /*-----------------------------------------------------------------*/
1305 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1306     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1307     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1308     count = current_state->count; /* Already matched */
1309     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1310     if (clen > 0)
1311     {
1312     int ncount = 0;
1313     switch (c)
1314     {
1315     case 0x000b:
1316     case 0x000c:
1317     case 0x0085:
1318     case 0x2028:
1319     case 0x2029:
1320 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1321     goto ANYNL01;
1322    
1323     case 0x000d:
1324     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1325     /* Fall through */
1326    
1327     ANYNL01:
1328     case 0x000a:
1329 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1330     {
1331     active_count--; /* Remove non-match possibility */
1332     next_active_state--;
1333     }
1334     count++;
1335     ADD_NEW_DATA(-state_offset, count, ncount);
1336     break;
1337 ph10 231
1338 nigel 93 default:
1339     break;
1340     }
1341     }
1342     break;
1343    
1344     /*-----------------------------------------------------------------*/
1345 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1346     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1347     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1348     count = current_state->count; /* Already matched */
1349     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1350     if (clen > 0)
1351     {
1352 ph10 182 BOOL OK;
1353 ph10 178 switch (c)
1354     {
1355     case 0x000a:
1356     case 0x000b:
1357     case 0x000c:
1358     case 0x000d:
1359     case 0x0085:
1360     case 0x2028:
1361     case 0x2029:
1362     OK = TRUE;
1363 ph10 182 break;
1364 ph10 178
1365     default:
1366     OK = FALSE;
1367 ph10 182 break;
1368 ph10 178 }
1369    
1370     if (OK == (d == OP_VSPACE))
1371 ph10 182 {
1372 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1373     {
1374     active_count--; /* Remove non-match possibility */
1375     next_active_state--;
1376     }
1377     count++;
1378     ADD_NEW_DATA(-state_offset, count, 0);
1379     }
1380     }
1381     break;
1382    
1383     /*-----------------------------------------------------------------*/
1384     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1385     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1386     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1387     count = current_state->count; /* Already matched */
1388     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1389     if (clen > 0)
1390     {
1391 ph10 182 BOOL OK;
1392 ph10 178 switch (c)
1393     {
1394     case 0x09: /* HT */
1395     case 0x20: /* SPACE */
1396     case 0xa0: /* NBSP */
1397     case 0x1680: /* OGHAM SPACE MARK */
1398     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1399     case 0x2000: /* EN QUAD */
1400     case 0x2001: /* EM QUAD */
1401     case 0x2002: /* EN SPACE */
1402     case 0x2003: /* EM SPACE */
1403     case 0x2004: /* THREE-PER-EM SPACE */
1404     case 0x2005: /* FOUR-PER-EM SPACE */
1405     case 0x2006: /* SIX-PER-EM SPACE */
1406     case 0x2007: /* FIGURE SPACE */
1407     case 0x2008: /* PUNCTUATION SPACE */
1408     case 0x2009: /* THIN SPACE */
1409     case 0x200A: /* HAIR SPACE */
1410     case 0x202f: /* NARROW NO-BREAK SPACE */
1411     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1412     case 0x3000: /* IDEOGRAPHIC SPACE */
1413     OK = TRUE;
1414     break;
1415 ph10 182
1416 ph10 178 default:
1417     OK = FALSE;
1418     break;
1419     }
1420 ph10 182
1421 ph10 178 if (OK == (d == OP_HSPACE))
1422 ph10 182 {
1423 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1424     {
1425     active_count--; /* Remove non-match possibility */
1426     next_active_state--;
1427     }
1428     count++;
1429     ADD_NEW_DATA(-state_offset, count, 0);
1430     }
1431     }
1432     break;
1433    
1434     /*-----------------------------------------------------------------*/
1435 ph10 151 #ifdef SUPPORT_UCP
1436 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1437     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1438 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1439 nigel 87 count = 4;
1440 nigel 77 goto QS1;
1441    
1442     case OP_PROP_EXTRA + OP_TYPESTAR:
1443     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1444 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1445 nigel 77 count = 0;
1446    
1447     QS1:
1448    
1449 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1450 nigel 77 if (clen > 0)
1451     {
1452 nigel 87 BOOL OK;
1453 ph10 349 const ucd_record * prop = GET_UCD(c);
1454 nigel 87 switch(code[2])
1455     {
1456     case PT_ANY:
1457     OK = TRUE;
1458     break;
1459    
1460     case PT_LAMP:
1461 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1462 ph10 517 prop->chartype == ucp_Lt;
1463 nigel 87 break;
1464    
1465     case PT_GC:
1466 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1467 nigel 87 break;
1468    
1469     case PT_PC:
1470 ph10 349 OK = prop->chartype == code[3];
1471 nigel 87 break;
1472    
1473     case PT_SC:
1474 ph10 349 OK = prop->script == code[3];
1475 nigel 87 break;
1476 ph10 535
1477 ph10 517 /* These are specials for combination cases. */
1478 ph10 535
1479 ph10 517 case PT_ALNUM:
1480     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1481     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1482 ph10 535 break;
1483    
1484 ph10 517 case PT_SPACE: /* Perl space */
1485     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1486     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1487 ph10 535 break;
1488    
1489 ph10 517 case PT_PXSPACE: /* POSIX space */
1490     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1491     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1492     c == CHAR_FF || c == CHAR_CR;
1493 ph10 535 break;
1494    
1495 ph10 517 case PT_WORD:
1496     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1497     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1498     c == CHAR_UNDERSCORE;
1499 ph10 535 break;
1500 nigel 87
1501     /* Should never occur, but keep compilers from grumbling. */
1502    
1503     default:
1504     OK = codevalue != OP_PROP;
1505     break;
1506     }
1507    
1508 nigel 93 if (OK == (d == OP_PROP))
1509     {
1510     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1511     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1512     {
1513     active_count--; /* Remove non-match possibility */
1514     next_active_state--;
1515     }
1516     ADD_NEW(state_offset + count, 0);
1517     }
1518 nigel 77 }
1519     break;
1520    
1521     /*-----------------------------------------------------------------*/
1522     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1523     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1524 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1525 nigel 77 count = 2;
1526     goto QS2;
1527    
1528     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1529     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1530 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1531 nigel 77 count = 0;
1532    
1533     QS2:
1534    
1535     ADD_ACTIVE(state_offset + 2, 0);
1536 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1537 nigel 77 {
1538     const uschar *nptr = ptr + clen;
1539     int ncount = 0;
1540 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1541     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1542     {
1543     active_count--; /* Remove non-match possibility */
1544     next_active_state--;
1545     }
1546 nigel 77 while (nptr < end_subject)
1547     {
1548     int nd;
1549     int ndlen = 1;
1550     GETCHARLEN(nd, nptr, ndlen);
1551 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1552 nigel 77 ncount++;
1553     nptr += ndlen;
1554     }
1555     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1556     }
1557     break;
1558 ph10 151 #endif
1559 nigel 77
1560     /*-----------------------------------------------------------------*/
1561 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1562     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1563     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1564     count = 2;
1565     goto QS3;
1566    
1567     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1568     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1569     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1570     count = 0;
1571    
1572     QS3:
1573     ADD_ACTIVE(state_offset + 2, 0);
1574     if (clen > 0)
1575     {
1576     int ncount = 0;
1577     switch (c)
1578     {
1579     case 0x000b:
1580     case 0x000c:
1581     case 0x0085:
1582     case 0x2028:
1583     case 0x2029:
1584 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1585     goto ANYNL02;
1586    
1587     case 0x000d:
1588     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1589     /* Fall through */
1590    
1591     ANYNL02:
1592     case 0x000a:
1593 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1594     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1595     {
1596     active_count--; /* Remove non-match possibility */
1597     next_active_state--;
1598     }
1599     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1600     break;
1601 ph10 231
1602 nigel 93 default:
1603     break;
1604     }
1605     }
1606     break;
1607    
1608     /*-----------------------------------------------------------------*/
1609 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1610     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1611     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1612     count = 2;
1613     goto QS4;
1614    
1615     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1616     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1617     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1618     count = 0;
1619    
1620     QS4:
1621     ADD_ACTIVE(state_offset + 2, 0);
1622     if (clen > 0)
1623     {
1624 ph10 182 BOOL OK;
1625 ph10 178 switch (c)
1626     {
1627     case 0x000a:
1628     case 0x000b:
1629     case 0x000c:
1630     case 0x000d:
1631     case 0x0085:
1632     case 0x2028:
1633     case 0x2029:
1634     OK = TRUE;
1635     break;
1636 ph10 182
1637 ph10 178 default:
1638     OK = FALSE;
1639     break;
1640     }
1641     if (OK == (d == OP_VSPACE))
1642 ph10 182 {
1643 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1644     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1645     {
1646     active_count--; /* Remove non-match possibility */
1647     next_active_state--;
1648     }
1649     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1650     }
1651     }
1652     break;
1653    
1654     /*-----------------------------------------------------------------*/
1655     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1656     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1657     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1658     count = 2;
1659     goto QS5;
1660    
1661     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1662     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1663     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1664     count = 0;
1665    
1666     QS5:
1667     ADD_ACTIVE(state_offset + 2, 0);
1668     if (clen > 0)
1669     {
1670 ph10 182 BOOL OK;
1671 ph10 178 switch (c)
1672     {
1673     case 0x09: /* HT */
1674     case 0x20: /* SPACE */
1675     case 0xa0: /* NBSP */
1676     case 0x1680: /* OGHAM SPACE MARK */
1677     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1678     case 0x2000: /* EN QUAD */
1679     case 0x2001: /* EM QUAD */
1680     case 0x2002: /* EN SPACE */
1681     case 0x2003: /* EM SPACE */
1682     case 0x2004: /* THREE-PER-EM SPACE */
1683     case 0x2005: /* FOUR-PER-EM SPACE */
1684     case 0x2006: /* SIX-PER-EM SPACE */
1685     case 0x2007: /* FIGURE SPACE */
1686     case 0x2008: /* PUNCTUATION SPACE */
1687     case 0x2009: /* THIN SPACE */
1688     case 0x200A: /* HAIR SPACE */
1689     case 0x202f: /* NARROW NO-BREAK SPACE */
1690     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1691     case 0x3000: /* IDEOGRAPHIC SPACE */
1692     OK = TRUE;
1693     break;
1694 ph10 182
1695 ph10 178 default:
1696     OK = FALSE;
1697     break;
1698     }
1699 ph10 182
1700 ph10 178 if (OK == (d == OP_HSPACE))
1701 ph10 182 {
1702 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1703     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1704     {
1705     active_count--; /* Remove non-match possibility */
1706     next_active_state--;
1707     }
1708     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1709     }
1710     }
1711     break;
1712    
1713     /*-----------------------------------------------------------------*/
1714 ph10 151 #ifdef SUPPORT_UCP
1715 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1716     case OP_PROP_EXTRA + OP_TYPEUPTO:
1717     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1718 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1719 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1720 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1721 nigel 77 count = current_state->count; /* Number already matched */
1722     if (clen > 0)
1723     {
1724 nigel 87 BOOL OK;
1725 ph10 349 const ucd_record * prop = GET_UCD(c);
1726 nigel 87 switch(code[4])
1727 nigel 77 {
1728 nigel 87 case PT_ANY:
1729     OK = TRUE;
1730     break;
1731    
1732     case PT_LAMP:
1733 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1734 ph10 517 prop->chartype == ucp_Lt;
1735 nigel 87 break;
1736    
1737     case PT_GC:
1738 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1739 nigel 87 break;
1740    
1741     case PT_PC:
1742 ph10 349 OK = prop->chartype == code[5];
1743 nigel 87 break;
1744    
1745     case PT_SC:
1746 ph10 349 OK = prop->script == code[5];
1747 nigel 87 break;
1748 ph10 535
1749 ph10 517 /* These are specials for combination cases. */
1750 ph10 535
1751 ph10 517 case PT_ALNUM:
1752     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1753     _pcre_ucp_gentype[prop->chartype] == ucp_N;
1754 ph10 535 break;
1755    
1756 ph10 517 case PT_SPACE: /* Perl space */
1757     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1758     c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1759 ph10 535 break;
1760    
1761 ph10 517 case PT_PXSPACE: /* POSIX space */
1762     OK = _pcre_ucp_gentype[prop->chartype] == ucp_Z ||
1763     c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1764     c == CHAR_FF || c == CHAR_CR;
1765 ph10 535 break;
1766    
1767 ph10 517 case PT_WORD:
1768     OK = _pcre_ucp_gentype[prop->chartype] == ucp_L ||
1769     _pcre_ucp_gentype[prop->chartype] == ucp_N ||
1770     c == CHAR_UNDERSCORE;
1771 ph10 535 break;
1772 nigel 87
1773     /* Should never occur, but keep compilers from grumbling. */
1774    
1775     default:
1776     OK = codevalue != OP_PROP;
1777     break;
1778     }
1779    
1780     if (OK == (d == OP_PROP))
1781     {
1782 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1783     {
1784     active_count--; /* Remove non-match possibility */
1785     next_active_state--;
1786     }
1787 nigel 77 if (++count >= GET2(code, 1))
1788 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1789 nigel 77 else
1790     { ADD_NEW(state_offset, count); }
1791     }
1792     }
1793     break;
1794    
1795     /*-----------------------------------------------------------------*/
1796     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1797     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1798     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1799 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1800 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1801     { ADD_ACTIVE(state_offset + 4, 0); }
1802     count = current_state->count; /* Number already matched */
1803 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1804 nigel 77 {
1805     const uschar *nptr = ptr + clen;
1806     int ncount = 0;
1807 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1808     {
1809     active_count--; /* Remove non-match possibility */
1810     next_active_state--;
1811     }
1812 nigel 77 while (nptr < end_subject)
1813     {
1814     int nd;
1815     int ndlen = 1;
1816     GETCHARLEN(nd, nptr, ndlen);
1817 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1818 nigel 77 ncount++;
1819     nptr += ndlen;
1820     }
1821     if (++count >= GET2(code, 1))
1822     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1823     else
1824     { ADD_NEW_DATA(-state_offset, count, ncount); }
1825     }
1826     break;
1827 ph10 151 #endif
1828 nigel 77
1829 nigel 93 /*-----------------------------------------------------------------*/
1830     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1831     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1832     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1833     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1834     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1835     { ADD_ACTIVE(state_offset + 4, 0); }
1836     count = current_state->count; /* Number already matched */
1837     if (clen > 0)
1838     {
1839     int ncount = 0;
1840     switch (c)
1841     {
1842     case 0x000b:
1843     case 0x000c:
1844     case 0x0085:
1845     case 0x2028:
1846     case 0x2029:
1847 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1848     goto ANYNL03;
1849    
1850     case 0x000d:
1851     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1852     /* Fall through */
1853    
1854     ANYNL03:
1855     case 0x000a:
1856 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1857     {
1858     active_count--; /* Remove non-match possibility */
1859     next_active_state--;
1860     }
1861     if (++count >= GET2(code, 1))
1862     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1863     else
1864     { ADD_NEW_DATA(-state_offset, count, ncount); }
1865     break;
1866 ph10 231
1867 nigel 93 default:
1868     break;
1869     }
1870     }
1871     break;
1872    
1873 ph10 178 /*-----------------------------------------------------------------*/
1874     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1875     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1876     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1877     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1878     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1879     { ADD_ACTIVE(state_offset + 4, 0); }
1880     count = current_state->count; /* Number already matched */
1881     if (clen > 0)
1882     {
1883 ph10 182 BOOL OK;
1884 ph10 178 switch (c)
1885     {
1886     case 0x000a:
1887     case 0x000b:
1888     case 0x000c:
1889     case 0x000d:
1890     case 0x0085:
1891     case 0x2028:
1892     case 0x2029:
1893     OK = TRUE;
1894     break;
1895 ph10 182
1896 ph10 178 default:
1897     OK = FALSE;
1898     }
1899 ph10 182
1900 ph10 178 if (OK == (d == OP_VSPACE))
1901 ph10 182 {
1902 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1903     {
1904     active_count--; /* Remove non-match possibility */
1905     next_active_state--;
1906     }
1907     if (++count >= GET2(code, 1))
1908     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1909     else
1910     { ADD_NEW_DATA(-state_offset, count, 0); }
1911     }
1912     }
1913     break;
1914    
1915     /*-----------------------------------------------------------------*/
1916     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1917     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1918     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1919     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1920     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1921     { ADD_ACTIVE(state_offset + 4, 0); }
1922     count = current_state->count; /* Number already matched */
1923     if (clen > 0)
1924     {
1925 ph10 182 BOOL OK;
1926 ph10 178 switch (c)
1927     {
1928     case 0x09: /* HT */
1929     case 0x20: /* SPACE */
1930     case 0xa0: /* NBSP */
1931     case 0x1680: /* OGHAM SPACE MARK */
1932     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1933     case 0x2000: /* EN QUAD */
1934     case 0x2001: /* EM QUAD */
1935     case 0x2002: /* EN SPACE */
1936     case 0x2003: /* EM SPACE */
1937     case 0x2004: /* THREE-PER-EM SPACE */
1938     case 0x2005: /* FOUR-PER-EM SPACE */
1939     case 0x2006: /* SIX-PER-EM SPACE */
1940     case 0x2007: /* FIGURE SPACE */
1941     case 0x2008: /* PUNCTUATION SPACE */
1942     case 0x2009: /* THIN SPACE */
1943     case 0x200A: /* HAIR SPACE */
1944     case 0x202f: /* NARROW NO-BREAK SPACE */
1945     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1946     case 0x3000: /* IDEOGRAPHIC SPACE */
1947     OK = TRUE;
1948     break;
1949 ph10 182
1950 ph10 178 default:
1951     OK = FALSE;
1952     break;
1953     }
1954 ph10 182
1955 ph10 178 if (OK == (d == OP_HSPACE))
1956 ph10 182 {
1957 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1958     {
1959     active_count--; /* Remove non-match possibility */
1960     next_active_state--;
1961     }
1962     if (++count >= GET2(code, 1))
1963     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1964     else
1965     { ADD_NEW_DATA(-state_offset, count, 0); }
1966     }
1967     }
1968     break;
1969    
1970 nigel 77 /* ========================================================================== */
1971     /* These opcodes are followed by a character that is usually compared
1972     to the current subject character; it is loaded into d. We still get
1973     here even if there is no subject character, because in some cases zero
1974     repetitions are permitted. */
1975    
1976     /*-----------------------------------------------------------------*/
1977     case OP_CHAR:
1978     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1979     break;
1980    
1981     /*-----------------------------------------------------------------*/
1982 ph10 602 case OP_CHARI:
1983 nigel 77 if (clen == 0) break;
1984    
1985     #ifdef SUPPORT_UTF8
1986     if (utf8)
1987     {
1988     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1989     {
1990 nigel 93 unsigned int othercase;
1991 nigel 77 if (c < 128) othercase = fcc[c]; else
1992    
1993     /* If we have Unicode property support, we can use it to test the
1994 nigel 87 other case of the character. */
1995 nigel 77
1996     #ifdef SUPPORT_UCP
1997 ph10 349 othercase = UCD_OTHERCASE(c);
1998 nigel 87 #else
1999 nigel 93 othercase = NOTACHAR;
2000 nigel 77 #endif
2001    
2002     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2003     }
2004     }
2005     else
2006     #endif /* SUPPORT_UTF8 */
2007    
2008     /* Non-UTF-8 mode */
2009     {
2010     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2011     }
2012     break;
2013    
2014    
2015     #ifdef SUPPORT_UCP
2016     /*-----------------------------------------------------------------*/
2017     /* This is a tricky one because it can match more than one character.
2018     Find out how many characters to skip, and then set up a negative state
2019     to wait for them to pass before continuing. */
2020    
2021     case OP_EXTUNI:
2022 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2023 nigel 77 {
2024     const uschar *nptr = ptr + clen;
2025     int ncount = 0;
2026     while (nptr < end_subject)
2027     {
2028     int nclen = 1;
2029     GETCHARLEN(c, nptr, nclen);
2030 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
2031 nigel 77 ncount++;
2032     nptr += nclen;
2033     }
2034     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2035     }
2036     break;
2037     #endif
2038    
2039     /*-----------------------------------------------------------------*/
2040 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
2041     character (when CR is followed by LF). In this case, set up a negative
2042     state to wait for one character to pass before continuing. */
2043    
2044     case OP_ANYNL:
2045     if (clen > 0) switch(c)
2046     {
2047     case 0x000b:
2048     case 0x000c:
2049     case 0x0085:
2050     case 0x2028:
2051     case 0x2029:
2052 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2053    
2054     case 0x000a:
2055 nigel 93 ADD_NEW(state_offset + 1, 0);
2056     break;
2057 ph10 231
2058 nigel 93 case 0x000d:
2059     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2060     {
2061     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2062     }
2063     else
2064     {
2065     ADD_NEW(state_offset + 1, 0);
2066     }
2067     break;
2068     }
2069     break;
2070    
2071     /*-----------------------------------------------------------------*/
2072 ph10 178 case OP_NOT_VSPACE:
2073     if (clen > 0) switch(c)
2074     {
2075     case 0x000a:
2076     case 0x000b:
2077     case 0x000c:
2078     case 0x000d:
2079     case 0x0085:
2080     case 0x2028:
2081     case 0x2029:
2082     break;
2083 ph10 182
2084     default:
2085 ph10 178 ADD_NEW(state_offset + 1, 0);
2086     break;
2087     }
2088     break;
2089    
2090     /*-----------------------------------------------------------------*/
2091     case OP_VSPACE:
2092     if (clen > 0) switch(c)
2093     {
2094     case 0x000a:
2095     case 0x000b:
2096     case 0x000c:
2097     case 0x000d:
2098     case 0x0085:
2099     case 0x2028:
2100     case 0x2029:
2101     ADD_NEW(state_offset + 1, 0);
2102     break;
2103 ph10 182
2104 ph10 178 default: break;
2105     }
2106     break;
2107    
2108     /*-----------------------------------------------------------------*/
2109     case OP_NOT_HSPACE:
2110     if (clen > 0) switch(c)
2111     {
2112     case 0x09: /* HT */
2113     case 0x20: /* SPACE */
2114     case 0xa0: /* NBSP */
2115     case 0x1680: /* OGHAM SPACE MARK */
2116     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2117     case 0x2000: /* EN QUAD */
2118     case 0x2001: /* EM QUAD */
2119     case 0x2002: /* EN SPACE */
2120     case 0x2003: /* EM SPACE */
2121     case 0x2004: /* THREE-PER-EM SPACE */
2122     case 0x2005: /* FOUR-PER-EM SPACE */
2123     case 0x2006: /* SIX-PER-EM SPACE */
2124     case 0x2007: /* FIGURE SPACE */
2125     case 0x2008: /* PUNCTUATION SPACE */
2126     case 0x2009: /* THIN SPACE */
2127     case 0x200A: /* HAIR SPACE */
2128     case 0x202f: /* NARROW NO-BREAK SPACE */
2129     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2130     case 0x3000: /* IDEOGRAPHIC SPACE */
2131     break;
2132 ph10 182
2133     default:
2134 ph10 178 ADD_NEW(state_offset + 1, 0);
2135     break;
2136     }
2137     break;
2138    
2139     /*-----------------------------------------------------------------*/
2140     case OP_HSPACE:
2141     if (clen > 0) switch(c)
2142     {
2143     case 0x09: /* HT */
2144     case 0x20: /* SPACE */
2145     case 0xa0: /* NBSP */
2146     case 0x1680: /* OGHAM SPACE MARK */
2147     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2148     case 0x2000: /* EN QUAD */
2149     case 0x2001: /* EM QUAD */
2150     case 0x2002: /* EN SPACE */
2151     case 0x2003: /* EM SPACE */
2152     case 0x2004: /* THREE-PER-EM SPACE */
2153     case 0x2005: /* FOUR-PER-EM SPACE */
2154     case 0x2006: /* SIX-PER-EM SPACE */
2155     case 0x2007: /* FIGURE SPACE */
2156     case 0x2008: /* PUNCTUATION SPACE */
2157     case 0x2009: /* THIN SPACE */
2158     case 0x200A: /* HAIR SPACE */
2159     case 0x202f: /* NARROW NO-BREAK SPACE */
2160     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2161     case 0x3000: /* IDEOGRAPHIC SPACE */
2162     ADD_NEW(state_offset + 1, 0);
2163     break;
2164     }
2165     break;
2166    
2167     /*-----------------------------------------------------------------*/
2168 ph10 602 /* Match a negated single character casefully. This is only used for
2169     one-byte characters, that is, we know that d < 256. The character we are
2170 nigel 77 checking (c) can be multibyte. */
2171    
2172     case OP_NOT:
2173 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2174 nigel 77 break;
2175    
2176     /*-----------------------------------------------------------------*/
2177 ph10 602 /* Match a negated single character caselessly. This is only used for
2178     one-byte characters, that is, we know that d < 256. The character we are
2179     checking (c) can be multibyte. */
2180    
2181     case OP_NOTI:
2182     if (clen > 0 && c != d && c != fcc[d])
2183     { ADD_NEW(state_offset + dlen + 1, 0); }
2184     break;
2185    
2186     /*-----------------------------------------------------------------*/
2187     case OP_PLUSI:
2188     case OP_MINPLUSI:
2189     case OP_POSPLUSI:
2190     case OP_NOTPLUSI:
2191     case OP_NOTMINPLUSI:
2192     case OP_NOTPOSPLUSI:
2193     caseless = TRUE;
2194     codevalue -= OP_STARI - OP_STAR;
2195    
2196     /* Fall through */
2197 nigel 77 case OP_PLUS:
2198     case OP_MINPLUS:
2199 nigel 93 case OP_POSPLUS:
2200 nigel 77 case OP_NOTPLUS:
2201     case OP_NOTMINPLUS:
2202 nigel 93 case OP_NOTPOSPLUS:
2203 nigel 77 count = current_state->count; /* Already matched */
2204     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2205     if (clen > 0)
2206     {
2207 nigel 93 unsigned int otherd = NOTACHAR;
2208 ph10 602 if (caseless)
2209 nigel 77 {
2210     #ifdef SUPPORT_UTF8
2211 nigel 87 if (utf8 && d >= 128)
2212 nigel 77 {
2213     #ifdef SUPPORT_UCP
2214 ph10 349 otherd = UCD_OTHERCASE(d);
2215 nigel 77 #endif /* SUPPORT_UCP */
2216     }
2217     else
2218     #endif /* SUPPORT_UTF8 */
2219     otherd = fcc[d];
2220     }
2221     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2222 nigel 93 {
2223     if (count > 0 &&
2224     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2225     {
2226     active_count--; /* Remove non-match possibility */
2227     next_active_state--;
2228     }
2229     count++;
2230     ADD_NEW(state_offset, count);
2231     }
2232 nigel 77 }
2233     break;
2234    
2235     /*-----------------------------------------------------------------*/
2236 ph10 602 case OP_QUERYI:
2237     case OP_MINQUERYI:
2238     case OP_POSQUERYI:
2239     case OP_NOTQUERYI:
2240     case OP_NOTMINQUERYI:
2241     case OP_NOTPOSQUERYI:
2242     caseless = TRUE;
2243     codevalue -= OP_STARI - OP_STAR;
2244     /* Fall through */
2245 nigel 77 case OP_QUERY:
2246     case OP_MINQUERY:
2247 nigel 93 case OP_POSQUERY:
2248 nigel 77 case OP_NOTQUERY:
2249     case OP_NOTMINQUERY:
2250 nigel 93 case OP_NOTPOSQUERY:
2251 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2252     if (clen > 0)
2253     {
2254 nigel 93 unsigned int otherd = NOTACHAR;
2255 ph10 602 if (caseless)
2256 nigel 77 {
2257     #ifdef SUPPORT_UTF8
2258 nigel 87 if (utf8 && d >= 128)
2259 nigel 77 {
2260     #ifdef SUPPORT_UCP
2261 ph10 349 otherd = UCD_OTHERCASE(d);
2262 nigel 77 #endif /* SUPPORT_UCP */
2263     }
2264     else
2265     #endif /* SUPPORT_UTF8 */
2266     otherd = fcc[d];
2267     }
2268     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2269 nigel 93 {
2270     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2271     {
2272     active_count--; /* Remove non-match possibility */
2273     next_active_state--;
2274     }
2275     ADD_NEW(state_offset + dlen + 1, 0);
2276     }
2277 nigel 77 }
2278     break;
2279    
2280     /*-----------------------------------------------------------------*/
2281 ph10 602 case OP_STARI:
2282     case OP_MINSTARI:
2283     case OP_POSSTARI:
2284     case OP_NOTSTARI:
2285     case OP_NOTMINSTARI:
2286     case OP_NOTPOSSTARI:
2287     caseless = TRUE;
2288     codevalue -= OP_STARI - OP_STAR;
2289     /* Fall through */
2290 nigel 77 case OP_STAR:
2291     case OP_MINSTAR:
2292 nigel 93 case OP_POSSTAR:
2293 nigel 77 case OP_NOTSTAR:
2294     case OP_NOTMINSTAR:
2295 nigel 93 case OP_NOTPOSSTAR:
2296 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2297     if (clen > 0)
2298     {
2299 nigel 93 unsigned int otherd = NOTACHAR;
2300 ph10 602 if (caseless)
2301 nigel 77 {
2302     #ifdef SUPPORT_UTF8
2303 nigel 87 if (utf8 && d >= 128)
2304 nigel 77 {
2305     #ifdef SUPPORT_UCP
2306 ph10 349 otherd = UCD_OTHERCASE(d);
2307 nigel 77 #endif /* SUPPORT_UCP */
2308     }
2309     else
2310     #endif /* SUPPORT_UTF8 */
2311     otherd = fcc[d];
2312     }
2313     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2314 nigel 93 {
2315     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2316     {
2317     active_count--; /* Remove non-match possibility */
2318     next_active_state--;
2319     }
2320     ADD_NEW(state_offset, 0);
2321     }
2322 nigel 77 }
2323     break;
2324    
2325     /*-----------------------------------------------------------------*/
2326 ph10 602 case OP_EXACTI:
2327     case OP_NOTEXACTI:
2328     caseless = TRUE;
2329     codevalue -= OP_STARI - OP_STAR;
2330     /* Fall through */
2331 nigel 77 case OP_EXACT:
2332 nigel 93 case OP_NOTEXACT:
2333     count = current_state->count; /* Number already matched */
2334     if (clen > 0)
2335     {
2336     unsigned int otherd = NOTACHAR;
2337 ph10 602 if (caseless)
2338 nigel 93 {
2339     #ifdef SUPPORT_UTF8
2340     if (utf8 && d >= 128)
2341     {
2342     #ifdef SUPPORT_UCP
2343 ph10 349 otherd = UCD_OTHERCASE(d);
2344 nigel 93 #endif /* SUPPORT_UCP */
2345     }
2346     else
2347     #endif /* SUPPORT_UTF8 */
2348     otherd = fcc[d];
2349     }
2350     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2351     {
2352     if (++count >= GET2(code, 1))
2353     { ADD_NEW(state_offset + dlen + 3, 0); }
2354     else
2355     { ADD_NEW(state_offset, count); }
2356     }
2357     }
2358     break;
2359    
2360     /*-----------------------------------------------------------------*/
2361 ph10 602 case OP_UPTOI:
2362     case OP_MINUPTOI:
2363     case OP_POSUPTOI:
2364     case OP_NOTUPTOI:
2365     case OP_NOTMINUPTOI:
2366     case OP_NOTPOSUPTOI:
2367     caseless = TRUE;
2368     codevalue -= OP_STARI - OP_STAR;
2369     /* Fall through */
2370 nigel 77 case OP_UPTO:
2371     case OP_MINUPTO:
2372 nigel 93 case OP_POSUPTO:
2373 nigel 77 case OP_NOTUPTO:
2374     case OP_NOTMINUPTO:
2375 nigel 93 case OP_NOTPOSUPTO:
2376     ADD_ACTIVE(state_offset + dlen + 3, 0);
2377 nigel 77 count = current_state->count; /* Number already matched */
2378     if (clen > 0)
2379     {
2380 nigel 93 unsigned int otherd = NOTACHAR;
2381 ph10 602 if (caseless)
2382 nigel 77 {
2383     #ifdef SUPPORT_UTF8
2384 nigel 87 if (utf8 && d >= 128)
2385 nigel 77 {
2386     #ifdef SUPPORT_UCP
2387 ph10 349 otherd = UCD_OTHERCASE(d);
2388 nigel 77 #endif /* SUPPORT_UCP */
2389     }
2390     else
2391     #endif /* SUPPORT_UTF8 */
2392     otherd = fcc[d];
2393     }
2394     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2395     {
2396 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2397     {
2398     active_count--; /* Remove non-match possibility */
2399     next_active_state--;
2400     }
2401 nigel 77 if (++count >= GET2(code, 1))
2402     { ADD_NEW(state_offset + dlen + 3, 0); }
2403     else
2404     { ADD_NEW(state_offset, count); }
2405     }
2406     }
2407     break;
2408    
2409    
2410     /* ========================================================================== */
2411     /* These are the class-handling opcodes */
2412    
2413     case OP_CLASS:
2414     case OP_NCLASS:
2415     case OP_XCLASS:
2416     {
2417     BOOL isinclass = FALSE;
2418     int next_state_offset;
2419     const uschar *ecode;
2420    
2421     /* For a simple class, there is always just a 32-byte table, and we
2422     can set isinclass from it. */
2423    
2424     if (codevalue != OP_XCLASS)
2425     {
2426     ecode = code + 33;
2427     if (clen > 0)
2428     {
2429     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2430     ((code[1 + c/8] & (1 << (c&7))) != 0);
2431     }
2432     }
2433    
2434     /* An extended class may have a table or a list of single characters,
2435     ranges, or both, and it may be positive or negative. There's a
2436     function that sorts all this out. */
2437    
2438     else
2439     {
2440     ecode = code + GET(code, 1);
2441     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2442     }
2443    
2444     /* At this point, isinclass is set for all kinds of class, and ecode
2445     points to the byte after the end of the class. If there is a
2446     quantifier, this is where it will be. */
2447    
2448 ph10 530 next_state_offset = (int)(ecode - start_code);
2449 nigel 77
2450     switch (*ecode)
2451     {
2452     case OP_CRSTAR:
2453     case OP_CRMINSTAR:
2454     ADD_ACTIVE(next_state_offset + 1, 0);
2455     if (isinclass) { ADD_NEW(state_offset, 0); }
2456     break;
2457    
2458     case OP_CRPLUS:
2459     case OP_CRMINPLUS:
2460     count = current_state->count; /* Already matched */
2461     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2462     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2463     break;
2464    
2465     case OP_CRQUERY:
2466     case OP_CRMINQUERY:
2467     ADD_ACTIVE(next_state_offset + 1, 0);
2468     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2469     break;
2470    
2471     case OP_CRRANGE:
2472     case OP_CRMINRANGE:
2473     count = current_state->count; /* Already matched */
2474     if (count >= GET2(ecode, 1))
2475     { ADD_ACTIVE(next_state_offset + 5, 0); }
2476     if (isinclass)
2477     {
2478 nigel 91 int max = GET2(ecode, 3);
2479     if (++count >= max && max != 0) /* Max 0 => no limit */
2480 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2481     else
2482     { ADD_NEW(state_offset, count); }
2483     }
2484     break;
2485    
2486     default:
2487     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2488     break;
2489     }
2490     }
2491     break;
2492    
2493     /* ========================================================================== */
2494     /* These are the opcodes for fancy brackets of various kinds. We have
2495 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2496     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2497 ph10 341 though the other "backtracking verbs" are not supported. */
2498 ph10 345
2499 ph10 341 case OP_FAIL:
2500 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2501 ph10 345 break;
2502 nigel 77
2503     case OP_ASSERT:
2504     case OP_ASSERT_NOT:
2505     case OP_ASSERTBACK:
2506     case OP_ASSERTBACK_NOT:
2507     {
2508     int rc;
2509     int local_offsets[2];
2510     int local_workspace[1000];
2511     const uschar *endasscode = code + GET(code, 1);
2512    
2513     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2514    
2515     rc = internal_dfa_exec(
2516     md, /* static match data */
2517     code, /* this subexpression's code */
2518     ptr, /* where we currently are */
2519 ph10 530 (int)(ptr - start_subject), /* start offset */
2520 nigel 77 local_offsets, /* offset vector */
2521     sizeof(local_offsets)/sizeof(int), /* size of same */
2522     local_workspace, /* workspace vector */
2523     sizeof(local_workspace)/sizeof(int), /* size of same */
2524     rlevel, /* function recursion level */
2525     recursing); /* pass on regex recursion */
2526 ph10 487
2527 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2528 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2529 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2530 nigel 77 }
2531     break;
2532    
2533     /*-----------------------------------------------------------------*/
2534     case OP_COND:
2535 nigel 93 case OP_SCOND:
2536 nigel 77 {
2537     int local_offsets[1000];
2538     int local_workspace[1000];
2539 ph10 406 int codelink = GET(code, 1);
2540 ph10 397 int condcode;
2541 ph10 406
2542 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2543 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2544 ph10 398 happen for the other conditions. */
2545 nigel 77
2546 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2547 ph10 406 {
2548     rrc = 0;
2549 ph10 397 if (pcre_callout != NULL)
2550     {
2551     pcre_callout_block cb;
2552     cb.version = 1; /* Version 1 of the callout block */
2553     cb.callout_number = code[LINK_SIZE+2];
2554     cb.offset_vector = offsets;
2555     cb.subject = (PCRE_SPTR)start_subject;
2556 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2557     cb.start_match = (int)(current_subject - start_subject);
2558     cb.current_position = (int)(ptr - start_subject);
2559 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2560     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2561     cb.capture_top = 1;
2562     cb.capture_last = -1;
2563     cb.callout_data = md->callout_data;
2564     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2565     }
2566 ph10 398 if (rrc > 0) break; /* Fail this thread */
2567     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2568 ph10 406 }
2569 ph10 398
2570 ph10 397 condcode = code[LINK_SIZE+1];
2571 ph10 406
2572 nigel 93 /* Back reference conditions are not supported */
2573 nigel 77
2574 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2575 ph10 459 return PCRE_ERROR_DFA_UCOND;
2576 nigel 93
2577     /* The DEFINE condition is always false */
2578    
2579     if (condcode == OP_DEF)
2580 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2581 nigel 93
2582     /* The only supported version of OP_RREF is for the value RREF_ANY,
2583     which means "test if in any recursion". We can't test for specifically
2584     recursed groups. */
2585    
2586 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2587 nigel 93 {
2588 nigel 77 int value = GET2(code, LINK_SIZE+2);
2589 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2590 ph10 406 if (recursing > 0)
2591 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2592     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2593 nigel 77 }
2594    
2595     /* Otherwise, the condition is an assertion */
2596    
2597     else
2598     {
2599     int rc;
2600     const uschar *asscode = code + LINK_SIZE + 1;
2601     const uschar *endasscode = asscode + GET(asscode, 1);
2602    
2603     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2604    
2605     rc = internal_dfa_exec(
2606     md, /* fixed match data */
2607     asscode, /* this subexpression's code */
2608     ptr, /* where we currently are */
2609 ph10 530 (int)(ptr - start_subject), /* start offset */
2610 nigel 77 local_offsets, /* offset vector */
2611     sizeof(local_offsets)/sizeof(int), /* size of same */
2612     local_workspace, /* workspace vector */
2613     sizeof(local_workspace)/sizeof(int), /* size of same */
2614     rlevel, /* function recursion level */
2615     recursing); /* pass on regex recursion */
2616    
2617 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2618 nigel 77 if ((rc >= 0) ==
2619     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2620 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2621 nigel 77 else
2622 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2623 nigel 77 }
2624     }
2625     break;
2626    
2627     /*-----------------------------------------------------------------*/
2628     case OP_RECURSE:
2629     {
2630     int local_offsets[1000];
2631     int local_workspace[1000];
2632     int rc;
2633    
2634     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2635     recursing + 1));
2636    
2637     rc = internal_dfa_exec(
2638     md, /* fixed match data */
2639     start_code + GET(code, 1), /* this subexpression's code */
2640     ptr, /* where we currently are */
2641 ph10 530 (int)(ptr - start_subject), /* start offset */
2642 nigel 77 local_offsets, /* offset vector */
2643     sizeof(local_offsets)/sizeof(int), /* size of same */
2644     local_workspace, /* workspace vector */
2645     sizeof(local_workspace)/sizeof(int), /* size of same */
2646     rlevel, /* function recursion level */
2647     recursing + 1); /* regex recurse level */
2648    
2649     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2650     recursing + 1, rc));
2651    
2652     /* Ran out of internal offsets */
2653    
2654     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2655    
2656     /* For each successfully matched substring, set up the next state with a
2657     count of characters to skip before trying it. Note that the count is in
2658     characters, not bytes. */
2659    
2660     if (rc > 0)
2661     {
2662     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2663     {
2664     const uschar *p = start_subject + local_offsets[rc];
2665     const uschar *pp = start_subject + local_offsets[rc+1];
2666     int charcount = local_offsets[rc+1] - local_offsets[rc];
2667     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2668     if (charcount > 0)
2669     {
2670     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2671     }
2672     else
2673     {
2674     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2675     }
2676     }
2677     }
2678     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2679     }
2680     break;
2681    
2682     /*-----------------------------------------------------------------*/
2683 ph10 604 case OP_BRAPOS:
2684     case OP_SBRAPOS:
2685     case OP_CBRAPOS:
2686     case OP_SCBRAPOS:
2687     case OP_BRAPOSZERO:
2688     {
2689     int charcount, matched_count;
2690     const uschar *local_ptr = ptr;
2691     BOOL allow_zero;
2692    
2693     if (codevalue == OP_BRAPOSZERO)
2694     {
2695     allow_zero = TRUE;
2696     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2697     }
2698     else allow_zero = FALSE;
2699    
2700     /* Loop to match the subpattern as many times as possible as if it were
2701     a complete pattern. */
2702    
2703     for (matched_count = 0;; matched_count++)
2704     {
2705     int local_offsets[2];
2706     int local_workspace[1000];
2707    
2708     int rc = internal_dfa_exec(
2709     md, /* fixed match data */
2710     code, /* this subexpression's code */
2711     local_ptr, /* where we currently are */
2712     (int)(ptr - start_subject), /* start offset */
2713     local_offsets, /* offset vector */
2714     sizeof(local_offsets)/sizeof(int), /* size of same */
2715     local_workspace, /* workspace vector */
2716     sizeof(local_workspace)/sizeof(int), /* size of same */
2717     rlevel, /* function recursion level */
2718     recursing); /* pass on regex recursion */
2719    
2720     /* Failed to match */
2721    
2722     if (rc < 0)
2723     {
2724     if (rc != PCRE_ERROR_NOMATCH) return rc;
2725     break;
2726     }
2727    
2728     /* Matched: break the loop if zero characters matched. */
2729    
2730     charcount = local_offsets[1] - local_offsets[0];
2731     if (charcount == 0) break;
2732     local_ptr += charcount; /* Advance temporary position ptr */
2733     }
2734    
2735     /* At this point we have matched the subpattern matched_count
2736     times, and local_ptr is pointing to the character after the end of the
2737     last match. */
2738    
2739     if (matched_count > 0 || allow_zero)
2740     {
2741     const uschar *end_subpattern = code;
2742     int next_state_offset;
2743    
2744     do { end_subpattern += GET(end_subpattern, 1); }
2745     while (*end_subpattern == OP_ALT);
2746     next_state_offset =
2747     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2748    
2749     /* Optimization: if there are no more active states, and there
2750     are no new states yet set up, then skip over the subject string
2751     right here, to save looping. Otherwise, set up the new state to swing
2752     into action when the end of the matched substring is reached. */
2753    
2754     if (i + 1 >= active_count && new_count == 0)
2755     {
2756     ptr = local_ptr;
2757     clen = 0;
2758     ADD_NEW(next_state_offset, 0);
2759     }
2760     else
2761     {
2762     const uschar *p = ptr;
2763     const uschar *pp = local_ptr;
2764     charcount = pp - p;
2765     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2766     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2767     }
2768     }
2769     }
2770     break;
2771    
2772     /*-----------------------------------------------------------------*/
2773 nigel 77 case OP_ONCE:
2774     {
2775     int local_offsets[2];
2776     int local_workspace[1000];
2777    
2778     int rc = internal_dfa_exec(
2779     md, /* fixed match data */
2780     code, /* this subexpression's code */
2781     ptr, /* where we currently are */
2782 ph10 530 (int)(ptr - start_subject), /* start offset */
2783 nigel 77 local_offsets, /* offset vector */
2784     sizeof(local_offsets)/sizeof(int), /* size of same */
2785     local_workspace, /* workspace vector */
2786     sizeof(local_workspace)/sizeof(int), /* size of same */
2787     rlevel, /* function recursion level */
2788     recursing); /* pass on regex recursion */
2789    
2790     if (rc >= 0)
2791     {
2792     const uschar *end_subpattern = code;
2793     int charcount = local_offsets[1] - local_offsets[0];
2794     int next_state_offset, repeat_state_offset;
2795    
2796     do { end_subpattern += GET(end_subpattern, 1); }
2797     while (*end_subpattern == OP_ALT);
2798 ph10 535 next_state_offset =
2799 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2800 nigel 77
2801     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2802     arrange for the repeat state also to be added to the relevant list.
2803     Calculate the offset, or set -1 for no repeat. */
2804    
2805     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2806     *end_subpattern == OP_KETRMIN)?
2807 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2808 nigel 77
2809     /* If we have matched an empty string, add the next state at the
2810     current character pointer. This is important so that the duplicate
2811     checking kicks in, which is what breaks infinite loops that match an
2812     empty string. */
2813    
2814     if (charcount == 0)
2815     {
2816     ADD_ACTIVE(next_state_offset, 0);
2817     }
2818    
2819     /* Optimization: if there are no more active states, and there
2820     are no new states yet set up, then skip over the subject string
2821     right here, to save looping. Otherwise, set up the new state to swing
2822 ph10 604 into action when the end of the matched substring is reached. */
2823 nigel 77
2824     else if (i + 1 >= active_count && new_count == 0)
2825     {
2826     ptr += charcount;
2827     clen = 0;
2828     ADD_NEW(next_state_offset, 0);
2829    
2830     /* If we are adding a repeat state at the new character position,
2831     we must fudge things so that it is the only current state.
2832     Otherwise, it might be a duplicate of one we processed before, and
2833     that would cause it to be skipped. */
2834    
2835     if (repeat_state_offset >= 0)
2836     {
2837     next_active_state = active_states;
2838     active_count = 0;
2839     i = -1;
2840     ADD_ACTIVE(repeat_state_offset, 0);
2841     }
2842     }
2843     else
2844     {
2845     const uschar *p = start_subject + local_offsets[0];
2846     const uschar *pp = start_subject + local_offsets[1];
2847     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2848     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2849     if (repeat_state_offset >= 0)
2850     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2851     }
2852     }
2853     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2854     }
2855     break;
2856    
2857    
2858     /* ========================================================================== */
2859     /* Handle callouts */
2860    
2861     case OP_CALLOUT:
2862 ph10 406 rrc = 0;
2863 nigel 77 if (pcre_callout != NULL)
2864     {
2865     pcre_callout_block cb;
2866     cb.version = 1; /* Version 1 of the callout block */
2867     cb.callout_number = code[1];
2868     cb.offset_vector = offsets;
2869 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2870 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2871     cb.start_match = (int)(current_subject - start_subject);
2872     cb.current_position = (int)(ptr - start_subject);
2873 nigel 77 cb.pattern_position = GET(code, 2);
2874     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2875     cb.capture_top = 1;
2876     cb.capture_last = -1;
2877     cb.callout_data = md->callout_data;
2878     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2879 ph10 406 }
2880     if (rrc == 0)
2881     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2882 nigel 77 break;
2883    
2884    
2885     /* ========================================================================== */
2886     default: /* Unsupported opcode */
2887     return PCRE_ERROR_DFA_UITEM;
2888     }
2889    
2890     NEXT_ACTIVE_STATE: continue;
2891    
2892     } /* End of loop scanning active states */
2893    
2894     /* We have finished the processing at the current subject character. If no
2895     new states have been set for the next character, we have found all the
2896     matches that we are going to find. If we are at the top level and partial
2897 ph10 463 matching has been requested, check for appropriate conditions.
2898    
2899 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2900     character. If it is equal to the original active_count (saved in
2901     workspace[1]) it means that (*F) was found on every active state. In this
2902 ph10 463 case we don't want to give a partial match.
2903 nigel 77
2904 ph10 463 The "could_continue" variable is true if a state could have continued but
2905     for the fact that the end of the subject was reached. */
2906    
2907 nigel 77 if (new_count <= 0)
2908     {
2909 ph10 427 if (rlevel == 1 && /* Top level, and */
2910 ph10 463 could_continue && /* Some could go on */
2911 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2912 ph10 427 ( /* either... */
2913     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2914     || /* or... */
2915     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2916     match_count < 0) /* no matches */
2917     ) && /* And... */
2918 ph10 553 ptr >= end_subject && /* Reached end of subject */
2919     ptr > md->start_used_ptr) /* Inspected non-empty string */
2920 nigel 77 {
2921     if (offsetcount >= 2)
2922     {
2923 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
2924     offsets[1] = (int)(end_subject - start_subject);
2925 nigel 77 }
2926     match_count = PCRE_ERROR_PARTIAL;
2927     }
2928    
2929     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2930     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2931     rlevel*2-2, SP));
2932 nigel 91 break; /* In effect, "return", but see the comment below */
2933 nigel 77 }
2934    
2935     /* One or more states are active for the next character. */
2936    
2937     ptr += clen; /* Advance to next subject character */
2938     } /* Loop to move along the subject string */
2939    
2940 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2941     if we use "return" above, we have compiler trouble. Some compilers warn if
2942     there's nothing here because they think the function doesn't return a value. On
2943     the other hand, if we put a dummy statement here, some more clever compilers
2944     complain that it can't be reached. Sigh. */
2945 nigel 77
2946 nigel 91 return match_count;
2947 nigel 77 }
2948    
2949    
2950    
2951    
2952     /*************************************************
2953     * Execute a Regular Expression - DFA engine *
2954     *************************************************/
2955    
2956     /* This external function applies a compiled re to a subject string using a DFA
2957     engine. This function calls the internal function multiple times if the pattern
2958     is not anchored.
2959    
2960     Arguments:
2961     argument_re points to the compiled expression
2962 ph10 97 extra_data points to extra data or is NULL
2963 nigel 77 subject points to the subject string
2964     length length of subject string (may contain binary zeros)
2965     start_offset where to start in the subject string
2966     options option bits
2967     offsets vector of match offsets
2968     offsetcount size of same
2969     workspace workspace vector
2970     wscount size of same
2971    
2972     Returns: > 0 => number of match offset pairs placed in offsets
2973     = 0 => offsets overflowed; longest matches are present
2974     -1 => failed to match
2975     < -1 => some kind of unexpected problem
2976     */
2977    
2978 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2979 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2980     const char *subject, int length, int start_offset, int options, int *offsets,
2981     int offsetcount, int *workspace, int wscount)
2982     {
2983     real_pcre *re = (real_pcre *)argument_re;
2984     dfa_match_data match_block;
2985 nigel 91 dfa_match_data *md = &match_block;
2986 nigel 77 BOOL utf8, anchored, startline, firstline;
2987     const uschar *current_subject, *end_subject, *lcc;
2988    
2989     pcre_study_data internal_study;
2990     const pcre_study_data *study = NULL;
2991     real_pcre internal_re;
2992    
2993     const uschar *req_byte_ptr;
2994     const uschar *start_bits = NULL;
2995     BOOL first_byte_caseless = FALSE;
2996     BOOL req_byte_caseless = FALSE;
2997     int first_byte = -1;
2998     int req_byte = -1;
2999     int req_byte2 = -1;
3000 nigel 91 int newline;
3001 nigel 77
3002     /* Plausibility checks */
3003    
3004     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3005     if (re == NULL || subject == NULL || workspace == NULL ||
3006     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3007     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3008     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3009 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3010 nigel 77
3011     /* We need to find the pointer to any study data before we test for byte
3012     flipping, so we scan the extra_data block first. This may set two fields in the
3013     match block, so we must initialize them beforehand. However, the other fields
3014     in the match block must not be set until after the byte flipping. */
3015    
3016 nigel 91 md->tables = re->tables;
3017     md->callout_data = NULL;
3018 nigel 77
3019     if (extra_data != NULL)
3020     {
3021     unsigned int flags = extra_data->flags;
3022     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3023     study = (const pcre_study_data *)extra_data->study_data;
3024     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3025 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3026     return PCRE_ERROR_DFA_UMLIMIT;
3027 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3028 nigel 91 md->callout_data = extra_data->callout_data;
3029 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3030 nigel 91 md->tables = extra_data->tables;
3031 nigel 77 }
3032 ph10 461
3033 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
3034     test for a regex that was compiled on a host of opposite endianness. If this is
3035     the case, flipped values are put in internal_re and internal_study if there was
3036     study data too. */
3037    
3038     if (re->magic_number != MAGIC_NUMBER)
3039     {
3040     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3041     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3042     if (study != NULL) study = &internal_study;
3043     }
3044    
3045     /* Set some local values */
3046    
3047     current_subject = (const unsigned char *)subject + start_offset;
3048     end_subject = (const unsigned char *)subject + length;
3049     req_byte_ptr = current_subject - 1;
3050    
3051 nigel 91 #ifdef SUPPORT_UTF8
3052 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
3053 nigel 91 #else
3054     utf8 = FALSE;
3055     #endif
3056 nigel 77
3057 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3058     (re->options & PCRE_ANCHORED) != 0;
3059    
3060 nigel 77 /* The remaining fixed data for passing around. */
3061    
3062 nigel 91 md->start_code = (const uschar *)argument_re +
3063 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3064 nigel 91 md->start_subject = (const unsigned char *)subject;
3065     md->end_subject = end_subject;
3066 ph10 442 md->start_offset = start_offset;
3067 nigel 91 md->moptions = options;
3068     md->poptions = re->options;
3069 nigel 77
3070 ph10 231 /* If the BSR option is not set at match time, copy what was set
3071     at compile time. */
3072    
3073     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3074     {
3075     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3076     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3077     #ifdef BSR_ANYCRLF
3078     else md->moptions |= PCRE_BSR_ANYCRLF;
3079 ph10 243 #endif
3080     }
3081 ph10 231
3082 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3083     nothing is set at run time, whatever was used at compile time applies. */
3084 nigel 91
3085 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3086 nigel 93 PCRE_NEWLINE_BITS)
3087 nigel 91 {
3088 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3089 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3090     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3091 nigel 91 case PCRE_NEWLINE_CR+
3092 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3093 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3094 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3095 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3096 nigel 91 }
3097    
3098 ph10 149 if (newline == -2)
3099 nigel 91 {
3100 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3101     }
3102     else if (newline < 0)
3103     {
3104 nigel 93 md->nltype = NLTYPE_ANY;
3105 nigel 91 }
3106     else
3107     {
3108 nigel 93 md->nltype = NLTYPE_FIXED;
3109     if (newline > 255)
3110     {
3111     md->nllen = 2;
3112     md->nl[0] = (newline >> 8) & 255;
3113     md->nl[1] = newline & 255;
3114     }
3115     else
3116     {
3117     md->nllen = 1;
3118     md->nl[0] = newline;
3119     }
3120 nigel 91 }
3121    
3122 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3123     back the character offset. */
3124    
3125     #ifdef SUPPORT_UTF8
3126     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3127     {
3128 ph10 606 int erroroffset;
3129     int errorcode = _pcre_valid_utf8((uschar *)subject, length, &erroroffset);
3130     if (errorcode != 0)
3131 ph10 598 {
3132     if (offsetcount >= 2)
3133     {
3134 ph10 606 offsets[0] = erroroffset;
3135 ph10 598 offsets[1] = errorcode;
3136     }
3137     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3138 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3139 ph10 598 }
3140 ph10 606 if (start_offset > 0 && start_offset < length &&
3141     (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
3142     return PCRE_ERROR_BADUTF8_OFFSET;
3143 nigel 77 }
3144     #endif
3145    
3146     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3147     is a feature that makes it possible to save compiled regex and re-use them
3148     in other programs later. */
3149    
3150 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
3151 nigel 77
3152     /* The lower casing table and the "must be at the start of a line" flag are
3153     used in a loop when finding where to start. */
3154    
3155 nigel 91 lcc = md->tables + lcc_offset;
3156 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3157 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3158    
3159     /* Set up the first character to match, if available. The first_byte value is
3160     never set for an anchored regular expression, but the anchoring may be forced
3161     at run time, so we have to test for anchoring. The first char may be unset for
3162     an unanchored pattern, of course. If there's no first char and the pattern was
3163     studied, there may be a bitmap of possible first characters. */
3164    
3165     if (!anchored)
3166     {
3167 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3168 nigel 77 {
3169     first_byte = re->first_byte & 255;
3170     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3171     first_byte = lcc[first_byte];
3172     }
3173     else
3174     {
3175 ph10 455 if (!startline && study != NULL &&
3176     (study->flags & PCRE_STUDY_MAPPED) != 0)
3177 nigel 77 start_bits = study->start_bits;
3178     }
3179     }
3180    
3181     /* For anchored or unanchored matches, there may be a "last known required
3182     character" set. */
3183    
3184 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3185 nigel 77 {
3186     req_byte = re->req_byte & 255;
3187     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3188 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
3189 nigel 77 }
3190    
3191     /* Call the main matching function, looping for a non-anchored regex after a
3192 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3193     a match. */
3194 nigel 77
3195     for (;;)
3196     {
3197     int rc;
3198    
3199     if ((options & PCRE_DFA_RESTART) == 0)
3200     {
3201     const uschar *save_end_subject = end_subject;
3202    
3203 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3204     line of a multiline string. Implement this by temporarily adjusting
3205     end_subject so that we stop scanning at a newline. If the match fails at
3206     the newline, later code breaks this loop. */
3207 nigel 77
3208     if (firstline)
3209     {
3210 ph10 365 USPTR t = current_subject;
3211     #ifdef SUPPORT_UTF8
3212     if (utf8)
3213 ph10 371 {
3214     while (t < md->end_subject && !IS_NEWLINE(t))
3215 ph10 365 {
3216     t++;
3217     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3218 ph10 371 }
3219 ph10 365 }
3220     else
3221 ph10 371 #endif
3222 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3223 nigel 77 end_subject = t;
3224     }
3225 ph10 392
3226 ph10 389 /* There are some optimizations that avoid running the match if a known
3227 ph10 455 starting point is not found. However, there is an option that disables
3228 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3229 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3230     match-time options. */
3231 nigel 77
3232 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3233 ph10 392 {
3234 ph10 389 /* Advance to a known first byte. */
3235 ph10 392
3236 ph10 389 if (first_byte >= 0)
3237 nigel 77 {
3238 ph10 389 if (first_byte_caseless)
3239     while (current_subject < end_subject &&
3240     lcc[*current_subject] != first_byte)
3241     current_subject++;
3242     else
3243 ph10 392 while (current_subject < end_subject &&
3244 ph10 389 *current_subject != first_byte)
3245     current_subject++;
3246     }
3247 ph10 392
3248 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3249 ph10 392
3250 ph10 389 else if (startline)
3251     {
3252     if (current_subject > md->start_subject + start_offset)
3253     {
3254 ph10 365 #ifdef SUPPORT_UTF8
3255 ph10 389 if (utf8)
3256 ph10 365 {
3257 ph10 392 while (current_subject < end_subject &&
3258 ph10 389 !WAS_NEWLINE(current_subject))
3259     {
3260 ph10 365 current_subject++;
3261 ph10 389 while(current_subject < end_subject &&
3262     (*current_subject & 0xc0) == 0x80)
3263     current_subject++;
3264     }
3265 ph10 371 }
3266 ph10 389 else
3267     #endif
3268     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3269     current_subject++;
3270 ph10 392
3271 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3272     ANYCRLF, and we are now at a LF, advance the match position by one
3273     more character. */
3274 ph10 392
3275 ph10 391 if (current_subject[-1] == CHAR_CR &&
3276 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3277     current_subject < end_subject &&
3278 ph10 391 *current_subject == CHAR_NL)
3279 ph10 389 current_subject++;
3280 ph10 365 }
3281 nigel 77 }
3282 ph10 392
3283 ph10 389 /* Or to a non-unique first char after study */
3284 ph10 392
3285 ph10 389 else if (start_bits != NULL)
3286 nigel 77 {
3287 ph10 389 while (current_subject < end_subject)
3288     {
3289     register unsigned int c = *current_subject;
3290 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3291 ph10 538 {
3292     current_subject++;
3293     #ifdef SUPPORT_UTF8
3294     if (utf8)
3295 ph10 545 while(current_subject < end_subject &&
3296 ph10 538 (*current_subject & 0xc0) == 0x80) current_subject++;
3297 ph10 545 #endif
3298 ph10 538 }
3299     else break;
3300 ph10 389 }
3301 nigel 77 }
3302 ph10 392 }
3303 nigel 77
3304     /* Restore fudged end_subject */
3305    
3306     end_subject = save_end_subject;
3307    
3308 ph10 461 /* The following two optimizations are disabled for partial matching or if
3309     disabling is explicitly requested (and of course, by the test above, this
3310 ph10 455 code is not obeyed when restarting after a partial match). */
3311 ph10 461
3312 ph10 455 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
3313     (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3314 ph10 461 {
3315 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3316     is a lower bound; no actual string of that length may actually match the
3317     pattern. Although the value is, strictly, in characters, we treat it as
3318     bytes to avoid spending too much time in this optimization. */
3319 nigel 77
3320 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3321 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3322 ph10 455 return PCRE_ERROR_NOMATCH;
3323 ph10 461
3324 ph10 455 /* If req_byte is set, we know that that character must appear in the
3325     subject for the match to succeed. If the first character is set, req_byte
3326     must be later in the subject; otherwise the test starts at the match
3327     point. This optimization can save a huge amount of work in patterns with
3328     nested unlimited repeats that aren't going to match. Writing separate
3329     code for cased/caseless versions makes it go faster, as does using an
3330     autoincrement and backing off on a match.
3331 ph10 461
3332 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3333     can take a long time, and give bad performance on quite ordinary
3334     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3335     string... so we don't do this when the string is sufficiently long. */
3336 ph10 461
3337 ph10 455 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3338 nigel 77 {
3339 ph10 455 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3340 ph10 461
3341 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3342     place we found it at last time. */
3343 ph10 461
3344 ph10 455 if (p > req_byte_ptr)
3345 nigel 77 {
3346 ph10 455 if (req_byte_caseless)
3347     {
3348     while (p < end_subject)
3349     {
3350     register int pp = *p++;
3351     if (pp == req_byte || pp == req_byte2) { p--; break; }
3352     }
3353     }
3354     else
3355     {
3356     while (p < end_subject)
3357     {
3358     if (*p++ == req_byte) { p--; break; }
3359     }
3360     }
3361 ph10 461
3362 ph10 455 /* If we can't find the required character, break the matching loop,
3363     which will cause a return or PCRE_ERROR_NOMATCH. */
3364 ph10 461
3365 ph10 455 if (p >= end_subject) break;
3366 ph10 461
3367 ph10 455 /* If we have found the required character, save the point where we
3368     found it, so that we don't search again next time round the loop if
3369     the start hasn't passed this character yet. */
3370 ph10 461
3371 ph10 455 req_byte_ptr = p;
3372 nigel 77 }
3373 ph10 461 }
3374 nigel 77 }
3375 ph10 455 } /* End of optimizations that are done when not restarting */
3376 nigel 77
3377     /* OK, now we can do the business */
3378    
3379 ph10 435 md->start_used_ptr = current_subject;
3380 ph10 461
3381 nigel 77 rc = internal_dfa_exec(
3382 nigel 91 md, /* fixed match data */
3383     md->start_code, /* this subexpression's code */
3384     current_subject, /* where we currently are */
3385     start_offset, /* start offset in subject */
3386     offsets, /* offset vector */
3387     offsetcount, /* size of same */
3388     workspace, /* workspace vector */
3389     wscount, /* size of same */
3390     0, /* function recurse level */
3391     0); /* regex recurse level */
3392 nigel 77
3393     /* Anything other than "no match" means we are done, always; otherwise, carry
3394     on only if not anchored. */
3395    
3396     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3397    
3398     /* Advance to the next subject character unless we are at the end of a line
3399     and firstline is set. */
3400    
3401 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3402 nigel 77 current_subject++;
3403     if (utf8)
3404     {
3405     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3406     current_subject++;
3407     }
3408     if (current_subject > end_subject) break;
3409    
3410 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3411 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3412     or ANY or ANYCRLF, advance the match position by one more character. */
3413 nigel 93
3414 ph10 391 if (current_subject[-1] == CHAR_CR &&
3415 ph10 226 current_subject < end_subject &&
3416 ph10 391 *current_subject == CHAR_NL &&
3417 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3418 ph10 226 (md->nltype == NLTYPE_ANY ||
3419     md->nltype == NLTYPE_ANYCRLF ||
3420     md->nllen == 2))
3421 nigel 93 current_subject++;
3422    
3423     } /* "Bumpalong" loop */
3424    
3425 nigel 77 return PCRE_ERROR_NOMATCH;
3426     }
3427    
3428     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12