/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1334 - (hide annotations) (download)
Wed May 15 16:53:18 2013 UTC (3 days, 13 hours ago) by ph10
File MIME type: text/plain
File size: 124959 byte(s)
Fix segfault when pcre_dfa_exec() is called with an output vector of length 
less than 2.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 1251 Copyright (c) 1997-2013 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43 ph10 960 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 nigel 93 applications. */
45 nigel 77
46    
47 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48     the performance of his patterns greatly. I could not use it as it stood, as it
49     was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
51    
52     The issue is the check for duplicate states, which is done by a simple linear
53     search up the state list. (Grep for "duplicate" below to find the code.) For
54     many patterns, there will never be many states active at one time, so a simple
55     linear search is fine. In patterns that have many active states, it might be a
56     bottleneck. The suggested code used an indexing scheme to remember which states
57     had previously been used for each character, and avoided the linear search when
58     it knew there was no chance of a duplicate. This was implemented when adding
59     states to the state lists.
60    
61     I wrote some thread-safe, not-limited code to try something similar at the time
62     of checking for duplicates (instead of when adding states), using index vectors
63     on the stack. It did give a 13% improvement with one specially constructed
64     pattern for certain subject strings, but on other strings and on many of the
65     simpler patterns in the test suite it did worse. The major problem, I think,
66     was the extra time to initialize the index. This had to be done for each call
67     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68     only once - I suspect this was the cause of the problems with the tests.)
69    
70 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
71 ph10 439 in others, so I abandoned this code. */
72    
73    
74    
75 ph10 200 #ifdef HAVE_CONFIG_H
76 ph10 236 #include "config.h"
77 ph10 200 #endif
78 ph10 199
79 nigel 93 #define NLBLOCK md /* Block containing newline information */
80     #define PSSTART start_subject /* Field containing processed string start */
81     #define PSEND end_subject /* Field containing processed string end */
82    
83 nigel 77 #include "pcre_internal.h"
84    
85    
86     /* For use to indent debugging output */
87    
88     #define SP " "
89    
90    
91     /*************************************************
92     * Code parameters and static tables *
93     *************************************************/
94    
95     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
97 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
98 ph10 178 never stored, so we push them well clear of the normal opcodes. */
99 nigel 77
100 ph10 178 #define OP_PROP_EXTRA 300
101     #define OP_EXTUNI_EXTRA 320
102     #define OP_ANYNL_EXTRA 340
103     #define OP_HSPACE_EXTRA 360
104     #define OP_VSPACE_EXTRA 380
105 nigel 77
106    
107     /* This table identifies those opcodes that are followed immediately by a
108 ph10 510 character that is to be tested in some way. This makes it possible to
109 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
110     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
112 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
113     modified, the three tables that follow must also be modified. */
114 nigel 77
/* coptable[op] is the offset, counted from opcode op itself, at which the
character (or, for the Type repeats, the type opcode) that op tests is stored.
A zero entry means op is not followed by such a character. The table is indexed
directly by opcode value, so the order of entries must track the opcode
numbering, and its size is checked against OP_TABLE_LENGTH by a compile-time
fudge in internal_dfa_exec(). Entries of 1+IMM2_SIZE are for opcodes where an
IMM2_SIZE-wide field sits between the opcode and the character (presumably the
repeat count - confirm against pcre_internal.h). */
115 ph10 836 static const pcre_uint8 coptable[] = {
116 nigel 77 0, /* End */
117 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
120 ph10 498 0, 0, /* \P, \p */
121 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 ph10 498 0, /* \X */
123 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
124 nigel 77 1, /* Char */
125 ph10 602 1, /* Chari */
126 nigel 77 1, /* not */
127 ph10 602 1, /* noti */
128 nigel 77 /* Positive single-char repeats */
129     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131     1+IMM2_SIZE, /* exact */
132     1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135     1+IMM2_SIZE, /* exact I */
136     1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 nigel 77 /* Negative single-char repeats - only for chars < 256 */
138     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140     1+IMM2_SIZE, /* NOT exact */
141     1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144     1+IMM2_SIZE, /* NOT exact I */
145     1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 nigel 77 /* Positive type repeats */
147     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149     1+IMM2_SIZE, /* Type exact */
150     1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 nigel 77 /* Character class & ref repeats */
152     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153     0, 0, /* CRRANGE, CRMINRANGE */
154     0, /* CLASS */
155     0, /* NCLASS */
156     0, /* XCLASS - variable length */
157     0, /* REF */
158 ph10 602 0, /* REFI */
159 nigel 77 0, /* RECURSE */
160     0, /* CALLOUT */
161     0, /* Alt */
162     0, /* Ket */
163     0, /* KetRmax */
164     0, /* KetRmin */
165 ph10 604 0, /* KetRpos */
166 ph10 637 0, /* Reverse */
167 nigel 77 0, /* Assert */
168     0, /* Assert not */
169     0, /* Assert behind */
170     0, /* Assert behind not */
171 ph10 723 0, 0, /* ONCE, ONCE_NC */
172     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
173 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
174 ph10 498 0, 0, /* CREF, NCREF */
175     0, 0, /* RREF, NRREF */
176 nigel 93 0, /* DEF */
177 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
178 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
179     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
180     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
181     0, 0 /* CLOSE, SKIPZERO */
182 nigel 77 };
183    
184 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
185 ph10 462 remember the fact that a character could have been inspected when the end of
186 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
187     two tables that follow must also be modified. */
188 ph10 462
/* poptable[op] is non-zero when opcode op inspects a subject character. It is
indexed directly by opcode value, in the same order as coptable. The matcher
consults it only when the end of the subject has been reached (clen == 0): a
non-zero entry sets could_continue, recording that the state wanted more
input. Its size is checked against OP_TABLE_LENGTH together with coptable. */
189 ph10 836 static const pcre_uint8 poptable[] = {
190 ph10 462 0, /* End */
191 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
192 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
193     1, 1, 1, /* Any, AllAny, Anybyte */
194 ph10 498 1, 1, /* \P, \p */
195 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
196 ph10 498 1, /* \X */
197 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
198 ph10 462 1, /* Char */
199 ph10 602 1, /* Chari */
200 ph10 462 1, /* not */
201 ph10 602 1, /* noti */
202 ph10 462 /* Positive single-char repeats */
203     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
204     1, 1, 1, /* upto, minupto, exact */
205     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
206 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
207     1, 1, 1, /* upto I, minupto I, exact I */
208     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
209 ph10 462 /* Negative single-char repeats - only for chars < 256 */
210     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
211     1, 1, 1, /* NOT upto, minupto, exact */
212     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
213 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
214     1, 1, 1, /* NOT upto I, minupto I, exact I */
215     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
216 ph10 462 /* Positive type repeats */
217     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
218     1, 1, 1, /* Type upto, minupto, exact */
219     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
220     /* Character class & ref repeats */
221     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
222     1, 1, /* CRRANGE, CRMINRANGE */
223     1, /* CLASS */
224     1, /* NCLASS */
225     1, /* XCLASS - variable length */
226     0, /* REF */
227 ph10 602 0, /* REFI */
228 ph10 462 0, /* RECURSE */
229     0, /* CALLOUT */
230     0, /* Alt */
231     0, /* Ket */
232     0, /* KetRmax */
233     0, /* KetRmin */
234 ph10 604 0, /* KetRpos */
235 ph10 637 0, /* Reverse */
236 ph10 462 0, /* Assert */
237     0, /* Assert not */
238     0, /* Assert behind */
239     0, /* Assert behind not */
240 ph10 723 0, 0, /* ONCE, ONCE_NC */
241     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
242 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
243 ph10 498 0, 0, /* CREF, NCREF */
244     0, 0, /* RREF, NRREF */
245 ph10 462 0, /* DEF */
246 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
247 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
248     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
249     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
250     0, 0 /* CLOSE, SKIPZERO */
251 ph10 462 };
252    
253 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254     and \w */
255    
/* First of the two type-test tables. The six leading zeros line up with the
six opcodes listed ahead of \D in the opcode-indexed tables (End, \A, \G, \K,
\B, \b). After them, each positive/negative pair of type opcodes shares one
ctype_* bit, and OP_ANY and OP_ALLANY have no bit at all. NOTE(review): the
code that reads this table is outside this view; it looks like a mask to be
ANDed with a character's ctypes entry - confirm at the point of use. */
256 ph10 836 static const pcre_uint8 toptable1[] = {
257 ph10 168 0, 0, 0, 0, 0, 0,
258 nigel 77 ctype_digit, ctype_digit,
259     ctype_space, ctype_space,
260     ctype_word, ctype_word,
261 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
262 nigel 77 };
263    
/* Second of the two type-test tables, laid out entry-for-entry like toptable1.
Here only the first member of each type pair carries the ctype_* bit, the
second member is zero, and OP_ANY and OP_ALLANY are 1. NOTE(review): the code
that reads this table is outside this view; presumably the value is combined
with the masked ctypes bits (e.g. by XOR) so that one expression serves both
the positive and negated types - confirm at the point of use. */
264 ph10 836 static const pcre_uint8 toptable2[] = {
265 ph10 168 0, 0, 0, 0, 0, 0,
266 nigel 77 ctype_digit, 0,
267     ctype_space, 0,
268     ctype_word, 0,
269 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
270 nigel 77 };
271    
272    
273     /* Structure for holding data about a particular state, which is in effect the
274     current data for an active path through the match tree. It must consist
275     entirely of ints because the working vector we are passed, and which we put
276     these structures in, is a vector of ints. */
277    
/* One entry in the active or new state list. A negative offset is a special
marker: the state is held back until the number of characters in the data field
have been skipped, after which the offset is negated and used normally. */
278     typedef struct stateblock {
279     int offset; /* Offset to opcode */
280     int count; /* Count for repeats */
281     int data; /* Some use extra data */
282     } stateblock;
283    
/* Number of ints occupied by one stateblock. internal_dfa_exec() uses it to
turn the workspace size, which is given in ints, into the number of state slots
available in each of the two state lists. */
284 ph10 960 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))
285 nigel 77
286    
287 ph10 475 #ifdef PCRE_DEBUG
288 nigel 77 /*************************************************
289     * Print character string *
290     *************************************************/
291    
292     /* Character string printing function for debugging.
293    
294     Arguments:
295     p points to string
296     length number of bytes
297     f where to print
298    
299     Returns: nothing
300     */
301    
302     static void
303 ph10 836 pchars(const pcre_uchar *p, int length, FILE *f)
304 nigel 77 {
305 chpe 1091 pcre_uint32 c;
306 nigel 77 while (length-- > 0)
307     {
308     if (isprint(c = *(p++)))
309     fprintf(f, "%c", c);
310     else
311 chpe 1091 fprintf(f, "\\x{%02x}", c);
312 nigel 77 }
313     }
314     #endif
315    
316    
317    
318     /*************************************************
319     * Execute a Regular Expression - DFA engine *
320     *************************************************/
321    
322     /* This internal function applies a compiled pattern to a subject string,
323     starting at a given point, using a DFA engine. This function is called from the
324     external one, possibly multiple times if the pattern is not anchored. The
325     function calls itself recursively for some kinds of subpattern.
326    
327     Arguments:
328     md the match_data block with fixed information
329     this_start_code the opening bracket of this subexpression's code
330     current_subject where we currently are in the subject string
331     start_offset start offset in the subject string
332     offsets vector to contain the matching string offsets
333     offsetcount size of same
334     workspace vector of workspace
335     wscount size of same
336     rlevel function call recursion level
337    
338 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
339 ph10 341 = 0 => offsets overflowed; longest matches are present
340 nigel 77 -1 => failed to match
341     < -1 => some kind of unexpected problem
342    
343     The following macros are used for adding states to the two state vectors (one
344     for the current character, one for the following character). */
345    
/* Each macro appends one state to a list and advances that list's "next"
pointer. ADD_ACTIVE* append to the active list (active_count,
next_active_state); ADD_NEW* append to the new list (new_count,
next_new_state). The *_DATA forms also set the data field. If the list already
holds wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.

The macros expand names from the calling scope (the counters, the "next"
pointers, wscount, and rlevel when DPRINTF is enabled), and x, y, z may be
evaluated more than once, so arguments must have no side effects.

Each body is wrapped in do { ... } while (0) so that an invocation followed by
a semicolon is exactly one statement, whatever control structure encloses it. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
        (x), (y), (z), __LINE__)); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
388    
389     /* And now, here is the code */
390    
391     static int
392     internal_dfa_exec(
393     dfa_match_data *md,
394 ph10 836 const pcre_uchar *this_start_code,
395     const pcre_uchar *current_subject,
396 nigel 77 int start_offset,
397     int *offsets,
398     int offsetcount,
399     int *workspace,
400     int wscount,
401 ph10 642 int rlevel)
402 nigel 77 {
403     stateblock *active_states, *new_states, *temp_states;
404     stateblock *next_active_state, *next_new_state;
405    
406 ph10 836 const pcre_uint8 *ctypes, *lcc, *fcc;
407     const pcre_uchar *ptr;
408     const pcre_uchar *end_code, *first_op;
409 nigel 77
410 ph10 642 dfa_recursion_info new_recursive;
411    
412 nigel 77 int active_count, new_count, match_count;
413    
414     /* Some fields in the md block are frequently referenced, so we load them into
415     independent variables in the hope that this will perform better. */
416    
417 ph10 836 const pcre_uchar *start_subject = md->start_subject;
418     const pcre_uchar *end_subject = md->end_subject;
419     const pcre_uchar *start_code = md->start_code;
420 nigel 77
421 ph10 836 #ifdef SUPPORT_UTF
422     BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423 nigel 93 #else
424 ph10 836 BOOL utf = FALSE;
425 nigel 87 #endif
426 nigel 77
427 ph10 916 BOOL reset_could_continue = FALSE;
428    
429 nigel 77 rlevel++;
430     offsetcount &= (-2);
431    
432     wscount -= 2;
433     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
434     (2 * INTS_PER_STATEBLOCK);
435    
436     DPRINTF(("\n%.*s---------------------\n"
437 ph10 642 "%.*sCall to internal_dfa_exec f=%d\n",
438     rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439 nigel 77
440     ctypes = md->tables + ctypes_offset;
441     lcc = md->tables + lcc_offset;
442     fcc = md->tables + fcc_offset;
443    
444     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
445    
446     active_states = (stateblock *)(workspace + 2);
447     next_new_state = new_states = active_states + wscount;
448     new_count = 0;
449    
450 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
451 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452 ph10 836 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453     ? IMM2_SIZE:0);
454 nigel 93
455 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456     the alternative states onto the list, and find out where the end is. This
457     makes it possible to use this function recursively, when we want to stop at a
458     matching internal ket rather than at the end.
459    
460     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
461     a backward assertion. In that case, we have to find out the maximum amount to
462     move back, and set up each alternative appropriately. */
463    
464 nigel 93 if (*first_op == OP_REVERSE)
465 nigel 77 {
466     int max_back = 0;
467     int gone_back;
468    
469     end_code = this_start_code;
470     do
471     {
472     int back = GET(end_code, 2+LINK_SIZE);
473     if (back > max_back) max_back = back;
474     end_code += GET(end_code, 1);
475     }
476     while (*end_code == OP_ALT);
477    
478     /* If we can't go back the amount required for the longest lookbehind
479     pattern, go back as far as we can; some alternatives may still be viable. */
480    
481 ph10 836 #ifdef SUPPORT_UTF
482 nigel 77 /* In character mode we have to step back character by character */
483    
484 ph10 836 if (utf)
485 nigel 77 {
486     for (gone_back = 0; gone_back < max_back; gone_back++)
487     {
488     if (current_subject <= start_subject) break;
489     current_subject--;
490 ph10 836 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
491 nigel 77 }
492     }
493     else
494     #endif
495    
496     /* In byte-mode we can do this quickly. */
497    
498     {
499     gone_back = (current_subject - max_back < start_subject)?
500 ph10 530 (int)(current_subject - start_subject) : max_back;
501 nigel 77 current_subject -= gone_back;
502     }
503 ph10 461
504 ph10 435 /* Save the earliest consulted character */
505 nigel 77
506 ph10 461 if (current_subject < md->start_used_ptr)
507     md->start_used_ptr = current_subject;
508    
509 nigel 77 /* Now we can process the individual branches. */
510    
511     end_code = this_start_code;
512     do
513     {
514     int back = GET(end_code, 2+LINK_SIZE);
515     if (back <= gone_back)
516     {
517 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
519     }
520     end_code += GET(end_code, 1);
521     }
522     while (*end_code == OP_ALT);
523     }
524    
525     /* This is the code for a "normal" subpattern (not a backward assertion). The
526     start of a whole pattern is always one of these. If we are at the top level,
527     we may be asked to restart matching from the same point that we reached for a
528     previous partial match. We still have to scan through the top-level branches to
529     find the end state. */
530    
531     else
532     {
533     end_code = this_start_code;
534    
535     /* Restarting */
536    
537     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
538     {
539     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
540     new_count = workspace[1];
541     if (!workspace[0])
542     memcpy(new_states, active_states, new_count * sizeof(stateblock));
543     }
544    
545     /* Not restarting */
546    
547     else
548     {
549 nigel 93 int length = 1 + LINK_SIZE +
550 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551 ph10 836 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552     ? IMM2_SIZE:0);
553 nigel 77 do
554     {
555 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
556 nigel 77 end_code += GET(end_code, 1);
557 nigel 93 length = 1 + LINK_SIZE;
558 nigel 77 }
559     while (*end_code == OP_ALT);
560     }
561     }
562    
563     workspace[0] = 0; /* Bit indicating which vector is current */
564    
565 ph10 836 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566 nigel 77
567     /* Loop for scanning the subject */
568    
569     ptr = current_subject;
570     for (;;)
571     {
572     int i, j;
573 nigel 91 int clen, dlen;
574 chpe 1084 pcre_uint32 c, d;
575 ph10 428 int forced_fail = 0;
576 ph10 975 BOOL partial_newline = FALSE;
577 ph10 916 BOOL could_continue = reset_could_continue;
578 ph10 975 reset_could_continue = FALSE;
579    
580 nigel 77 /* Make the new state list into the active state list and empty the
581     new state list. */
582    
583     temp_states = active_states;
584     active_states = new_states;
585     new_states = temp_states;
586     active_count = new_count;
587     new_count = 0;
588    
589     workspace[0] ^= 1; /* Remember for the restarting feature */
590     workspace[1] = active_count;
591    
592 ph10 475 #ifdef PCRE_DEBUG
593 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594 ph10 836 pchars(ptr, STRLEN_UC(ptr), stdout);
595 nigel 77 printf("\"\n");
596    
597     printf("%.*sActive states: ", rlevel*2-2, SP);
598     for (i = 0; i < active_count; i++)
599     printf("%d/%d ", active_states[i].offset, active_states[i].count);
600     printf("\n");
601     #endif
602    
603     /* Set the pointers for adding new states */
604    
605     next_active_state = active_states + active_count;
606     next_new_state = new_states;
607    
608     /* Load the current character from the subject outside the loop, as many
609     different states may want to look at it, and we assume that at least one
610     will. */
611    
612     if (ptr < end_subject)
613     {
614 ph10 979 clen = 1; /* Number of data items in the character */
615 ph10 836 #ifdef SUPPORT_UTF
616 chpe 1100 GETCHARLENTEST(c, ptr, clen);
617     #else
618     c = *ptr;
619 ph10 836 #endif /* SUPPORT_UTF */
620 nigel 77 }
621     else
622     {
623 nigel 93 clen = 0; /* This indicates the end of the subject */
624     c = NOTACHAR; /* This value should never actually be used */
625 nigel 77 }
626    
627     /* Scan up the active states and act on each one. The result of an action
628     may be to add more states to the currently active list (e.g. on hitting a
629     parenthesis) or it may be to put states on the new list, for considering
630     when we move the character pointer on. */
631    
632     for (i = 0; i < active_count; i++)
633     {
634     stateblock *current_state = active_states + i;
635 ph10 654 BOOL caseless = FALSE;
636 ph10 836 const pcre_uchar *code;
637 nigel 77 int state_offset = current_state->offset;
638 ph10 1144 int codevalue, rrc;
639 ph10 1334 int count;
640 nigel 77
641 ph10 475 #ifdef PCRE_DEBUG
642 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
643 nigel 93 if (clen == 0) printf("EOL\n");
644 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
645     else printf("0x%02x\n", c);
646     #endif
647    
648     /* A negative offset is a special case meaning "hold off going to this
649     (negated) state until the number of characters in the data field have
650 ph10 975 been skipped". If the could_continue flag was passed over from a previous
651 ph10 916 state, arrange for it to be passed on. */
652 nigel 77
653     if (state_offset < 0)
654     {
655     if (current_state->data > 0)
656     {
657     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
658     ADD_NEW_DATA(state_offset, current_state->count,
659     current_state->data - 1);
660 ph10 916 if (could_continue) reset_could_continue = TRUE;
661 nigel 77 continue;
662     }
663     else
664     {
665     current_state->offset = state_offset = -state_offset;
666     }
667     }
668    
669 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
670 ph10 439 See the note at the head of this module about the possibility of improving
671     performance here. */
672 nigel 77
673     for (j = 0; j < i; j++)
674     {
675     if (active_states[j].offset == state_offset &&
676     active_states[j].count == current_state->count)
677     {
678     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
679     goto NEXT_ACTIVE_STATE;
680     }
681     }
682    
683     /* The state offset is the offset to the opcode */
684    
685     code = start_code + state_offset;
686     codevalue = *code;
687    
688 ph10 463 /* If this opcode inspects a character, but we are at the end of the
689     subject, remember the fact for use when testing for a partial match. */
690    
691 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
692 ph10 463 could_continue = TRUE;
693 ph10 462
694 nigel 77 /* If this opcode is followed by an inline character, load it. It is
695     tempting to test for the presence of a subject character here, but that
696     is wrong, because sometimes zero repetitions of the subject are
697     permitted.
698    
699     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
700 ph10 975 argument that is not a data character - but is always one byte long because
701 ph10 925 the values are small. We have to take special action to deal with \P, \p,
702     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
703     these ones to new opcodes. */
704 nigel 77
705     if (coptable[codevalue] > 0)
706     {
707     dlen = 1;
708 ph10 836 #ifdef SUPPORT_UTF
709     if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
710     #endif /* SUPPORT_UTF */
711 nigel 77 d = code[coptable[codevalue]];
712     if (codevalue >= OP_TYPESTAR)
713     {
714 nigel 93 switch(d)
715     {
716     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
717     case OP_NOTPROP:
718     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
719     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
720     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
721 ph10 178 case OP_NOT_HSPACE:
722 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
723 ph10 178 case OP_NOT_VSPACE:
724 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
725 nigel 93 default: break;
726     }
727 nigel 77 }
728     }
729     else
730     {
731     dlen = 0; /* Not strictly necessary, but compilers moan */
732 nigel 93 d = NOTACHAR; /* if these variables are not set. */
733 nigel 77 }
734    
735    
736     /* Now process the individual opcodes */
737    
738     switch (codevalue)
739     {
740 ph10 498 /* ========================================================================== */
741     /* These cases are never obeyed. This is a fudge that causes a compile-
742     time error if the vectors coptable or poptable, which are indexed by
743     opcode, are not the correct length. It seems to be the only way to do
744     such a check at compile time, as the sizeof() operator does not work
745     in the C preprocessor. */
746 ph10 507
747 ph10 498 case OP_TABLE_LENGTH:
748 ph10 507 case OP_TABLE_LENGTH +
749 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
750     (sizeof(poptable) == OP_TABLE_LENGTH)):
751 ph10 507 break;
752 nigel 77
753     /* ========================================================================== */
754     /* Reached a closing bracket. If not at the end of the pattern, carry
755 ph10 654 on with the next opcode. For repeating opcodes, also add the repeat
756     state. Note that KETRPOS will always be encountered at the end of the
757     subpattern, because the possessive subpattern repeats are always handled
758 ph10 604 using recursive calls. Thus, it never adds any new states.
759 ph10 654
760 ph10 604 At the end of the (sub)pattern, unless we have an empty string and
761 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
762 ph10 442 start of the subject, save the match data, shifting up all previous
763 nigel 77 matches so we always have the longest first. */
764    
765     case OP_KET:
766     case OP_KETRMIN:
767     case OP_KETRMAX:
768 ph10 654 case OP_KETRPOS:
769 nigel 77 if (code != end_code)
770     {
771     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
772     if (codevalue != OP_KET)
773     {
774     ADD_ACTIVE(state_offset - GET(code, 1), 0);
775     }
776     }
777 ph10 461 else
778 nigel 77 {
779 ph10 461 if (ptr > current_subject ||
780 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
781     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
782     current_subject > start_subject + md->start_offset)))
783 nigel 77 {
784 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
785 ph10 680 else if (match_count > 0 && ++match_count * 2 > offsetcount)
786 ph10 428 match_count = 0;
787     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
788     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
789     if (offsetcount >= 2)
790     {
791 ph10 530 offsets[0] = (int)(current_subject - start_subject);
792     offsets[1] = (int)(ptr - start_subject);
793 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
794 ph10 979 offsets[1] - offsets[0], (char *)current_subject));
795 ph10 428 }
796     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
797     {
798     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
799     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
800     match_count, rlevel*2-2, SP));
801     return match_count;
802     }
803 ph10 461 }
804 nigel 77 }
805     break;
806    
807     /* ========================================================================== */
808     /* These opcodes add to the current list of states without looking
809     at the current character. */
810    
811     /*-----------------------------------------------------------------*/
812     case OP_ALT:
813     do { code += GET(code, 1); } while (*code == OP_ALT);
814 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
815 nigel 77 break;
816    
817     /*-----------------------------------------------------------------*/
818     case OP_BRA:
819 nigel 93 case OP_SBRA:
820 nigel 77 do
821     {
822 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
823 nigel 77 code += GET(code, 1);
824     }
825     while (*code == OP_ALT);
826     break;
827    
828     /*-----------------------------------------------------------------*/
829 nigel 93 case OP_CBRA:
830     case OP_SCBRA:
831 ph10 836 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
832 nigel 93 code += GET(code, 1);
833     while (*code == OP_ALT)
834     {
835 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
836 nigel 93 code += GET(code, 1);
837     }
838     break;
839    
840     /*-----------------------------------------------------------------*/
841 nigel 77 case OP_BRAZERO:
842     case OP_BRAMINZERO:
843     ADD_ACTIVE(state_offset + 1, 0);
844     code += 1 + GET(code, 2);
845     while (*code == OP_ALT) code += GET(code, 1);
846 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
847 nigel 77 break;
848    
849     /*-----------------------------------------------------------------*/
850 ph10 335 case OP_SKIPZERO:
851     code += 1 + GET(code, 2);
852     while (*code == OP_ALT) code += GET(code, 1);
853 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
854 ph10 335 break;
855    
856     /*-----------------------------------------------------------------*/
857 nigel 77 case OP_CIRC:
858 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
859     { ADD_ACTIVE(state_offset + 1, 0); }
860     break;
861    
862     /*-----------------------------------------------------------------*/
863     case OP_CIRCM:
864 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
865 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
866 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
867     break;
868    
869     /*-----------------------------------------------------------------*/
870     case OP_EOD:
871 ph10 579 if (ptr >= end_subject)
872     {
873 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
874     could_continue = TRUE;
875     else { ADD_ACTIVE(state_offset + 1, 0); }
876     }
877 nigel 77 break;
878    
879     /*-----------------------------------------------------------------*/
880     case OP_SOD:
881     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
882     break;
883    
884     /*-----------------------------------------------------------------*/
885     case OP_SOM:
886     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
887     break;
888    
889    
890     /* ========================================================================== */
891     /* These opcodes inspect the next subject character, and sometimes
892     the previous one as well, but do not have an argument. The variable
893     clen contains the length of the current character and is zero if we are
894     at the end of the subject. */
895    
896     /*-----------------------------------------------------------------*/
897     case OP_ANY:
898 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
899 ph10 975 {
900 ph10 919 if (ptr + 1 >= md->end_subject &&
901     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
902     NLBLOCK->nltype == NLTYPE_FIXED &&
903 ph10 975 NLBLOCK->nllen == 2 &&
904 ph10 919 c == NLBLOCK->nl[0])
905     {
906 ph10 975 could_continue = partial_newline = TRUE;
907     }
908 ph10 919 else
909 ph10 975 {
910     ADD_NEW(state_offset + 1, 0);
911     }
912 ph10 919 }
913 nigel 77 break;
914    
915     /*-----------------------------------------------------------------*/
916 ph10 341 case OP_ALLANY:
917     if (clen > 0)
918     { ADD_NEW(state_offset + 1, 0); }
919     break;
920    
921     /*-----------------------------------------------------------------*/
922 nigel 77 case OP_EODN:
923 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
924     could_continue = TRUE;
925     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
926 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
927     break;
928    
929     /*-----------------------------------------------------------------*/
930     case OP_DOLL:
931     if ((md->moptions & PCRE_NOTEOL) == 0)
932     {
933 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
934     could_continue = TRUE;
935     else if (clen == 0 ||
936 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
937 ph10 602 (ptr == end_subject - md->nllen)
938 nigel 91 ))
939 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
940 ph10 916 else if (ptr + 1 >= md->end_subject &&
941     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
942     NLBLOCK->nltype == NLTYPE_FIXED &&
943 ph10 975 NLBLOCK->nllen == 2 &&
944 ph10 916 c == NLBLOCK->nl[0])
945     {
946     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
947     {
948     reset_could_continue = TRUE;
949 ph10 975 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
950     }
951     else could_continue = partial_newline = TRUE;
952     }
953 nigel 77 }
954 ph10 602 break;
955    
956     /*-----------------------------------------------------------------*/
957     case OP_DOLLM:
958     if ((md->moptions & PCRE_NOTEOL) == 0)
959     {
960     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
961     could_continue = TRUE;
962     else if (clen == 0 ||
963     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
964     { ADD_ACTIVE(state_offset + 1, 0); }
965 ph10 916 else if (ptr + 1 >= md->end_subject &&
966     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
967     NLBLOCK->nltype == NLTYPE_FIXED &&
968 ph10 975 NLBLOCK->nllen == 2 &&
969 ph10 916 c == NLBLOCK->nl[0])
970     {
971     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
972     {
973     reset_could_continue = TRUE;
974 ph10 975 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
975     }
976     else could_continue = partial_newline = TRUE;
977     }
978 ph10 602 }
979     else if (IS_NEWLINE(ptr))
980 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
981     break;
982    
983     /*-----------------------------------------------------------------*/
984    
985     case OP_DIGIT:
986     case OP_WHITESPACE:
987     case OP_WORDCHAR:
988     if (clen > 0 && c < 256 &&
989     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
990     { ADD_NEW(state_offset + 1, 0); }
991     break;
992    
993     /*-----------------------------------------------------------------*/
994     case OP_NOT_DIGIT:
995     case OP_NOT_WHITESPACE:
996     case OP_NOT_WORDCHAR:
997     if (clen > 0 && (c >= 256 ||
998     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
999     { ADD_NEW(state_offset + 1, 0); }
1000     break;
1001    
1002     /*-----------------------------------------------------------------*/
1003     case OP_WORD_BOUNDARY:
1004     case OP_NOT_WORD_BOUNDARY:
1005     {
1006     int left_word, right_word;
1007    
1008     if (ptr > start_subject)
1009     {
1010 ph10 836 const pcre_uchar *temp = ptr - 1;
1011 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1012 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1013 ph10 836 if (utf) { BACKCHAR(temp); }
1014 nigel 77 #endif
1015     GETCHARTEST(d, temp);
1016 ph10 535 #ifdef SUPPORT_UCP
1017 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
1018     {
1019     if (d == '_') left_word = TRUE; else
1020 ph10 535 {
1021 ph10 518 int cat = UCD_CATEGORY(d);
1022     left_word = (cat == ucp_L || cat == ucp_N);
1023 ph10 535 }
1024     }
1025     else
1026     #endif
1027 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1028     }
1029 ph10 518 else left_word = FALSE;
1030 nigel 77
1031 ph10 461 if (clen > 0)
1032 ph10 535 {
1033     #ifdef SUPPORT_UCP
1034 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
1035     {
1036     if (c == '_') right_word = TRUE; else
1037 ph10 535 {
1038 ph10 518 int cat = UCD_CATEGORY(c);
1039     right_word = (cat == ucp_L || cat == ucp_N);
1040 ph10 535 }
1041     }
1042     else
1043     #endif
1044 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1045 ph10 535 }
1046 ph10 518 else right_word = FALSE;
1047 nigel 77
1048     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1049     { ADD_ACTIVE(state_offset + 1, 0); }
1050     }
1051     break;
1052    
1053    
1054     /*-----------------------------------------------------------------*/
1055     /* Check the next character by Unicode property. We will get here only
1056     if the support is in the binary; otherwise a compile-time error occurs.
1057     */
1058    
1059 ph10 151 #ifdef SUPPORT_UCP
1060 nigel 77 case OP_PROP:
1061     case OP_NOTPROP:
1062     if (clen > 0)
1063     {
1064 nigel 87 BOOL OK;
1065 ph10 1221 const pcre_uint32 *cp;
1066 ph10 349 const ucd_record * prop = GET_UCD(c);
1067 nigel 87 switch(code[1])
1068 nigel 77 {
1069 nigel 87 case PT_ANY:
1070     OK = TRUE;
1071     break;
1072    
1073     case PT_LAMP:
1074 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1075 ph10 517 prop->chartype == ucp_Lt;
1076 nigel 87 break;
1077    
1078     case PT_GC:
1079 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1080 nigel 87 break;
1081    
1082     case PT_PC:
1083 ph10 349 OK = prop->chartype == code[2];
1084 nigel 87 break;
1085    
1086     case PT_SC:
1087 ph10 349 OK = prop->script == code[2];
1088 nigel 87 break;
1089 ph10 535
1090 ph10 517 /* These are specials for combination cases. */
1091 ph10 535
1092 ph10 517 case PT_ALNUM:
1093 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1094     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1095 ph10 535 break;
1096    
1097 ph10 517 case PT_SPACE: /* Perl space */
1098 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1099 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1100 ph10 535 break;
1101    
1102 ph10 517 case PT_PXSPACE: /* POSIX space */
1103 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1104 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1105     c == CHAR_FF || c == CHAR_CR;
1106 ph10 535 break;
1107    
1108 ph10 517 case PT_WORD:
1109 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1110     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1111 ph10 517 c == CHAR_UNDERSCORE;
1112 ph10 535 break;
1113 ph10 1221
1114 ph10 1046 case PT_CLIST:
1115 ph10 1218 cp = PRIV(ucd_caseless_sets) + code[2];
1116 ph10 1046 for (;;)
1117     {
1118     if (c < *cp) { OK = FALSE; break; }
1119     if (c == *cp++) { OK = TRUE; break; }
1120 ph10 1221 }
1121     break;
1122 ph10 1320
1123 ph10 1260 case PT_UCNC:
1124     OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1125 ph10 1320 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1126 ph10 1260 c >= 0xe000;
1127 ph10 1320 break;
1128 nigel 87
1129     /* Should never occur, but keep compilers from grumbling. */
1130    
1131     default:
1132     OK = codevalue != OP_PROP;
1133     break;
1134 nigel 77 }
1135 nigel 87
1136     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1137 nigel 77 }
1138     break;
1139     #endif
1140    
1141    
1142    
1143     /* ========================================================================== */
1144     /* These opcodes likewise inspect the subject character, but have an
1145     argument that is not a data character. It is one of these opcodes:
1146 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1147     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1148 nigel 77
1149     case OP_TYPEPLUS:
1150     case OP_TYPEMINPLUS:
1151 nigel 93 case OP_TYPEPOSPLUS:
1152 nigel 77 count = current_state->count; /* Already matched */
1153     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1154     if (clen > 0)
1155     {
1156 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1157     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1158     NLBLOCK->nltype == NLTYPE_FIXED &&
1159 ph10 975 NLBLOCK->nllen == 2 &&
1160 ph10 919 c == NLBLOCK->nl[0])
1161     {
1162 ph10 975 could_continue = partial_newline = TRUE;
1163     }
1164 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1165 nigel 77 (c < 256 &&
1166 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1167 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1168     {
1169 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1170     {
1171     active_count--; /* Remove non-match possibility */
1172     next_active_state--;
1173     }
1174 nigel 77 count++;
1175     ADD_NEW(state_offset, count);
1176     }
1177     }
1178     break;
1179    
1180     /*-----------------------------------------------------------------*/
1181     case OP_TYPEQUERY:
1182     case OP_TYPEMINQUERY:
1183 nigel 93 case OP_TYPEPOSQUERY:
1184 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1185     if (clen > 0)
1186     {
1187 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1188     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1189     NLBLOCK->nltype == NLTYPE_FIXED &&
1190 ph10 975 NLBLOCK->nllen == 2 &&
1191 ph10 919 c == NLBLOCK->nl[0])
1192     {
1193 ph10 975 could_continue = partial_newline = TRUE;
1194     }
1195 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1196 nigel 77 (c < 256 &&
1197 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1198 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1199     {
1200 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1201     {
1202     active_count--; /* Remove non-match possibility */
1203     next_active_state--;
1204     }
1205 nigel 77 ADD_NEW(state_offset + 2, 0);
1206     }
1207     }
1208     break;
1209    
1210     /*-----------------------------------------------------------------*/
1211     case OP_TYPESTAR:
1212     case OP_TYPEMINSTAR:
1213 nigel 93 case OP_TYPEPOSSTAR:
1214 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1215     if (clen > 0)
1216     {
1217 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1218     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1219     NLBLOCK->nltype == NLTYPE_FIXED &&
1220 ph10 975 NLBLOCK->nllen == 2 &&
1221 ph10 919 c == NLBLOCK->nl[0])
1222     {
1223 ph10 975 could_continue = partial_newline = TRUE;
1224     }
1225 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1226 nigel 77 (c < 256 &&
1227 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1228 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1229     {
1230 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1231     {
1232     active_count--; /* Remove non-match possibility */
1233     next_active_state--;
1234     }
1235 nigel 77 ADD_NEW(state_offset, 0);
1236     }
1237     }
1238     break;
1239    
1240     /*-----------------------------------------------------------------*/
1241     case OP_TYPEEXACT:
1242 nigel 93 count = current_state->count; /* Number already matched */
1243     if (clen > 0)
1244     {
1245 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1246     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1247     NLBLOCK->nltype == NLTYPE_FIXED &&
1248 ph10 975 NLBLOCK->nllen == 2 &&
1249 ph10 919 c == NLBLOCK->nl[0])
1250     {
1251 ph10 975 could_continue = partial_newline = TRUE;
1252     }
1253 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1254 nigel 93 (c < 256 &&
1255 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1256 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1257     {
1258 ph10 1334 if (++count >= (int)GET2(code, 1))
1259 ph10 836 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1260 nigel 93 else
1261     { ADD_NEW(state_offset, count); }
1262     }
1263     }
1264     break;
1265    
1266     /*-----------------------------------------------------------------*/
1267 nigel 77 case OP_TYPEUPTO:
1268     case OP_TYPEMINUPTO:
1269 nigel 93 case OP_TYPEPOSUPTO:
1270 ph10 836 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1271 nigel 77 count = current_state->count; /* Number already matched */
1272     if (clen > 0)
1273     {
1274 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1275     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1276     NLBLOCK->nltype == NLTYPE_FIXED &&
1277 ph10 975 NLBLOCK->nllen == 2 &&
1278 ph10 919 c == NLBLOCK->nl[0])
1279     {
1280 ph10 975 could_continue = partial_newline = TRUE;
1281     }
1282 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1283 nigel 77 (c < 256 &&
1284 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1285 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1286     {
1287 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1288     {
1289     active_count--; /* Remove non-match possibility */
1290     next_active_state--;
1291     }
1292 ph10 1334 if (++count >= (int)GET2(code, 1))
1293 ph10 836 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1294 nigel 77 else
1295     { ADD_NEW(state_offset, count); }
1296     }
1297     }
1298     break;
1299    
1300     /* ========================================================================== */
1301     /* These are virtual opcodes that are used when something like
1302 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical or
1303     horizontal space type (compared with OP_VSPACE or OP_HSPACE) as its argument.
1304     It keeps the code above fast for the other cases. The argument is in the d variable. */
1305 nigel 77
1306 ph10 151 #ifdef SUPPORT_UCP
1307 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1308     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1309 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1310 nigel 77 count = current_state->count; /* Already matched */
1311 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1312 nigel 77 if (clen > 0)
1313     {
1314 nigel 87 BOOL OK;
1315 ph10 1221 const pcre_uint32 *cp;
1316 ph10 349 const ucd_record * prop = GET_UCD(c);
1317 nigel 87 switch(code[2])
1318     {
1319     case PT_ANY:
1320     OK = TRUE;
1321     break;
1322    
1323     case PT_LAMP:
1324 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1325 ph10 517 prop->chartype == ucp_Lt;
1326 nigel 87 break;
1327    
1328     case PT_GC:
1329 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1330 nigel 87 break;
1331    
1332     case PT_PC:
1333 ph10 349 OK = prop->chartype == code[3];
1334 nigel 87 break;
1335    
1336     case PT_SC:
1337 ph10 349 OK = prop->script == code[3];
1338 nigel 87 break;
1339    
1340 ph10 517 /* These are specials for combination cases. */
1341 ph10 535
1342 ph10 517 case PT_ALNUM:
1343 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1344     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1345 ph10 535 break;
1346    
1347 ph10 517 case PT_SPACE: /* Perl space */
1348 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1349 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1350 ph10 535 break;
1351    
1352 ph10 517 case PT_PXSPACE: /* POSIX space */
1353 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1354 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1355     c == CHAR_FF || c == CHAR_CR;
1356 ph10 535 break;
1357    
1358 ph10 517 case PT_WORD:
1359 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1360     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1361 ph10 517 c == CHAR_UNDERSCORE;
1362 ph10 535 break;
1363 ph10 517
1364 ph10 1046 case PT_CLIST:
1365 ph10 1218 cp = PRIV(ucd_caseless_sets) + code[3];
1366 ph10 1046 for (;;)
1367     {
1368     if (c < *cp) { OK = FALSE; break; }
1369     if (c == *cp++) { OK = TRUE; break; }
1370 ph10 1221 }
1371     break;
1372 ph10 1046
1373 ph10 1260 case PT_UCNC:
1374     OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1375 ph10 1320 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1376 ph10 1260 c >= 0xe000;
1377 ph10 1320 break;
1378 ph10 1260
1379 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1380    
1381     default:
1382     OK = codevalue != OP_PROP;
1383     break;
1384     }
1385    
1386 nigel 93 if (OK == (d == OP_PROP))
1387     {
1388     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1389     {
1390     active_count--; /* Remove non-match possibility */
1391     next_active_state--;
1392     }
1393     count++;
1394     ADD_NEW(state_offset, count);
1395     }
1396 nigel 77 }
1397     break;
1398    
1399     /*-----------------------------------------------------------------*/
1400     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1401     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1402 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1403 nigel 77 count = current_state->count; /* Already matched */
1404     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1405 ph10 1011 if (clen > 0)
1406 nigel 77 {
1407 ph10 1033 int lgb, rgb;
1408 ph10 836 const pcre_uchar *nptr = ptr + clen;
1409 nigel 77 int ncount = 0;
1410 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1411     {
1412     active_count--; /* Remove non-match possibility */
1413     next_active_state--;
1414     }
1415 ph10 1033 lgb = UCD_GRAPHBREAK(c);
1416 nigel 77 while (nptr < end_subject)
1417     {
1418 ph10 1011 dlen = 1;
1419     if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1420 ph10 1033 rgb = UCD_GRAPHBREAK(d);
1421 ph10 1015 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1422 nigel 77 ncount++;
1423 ph10 1033 lgb = rgb;
1424 ph10 1011 nptr += dlen;
1425 nigel 77 }
1426     count++;
1427     ADD_NEW_DATA(-state_offset, count, ncount);
1428     }
1429     break;
1430 ph10 151 #endif
1431 nigel 77
1432     /*-----------------------------------------------------------------*/
1433 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1434     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1435     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1436     count = current_state->count; /* Already matched */
1437     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1438     if (clen > 0)
1439     {
1440     int ncount = 0;
1441     switch (c)
1442     {
1443 ph10 1033 case CHAR_VT:
1444     case CHAR_FF:
1445     case CHAR_NEL:
1446     #ifndef EBCDIC
1447 nigel 93 case 0x2028:
1448     case 0x2029:
1449 ph10 1033 #endif /* Not EBCDIC */
1450 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1451     goto ANYNL01;
1452    
1453 ph10 1033 case CHAR_CR:
1454 chpe 1100 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1455 ph10 231 /* Fall through */
1456    
1457     ANYNL01:
1458 ph10 1033 case CHAR_LF:
1459 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1460     {
1461     active_count--; /* Remove non-match possibility */
1462     next_active_state--;
1463     }
1464     count++;
1465     ADD_NEW_DATA(-state_offset, count, ncount);
1466     break;
1467 ph10 231
1468 nigel 93 default:
1469     break;
1470     }
1471     }
1472     break;
1473    
1474     /*-----------------------------------------------------------------*/
1475 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1476     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1477     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1478     count = current_state->count; /* Already matched */
1479     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480     if (clen > 0)
1481     {
1482 ph10 182 BOOL OK;
1483 ph10 178 switch (c)
1484     {
1485 ph10 1221 VSPACE_CASES:
1486 ph10 178 OK = TRUE;
1487 ph10 182 break;
1488 ph10 178
1489     default:
1490     OK = FALSE;
1491 ph10 182 break;
1492 ph10 178 }
1493    
1494     if (OK == (d == OP_VSPACE))
1495 ph10 182 {
1496 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1497     {
1498     active_count--; /* Remove non-match possibility */
1499     next_active_state--;
1500     }
1501     count++;
1502     ADD_NEW_DATA(-state_offset, count, 0);
1503     }
1504     }
1505     break;
1506    
1507     /*-----------------------------------------------------------------*/
1508     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1509     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1510     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1511     count = current_state->count; /* Already matched */
1512     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1513     if (clen > 0)
1514     {
1515 ph10 182 BOOL OK;
1516 ph10 178 switch (c)
1517     {
1518 ph10 1221 HSPACE_CASES:
1519 ph10 178 OK = TRUE;
1520     break;
1521 ph10 182
1522 ph10 178 default:
1523     OK = FALSE;
1524     break;
1525     }
1526 ph10 182
1527 ph10 178 if (OK == (d == OP_HSPACE))
1528 ph10 182 {
1529 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1530     {
1531     active_count--; /* Remove non-match possibility */
1532     next_active_state--;
1533     }
1534     count++;
1535     ADD_NEW_DATA(-state_offset, count, 0);
1536     }
1537     }
1538     break;
1539    
1540     /*-----------------------------------------------------------------*/
1541 ph10 151 #ifdef SUPPORT_UCP
1542 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1543     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1544 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1545 nigel 87 count = 4;
1546 nigel 77 goto QS1;
1547    
1548     case OP_PROP_EXTRA + OP_TYPESTAR:
1549     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1550 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1551 nigel 77 count = 0;
1552    
1553     QS1:
1554    
1555 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1556 nigel 77 if (clen > 0)
1557     {
1558 nigel 87 BOOL OK;
1559 ph10 1221 const pcre_uint32 *cp;
1560 ph10 349 const ucd_record * prop = GET_UCD(c);
1561 nigel 87 switch(code[2])
1562     {
1563     case PT_ANY:
1564     OK = TRUE;
1565     break;
1566    
1567     case PT_LAMP:
1568 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1569 ph10 517 prop->chartype == ucp_Lt;
1570 nigel 87 break;
1571    
1572     case PT_GC:
1573 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1574 nigel 87 break;
1575    
1576     case PT_PC:
1577 ph10 349 OK = prop->chartype == code[3];
1578 nigel 87 break;
1579    
1580     case PT_SC:
1581 ph10 349 OK = prop->script == code[3];
1582 nigel 87 break;
1583 ph10 535
1584 ph10 517 /* These are specials for combination cases. */
1585 ph10 535
1586 ph10 517 case PT_ALNUM:
1587 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1588     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1589 ph10 535 break;
1590    
1591 ph10 517 case PT_SPACE: /* Perl space */
1592 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1593 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1594 ph10 535 break;
1595    
1596 ph10 517 case PT_PXSPACE: /* POSIX space */
1597 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1598 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1599     c == CHAR_FF || c == CHAR_CR;
1600 ph10 535 break;
1601    
1602 ph10 517 case PT_WORD:
1603 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1604     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1605 ph10 517 c == CHAR_UNDERSCORE;
1606 ph10 535 break;
1607 nigel 87
1608 ph10 1046 case PT_CLIST:
1609 ph10 1218 cp = PRIV(ucd_caseless_sets) + code[3];
1610 ph10 1046 for (;;)
1611     {
1612     if (c < *cp) { OK = FALSE; break; }
1613     if (c == *cp++) { OK = TRUE; break; }
1614 ph10 1221 }
1615     break;
1616 ph10 1046
1617 ph10 1260 case PT_UCNC:
1618     OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1619 ph10 1320 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1620 ph10 1260 c >= 0xe000;
1621 ph10 1320 break;
1622 ph10 1260
1623 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1624    
1625     default:
1626     OK = codevalue != OP_PROP;
1627     break;
1628     }
1629    
1630 nigel 93 if (OK == (d == OP_PROP))
1631     {
1632     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1633     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1634     {
1635     active_count--; /* Remove non-match possibility */
1636     next_active_state--;
1637     }
1638     ADD_NEW(state_offset + count, 0);
1639     }
1640 nigel 77 }
1641     break;
1642    
1643     /*-----------------------------------------------------------------*/
1644     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1645     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1646 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1647 nigel 77 count = 2;
1648     goto QS2;
1649    
1650     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1651     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1652 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1653 nigel 77 count = 0;
1654    
1655     QS2:
1656    
1657     ADD_ACTIVE(state_offset + 2, 0);
1658 ph10 1011 if (clen > 0)
1659 nigel 77 {
1660 ph10 1033 int lgb, rgb;
1661 ph10 836 const pcre_uchar *nptr = ptr + clen;
1662 nigel 77 int ncount = 0;
1663 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1664     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1665     {
1666     active_count--; /* Remove non-match possibility */
1667     next_active_state--;
1668     }
1669 ph10 1033 lgb = UCD_GRAPHBREAK(c);
1670 nigel 77 while (nptr < end_subject)
1671     {
1672 ph10 1011 dlen = 1;
1673     if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1674 ph10 1033 rgb = UCD_GRAPHBREAK(d);
1675 ph10 1015 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1676 nigel 77 ncount++;
1677 ph10 1033 lgb = rgb;
1678 ph10 1011 nptr += dlen;
1679 nigel 77 }
1680     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1681     }
1682     break;
1683 ph10 151 #endif
1684 nigel 77
1685     /*-----------------------------------------------------------------*/
1686 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1687     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1688     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1689     count = 2;
1690     goto QS3;
1691    
1692     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1693     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1694     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1695     count = 0;
1696    
1697     QS3:
1698     ADD_ACTIVE(state_offset + 2, 0);
1699     if (clen > 0)
1700     {
1701     int ncount = 0;
1702     switch (c)
1703     {
1704 ph10 1033 case CHAR_VT:
1705     case CHAR_FF:
1706     case CHAR_NEL:
1707     #ifndef EBCDIC
1708 nigel 93 case 0x2028:
1709     case 0x2029:
1710 ph10 1033 #endif /* Not EBCDIC */
1711 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1712     goto ANYNL02;
1713    
1714 ph10 1033 case CHAR_CR:
1715 chpe 1100 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1716 ph10 231 /* Fall through */
1717    
1718     ANYNL02:
1719 ph10 1033 case CHAR_LF:
1720 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1721     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1722     {
1723     active_count--; /* Remove non-match possibility */
1724     next_active_state--;
1725     }
1726 ph10 1233 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1727 nigel 93 break;
1728 ph10 231
1729 nigel 93 default:
1730     break;
1731     }
1732     }
1733     break;
1734    
1735     /*-----------------------------------------------------------------*/
1736 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1737     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1738     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1739     count = 2;
1740     goto QS4;
1741    
1742     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1743     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1744     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1745     count = 0;
1746    
1747     QS4:
1748     ADD_ACTIVE(state_offset + 2, 0);
1749     if (clen > 0)
1750     {
1751 ph10 182 BOOL OK;
1752 ph10 178 switch (c)
1753     {
1754 ph10 1221 VSPACE_CASES:
1755 ph10 178 OK = TRUE;
1756     break;
1757 ph10 182
1758 ph10 178 default:
1759     OK = FALSE;
1760     break;
1761     }
1762     if (OK == (d == OP_VSPACE))
1763 ph10 182 {
1764 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1765     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1766     {
1767     active_count--; /* Remove non-match possibility */
1768     next_active_state--;
1769     }
1770 ph10 1233 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1771 ph10 178 }
1772     }
1773     break;
1774    
1775     /*-----------------------------------------------------------------*/
1776     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1777     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1778     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1779     count = 2;
1780     goto QS5;
1781    
1782     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1783     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1784     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1785     count = 0;
1786    
1787     QS5:
1788     ADD_ACTIVE(state_offset + 2, 0);
1789     if (clen > 0)
1790     {
1791 ph10 182 BOOL OK;
1792 ph10 178 switch (c)
1793     {
1794 ph10 1221 HSPACE_CASES:
1795 ph10 178 OK = TRUE;
1796     break;
1797 ph10 182
1798 ph10 178 default:
1799     OK = FALSE;
1800     break;
1801     }
1802 ph10 182
1803 ph10 178 if (OK == (d == OP_HSPACE))
1804 ph10 182 {
1805 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1806     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1807     {
1808     active_count--; /* Remove non-match possibility */
1809     next_active_state--;
1810     }
1811 ph10 1233 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1812 ph10 178 }
1813     }
1814     break;
1815    
1816     /*-----------------------------------------------------------------*/
1817 ph10 151 #ifdef SUPPORT_UCP
1818 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1819     case OP_PROP_EXTRA + OP_TYPEUPTO:
1820     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1821 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1822 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1823 ph10 836 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1824 nigel 77 count = current_state->count; /* Number already matched */
1825     if (clen > 0)
1826     {
1827 nigel 87 BOOL OK;
1828 ph10 1221 const pcre_uint32 *cp;
1829 ph10 349 const ucd_record * prop = GET_UCD(c);
1830 ph10 836 switch(code[1 + IMM2_SIZE + 1])
1831 nigel 77 {
1832 nigel 87 case PT_ANY:
1833     OK = TRUE;
1834     break;
1835    
1836     case PT_LAMP:
1837 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1838 ph10 517 prop->chartype == ucp_Lt;
1839 nigel 87 break;
1840    
1841     case PT_GC:
1842 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1843 nigel 87 break;
1844    
1845     case PT_PC:
1846 ph10 836 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1847 nigel 87 break;
1848    
1849     case PT_SC:
1850 ph10 836 OK = prop->script == code[1 + IMM2_SIZE + 2];
1851 nigel 87 break;
1852 ph10 535
1853 ph10 517 /* These are specials for combination cases. */
1854 ph10 535
1855 ph10 517 case PT_ALNUM:
1856 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1857     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1858 ph10 535 break;
1859    
1860 ph10 517 case PT_SPACE: /* Perl space */
1861 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1862 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1863 ph10 535 break;
1864    
1865 ph10 517 case PT_PXSPACE: /* POSIX space */
1866 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1867 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1868     c == CHAR_FF || c == CHAR_CR;
1869 ph10 535 break;
1870    
1871 ph10 517 case PT_WORD:
1872 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1873     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1874 ph10 517 c == CHAR_UNDERSCORE;
1875 ph10 535 break;
1876 nigel 87
1877 ph10 1046 case PT_CLIST:
1878 ph10 1218 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1879 ph10 1046 for (;;)
1880     {
1881     if (c < *cp) { OK = FALSE; break; }
1882     if (c == *cp++) { OK = TRUE; break; }
1883 ph10 1221 }
1884     break;
1885 ph10 1046
1886 ph10 1260 case PT_UCNC:
1887     OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1888 ph10 1320 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1889 ph10 1260 c >= 0xe000;
1890 ph10 1320 break;
1891 ph10 1260
1892 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1893    
1894     default:
1895     OK = codevalue != OP_PROP;
1896     break;
1897     }
1898    
1899     if (OK == (d == OP_PROP))
1900     {
1901 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1902     {
1903     active_count--; /* Remove non-match possibility */
1904     next_active_state--;
1905     }
1906 ph10 1334 if (++count >= (int)GET2(code, 1))
1907 ph10 836 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1908 nigel 77 else
1909     { ADD_NEW(state_offset, count); }
1910     }
1911     }
1912     break;
1913    
1914     /*-----------------------------------------------------------------*/
1915     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1916     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1917     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1918 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1919 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1920 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1921 nigel 77 count = current_state->count; /* Number already matched */
1922 ph10 1011 if (clen > 0)
1923 nigel 77 {
1924 ph10 1033 int lgb, rgb;
1925 ph10 836 const pcre_uchar *nptr = ptr + clen;
1926 nigel 77 int ncount = 0;
1927 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1928     {
1929     active_count--; /* Remove non-match possibility */
1930     next_active_state--;
1931     }
1932 ph10 1033 lgb = UCD_GRAPHBREAK(c);
1933 nigel 77 while (nptr < end_subject)
1934     {
1935 ph10 1011 dlen = 1;
1936     if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1937 ph10 1033 rgb = UCD_GRAPHBREAK(d);
1938 ph10 1015 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1939 nigel 77 ncount++;
1940 ph10 1033 lgb = rgb;
1941 ph10 1011 nptr += dlen;
1942 nigel 77 }
1943 ph10 975 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1944     reset_could_continue = TRUE;
1945 ph10 1334 if (++count >= (int)GET2(code, 1))
1946 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1947 nigel 77 else
1948     { ADD_NEW_DATA(-state_offset, count, ncount); }
1949     }
1950     break;
1951 ph10 151 #endif
1952 nigel 77
1953 nigel 93 /*-----------------------------------------------------------------*/
1954     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1955     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1956     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1957     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1958     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1959 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1960 nigel 93 count = current_state->count; /* Number already matched */
1961     if (clen > 0)
1962     {
1963     int ncount = 0;
1964     switch (c)
1965     {
1966 ph10 1033 case CHAR_VT:
1967     case CHAR_FF:
1968     case CHAR_NEL:
1969     #ifndef EBCDIC
1970 nigel 93 case 0x2028:
1971     case 0x2029:
1972 ph10 1033 #endif /* Not EBCDIC */
1973 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1974     goto ANYNL03;
1975    
1976 ph10 1033 case CHAR_CR:
1977 chpe 1100 if (ptr + 1 < end_subject && RAWUCHARTEST(ptr + 1) == CHAR_LF) ncount = 1;
1978 ph10 231 /* Fall through */
1979    
1980     ANYNL03:
1981 ph10 1033 case CHAR_LF:
1982 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1983     {
1984     active_count--; /* Remove non-match possibility */
1985     next_active_state--;
1986     }
1987 ph10 1334 if (++count >= (int)GET2(code, 1))
1988 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1989 nigel 93 else
1990     { ADD_NEW_DATA(-state_offset, count, ncount); }
1991     break;
1992 ph10 231
1993 nigel 93 default:
1994     break;
1995     }
1996     }
1997     break;
1998    
1999 ph10 178 /*-----------------------------------------------------------------*/
2000     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2001     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2002     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2003     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2004     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2005 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2006 ph10 178 count = current_state->count; /* Number already matched */
2007     if (clen > 0)
2008     {
2009 ph10 182 BOOL OK;
2010 ph10 178 switch (c)
2011     {
2012 ph10 1221 VSPACE_CASES:
2013 ph10 178 OK = TRUE;
2014     break;
2015 ph10 182
2016 ph10 178 default:
2017     OK = FALSE;
2018     }
2019 ph10 182
2020 ph10 178 if (OK == (d == OP_VSPACE))
2021 ph10 182 {
2022 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2023     {
2024     active_count--; /* Remove non-match possibility */
2025     next_active_state--;
2026     }
2027 ph10 1334 if (++count >= (int)GET2(code, 1))
2028 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2029 ph10 178 else
2030     { ADD_NEW_DATA(-state_offset, count, 0); }
2031     }
2032     }
2033     break;
2034    
2035     /*-----------------------------------------------------------------*/
2036     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2037     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2038     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2039     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2040     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2041 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2042 ph10 178 count = current_state->count; /* Number already matched */
2043     if (clen > 0)
2044     {
2045 ph10 182 BOOL OK;
2046 ph10 178 switch (c)
2047     {
2048 ph10 1221 HSPACE_CASES:
2049 ph10 178 OK = TRUE;
2050     break;
2051 ph10 182
2052 ph10 178 default:
2053     OK = FALSE;
2054     break;
2055     }
2056 ph10 182
2057 ph10 178 if (OK == (d == OP_HSPACE))
2058 ph10 182 {
2059 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2060     {
2061     active_count--; /* Remove non-match possibility */
2062     next_active_state--;
2063     }
2064 ph10 1334 if (++count >= (int)GET2(code, 1))
2065 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2066 ph10 178 else
2067     { ADD_NEW_DATA(-state_offset, count, 0); }
2068     }
2069     }
2070     break;
2071    
2072 nigel 77 /* ========================================================================== */
2073     /* These opcodes are followed by a character that is usually compared
2074     to the current subject character; it is loaded into d. We still get
2075     here even if there is no subject character, because in some cases zero
2076     repetitions are permitted. */
2077    
2078     /*-----------------------------------------------------------------*/
2079     case OP_CHAR:
2080     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2081     break;
2082    
2083     /*-----------------------------------------------------------------*/
2084 ph10 602 case OP_CHARI:
2085 nigel 77 if (clen == 0) break;
2086    
2087 ph10 836 #ifdef SUPPORT_UTF
2088     if (utf)
2089 nigel 77 {
2090     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2091     {
2092 nigel 93 unsigned int othercase;
2093 ph10 836 if (c < 128)
2094     othercase = fcc[c];
2095     else
2096     /* If we have Unicode property support, we can use it to test the
2097     other case of the character. */
2098 nigel 77 #ifdef SUPPORT_UCP
2099 ph10 836 othercase = UCD_OTHERCASE(c);
2100 nigel 87 #else
2101 ph10 836 othercase = NOTACHAR;
2102 nigel 77 #endif
2103    
2104     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2105     }
2106     }
2107     else
2108 ph10 836 #endif /* SUPPORT_UTF */
2109     /* Not UTF mode */
2110 nigel 77 {
2111 ph10 836 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2112     { ADD_NEW(state_offset + 2, 0); }
2113 nigel 77 }
2114     break;
2115    
2116    
2117     #ifdef SUPPORT_UCP
2118     /*-----------------------------------------------------------------*/
2119     /* This is a tricky one because it can match more than one character.
2120     Find out how many characters to skip, and then set up a negative state
2121     to wait for them to pass before continuing. */
2122    
2123     case OP_EXTUNI:
2124 ph10 1011 if (clen > 0)
2125 nigel 77 {
2126 ph10 1033 int lgb, rgb;
2127 ph10 836 const pcre_uchar *nptr = ptr + clen;
2128 nigel 77 int ncount = 0;
2129 ph10 1033 lgb = UCD_GRAPHBREAK(c);
2130 nigel 77 while (nptr < end_subject)
2131     {
2132 ph10 1011 dlen = 1;
2133     if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2134 ph10 1033 rgb = UCD_GRAPHBREAK(d);
2135 ph10 1015 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2136 nigel 77 ncount++;
2137 ph10 1033 lgb = rgb;
2138 ph10 1011 nptr += dlen;
2139 nigel 77 }
2140 ph10 975 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2141     reset_could_continue = TRUE;
2142 nigel 77 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2143     }
2144     break;
2145     #endif
2146    
2147     /*-----------------------------------------------------------------*/
2148 nigel 93 /* This is a tricky one like EXTUNI because it too can match more than one
2149     character (when CR is followed by LF). In this case, set up a negative
2150     state to wait for one character to pass before continuing. */
2151    
2152     case OP_ANYNL:
2153     if (clen > 0) switch(c)
2154     {
2155 ph10 1033 case CHAR_VT:
2156     case CHAR_FF:
2157     case CHAR_NEL:
2158     #ifndef EBCDIC
2159 nigel 93 case 0x2028:
2160     case 0x2029:
2161 ph10 1033 #endif /* Not EBCDIC */
2162 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2163    
2164 ph10 1033 case CHAR_LF:
2165 nigel 93 ADD_NEW(state_offset + 1, 0);
2166     break;
2167 ph10 231
2168 ph10 1033 case CHAR_CR:
2169 ph10 975 if (ptr + 1 >= end_subject)
2170 nigel 93 {
2171 ph10 975 ADD_NEW(state_offset + 1, 0);
2172     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2173     reset_could_continue = TRUE;
2174     }
2175 chpe 1100 else if (RAWUCHARTEST(ptr + 1) == CHAR_LF)
2176 ph10 916 {
2177 nigel 93 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2178     }
2179     else
2180 ph10 975 {
2181 nigel 93 ADD_NEW(state_offset + 1, 0);
2182 ph10 975 }
2183 nigel 93 break;
2184     }
2185     break;
2186    
2187     /*-----------------------------------------------------------------*/
2188 ph10 178 case OP_NOT_VSPACE:
2189     if (clen > 0) switch(c)
2190     {
2191 ph10 1221 VSPACE_CASES:
2192 ph10 178 break;
2193 ph10 182
2194     default:
2195 ph10 178 ADD_NEW(state_offset + 1, 0);
2196     break;
2197     }
2198     break;
2199    
2200     /*-----------------------------------------------------------------*/
2201     case OP_VSPACE:
2202     if (clen > 0) switch(c)
2203     {
2204 ph10 1221 VSPACE_CASES:
2205 ph10 178 ADD_NEW(state_offset + 1, 0);
2206     break;
2207 ph10 182
2208 ph10 1221 default:
2209 ph10 1041 break;
2210 ph10 178 }
2211     break;
2212    
2213     /*-----------------------------------------------------------------*/
2214     case OP_NOT_HSPACE:
2215     if (clen > 0) switch(c)
2216     {
2217 ph10 1221 HSPACE_CASES:
2218 ph10 178 break;
2219 ph10 182
2220     default:
2221 ph10 178 ADD_NEW(state_offset + 1, 0);
2222     break;
2223     }
2224     break;
2225    
2226     /*-----------------------------------------------------------------*/
2227     case OP_HSPACE:
2228     if (clen > 0) switch(c)
2229     {
2230 ph10 1221 HSPACE_CASES:
2231 ph10 178 ADD_NEW(state_offset + 1, 0);
2232     break;
2233 ph10 1221
2234 ph10 1041 default:
2235 ph10 1221 break;
2236 ph10 178 }
2237     break;
2238    
2239     /*-----------------------------------------------------------------*/
2240 ph10 925 /* Match a negated single character casefully. */
2241 nigel 77
2242     case OP_NOT:
2243 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2244 nigel 77 break;
2245    
2246     /*-----------------------------------------------------------------*/
2247 ph10 925 /* Match a negated single character caselessly. */
2248 ph10 602
2249     case OP_NOTI:
2250 ph10 925 if (clen > 0)
2251 ph10 975 {
2252 ph10 925 unsigned int otherd;
2253     #ifdef SUPPORT_UTF
2254     if (utf && d >= 128)
2255     {
2256     #ifdef SUPPORT_UCP
2257     otherd = UCD_OTHERCASE(d);
2258     #endif /* SUPPORT_UCP */
2259     }
2260     else
2261     #endif /* SUPPORT_UTF */
2262     otherd = TABLE_GET(d, fcc, d);
2263     if (c != d && c != otherd)
2264     { ADD_NEW(state_offset + dlen + 1, 0); }
2265 ph10 975 }
2266 ph10 602 break;
2267    
2268     /*-----------------------------------------------------------------*/
2269     case OP_PLUSI:
2270     case OP_MINPLUSI:
2271     case OP_POSPLUSI:
2272     case OP_NOTPLUSI:
2273     case OP_NOTMINPLUSI:
2274     case OP_NOTPOSPLUSI:
2275     caseless = TRUE;
2276     codevalue -= OP_STARI - OP_STAR;
2277 ph10 654
2278 ph10 602 /* Fall through */
2279 nigel 77 case OP_PLUS:
2280     case OP_MINPLUS:
2281 nigel 93 case OP_POSPLUS:
2282 nigel 77 case OP_NOTPLUS:
2283     case OP_NOTMINPLUS:
2284 nigel 93 case OP_NOTPOSPLUS:
2285 nigel 77 count = current_state->count; /* Already matched */
2286     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2287     if (clen > 0)
2288     {
2289 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2290 ph10 602 if (caseless)
2291 nigel 77 {
2292 ph10 836 #ifdef SUPPORT_UTF
2293     if (utf && d >= 128)
2294 nigel 77 {
2295     #ifdef SUPPORT_UCP
2296 ph10 349 otherd = UCD_OTHERCASE(d);
2297 nigel 77 #endif /* SUPPORT_UCP */
2298     }
2299     else
2300 ph10 836 #endif /* SUPPORT_UTF */
2301     otherd = TABLE_GET(d, fcc, d);
2302 nigel 77 }
2303     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2304 nigel 93 {
2305     if (count > 0 &&
2306     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2307     {
2308     active_count--; /* Remove non-match possibility */
2309     next_active_state--;
2310     }
2311     count++;
2312     ADD_NEW(state_offset, count);
2313     }
2314 nigel 77 }
2315     break;
2316    
2317     /*-----------------------------------------------------------------*/
2318 ph10 602 case OP_QUERYI:
2319     case OP_MINQUERYI:
2320     case OP_POSQUERYI:
2321     case OP_NOTQUERYI:
2322     case OP_NOTMINQUERYI:
2323     case OP_NOTPOSQUERYI:
2324     caseless = TRUE;
2325     codevalue -= OP_STARI - OP_STAR;
2326     /* Fall through */
2327 nigel 77 case OP_QUERY:
2328     case OP_MINQUERY:
2329 nigel 93 case OP_POSQUERY:
2330 nigel 77 case OP_NOTQUERY:
2331     case OP_NOTMINQUERY:
2332 nigel 93 case OP_NOTPOSQUERY:
2333 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2334     if (clen > 0)
2335     {
2336 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2337 ph10 602 if (caseless)
2338 nigel 77 {
2339 ph10 836 #ifdef SUPPORT_UTF
2340     if (utf && d >= 128)
2341 nigel 77 {
2342     #ifdef SUPPORT_UCP
2343 ph10 349 otherd = UCD_OTHERCASE(d);
2344 nigel 77 #endif /* SUPPORT_UCP */
2345     }
2346     else
2347 ph10 836 #endif /* SUPPORT_UTF */
2348     otherd = TABLE_GET(d, fcc, d);
2349 nigel 77 }
2350     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2351 nigel 93 {
2352     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2353     {
2354     active_count--; /* Remove non-match possibility */
2355     next_active_state--;
2356     }
2357     ADD_NEW(state_offset + dlen + 1, 0);
2358     }
2359 nigel 77 }
2360     break;
2361    
2362     /*-----------------------------------------------------------------*/
2363 ph10 602 case OP_STARI:
2364     case OP_MINSTARI:
2365     case OP_POSSTARI:
2366     case OP_NOTSTARI:
2367     case OP_NOTMINSTARI:
2368     case OP_NOTPOSSTARI:
2369     caseless = TRUE;
2370     codevalue -= OP_STARI - OP_STAR;
2371     /* Fall through */
2372 nigel 77 case OP_STAR:
2373     case OP_MINSTAR:
2374 nigel 93 case OP_POSSTAR:
2375 nigel 77 case OP_NOTSTAR:
2376     case OP_NOTMINSTAR:
2377 nigel 93 case OP_NOTPOSSTAR:
2378 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2379     if (clen > 0)
2380     {
2381 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2382 ph10 602 if (caseless)
2383 nigel 77 {
2384 ph10 836 #ifdef SUPPORT_UTF
2385     if (utf && d >= 128)
2386 nigel 77 {
2387     #ifdef SUPPORT_UCP
2388 ph10 349 otherd = UCD_OTHERCASE(d);
2389 nigel 77 #endif /* SUPPORT_UCP */
2390     }
2391     else
2392 ph10 836 #endif /* SUPPORT_UTF */
2393     otherd = TABLE_GET(d, fcc, d);
2394 nigel 77 }
2395     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2396 nigel 93 {
2397     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2398     {
2399     active_count--; /* Remove non-match possibility */
2400     next_active_state--;
2401     }
2402     ADD_NEW(state_offset, 0);
2403     }
2404 nigel 77 }
2405     break;
2406    
2407     /*-----------------------------------------------------------------*/
2408 ph10 602 case OP_EXACTI:
2409     case OP_NOTEXACTI:
2410     caseless = TRUE;
2411     codevalue -= OP_STARI - OP_STAR;
2412     /* Fall through */
2413 nigel 77 case OP_EXACT:
2414 nigel 93 case OP_NOTEXACT:
2415     count = current_state->count; /* Number already matched */
2416     if (clen > 0)
2417     {
2418 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2419 ph10 602 if (caseless)
2420 nigel 93 {
2421 ph10 836 #ifdef SUPPORT_UTF
2422     if (utf && d >= 128)
2423 nigel 93 {
2424     #ifdef SUPPORT_UCP
2425 ph10 349 otherd = UCD_OTHERCASE(d);
2426 nigel 93 #endif /* SUPPORT_UCP */
2427     }
2428     else
2429 ph10 836 #endif /* SUPPORT_UTF */
2430     otherd = TABLE_GET(d, fcc, d);
2431 nigel 93 }
2432     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2433     {
2434 ph10 1334 if (++count >= (int)GET2(code, 1))
2435 ph10 836 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2436 nigel 93 else
2437     { ADD_NEW(state_offset, count); }
2438     }
2439     }
2440     break;
2441    
2442     /*-----------------------------------------------------------------*/
2443 ph10 602 case OP_UPTOI:
2444     case OP_MINUPTOI:
2445     case OP_POSUPTOI:
2446     case OP_NOTUPTOI:
2447     case OP_NOTMINUPTOI:
2448     case OP_NOTPOSUPTOI:
2449     caseless = TRUE;
2450     codevalue -= OP_STARI - OP_STAR;
2451     /* Fall through */
2452 nigel 77 case OP_UPTO:
2453     case OP_MINUPTO:
2454 nigel 93 case OP_POSUPTO:
2455 nigel 77 case OP_NOTUPTO:
2456     case OP_NOTMINUPTO:
2457 nigel 93 case OP_NOTPOSUPTO:
2458 ph10 836 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2459 nigel 77 count = current_state->count; /* Number already matched */
2460     if (clen > 0)
2461     {
2462 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2463 ph10 602 if (caseless)
2464 nigel 77 {
2465 ph10 836 #ifdef SUPPORT_UTF
2466     if (utf && d >= 128)
2467 nigel 77 {
2468     #ifdef SUPPORT_UCP
2469 ph10 349 otherd = UCD_OTHERCASE(d);
2470 nigel 77 #endif /* SUPPORT_UCP */
2471     }
2472     else
2473 ph10 836 #endif /* SUPPORT_UTF */
2474     otherd = TABLE_GET(d, fcc, d);
2475 nigel 77 }
2476     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2477     {
2478 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2479     {
2480     active_count--; /* Remove non-match possibility */
2481     next_active_state--;
2482     }
2483 ph10 1334 if (++count >= (int)GET2(code, 1))
2484 ph10 836 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2485 nigel 77 else
2486     { ADD_NEW(state_offset, count); }
2487     }
2488     }
2489     break;
2490    
2491    
2492     /* ========================================================================== */
2493     /* These are the class-handling opcodes */
2494    
2495     case OP_CLASS:
2496     case OP_NCLASS:
2497     case OP_XCLASS:
2498     {
2499     BOOL isinclass = FALSE;
2500     int next_state_offset;
2501 ph10 836 const pcre_uchar *ecode;
2502 nigel 77
2503     /* For a simple class, there is always just a 32-byte table, and we
2504     can set isinclass from it. */
2505    
2506     if (codevalue != OP_XCLASS)
2507     {
2508 ph10 836 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2509 nigel 77 if (clen > 0)
2510     {
2511     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2512 ph10 836 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2513 nigel 77 }
2514     }
2515    
2516     /* An extended class may have a table or a list of single characters,
2517     ranges, or both, and it may be positive or negative. There's a
2518     function that sorts all this out. */
2519    
2520     else
2521     {
2522     ecode = code + GET(code, 1);
2523 ph10 836 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2524 nigel 77 }
2525    
2526     /* At this point, isinclass is set for all kinds of class, and ecode
2527     points to the byte after the end of the class. If there is a
2528     quantifier, this is where it will be. */
2529    
2530 ph10 530 next_state_offset = (int)(ecode - start_code);
2531 nigel 77
2532     switch (*ecode)
2533     {
2534     case OP_CRSTAR:
2535     case OP_CRMINSTAR:
2536     ADD_ACTIVE(next_state_offset + 1, 0);
2537     if (isinclass) { ADD_NEW(state_offset, 0); }
2538     break;
2539    
2540     case OP_CRPLUS:
2541     case OP_CRMINPLUS:
2542     count = current_state->count; /* Already matched */
2543     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2544     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2545     break;
2546    
2547     case OP_CRQUERY:
2548     case OP_CRMINQUERY:
2549     ADD_ACTIVE(next_state_offset + 1, 0);
2550     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2551     break;
2552    
2553     case OP_CRRANGE:
2554     case OP_CRMINRANGE:
2555     count = current_state->count; /* Already matched */
2556 ph10 1334 if (count >= (int)GET2(ecode, 1))
2557 ph10 836 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2558 nigel 77 if (isinclass)
2559     {
2560 ph10 1334 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2561 nigel 91 if (++count >= max && max != 0) /* Max 0 => no limit */
2562 ph10 836 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2563 nigel 77 else
2564     { ADD_NEW(state_offset, count); }
2565     }
2566     break;
2567    
2568     default:
2569     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2570     break;
2571     }
2572     }
2573     break;
2574    
2575     /* ========================================================================== */
2576     /* These are the opcodes for fancy brackets of various kinds. We have
2577 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2578     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2579 ph10 341 though the other "backtracking verbs" are not supported. */
2580 ph10 345
2581 ph10 341 case OP_FAIL:
2582 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2583 ph10 345 break;
2584 nigel 77
2585     case OP_ASSERT:
2586     case OP_ASSERT_NOT:
2587     case OP_ASSERTBACK:
2588     case OP_ASSERTBACK_NOT:
2589     {
2590     int rc;
2591     int local_offsets[2];
2592     int local_workspace[1000];
2593 ph10 836 const pcre_uchar *endasscode = code + GET(code, 1);
2594 nigel 77
2595     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2596    
2597     rc = internal_dfa_exec(
2598     md, /* static match data */
2599     code, /* this subexpression's code */
2600     ptr, /* where we currently are */
2601 ph10 530 (int)(ptr - start_subject), /* start offset */
2602 nigel 77 local_offsets, /* offset vector */
2603     sizeof(local_offsets)/sizeof(int), /* size of same */
2604     local_workspace, /* workspace vector */
2605     sizeof(local_workspace)/sizeof(int), /* size of same */
2606 ph10 642 rlevel); /* function recursion level */
2607 ph10 487
2608 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2609 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2610 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2611 nigel 77 }
2612     break;
2613    
2614     /*-----------------------------------------------------------------*/
2615     case OP_COND:
2616 nigel 93 case OP_SCOND:
2617 nigel 77 {
2618     int local_offsets[1000];
2619     int local_workspace[1000];
2620 ph10 406 int codelink = GET(code, 1);
2621 ph10 397 int condcode;
2622 ph10 406
2623 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2624 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2625 ph10 398 happen for the other conditions. */
2626 nigel 77
2627 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2628 ph10 406 {
2629     rrc = 0;
2630 ph10 836 if (PUBL(callout) != NULL)
2631 ph10 397 {
2632 zherczeg 850 PUBL(callout_block) cb;
2633 ph10 397 cb.version = 1; /* Version 1 of the callout block */
2634     cb.callout_number = code[LINK_SIZE+2];
2635     cb.offset_vector = offsets;
2636 chpe 1055 #if defined COMPILE_PCRE8
2637 ph10 397 cb.subject = (PCRE_SPTR)start_subject;
2638 chpe 1055 #elif defined COMPILE_PCRE16
2639 zherczeg 852 cb.subject = (PCRE_SPTR16)start_subject;
2640 chpe 1055 #elif defined COMPILE_PCRE32
2641     cb.subject = (PCRE_SPTR32)start_subject;
2642 zherczeg 852 #endif
2643 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2644     cb.start_match = (int)(current_subject - start_subject);
2645     cb.current_position = (int)(ptr - start_subject);
2646 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2647     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2648     cb.capture_top = 1;
2649     cb.capture_last = -1;
2650     cb.callout_data = md->callout_data;
2651 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2652 ph10 836 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2653 ph10 397 }
2654 ph10 398 if (rrc > 0) break; /* Fail this thread */
2655 ph10 836 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2656 ph10 406 }
2657 ph10 398
2658 ph10 397 condcode = code[LINK_SIZE+1];
2659 ph10 406
2660 nigel 93 /* Back reference conditions are not supported */
2661 nigel 77
2662 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2663 ph10 459 return PCRE_ERROR_DFA_UCOND;
2664 nigel 93
2665     /* The DEFINE condition is always false */
2666    
2667     if (condcode == OP_DEF)
2668 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2669 nigel 93
2670     /* The only supported version of OP_RREF is for the value RREF_ANY,
2671     which means "test if in any recursion". We can't test for specifically
2672     recursed groups. */
2673    
2674 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2675 nigel 93 {
2676 ph10 836 int value = GET2(code, LINK_SIZE + 2);
2677 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2678 ph10 654 if (md->recursive != NULL)
2679 ph10 836 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2680 ph10 398 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2681 nigel 77 }
2682    
2683     /* Otherwise, the condition is an assertion */
2684    
2685     else
2686     {
2687     int rc;
2688 ph10 836 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2689     const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2690 nigel 77
2691     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2692    
2693     rc = internal_dfa_exec(
2694     md, /* fixed match data */
2695     asscode, /* this subexpression's code */
2696     ptr, /* where we currently are */
2697 ph10 530 (int)(ptr - start_subject), /* start offset */
2698 nigel 77 local_offsets, /* offset vector */
2699     sizeof(local_offsets)/sizeof(int), /* size of same */
2700     local_workspace, /* workspace vector */
2701     sizeof(local_workspace)/sizeof(int), /* size of same */
2702 ph10 642 rlevel); /* function recursion level */
2703 nigel 77
2704 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2705 nigel 77 if ((rc >= 0) ==
2706     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2707 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2708 nigel 77 else
2709 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2710 nigel 77 }
2711     }
2712     break;
2713    
2714     /*-----------------------------------------------------------------*/
2715     case OP_RECURSE:
2716     {
2717 ph10 654 dfa_recursion_info *ri;
2718 nigel 77 int local_offsets[1000];
2719     int local_workspace[1000];
2720 ph10 836 const pcre_uchar *callpat = start_code + GET(code, 1);
2721 ph10 654 int recno = (callpat == md->start_code)? 0 :
2722     GET2(callpat, 1 + LINK_SIZE);
2723 nigel 77 int rc;
2724    
2725 ph10 642 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2726 ph10 654
2727 ph10 642 /* Check for repeating a recursion without advancing the subject
2728     pointer. This should catch convoluted mutual recursions. (Some simple
2729     cases are caught at compile time.) */
2730 nigel 77
2731 ph10 654 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2732     if (recno == ri->group_num && ptr == ri->subject_position)
2733     return PCRE_ERROR_RECURSELOOP;
2734    
2735     /* Remember this recursion and where we started it so as to
2736 ph10 642 catch infinite loops. */
2737 ph10 654
2738 ph10 642 new_recursive.group_num = recno;
2739     new_recursive.subject_position = ptr;
2740     new_recursive.prevrec = md->recursive;
2741 ph10 654 md->recursive = &new_recursive;
2742 ph10 642
2743 nigel 77 rc = internal_dfa_exec(
2744     md, /* fixed match data */
2745 ph10 642 callpat, /* this subexpression's code */
2746 nigel 77 ptr, /* where we currently are */
2747 ph10 530 (int)(ptr - start_subject), /* start offset */
2748 nigel 77 local_offsets, /* offset vector */
2749     sizeof(local_offsets)/sizeof(int), /* size of same */
2750     local_workspace, /* workspace vector */
2751     sizeof(local_workspace)/sizeof(int), /* size of same */
2752 ph10 642 rlevel); /* function recursion level */
2753 nigel 77
2754 ph10 642 md->recursive = new_recursive.prevrec; /* Done this recursion */
2755 nigel 77
2756 ph10 654 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2757 ph10 642 rc));
2758    
2759 nigel 77 /* Ran out of internal offsets */
2760    
2761     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2762    
2763     /* For each successfully matched substring, set up the next state with a
2764     count of characters to skip before trying it. Note that the count is in
2765     characters, not bytes. */
2766    
2767     if (rc > 0)
2768     {
2769     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2770     {
2771 ph10 894 int charcount = local_offsets[rc+1] - local_offsets[rc];
2772 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2773 ph10 979 if (utf)
2774 ph10 982 {
2775 ph10 979 const pcre_uchar *p = start_subject + local_offsets[rc];
2776     const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2777     while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2778 ph10 982 }
2779 ph10 836 #endif
2780 nigel 77 if (charcount > 0)
2781     {
2782     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2783     }
2784     else
2785     {
2786     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2787     }
2788     }
2789     }
2790     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2791     }
2792     break;
2793    
2794     /*-----------------------------------------------------------------*/
2795 ph10 604 case OP_BRAPOS:
2796     case OP_SBRAPOS:
2797     case OP_CBRAPOS:
2798     case OP_SCBRAPOS:
2799 ph10 654 case OP_BRAPOSZERO:
2800 ph10 604 {
2801     int charcount, matched_count;
2802 ph10 836 const pcre_uchar *local_ptr = ptr;
2803 ph10 604 BOOL allow_zero;
2804 ph10 654
2805 ph10 604 if (codevalue == OP_BRAPOSZERO)
2806     {
2807     allow_zero = TRUE;
2808     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2809     }
2810 ph10 654 else allow_zero = FALSE;
2811    
2812     /* Loop to match the subpattern as many times as possible as if it were
2813     a complete pattern. */
2814    
2815 ph10 604 for (matched_count = 0;; matched_count++)
2816     {
2817     int local_offsets[2];
2818     int local_workspace[1000];
2819 ph10 654
2820 ph10 604 int rc = internal_dfa_exec(
2821     md, /* fixed match data */
2822     code, /* this subexpression's code */
2823     local_ptr, /* where we currently are */
2824     (int)(ptr - start_subject), /* start offset */
2825     local_offsets, /* offset vector */
2826     sizeof(local_offsets)/sizeof(int), /* size of same */
2827     local_workspace, /* workspace vector */
2828     sizeof(local_workspace)/sizeof(int), /* size of same */
2829 ph10 642 rlevel); /* function recursion level */
2830 ph10 654
2831 ph10 604 /* Failed to match */
2832 ph10 654
2833     if (rc < 0)
2834 ph10 604 {
2835     if (rc != PCRE_ERROR_NOMATCH) return rc;
2836     break;
2837 ph10 654 }
2838    
2839 ph10 604 /* Matched: break the loop if zero characters matched. */
2840 ph10 654
2841 ph10 604 charcount = local_offsets[1] - local_offsets[0];
2842 ph10 654 if (charcount == 0) break;
2843 ph10 604 local_ptr += charcount; /* Advance temporary position ptr */
2844 ph10 654 }
2845 ph10 604
2846     /* At this point we have matched the subpattern matched_count
2847 ph10 654 times, and local_ptr is pointing to the character after the end of the
2848     last match. */
2849 ph10 604
2850     if (matched_count > 0 || allow_zero)
2851 ph10 654 {
2852 ph10 836 const pcre_uchar *end_subpattern = code;
2853 ph10 604 int next_state_offset;
2854 ph10 654
2855 ph10 604 do { end_subpattern += GET(end_subpattern, 1); }
2856     while (*end_subpattern == OP_ALT);
2857     next_state_offset =
2858     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2859    
2860     /* Optimization: if there are no more active states, and there
2861     are no new states yet set up, then skip over the subject string
2862     right here, to save looping. Otherwise, set up the new state to swing
2863     into action when the end of the matched substring is reached. */
2864    
2865     if (i + 1 >= active_count && new_count == 0)
2866     {
2867     ptr = local_ptr;
2868     clen = 0;
2869     ADD_NEW(next_state_offset, 0);
2870     }
2871     else
2872     {
2873 ph10 836 const pcre_uchar *p = ptr;
2874     const pcre_uchar *pp = local_ptr;
2875     charcount = (int)(pp - p);
2876 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2877 ph10 979 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2878 ph10 836 #endif
2879 ph10 604 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2880     }
2881 ph10 654 }
2882     }
2883 ph10 604 break;
2884 ph10 654
2885 ph10 604 /*-----------------------------------------------------------------*/
2886 nigel 77 case OP_ONCE:
2887 ph10 733 case OP_ONCE_NC:
2888 nigel 77 {
2889     int local_offsets[2];
2890     int local_workspace[1000];
2891    
2892     int rc = internal_dfa_exec(
2893     md, /* fixed match data */
2894     code, /* this subexpression's code */
2895     ptr, /* where we currently are */
2896 ph10 530 (int)(ptr - start_subject), /* start offset */
2897 nigel 77 local_offsets, /* offset vector */
2898     sizeof(local_offsets)/sizeof(int), /* size of same */
2899     local_workspace, /* workspace vector */
2900     sizeof(local_workspace)/sizeof(int), /* size of same */
2901 ph10 642 rlevel); /* function recursion level */
2902 nigel 77
2903     if (rc >= 0)
2904     {
2905 ph10 836 const pcre_uchar *end_subpattern = code;
2906 nigel 77 int charcount = local_offsets[1] - local_offsets[0];
2907     int next_state_offset, repeat_state_offset;
2908    
2909     do { end_subpattern += GET(end_subpattern, 1); }
2910     while (*end_subpattern == OP_ALT);
2911 ph10 535 next_state_offset =
2912 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2913 nigel 77
2914     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2915     arrange for the repeat state also to be added to the relevant list.
2916     Calculate the offset, or set -1 for no repeat. */
2917    
2918     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2919     *end_subpattern == OP_KETRMIN)?
2920 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2921 nigel 77
2922     /* If we have matched an empty string, add the next state at the
2923     current character pointer. This is important so that the duplicate
2924     checking kicks in, which is what breaks infinite loops that match an
2925     empty string. */
2926    
2927     if (charcount == 0)
2928     {
2929     ADD_ACTIVE(next_state_offset, 0);
2930     }
2931    
2932     /* Optimization: if there are no more active states, and there
2933     are no new states yet set up, then skip over the subject string
2934     right here, to save looping. Otherwise, set up the new state to swing
2935 ph10 604 into action when the end of the matched substring is reached. */
2936 nigel 77
2937     else if (i + 1 >= active_count && new_count == 0)
2938     {
2939     ptr += charcount;
2940     clen = 0;
2941     ADD_NEW(next_state_offset, 0);
2942    
2943     /* If we are adding a repeat state at the new character position,
2944     we must fudge things so that it is the only current state.
2945     Otherwise, it might be a duplicate of one we processed before, and
2946     that would cause it to be skipped. */
2947    
2948     if (repeat_state_offset >= 0)
2949     {
2950     next_active_state = active_states;
2951     active_count = 0;
2952     i = -1;
2953     ADD_ACTIVE(repeat_state_offset, 0);
2954     }
2955     }
2956     else
2957     {
2958 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2959 ph10 979 if (utf)
2960 ph10 982 {
2961 ph10 979 const pcre_uchar *p = start_subject + local_offsets[0];
2962     const pcre_uchar *pp = start_subject + local_offsets[1];
2963     while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2964 ph10 982 }
2965 ph10 836 #endif
2966 nigel 77 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2967     if (repeat_state_offset >= 0)
2968     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2969     }
2970     }
2971     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2972     }
2973     break;
2974    
2975    
2976     /* ========================================================================== */
2977     /* Handle callouts */
2978    
2979     case OP_CALLOUT:
2980 ph10 406 rrc = 0;
2981 ph10 836 if (PUBL(callout) != NULL)
2982 nigel 77 {
2983 zherczeg 850 PUBL(callout_block) cb;
2984 nigel 77 cb.version = 1; /* Version 1 of the callout block */
2985     cb.callout_number = code[1];
2986     cb.offset_vector = offsets;
2987 chpe 1055 #if defined COMPILE_PCRE8
2988 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2989 chpe 1055 #elif defined COMPILE_PCRE16
2990 zherczeg 852 cb.subject = (PCRE_SPTR16)start_subject;
2991 chpe 1055 #elif defined COMPILE_PCRE32
2992     cb.subject = (PCRE_SPTR32)start_subject;
2993 zherczeg 852 #endif
2994 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2995     cb.start_match = (int)(current_subject - start_subject);
2996     cb.current_position = (int)(ptr - start_subject);
2997 nigel 77 cb.pattern_position = GET(code, 2);
2998     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2999     cb.capture_top = 1;
3000     cb.capture_last = -1;
3001     cb.callout_data = md->callout_data;
3002 ph10 654 cb.mark = NULL; /* No (*MARK) support */
3003 ph10 836 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3004 ph10 406 }
3005     if (rrc == 0)
3006 ph10 836 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3007 nigel 77 break;
3008    
3009    
3010     /* ========================================================================== */
3011     default: /* Unsupported opcode */
3012     return PCRE_ERROR_DFA_UITEM;
3013     }
3014    
3015     NEXT_ACTIVE_STATE: continue;
3016    
3017     } /* End of loop scanning active states */
3018    
3019     /* We have finished the processing at the current subject character. If no
3020     new states have been set for the next character, we have found all the
3021     matches that we are going to find. If we are at the top level and partial
3022 ph10 463 matching has been requested, check for appropriate conditions.
3023    
3024 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
3025     character. If it is equal to the original active_count (saved in
3026     workspace[1]) it means that (*F) was found on every active state. In this
3027 ph10 463 case we don't want to give a partial match.
3028 nigel 77
3029 ph10 463 The "could_continue" variable is true if a state could have continued but
3030     for the fact that the end of the subject was reached. */
3031 ph10 975
3032 nigel 77 if (new_count <= 0)
3033     {
3034 ph10 427 if (rlevel == 1 && /* Top level, and */
3035 ph10 919 could_continue && /* Some could go on, and */
3036 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
3037 ph10 427 ( /* either... */
3038     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3039     || /* or... */
3040     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3041     match_count < 0) /* no matches */
3042     ) && /* And... */
3043 ph10 916 (
3044 ph10 919 partial_newline || /* Either partial NL */
3045     ( /* or ... */
3046     ptr >= end_subject && /* End of subject and */
3047     ptr > md->start_used_ptr) /* Inspected non-empty string */
3048 ph10 975 )
3049     )
3050 nigel 77 match_count = PCRE_ERROR_PARTIAL;
3051     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3052     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3053     rlevel*2-2, SP));
3054 nigel 91 break; /* In effect, "return", but see the comment below */
3055 nigel 77 }
3056    
3057     /* One or more states are active for the next character. */
3058    
3059     ptr += clen; /* Advance to next subject character */
3060     } /* Loop to move along the subject string */
3061    
3062 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
3063     if we use "return" above, we have compiler trouble. Some compilers warn if
3064     there's nothing here because they think the function doesn't return a value. On
3065     the other hand, if we put a dummy statement here, some more clever compilers
3066     complain that it can't be reached. Sigh. */
3067 nigel 77
3068 nigel 91 return match_count;
3069 nigel 77 }
3070    
3071    
3072    
3073    
3074     /*************************************************
3075     * Execute a Regular Expression - DFA engine *
3076     *************************************************/
3077    
3078     /* This external function applies a compiled re to a subject string using a DFA
3079     engine. This function calls the internal function multiple times if the pattern
3080     is not anchored.
3081    
3082     Arguments:
3083     argument_re points to the compiled expression
3084 ph10 97 extra_data points to extra data or is NULL
3085 nigel 77 subject points to the subject string
3086     length length of subject string (may contain binary zeros)
3087     start_offset where to start in the subject string
3088     options option bits
3089     offsets vector of match offsets
3090     offsetcount size of same
3091     workspace workspace vector
3092     wscount size of same
3093    
3094     Returns: > 0 => number of match offset pairs placed in offsets
3095     = 0 => offsets overflowed; longest matches are present
3096     -1 => failed to match
3097     < -1 => some kind of unexpected problem
3098     */
3099    
3100 chpe 1055 #if defined COMPILE_PCRE8
3101 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3102 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3103     const char *subject, int length, int start_offset, int options, int *offsets,
3104     int offsetcount, int *workspace, int wscount)
3105 chpe 1055 #elif defined COMPILE_PCRE16
3106 ph10 836 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3107 zherczeg 852 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3108 ph10 836 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3109     int offsetcount, int *workspace, int wscount)
3110 chpe 1055 #elif defined COMPILE_PCRE32
3111     PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3112     pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3113     PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3114     int offsetcount, int *workspace, int wscount)
3115 ph10 836 #endif
3116 nigel 77 {
3117 zherczeg 852 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3118 nigel 77 dfa_match_data match_block;
3119 nigel 91 dfa_match_data *md = &match_block;
3120 ph10 836 BOOL utf, anchored, startline, firstline;
3121     const pcre_uchar *current_subject, *end_subject;
3122 nigel 77 const pcre_study_data *study = NULL;
3123    
3124 ph10 836 const pcre_uchar *req_char_ptr;
3125     const pcre_uint8 *start_bits = NULL;
3126     BOOL has_first_char = FALSE;
3127     BOOL has_req_char = FALSE;
3128     pcre_uchar first_char = 0;
3129     pcre_uchar first_char2 = 0;
3130     pcre_uchar req_char = 0;
3131     pcre_uchar req_char2 = 0;
3132 nigel 91 int newline;
3133 nigel 77
3134     /* Plausibility checks */
3135    
3136     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3137     if (re == NULL || subject == NULL || workspace == NULL ||
3138     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3139     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3140     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3141 ph10 1189 if (length < 0) return PCRE_ERROR_BADLENGTH;
3142 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3143 nigel 77
3144 ph10 960 /* Check that the first field in the block is the magic number. If it is not,
3145     return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3146     REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3147     means that the pattern is likely compiled with different endianness. */
3148 nigel 77
3149 ph10 960 if (re->magic_number != MAGIC_NUMBER)
3150     return re->magic_number == REVERSED_MAGIC_NUMBER?
3151     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3152     if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3153    
3154 ph10 975 /* If restarting after a partial match, do some sanity checks on the contents
3155 ph10 960 of the workspace. */
3156    
3157     if ((options & PCRE_DFA_RESTART) != 0)
3158     {
3159 ph10 975 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3160 ph10 960 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3161 ph10 975 return PCRE_ERROR_DFA_BADRESTART;
3162     }
3163 ph10 960
3164     /* Set up study, callout, and table data */
3165    
3166 nigel 91 md->tables = re->tables;
3167     md->callout_data = NULL;
3168 nigel 77
3169     if (extra_data != NULL)
3170     {
3171     unsigned int flags = extra_data->flags;
3172     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3173     study = (const pcre_study_data *)extra_data->study_data;
3174     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3175 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3176     return PCRE_ERROR_DFA_UMLIMIT;
3177 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3178 nigel 91 md->callout_data = extra_data->callout_data;
3179 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3180 nigel 91 md->tables = extra_data->tables;
3181 nigel 77 }
3182 ph10 461
3183 nigel 77 /* Set some local values */
3184    
3185 ph10 836 current_subject = (const pcre_uchar *)subject + start_offset;
3186     end_subject = (const pcre_uchar *)subject + length;
3187     req_char_ptr = current_subject - 1;
3188 nigel 77
3189 ph10 836 #ifdef SUPPORT_UTF
3190 chpe 1055 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3191 ph10 836 utf = (re->options & PCRE_UTF8) != 0;
3192 nigel 91 #else
3193 ph10 836 utf = FALSE;
3194 nigel 91 #endif
3195 nigel 77
3196 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3197     (re->options & PCRE_ANCHORED) != 0;
3198    
3199 nigel 77 /* The remaining fixed data for passing around. */
3200    
3201 ph10 836 md->start_code = (const pcre_uchar *)argument_re +
3202 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3203 ph10 836 md->start_subject = (const pcre_uchar *)subject;
3204 nigel 91 md->end_subject = end_subject;
3205 ph10 442 md->start_offset = start_offset;
3206 nigel 91 md->moptions = options;
3207     md->poptions = re->options;
3208 nigel 77
3209 ph10 231 /* If the BSR option is not set at match time, copy what was set
3210     at compile time. */
3211    
3212     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3213     {
3214     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3215     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3216     #ifdef BSR_ANYCRLF
3217     else md->moptions |= PCRE_BSR_ANYCRLF;
3218 ph10 243 #endif
3219     }
3220 ph10 231
3221 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3222     nothing is set at run time, whatever was used at compile time applies. */
3223 nigel 91
3224 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3225 nigel 93 PCRE_NEWLINE_BITS)
3226 nigel 91 {
3227 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3228 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3229     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3230 nigel 91 case PCRE_NEWLINE_CR+
3231 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3232 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3233 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3234 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3235 nigel 91 }
3236    
3237 ph10 149 if (newline == -2)
3238 nigel 91 {
3239 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3240     }
3241     else if (newline < 0)
3242     {
3243 nigel 93 md->nltype = NLTYPE_ANY;
3244 nigel 91 }
3245     else
3246     {
3247 nigel 93 md->nltype = NLTYPE_FIXED;
3248     if (newline > 255)
3249     {
3250     md->nllen = 2;
3251     md->nl[0] = (newline >> 8) & 255;
3252     md->nl[1] = newline & 255;
3253     }
3254     else
3255     {
3256     md->nllen = 1;
3257     md->nl[0] = newline;
3258     }
3259 nigel 91 }
3260    
3261 nigel 77 /* Check a UTF string if required. If the check fails and offsetcount is at
3262     least 2, the error offset and error code are passed back in the offsets vector. */
3263    
3264 ph10 836 #ifdef SUPPORT_UTF
3265     if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3266 nigel 77 {
3267 ph10 654 int erroroffset;
3268 ph10 836 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3269 ph10 606 if (errorcode != 0)
3270 ph10 598 {
3271     if (offsetcount >= 2)
3272     {
3273 ph10 606 offsets[0] = erroroffset;
3274 ph10 598 offsets[1] = errorcode;
3275 ph10 654 }
3276 chpe 1055 #if defined COMPILE_PCRE8
3277     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3278 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3279 chpe 1055 #elif defined COMPILE_PCRE16
3280     return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3281     PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3282     #elif defined COMPILE_PCRE32
3283     return PCRE_ERROR_BADUTF32;
3284     #endif
3285 ph10 654 }
3286 chpe 1055 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3287 ph10 606 if (start_offset > 0 && start_offset < length &&
3288 ph10 836 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3289 ph10 606 return PCRE_ERROR_BADUTF8_OFFSET;
3290 chpe 1055 #endif
3291 nigel 77 }
3292     #endif
3293    
3294     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3295     is a feature that makes it possible to save compiled regex and re-use them
3296     in other programs later. */
3297    
3298 ph10 836 if (md->tables == NULL) md->tables = PRIV(default_tables);
3299 nigel 77
3300 ph10 881 /* The "must be at the start of a line" flags are used in a loop when finding
3301     where to start. */
3302 nigel 77
3303 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3304 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3305    
3306     /* Set up the first character to match, if available. The first_char value is
3307     never set for an anchored regular expression, but the anchoring may be forced
3308     at run time, so we have to test for anchoring. The first char may be unset for
3309     an unanchored pattern, of course. If there's no first char and the pattern was
3310     studied, there may be a bitmap of possible first characters. */
3311    
3312     if (!anchored)
3313     {
3314 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3315 nigel 77 {
3316 ph10 836 has_first_char = TRUE;
3317 ph10 904 first_char = first_char2 = (pcre_uchar)(re->first_char);
3318 ph10 836 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3319     {
3320     first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3321     #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3322     if (utf && first_char > 127)
3323     first_char2 = UCD_OTHERCASE(first_char);
3324     #endif
3325     }
3326 nigel 77 }
3327     else
3328     {
3329 ph10 455 if (!startline && study != NULL &&
3330     (study->flags & PCRE_STUDY_MAPPED) != 0)
3331 nigel 77 start_bits = study->start_bits;
3332     }
3333     }
3334    
3335     /* For anchored or unanchored matches, there may be a "last known required
3336     character" set. */
3337    
3338 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3339 nigel 77 {
3340 ph10 836 has_req_char = TRUE;
3341 ph10 904 req_char = req_char2 = (pcre_uchar)(re->req_char);
3342 ph10 836 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3343     {
3344     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3345     #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3346     if (utf && req_char > 127)
3347     req_char2 = UCD_OTHERCASE(req_char);
3348     #endif
3349     }
3350 nigel 77 }
3351    
3352     /* Call the main matching function, looping for a non-anchored regex after a
3353 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3354     a match. */
3355 nigel 77
3356     for (;;)
3357     {
3358     int rc;
3359    
3360     if ((options & PCRE_DFA_RESTART) == 0)
3361     {
3362 ph10 836 const pcre_uchar *save_end_subject = end_subject;
3363 nigel 77
3364 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3365     line of a multiline string. Implement this by temporarily adjusting
3366     end_subject so that we stop scanning at a newline. If the match fails at
3367     the newline, later code breaks this loop. */
3368 nigel 77
3369     if (firstline)
3370     {
3371 ph10 836 PCRE_PUCHAR t = current_subject;
3372     #ifdef SUPPORT_UTF
3373     if (utf)
3374 ph10 371 {
3375     while (t < md->end_subject && !IS_NEWLINE(t))
3376 ph10 365 {
3377     t++;
3378 ph10 836 ACROSSCHAR(t < end_subject, *t, t++);
3379 ph10 371 }
3380 ph10 365 }
3381     else
3382 ph10 371 #endif
3383 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3384 nigel 77 end_subject = t;
3385     }
3386 ph10 392
3387 ph10 389 /* There are some optimizations that avoid running the match if a known
3388 ph10 455 starting point is not found. However, there is an option that disables
3389 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3390 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3391     match-time options. */
3392 nigel 77
3393 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3394 ph10 392 {
3395 ph10 836 /* Advance to a known first char. */
3396 ph10 392
3397 ph10 836 if (has_first_char)
3398 nigel 77 {
3399 ph10 836 if (first_char != first_char2)
3400 chpe 1100 {
3401     pcre_uchar csc;
3402 ph10 389 while (current_subject < end_subject &&
3403 chpe 1100 (csc = RAWUCHARTEST(current_subject)) != first_char && csc != first_char2)
3404 ph10 389 current_subject++;
3405 chpe 1100 }
3406 ph10 389 else
3407 ph10 392 while (current_subject < end_subject &&
3408 chpe 1100 RAWUCHARTEST(current_subject) != first_char)
3409 ph10 389 current_subject++;
3410     }
3411 ph10 392
3412 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3413 ph10 392
3414 ph10 389 else if (startline)
3415     {
3416     if (current_subject > md->start_subject + start_offset)
3417     {
3418 ph10 836 #ifdef SUPPORT_UTF
3419     if (utf)
3420 ph10 365 {
3421 ph10 392 while (current_subject < end_subject &&
3422 ph10 389 !WAS_NEWLINE(current_subject))
3423     {
3424 ph10 365 current_subject++;
3425 ph10 836 ACROSSCHAR(current_subject < end_subject, *current_subject,
3426     current_subject++);
3427 ph10 389 }
3428 ph10 371 }
3429 ph10 389 else
3430     #endif
3431     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3432     current_subject++;
3433 ph10 392
3434 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3435     ANYCRLF, and we are now at a LF, advance the match position by one
3436     more character. */
3437 ph10 392
3438 chpe 1100 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3439 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3440     current_subject < end_subject &&
3441 chpe 1100 RAWUCHARTEST(current_subject) == CHAR_NL)
3442 ph10 389 current_subject++;
3443 ph10 365 }
3444 nigel 77 }
3445 ph10 392
3446 ph10 389 /* Or to a non-unique first char after study */
3447 ph10 392
3448 ph10 389 else if (start_bits != NULL)
3449 nigel 77 {
3450 ph10 389 while (current_subject < end_subject)
3451     {
3452 chpe 1100 register pcre_uint32 c = RAWUCHARTEST(current_subject);
3453 ph10 836 #ifndef COMPILE_PCRE8
3454     if (c > 255) c = 255;
3455     #endif
3456 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3457 ph10 538 {
3458     current_subject++;
3459 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3460     /* In non 8-bit mode, the iteration will stop for
3461     characters > 255 at the beginning or not stop at all. */
3462     if (utf)
3463     ACROSSCHAR(current_subject < end_subject, *current_subject,
3464     current_subject++);
3465 ph10 545 #endif
3466 ph10 538 }
3467     else break;
3468 ph10 389 }
3469 nigel 77 }
3470 ph10 392 }
3471 nigel 77
3472     /* Restore fudged end_subject */
3473    
3474     end_subject = save_end_subject;
3475    
3476 ph10 461 /* The following two optimizations are disabled for partial matching or if
3477     disabling is explicitly requested (and of course, by the test above, this
3478 ph10 455 code is not obeyed when restarting after a partial match). */
3479 ph10 461
3480 ph10 728 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3481 ph10 455 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3482 ph10 461 {
3483 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3484     is a lower bound; no actual string of that length may actually match the
3485     pattern. Although the value is, strictly, in characters, we treat it as
3486     bytes to avoid spending too much time in this optimization. */
3487 nigel 77
3488 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3489 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3490 ph10 455 return PCRE_ERROR_NOMATCH;
3491 ph10 461
3492 ph10 836 /* If req_char is set, we know that that character must appear in the
3493     subject for the match to succeed. If the first character is set, req_char
3494 ph10 455 must be later in the subject; otherwise the test starts at the match
3495     point. This optimization can save a huge amount of work in patterns with
3496     nested unlimited repeats that aren't going to match. Writing separate
3497     code for cased/caseless versions makes it go faster, as does using an
3498     autoincrement and backing off on a match.
3499 ph10 461
3500 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3501     can take a long time, and give bad performance on quite ordinary
3502     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3503     string... so we don't do this when the string is sufficiently long. */
3504 ph10 461
3505 ph10 836 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3506 nigel 77 {
3507 ph10 836 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3508 ph10 461
3509 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3510     place we found it at last time. */
3511 ph10 461
3512 ph10 836 if (p > req_char_ptr)
3513 nigel 77 {
3514 ph10 836 if (req_char != req_char2)
3515 ph10 455 {
3516     while (p < end_subject)
3517     {
3518 chpe 1100 register pcre_uint32 pp = RAWUCHARINCTEST(p);
3519 ph10 836 if (pp == req_char || pp == req_char2) { p--; break; }
3520 ph10 455 }
3521     }
3522     else
3523     {
3524     while (p < end_subject)
3525     {
3526 chpe 1100 if (RAWUCHARINCTEST(p) == req_char) { p--; break; }
3527 ph10 455 }
3528     }
3529 ph10 461
3530 ph10 455 /* If we can't find the required character, break the matching loop,
3531     which will cause a return of PCRE_ERROR_NOMATCH. */
3532 ph10 461
3533 ph10 455 if (p >= end_subject) break;
3534 ph10 461
3535 ph10 455 /* If we have found the required character, save the point where we
3536     found it, so that we don't search again next time round the loop if
3537     the start hasn't passed this character yet. */
3538 ph10 461
3539 ph10 836 req_char_ptr = p;
3540 nigel 77 }
3541 ph10 461 }
3542 nigel 77 }
3543 ph10 455 } /* End of optimizations that are done when not restarting */
3544 nigel 77
3545     /* OK, now we can do the business */
3546    
3547 ph10 435 md->start_used_ptr = current_subject;
3548 ph10 654 md->recursive = NULL;
3549 ph10 461
3550 nigel 77 rc = internal_dfa_exec(
3551 nigel 91 md, /* fixed match data */
3552     md->start_code, /* this subexpression's code */
3553     current_subject, /* where we currently are */
3554     start_offset, /* start offset in subject */
3555     offsets, /* offset vector */
3556     offsetcount, /* size of same */
3557     workspace, /* workspace vector */
3558     wscount, /* size of same */
3559 ph10 642 0); /* function recurse level */
3560 nigel 77
3561     /* Anything other than "no match" means we are done, always; otherwise, carry
3562     on only if not anchored. */
3563    
3564 ph10 1320 if (rc != PCRE_ERROR_NOMATCH || anchored)
3565 ph10 1251 {
3566     if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3567     {
3568     offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3569     offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3570 ph10 1320 if (offsetcount > 2)
3571 ph10 1251 offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3572     }
3573     return rc;
3574 ph10 1320 }
3575 nigel 77
3576     /* Advance to the next subject character unless we are at the end of a line
3577     and firstline is set. */
3578    
3579 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3580 nigel 77 current_subject++;
3581 ph10 836 #ifdef SUPPORT_UTF
3582     if (utf)
3583 nigel 77 {
3584 ph10 836 ACROSSCHAR(current_subject < end_subject, *current_subject,
3585     current_subject++);
3586 nigel 77 }
3587 ph10 836 #endif
3588 nigel 77 if (current_subject > end_subject) break;
3589    
3590 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3591 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3592     or ANY or ANYCRLF, advance the match position by one more character. */
3593 nigel 93
3594 chpe 1100 if (RAWUCHARTEST(current_subject - 1) == CHAR_CR &&
3595 ph10 226 current_subject < end_subject &&
3596 chpe 1100 RAWUCHARTEST(current_subject) == CHAR_NL &&
3597 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3598 ph10 226 (md->nltype == NLTYPE_ANY ||
3599     md->nltype == NLTYPE_ANYCRLF ||
3600     md->nllen == 2))
3601 nigel 93 current_subject++;
3602    
3603     } /* "Bumpalong" loop */
3604    
3605 nigel 77 return PCRE_ERROR_NOMATCH;
3606     }
3607    
3608     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12