/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 979 - (hide annotations) (download)
Sun Jun 17 19:08:41 2012 UTC (22 months ago) by ph10
File MIME type: text/plain
File size: 125731 byte(s)
Fix DFA bug (3 cases) when UTF code was being obeyed in non-UTF mode.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 836 Copyright (c) 1997-2012 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43 ph10 960 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 nigel 93 applications. */
45 nigel 77
46    
47 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48     the performance of his patterns greatly. I could not use it as it stood, as it
49     was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
51    
52     The issue is the check for duplicate states, which is done by a simple linear
53     search up the state list. (Grep for "duplicate" below to find the code.) For
54     many patterns, there will never be many states active at one time, so a simple
55     linear search is fine. In patterns that have many active states, it might be a
56     bottleneck. The suggested code used an indexing scheme to remember which states
57     had previously been used for each character, and avoided the linear search when
58     it knew there was no chance of a duplicate. This was implemented when adding
59     states to the state lists.
60    
61     I wrote some thread-safe, not-limited code to try something similar at the time
62     of checking for duplicates (instead of when adding states), using index vectors
63     on the stack. It did give a 13% improvement with one specially constructed
64     pattern for certain subject strings, but on other strings and on many of the
65     simpler patterns in the test suite it did worse. The major problem, I think,
66     was the extra time to initialize the index. This had to be done for each call
67     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68     only once - I suspect this was the cause of the problems with the tests.)
69    
70 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
71 ph10 439 in others, so I abandoned this code. */
72    
73    
74    
75 ph10 200 #ifdef HAVE_CONFIG_H
76 ph10 236 #include "config.h"
77 ph10 200 #endif
78 ph10 199
79 nigel 93 #define NLBLOCK md /* Block containing newline information */
80     #define PSSTART start_subject /* Field containing processed string start */
81     #define PSEND end_subject /* Field containing processed string end */
82    
83 nigel 77 #include "pcre_internal.h"
84    
85    
86     /* For use to indent debugging output */
87    
88     #define SP " "
89    
90    
91     /*************************************************
92     * Code parameters and static tables *
93     *************************************************/
94    
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. Each offset is
added to the opcode value when the repeated item is one of the types named in
the macro (see the switch on the item type in internal_dfa_exec()). */

#define OP_PROP_EXTRA       300   /* item is OP_PROP or OP_NOTPROP */
#define OP_EXTUNI_EXTRA     320   /* item is OP_EXTUNI */
#define OP_ANYNL_EXTRA      340   /* item is OP_ANYNL */
#define OP_HSPACE_EXTRA     360   /* item is OP_HSPACE or OP_NOT_HSPACE */
#define OP_VSPACE_EXTRA     380   /* item is OP_VSPACE or OP_NOT_VSPACE */
105 nigel 77
106    
107     /* This table identifies those opcodes that are followed immediately by a
108 ph10 510 character that is to be tested in some way. This makes it possible to
109 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
110     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
112 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
113     modified, the three tables that follow must also be modified. */
114 nigel 77
115 ph10 836 static const pcre_uint8 coptable[] = {
116 nigel 77 0, /* End */
117 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
120 ph10 498 0, 0, /* \P, \p */
121 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 ph10 498 0, /* \X */
123 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
124 nigel 77 1, /* Char */
125 ph10 602 1, /* Chari */
126 nigel 77 1, /* not */
127 ph10 602 1, /* noti */
128 nigel 77 /* Positive single-char repeats */
129     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131     1+IMM2_SIZE, /* exact */
132     1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135     1+IMM2_SIZE, /* exact I */
136     1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 nigel 77 /* Negative single-char repeats - only for chars < 256 */
138     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140     1+IMM2_SIZE, /* NOT exact */
141     1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144     1+IMM2_SIZE, /* NOT exact I */
145     1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 nigel 77 /* Positive type repeats */
147     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149     1+IMM2_SIZE, /* Type exact */
150     1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 nigel 77 /* Character class & ref repeats */
152     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153     0, 0, /* CRRANGE, CRMINRANGE */
154     0, /* CLASS */
155     0, /* NCLASS */
156     0, /* XCLASS - variable length */
157     0, /* REF */
158 ph10 602 0, /* REFI */
159 nigel 77 0, /* RECURSE */
160     0, /* CALLOUT */
161     0, /* Alt */
162     0, /* Ket */
163     0, /* KetRmax */
164     0, /* KetRmin */
165 ph10 604 0, /* KetRpos */
166 ph10 637 0, /* Reverse */
167 nigel 77 0, /* Assert */
168     0, /* Assert not */
169     0, /* Assert behind */
170     0, /* Assert behind not */
171 ph10 723 0, 0, /* ONCE, ONCE_NC */
172     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
173 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
174 ph10 498 0, 0, /* CREF, NCREF */
175     0, 0, /* RREF, NRREF */
176 nigel 93 0, /* DEF */
177 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
178 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
179     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
180     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
181     0, 0 /* CLOSE, SKIPZERO */
182 nigel 77 };
183    
184 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
185 ph10 462 remember the fact that a character could have been inspected when the end of
186 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
187     two tables that follow must also be modified. */
188 ph10 462
189 ph10 836 static const pcre_uint8 poptable[] = {
190 ph10 462 0, /* End */
191 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
192 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
193     1, 1, 1, /* Any, AllAny, Anybyte */
194 ph10 498 1, 1, /* \P, \p */
195 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
196 ph10 498 1, /* \X */
197 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
198 ph10 462 1, /* Char */
199 ph10 602 1, /* Chari */
200 ph10 462 1, /* not */
201 ph10 602 1, /* noti */
202 ph10 462 /* Positive single-char repeats */
203     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
204     1, 1, 1, /* upto, minupto, exact */
205     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
206 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
207     1, 1, 1, /* upto I, minupto I, exact I */
208     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
209 ph10 462 /* Negative single-char repeats - only for chars < 256 */
210     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
211     1, 1, 1, /* NOT upto, minupto, exact */
212     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
213 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
214     1, 1, 1, /* NOT upto I, minupto I, exact I */
215     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
216 ph10 462 /* Positive type repeats */
217     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
218     1, 1, 1, /* Type upto, minupto, exact */
219     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
220     /* Character class & ref repeats */
221     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
222     1, 1, /* CRRANGE, CRMINRANGE */
223     1, /* CLASS */
224     1, /* NCLASS */
225     1, /* XCLASS - variable length */
226     0, /* REF */
227 ph10 602 0, /* REFI */
228 ph10 462 0, /* RECURSE */
229     0, /* CALLOUT */
230     0, /* Alt */
231     0, /* Ket */
232     0, /* KetRmax */
233     0, /* KetRmin */
234 ph10 604 0, /* KetRpos */
235 ph10 637 0, /* Reverse */
236 ph10 462 0, /* Assert */
237     0, /* Assert not */
238     0, /* Assert behind */
239     0, /* Assert behind not */
240 ph10 723 0, 0, /* ONCE, ONCE_NC */
241     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
242 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
243 ph10 498 0, 0, /* CREF, NCREF */
244     0, 0, /* RREF, NRREF */
245 ph10 462 0, /* DEF */
246 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
247 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
248     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
249     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
250     0, 0 /* CLOSE, SKIPZERO */
251 ph10 462 };
252    
253 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
254     and \w */
255    
256 ph10 836 static const pcre_uint8 toptable1[] = {
257 ph10 168 0, 0, 0, 0, 0, 0,
258 nigel 77 ctype_digit, ctype_digit,
259     ctype_space, ctype_space,
260     ctype_word, ctype_word,
261 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
262 nigel 77 };
263    
264 ph10 836 static const pcre_uint8 toptable2[] = {
265 ph10 168 0, 0, 0, 0, 0, 0,
266 nigel 77 ctype_digit, 0,
267     ctype_space, 0,
268     ctype_word, 0,
269 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
270 nigel 77 };
271    
272    
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. internal_dfa_exec() divides the
workspace size by this to find how many states fit in each of its two lists.
The expansion is fully parenthesized so that it behaves as a single operand
wherever it is used. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
285 nigel 77
286    
#ifdef PCRE_DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Printable values are
written as themselves; everything else is written as a \x hex escape.

Arguments:
  p           points to string
  length      number of items of type pcre_uchar to print
  f           where to print

Returns:      nothing
*/

static void
pchars(const pcre_uchar *p, int length, FILE *f)
{
while (length-- > 0)
  {
  unsigned int c = *p++;

  /* isprint() is only defined for values representable as unsigned char (or
  EOF), so anything larger goes straight to the hex branch. NOTE(review): this
  matters only if pcre_uchar can be wider than 8 bits - confirm against
  pcre_internal.h. */

  if (c <= 0xffu && isprint((int)c))
    fprintf(f, "%c", (int)c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
#endif
315    
316    
317    
318     /*************************************************
319     * Execute a Regular Expression - DFA engine *
320     *************************************************/
321    
322     /* This internal function applies a compiled pattern to a subject string,
323     starting at a given point, using a DFA engine. This function is called from the
324     external one, possibly multiple times if the pattern is not anchored. The
325     function calls itself recursively for some kinds of subpattern.
326    
327     Arguments:
328     md the match_data block with fixed information
329     this_start_code the opening bracket of this subexpression's code
330     current_subject where we currently are in the subject string
331     start_offset start offset in the subject string
332     offsets vector to contain the matching string offsets
333     offsetcount size of same
334     workspace vector of workspace
335     wscount size of same
336     rlevel function call recursion level
337    
338 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
339 ph10 341 = 0 => offsets overflowed; longest matches are present
340 nigel 77 -1 => failed to match
341     < -1 => some kind of unexpected problem
342    
343     The following macros are used for adding states to the two state vectors (one
344     for the current character, one for the following character). */
345    
/* State-list append macros. Each one appends a stateblock to either the
active list (ADD_ACTIVE*) or the new list (ADD_NEW*) of the enclosing function,
using that function's local variables active_count / new_count, wscount,
next_active_state / next_new_state and rlevel. If the list is already full,
the enclosing function returns PCRE_ERROR_DFA_WSSIZE; the counter is
incremented in either case. The two-argument forms leave the "data" field of
the new entry unset.

Each macro is wrapped in do { } while (0) so that it acts as one statement and
cannot capture a following "else". The arguments are evaluated a second time
by DPRINTF when debugging output is enabled, so they must not have side
effects. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  while (0)
388    
389     /* And now, here is the code */
390    
391     static int
392     internal_dfa_exec(
393     dfa_match_data *md,
394 ph10 836 const pcre_uchar *this_start_code,
395     const pcre_uchar *current_subject,
396 nigel 77 int start_offset,
397     int *offsets,
398     int offsetcount,
399     int *workspace,
400     int wscount,
401 ph10 642 int rlevel)
402 nigel 77 {
403     stateblock *active_states, *new_states, *temp_states;
404     stateblock *next_active_state, *next_new_state;
405    
406 ph10 836 const pcre_uint8 *ctypes, *lcc, *fcc;
407     const pcre_uchar *ptr;
408     const pcre_uchar *end_code, *first_op;
409 nigel 77
410 ph10 642 dfa_recursion_info new_recursive;
411    
412 nigel 77 int active_count, new_count, match_count;
413    
414     /* Some fields in the md block are frequently referenced, so we load them into
415     independent variables in the hope that this will perform better. */
416    
417 ph10 836 const pcre_uchar *start_subject = md->start_subject;
418     const pcre_uchar *end_subject = md->end_subject;
419     const pcre_uchar *start_code = md->start_code;
420 nigel 77
421 ph10 836 #ifdef SUPPORT_UTF
422     BOOL utf = (md->poptions & PCRE_UTF8) != 0;
423 nigel 93 #else
424 ph10 836 BOOL utf = FALSE;
425 nigel 87 #endif
426 nigel 77
427 ph10 916 BOOL reset_could_continue = FALSE;
428    
429 nigel 77 rlevel++;
430     offsetcount &= (-2);
431    
432     wscount -= 2;
433     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
434     (2 * INTS_PER_STATEBLOCK);
435    
436     DPRINTF(("\n%.*s---------------------\n"
437 ph10 642 "%.*sCall to internal_dfa_exec f=%d\n",
438     rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
439 nigel 77
440     ctypes = md->tables + ctypes_offset;
441     lcc = md->tables + lcc_offset;
442     fcc = md->tables + fcc_offset;
443    
444     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
445    
446     active_states = (stateblock *)(workspace + 2);
447     next_new_state = new_states = active_states + wscount;
448     new_count = 0;
449    
450 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
451 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
452 ph10 836 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
453     ? IMM2_SIZE:0);
454 nigel 93
455 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
456     the alternative states onto the list, and find out where the end is. This
457     makes it possible to use this function recursively, when we want to stop at a
458     matching internal ket rather than at the end.
459    
460     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
461     a backward assertion. In that case, we have to find out the maximum amount to
462     move back, and set up each alternative appropriately. */
463    
464 nigel 93 if (*first_op == OP_REVERSE)
465 nigel 77 {
466     int max_back = 0;
467     int gone_back;
468    
469     end_code = this_start_code;
470     do
471     {
472     int back = GET(end_code, 2+LINK_SIZE);
473     if (back > max_back) max_back = back;
474     end_code += GET(end_code, 1);
475     }
476     while (*end_code == OP_ALT);
477    
478     /* If we can't go back the amount required for the longest lookbehind
479     pattern, go back as far as we can; some alternatives may still be viable. */
480    
481 ph10 836 #ifdef SUPPORT_UTF
482 nigel 77 /* In character mode we have to step back character by character */
483    
484 ph10 836 if (utf)
485 nigel 77 {
486     for (gone_back = 0; gone_back < max_back; gone_back++)
487     {
488     if (current_subject <= start_subject) break;
489     current_subject--;
490 ph10 836 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
491 nigel 77 }
492     }
493     else
494     #endif
495    
496     /* In byte-mode we can do this quickly. */
497    
498     {
499     gone_back = (current_subject - max_back < start_subject)?
500 ph10 530 (int)(current_subject - start_subject) : max_back;
501 nigel 77 current_subject -= gone_back;
502     }
503 ph10 461
504 ph10 435 /* Save the earliest consulted character */
505 nigel 77
506 ph10 461 if (current_subject < md->start_used_ptr)
507     md->start_used_ptr = current_subject;
508    
509 nigel 77 /* Now we can process the individual branches. */
510    
511     end_code = this_start_code;
512     do
513     {
514     int back = GET(end_code, 2+LINK_SIZE);
515     if (back <= gone_back)
516     {
517 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
518 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
519     }
520     end_code += GET(end_code, 1);
521     }
522     while (*end_code == OP_ALT);
523     }
524    
525     /* This is the code for a "normal" subpattern (not a backward assertion). The
526     start of a whole pattern is always one of these. If we are at the top level,
527     we may be asked to restart matching from the same point that we reached for a
528     previous partial match. We still have to scan through the top-level branches to
529     find the end state. */
530    
531     else
532     {
533     end_code = this_start_code;
534    
535     /* Restarting */
536    
537     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
538     {
539     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
540     new_count = workspace[1];
541     if (!workspace[0])
542     memcpy(new_states, active_states, new_count * sizeof(stateblock));
543     }
544    
545     /* Not restarting */
546    
547     else
548     {
549 nigel 93 int length = 1 + LINK_SIZE +
550 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
551 ph10 836 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
552     ? IMM2_SIZE:0);
553 nigel 77 do
554     {
555 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
556 nigel 77 end_code += GET(end_code, 1);
557 nigel 93 length = 1 + LINK_SIZE;
558 nigel 77 }
559     while (*end_code == OP_ALT);
560     }
561     }
562    
563     workspace[0] = 0; /* Bit indicating which vector is current */
564    
565 ph10 836 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
566 nigel 77
567     /* Loop for scanning the subject */
568    
569     ptr = current_subject;
570     for (;;)
571     {
572     int i, j;
573 nigel 91 int clen, dlen;
574     unsigned int c, d;
575 ph10 428 int forced_fail = 0;
576 ph10 975 BOOL partial_newline = FALSE;
577 ph10 916 BOOL could_continue = reset_could_continue;
578 ph10 975 reset_could_continue = FALSE;
579    
580 nigel 77 /* Make the new state list into the active state list and empty the
581     new state list. */
582    
583     temp_states = active_states;
584     active_states = new_states;
585     new_states = temp_states;
586     active_count = new_count;
587     new_count = 0;
588    
589     workspace[0] ^= 1; /* Remember for the restarting feature */
590     workspace[1] = active_count;
591    
592 ph10 475 #ifdef PCRE_DEBUG
593 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
594 ph10 836 pchars(ptr, STRLEN_UC(ptr), stdout);
595 nigel 77 printf("\"\n");
596    
597     printf("%.*sActive states: ", rlevel*2-2, SP);
598     for (i = 0; i < active_count; i++)
599     printf("%d/%d ", active_states[i].offset, active_states[i].count);
600     printf("\n");
601     #endif
602    
603     /* Set the pointers for adding new states */
604    
605     next_active_state = active_states + active_count;
606     next_new_state = new_states;
607    
608     /* Load the current character from the subject outside the loop, as many
609     different states may want to look at it, and we assume that at least one
610     will. */
611    
612     if (ptr < end_subject)
613     {
614 ph10 979 clen = 1; /* Number of data items in the character */
615 ph10 836 #ifdef SUPPORT_UTF
616     if (utf) { GETCHARLEN(c, ptr, clen); } else
617     #endif /* SUPPORT_UTF */
618 nigel 77 c = *ptr;
619     }
620     else
621     {
622 nigel 93 clen = 0; /* This indicates the end of the subject */
623     c = NOTACHAR; /* This value should never actually be used */
624 nigel 77 }
625    
626     /* Scan up the active states and act on each one. The result of an action
627     may be to add more states to the currently active list (e.g. on hitting a
628     parenthesis) or it may be to put states on the new list, for considering
629     when we move the character pointer on. */
630    
631     for (i = 0; i < active_count; i++)
632     {
633     stateblock *current_state = active_states + i;
634 ph10 654 BOOL caseless = FALSE;
635 ph10 836 const pcre_uchar *code;
636 nigel 77 int state_offset = current_state->offset;
637 ph10 397 int count, codevalue, rrc;
638 nigel 77
639 ph10 475 #ifdef PCRE_DEBUG
640 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
641 nigel 93 if (clen == 0) printf("EOL\n");
642 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
643     else printf("0x%02x\n", c);
644     #endif
645    
646     /* A negative offset is a special case meaning "hold off going to this
647     (negated) state until the number of characters in the data field have
648 ph10 975 been skipped". If the could_continue flag was passed over from a previous
649 ph10 916 state, arrange for it to be passed on. */
650 nigel 77
651     if (state_offset < 0)
652     {
653     if (current_state->data > 0)
654     {
655     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
656     ADD_NEW_DATA(state_offset, current_state->count,
657     current_state->data - 1);
658 ph10 916 if (could_continue) reset_could_continue = TRUE;
659 nigel 77 continue;
660     }
661     else
662     {
663     current_state->offset = state_offset = -state_offset;
664     }
665     }
666    
667 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
668 ph10 439 See the note at the head of this module about the possibility of improving
669     performance here. */
670 nigel 77
671     for (j = 0; j < i; j++)
672     {
673     if (active_states[j].offset == state_offset &&
674     active_states[j].count == current_state->count)
675     {
676     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
677     goto NEXT_ACTIVE_STATE;
678     }
679     }
680    
681     /* The state offset is the offset to the opcode */
682    
683     code = start_code + state_offset;
684     codevalue = *code;
685    
686 ph10 463 /* If this opcode inspects a character, but we are at the end of the
687     subject, remember the fact for use when testing for a partial match. */
688    
689 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
690 ph10 463 could_continue = TRUE;
691 ph10 462
692 nigel 77 /* If this opcode is followed by an inline character, load it. It is
693     tempting to test for the presence of a subject character here, but that
694     is wrong, because sometimes zero repetitions of the subject are
695     permitted.
696    
697     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
698 ph10 975 argument that is not a data character - but is always one byte long because
699 ph10 925 the values are small. We have to take special action to deal with \P, \p,
700     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
701     these ones to new opcodes. */
702 nigel 77
703     if (coptable[codevalue] > 0)
704     {
705     dlen = 1;
706 ph10 836 #ifdef SUPPORT_UTF
707     if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
708     #endif /* SUPPORT_UTF */
709 nigel 77 d = code[coptable[codevalue]];
710     if (codevalue >= OP_TYPESTAR)
711     {
712 nigel 93 switch(d)
713     {
714     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
715     case OP_NOTPROP:
716     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
717     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
718     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
719 ph10 178 case OP_NOT_HSPACE:
720 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
721 ph10 178 case OP_NOT_VSPACE:
722 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
723 nigel 93 default: break;
724     }
725 nigel 77 }
726     }
727     else
728     {
729     dlen = 0; /* Not strictly necessary, but compilers moan */
730 nigel 93 d = NOTACHAR; /* if these variables are not set. */
731 nigel 77 }
732    
733    
734     /* Now process the individual opcodes */
735    
736     switch (codevalue)
737     {
738 ph10 498 /* ========================================================================== */
739     /* These cases are never obeyed. This is a fudge that causes a compile-
740     time error if the vectors coptable or poptable, which are indexed by
741     opcode, are not the correct length. It seems to be the only way to do
742     such a check at compile time, as the sizeof() operator does not work
743     in the C preprocessor. */
744 ph10 507
745 ph10 498 case OP_TABLE_LENGTH:
746 ph10 507 case OP_TABLE_LENGTH +
747 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
748     (sizeof(poptable) == OP_TABLE_LENGTH)):
749 ph10 507 break;
750 nigel 77
751     /* ========================================================================== */
752     /* Reached a closing bracket. If not at the end of the pattern, carry
753 ph10 654 on with the next opcode. For repeating opcodes, also add the repeat
754     state. Note that KETRPOS will always be encountered at the end of the
755     subpattern, because the possessive subpattern repeats are always handled
756 ph10 604 using recursive calls. Thus, it never adds any new states.
757 ph10 654
758 ph10 604 At the end of the (sub)pattern, unless we have an empty string and
759 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
760 ph10 442 start of the subject, save the match data, shifting up all previous
761 nigel 77 matches so we always have the longest first. */
762    
763     case OP_KET:
764     case OP_KETRMIN:
765     case OP_KETRMAX:
766 ph10 654 case OP_KETRPOS:
767 nigel 77 if (code != end_code)
768     {
769     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
770     if (codevalue != OP_KET)
771     {
772     ADD_ACTIVE(state_offset - GET(code, 1), 0);
773     }
774     }
775 ph10 461 else
776 nigel 77 {
777 ph10 461 if (ptr > current_subject ||
778 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
779     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
780     current_subject > start_subject + md->start_offset)))
781 nigel 77 {
782 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
783 ph10 680 else if (match_count > 0 && ++match_count * 2 > offsetcount)
784 ph10 428 match_count = 0;
785     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
786     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
787     if (offsetcount >= 2)
788     {
789 ph10 530 offsets[0] = (int)(current_subject - start_subject);
790     offsets[1] = (int)(ptr - start_subject);
791 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
792 ph10 979 offsets[1] - offsets[0], (char *)current_subject));
793 ph10 428 }
794     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
795     {
796     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
797     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
798     match_count, rlevel*2-2, SP));
799     return match_count;
800     }
801 ph10 461 }
802 nigel 77 }
803     break;
804    
805     /* ========================================================================== */
806     /* These opcodes add to the current list of states without looking
807     at the current character. */
808    
809     /*-----------------------------------------------------------------*/
810     case OP_ALT:
811     do { code += GET(code, 1); } while (*code == OP_ALT);
812 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
813 nigel 77 break;
814    
815     /*-----------------------------------------------------------------*/
816     case OP_BRA:
817 nigel 93 case OP_SBRA:
818 nigel 77 do
819     {
820 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
821 nigel 77 code += GET(code, 1);
822     }
823     while (*code == OP_ALT);
824     break;
825    
826     /*-----------------------------------------------------------------*/
827 nigel 93 case OP_CBRA:
828     case OP_SCBRA:
829 ph10 836 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
830 nigel 93 code += GET(code, 1);
831     while (*code == OP_ALT)
832     {
833 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
834 nigel 93 code += GET(code, 1);
835     }
836     break;
837    
838     /*-----------------------------------------------------------------*/
839 nigel 77 case OP_BRAZERO:
840     case OP_BRAMINZERO:
841     ADD_ACTIVE(state_offset + 1, 0);
842     code += 1 + GET(code, 2);
843     while (*code == OP_ALT) code += GET(code, 1);
844 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
845 nigel 77 break;
846    
847     /*-----------------------------------------------------------------*/
848 ph10 335 case OP_SKIPZERO:
849     code += 1 + GET(code, 2);
850     while (*code == OP_ALT) code += GET(code, 1);
851 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
852 ph10 335 break;
853    
854     /*-----------------------------------------------------------------*/
855 nigel 77 case OP_CIRC:
856 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
857     { ADD_ACTIVE(state_offset + 1, 0); }
858     break;
859    
860     /*-----------------------------------------------------------------*/
861     case OP_CIRCM:
862 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
863 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
864 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
865     break;
866    
867     /*-----------------------------------------------------------------*/
868     case OP_EOD:
869 ph10 579 if (ptr >= end_subject)
870     {
871 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
872     could_continue = TRUE;
873     else { ADD_ACTIVE(state_offset + 1, 0); }
874     }
875 nigel 77 break;
876    
877     /*-----------------------------------------------------------------*/
878     case OP_SOD:
879     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
880     break;
881    
882     /*-----------------------------------------------------------------*/
883     case OP_SOM:
884     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
885     break;
886    
887    
888     /* ========================================================================== */
889     /* These opcodes inspect the next subject character, and sometimes
890     the previous one as well, but do not have an argument. The variable
891     clen contains the length of the current character and is zero if we are
892     at the end of the subject. */
893    
894     /*-----------------------------------------------------------------*/
895     case OP_ANY:
896 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
897 ph10 975 {
898 ph10 919 if (ptr + 1 >= md->end_subject &&
899     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
900     NLBLOCK->nltype == NLTYPE_FIXED &&
901 ph10 975 NLBLOCK->nllen == 2 &&
902 ph10 919 c == NLBLOCK->nl[0])
903     {
904 ph10 975 could_continue = partial_newline = TRUE;
905     }
906 ph10 919 else
907 ph10 975 {
908     ADD_NEW(state_offset + 1, 0);
909     }
910 ph10 919 }
911 nigel 77 break;
912    
913     /*-----------------------------------------------------------------*/
914 ph10 341 case OP_ALLANY:
915     if (clen > 0)
916     { ADD_NEW(state_offset + 1, 0); }
917     break;
918    
919     /*-----------------------------------------------------------------*/
920 nigel 77 case OP_EODN:
921 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922     could_continue = TRUE;
923     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
924 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
925     break;
926    
927     /*-----------------------------------------------------------------*/
928     case OP_DOLL:
929     if ((md->moptions & PCRE_NOTEOL) == 0)
930     {
931 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
932     could_continue = TRUE;
933     else if (clen == 0 ||
934 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
935 ph10 602 (ptr == end_subject - md->nllen)
936 nigel 91 ))
937 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
938 ph10 916 else if (ptr + 1 >= md->end_subject &&
939     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
940     NLBLOCK->nltype == NLTYPE_FIXED &&
941 ph10 975 NLBLOCK->nllen == 2 &&
942 ph10 916 c == NLBLOCK->nl[0])
943     {
944     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
945     {
946     reset_could_continue = TRUE;
947 ph10 975 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
948     }
949     else could_continue = partial_newline = TRUE;
950     }
951 nigel 77 }
952 ph10 602 break;
953    
954     /*-----------------------------------------------------------------*/
955     case OP_DOLLM:
956     if ((md->moptions & PCRE_NOTEOL) == 0)
957     {
958     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
959     could_continue = TRUE;
960     else if (clen == 0 ||
961     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
962     { ADD_ACTIVE(state_offset + 1, 0); }
963 ph10 916 else if (ptr + 1 >= md->end_subject &&
964     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
965     NLBLOCK->nltype == NLTYPE_FIXED &&
966 ph10 975 NLBLOCK->nllen == 2 &&
967 ph10 916 c == NLBLOCK->nl[0])
968     {
969     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
970     {
971     reset_could_continue = TRUE;
972 ph10 975 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
973     }
974     else could_continue = partial_newline = TRUE;
975     }
976 ph10 602 }
977     else if (IS_NEWLINE(ptr))
978 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
979     break;
980    
981     /*-----------------------------------------------------------------*/
982    
983     case OP_DIGIT:
984     case OP_WHITESPACE:
985     case OP_WORDCHAR:
986     if (clen > 0 && c < 256 &&
987     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
988     { ADD_NEW(state_offset + 1, 0); }
989     break;
990    
991     /*-----------------------------------------------------------------*/
992     case OP_NOT_DIGIT:
993     case OP_NOT_WHITESPACE:
994     case OP_NOT_WORDCHAR:
995     if (clen > 0 && (c >= 256 ||
996     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
997     { ADD_NEW(state_offset + 1, 0); }
998     break;
999    
1000     /*-----------------------------------------------------------------*/
1001     case OP_WORD_BOUNDARY:
1002     case OP_NOT_WORD_BOUNDARY:
1003     {
1004     int left_word, right_word;
1005    
1006     if (ptr > start_subject)
1007     {
1008 ph10 836 const pcre_uchar *temp = ptr - 1;
1009 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1010 ph10 836 #ifdef SUPPORT_UTF
1011     if (utf) { BACKCHAR(temp); }
1012 nigel 77 #endif
1013     GETCHARTEST(d, temp);
1014 ph10 535 #ifdef SUPPORT_UCP
1015 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
1016     {
1017     if (d == '_') left_word = TRUE; else
1018 ph10 535 {
1019 ph10 518 int cat = UCD_CATEGORY(d);
1020     left_word = (cat == ucp_L || cat == ucp_N);
1021 ph10 535 }
1022     }
1023     else
1024     #endif
1025 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1026     }
1027 ph10 518 else left_word = FALSE;
1028 nigel 77
1029 ph10 461 if (clen > 0)
1030 ph10 535 {
1031     #ifdef SUPPORT_UCP
1032 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
1033     {
1034     if (c == '_') right_word = TRUE; else
1035 ph10 535 {
1036 ph10 518 int cat = UCD_CATEGORY(c);
1037     right_word = (cat == ucp_L || cat == ucp_N);
1038 ph10 535 }
1039     }
1040     else
1041     #endif
1042 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1043 ph10 535 }
1044 ph10 518 else right_word = FALSE;
1045 nigel 77
1046     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1047     { ADD_ACTIVE(state_offset + 1, 0); }
1048     }
1049     break;
1050    
1051    
1052     /*-----------------------------------------------------------------*/
1053     /* Check the next character by Unicode property. We will get here only
1054     if the support is in the binary; otherwise a compile-time error occurs.
1055     */
1056    
1057 ph10 151 #ifdef SUPPORT_UCP
1058 nigel 77 case OP_PROP:
1059     case OP_NOTPROP:
1060     if (clen > 0)
1061     {
1062 nigel 87 BOOL OK;
1063 ph10 349 const ucd_record * prop = GET_UCD(c);
1064 nigel 87 switch(code[1])
1065 nigel 77 {
1066 nigel 87 case PT_ANY:
1067     OK = TRUE;
1068     break;
1069    
1070     case PT_LAMP:
1071 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1072 ph10 517 prop->chartype == ucp_Lt;
1073 nigel 87 break;
1074    
1075     case PT_GC:
1076 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1077 nigel 87 break;
1078    
1079     case PT_PC:
1080 ph10 349 OK = prop->chartype == code[2];
1081 nigel 87 break;
1082    
1083     case PT_SC:
1084 ph10 349 OK = prop->script == code[2];
1085 nigel 87 break;
1086 ph10 535
1087 ph10 517 /* These are specials for combination cases. */
1088 ph10 535
1089 ph10 517 case PT_ALNUM:
1090 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1091     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1092 ph10 535 break;
1093    
1094 ph10 517 case PT_SPACE: /* Perl space */
1095 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1096 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1097 ph10 535 break;
1098    
1099 ph10 517 case PT_PXSPACE: /* POSIX space */
1100 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1101 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1102     c == CHAR_FF || c == CHAR_CR;
1103 ph10 535 break;
1104    
1105 ph10 517 case PT_WORD:
1106 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1107     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1108 ph10 517 c == CHAR_UNDERSCORE;
1109 ph10 535 break;
1110 nigel 87
1111     /* Should never occur, but keep compilers from grumbling. */
1112    
1113     default:
1114     OK = codevalue != OP_PROP;
1115     break;
1116 nigel 77 }
1117 nigel 87
1118     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1119 nigel 77 }
1120     break;
1121     #endif
1122    
1123    
1124    
1125     /* ========================================================================== */
1126     /* These opcodes likewise inspect the subject character, but have an
1127     argument that is not a data character. It is one of these opcodes:
1128 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1129     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1130 nigel 77
1131     case OP_TYPEPLUS:
1132     case OP_TYPEMINPLUS:
1133 nigel 93 case OP_TYPEPOSPLUS:
1134 nigel 77 count = current_state->count; /* Already matched */
1135     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1136     if (clen > 0)
1137     {
1138 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1139     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1140     NLBLOCK->nltype == NLTYPE_FIXED &&
1141 ph10 975 NLBLOCK->nllen == 2 &&
1142 ph10 919 c == NLBLOCK->nl[0])
1143     {
1144 ph10 975 could_continue = partial_newline = TRUE;
1145     }
1146 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1147 nigel 77 (c < 256 &&
1148 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1149 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1150     {
1151 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1152     {
1153     active_count--; /* Remove non-match possibility */
1154     next_active_state--;
1155     }
1156 nigel 77 count++;
1157     ADD_NEW(state_offset, count);
1158     }
1159     }
1160     break;
1161    
1162     /*-----------------------------------------------------------------*/
1163     case OP_TYPEQUERY:
1164     case OP_TYPEMINQUERY:
1165 nigel 93 case OP_TYPEPOSQUERY:
1166 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1167     if (clen > 0)
1168     {
1169 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1170     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1171     NLBLOCK->nltype == NLTYPE_FIXED &&
1172 ph10 975 NLBLOCK->nllen == 2 &&
1173 ph10 919 c == NLBLOCK->nl[0])
1174     {
1175 ph10 975 could_continue = partial_newline = TRUE;
1176     }
1177 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1178 nigel 77 (c < 256 &&
1179 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1180 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1181     {
1182 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1183     {
1184     active_count--; /* Remove non-match possibility */
1185     next_active_state--;
1186     }
1187 nigel 77 ADD_NEW(state_offset + 2, 0);
1188     }
1189     }
1190     break;
1191    
1192     /*-----------------------------------------------------------------*/
1193     case OP_TYPESTAR:
1194     case OP_TYPEMINSTAR:
1195 nigel 93 case OP_TYPEPOSSTAR:
1196 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1197     if (clen > 0)
1198     {
1199 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1200     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1201     NLBLOCK->nltype == NLTYPE_FIXED &&
1202 ph10 975 NLBLOCK->nllen == 2 &&
1203 ph10 919 c == NLBLOCK->nl[0])
1204     {
1205 ph10 975 could_continue = partial_newline = TRUE;
1206     }
1207 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1208 nigel 77 (c < 256 &&
1209 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1210 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1211     {
1212 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1213     {
1214     active_count--; /* Remove non-match possibility */
1215     next_active_state--;
1216     }
1217 nigel 77 ADD_NEW(state_offset, 0);
1218     }
1219     }
1220     break;
1221    
1222     /*-----------------------------------------------------------------*/
1223     case OP_TYPEEXACT:
1224 nigel 93 count = current_state->count; /* Number already matched */
1225     if (clen > 0)
1226     {
1227 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1228     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1229     NLBLOCK->nltype == NLTYPE_FIXED &&
1230 ph10 975 NLBLOCK->nllen == 2 &&
1231 ph10 919 c == NLBLOCK->nl[0])
1232     {
1233 ph10 975 could_continue = partial_newline = TRUE;
1234     }
1235 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1236 nigel 93 (c < 256 &&
1237 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1238 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1239     {
1240     if (++count >= GET2(code, 1))
1241 ph10 836 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1242 nigel 93 else
1243     { ADD_NEW(state_offset, count); }
1244     }
1245     }
1246     break;
1247    
1248     /*-----------------------------------------------------------------*/
1249 nigel 77 case OP_TYPEUPTO:
1250     case OP_TYPEMINUPTO:
1251 nigel 93 case OP_TYPEPOSUPTO:
1252 ph10 836 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1253 nigel 77 count = current_state->count; /* Number already matched */
1254     if (clen > 0)
1255     {
1256 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1257     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1258     NLBLOCK->nltype == NLTYPE_FIXED &&
1259 ph10 975 NLBLOCK->nllen == 2 &&
1260 ph10 919 c == NLBLOCK->nl[0])
1261     {
1262 ph10 975 could_continue = partial_newline = TRUE;
1263     }
1264 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1265 nigel 77 (c < 256 &&
1266 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1267 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1268     {
1269 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1270     {
1271     active_count--; /* Remove non-match possibility */
1272     next_active_state--;
1273     }
1274 nigel 77 if (++count >= GET2(code, 1))
1275 ph10 836 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1276 nigel 77 else
1277     { ADD_NEW(state_offset, count); }
1278     }
1279     }
1280     break;
1281    
1282     /* ========================================================================== */
1283     /* These are virtual opcodes that are used when something like
1284 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1285     argument. It keeps the code above fast for the other cases. The argument
1286     is in the d variable. */
1287 nigel 77
1288 ph10 151 #ifdef SUPPORT_UCP
1289 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1290     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1291 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1292 nigel 77 count = current_state->count; /* Already matched */
1293 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1294 nigel 77 if (clen > 0)
1295     {
1296 nigel 87 BOOL OK;
1297 ph10 349 const ucd_record * prop = GET_UCD(c);
1298 nigel 87 switch(code[2])
1299     {
1300     case PT_ANY:
1301     OK = TRUE;
1302     break;
1303    
1304     case PT_LAMP:
1305 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1306 ph10 517 prop->chartype == ucp_Lt;
1307 nigel 87 break;
1308    
1309     case PT_GC:
1310 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1311 nigel 87 break;
1312    
1313     case PT_PC:
1314 ph10 349 OK = prop->chartype == code[3];
1315 nigel 87 break;
1316    
1317     case PT_SC:
1318 ph10 349 OK = prop->script == code[3];
1319 nigel 87 break;
1320    
1321 ph10 517 /* These are specials for combination cases. */
1322 ph10 535
1323 ph10 517 case PT_ALNUM:
1324 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1325     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1326 ph10 535 break;
1327    
1328 ph10 517 case PT_SPACE: /* Perl space */
1329 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1330 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1331 ph10 535 break;
1332    
1333 ph10 517 case PT_PXSPACE: /* POSIX space */
1334 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1335 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1336     c == CHAR_FF || c == CHAR_CR;
1337 ph10 535 break;
1338    
1339 ph10 517 case PT_WORD:
1340 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1341     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1342 ph10 517 c == CHAR_UNDERSCORE;
1343 ph10 535 break;
1344 ph10 517
1345 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1346    
1347     default:
1348     OK = codevalue != OP_PROP;
1349     break;
1350     }
1351    
1352 nigel 93 if (OK == (d == OP_PROP))
1353     {
1354     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1355     {
1356     active_count--; /* Remove non-match possibility */
1357     next_active_state--;
1358     }
1359     count++;
1360     ADD_NEW(state_offset, count);
1361     }
1362 nigel 77 }
1363     break;
1364    
1365     /*-----------------------------------------------------------------*/
1366     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1367     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1368 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1369 nigel 77 count = current_state->count; /* Already matched */
1370     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1371 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1372 nigel 77 {
1373 ph10 836 const pcre_uchar *nptr = ptr + clen;
1374 nigel 77 int ncount = 0;
1375 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1376     {
1377     active_count--; /* Remove non-match possibility */
1378     next_active_state--;
1379     }
1380 nigel 77 while (nptr < end_subject)
1381     {
1382     int nd;
1383     int ndlen = 1;
1384     GETCHARLEN(nd, nptr, ndlen);
1385 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1386 nigel 77 ncount++;
1387     nptr += ndlen;
1388     }
1389     count++;
1390     ADD_NEW_DATA(-state_offset, count, ncount);
1391     }
1392     break;
1393 ph10 151 #endif
1394 nigel 77
1395     /*-----------------------------------------------------------------*/
1396 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1397     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1398     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1399     count = current_state->count; /* Already matched */
1400     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1401     if (clen > 0)
1402     {
1403     int ncount = 0;
1404     switch (c)
1405     {
1406     case 0x000b:
1407     case 0x000c:
1408     case 0x0085:
1409     case 0x2028:
1410     case 0x2029:
1411 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1412     goto ANYNL01;
1413    
1414     case 0x000d:
1415     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1416     /* Fall through */
1417    
1418     ANYNL01:
1419     case 0x000a:
1420 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1421     {
1422     active_count--; /* Remove non-match possibility */
1423     next_active_state--;
1424     }
1425     count++;
1426     ADD_NEW_DATA(-state_offset, count, ncount);
1427     break;
1428 ph10 231
1429 nigel 93 default:
1430     break;
1431     }
1432     }
1433     break;
1434    
1435     /*-----------------------------------------------------------------*/
1436 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1437     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1438     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1439     count = current_state->count; /* Already matched */
1440     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1441     if (clen > 0)
1442     {
1443 ph10 182 BOOL OK;
1444 ph10 178 switch (c)
1445     {
1446     case 0x000a:
1447     case 0x000b:
1448     case 0x000c:
1449     case 0x000d:
1450     case 0x0085:
1451     case 0x2028:
1452     case 0x2029:
1453     OK = TRUE;
1454 ph10 182 break;
1455 ph10 178
1456     default:
1457     OK = FALSE;
1458 ph10 182 break;
1459 ph10 178 }
1460    
1461     if (OK == (d == OP_VSPACE))
1462 ph10 182 {
1463 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1464     {
1465     active_count--; /* Remove non-match possibility */
1466     next_active_state--;
1467     }
1468     count++;
1469     ADD_NEW_DATA(-state_offset, count, 0);
1470     }
1471     }
1472     break;
1473    
1474     /*-----------------------------------------------------------------*/
1475     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1476     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1477     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1478     count = current_state->count; /* Already matched */
1479     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1480     if (clen > 0)
1481     {
1482 ph10 182 BOOL OK;
1483 ph10 178 switch (c)
1484     {
1485     case 0x09: /* HT */
1486     case 0x20: /* SPACE */
1487     case 0xa0: /* NBSP */
1488     case 0x1680: /* OGHAM SPACE MARK */
1489     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1490     case 0x2000: /* EN QUAD */
1491     case 0x2001: /* EM QUAD */
1492     case 0x2002: /* EN SPACE */
1493     case 0x2003: /* EM SPACE */
1494     case 0x2004: /* THREE-PER-EM SPACE */
1495     case 0x2005: /* FOUR-PER-EM SPACE */
1496     case 0x2006: /* SIX-PER-EM SPACE */
1497     case 0x2007: /* FIGURE SPACE */
1498     case 0x2008: /* PUNCTUATION SPACE */
1499     case 0x2009: /* THIN SPACE */
1500     case 0x200A: /* HAIR SPACE */
1501     case 0x202f: /* NARROW NO-BREAK SPACE */
1502     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1503     case 0x3000: /* IDEOGRAPHIC SPACE */
1504     OK = TRUE;
1505     break;
1506 ph10 182
1507 ph10 178 default:
1508     OK = FALSE;
1509     break;
1510     }
1511 ph10 182
1512 ph10 178 if (OK == (d == OP_HSPACE))
1513 ph10 182 {
1514 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1515     {
1516     active_count--; /* Remove non-match possibility */
1517     next_active_state--;
1518     }
1519     count++;
1520     ADD_NEW_DATA(-state_offset, count, 0);
1521     }
1522     }
1523     break;
1524    
1525     /*-----------------------------------------------------------------*/
1526 ph10 151 #ifdef SUPPORT_UCP
1527 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1528     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1529 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1530 nigel 87 count = 4;
1531 nigel 77 goto QS1;
1532    
1533     case OP_PROP_EXTRA + OP_TYPESTAR:
1534     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1535 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1536 nigel 77 count = 0;
1537    
1538     QS1:
1539    
1540 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1541 nigel 77 if (clen > 0)
1542     {
1543 nigel 87 BOOL OK;
1544 ph10 349 const ucd_record * prop = GET_UCD(c);
1545 nigel 87 switch(code[2])
1546     {
1547     case PT_ANY:
1548     OK = TRUE;
1549     break;
1550    
1551     case PT_LAMP:
1552 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1553 ph10 517 prop->chartype == ucp_Lt;
1554 nigel 87 break;
1555    
1556     case PT_GC:
1557 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1558 nigel 87 break;
1559    
1560     case PT_PC:
1561 ph10 349 OK = prop->chartype == code[3];
1562 nigel 87 break;
1563    
1564     case PT_SC:
1565 ph10 349 OK = prop->script == code[3];
1566 nigel 87 break;
1567 ph10 535
1568 ph10 517 /* These are specials for combination cases. */
1569 ph10 535
1570 ph10 517 case PT_ALNUM:
1571 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1572     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1573 ph10 535 break;
1574    
1575 ph10 517 case PT_SPACE: /* Perl space */
1576 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1577 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1578 ph10 535 break;
1579    
1580 ph10 517 case PT_PXSPACE: /* POSIX space */
1581 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1582 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1583     c == CHAR_FF || c == CHAR_CR;
1584 ph10 535 break;
1585    
1586 ph10 517 case PT_WORD:
1587 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1588     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1589 ph10 517 c == CHAR_UNDERSCORE;
1590 ph10 535 break;
1591 nigel 87
1592     /* Should never occur, but keep compilers from grumbling. */
1593    
1594     default:
1595     OK = codevalue != OP_PROP;
1596     break;
1597     }
1598    
1599 nigel 93 if (OK == (d == OP_PROP))
1600     {
1601     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1602     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1603     {
1604     active_count--; /* Remove non-match possibility */
1605     next_active_state--;
1606     }
1607     ADD_NEW(state_offset + count, 0);
1608     }
1609 nigel 77 }
1610     break;
1611    
1612     /*-----------------------------------------------------------------*/
1613     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1614     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1615 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1616 nigel 77 count = 2;
1617     goto QS2;
1618    
1619     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1620     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1621 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1622 nigel 77 count = 0;
1623    
1624     QS2:
1625    
1626     ADD_ACTIVE(state_offset + 2, 0);
1627 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1628 nigel 77 {
1629 ph10 836 const pcre_uchar *nptr = ptr + clen;
1630 nigel 77 int ncount = 0;
1631 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1632     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1633     {
1634     active_count--; /* Remove non-match possibility */
1635     next_active_state--;
1636     }
1637 nigel 77 while (nptr < end_subject)
1638     {
1639     int nd;
1640     int ndlen = 1;
1641     GETCHARLEN(nd, nptr, ndlen);
1642 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1643 nigel 77 ncount++;
1644     nptr += ndlen;
1645     }
1646     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1647     }
1648     break;
1649 ph10 151 #endif
1650 nigel 77
1651     /*-----------------------------------------------------------------*/
1652 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1653     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1654     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1655     count = 2;
1656     goto QS3;
1657    
1658     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1659     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1660     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1661     count = 0;
1662    
1663     QS3:
1664     ADD_ACTIVE(state_offset + 2, 0);
1665     if (clen > 0)
1666     {
1667     int ncount = 0;
1668     switch (c)
1669     {
1670     case 0x000b:
1671     case 0x000c:
1672     case 0x0085:
1673     case 0x2028:
1674     case 0x2029:
1675 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1676     goto ANYNL02;
1677    
1678     case 0x000d:
1679     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1680     /* Fall through */
1681    
1682     ANYNL02:
1683     case 0x000a:
1684 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1685     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1686     {
1687     active_count--; /* Remove non-match possibility */
1688     next_active_state--;
1689     }
1690     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1691     break;
1692 ph10 231
1693 nigel 93 default:
1694     break;
1695     }
1696     }
1697     break;
1698    
1699     /*-----------------------------------------------------------------*/
1700 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1701     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1702     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1703     count = 2;
1704     goto QS4;
1705    
1706     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1707     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1708     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1709     count = 0;
1710    
1711     QS4:
1712     ADD_ACTIVE(state_offset + 2, 0);
1713     if (clen > 0)
1714     {
1715 ph10 182 BOOL OK;
1716 ph10 178 switch (c)
1717     {
1718     case 0x000a:
1719     case 0x000b:
1720     case 0x000c:
1721     case 0x000d:
1722     case 0x0085:
1723     case 0x2028:
1724     case 0x2029:
1725     OK = TRUE;
1726     break;
1727 ph10 182
1728 ph10 178 default:
1729     OK = FALSE;
1730     break;
1731     }
1732     if (OK == (d == OP_VSPACE))
1733 ph10 182 {
1734 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1735     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1736     {
1737     active_count--; /* Remove non-match possibility */
1738     next_active_state--;
1739     }
1740     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1741     }
1742     }
1743     break;
1744    
1745     /*-----------------------------------------------------------------*/
1746     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1747     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1748     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1749     count = 2;
1750     goto QS5;
1751    
1752     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1753     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1754     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1755     count = 0;
1756    
1757     QS5:
1758     ADD_ACTIVE(state_offset + 2, 0);
1759     if (clen > 0)
1760     {
1761 ph10 182 BOOL OK;
1762 ph10 178 switch (c)
1763     {
1764     case 0x09: /* HT */
1765     case 0x20: /* SPACE */
1766     case 0xa0: /* NBSP */
1767     case 0x1680: /* OGHAM SPACE MARK */
1768     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1769     case 0x2000: /* EN QUAD */
1770     case 0x2001: /* EM QUAD */
1771     case 0x2002: /* EN SPACE */
1772     case 0x2003: /* EM SPACE */
1773     case 0x2004: /* THREE-PER-EM SPACE */
1774     case 0x2005: /* FOUR-PER-EM SPACE */
1775     case 0x2006: /* SIX-PER-EM SPACE */
1776     case 0x2007: /* FIGURE SPACE */
1777     case 0x2008: /* PUNCTUATION SPACE */
1778     case 0x2009: /* THIN SPACE */
1779     case 0x200A: /* HAIR SPACE */
1780     case 0x202f: /* NARROW NO-BREAK SPACE */
1781     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1782     case 0x3000: /* IDEOGRAPHIC SPACE */
1783     OK = TRUE;
1784     break;
1785 ph10 182
1786 ph10 178 default:
1787     OK = FALSE;
1788     break;
1789     }
1790 ph10 182
1791 ph10 178 if (OK == (d == OP_HSPACE))
1792 ph10 182 {
1793 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1794     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1795     {
1796     active_count--; /* Remove non-match possibility */
1797     next_active_state--;
1798     }
1799     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1800     }
1801     }
1802     break;
1803    
1804     /*-----------------------------------------------------------------*/
1805 ph10 151 #ifdef SUPPORT_UCP
1806 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1807     case OP_PROP_EXTRA + OP_TYPEUPTO:
1808     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1809 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1810 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1811 ph10 836 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1812 nigel 77 count = current_state->count; /* Number already matched */
1813     if (clen > 0)
1814     {
1815 nigel 87 BOOL OK;
1816 ph10 349 const ucd_record * prop = GET_UCD(c);
1817 ph10 836 switch(code[1 + IMM2_SIZE + 1])
1818 nigel 77 {
1819 nigel 87 case PT_ANY:
1820     OK = TRUE;
1821     break;
1822    
1823     case PT_LAMP:
1824 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1825 ph10 517 prop->chartype == ucp_Lt;
1826 nigel 87 break;
1827    
1828     case PT_GC:
1829 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1830 nigel 87 break;
1831    
1832     case PT_PC:
1833 ph10 836 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1834 nigel 87 break;
1835    
1836     case PT_SC:
1837 ph10 836 OK = prop->script == code[1 + IMM2_SIZE + 2];
1838 nigel 87 break;
1839 ph10 535
1840 ph10 517 /* These are specials for combination cases. */
1841 ph10 535
1842 ph10 517 case PT_ALNUM:
1843 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1844     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1845 ph10 535 break;
1846    
1847 ph10 517 case PT_SPACE: /* Perl space */
1848 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1849 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1850 ph10 535 break;
1851    
1852 ph10 517 case PT_PXSPACE: /* POSIX space */
1853 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1854 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1855     c == CHAR_FF || c == CHAR_CR;
1856 ph10 535 break;
1857    
1858 ph10 517 case PT_WORD:
1859 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1860     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1861 ph10 517 c == CHAR_UNDERSCORE;
1862 ph10 535 break;
1863 nigel 87
1864     /* Should never occur, but keep compilers from grumbling. */
1865    
1866     default:
1867     OK = codevalue != OP_PROP;
1868     break;
1869     }
1870    
1871     if (OK == (d == OP_PROP))
1872     {
1873 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1874     {
1875     active_count--; /* Remove non-match possibility */
1876     next_active_state--;
1877     }
1878 nigel 77 if (++count >= GET2(code, 1))
1879 ph10 836 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1880 nigel 77 else
1881     { ADD_NEW(state_offset, count); }
1882     }
1883     }
1884     break;
1885    
1886     /*-----------------------------------------------------------------*/
1887     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1888     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1889     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1890 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1891 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1892 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1893 nigel 77 count = current_state->count; /* Number already matched */
1894 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1895 nigel 77 {
1896 ph10 836 const pcre_uchar *nptr = ptr + clen;
1897 nigel 77 int ncount = 0;
1898 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1899     {
1900     active_count--; /* Remove non-match possibility */
1901     next_active_state--;
1902     }
1903 nigel 77 while (nptr < end_subject)
1904     {
1905     int nd;
1906     int ndlen = 1;
1907     GETCHARLEN(nd, nptr, ndlen);
1908 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1909 nigel 77 ncount++;
1910     nptr += ndlen;
1911     }
1912 ph10 975 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1913     reset_could_continue = TRUE;
1914 nigel 77 if (++count >= GET2(code, 1))
1915 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1916 nigel 77 else
1917     { ADD_NEW_DATA(-state_offset, count, ncount); }
1918     }
1919     break;
1920 ph10 151 #endif
1921 nigel 77
1922 nigel 93 /*-----------------------------------------------------------------*/
1923     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1924     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1925     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1926     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1927     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1928 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1929 nigel 93 count = current_state->count; /* Number already matched */
1930     if (clen > 0)
1931     {
1932     int ncount = 0;
1933     switch (c)
1934     {
1935     case 0x000b:
1936     case 0x000c:
1937     case 0x0085:
1938     case 0x2028:
1939     case 0x2029:
1940 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1941     goto ANYNL03;
1942    
1943     case 0x000d:
1944     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1945     /* Fall through */
1946    
1947     ANYNL03:
1948     case 0x000a:
1949 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1950     {
1951     active_count--; /* Remove non-match possibility */
1952     next_active_state--;
1953     }
1954     if (++count >= GET2(code, 1))
1955 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1956 nigel 93 else
1957     { ADD_NEW_DATA(-state_offset, count, ncount); }
1958     break;
1959 ph10 231
1960 nigel 93 default:
1961     break;
1962     }
1963     }
1964     break;
1965    
1966 ph10 178 /*-----------------------------------------------------------------*/
1967     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1968     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1969     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1970     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1971     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1972 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1973 ph10 178 count = current_state->count; /* Number already matched */
1974     if (clen > 0)
1975     {
1976 ph10 182 BOOL OK;
1977 ph10 178 switch (c)
1978     {
1979     case 0x000a:
1980     case 0x000b:
1981     case 0x000c:
1982     case 0x000d:
1983     case 0x0085:
1984     case 0x2028:
1985     case 0x2029:
1986     OK = TRUE;
1987     break;
1988 ph10 182
1989 ph10 178 default:
1990     OK = FALSE;
1991     }
1992 ph10 182
1993 ph10 178 if (OK == (d == OP_VSPACE))
1994 ph10 182 {
1995 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1996     {
1997     active_count--; /* Remove non-match possibility */
1998     next_active_state--;
1999     }
2000     if (++count >= GET2(code, 1))
2001 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2002 ph10 178 else
2003     { ADD_NEW_DATA(-state_offset, count, 0); }
2004     }
2005     }
2006     break;
2007    
2008     /*-----------------------------------------------------------------*/
2009     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2010     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2011     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2012     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2013     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2014 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2015 ph10 178 count = current_state->count; /* Number already matched */
2016     if (clen > 0)
2017     {
2018 ph10 182 BOOL OK;
2019 ph10 178 switch (c)
2020     {
2021     case 0x09: /* HT */
2022     case 0x20: /* SPACE */
2023     case 0xa0: /* NBSP */
2024     case 0x1680: /* OGHAM SPACE MARK */
2025     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2026     case 0x2000: /* EN QUAD */
2027     case 0x2001: /* EM QUAD */
2028     case 0x2002: /* EN SPACE */
2029     case 0x2003: /* EM SPACE */
2030     case 0x2004: /* THREE-PER-EM SPACE */
2031     case 0x2005: /* FOUR-PER-EM SPACE */
2032     case 0x2006: /* SIX-PER-EM SPACE */
2033     case 0x2007: /* FIGURE SPACE */
2034     case 0x2008: /* PUNCTUATION SPACE */
2035     case 0x2009: /* THIN SPACE */
2036     case 0x200A: /* HAIR SPACE */
2037     case 0x202f: /* NARROW NO-BREAK SPACE */
2038     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2039     case 0x3000: /* IDEOGRAPHIC SPACE */
2040     OK = TRUE;
2041     break;
2042 ph10 182
2043 ph10 178 default:
2044     OK = FALSE;
2045     break;
2046     }
2047 ph10 182
2048 ph10 178 if (OK == (d == OP_HSPACE))
2049 ph10 182 {
2050 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2051     {
2052     active_count--; /* Remove non-match possibility */
2053     next_active_state--;
2054     }
2055     if (++count >= GET2(code, 1))
2056 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2057 ph10 178 else
2058     { ADD_NEW_DATA(-state_offset, count, 0); }
2059     }
2060     }
2061     break;
2062    
2063 nigel 77 /* ========================================================================== */
2064     /* These opcodes are followed by a character that is usually compared
2065     to the current subject character; it is loaded into d. We still get
2066     here even if there is no subject character, because in some cases zero
2067     repetitions are permitted. */
2068    
2069     /*-----------------------------------------------------------------*/
2070     case OP_CHAR:
2071     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2072     break;
2073    
2074     /*-----------------------------------------------------------------*/
2075 ph10 602 case OP_CHARI:
2076 nigel 77 if (clen == 0) break;
2077    
2078 ph10 836 #ifdef SUPPORT_UTF
2079     if (utf)
2080 nigel 77 {
2081     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2082     {
2083 nigel 93 unsigned int othercase;
2084 ph10 836 if (c < 128)
2085     othercase = fcc[c];
2086     else
2087     /* If we have Unicode property support, we can use it to test the
2088     other case of the character. */
2089 nigel 77 #ifdef SUPPORT_UCP
2090 ph10 836 othercase = UCD_OTHERCASE(c);
2091 nigel 87 #else
2092 ph10 836 othercase = NOTACHAR;
2093 nigel 77 #endif
2094    
2095     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2096     }
2097     }
2098     else
2099 ph10 836 #endif /* SUPPORT_UTF */
2100     /* Not UTF mode */
2101 nigel 77 {
2102 ph10 836 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2103     { ADD_NEW(state_offset + 2, 0); }
2104 nigel 77 }
2105     break;
2106    
2107    
2108     #ifdef SUPPORT_UCP
2109     /*-----------------------------------------------------------------*/
2110     /* This is a tricky one because it can match more than one character.
2111     Find out how many characters to skip, and then set up a negative state
2112     to wait for them to pass before continuing. */
2113    
2114     case OP_EXTUNI:
2115 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2116 nigel 77 {
2117 ph10 836 const pcre_uchar *nptr = ptr + clen;
2118 nigel 77 int ncount = 0;
2119     while (nptr < end_subject)
2120     {
2121     int nclen = 1;
2122     GETCHARLEN(c, nptr, nclen);
2123 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
2124 nigel 77 ncount++;
2125     nptr += nclen;
2126     }
2127 ph10 975 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2128     reset_could_continue = TRUE;
2129 nigel 77 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2130     }
2131     break;
2132     #endif
2133    
2134     /*-----------------------------------------------------------------*/
2135 nigel 93 /* This is a tricky one, like EXTUNI, because it too can match more than one
2136     character (when CR is followed by LF). In this case, set up a negative
2137     state to wait for one character to pass before continuing. */
2138    
2139     case OP_ANYNL:
2140     if (clen > 0) switch(c)
2141     {
2142     case 0x000b:
2143     case 0x000c:
2144     case 0x0085:
2145     case 0x2028:
2146     case 0x2029:
2147 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2148    
2149     case 0x000a:
2150 nigel 93 ADD_NEW(state_offset + 1, 0);
2151     break;
2152 ph10 231
2153 nigel 93 case 0x000d:
2154 ph10 975 if (ptr + 1 >= end_subject)
2155 nigel 93 {
2156 ph10 975 ADD_NEW(state_offset + 1, 0);
2157     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2158     reset_could_continue = TRUE;
2159     }
2160 ph10 916 else if (ptr[1] == 0x0a)
2161     {
2162 nigel 93 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2163     }
2164     else
2165 ph10 975 {
2166 nigel 93 ADD_NEW(state_offset + 1, 0);
2167 ph10 975 }
2168 nigel 93 break;
2169     }
2170     break;
2171    
2172     /*-----------------------------------------------------------------*/
2173 ph10 178 case OP_NOT_VSPACE:
2174     if (clen > 0) switch(c)
2175     {
2176     case 0x000a:
2177     case 0x000b:
2178     case 0x000c:
2179     case 0x000d:
2180     case 0x0085:
2181     case 0x2028:
2182     case 0x2029:
2183     break;
2184 ph10 182
2185     default:
2186 ph10 178 ADD_NEW(state_offset + 1, 0);
2187     break;
2188     }
2189     break;
2190    
2191     /*-----------------------------------------------------------------*/
2192     case OP_VSPACE:
2193     if (clen > 0) switch(c)
2194     {
2195     case 0x000a:
2196     case 0x000b:
2197     case 0x000c:
2198     case 0x000d:
2199     case 0x0085:
2200     case 0x2028:
2201     case 0x2029:
2202     ADD_NEW(state_offset + 1, 0);
2203     break;
2204 ph10 182
2205 ph10 178 default: break;
2206     }
2207     break;
2208    
2209     /*-----------------------------------------------------------------*/
2210     case OP_NOT_HSPACE:
2211     if (clen > 0) switch(c)
2212     {
2213     case 0x09: /* HT */
2214     case 0x20: /* SPACE */
2215     case 0xa0: /* NBSP */
2216     case 0x1680: /* OGHAM SPACE MARK */
2217     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2218     case 0x2000: /* EN QUAD */
2219     case 0x2001: /* EM QUAD */
2220     case 0x2002: /* EN SPACE */
2221     case 0x2003: /* EM SPACE */
2222     case 0x2004: /* THREE-PER-EM SPACE */
2223     case 0x2005: /* FOUR-PER-EM SPACE */
2224     case 0x2006: /* SIX-PER-EM SPACE */
2225     case 0x2007: /* FIGURE SPACE */
2226     case 0x2008: /* PUNCTUATION SPACE */
2227     case 0x2009: /* THIN SPACE */
2228     case 0x200A: /* HAIR SPACE */
2229     case 0x202f: /* NARROW NO-BREAK SPACE */
2230     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2231     case 0x3000: /* IDEOGRAPHIC SPACE */
2232     break;
2233 ph10 182
2234     default:
2235 ph10 178 ADD_NEW(state_offset + 1, 0);
2236     break;
2237     }
2238     break;
2239    
2240     /*-----------------------------------------------------------------*/
2241     case OP_HSPACE:
2242     if (clen > 0) switch(c)
2243     {
2244     case 0x09: /* HT */
2245     case 0x20: /* SPACE */
2246     case 0xa0: /* NBSP */
2247     case 0x1680: /* OGHAM SPACE MARK */
2248     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2249     case 0x2000: /* EN QUAD */
2250     case 0x2001: /* EM QUAD */
2251     case 0x2002: /* EN SPACE */
2252     case 0x2003: /* EM SPACE */
2253     case 0x2004: /* THREE-PER-EM SPACE */
2254     case 0x2005: /* FOUR-PER-EM SPACE */
2255     case 0x2006: /* SIX-PER-EM SPACE */
2256     case 0x2007: /* FIGURE SPACE */
2257     case 0x2008: /* PUNCTUATION SPACE */
2258     case 0x2009: /* THIN SPACE */
2259     case 0x200A: /* HAIR SPACE */
2260     case 0x202f: /* NARROW NO-BREAK SPACE */
2261     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2262     case 0x3000: /* IDEOGRAPHIC SPACE */
2263     ADD_NEW(state_offset + 1, 0);
2264     break;
2265     }
2266     break;
2267    
2268     /*-----------------------------------------------------------------*/
2269 ph10 925 /* Match a negated single character casefully. */
2270 nigel 77
2271     case OP_NOT:
2272 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2273 nigel 77 break;
2274    
2275     /*-----------------------------------------------------------------*/
2276 ph10 925 /* Match a negated single character caselessly. */
2277 ph10 602
2278     case OP_NOTI:
2279 ph10 925 if (clen > 0)
2280 ph10 975 {
2281 ph10 925 unsigned int otherd;
2282     #ifdef SUPPORT_UTF
2283     if (utf && d >= 128)
2284     {
2285     #ifdef SUPPORT_UCP
2286     otherd = UCD_OTHERCASE(d);
2287     #endif /* SUPPORT_UCP */
2288     }
2289     else
2290     #endif /* SUPPORT_UTF */
2291     otherd = TABLE_GET(d, fcc, d);
2292     if (c != d && c != otherd)
2293     { ADD_NEW(state_offset + dlen + 1, 0); }
2294 ph10 975 }
2295 ph10 602 break;
2296    
2297     /*-----------------------------------------------------------------*/
2298     case OP_PLUSI:
2299     case OP_MINPLUSI:
2300     case OP_POSPLUSI:
2301     case OP_NOTPLUSI:
2302     case OP_NOTMINPLUSI:
2303     case OP_NOTPOSPLUSI:
2304     caseless = TRUE;
2305     codevalue -= OP_STARI - OP_STAR;
2306 ph10 654
2307 ph10 602 /* Fall through */
2308 nigel 77 case OP_PLUS:
2309     case OP_MINPLUS:
2310 nigel 93 case OP_POSPLUS:
2311 nigel 77 case OP_NOTPLUS:
2312     case OP_NOTMINPLUS:
2313 nigel 93 case OP_NOTPOSPLUS:
2314 nigel 77 count = current_state->count; /* Already matched */
2315     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2316     if (clen > 0)
2317     {
2318 nigel 93 unsigned int otherd = NOTACHAR;
2319 ph10 602 if (caseless)
2320 nigel 77 {
2321 ph10 836 #ifdef SUPPORT_UTF
2322     if (utf && d >= 128)
2323 nigel 77 {
2324     #ifdef SUPPORT_UCP
2325 ph10 349 otherd = UCD_OTHERCASE(d);
2326 nigel 77 #endif /* SUPPORT_UCP */
2327     }
2328     else
2329 ph10 836 #endif /* SUPPORT_UTF */
2330     otherd = TABLE_GET(d, fcc, d);
2331 nigel 77 }
2332     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2333 nigel 93 {
2334     if (count > 0 &&
2335     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2336     {
2337     active_count--; /* Remove non-match possibility */
2338     next_active_state--;
2339     }
2340     count++;
2341     ADD_NEW(state_offset, count);
2342     }
2343 nigel 77 }
2344     break;
2345    
2346     /*-----------------------------------------------------------------*/
2347 ph10 602 case OP_QUERYI:
2348     case OP_MINQUERYI:
2349     case OP_POSQUERYI:
2350     case OP_NOTQUERYI:
2351     case OP_NOTMINQUERYI:
2352     case OP_NOTPOSQUERYI:
2353     caseless = TRUE;
2354     codevalue -= OP_STARI - OP_STAR;
2355     /* Fall through */
2356 nigel 77 case OP_QUERY:
2357     case OP_MINQUERY:
2358 nigel 93 case OP_POSQUERY:
2359 nigel 77 case OP_NOTQUERY:
2360     case OP_NOTMINQUERY:
2361 nigel 93 case OP_NOTPOSQUERY:
2362 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2363     if (clen > 0)
2364     {
2365 nigel 93 unsigned int otherd = NOTACHAR;
2366 ph10 602 if (caseless)
2367 nigel 77 {
2368 ph10 836 #ifdef SUPPORT_UTF
2369     if (utf && d >= 128)
2370 nigel 77 {
2371     #ifdef SUPPORT_UCP
2372 ph10 349 otherd = UCD_OTHERCASE(d);
2373 nigel 77 #endif /* SUPPORT_UCP */
2374     }
2375     else
2376 ph10 836 #endif /* SUPPORT_UTF */
2377     otherd = TABLE_GET(d, fcc, d);
2378 nigel 77 }
2379     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2380 nigel 93 {
2381     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2382     {
2383     active_count--; /* Remove non-match possibility */
2384     next_active_state--;
2385     }
2386     ADD_NEW(state_offset + dlen + 1, 0);
2387     }
2388 nigel 77 }
2389     break;
2390    
2391     /*-----------------------------------------------------------------*/
2392 ph10 602 case OP_STARI:
2393     case OP_MINSTARI:
2394     case OP_POSSTARI:
2395     case OP_NOTSTARI:
2396     case OP_NOTMINSTARI:
2397     case OP_NOTPOSSTARI:
2398     caseless = TRUE;
2399     codevalue -= OP_STARI - OP_STAR;
2400     /* Fall through */
2401 nigel 77 case OP_STAR:
2402     case OP_MINSTAR:
2403 nigel 93 case OP_POSSTAR:
2404 nigel 77 case OP_NOTSTAR:
2405     case OP_NOTMINSTAR:
2406 nigel 93 case OP_NOTPOSSTAR:
2407 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2408     if (clen > 0)
2409     {
2410 nigel 93 unsigned int otherd = NOTACHAR;
2411 ph10 602 if (caseless)
2412 nigel 77 {
2413 ph10 836 #ifdef SUPPORT_UTF
2414     if (utf && d >= 128)
2415 nigel 77 {
2416     #ifdef SUPPORT_UCP
2417 ph10 349 otherd = UCD_OTHERCASE(d);
2418 nigel 77 #endif /* SUPPORT_UCP */
2419     }
2420     else
2421 ph10 836 #endif /* SUPPORT_UTF */
2422     otherd = TABLE_GET(d, fcc, d);
2423 nigel 77 }
2424     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2425 nigel 93 {
2426     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2427     {
2428     active_count--; /* Remove non-match possibility */
2429     next_active_state--;
2430     }
2431     ADD_NEW(state_offset, 0);
2432     }
2433 nigel 77 }
2434     break;
2435    
2436     /*-----------------------------------------------------------------*/
2437 ph10 602 case OP_EXACTI:
2438     case OP_NOTEXACTI:
2439     caseless = TRUE;
2440     codevalue -= OP_STARI - OP_STAR;
2441     /* Fall through */
2442 nigel 77 case OP_EXACT:
2443 nigel 93 case OP_NOTEXACT:
2444     count = current_state->count; /* Number already matched */
2445     if (clen > 0)
2446     {
2447     unsigned int otherd = NOTACHAR;
2448 ph10 602 if (caseless)
2449 nigel 93 {
2450 ph10 836 #ifdef SUPPORT_UTF
2451     if (utf && d >= 128)
2452 nigel 93 {
2453     #ifdef SUPPORT_UCP
2454 ph10 349 otherd = UCD_OTHERCASE(d);
2455 nigel 93 #endif /* SUPPORT_UCP */
2456     }
2457     else
2458 ph10 836 #endif /* SUPPORT_UTF */
2459     otherd = TABLE_GET(d, fcc, d);
2460 nigel 93 }
2461     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2462     {
2463     if (++count >= GET2(code, 1))
2464 ph10 836 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2465 nigel 93 else
2466     { ADD_NEW(state_offset, count); }
2467     }
2468     }
2469     break;
2470    
2471     /*-----------------------------------------------------------------*/
2472 ph10 602 case OP_UPTOI:
2473     case OP_MINUPTOI:
2474     case OP_POSUPTOI:
2475     case OP_NOTUPTOI:
2476     case OP_NOTMINUPTOI:
2477     case OP_NOTPOSUPTOI:
2478     caseless = TRUE;
2479     codevalue -= OP_STARI - OP_STAR;
2480     /* Fall through */
2481 nigel 77 case OP_UPTO:
2482     case OP_MINUPTO:
2483 nigel 93 case OP_POSUPTO:
2484 nigel 77 case OP_NOTUPTO:
2485     case OP_NOTMINUPTO:
2486 nigel 93 case OP_NOTPOSUPTO:
2487 ph10 836 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2488 nigel 77 count = current_state->count; /* Number already matched */
2489     if (clen > 0)
2490     {
2491 nigel 93 unsigned int otherd = NOTACHAR;
2492 ph10 602 if (caseless)
2493 nigel 77 {
2494 ph10 836 #ifdef SUPPORT_UTF
2495     if (utf && d >= 128)
2496 nigel 77 {
2497     #ifdef SUPPORT_UCP
2498 ph10 349 otherd = UCD_OTHERCASE(d);
2499 nigel 77 #endif /* SUPPORT_UCP */
2500     }
2501     else
2502 ph10 836 #endif /* SUPPORT_UTF */
2503     otherd = TABLE_GET(d, fcc, d);
2504 nigel 77 }
2505     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2506     {
2507 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2508     {
2509     active_count--; /* Remove non-match possibility */
2510     next_active_state--;
2511     }
2512 nigel 77 if (++count >= GET2(code, 1))
2513 ph10 836 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2514 nigel 77 else
2515     { ADD_NEW(state_offset, count); }
2516     }
2517     }
2518     break;
2519    
2520    
2521     /* ========================================================================== */
2522     /* These are the class-handling opcodes */
2523    
2524     case OP_CLASS:
2525     case OP_NCLASS:
2526     case OP_XCLASS:
2527     {
2528     BOOL isinclass = FALSE;
2529     int next_state_offset;
2530 ph10 836 const pcre_uchar *ecode;
2531 nigel 77
2532     /* For a simple class, there is always just a 32-byte table, and we
2533     can set isinclass from it. */
2534    
2535     if (codevalue != OP_XCLASS)
2536     {
2537 ph10 836 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2538 nigel 77 if (clen > 0)
2539     {
2540     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2541 ph10 836 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2542 nigel 77 }
2543     }
2544    
2545     /* An extended class may have a table or a list of single characters,
2546     ranges, or both, and it may be positive or negative. There's a
2547     function that sorts all this out. */
2548    
2549     else
2550     {
2551     ecode = code + GET(code, 1);
2552 ph10 836 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2553 nigel 77 }
2554    
2555     /* At this point, isinclass is set for all kinds of class, and ecode
2556     points to the byte after the end of the class. If there is a
2557     quantifier, this is where it will be. */
2558    
2559 ph10 530 next_state_offset = (int)(ecode - start_code);
2560 nigel 77
2561     switch (*ecode)
2562     {
2563     case OP_CRSTAR:
2564     case OP_CRMINSTAR:
2565     ADD_ACTIVE(next_state_offset + 1, 0);
2566     if (isinclass) { ADD_NEW(state_offset, 0); }
2567     break;
2568    
2569     case OP_CRPLUS:
2570     case OP_CRMINPLUS:
2571     count = current_state->count; /* Already matched */
2572     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2573     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2574     break;
2575    
2576     case OP_CRQUERY:
2577     case OP_CRMINQUERY:
2578     ADD_ACTIVE(next_state_offset + 1, 0);
2579     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2580     break;
2581    
2582     case OP_CRRANGE:
2583     case OP_CRMINRANGE:
2584     count = current_state->count; /* Already matched */
2585     if (count >= GET2(ecode, 1))
2586 ph10 836 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2587 nigel 77 if (isinclass)
2588     {
2589 ph10 836 int max = GET2(ecode, 1 + IMM2_SIZE);
2590 nigel 91 if (++count >= max && max != 0) /* Max 0 => no limit */
2591 ph10 836 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2592 nigel 77 else
2593     { ADD_NEW(state_offset, count); }
2594     }
2595     break;
2596    
2597     default:
2598     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2599     break;
2600     }
2601     }
2602     break;
2603    
2604     /* ========================================================================== */
2605     /* These are the opcodes for fancy brackets of various kinds. We have
2606 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2607     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2608 ph10 341 though the other "backtracking verbs" are not supported. */
2609 ph10 345
2610 ph10 341 case OP_FAIL:
2611 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2612 ph10 345 break;
2613 nigel 77
2614     case OP_ASSERT:
2615     case OP_ASSERT_NOT:
2616     case OP_ASSERTBACK:
2617     case OP_ASSERTBACK_NOT:
2618     {
2619     int rc;
2620     int local_offsets[2];
2621     int local_workspace[1000];
2622 ph10 836 const pcre_uchar *endasscode = code + GET(code, 1);
2623 nigel 77
2624     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2625    
2626     rc = internal_dfa_exec(
2627     md, /* static match data */
2628     code, /* this subexpression's code */
2629     ptr, /* where we currently are */
2630 ph10 530 (int)(ptr - start_subject), /* start offset */
2631 nigel 77 local_offsets, /* offset vector */
2632     sizeof(local_offsets)/sizeof(int), /* size of same */
2633     local_workspace, /* workspace vector */
2634     sizeof(local_workspace)/sizeof(int), /* size of same */
2635 ph10 642 rlevel); /* function recursion level */
2636 ph10 487
2637 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2638 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2639 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2640 nigel 77 }
2641     break;
2642    
2643     /*-----------------------------------------------------------------*/
2644     case OP_COND:
2645 nigel 93 case OP_SCOND:
2646 nigel 77 {
2647     int local_offsets[1000];
2648     int local_workspace[1000];
2649 ph10 406 int codelink = GET(code, 1);
2650 ph10 397 int condcode;
2651 ph10 406
2652 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2653 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2654 ph10 398 happen for the other conditions. */
2655 nigel 77
2656 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2657 ph10 406 {
2658     rrc = 0;
2659 ph10 836 if (PUBL(callout) != NULL)
2660 ph10 397 {
2661 zherczeg 850 PUBL(callout_block) cb;
2662 ph10 397 cb.version = 1; /* Version 1 of the callout block */
2663     cb.callout_number = code[LINK_SIZE+2];
2664     cb.offset_vector = offsets;
2665 zherczeg 852 #ifdef COMPILE_PCRE8
2666 ph10 397 cb.subject = (PCRE_SPTR)start_subject;
2667 zherczeg 852 #else
2668     cb.subject = (PCRE_SPTR16)start_subject;
2669     #endif
2670 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2671     cb.start_match = (int)(current_subject - start_subject);
2672     cb.current_position = (int)(ptr - start_subject);
2673 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2674     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2675     cb.capture_top = 1;
2676     cb.capture_last = -1;
2677     cb.callout_data = md->callout_data;
2678 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2679 ph10 836 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2680 ph10 397 }
2681 ph10 398 if (rrc > 0) break; /* Fail this thread */
2682 ph10 836 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2683 ph10 406 }
2684 ph10 398
2685 ph10 397 condcode = code[LINK_SIZE+1];
2686 ph10 406
2687 nigel 93 /* Back reference conditions are not supported */
2688 nigel 77
2689 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2690 ph10 459 return PCRE_ERROR_DFA_UCOND;
2691 nigel 93
2692     /* The DEFINE condition is always false */
2693    
2694     if (condcode == OP_DEF)
2695 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2696 nigel 93
2697     /* The only supported version of OP_RREF is for the value RREF_ANY,
2698     which means "test if in any recursion". We can't test for specifically
2699     recursed groups. */
2700    
2701 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2702 nigel 93 {
2703 ph10 836 int value = GET2(code, LINK_SIZE + 2);
2704 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2705 ph10 654 if (md->recursive != NULL)
2706 ph10 836 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2707 ph10 398 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2708 nigel 77 }
2709    
2710     /* Otherwise, the condition is an assertion */
2711    
2712     else
2713     {
2714     int rc;
2715 ph10 836 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2716     const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2717 nigel 77
2718     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2719    
2720     rc = internal_dfa_exec(
2721     md, /* fixed match data */
2722     asscode, /* this subexpression's code */
2723     ptr, /* where we currently are */
2724 ph10 530 (int)(ptr - start_subject), /* start offset */
2725 nigel 77 local_offsets, /* offset vector */
2726     sizeof(local_offsets)/sizeof(int), /* size of same */
2727     local_workspace, /* workspace vector */
2728     sizeof(local_workspace)/sizeof(int), /* size of same */
2729 ph10 642 rlevel); /* function recursion level */
2730 nigel 77
2731 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2732 nigel 77 if ((rc >= 0) ==
2733     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2734 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2735 nigel 77 else
2736 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2737 nigel 77 }
2738     }
2739     break;
2740    
2741     /*-----------------------------------------------------------------*/
2742     case OP_RECURSE:
2743     {
2744 ph10 654 dfa_recursion_info *ri;
2745 nigel 77 int local_offsets[1000];
2746     int local_workspace[1000];
2747 ph10 836 const pcre_uchar *callpat = start_code + GET(code, 1);
2748 ph10 654 int recno = (callpat == md->start_code)? 0 :
2749     GET2(callpat, 1 + LINK_SIZE);
2750 nigel 77 int rc;
2751    
2752 ph10 642 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2753 ph10 654
2754 ph10 642 /* Check for repeating a recursion without advancing the subject
2755     pointer. This should catch convoluted mutual recursions. (Some simple
2756     cases are caught at compile time.) */
2757 nigel 77
2758 ph10 654 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2759     if (recno == ri->group_num && ptr == ri->subject_position)
2760     return PCRE_ERROR_RECURSELOOP;
2761    
2762     /* Remember this recursion and where we started it so as to
2763 ph10 642 catch infinite loops. */
2764 ph10 654
2765 ph10 642 new_recursive.group_num = recno;
2766     new_recursive.subject_position = ptr;
2767     new_recursive.prevrec = md->recursive;
2768 ph10 654 md->recursive = &new_recursive;
2769 ph10 642
2770 nigel 77 rc = internal_dfa_exec(
2771     md, /* fixed match data */
2772 ph10 642 callpat, /* this subexpression's code */
2773 nigel 77 ptr, /* where we currently are */
2774 ph10 530 (int)(ptr - start_subject), /* start offset */
2775 nigel 77 local_offsets, /* offset vector */
2776     sizeof(local_offsets)/sizeof(int), /* size of same */
2777     local_workspace, /* workspace vector */
2778     sizeof(local_workspace)/sizeof(int), /* size of same */
2779 ph10 642 rlevel); /* function recursion level */
2780 nigel 77
2781 ph10 642 md->recursive = new_recursive.prevrec; /* Done this recursion */
2782 nigel 77
2783 ph10 654 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2784 ph10 642 rc));
2785    
2786 nigel 77 /* Ran out of internal offsets */
2787    
2788     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2789    
2790     /* For each successful matched substring, set up the next state with a
2791     count of characters to skip before trying it. Note that the count is in
2792     characters, not bytes. */
2793    
2794     if (rc > 0)
2795     {
2796     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2797     {
2798 ph10 894 int charcount = local_offsets[rc+1] - local_offsets[rc];
2799     #ifdef SUPPORT_UTF
2800 ph10 979 if (utf)
2801     {
2802     const pcre_uchar *p = start_subject + local_offsets[rc];
2803     const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2804     while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2805     }
2806 ph10 836 #endif
2807 nigel 77 if (charcount > 0)
2808     {
2809     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2810     }
2811     else
2812     {
2813     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2814     }
2815     }
2816     }
2817     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2818     }
2819     break;
2820    
2821     /*-----------------------------------------------------------------*/
2822 ph10 604 case OP_BRAPOS:
2823     case OP_SBRAPOS:
2824     case OP_CBRAPOS:
2825     case OP_SCBRAPOS:
2826 ph10 654 case OP_BRAPOSZERO:
2827 ph10 604 {
2828     int charcount, matched_count;
2829 ph10 836 const pcre_uchar *local_ptr = ptr;
2830 ph10 604 BOOL allow_zero;
2831 ph10 654
2832 ph10 604 if (codevalue == OP_BRAPOSZERO)
2833     {
2834     allow_zero = TRUE;
2835     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2836     }
2837 ph10 654 else allow_zero = FALSE;
2838    
2839     /* Loop to match the subpattern as many times as possible as if it were
2840     a complete pattern. */
2841    
2842 ph10 604 for (matched_count = 0;; matched_count++)
2843     {
2844     int local_offsets[2];
2845     int local_workspace[1000];
2846 ph10 654
2847 ph10 604 int rc = internal_dfa_exec(
2848     md, /* fixed match data */
2849     code, /* this subexpression's code */
2850     local_ptr, /* where we currently are */
2851     (int)(ptr - start_subject), /* start offset */
2852     local_offsets, /* offset vector */
2853     sizeof(local_offsets)/sizeof(int), /* size of same */
2854     local_workspace, /* workspace vector */
2855     sizeof(local_workspace)/sizeof(int), /* size of same */
2856 ph10 642 rlevel); /* function recursion level */
2857 ph10 654
2858 ph10 604 /* Failed to match */
2859 ph10 654
2860     if (rc < 0)
2861 ph10 604 {
2862     if (rc != PCRE_ERROR_NOMATCH) return rc;
2863     break;
2864 ph10 654 }
2865    
2866 ph10 604 /* Matched: break the loop if zero characters matched. */
2867 ph10 654
2868 ph10 604 charcount = local_offsets[1] - local_offsets[0];
2869 ph10 654 if (charcount == 0) break;
2870 ph10 604 local_ptr += charcount; /* Advance temporary position ptr */
2871 ph10 654 }
2872 ph10 604
2873     /* At this point we have matched the subpattern matched_count
2874 ph10 654 times, and local_ptr is pointing to the character after the end of the
2875     last match. */
2876 ph10 604
2877     if (matched_count > 0 || allow_zero)
2878 ph10 654 {
2879 ph10 836 const pcre_uchar *end_subpattern = code;
2880 ph10 604 int next_state_offset;
2881 ph10 654
2882 ph10 604 do { end_subpattern += GET(end_subpattern, 1); }
2883     while (*end_subpattern == OP_ALT);
2884     next_state_offset =
2885     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2886    
2887     /* Optimization: if there are no more active states, and there
2888     are no new states yet set up, then skip over the subject string
2889     right here, to save looping. Otherwise, set up the new state to swing
2890     into action when the end of the matched substring is reached. */
2891    
2892     if (i + 1 >= active_count && new_count == 0)
2893     {
2894     ptr = local_ptr;
2895     clen = 0;
2896     ADD_NEW(next_state_offset, 0);
2897     }
2898     else
2899     {
2900 ph10 836 const pcre_uchar *p = ptr;
2901     const pcre_uchar *pp = local_ptr;
2902     charcount = (int)(pp - p);
2903     #ifdef SUPPORT_UTF
2904 ph10 979 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2905 ph10 836 #endif
2906 ph10 604 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2907     }
2908 ph10 654 }
2909     }
2910 ph10 604 break;
2911 ph10 654
2912 ph10 604 /*-----------------------------------------------------------------*/
2913 nigel 77 case OP_ONCE:
2914 ph10 733 case OP_ONCE_NC:
2915 nigel 77 {
2916     int local_offsets[2];
2917     int local_workspace[1000];
2918    
2919     int rc = internal_dfa_exec(
2920     md, /* fixed match data */
2921     code, /* this subexpression's code */
2922     ptr, /* where we currently are */
2923 ph10 530 (int)(ptr - start_subject), /* start offset */
2924 nigel 77 local_offsets, /* offset vector */
2925     sizeof(local_offsets)/sizeof(int), /* size of same */
2926     local_workspace, /* workspace vector */
2927     sizeof(local_workspace)/sizeof(int), /* size of same */
2928 ph10 642 rlevel); /* function recursion level */
2929 nigel 77
2930     if (rc >= 0)
2931     {
2932 ph10 836 const pcre_uchar *end_subpattern = code;
2933 nigel 77 int charcount = local_offsets[1] - local_offsets[0];
2934     int next_state_offset, repeat_state_offset;
2935    
2936     do { end_subpattern += GET(end_subpattern, 1); }
2937     while (*end_subpattern == OP_ALT);
2938 ph10 535 next_state_offset =
2939 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2940 nigel 77
2941     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2942     arrange for the repeat state also to be added to the relevant list.
2943     Calculate the offset, or set -1 for no repeat. */
2944    
2945     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2946     *end_subpattern == OP_KETRMIN)?
2947 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2948 nigel 77
2949     /* If we have matched an empty string, add the next state at the
2950     current character pointer. This is important so that the duplicate
2951     checking kicks in, which is what breaks infinite loops that match an
2952     empty string. */
2953    
2954     if (charcount == 0)
2955     {
2956     ADD_ACTIVE(next_state_offset, 0);
2957     }
2958    
2959     /* Optimization: if there are no more active states, and there
2960     are no new states yet set up, then skip over the subject string
2961     right here, to save looping. Otherwise, set up the new state to swing
2962 ph10 604 into action when the end of the matched substring is reached. */
2963 nigel 77
2964     else if (i + 1 >= active_count && new_count == 0)
2965     {
2966     ptr += charcount;
2967     clen = 0;
2968     ADD_NEW(next_state_offset, 0);
2969    
2970     /* If we are adding a repeat state at the new character position,
2971     we must fudge things so that it is the only current state.
2972     Otherwise, it might be a duplicate of one we processed before, and
2973     that would cause it to be skipped. */
2974    
2975     if (repeat_state_offset >= 0)
2976     {
2977     next_active_state = active_states;
2978     active_count = 0;
2979     i = -1;
2980     ADD_ACTIVE(repeat_state_offset, 0);
2981     }
2982     }
2983     else
2984     {
2985 ph10 836 #ifdef SUPPORT_UTF
2986 ph10 979 if (utf)
2987     {
2988     const pcre_uchar *p = start_subject + local_offsets[0];
2989     const pcre_uchar *pp = start_subject + local_offsets[1];
2990     while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2991     }
2992 ph10 836 #endif
2993 nigel 77 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2994     if (repeat_state_offset >= 0)
2995     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2996     }
2997     }
2998     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2999     }
3000     break;
3001    
3002    
3003     /* ========================================================================== */
3004     /* Handle callouts */
3005    
3006     case OP_CALLOUT:
3007 ph10 406 rrc = 0;
3008 ph10 836 if (PUBL(callout) != NULL)
3009 nigel 77 {
3010 zherczeg 850 PUBL(callout_block) cb;
3011 nigel 77 cb.version = 1; /* Version 1 of the callout block */
3012     cb.callout_number = code[1];
3013     cb.offset_vector = offsets;
3014 zherczeg 852 #ifdef COMPILE_PCRE8
3015 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
3016 zherczeg 852 #else
3017     cb.subject = (PCRE_SPTR16)start_subject;
3018     #endif
3019 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
3020     cb.start_match = (int)(current_subject - start_subject);
3021     cb.current_position = (int)(ptr - start_subject);
3022 nigel 77 cb.pattern_position = GET(code, 2);
3023     cb.next_item_length = GET(code, 2 + LINK_SIZE);
3024     cb.capture_top = 1;
3025     cb.capture_last = -1;
3026     cb.callout_data = md->callout_data;
3027 ph10 654 cb.mark = NULL; /* No (*MARK) support */
3028 ph10 836 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3029 ph10 406 }
3030     if (rrc == 0)
3031 ph10 836 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3032 nigel 77 break;
3033    
3034    
3035     /* ========================================================================== */
3036     default: /* Unsupported opcode */
3037     return PCRE_ERROR_DFA_UITEM;
3038     }
3039    
3040     NEXT_ACTIVE_STATE: continue;
3041    
3042     } /* End of loop scanning active states */
3043    
3044     /* We have finished the processing at the current subject character. If no
3045     new states have been set for the next character, we have found all the
3046     matches that we are going to find. If we are at the top level and partial
3047 ph10 463 matching has been requested, check for appropriate conditions.
3048    
3049 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
3050     character. If it is equal to the original active_count (saved in
3051     workspace[1]) it means that (*F) was found on every active state. In this
3052 ph10 463 case we don't want to give a partial match.
3053 nigel 77
3054 ph10 463 The "could_continue" variable is true if a state could have continued but
3055     for the fact that the end of the subject was reached. */
3056 ph10 975
3057 nigel 77 if (new_count <= 0)
3058     {
3059 ph10 427 if (rlevel == 1 && /* Top level, and */
3060 ph10 919 could_continue && /* Some could go on, and */
3061 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
3062 ph10 427 ( /* either... */
3063     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3064     || /* or... */
3065     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3066     match_count < 0) /* no matches */
3067     ) && /* And... */
3068 ph10 916 (
3069 ph10 919 partial_newline || /* Either partial NL */
3070     ( /* or ... */
3071     ptr >= end_subject && /* End of subject and */
3072     ptr > md->start_used_ptr) /* Inspected non-empty string */
3073 ph10 975 )
3074     )
3075 nigel 77 {
3076     if (offsetcount >= 2)
3077     {
3078 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
3079     offsets[1] = (int)(end_subject - start_subject);
3080 nigel 77 }
3081     match_count = PCRE_ERROR_PARTIAL;
3082     }
3083    
3084     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3085     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3086     rlevel*2-2, SP));
3087 nigel 91 break; /* In effect, "return", but see the comment below */
3088 nigel 77 }
3089    
3090     /* One or more states are active for the next character. */
3091    
3092     ptr += clen; /* Advance to next subject character */
3093     } /* Loop to move along the subject string */
3094    
3095 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
3096     if we use "return" above, we have compiler trouble. Some compilers warn if
3097     there's nothing here because they think the function doesn't return a value. On
3098     the other hand, if we put a dummy statement here, some more clever compilers
3099     complain that it can't be reached. Sigh. */
3100 nigel 77
3101 nigel 91 return match_count;
3102 nigel 77 }
3103    
3104    
3105    
3106    
3107     /*************************************************
3108     * Execute a Regular Expression - DFA engine *
3109     *************************************************/
3110    
3111     /* This external function applies a compiled re to a subject string using a DFA
3112     engine. This function calls the internal function multiple times if the pattern
3113     is not anchored.
3114    
3115     Arguments:
3116     argument_re points to the compiled expression
3117 ph10 97 extra_data points to extra data or is NULL
3118 nigel 77 subject points to the subject string
3119     length length of subject string (may contain binary zeros)
3120     start_offset where to start in the subject string
3121     options option bits
3122     offsets vector of match offsets
3123     offsetcount size of same
3124     workspace workspace vector
3125     wscount size of same
3126    
3127     Returns: > 0 => number of match offset pairs placed in offsets
3128     = 0 => offsets overflowed; longest matches are present
3129     -1 => failed to match
3130     < -1 => some kind of unexpected problem
3131     */
3132    
3133 ph10 836 #ifdef COMPILE_PCRE8
3134 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3135 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3136     const char *subject, int length, int start_offset, int options, int *offsets,
3137     int offsetcount, int *workspace, int wscount)
3138 ph10 836 #else
3139     PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3140 zherczeg 852 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3141 ph10 836 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3142     int offsetcount, int *workspace, int wscount)
3143     #endif
3144 nigel 77 {
3145 zherczeg 852 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3146 nigel 77 dfa_match_data match_block;
3147 nigel 91 dfa_match_data *md = &match_block;
3148 ph10 836 BOOL utf, anchored, startline, firstline;
3149     const pcre_uchar *current_subject, *end_subject;
3150 nigel 77 const pcre_study_data *study = NULL;
3151    
3152 ph10 836 const pcre_uchar *req_char_ptr;
3153     const pcre_uint8 *start_bits = NULL;
3154     BOOL has_first_char = FALSE;
3155     BOOL has_req_char = FALSE;
3156     pcre_uchar first_char = 0;
3157     pcre_uchar first_char2 = 0;
3158     pcre_uchar req_char = 0;
3159     pcre_uchar req_char2 = 0;
3160 nigel 91 int newline;
3161 nigel 77
3162     /* Plausibility checks */
3163    
3164     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3165     if (re == NULL || subject == NULL || workspace == NULL ||
3166     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3167     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3168     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3169 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3170 nigel 77
3171 ph10 960 /* Check that the first field in the block is the magic number. If it is not,
3172     return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3173     REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3174     means that the pattern is likely compiled with different endianness. */
3175 nigel 77
3176 ph10 960 if (re->magic_number != MAGIC_NUMBER)
3177     return re->magic_number == REVERSED_MAGIC_NUMBER?
3178     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3179     if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3180    
3181 ph10 975 /* If restarting after a partial match, do some sanity checks on the contents
3182 ph10 960 of the workspace. */
3183    
3184     if ((options & PCRE_DFA_RESTART) != 0)
3185     {
3186 ph10 975 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3187 ph10 960 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3188 ph10 975 return PCRE_ERROR_DFA_BADRESTART;
3189     }
3190 ph10 960
3191     /* Set up study, callout, and table data */
3192    
3193 nigel 91 md->tables = re->tables;
3194     md->callout_data = NULL;
3195 nigel 77
3196     if (extra_data != NULL)
3197     {
3198     unsigned int flags = extra_data->flags;
3199     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3200     study = (const pcre_study_data *)extra_data->study_data;
3201     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3202 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3203     return PCRE_ERROR_DFA_UMLIMIT;
3204 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3205 nigel 91 md->callout_data = extra_data->callout_data;
3206 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3207 nigel 91 md->tables = extra_data->tables;
3208 nigel 77 }
3209 ph10 461
3210 nigel 77 /* Set some local values */
3211    
3212 ph10 836 current_subject = (const pcre_uchar *)subject + start_offset;
3213     end_subject = (const pcre_uchar *)subject + length;
3214     req_char_ptr = current_subject - 1;
3215 nigel 77
3216 ph10 836 #ifdef SUPPORT_UTF
3217     /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3218     utf = (re->options & PCRE_UTF8) != 0;
3219 nigel 91 #else
3220 ph10 836 utf = FALSE;
3221 nigel 91 #endif
3222 nigel 77
3223 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3224     (re->options & PCRE_ANCHORED) != 0;
3225    
3226 nigel 77 /* The remaining fixed data for passing around. */
3227    
3228 ph10 836 md->start_code = (const pcre_uchar *)argument_re +
3229 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3230 ph10 836 md->start_subject = (const pcre_uchar *)subject;
3231 nigel 91 md->end_subject = end_subject;
3232 ph10 442 md->start_offset = start_offset;
3233 nigel 91 md->moptions = options;
3234     md->poptions = re->options;
3235 nigel 77
3236 ph10 231 /* If the BSR option is not set at match time, copy what was set
3237     at compile time. */
3238    
3239     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3240     {
3241     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3242     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3243     #ifdef BSR_ANYCRLF
3244     else md->moptions |= PCRE_BSR_ANYCRLF;
3245 ph10 243 #endif
3246     }
3247 ph10 231
3248 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3249     nothing is set at run time, whatever was used at compile time applies. */
3250 nigel 91
3251 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3252 nigel 93 PCRE_NEWLINE_BITS)
3253 nigel 91 {
3254 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3255 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3256     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3257 nigel 91 case PCRE_NEWLINE_CR+
3258 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3259 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3260 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3261 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3262 nigel 91 }
3263    
3264 ph10 149 if (newline == -2)
3265 nigel 91 {
3266 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3267     }
3268     else if (newline < 0)
3269     {
3270 nigel 93 md->nltype = NLTYPE_ANY;
3271 nigel 91 }
3272     else
3273     {
3274 nigel 93 md->nltype = NLTYPE_FIXED;
3275     if (newline > 255)
3276     {
3277     md->nllen = 2;
3278     md->nl[0] = (newline >> 8) & 255;
3279     md->nl[1] = newline & 255;
3280     }
3281     else
3282     {
3283     md->nllen = 1;
3284     md->nl[0] = newline;
3285     }
3286 nigel 91 }
3287    
3288 nigel 77 /* Check a UTF string if required. If it is invalid and offsetcount >= 2, the
3289     error offset and error code are passed back in offsets[0] and offsets[1]. */
3290    
3291 ph10 836 #ifdef SUPPORT_UTF
3292     if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3293 nigel 77 {
3294 ph10 654 int erroroffset;
3295 ph10 836 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3296 ph10 606 if (errorcode != 0)
3297 ph10 598 {
3298     if (offsetcount >= 2)
3299     {
3300 ph10 606 offsets[0] = erroroffset;
3301 ph10 598 offsets[1] = errorcode;
3302 ph10 654 }
3303 ph10 598 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3304 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3305 ph10 654 }
3306 ph10 606 if (start_offset > 0 && start_offset < length &&
3307 ph10 836 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3308 ph10 606 return PCRE_ERROR_BADUTF8_OFFSET;
3309 nigel 77 }
3310     #endif
3311    
3312     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3313     is a feature that makes it possible to save compiled regex and re-use them
3314     in other programs later. */
3315    
3316 ph10 836 if (md->tables == NULL) md->tables = PRIV(default_tables);
3317 nigel 77
3318 ph10 881 /* The "must be at the start of a line" flags are used in a loop when finding
3319     where to start. */
3320 nigel 77
3321 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3322 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3323    
3324     /* Set up the first character to match, if available. The first_char value is
3325     never set for an anchored regular expression, but the anchoring may be forced
3326     at run time, so we have to test for anchoring. The first char may be unset for
3327     an unanchored pattern, of course. If there's no first char and the pattern was
3328     studied, there may be a bitmap of possible first characters. */
3329    
3330     if (!anchored)
3331     {
3332 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3333 nigel 77 {
3334 ph10 836 has_first_char = TRUE;
3335 ph10 904 first_char = first_char2 = (pcre_uchar)(re->first_char);
3336 ph10 836 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3337     {
3338     first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3339     #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3340     if (utf && first_char > 127)
3341     first_char2 = UCD_OTHERCASE(first_char);
3342     #endif
3343     }
3344 nigel 77 }
3345     else
3346     {
3347 ph10 455 if (!startline && study != NULL &&
3348     (study->flags & PCRE_STUDY_MAPPED) != 0)
3349 nigel 77 start_bits = study->start_bits;
3350     }
3351     }
3352    
3353     /* For anchored or unanchored matches, there may be a "last known required
3354     character" set. */
3355    
3356 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3357 nigel 77 {
3358 ph10 836 has_req_char = TRUE;
3359 ph10 904 req_char = req_char2 = (pcre_uchar)(re->req_char);
3360 ph10 836 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3361     {
3362     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3363     #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3364     if (utf && req_char > 127)
3365     req_char2 = UCD_OTHERCASE(req_char);
3366     #endif
3367     }
3368 nigel 77 }
3369    
3370     /* Call the main matching function, looping for a non-anchored regex after a
3371 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3372     a match. */
3373 nigel 77
3374     for (;;)
3375     {
3376     int rc;
3377    
3378     if ((options & PCRE_DFA_RESTART) == 0)
3379     {
3380 ph10 836 const pcre_uchar *save_end_subject = end_subject;
3381 nigel 77
3382 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3383     line of a multiline string. Implement this by temporarily adjusting
3384     end_subject so that we stop scanning at a newline. If the match fails at
3385     the newline, later code breaks this loop. */
3386 nigel 77
3387     if (firstline)
3388     {
3389 ph10 836 PCRE_PUCHAR t = current_subject;
3390     #ifdef SUPPORT_UTF
3391     if (utf)
3392 ph10 371 {
3393     while (t < md->end_subject && !IS_NEWLINE(t))
3394 ph10 365 {
3395     t++;
3396 ph10 836 ACROSSCHAR(t < end_subject, *t, t++);
3397 ph10 371 }
3398 ph10 365 }
3399     else
3400 ph10 371 #endif
3401 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3402 nigel 77 end_subject = t;
3403     }
3404 ph10 392
3405 ph10 389 /* There are some optimizations that avoid running the match if a known
3406 ph10 455 starting point is not found. However, there is an option that disables
3407 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3408 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3409     match-time options. */
3410 nigel 77
3411 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3412 ph10 392 {
3413 ph10 836 /* Advance to a known first char. */
3414 ph10 392
3415 ph10 836 if (has_first_char)
3416 nigel 77 {
3417 ph10 836 if (first_char != first_char2)
3418 ph10 389 while (current_subject < end_subject &&
3419 ph10 836 *current_subject != first_char && *current_subject != first_char2)
3420 ph10 389 current_subject++;
3421     else
3422 ph10 392 while (current_subject < end_subject &&
3423 ph10 836 *current_subject != first_char)
3424 ph10 389 current_subject++;
3425     }
3426 ph10 392
3427 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3428 ph10 392
3429 ph10 389 else if (startline)
3430     {
3431     if (current_subject > md->start_subject + start_offset)
3432     {
3433 ph10 836 #ifdef SUPPORT_UTF
3434     if (utf)
3435 ph10 365 {
3436 ph10 392 while (current_subject < end_subject &&
3437 ph10 389 !WAS_NEWLINE(current_subject))
3438     {
3439 ph10 365 current_subject++;
3440 ph10 836 ACROSSCHAR(current_subject < end_subject, *current_subject,
3441     current_subject++);
3442 ph10 389 }
3443 ph10 371 }
3444 ph10 389 else
3445     #endif
3446     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3447     current_subject++;
3448 ph10 392
3449 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3450     ANYCRLF, and we are now at a LF, advance the match position by one
3451     more character. */
3452 ph10 392
3453 ph10 391 if (current_subject[-1] == CHAR_CR &&
3454 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3455     current_subject < end_subject &&
3456 ph10 391 *current_subject == CHAR_NL)
3457 ph10 389 current_subject++;
3458 ph10 365 }
3459 nigel 77 }
3460 ph10 392
3461 ph10 389 /* Or to a non-unique first char after study */
3462 ph10 392
3463 ph10 389 else if (start_bits != NULL)
3464 nigel 77 {
3465 ph10 389 while (current_subject < end_subject)
3466     {
3467     register unsigned int c = *current_subject;
3468 ph10 836 #ifndef COMPILE_PCRE8
3469     if (c > 255) c = 255;
3470     #endif
3471 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3472 ph10 538 {
3473     current_subject++;
3474 ph10 836 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
3475     /* In non 8-bit mode, the iteration will stop for
3476     characters > 255 at the beginning or not stop at all. */
3477     if (utf)
3478     ACROSSCHAR(current_subject < end_subject, *current_subject,
3479     current_subject++);
3480 ph10 545 #endif
3481 ph10 538 }
3482     else break;
3483 ph10 389 }
3484 nigel 77 }
3485 ph10 392 }
3486 nigel 77
3487     /* Restore fudged end_subject */
3488    
3489     end_subject = save_end_subject;
3490    
3491 ph10 461 /* The following two optimizations are disabled for partial matching or if
3492     disabling is explicitly requested (and of course, by the test above, this
3493 ph10 455 code is not obeyed when restarting after a partial match). */
3494 ph10 461
3495 ph10 728 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3496 ph10 455 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3497 ph10 461 {
3498 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3499     is a lower bound; no actual string of that length may actually match the
3500     pattern. Although the value is, strictly, in characters, we treat it as
3501     bytes to avoid spending too much time in this optimization. */
3502 nigel 77
3503 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3504 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3505 ph10 455 return PCRE_ERROR_NOMATCH;
3506 ph10 461
3507 ph10 836 /* If req_char is set, we know that that character must appear in the
3508     subject for the match to succeed. If the first character is set, req_char
3509 ph10 455 must be later in the subject; otherwise the test starts at the match
3510     point. This optimization can save a huge amount of work in patterns with
3511     nested unlimited repeats that aren't going to match. Writing separate
3512     code for cased/caseless versions makes it go faster, as does using an
3513     autoincrement and backing off on a match.
3514 ph10 461
3515 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3516     can take a long time, and give bad performance on quite ordinary
3517     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3518     string... so we don't do this when the string is sufficiently long. */
3519 ph10 461
3520 ph10 836 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3521 nigel 77 {
3522 ph10 836 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3523 ph10 461
3524 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3525     place we found it at last time. */
3526 ph10 461
3527 ph10 836 if (p > req_char_ptr)
3528 nigel 77 {
3529 ph10 836 if (req_char != req_char2)
3530 ph10 455 {
3531     while (p < end_subject)
3532     {
3533     register int pp = *p++;
3534 ph10 836 if (pp == req_char || pp == req_char2) { p--; break; }
3535 ph10 455 }
3536     }
3537     else
3538     {
3539     while (p < end_subject)
3540     {
3541 ph10 836 if (*p++ == req_char) { p--; break; }
3542 ph10 455 }
3543     }
3544 ph10 461
3545 ph10 455 /* If we can't find the required character, break the matching loop,
3546     which will cause a return of PCRE_ERROR_NOMATCH. */
3547 ph10 461
3548 ph10 455 if (p >= end_subject) break;
3549 ph10 461
3550 ph10 455 /* If we have found the required character, save the point where we
3551     found it, so that we don't search again next time round the loop if
3552     the start hasn't passed this character yet. */
3553 ph10 461
3554 ph10 836 req_char_ptr = p;
3555 nigel 77 }
3556 ph10 461 }
3557 nigel 77 }
3558 ph10 455 } /* End of optimizations that are done when not restarting */
3559 nigel 77
3560     /* OK, now we can do the business */
3561    
3562 ph10 435 md->start_used_ptr = current_subject;
3563 ph10 654 md->recursive = NULL;
3564 ph10 461
3565 nigel 77 rc = internal_dfa_exec(
3566 nigel 91 md, /* fixed match data */
3567     md->start_code, /* this subexpression's code */
3568     current_subject, /* where we currently are */
3569     start_offset, /* start offset in subject */
3570     offsets, /* offset vector */
3571     offsetcount, /* size of same */
3572     workspace, /* workspace vector */
3573     wscount, /* size of same */
3574 ph10 642 0); /* function recurse level */
3575 nigel 77
3576     /* Anything other than "no match" means we are done, always; otherwise, carry
3577     on only if not anchored. */
3578    
3579     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3580    
3581     /* Advance to the next subject character unless we are at the end of a line
3582     and firstline is set. */
3583    
3584 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3585 nigel 77 current_subject++;
3586 ph10 836 #ifdef SUPPORT_UTF
3587     if (utf)
3588 nigel 77 {
3589 ph10 836 ACROSSCHAR(current_subject < end_subject, *current_subject,
3590     current_subject++);
3591 nigel 77 }
3592 ph10 836 #endif
3593 nigel 77 if (current_subject > end_subject) break;
3594    
3595 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3596 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3597     or ANY or ANYCRLF, advance the match position by one more character. */
3598 nigel 93
3599 ph10 391 if (current_subject[-1] == CHAR_CR &&
3600 ph10 226 current_subject < end_subject &&
3601 ph10 391 *current_subject == CHAR_NL &&
3602 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3603 ph10 226 (md->nltype == NLTYPE_ANY ||
3604     md->nltype == NLTYPE_ANYCRLF ||
3605     md->nllen == 2))
3606 nigel 93 current_subject++;
3607    
3608     } /* "Bumpalong" loop */
3609    
3610 nigel 77 return PCRE_ERROR_NOMATCH;
3611     }
3612    
3613     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12