/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 782 - (hide annotations) (download)
Sat Dec 3 23:58:37 2011 UTC (17 months, 2 weeks ago) by zherczeg
File MIME type: text/plain
File size: 120437 byte(s)
Start working on UTF-16. Updating macros and adding new ones.
1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 598 Copyright (c) 1997-2011 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
101 ph10 178 #define OP_PROP_EXTRA 300
102     #define OP_EXTUNI_EXTRA 320
103     #define OP_ANYNL_EXTRA 340
104     #define OP_HSPACE_EXTRA 360
105     #define OP_VSPACE_EXTRA 380
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
116 ph10 756 static const pcre_uint8 coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 nigel 77 1, /* Char */
126 ph10 602 1, /* Chari */
127 nigel 77 1, /* not */
128 ph10 602 1, /* noti */
129 nigel 77 /* Positive single-char repeats */
130     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131     3, 3, 3, /* upto, minupto, exact */
132 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134     3, 3, 3, /* upto I, minupto I, exact I */
135     1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 nigel 77 /* Negative single-char repeats - only for chars < 256 */
137     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* NOT upto, minupto, exact */
139 ph10 602 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140     1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141     3, 3, 3, /* NOT upto I, minupto I, exact I */
142     1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 nigel 77 /* Positive type repeats */
144     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145     3, 3, 3, /* Type upto, minupto, exact */
146 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 nigel 77 /* Character class & ref repeats */
148     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149     0, 0, /* CRRANGE, CRMINRANGE */
150     0, /* CLASS */
151     0, /* NCLASS */
152     0, /* XCLASS - variable length */
153     0, /* REF */
154 ph10 602 0, /* REFI */
155 nigel 77 0, /* RECURSE */
156     0, /* CALLOUT */
157     0, /* Alt */
158     0, /* Ket */
159     0, /* KetRmax */
160     0, /* KetRmin */
161 ph10 604 0, /* KetRpos */
162 ph10 637 0, /* Reverse */
163 nigel 77 0, /* Assert */
164     0, /* Assert not */
165     0, /* Assert behind */
166     0, /* Assert behind not */
167 ph10 723 0, 0, /* ONCE, ONCE_NC */
168     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
169 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
170 ph10 498 0, 0, /* CREF, NCREF */
171     0, 0, /* RREF, NRREF */
172 nigel 93 0, /* DEF */
173 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
174 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
175     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
176     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
177     0, 0 /* CLOSE, SKIPZERO */
178 nigel 77 };
179    
180 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
181 ph10 462 remember the fact that a character could have been inspected when the end of
182 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
183     two tables that follow must also be modified. */
184 ph10 462
185 ph10 756 static const pcre_uint8 poptable[] = {
186 ph10 462 0, /* End */
187 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
188 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
189     1, 1, 1, /* Any, AllAny, Anybyte */
190 ph10 498 1, 1, /* \P, \p */
191 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
192 ph10 498 1, /* \X */
193 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
194 ph10 462 1, /* Char */
195 ph10 602 1, /* Chari */
196 ph10 462 1, /* not */
197 ph10 602 1, /* noti */
198 ph10 462 /* Positive single-char repeats */
199     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200     1, 1, 1, /* upto, minupto, exact */
201     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
202 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
203     1, 1, 1, /* upto I, minupto I, exact I */
204     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
205 ph10 462 /* Negative single-char repeats - only for chars < 256 */
206     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
207     1, 1, 1, /* NOT upto, minupto, exact */
208     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
209 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
210     1, 1, 1, /* NOT upto I, minupto I, exact I */
211     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
212 ph10 462 /* Positive type repeats */
213     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
214     1, 1, 1, /* Type upto, minupto, exact */
215     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
216     /* Character class & ref repeats */
217     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
218     1, 1, /* CRRANGE, CRMINRANGE */
219     1, /* CLASS */
220     1, /* NCLASS */
221     1, /* XCLASS - variable length */
222     0, /* REF */
223 ph10 602 0, /* REFI */
224 ph10 462 0, /* RECURSE */
225     0, /* CALLOUT */
226     0, /* Alt */
227     0, /* Ket */
228     0, /* KetRmax */
229     0, /* KetRmin */
230 ph10 604 0, /* KetRpos */
231 ph10 637 0, /* Reverse */
232 ph10 462 0, /* Assert */
233     0, /* Assert not */
234     0, /* Assert behind */
235     0, /* Assert behind not */
236 ph10 723 0, 0, /* ONCE, ONCE_NC */
237     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
238 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
239 ph10 498 0, 0, /* CREF, NCREF */
240     0, 0, /* RREF, NRREF */
241 ph10 462 0, /* DEF */
242 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
243 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
244     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
245     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
246     0, 0 /* CLOSE, SKIPZERO */
247 ph10 462 };
248    
249 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250     and \w */
251    
252 ph10 756 static const pcre_uint8 toptable1[] = {
253 ph10 168 0, 0, 0, 0, 0, 0,
254 nigel 77 ctype_digit, ctype_digit,
255     ctype_space, ctype_space,
256     ctype_word, ctype_word,
257 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
258 nigel 77 };
259    
260 ph10 756 static const pcre_uint8 toptable2[] = {
261 ph10 168 0, 0, 0, 0, 0, 0,
262 nigel 77 ctype_digit, 0,
263     ctype_space, 0,
264     ctype_word, 0,
265 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
266 nigel 77 };
267    
268    
/* One entry in a state list: the data for a single active path through the
match tree. Every member must be an int, because these blocks are laid out
directly in the caller-supplied workspace, which is a vector of ints. */

struct stateblock {
  int offset;                     /* Where the opcode is (an offset) */
  int count;                      /* Repeat counter */
  int data;                       /* Extra data, used by some states only */
};

typedef struct stateblock stateblock;

/* How many workspace ints a single stateblock takes up */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
281    
282    
283 ph10 475 #ifdef PCRE_DEBUG
284 nigel 77 /*************************************************
285     * Print character string *
286     *************************************************/
287    
/* Debugging aid (compiled only under PCRE_DEBUG): write a counted string to a
stream. Values that isprint() accepts are written as they are; anything else
is written as a \x escape with at least two lower-case hex digits.

Arguments:
  p           first byte of the string
  length      how many bytes to write; nothing is written if this is <= 0
  f           the output stream

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;
  if (!isprint(c))
    fprintf(f, "\\x%02x", c);
  else
    fprintf(f, "%c", c);
  }
}
310     #endif
311    
312    
313    
314     /*************************************************
315     * Execute a Regular Expression - DFA engine *
316     *************************************************/
317    
318     /* This internal function applies a compiled pattern to a subject string,
319     starting at a given point, using a DFA engine. This function is called from the
320     external one, possibly multiple times if the pattern is not anchored. The
321     function calls itself recursively for some kinds of subpattern.
322    
323     Arguments:
324     md the match_data block with fixed information
325     this_start_code the opening bracket of this subexpression's code
326     current_subject where we currently are in the subject string
327     start_offset start offset in the subject string
328     offsets vector to contain the matching string offsets
329     offsetcount size of same
330     workspace vector of workspace
331     wscount size of same
332     rlevel function call recursion level
333    
334 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
335 ph10 341 = 0 => offsets overflowed; longest matches are present
336 nigel 77 -1 => failed to match
337     < -1 => some kind of unexpected problem
338    
339     The following macros are used for adding states to the two state vectors (one
340     for the current character, one for the following character). */
341    
/* ADD_ACTIVE(x,y): append a state with offset x and count y to the list of
states for the current character. The data field is not written. If the list
already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead. Uses the enclosing function's active_count,
wscount, next_active_state and (for debug output) rlevel. Must be followed by
a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
351    
/* ADD_ACTIVE_DATA(x,y,z): as for adding a plain active state, but the data
field is set to z as well. Appends to the list of states for the current
character; if wscount entries are already in use, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. Must be followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
362    
/* ADD_NEW(x,y): append a state with offset x and count y to the list of
states for the following character. The data field is not written. If the
list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead. Uses the enclosing function's new_count,
wscount, next_new_state and (for debug output) rlevel. Must be followed by a
semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
372    
/* ADD_NEW_DATA(x,y,z): as for adding a plain new state, but the data field is
set to z as well. Appends to the list of states for the following character;
if wscount entries are already in use, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. Must be followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
383    
384     /* And now, here is the code */
385    
386     static int
387     internal_dfa_exec(
388     dfa_match_data *md,
389 ph10 756 const pcre_uchar *this_start_code,
390     const pcre_uchar *current_subject,
391 nigel 77 int start_offset,
392     int *offsets,
393     int offsetcount,
394     int *workspace,
395     int wscount,
396 ph10 642 int rlevel)
397 nigel 77 {
398     stateblock *active_states, *new_states, *temp_states;
399     stateblock *next_active_state, *next_new_state;
400    
401 ph10 756 const pcre_uint8 *ctypes, *lcc, *fcc;
402     const pcre_uchar *ptr;
403     const pcre_uchar *end_code, *first_op;
404 nigel 77
405 ph10 642 dfa_recursion_info new_recursive;
406    
407 nigel 77 int active_count, new_count, match_count;
408    
409     /* Some fields in the md block are frequently referenced, so we load them into
410     independent variables in the hope that this will perform better. */
411    
412 ph10 756 const pcre_uchar *start_subject = md->start_subject;
413     const pcre_uchar *end_subject = md->end_subject;
414     const pcre_uchar *start_code = md->start_code;
415 nigel 77
416 nigel 87 #ifdef SUPPORT_UTF8
417 zherczeg 781 BOOL utf = (md->poptions & PCRE_UTF8) != 0;
418 nigel 93 #else
419 zherczeg 781 BOOL utf = FALSE;
420 nigel 87 #endif
421 nigel 77
422     rlevel++;
423     offsetcount &= (-2);
424    
425     wscount -= 2;
426     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
427     (2 * INTS_PER_STATEBLOCK);
428    
429     DPRINTF(("\n%.*s---------------------\n"
430 ph10 642 "%.*sCall to internal_dfa_exec f=%d\n",
431     rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432 nigel 77
433     ctypes = md->tables + ctypes_offset;
434     lcc = md->tables + lcc_offset;
435     fcc = md->tables + fcc_offset;
436    
437     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
438    
439     active_states = (stateblock *)(workspace + 2);
440     next_new_state = new_states = active_states + wscount;
441     new_count = 0;
442    
443 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
444 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445 zherczeg 769 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
446     ? IMM2_SIZE:0);
447 nigel 93
448 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
449     the alternative states onto the list, and find out where the end is. This
450     makes it possible to use this function recursively, when we want to stop at a
451     matching internal ket rather than at the end.
452    
453     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
454     a backward assertion. In that case, we have to find out the maximum amount to
455     move back, and set up each alternative appropriately. */
456    
457 nigel 93 if (*first_op == OP_REVERSE)
458 nigel 77 {
459     int max_back = 0;
460     int gone_back;
461    
462     end_code = this_start_code;
463     do
464     {
465     int back = GET(end_code, 2+LINK_SIZE);
466     if (back > max_back) max_back = back;
467     end_code += GET(end_code, 1);
468     }
469     while (*end_code == OP_ALT);
470    
471     /* If we can't go back the amount required for the longest lookbehind
472     pattern, go back as far as we can; some alternatives may still be viable. */
473    
474     #ifdef SUPPORT_UTF8
475     /* In character mode we have to step back character by character */
476    
477 zherczeg 781 if (utf)
478 nigel 77 {
479     for (gone_back = 0; gone_back < max_back; gone_back++)
480     {
481     if (current_subject <= start_subject) break;
482     current_subject--;
483 zherczeg 782 INTERNALCHAR(current_subject > start_subject, *current_subject, current_subject--);
484 nigel 77 }
485     }
486     else
487     #endif
488    
489     /* In byte-mode we can do this quickly. */
490    
491     {
492     gone_back = (current_subject - max_back < start_subject)?
493 ph10 530 (int)(current_subject - start_subject) : max_back;
494 nigel 77 current_subject -= gone_back;
495     }
496 ph10 461
497 ph10 435 /* Save the earliest consulted character */
498 nigel 77
499 ph10 461 if (current_subject < md->start_used_ptr)
500     md->start_used_ptr = current_subject;
501    
502 nigel 77 /* Now we can process the individual branches. */
503    
504     end_code = this_start_code;
505     do
506     {
507     int back = GET(end_code, 2+LINK_SIZE);
508     if (back <= gone_back)
509     {
510 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
511 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
512     }
513     end_code += GET(end_code, 1);
514     }
515     while (*end_code == OP_ALT);
516     }
517    
518     /* This is the code for a "normal" subpattern (not a backward assertion). The
519     start of a whole pattern is always one of these. If we are at the top level,
520     we may be asked to restart matching from the same point that we reached for a
521     previous partial match. We still have to scan through the top-level branches to
522     find the end state. */
523    
524     else
525     {
526     end_code = this_start_code;
527    
528     /* Restarting */
529    
530     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
531     {
532     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
533     new_count = workspace[1];
534     if (!workspace[0])
535     memcpy(new_states, active_states, new_count * sizeof(stateblock));
536     }
537    
538     /* Not restarting */
539    
540     else
541     {
542 nigel 93 int length = 1 + LINK_SIZE +
543 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
544 zherczeg 769 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
545     ? IMM2_SIZE:0);
546 nigel 77 do
547     {
548 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
549 nigel 77 end_code += GET(end_code, 1);
550 nigel 93 length = 1 + LINK_SIZE;
551 nigel 77 }
552     while (*end_code == OP_ALT);
553     }
554     }
555    
556     workspace[0] = 0; /* Bit indicating which vector is current */
557    
558 zherczeg 769 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
559 nigel 77
560     /* Loop for scanning the subject */
561    
562     ptr = current_subject;
563     for (;;)
564     {
565     int i, j;
566 nigel 91 int clen, dlen;
567     unsigned int c, d;
568 ph10 428 int forced_fail = 0;
569 ph10 462 BOOL could_continue = FALSE;
570 nigel 77
571     /* Make the new state list into the active state list and empty the
572     new state list. */
573    
574     temp_states = active_states;
575     active_states = new_states;
576     new_states = temp_states;
577     active_count = new_count;
578     new_count = 0;
579    
580     workspace[0] ^= 1; /* Remember for the restarting feature */
581     workspace[1] = active_count;
582    
583 ph10 475 #ifdef PCRE_DEBUG
584 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
585 ph10 756 pchars((pcre_uchar *)ptr, strlen((char *)ptr), stdout);
586 nigel 77 printf("\"\n");
587    
588     printf("%.*sActive states: ", rlevel*2-2, SP);
589     for (i = 0; i < active_count; i++)
590     printf("%d/%d ", active_states[i].offset, active_states[i].count);
591     printf("\n");
592     #endif
593    
594     /* Set the pointers for adding new states */
595    
596     next_active_state = active_states + active_count;
597     next_new_state = new_states;
598    
599     /* Load the current character from the subject outside the loop, as many
600     different states may want to look at it, and we assume that at least one
601     will. */
602    
603     if (ptr < end_subject)
604     {
605 nigel 93 clen = 1; /* Number of bytes in the character */
606 nigel 77 #ifdef SUPPORT_UTF8
607 zherczeg 781 if (utf) { GETCHARLEN(c, ptr, clen); } else
608 nigel 77 #endif /* SUPPORT_UTF8 */
609     c = *ptr;
610     }
611     else
612     {
613 nigel 93 clen = 0; /* This indicates the end of the subject */
614     c = NOTACHAR; /* This value should never actually be used */
615 nigel 77 }
616    
617     /* Scan up the active states and act on each one. The result of an action
618     may be to add more states to the currently active list (e.g. on hitting a
619     parenthesis) or it may be to put states on the new list, for considering
620     when we move the character pointer on. */
621    
622     for (i = 0; i < active_count; i++)
623     {
624     stateblock *current_state = active_states + i;
625 ph10 654 BOOL caseless = FALSE;
626 ph10 756 const pcre_uchar *code;
627 nigel 77 int state_offset = current_state->offset;
628 ph10 397 int count, codevalue, rrc;
629 nigel 77
630 ph10 475 #ifdef PCRE_DEBUG
631 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
632 nigel 93 if (clen == 0) printf("EOL\n");
633 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
634     else printf("0x%02x\n", c);
635     #endif
636    
637     /* A negative offset is a special case meaning "hold off going to this
638     (negated) state until the number of characters in the data field have
639     been skipped". */
640    
641     if (state_offset < 0)
642     {
643     if (current_state->data > 0)
644     {
645     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
646     ADD_NEW_DATA(state_offset, current_state->count,
647     current_state->data - 1);
648     continue;
649     }
650     else
651     {
652     current_state->offset = state_offset = -state_offset;
653     }
654     }
655    
656 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
657 ph10 439 See the note at the head of this module about the possibility of improving
658     performance here. */
659 nigel 77
660     for (j = 0; j < i; j++)
661     {
662     if (active_states[j].offset == state_offset &&
663     active_states[j].count == current_state->count)
664     {
665     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
666     goto NEXT_ACTIVE_STATE;
667     }
668     }
669    
670     /* The state offset is the offset to the opcode */
671    
672     code = start_code + state_offset;
673     codevalue = *code;
674    
675 ph10 463 /* If this opcode inspects a character, but we are at the end of the
676     subject, remember the fact for use when testing for a partial match. */
677    
678 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
679 ph10 463 could_continue = TRUE;
680 ph10 462
681 nigel 77 /* If this opcode is followed by an inline character, load it. It is
682     tempting to test for the presence of a subject character here, but that
683     is wrong, because sometimes zero repetitions of the subject are
684     permitted.
685    
686     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
687 ph10 178 argument that is not a data character - but is always one byte long. We
688     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
689     this case. To keep the other cases fast, convert these ones to new opcodes.
690     */
691 nigel 77
692     if (coptable[codevalue] > 0)
693     {
694     dlen = 1;
695     #ifdef SUPPORT_UTF8
696 zherczeg 781 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
697 nigel 77 #endif /* SUPPORT_UTF8 */
698     d = code[coptable[codevalue]];
699     if (codevalue >= OP_TYPESTAR)
700     {
701 nigel 93 switch(d)
702     {
703     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
704     case OP_NOTPROP:
705     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
706     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
707     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
708 ph10 178 case OP_NOT_HSPACE:
709 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
710 ph10 178 case OP_NOT_VSPACE:
711 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
712 nigel 93 default: break;
713     }
714 nigel 77 }
715     }
716     else
717     {
718     dlen = 0; /* Not strictly necessary, but compilers moan */
719 nigel 93 d = NOTACHAR; /* if these variables are not set. */
720 nigel 77 }
721    
722    
723     /* Now process the individual opcodes */
724    
725     switch (codevalue)
726     {
727 ph10 498 /* ========================================================================== */
728     /* These cases are never obeyed. This is a fudge that causes a compile-
729     time error if the vectors coptable or poptable, which are indexed by
730     opcode, are not the correct length. It seems to be the only way to do
731     such a check at compile time, as the sizeof() operator does not work
732     in the C preprocessor. */
733 ph10 507
734 ph10 498 case OP_TABLE_LENGTH:
735 ph10 507 case OP_TABLE_LENGTH +
736 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
737     (sizeof(poptable) == OP_TABLE_LENGTH)):
738 ph10 507 break;
739 nigel 77
740     /* ========================================================================== */
741     /* Reached a closing bracket. If not at the end of the pattern, carry
742 ph10 654 on with the next opcode. For repeating opcodes, also add the repeat
743     state. Note that KETRPOS will always be encountered at the end of the
744     subpattern, because the possessive subpattern repeats are always handled
745 ph10 604 using recursive calls. Thus, it never adds any new states.
746 ph10 654
747 ph10 604 At the end of the (sub)pattern, unless we have an empty string and
748 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
749 ph10 442 start of the subject, save the match data, shifting up all previous
750 nigel 77 matches so we always have the longest first. */
751    
752     case OP_KET:
753     case OP_KETRMIN:
754     case OP_KETRMAX:
755 ph10 654 case OP_KETRPOS:
756 nigel 77 if (code != end_code)
757     {
758     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
759     if (codevalue != OP_KET)
760     {
761     ADD_ACTIVE(state_offset - GET(code, 1), 0);
762     }
763     }
764 ph10 461 else
765 nigel 77 {
766 ph10 461 if (ptr > current_subject ||
767 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
768     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
769     current_subject > start_subject + md->start_offset)))
770 nigel 77 {
771 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
772 ph10 680 else if (match_count > 0 && ++match_count * 2 > offsetcount)
773 ph10 428 match_count = 0;
774     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
775     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
776     if (offsetcount >= 2)
777     {
778 ph10 530 offsets[0] = (int)(current_subject - start_subject);
779     offsets[1] = (int)(ptr - start_subject);
780 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
781     offsets[1] - offsets[0], current_subject));
782     }
783     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
784     {
785     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
786     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
787     match_count, rlevel*2-2, SP));
788     return match_count;
789     }
790 ph10 461 }
791 nigel 77 }
792     break;
793    
794     /* ========================================================================== */
795     /* These opcodes add to the current list of states without looking
796     at the current character. */
797    
798     /*-----------------------------------------------------------------*/
799     case OP_ALT:
800     do { code += GET(code, 1); } while (*code == OP_ALT);
801 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
802 nigel 77 break;
803    
804     /*-----------------------------------------------------------------*/
805     case OP_BRA:
806 nigel 93 case OP_SBRA:
807 nigel 77 do
808     {
809 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
810 nigel 77 code += GET(code, 1);
811     }
812     while (*code == OP_ALT);
813     break;
814    
815     /*-----------------------------------------------------------------*/
816 nigel 93 case OP_CBRA:
817     case OP_SCBRA:
818 zherczeg 769 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
819 nigel 93 code += GET(code, 1);
820     while (*code == OP_ALT)
821     {
822 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
823 nigel 93 code += GET(code, 1);
824     }
825     break;
826    
827     /*-----------------------------------------------------------------*/
828 nigel 77 case OP_BRAZERO:
829     case OP_BRAMINZERO:
830     ADD_ACTIVE(state_offset + 1, 0);
831     code += 1 + GET(code, 2);
832     while (*code == OP_ALT) code += GET(code, 1);
833 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
834 nigel 77 break;
835    
836     /*-----------------------------------------------------------------*/
837 ph10 335 case OP_SKIPZERO:
838     code += 1 + GET(code, 2);
839     while (*code == OP_ALT) code += GET(code, 1);
840 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
841 ph10 335 break;
842    
843     /*-----------------------------------------------------------------*/
844 nigel 77 case OP_CIRC:
845 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
846     { ADD_ACTIVE(state_offset + 1, 0); }
847     break;
848    
849     /*-----------------------------------------------------------------*/
850     case OP_CIRCM:
851 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
852 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
853 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
854     break;
855    
856     /*-----------------------------------------------------------------*/
857     case OP_EOD:
858 ph10 579 if (ptr >= end_subject)
859     {
860 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
861     could_continue = TRUE;
862     else { ADD_ACTIVE(state_offset + 1, 0); }
863     }
864 nigel 77 break;
865    
866     /*-----------------------------------------------------------------*/
867     case OP_SOD:
868     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
869     break;
870    
871     /*-----------------------------------------------------------------*/
872     case OP_SOM:
873     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
874     break;
875    
876    
877     /* ========================================================================== */
878     /* These opcodes inspect the next subject character, and sometimes
879     the previous one as well, but do not have an argument. The variable
880     clen contains the length of the current character and is zero if we are
881     at the end of the subject. */
882    
883     /*-----------------------------------------------------------------*/
884     case OP_ANY:
885 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
886 nigel 77 { ADD_NEW(state_offset + 1, 0); }
887     break;
888    
889     /*-----------------------------------------------------------------*/
890 ph10 341 case OP_ALLANY:
891     if (clen > 0)
892     { ADD_NEW(state_offset + 1, 0); }
893     break;
894    
895     /*-----------------------------------------------------------------*/
896 nigel 77 case OP_EODN:
897 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
898     could_continue = TRUE;
899     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
900 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
901     break;
902    
903     /*-----------------------------------------------------------------*/
904     case OP_DOLL:
905     if ((md->moptions & PCRE_NOTEOL) == 0)
906     {
907 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
908     could_continue = TRUE;
909     else if (clen == 0 ||
910 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
911 ph10 602 (ptr == end_subject - md->nllen)
912 nigel 91 ))
913 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
914     }
915 ph10 602 break;
916    
917     /*-----------------------------------------------------------------*/
918     case OP_DOLLM:
919     if ((md->moptions & PCRE_NOTEOL) == 0)
920     {
921     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
922     could_continue = TRUE;
923     else if (clen == 0 ||
924     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
925     { ADD_ACTIVE(state_offset + 1, 0); }
926     }
927     else if (IS_NEWLINE(ptr))
928 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
929     break;
930    
931     /*-----------------------------------------------------------------*/
932    
933     case OP_DIGIT:
934     case OP_WHITESPACE:
935     case OP_WORDCHAR:
936     if (clen > 0 && c < 256 &&
937     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
938     { ADD_NEW(state_offset + 1, 0); }
939     break;
940    
941     /*-----------------------------------------------------------------*/
942     case OP_NOT_DIGIT:
943     case OP_NOT_WHITESPACE:
944     case OP_NOT_WORDCHAR:
945     if (clen > 0 && (c >= 256 ||
946     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
947     { ADD_NEW(state_offset + 1, 0); }
948     break;
949    
950     /*-----------------------------------------------------------------*/
951     case OP_WORD_BOUNDARY:
952     case OP_NOT_WORD_BOUNDARY:
953     {
954     int left_word, right_word;
955    
956     if (ptr > start_subject)
957     {
958 ph10 756 const pcre_uchar *temp = ptr - 1;
959 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
960 nigel 77 #ifdef SUPPORT_UTF8
961 zherczeg 781 if (utf) BACKCHAR(temp);
962 nigel 77 #endif
963     GETCHARTEST(d, temp);
964 ph10 535 #ifdef SUPPORT_UCP
965 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
966     {
967     if (d == '_') left_word = TRUE; else
968 ph10 535 {
969 ph10 518 int cat = UCD_CATEGORY(d);
970     left_word = (cat == ucp_L || cat == ucp_N);
971 ph10 535 }
972     }
973     else
974     #endif
975 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
976     }
977 ph10 518 else left_word = FALSE;
978 nigel 77
979 ph10 461 if (clen > 0)
980 ph10 535 {
981     #ifdef SUPPORT_UCP
982 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
983     {
984     if (c == '_') right_word = TRUE; else
985 ph10 535 {
986 ph10 518 int cat = UCD_CATEGORY(c);
987     right_word = (cat == ucp_L || cat == ucp_N);
988 ph10 535 }
989     }
990     else
991     #endif
992 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
993 ph10 535 }
994 ph10 518 else right_word = FALSE;
995 nigel 77
996     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
997     { ADD_ACTIVE(state_offset + 1, 0); }
998     }
999     break;
1000    
1001    
1002     /*-----------------------------------------------------------------*/
1003     /* Check the next character by Unicode property. We will get here only
1004     if the support is in the binary; otherwise a compile-time error occurs.
1005     */
1006    
1007 ph10 151 #ifdef SUPPORT_UCP
1008 nigel 77 case OP_PROP:
1009     case OP_NOTPROP:
1010     if (clen > 0)
1011     {
1012 nigel 87 BOOL OK;
1013 ph10 349 const ucd_record * prop = GET_UCD(c);
1014 nigel 87 switch(code[1])
1015 nigel 77 {
1016 nigel 87 case PT_ANY:
1017     OK = TRUE;
1018     break;
1019    
1020     case PT_LAMP:
1021 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1022 ph10 517 prop->chartype == ucp_Lt;
1023 nigel 87 break;
1024    
1025     case PT_GC:
1026 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1027 nigel 87 break;
1028    
1029     case PT_PC:
1030 ph10 349 OK = prop->chartype == code[2];
1031 nigel 87 break;
1032    
1033     case PT_SC:
1034 ph10 349 OK = prop->script == code[2];
1035 nigel 87 break;
1036 ph10 535
1037 ph10 517 /* These are specials for combination cases. */
1038 ph10 535
1039 ph10 517 case PT_ALNUM:
1040 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1041     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1042 ph10 535 break;
1043    
1044 ph10 517 case PT_SPACE: /* Perl space */
1045 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1046 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1047 ph10 535 break;
1048    
1049 ph10 517 case PT_PXSPACE: /* POSIX space */
1050 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1051 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1052     c == CHAR_FF || c == CHAR_CR;
1053 ph10 535 break;
1054    
1055 ph10 517 case PT_WORD:
1056 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1057     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1058 ph10 517 c == CHAR_UNDERSCORE;
1059 ph10 535 break;
1060 nigel 87
1061     /* Should never occur, but keep compilers from grumbling. */
1062    
1063     default:
1064     OK = codevalue != OP_PROP;
1065     break;
1066 nigel 77 }
1067 nigel 87
1068     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1069 nigel 77 }
1070     break;
1071     #endif
1072    
1073    
1074    
1075     /* ========================================================================== */
1076     /* These opcodes likewise inspect the subject character, but have an
1077     argument that is not a data character. It is one of these opcodes:
1078 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1079     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1080 nigel 77
1081     case OP_TYPEPLUS:
1082     case OP_TYPEMINPLUS:
1083 nigel 93 case OP_TYPEPOSPLUS:
1084 nigel 77 count = current_state->count; /* Already matched */
1085     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1086     if (clen > 0)
1087     {
1088     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1089     (c < 256 &&
1090 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1091 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1092     {
1093 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1094     {
1095     active_count--; /* Remove non-match possibility */
1096     next_active_state--;
1097     }
1098 nigel 77 count++;
1099     ADD_NEW(state_offset, count);
1100     }
1101     }
1102     break;
1103    
1104     /*-----------------------------------------------------------------*/
1105     case OP_TYPEQUERY:
1106     case OP_TYPEMINQUERY:
1107 nigel 93 case OP_TYPEPOSQUERY:
1108 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1109     if (clen > 0)
1110     {
1111     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1112     (c < 256 &&
1113 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1114 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1115     {
1116 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1117     {
1118     active_count--; /* Remove non-match possibility */
1119     next_active_state--;
1120     }
1121 nigel 77 ADD_NEW(state_offset + 2, 0);
1122     }
1123     }
1124     break;
1125    
1126     /*-----------------------------------------------------------------*/
1127     case OP_TYPESTAR:
1128     case OP_TYPEMINSTAR:
1129 nigel 93 case OP_TYPEPOSSTAR:
1130 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1131     if (clen > 0)
1132     {
1133     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1134     (c < 256 &&
1135 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1136 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1137     {
1138 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1139     {
1140     active_count--; /* Remove non-match possibility */
1141     next_active_state--;
1142     }
1143 nigel 77 ADD_NEW(state_offset, 0);
1144     }
1145     }
1146     break;
1147    
1148     /*-----------------------------------------------------------------*/
1149     case OP_TYPEEXACT:
1150 nigel 93 count = current_state->count; /* Number already matched */
1151     if (clen > 0)
1152     {
1153     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1154     (c < 256 &&
1155 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1156 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1157     {
1158     if (++count >= GET2(code, 1))
1159 zherczeg 769 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1160 nigel 93 else
1161     { ADD_NEW(state_offset, count); }
1162     }
1163     }
1164     break;
1165    
1166     /*-----------------------------------------------------------------*/
1167 nigel 77 case OP_TYPEUPTO:
1168     case OP_TYPEMINUPTO:
1169 nigel 93 case OP_TYPEPOSUPTO:
1170 zherczeg 769 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1171 nigel 77 count = current_state->count; /* Number already matched */
1172     if (clen > 0)
1173     {
1174     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1175     (c < 256 &&
1176 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1177 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1178     {
1179 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1180     {
1181     active_count--; /* Remove non-match possibility */
1182     next_active_state--;
1183     }
1184 nigel 77 if (++count >= GET2(code, 1))
1185 zherczeg 769 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1186 nigel 77 else
1187     { ADD_NEW(state_offset, count); }
1188     }
1189     }
1190     break;
1191    
1192     /* ========================================================================== */
1193     /* These are virtual opcodes that are used when something like
1194 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_HSPACE, or OP_VSPACE as its
1195     argument. It keeps the code above fast for the other cases. The argument
1196     is in the d variable. */
1197 nigel 77
1198 ph10 151 #ifdef SUPPORT_UCP
1199 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1200     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1201 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1202 nigel 77 count = current_state->count; /* Already matched */
1203 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1204 nigel 77 if (clen > 0)
1205     {
1206 nigel 87 BOOL OK;
1207 ph10 349 const ucd_record * prop = GET_UCD(c);
1208 nigel 87 switch(code[2])
1209     {
1210     case PT_ANY:
1211     OK = TRUE;
1212     break;
1213    
1214     case PT_LAMP:
1215 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1216 ph10 517 prop->chartype == ucp_Lt;
1217 nigel 87 break;
1218    
1219     case PT_GC:
1220 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1221 nigel 87 break;
1222    
1223     case PT_PC:
1224 ph10 349 OK = prop->chartype == code[3];
1225 nigel 87 break;
1226    
1227     case PT_SC:
1228 ph10 349 OK = prop->script == code[3];
1229 nigel 87 break;
1230    
1231 ph10 517 /* These are specials for combination cases. */
1232 ph10 535
1233 ph10 517 case PT_ALNUM:
1234 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1235     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1236 ph10 535 break;
1237    
1238 ph10 517 case PT_SPACE: /* Perl space */
1239 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1240 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1241 ph10 535 break;
1242    
1243 ph10 517 case PT_PXSPACE: /* POSIX space */
1244 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1245 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1246     c == CHAR_FF || c == CHAR_CR;
1247 ph10 535 break;
1248    
1249 ph10 517 case PT_WORD:
1250 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1251     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1252 ph10 517 c == CHAR_UNDERSCORE;
1253 ph10 535 break;
1254 ph10 517
1255 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1256    
1257     default:
1258     OK = codevalue != OP_PROP;
1259     break;
1260     }
1261    
1262 nigel 93 if (OK == (d == OP_PROP))
1263     {
1264     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1265     {
1266     active_count--; /* Remove non-match possibility */
1267     next_active_state--;
1268     }
1269     count++;
1270     ADD_NEW(state_offset, count);
1271     }
1272 nigel 77 }
1273     break;
1274    
1275     /*-----------------------------------------------------------------*/
1276     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1277     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1278 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1279 nigel 77 count = current_state->count; /* Already matched */
1280     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1281 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1282 nigel 77 {
1283 ph10 756 const pcre_uchar *nptr = ptr + clen;
1284 nigel 77 int ncount = 0;
1285 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1286     {
1287     active_count--; /* Remove non-match possibility */
1288     next_active_state--;
1289     }
1290 nigel 77 while (nptr < end_subject)
1291     {
1292     int nd;
1293     int ndlen = 1;
1294     GETCHARLEN(nd, nptr, ndlen);
1295 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1296 nigel 77 ncount++;
1297     nptr += ndlen;
1298     }
1299     count++;
1300     ADD_NEW_DATA(-state_offset, count, ncount);
1301     }
1302     break;
1303 ph10 151 #endif
1304 nigel 77
1305     /*-----------------------------------------------------------------*/
1306 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1307     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1308     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1309     count = current_state->count; /* Already matched */
1310     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1311     if (clen > 0)
1312     {
1313     int ncount = 0;
1314     switch (c)
1315     {
1316     case 0x000b:
1317     case 0x000c:
1318     case 0x0085:
1319     case 0x2028:
1320     case 0x2029:
1321 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1322     goto ANYNL01;
1323    
1324     case 0x000d:
1325     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1326     /* Fall through */
1327    
1328     ANYNL01:
1329     case 0x000a:
1330 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1331     {
1332     active_count--; /* Remove non-match possibility */
1333     next_active_state--;
1334     }
1335     count++;
1336     ADD_NEW_DATA(-state_offset, count, ncount);
1337     break;
1338 ph10 231
1339 nigel 93 default:
1340     break;
1341     }
1342     }
1343     break;
1344    
1345     /*-----------------------------------------------------------------*/
1346 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1347     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1348     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1349     count = current_state->count; /* Already matched */
1350     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1351     if (clen > 0)
1352     {
1353 ph10 182 BOOL OK;
1354 ph10 178 switch (c)
1355     {
1356     case 0x000a:
1357     case 0x000b:
1358     case 0x000c:
1359     case 0x000d:
1360     case 0x0085:
1361     case 0x2028:
1362     case 0x2029:
1363     OK = TRUE;
1364 ph10 182 break;
1365 ph10 178
1366     default:
1367     OK = FALSE;
1368 ph10 182 break;
1369 ph10 178 }
1370    
1371     if (OK == (d == OP_VSPACE))
1372 ph10 182 {
1373 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1374     {
1375     active_count--; /* Remove non-match possibility */
1376     next_active_state--;
1377     }
1378     count++;
1379     ADD_NEW_DATA(-state_offset, count, 0);
1380     }
1381     }
1382     break;
1383    
1384     /*-----------------------------------------------------------------*/
1385     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1386     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1387     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1388     count = current_state->count; /* Already matched */
1389     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1390     if (clen > 0)
1391     {
1392 ph10 182 BOOL OK;
1393 ph10 178 switch (c)
1394     {
1395     case 0x09: /* HT */
1396     case 0x20: /* SPACE */
1397     case 0xa0: /* NBSP */
1398     case 0x1680: /* OGHAM SPACE MARK */
1399     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1400     case 0x2000: /* EN QUAD */
1401     case 0x2001: /* EM QUAD */
1402     case 0x2002: /* EN SPACE */
1403     case 0x2003: /* EM SPACE */
1404     case 0x2004: /* THREE-PER-EM SPACE */
1405     case 0x2005: /* FOUR-PER-EM SPACE */
1406     case 0x2006: /* SIX-PER-EM SPACE */
1407     case 0x2007: /* FIGURE SPACE */
1408     case 0x2008: /* PUNCTUATION SPACE */
1409     case 0x2009: /* THIN SPACE */
1410     case 0x200A: /* HAIR SPACE */
1411     case 0x202f: /* NARROW NO-BREAK SPACE */
1412     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1413     case 0x3000: /* IDEOGRAPHIC SPACE */
1414     OK = TRUE;
1415     break;
1416 ph10 182
1417 ph10 178 default:
1418     OK = FALSE;
1419     break;
1420     }
1421 ph10 182
1422 ph10 178 if (OK == (d == OP_HSPACE))
1423 ph10 182 {
1424 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1425     {
1426     active_count--; /* Remove non-match possibility */
1427     next_active_state--;
1428     }
1429     count++;
1430     ADD_NEW_DATA(-state_offset, count, 0);
1431     }
1432     }
1433     break;
1434    
1435     /*-----------------------------------------------------------------*/
1436 ph10 151 #ifdef SUPPORT_UCP
1437 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1438     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1439 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1440 nigel 87 count = 4;
1441 nigel 77 goto QS1;
1442    
1443     case OP_PROP_EXTRA + OP_TYPESTAR:
1444     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1445 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1446 nigel 77 count = 0;
1447    
1448     QS1:
1449    
1450 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1451 nigel 77 if (clen > 0)
1452     {
1453 nigel 87 BOOL OK;
1454 ph10 349 const ucd_record * prop = GET_UCD(c);
1455 nigel 87 switch(code[2])
1456     {
1457     case PT_ANY:
1458     OK = TRUE;
1459     break;
1460    
1461     case PT_LAMP:
1462 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1463 ph10 517 prop->chartype == ucp_Lt;
1464 nigel 87 break;
1465    
1466     case PT_GC:
1467 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1468 nigel 87 break;
1469    
1470     case PT_PC:
1471 ph10 349 OK = prop->chartype == code[3];
1472 nigel 87 break;
1473    
1474     case PT_SC:
1475 ph10 349 OK = prop->script == code[3];
1476 nigel 87 break;
1477 ph10 535
1478 ph10 517 /* These are specials for combination cases. */
1479 ph10 535
1480 ph10 517 case PT_ALNUM:
1481 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1482     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1483 ph10 535 break;
1484    
1485 ph10 517 case PT_SPACE: /* Perl space */
1486 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1487 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1488 ph10 535 break;
1489    
1490 ph10 517 case PT_PXSPACE: /* POSIX space */
1491 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1492 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1493     c == CHAR_FF || c == CHAR_CR;
1494 ph10 535 break;
1495    
1496 ph10 517 case PT_WORD:
1497 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1498     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1499 ph10 517 c == CHAR_UNDERSCORE;
1500 ph10 535 break;
1501 nigel 87
1502     /* Should never occur, but keep compilers from grumbling. */
1503    
1504     default:
1505     OK = codevalue != OP_PROP;
1506     break;
1507     }
1508    
1509 nigel 93 if (OK == (d == OP_PROP))
1510     {
1511     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1512     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1513     {
1514     active_count--; /* Remove non-match possibility */
1515     next_active_state--;
1516     }
1517     ADD_NEW(state_offset + count, 0);
1518     }
1519 nigel 77 }
1520     break;
1521    
1522     /*-----------------------------------------------------------------*/
1523     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1524     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1525 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1526 nigel 77 count = 2;
1527     goto QS2;
1528    
1529     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1530     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1531 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1532 nigel 77 count = 0;
1533    
1534     QS2:
1535    
1536     ADD_ACTIVE(state_offset + 2, 0);
1537 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1538 nigel 77 {
1539 ph10 756 const pcre_uchar *nptr = ptr + clen;
1540 nigel 77 int ncount = 0;
1541 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1542     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1543     {
1544     active_count--; /* Remove non-match possibility */
1545     next_active_state--;
1546     }
1547 nigel 77 while (nptr < end_subject)
1548     {
1549     int nd;
1550     int ndlen = 1;
1551     GETCHARLEN(nd, nptr, ndlen);
1552 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1553 nigel 77 ncount++;
1554     nptr += ndlen;
1555     }
1556     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1557     }
1558     break;
1559 ph10 151 #endif
1560 nigel 77
1561     /*-----------------------------------------------------------------*/
1562 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1563     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1564     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1565     count = 2;
1566     goto QS3;
1567    
1568     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1569     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1570     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1571     count = 0;
1572    
1573     QS3:
1574     ADD_ACTIVE(state_offset + 2, 0);
1575     if (clen > 0)
1576     {
1577     int ncount = 0;
1578     switch (c)
1579     {
1580     case 0x000b:
1581     case 0x000c:
1582     case 0x0085:
1583     case 0x2028:
1584     case 0x2029:
1585 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1586     goto ANYNL02;
1587    
1588     case 0x000d:
1589     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1590     /* Fall through */
1591    
1592     ANYNL02:
1593     case 0x000a:
1594 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1595     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1596     {
1597     active_count--; /* Remove non-match possibility */
1598     next_active_state--;
1599     }
1600     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1601     break;
1602 ph10 231
1603 nigel 93 default:
1604     break;
1605     }
1606     }
1607     break;
1608    
1609     /*-----------------------------------------------------------------*/
1610 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1611     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1612     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1613     count = 2;
1614     goto QS4;
1615    
1616     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1617     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1618     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1619     count = 0;
1620    
1621     QS4:
1622     ADD_ACTIVE(state_offset + 2, 0);
1623     if (clen > 0)
1624     {
1625 ph10 182 BOOL OK;
1626 ph10 178 switch (c)
1627     {
1628     case 0x000a:
1629     case 0x000b:
1630     case 0x000c:
1631     case 0x000d:
1632     case 0x0085:
1633     case 0x2028:
1634     case 0x2029:
1635     OK = TRUE;
1636     break;
1637 ph10 182
1638 ph10 178 default:
1639     OK = FALSE;
1640     break;
1641     }
1642     if (OK == (d == OP_VSPACE))
1643 ph10 182 {
1644 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1645     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1646     {
1647     active_count--; /* Remove non-match possibility */
1648     next_active_state--;
1649     }
1650     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1651     }
1652     }
1653     break;
1654    
1655     /*-----------------------------------------------------------------*/
1656     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1657     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1658     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1659     count = 2;
1660     goto QS5;
1661    
1662     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1663     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1664     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1665     count = 0;
1666    
1667     QS5:
1668     ADD_ACTIVE(state_offset + 2, 0);
1669     if (clen > 0)
1670     {
1671 ph10 182 BOOL OK;
1672 ph10 178 switch (c)
1673     {
1674     case 0x09: /* HT */
1675     case 0x20: /* SPACE */
1676     case 0xa0: /* NBSP */
1677     case 0x1680: /* OGHAM SPACE MARK */
1678     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1679     case 0x2000: /* EN QUAD */
1680     case 0x2001: /* EM QUAD */
1681     case 0x2002: /* EN SPACE */
1682     case 0x2003: /* EM SPACE */
1683     case 0x2004: /* THREE-PER-EM SPACE */
1684     case 0x2005: /* FOUR-PER-EM SPACE */
1685     case 0x2006: /* SIX-PER-EM SPACE */
1686     case 0x2007: /* FIGURE SPACE */
1687     case 0x2008: /* PUNCTUATION SPACE */
1688     case 0x2009: /* THIN SPACE */
1689     case 0x200A: /* HAIR SPACE */
1690     case 0x202f: /* NARROW NO-BREAK SPACE */
1691     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1692     case 0x3000: /* IDEOGRAPHIC SPACE */
1693     OK = TRUE;
1694     break;
1695 ph10 182
1696 ph10 178 default:
1697     OK = FALSE;
1698     break;
1699     }
1700 ph10 182
1701 ph10 178 if (OK == (d == OP_HSPACE))
1702 ph10 182 {
1703 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1704     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1705     {
1706     active_count--; /* Remove non-match possibility */
1707     next_active_state--;
1708     }
1709     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1710     }
1711     }
1712     break;
1713    
1714     /*-----------------------------------------------------------------*/
1715 ph10 151 #ifdef SUPPORT_UCP
1716 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1717     case OP_PROP_EXTRA + OP_TYPEUPTO:
1718     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1719 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1720 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1721 zherczeg 769 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1722 nigel 77 count = current_state->count; /* Number already matched */
1723     if (clen > 0)
1724     {
1725 nigel 87 BOOL OK;
1726 ph10 349 const ucd_record * prop = GET_UCD(c);
1727 zherczeg 769 switch(code[1 + IMM2_SIZE + 1])
1728 nigel 77 {
1729 nigel 87 case PT_ANY:
1730     OK = TRUE;
1731     break;
1732    
1733     case PT_LAMP:
1734 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1735 ph10 517 prop->chartype == ucp_Lt;
1736 nigel 87 break;
1737    
1738     case PT_GC:
1739 zherczeg 769 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1740 nigel 87 break;
1741    
1742     case PT_PC:
1743 zherczeg 769 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1744 nigel 87 break;
1745    
1746     case PT_SC:
1747 zherczeg 769 OK = prop->script == code[1 + IMM2_SIZE + 2];
1748 nigel 87 break;
1749 ph10 535
1750 ph10 517 /* These are specials for combination cases. */
1751 ph10 535
1752 ph10 517 case PT_ALNUM:
1753 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1754     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1755 ph10 535 break;
1756    
1757 ph10 517 case PT_SPACE: /* Perl space */
1758 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1759 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1760 ph10 535 break;
1761    
1762 ph10 517 case PT_PXSPACE: /* POSIX space */
1763 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1764 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1765     c == CHAR_FF || c == CHAR_CR;
1766 ph10 535 break;
1767    
1768 ph10 517 case PT_WORD:
1769 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1770     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1771 ph10 517 c == CHAR_UNDERSCORE;
1772 ph10 535 break;
1773 nigel 87
1774     /* Should never occur, but keep compilers from grumbling. */
1775    
1776     default:
1777     OK = codevalue != OP_PROP;
1778     break;
1779     }
1780    
1781     if (OK == (d == OP_PROP))
1782     {
1783 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1784     {
1785     active_count--; /* Remove non-match possibility */
1786     next_active_state--;
1787     }
1788 nigel 77 if (++count >= GET2(code, 1))
1789 zherczeg 769 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1790 nigel 77 else
1791     { ADD_NEW(state_offset, count); }
1792     }
1793     }
1794     break;
1795    
1796     /*-----------------------------------------------------------------*/
1797     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1798     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1799     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1800 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1801 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1802 zherczeg 769 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1803 nigel 77 count = current_state->count; /* Number already matched */
1804 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1805 nigel 77 {
1806 ph10 756 const pcre_uchar *nptr = ptr + clen;
1807 nigel 77 int ncount = 0;
1808 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1809     {
1810     active_count--; /* Remove non-match possibility */
1811     next_active_state--;
1812     }
1813 nigel 77 while (nptr < end_subject)
1814     {
1815     int nd;
1816     int ndlen = 1;
1817     GETCHARLEN(nd, nptr, ndlen);
1818 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1819 nigel 77 ncount++;
1820     nptr += ndlen;
1821     }
1822     if (++count >= GET2(code, 1))
1823 zherczeg 769 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1824 nigel 77 else
1825     { ADD_NEW_DATA(-state_offset, count, ncount); }
1826     }
1827     break;
1828 ph10 151 #endif
1829 nigel 77
1830 nigel 93 /*-----------------------------------------------------------------*/
1831     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1832     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1833     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1834     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1835     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1836 zherczeg 769 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1837 nigel 93 count = current_state->count; /* Number already matched */
1838     if (clen > 0)
1839     {
1840     int ncount = 0;
1841     switch (c)
1842     {
1843     case 0x000b:
1844     case 0x000c:
1845     case 0x0085:
1846     case 0x2028:
1847     case 0x2029:
1848 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1849     goto ANYNL03;
1850    
1851     case 0x000d:
1852     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1853     /* Fall through */
1854    
1855     ANYNL03:
1856     case 0x000a:
1857 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1858     {
1859     active_count--; /* Remove non-match possibility */
1860     next_active_state--;
1861     }
1862     if (++count >= GET2(code, 1))
1863 zherczeg 769 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1864 nigel 93 else
1865     { ADD_NEW_DATA(-state_offset, count, ncount); }
1866     break;
1867 ph10 231
1868 nigel 93 default:
1869     break;
1870     }
1871     }
1872     break;
1873    
1874 ph10 178 /*-----------------------------------------------------------------*/
1875     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1876     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1877     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1878     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1879     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1880 zherczeg 769 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1881 ph10 178 count = current_state->count; /* Number already matched */
1882     if (clen > 0)
1883     {
1884 ph10 182 BOOL OK;
1885 ph10 178 switch (c)
1886     {
1887     case 0x000a:
1888     case 0x000b:
1889     case 0x000c:
1890     case 0x000d:
1891     case 0x0085:
1892     case 0x2028:
1893     case 0x2029:
1894     OK = TRUE;
1895     break;
1896 ph10 182
1897 ph10 178 default:
1898     OK = FALSE;
1899     }
1900 ph10 182
1901 ph10 178 if (OK == (d == OP_VSPACE))
1902 ph10 182 {
1903 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1904     {
1905     active_count--; /* Remove non-match possibility */
1906     next_active_state--;
1907     }
1908     if (++count >= GET2(code, 1))
1909 zherczeg 769 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1910 ph10 178 else
1911     { ADD_NEW_DATA(-state_offset, count, 0); }
1912     }
1913     }
1914     break;
1915    
1916     /*-----------------------------------------------------------------*/
1917     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1918     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1919     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1920     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1921     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1922 zherczeg 769 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1923 ph10 178 count = current_state->count; /* Number already matched */
1924     if (clen > 0)
1925     {
1926 ph10 182 BOOL OK;
1927 ph10 178 switch (c)
1928     {
1929     case 0x09: /* HT */
1930     case 0x20: /* SPACE */
1931     case 0xa0: /* NBSP */
1932     case 0x1680: /* OGHAM SPACE MARK */
1933     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1934     case 0x2000: /* EN QUAD */
1935     case 0x2001: /* EM QUAD */
1936     case 0x2002: /* EN SPACE */
1937     case 0x2003: /* EM SPACE */
1938     case 0x2004: /* THREE-PER-EM SPACE */
1939     case 0x2005: /* FOUR-PER-EM SPACE */
1940     case 0x2006: /* SIX-PER-EM SPACE */
1941     case 0x2007: /* FIGURE SPACE */
1942     case 0x2008: /* PUNCTUATION SPACE */
1943     case 0x2009: /* THIN SPACE */
1944     case 0x200A: /* HAIR SPACE */
1945     case 0x202f: /* NARROW NO-BREAK SPACE */
1946     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1947     case 0x3000: /* IDEOGRAPHIC SPACE */
1948     OK = TRUE;
1949     break;
1950 ph10 182
1951 ph10 178 default:
1952     OK = FALSE;
1953     break;
1954     }
1955 ph10 182
1956 ph10 178 if (OK == (d == OP_HSPACE))
1957 ph10 182 {
1958 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1959     {
1960     active_count--; /* Remove non-match possibility */
1961     next_active_state--;
1962     }
1963     if (++count >= GET2(code, 1))
1964 zherczeg 769 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1965 ph10 178 else
1966     { ADD_NEW_DATA(-state_offset, count, 0); }
1967     }
1968     }
1969     break;
1970    
1971 nigel 77 /* ========================================================================== */
1972     /* These opcodes are followed by a character that is usually compared
1973     to the current subject character; it is loaded into d. We still get
1974     here even if there is no subject character, because in some cases zero
1975     repetitions are permitted. */
1976    
1977     /*-----------------------------------------------------------------*/
1978     case OP_CHAR:
1979     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1980     break;
1981    
1982     /*-----------------------------------------------------------------*/
1983 ph10 602 case OP_CHARI:
1984 nigel 77 if (clen == 0) break;
1985    
1986     #ifdef SUPPORT_UTF8
1987 zherczeg 781 if (utf)
1988 nigel 77 {
1989     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1990     {
1991 nigel 93 unsigned int othercase;
1992 nigel 77 if (c < 128) othercase = fcc[c]; else
1993    
1994     /* If we have Unicode property support, we can use it to test the
1995 nigel 87 other case of the character. */
1996 nigel 77
1997     #ifdef SUPPORT_UCP
1998 ph10 349 othercase = UCD_OTHERCASE(c);
1999 nigel 87 #else
2000 nigel 93 othercase = NOTACHAR;
2001 nigel 77 #endif
2002    
2003     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2004     }
2005     }
2006     else
2007     #endif /* SUPPORT_UTF8 */
2008 zherczeg 781 /* Not UTF mode */
2009 nigel 77 {
2010     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2011     }
2012     break;
2013    
2014    
2015     #ifdef SUPPORT_UCP
2016     /*-----------------------------------------------------------------*/
2017     /* This is a tricky one because it can match more than one character.
2018     Find out how many characters to skip, and then set up a negative state
2019     to wait for them to pass before continuing. */
2020    
2021     case OP_EXTUNI:
2022 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2023 nigel 77 {
2024 ph10 756 const pcre_uchar *nptr = ptr + clen;
2025 nigel 77 int ncount = 0;
2026     while (nptr < end_subject)
2027     {
2028     int nclen = 1;
2029     GETCHARLEN(c, nptr, nclen);
2030 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
2031 nigel 77 ncount++;
2032     nptr += nclen;
2033     }
2034     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2035     }
2036     break;
2037     #endif
2038    
2039     /*-----------------------------------------------------------------*/
2040 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
2041     character (when CR is followed by LF). In this case, set up a negative
2042     state to wait for one character to pass before continuing. */
2043    
2044     case OP_ANYNL:
2045     if (clen > 0) switch(c)
2046     {
2047     case 0x000b:
2048     case 0x000c:
2049     case 0x0085:
2050     case 0x2028:
2051     case 0x2029:
2052 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2053    
2054     case 0x000a:
2055 nigel 93 ADD_NEW(state_offset + 1, 0);
2056     break;
2057 ph10 231
2058 nigel 93 case 0x000d:
2059     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2060     {
2061     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2062     }
2063     else
2064     {
2065     ADD_NEW(state_offset + 1, 0);
2066     }
2067     break;
2068     }
2069     break;
2070    
2071     /*-----------------------------------------------------------------*/
2072 ph10 178 case OP_NOT_VSPACE:
2073     if (clen > 0) switch(c)
2074     {
2075     case 0x000a:
2076     case 0x000b:
2077     case 0x000c:
2078     case 0x000d:
2079     case 0x0085:
2080     case 0x2028:
2081     case 0x2029:
2082     break;
2083 ph10 182
2084     default:
2085 ph10 178 ADD_NEW(state_offset + 1, 0);
2086     break;
2087     }
2088     break;
2089    
2090     /*-----------------------------------------------------------------*/
2091     case OP_VSPACE:
2092     if (clen > 0) switch(c)
2093     {
2094     case 0x000a:
2095     case 0x000b:
2096     case 0x000c:
2097     case 0x000d:
2098     case 0x0085:
2099     case 0x2028:
2100     case 0x2029:
2101     ADD_NEW(state_offset + 1, 0);
2102     break;
2103 ph10 182
2104 ph10 178 default: break;
2105     }
2106     break;
2107    
2108     /*-----------------------------------------------------------------*/
2109     case OP_NOT_HSPACE:
2110     if (clen > 0) switch(c)
2111     {
2112     case 0x09: /* HT */
2113     case 0x20: /* SPACE */
2114     case 0xa0: /* NBSP */
2115     case 0x1680: /* OGHAM SPACE MARK */
2116     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2117     case 0x2000: /* EN QUAD */
2118     case 0x2001: /* EM QUAD */
2119     case 0x2002: /* EN SPACE */
2120     case 0x2003: /* EM SPACE */
2121     case 0x2004: /* THREE-PER-EM SPACE */
2122     case 0x2005: /* FOUR-PER-EM SPACE */
2123     case 0x2006: /* SIX-PER-EM SPACE */
2124     case 0x2007: /* FIGURE SPACE */
2125     case 0x2008: /* PUNCTUATION SPACE */
2126     case 0x2009: /* THIN SPACE */
2127     case 0x200A: /* HAIR SPACE */
2128     case 0x202f: /* NARROW NO-BREAK SPACE */
2129     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2130     case 0x3000: /* IDEOGRAPHIC SPACE */
2131     break;
2132 ph10 182
2133     default:
2134 ph10 178 ADD_NEW(state_offset + 1, 0);
2135     break;
2136     }
2137     break;
2138    
2139     /*-----------------------------------------------------------------*/
2140     case OP_HSPACE:
2141     if (clen > 0) switch(c)
2142     {
2143     case 0x09: /* HT */
2144     case 0x20: /* SPACE */
2145     case 0xa0: /* NBSP */
2146     case 0x1680: /* OGHAM SPACE MARK */
2147     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2148     case 0x2000: /* EN QUAD */
2149     case 0x2001: /* EM QUAD */
2150     case 0x2002: /* EN SPACE */
2151     case 0x2003: /* EM SPACE */
2152     case 0x2004: /* THREE-PER-EM SPACE */
2153     case 0x2005: /* FOUR-PER-EM SPACE */
2154     case 0x2006: /* SIX-PER-EM SPACE */
2155     case 0x2007: /* FIGURE SPACE */
2156     case 0x2008: /* PUNCTUATION SPACE */
2157     case 0x2009: /* THIN SPACE */
2158     case 0x200A: /* HAIR SPACE */
2159     case 0x202f: /* NARROW NO-BREAK SPACE */
2160     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2161     case 0x3000: /* IDEOGRAPHIC SPACE */
2162     ADD_NEW(state_offset + 1, 0);
2163     break;
2164     }
2165     break;
2166    
2167     /*-----------------------------------------------------------------*/
2168 ph10 602 /* Match a negated single character casefully. This is only used for
2169     one-byte characters, that is, we know that d < 256. The character we are
2170 nigel 77 checking (c) can be multibyte. */
2171    
2172     case OP_NOT:
2173 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2174 nigel 77 break;
2175    
2176     /*-----------------------------------------------------------------*/
2177 ph10 602 /* Match a negated single character caselessly. This is only used for
2178     one-byte characters, that is, we know that d < 256. The character we are
2179     checking (c) can be multibyte. */
2180    
2181     case OP_NOTI:
2182 ph10 654 if (clen > 0 && c != d && c != fcc[d])
2183 ph10 602 { ADD_NEW(state_offset + dlen + 1, 0); }
2184     break;
2185    
2186     /*-----------------------------------------------------------------*/
2187     case OP_PLUSI:
2188     case OP_MINPLUSI:
2189     case OP_POSPLUSI:
2190     case OP_NOTPLUSI:
2191     case OP_NOTMINPLUSI:
2192     case OP_NOTPOSPLUSI:
2193     caseless = TRUE;
2194     codevalue -= OP_STARI - OP_STAR;
2195 ph10 654
2196 ph10 602 /* Fall through */
2197 nigel 77 case OP_PLUS:
2198     case OP_MINPLUS:
2199 nigel 93 case OP_POSPLUS:
2200 nigel 77 case OP_NOTPLUS:
2201     case OP_NOTMINPLUS:
2202 nigel 93 case OP_NOTPOSPLUS:
2203 nigel 77 count = current_state->count; /* Already matched */
2204     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2205     if (clen > 0)
2206     {
2207 nigel 93 unsigned int otherd = NOTACHAR;
2208 ph10 602 if (caseless)
2209 nigel 77 {
2210     #ifdef SUPPORT_UTF8
2211 zherczeg 781 if (utf && d >= 128)
2212 nigel 77 {
2213     #ifdef SUPPORT_UCP
2214 ph10 349 otherd = UCD_OTHERCASE(d);
2215 nigel 77 #endif /* SUPPORT_UCP */
2216     }
2217     else
2218     #endif /* SUPPORT_UTF8 */
2219     otherd = fcc[d];
2220     }
2221     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2222 nigel 93 {
2223     if (count > 0 &&
2224     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2225     {
2226     active_count--; /* Remove non-match possibility */
2227     next_active_state--;
2228     }
2229     count++;
2230     ADD_NEW(state_offset, count);
2231     }
2232 nigel 77 }
2233     break;
2234    
2235     /*-----------------------------------------------------------------*/
2236 ph10 602 case OP_QUERYI:
2237     case OP_MINQUERYI:
2238     case OP_POSQUERYI:
2239     case OP_NOTQUERYI:
2240     case OP_NOTMINQUERYI:
2241     case OP_NOTPOSQUERYI:
2242     caseless = TRUE;
2243     codevalue -= OP_STARI - OP_STAR;
2244     /* Fall through */
2245 nigel 77 case OP_QUERY:
2246     case OP_MINQUERY:
2247 nigel 93 case OP_POSQUERY:
2248 nigel 77 case OP_NOTQUERY:
2249     case OP_NOTMINQUERY:
2250 nigel 93 case OP_NOTPOSQUERY:
2251 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2252     if (clen > 0)
2253     {
2254 nigel 93 unsigned int otherd = NOTACHAR;
2255 ph10 602 if (caseless)
2256 nigel 77 {
2257     #ifdef SUPPORT_UTF8
2258 zherczeg 781 if (utf && d >= 128)
2259 nigel 77 {
2260     #ifdef SUPPORT_UCP
2261 ph10 349 otherd = UCD_OTHERCASE(d);
2262 nigel 77 #endif /* SUPPORT_UCP */
2263     }
2264     else
2265     #endif /* SUPPORT_UTF8 */
2266     otherd = fcc[d];
2267     }
2268     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2269 nigel 93 {
2270     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2271     {
2272     active_count--; /* Remove non-match possibility */
2273     next_active_state--;
2274     }
2275     ADD_NEW(state_offset + dlen + 1, 0);
2276     }
2277 nigel 77 }
2278     break;
2279    
2280     /*-----------------------------------------------------------------*/
2281 ph10 602 case OP_STARI:
2282     case OP_MINSTARI:
2283     case OP_POSSTARI:
2284     case OP_NOTSTARI:
2285     case OP_NOTMINSTARI:
2286     case OP_NOTPOSSTARI:
2287     caseless = TRUE;
2288     codevalue -= OP_STARI - OP_STAR;
2289     /* Fall through */
2290 nigel 77 case OP_STAR:
2291     case OP_MINSTAR:
2292 nigel 93 case OP_POSSTAR:
2293 nigel 77 case OP_NOTSTAR:
2294     case OP_NOTMINSTAR:
2295 nigel 93 case OP_NOTPOSSTAR:
2296 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2297     if (clen > 0)
2298     {
2299 nigel 93 unsigned int otherd = NOTACHAR;
2300 ph10 602 if (caseless)
2301 nigel 77 {
2302     #ifdef SUPPORT_UTF8
2303 zherczeg 781 if (utf && d >= 128)
2304 nigel 77 {
2305     #ifdef SUPPORT_UCP
2306 ph10 349 otherd = UCD_OTHERCASE(d);
2307 nigel 77 #endif /* SUPPORT_UCP */
2308     }
2309     else
2310     #endif /* SUPPORT_UTF8 */
2311     otherd = fcc[d];
2312     }
2313     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2314 nigel 93 {
2315     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2316     {
2317     active_count--; /* Remove non-match possibility */
2318     next_active_state--;
2319     }
2320     ADD_NEW(state_offset, 0);
2321     }
2322 nigel 77 }
2323     break;
2324    
2325     /*-----------------------------------------------------------------*/
2326 ph10 602 case OP_EXACTI:
2327     case OP_NOTEXACTI:
2328     caseless = TRUE;
2329     codevalue -= OP_STARI - OP_STAR;
2330     /* Fall through */
2331 nigel 77 case OP_EXACT:
2332 nigel 93 case OP_NOTEXACT:
2333     count = current_state->count; /* Number already matched */
2334     if (clen > 0)
2335     {
2336     unsigned int otherd = NOTACHAR;
2337 ph10 602 if (caseless)
2338 nigel 93 {
2339     #ifdef SUPPORT_UTF8
2340 zherczeg 781 if (utf && d >= 128)
2341 nigel 93 {
2342     #ifdef SUPPORT_UCP
2343 ph10 349 otherd = UCD_OTHERCASE(d);
2344 nigel 93 #endif /* SUPPORT_UCP */
2345     }
2346     else
2347     #endif /* SUPPORT_UTF8 */
2348     otherd = fcc[d];
2349     }
2350     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2351     {
2352     if (++count >= GET2(code, 1))
2353 zherczeg 769 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2354 nigel 93 else
2355     { ADD_NEW(state_offset, count); }
2356     }
2357     }
2358     break;
2359    
2360     /*-----------------------------------------------------------------*/
2361 ph10 602 case OP_UPTOI:
2362     case OP_MINUPTOI:
2363     case OP_POSUPTOI:
2364     case OP_NOTUPTOI:
2365     case OP_NOTMINUPTOI:
2366     case OP_NOTPOSUPTOI:
2367     caseless = TRUE;
2368     codevalue -= OP_STARI - OP_STAR;
2369     /* Fall through */
2370 nigel 77 case OP_UPTO:
2371     case OP_MINUPTO:
2372 nigel 93 case OP_POSUPTO:
2373 nigel 77 case OP_NOTUPTO:
2374     case OP_NOTMINUPTO:
2375 nigel 93 case OP_NOTPOSUPTO:
2376 zherczeg 769 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2377 nigel 77 count = current_state->count; /* Number already matched */
2378     if (clen > 0)
2379     {
2380 nigel 93 unsigned int otherd = NOTACHAR;
2381 ph10 602 if (caseless)
2382 nigel 77 {
2383     #ifdef SUPPORT_UTF8
2384 zherczeg 781 if (utf && d >= 128)
2385 nigel 77 {
2386     #ifdef SUPPORT_UCP
2387 ph10 349 otherd = UCD_OTHERCASE(d);
2388 nigel 77 #endif /* SUPPORT_UCP */
2389     }
2390     else
2391     #endif /* SUPPORT_UTF8 */
2392     otherd = fcc[d];
2393     }
2394     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2395     {
2396 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2397     {
2398     active_count--; /* Remove non-match possibility */
2399     next_active_state--;
2400     }
2401 nigel 77 if (++count >= GET2(code, 1))
2402 zherczeg 769 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2403 nigel 77 else
2404     { ADD_NEW(state_offset, count); }
2405     }
2406     }
2407     break;
2408    
2409    
2410     /* ========================================================================== */
2411     /* These are the class-handling opcodes */
2412    
2413     case OP_CLASS:
2414     case OP_NCLASS:
2415     case OP_XCLASS:
2416     {
2417     BOOL isinclass = FALSE;
2418     int next_state_offset;
2419 ph10 756 const pcre_uchar *ecode;
2420 nigel 77
2421     /* For a simple class, there is always just a 32-byte table, and we
2422     can set isinclass from it. */
2423    
2424     if (codevalue != OP_XCLASS)
2425     {
2426 zherczeg 770 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2427 nigel 77 if (clen > 0)
2428     {
2429     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2430     ((code[1 + c/8] & (1 << (c&7))) != 0);
2431     }
2432     }
2433    
2434     /* An extended class may have a table or a list of single characters,
2435     ranges, or both, and it may be positive or negative. There's a
2436     function that sorts all this out. */
2437    
2438     else
2439     {
2440     ecode = code + GET(code, 1);
2441 zherczeg 764 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE);
2442 nigel 77 }
2443    
2444     /* At this point, isinclass is set for all kinds of class, and ecode
2445     points to the byte after the end of the class. If there is a
2446     quantifier, this is where it will be. */
2447    
2448 ph10 530 next_state_offset = (int)(ecode - start_code);
2449 nigel 77
2450     switch (*ecode)
2451     {
2452     case OP_CRSTAR:
2453     case OP_CRMINSTAR:
2454     ADD_ACTIVE(next_state_offset + 1, 0);
2455     if (isinclass) { ADD_NEW(state_offset, 0); }
2456     break;
2457    
2458     case OP_CRPLUS:
2459     case OP_CRMINPLUS:
2460     count = current_state->count; /* Already matched */
2461     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2462     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2463     break;
2464    
2465     case OP_CRQUERY:
2466     case OP_CRMINQUERY:
2467     ADD_ACTIVE(next_state_offset + 1, 0);
2468     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2469     break;
2470    
2471     case OP_CRRANGE:
2472     case OP_CRMINRANGE:
2473     count = current_state->count; /* Already matched */
2474     if (count >= GET2(ecode, 1))
2475 zherczeg 769 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2476 nigel 77 if (isinclass)
2477     {
2478 nigel 91 int max = GET2(ecode, 3);
2479     if (++count >= max && max != 0) /* Max 0 => no limit */
2480 zherczeg 769 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2481 nigel 77 else
2482     { ADD_NEW(state_offset, count); }
2483     }
2484     break;
2485    
2486     default:
2487     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2488     break;
2489     }
2490     }
2491     break;
2492    
2493     /* ========================================================================== */
2494     /* These are the opcodes for fancy brackets of various kinds. We have
2495 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2496     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2497 ph10 341 though the other "backtracking verbs" are not supported. */
2498 ph10 345
2499 ph10 341 case OP_FAIL:
2500 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2501 ph10 345 break;
2502 nigel 77
2503     case OP_ASSERT:
2504     case OP_ASSERT_NOT:
2505     case OP_ASSERTBACK:
2506     case OP_ASSERTBACK_NOT:
2507     {
2508     int rc;
2509     int local_offsets[2];
2510     int local_workspace[1000];
2511 ph10 756 const pcre_uchar *endasscode = code + GET(code, 1);
2512 nigel 77
2513     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2514    
2515     rc = internal_dfa_exec(
2516     md, /* static match data */
2517     code, /* this subexpression's code */
2518     ptr, /* where we currently are */
2519 ph10 530 (int)(ptr - start_subject), /* start offset */
2520 nigel 77 local_offsets, /* offset vector */
2521     sizeof(local_offsets)/sizeof(int), /* size of same */
2522     local_workspace, /* workspace vector */
2523     sizeof(local_workspace)/sizeof(int), /* size of same */
2524 ph10 642 rlevel); /* function recursion level */
2525 ph10 487
2526 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2527 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2528 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2529 nigel 77 }
2530     break;
2531    
2532     /*-----------------------------------------------------------------*/
2533     case OP_COND:
2534 nigel 93 case OP_SCOND:
2535 nigel 77 {
2536     int local_offsets[1000];
2537     int local_workspace[1000];
2538 ph10 406 int codelink = GET(code, 1);
2539 ph10 397 int condcode;
2540 ph10 406
2541 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2542 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2543 ph10 398 happen for the other conditions. */
2544 nigel 77
2545 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2546 ph10 406 {
2547     rrc = 0;
2548 ph10 397 if (pcre_callout != NULL)
2549     {
2550     pcre_callout_block cb;
2551     cb.version = 1; /* Version 1 of the callout block */
2552     cb.callout_number = code[LINK_SIZE+2];
2553     cb.offset_vector = offsets;
2554     cb.subject = (PCRE_SPTR)start_subject;
2555 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2556     cb.start_match = (int)(current_subject - start_subject);
2557     cb.current_position = (int)(ptr - start_subject);
2558 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2559     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2560     cb.capture_top = 1;
2561     cb.capture_last = -1;
2562     cb.callout_data = md->callout_data;
2563 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2564 ph10 397 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2565     }
2566 ph10 398 if (rrc > 0) break; /* Fail this thread */
2567 zherczeg 764 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2568 ph10 406 }
2569 ph10 398
2570 ph10 397 condcode = code[LINK_SIZE+1];
2571 ph10 406
2572 nigel 93 /* Back reference conditions are not supported */
2573 nigel 77
2574 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2575 ph10 459 return PCRE_ERROR_DFA_UCOND;
2576 nigel 93
2577     /* The DEFINE condition is always false */
2578    
2579     if (condcode == OP_DEF)
2580 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2581 nigel 93
2582     /* The only supported version of OP_RREF is for the value RREF_ANY,
2583     which means "test if in any recursion". We can't test for specifically
2584     recursed groups. */
2585    
2586 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2587 nigel 93 {
2588 nigel 77 int value = GET2(code, LINK_SIZE+2);
2589 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2590 ph10 654 if (md->recursive != NULL)
2591 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2592     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2593 nigel 77 }
2594    
2595     /* Otherwise, the condition is an assertion */
2596    
2597     else
2598     {
2599     int rc;
2600 ph10 756 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2601     const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2602 nigel 77
2603     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2604    
2605     rc = internal_dfa_exec(
2606     md, /* fixed match data */
2607     asscode, /* this subexpression's code */
2608     ptr, /* where we currently are */
2609 ph10 530 (int)(ptr - start_subject), /* start offset */
2610 nigel 77 local_offsets, /* offset vector */
2611     sizeof(local_offsets)/sizeof(int), /* size of same */
2612     local_workspace, /* workspace vector */
2613     sizeof(local_workspace)/sizeof(int), /* size of same */
2614 ph10 642 rlevel); /* function recursion level */
2615 nigel 77
2616 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2617 nigel 77 if ((rc >= 0) ==
2618     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2619 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2620 nigel 77 else
2621 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2622 nigel 77 }
2623     }
2624     break;
2625    
2626     /*-----------------------------------------------------------------*/
2627     case OP_RECURSE:
2628     {
2629 ph10 654 dfa_recursion_info *ri;
2630 nigel 77 int local_offsets[1000];
2631     int local_workspace[1000];
2632 ph10 756 const pcre_uchar *callpat = start_code + GET(code, 1);
2633 ph10 654 int recno = (callpat == md->start_code)? 0 :
2634     GET2(callpat, 1 + LINK_SIZE);
2635 nigel 77 int rc;
2636    
2637 ph10 642 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2638 ph10 654
2639 ph10 642 /* Check for repeating a recursion without advancing the subject
2640     pointer. This should catch convoluted mutual recursions. (Some simple
2641     cases are caught at compile time.) */
2642 nigel 77
2643 ph10 654 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2644     if (recno == ri->group_num && ptr == ri->subject_position)
2645     return PCRE_ERROR_RECURSELOOP;
2646    
2647     /* Remember this recursion and where we started it so as to
2648 ph10 642 catch infinite loops. */
2649 ph10 654
2650 ph10 642 new_recursive.group_num = recno;
2651     new_recursive.subject_position = ptr;
2652     new_recursive.prevrec = md->recursive;
2653 ph10 654 md->recursive = &new_recursive;
2654 ph10 642
2655 nigel 77 rc = internal_dfa_exec(
2656     md, /* fixed match data */
2657 ph10 642 callpat, /* this subexpression's code */
2658 nigel 77 ptr, /* where we currently are */
2659 ph10 530 (int)(ptr - start_subject), /* start offset */
2660 nigel 77 local_offsets, /* offset vector */
2661     sizeof(local_offsets)/sizeof(int), /* size of same */
2662     local_workspace, /* workspace vector */
2663     sizeof(local_workspace)/sizeof(int), /* size of same */
2664 ph10 642 rlevel); /* function recursion level */
2665 nigel 77
2666 ph10 642 md->recursive = new_recursive.prevrec; /* Done this recursion */
2667 nigel 77
2668 ph10 654 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2669 ph10 642 rc));
2670    
2671 nigel 77 /* Ran out of internal offsets */
2672    
2673     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2674    
2675     /* For each successful matched substring, set up the next state with a
2676     count of characters to skip before trying it. Note that the count is in
2677     characters, not bytes. */
2678    
2679     if (rc > 0)
2680     {
2681     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2682     {
2683 ph10 756 const pcre_uchar *p = start_subject + local_offsets[rc];
2684     const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2685 nigel 77 int charcount = local_offsets[rc+1] - local_offsets[rc];
2686     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2687     if (charcount > 0)
2688     {
2689     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2690     }
2691     else
2692     {
2693     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2694     }
2695     }
2696     }
2697     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2698     }
2699     break;
2700    
2701     /*-----------------------------------------------------------------*/
2702 ph10 604 case OP_BRAPOS:
2703     case OP_SBRAPOS:
2704     case OP_CBRAPOS:
2705     case OP_SCBRAPOS:
2706 ph10 654 case OP_BRAPOSZERO:
2707 ph10 604 {
2708     int charcount, matched_count;
2709 ph10 756 const pcre_uchar *local_ptr = ptr;
2710 ph10 604 BOOL allow_zero;
2711 ph10 654
2712 ph10 604 if (codevalue == OP_BRAPOSZERO)
2713     {
2714     allow_zero = TRUE;
2715     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2716     }
2717 ph10 654 else allow_zero = FALSE;
2718    
2719     /* Loop to match the subpattern as many times as possible as if it were
2720     a complete pattern. */
2721    
2722 ph10 604 for (matched_count = 0;; matched_count++)
2723     {
2724     int local_offsets[2];
2725     int local_workspace[1000];
2726 ph10 654
2727 ph10 604 int rc = internal_dfa_exec(
2728     md, /* fixed match data */
2729     code, /* this subexpression's code */
2730     local_ptr, /* where we currently are */
2731     (int)(ptr - start_subject), /* start offset */
2732     local_offsets, /* offset vector */
2733     sizeof(local_offsets)/sizeof(int), /* size of same */
2734     local_workspace, /* workspace vector */
2735     sizeof(local_workspace)/sizeof(int), /* size of same */
2736 ph10 642 rlevel); /* function recursion level */
2737 ph10 654
2738 ph10 604 /* Failed to match */
2739 ph10 654
2740     if (rc < 0)
2741 ph10 604 {
2742     if (rc != PCRE_ERROR_NOMATCH) return rc;
2743     break;
2744 ph10 654 }
2745    
2746 ph10 604 /* Matched: break the loop if zero characters matched. */
2747 ph10 654
2748 ph10 604 charcount = local_offsets[1] - local_offsets[0];
2749 ph10 654 if (charcount == 0) break;
2750 ph10 604 local_ptr += charcount; /* Advance temporary position ptr */
2751 ph10 654 }
2752 ph10 604
2753     /* At this point we have matched the subpattern matched_count
2754 ph10 654 times, and local_ptr is pointing to the character after the end of the
2755     last match. */
2756 ph10 604
2757     if (matched_count > 0 || allow_zero)
2758 ph10 654 {
2759 ph10 756 const pcre_uchar *end_subpattern = code;
2760 ph10 604 int next_state_offset;
2761 ph10 654
2762 ph10 604 do { end_subpattern += GET(end_subpattern, 1); }
2763     while (*end_subpattern == OP_ALT);
2764     next_state_offset =
2765     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2766    
2767     /* Optimization: if there are no more active states, and there
2768     are no new states yet set up, then skip over the subject string
2769     right here, to save looping. Otherwise, set up the new state to swing
2770     into action when the end of the matched substring is reached. */
2771    
2772     if (i + 1 >= active_count && new_count == 0)
2773     {
2774     ptr = local_ptr;
2775     clen = 0;
2776     ADD_NEW(next_state_offset, 0);
2777     }
2778     else
2779     {
2780 ph10 756 const pcre_uchar *p = ptr;
2781     const pcre_uchar *pp = local_ptr;
2782 ph10 654 charcount = pp - p;
2783 ph10 604 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2784     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2785     }
2786 ph10 654 }
2787     }
2788 ph10 604 break;
2789 ph10 654
2790 ph10 604 /*-----------------------------------------------------------------*/
2791 nigel 77 case OP_ONCE:
2792 ph10 733 case OP_ONCE_NC:
2793 nigel 77 {
2794     int local_offsets[2];
2795     int local_workspace[1000];
2796    
2797     int rc = internal_dfa_exec(
2798     md, /* fixed match data */
2799     code, /* this subexpression's code */
2800     ptr, /* where we currently are */
2801 ph10 530 (int)(ptr - start_subject), /* start offset */
2802 nigel 77 local_offsets, /* offset vector */
2803     sizeof(local_offsets)/sizeof(int), /* size of same */
2804     local_workspace, /* workspace vector */
2805     sizeof(local_workspace)/sizeof(int), /* size of same */
2806 ph10 642 rlevel); /* function recursion level */
2807 nigel 77
2808     if (rc >= 0)
2809     {
2810 ph10 756 const pcre_uchar *end_subpattern = code;
2811 nigel 77 int charcount = local_offsets[1] - local_offsets[0];
2812     int next_state_offset, repeat_state_offset;
2813    
2814     do { end_subpattern += GET(end_subpattern, 1); }
2815     while (*end_subpattern == OP_ALT);
2816 ph10 535 next_state_offset =
2817 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2818 nigel 77
2819     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2820     arrange for the repeat state also to be added to the relevant list.
2821     Calculate the offset, or set -1 for no repeat. */
2822    
2823     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2824     *end_subpattern == OP_KETRMIN)?
2825 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2826 nigel 77
2827     /* If we have matched an empty string, add the next state at the
2828     current character pointer. This is important so that the duplicate
2829     checking kicks in, which is what breaks infinite loops that match an
2830     empty string. */
2831    
2832     if (charcount == 0)
2833     {
2834     ADD_ACTIVE(next_state_offset, 0);
2835     }
2836    
2837     /* Optimization: if there are no more active states, and there
2838     are no new states yet set up, then skip over the subject string
2839     right here, to save looping. Otherwise, set up the new state to swing
2840 ph10 604 into action when the end of the matched substring is reached. */
2841 nigel 77
2842     else if (i + 1 >= active_count && new_count == 0)
2843     {
2844     ptr += charcount;
2845     clen = 0;
2846     ADD_NEW(next_state_offset, 0);
2847    
2848     /* If we are adding a repeat state at the new character position,
2849     we must fudge things so that it is the only current state.
2850     Otherwise, it might be a duplicate of one we processed before, and
2851     that would cause it to be skipped. */
2852    
2853     if (repeat_state_offset >= 0)
2854     {
2855     next_active_state = active_states;
2856     active_count = 0;
2857     i = -1;
2858     ADD_ACTIVE(repeat_state_offset, 0);
2859     }
2860     }
2861     else
2862     {
2863 ph10 756 const pcre_uchar *p = start_subject + local_offsets[0];
2864     const pcre_uchar *pp = start_subject + local_offsets[1];
2865 nigel 77 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2866     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2867     if (repeat_state_offset >= 0)
2868     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2869     }
2870     }
2871     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2872     }
2873     break;
2874    
2875    
2876     /* ========================================================================== */
2877     /* Handle callouts */
2878    
2879     case OP_CALLOUT:
2880 ph10 406 rrc = 0;
2881 nigel 77 if (pcre_callout != NULL)
2882     {
2883     pcre_callout_block cb;
2884     cb.version = 1; /* Version 1 of the callout block */
2885     cb.callout_number = code[1];
2886     cb.offset_vector = offsets;
2887 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2888 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2889     cb.start_match = (int)(current_subject - start_subject);
2890     cb.current_position = (int)(ptr - start_subject);
2891 nigel 77 cb.pattern_position = GET(code, 2);
2892     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2893     cb.capture_top = 1;
2894     cb.capture_last = -1;
2895     cb.callout_data = md->callout_data;
2896 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2897 nigel 77 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2898 ph10 406 }
2899     if (rrc == 0)
2900 zherczeg 764 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2901 nigel 77 break;
2902    
2903    
2904     /* ========================================================================== */
2905     default: /* Unsupported opcode */
2906     return PCRE_ERROR_DFA_UITEM;
2907     }
2908    
2909     NEXT_ACTIVE_STATE: continue;
2910    
2911     } /* End of loop scanning active states */
2912    
2913     /* We have finished the processing at the current subject character. If no
2914     new states have been set for the next character, we have found all the
2915     matches that we are going to find. If we are at the top level and partial
2916 ph10 463 matching has been requested, check for appropriate conditions.
2917    
2918 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2919     character. If it is equal to the original active_count (saved in
2920     workspace[1]) it means that (*F) was found on every active state. In this
2921 ph10 463 case we don't want to give a partial match.
2922 nigel 77
2923 ph10 463 The "could_continue" variable is true if a state could have continued but
2924     for the fact that the end of the subject was reached. */
2925    
2926 nigel 77 if (new_count <= 0)
2927     {
2928 ph10 427 if (rlevel == 1 && /* Top level, and */
2929 ph10 463 could_continue && /* Some could go on */
2930 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2931 ph10 427 ( /* either... */
2932     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2933     || /* or... */
2934     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2935     match_count < 0) /* no matches */
2936     ) && /* And... */
2937 ph10 553 ptr >= end_subject && /* Reached end of subject */
2938     ptr > md->start_used_ptr) /* Inspected non-empty string */
2939 nigel 77 {
2940     if (offsetcount >= 2)
2941     {
2942 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
2943     offsets[1] = (int)(end_subject - start_subject);
2944 nigel 77 }
2945     match_count = PCRE_ERROR_PARTIAL;
2946     }
2947    
2948     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2949     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2950     rlevel*2-2, SP));
2951 nigel 91 break; /* In effect, "return", but see the comment below */
2952 nigel 77 }
2953    
2954     /* One or more states are active for the next character. */
2955    
2956     ptr += clen; /* Advance to next subject character */
2957     } /* Loop to move along the subject string */
2958    
2959 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2960     if we use "return" above, we have compiler trouble. Some compilers warn if
2961     there's nothing here because they think the function doesn't return a value. On
2962     the other hand, if we put a dummy statement here, some more clever compilers
2963     complain that it can't be reached. Sigh. */
2964 nigel 77
2965 nigel 91 return match_count;
2966 nigel 77 }
2967    
2968    
2969    
2970    
2971     /*************************************************
2972     * Execute a Regular Expression - DFA engine *
2973     *************************************************/
2974    
2975     /* This external function applies a compiled re to a subject string using a DFA
2976     engine. This function calls the internal function multiple times if the pattern
2977     is not anchored.
2978    
2979     Arguments:
2980     argument_re points to the compiled expression
2981 ph10 97 extra_data points to extra data or is NULL
2982 nigel 77 subject points to the subject string
2983     length length of subject string (may contain binary zeros)
2984     start_offset where to start in the subject string
2985     options option bits
2986     offsets vector of match offsets
2987     offsetcount size of same
2988     workspace workspace vector
2989     wscount size of same
2990    
2991     Returns: > 0 => number of match offset pairs placed in offsets
2992     = 0 => offsets overflowed; longest matches are present
2993     -1 => failed to match
2994     < -1 => some kind of unexpected problem
2995     */
2996    
2997 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2998 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2999     const char *subject, int length, int start_offset, int options, int *offsets,
3000     int offsetcount, int *workspace, int wscount)
3001     {
3002     real_pcre *re = (real_pcre *)argument_re;
3003     dfa_match_data match_block; /* per-call match data; always accessed through md */
3004 nigel 91 dfa_match_data *md = &match_block;
3005 zherczeg 781 BOOL utf, anchored, startline, firstline;
3006 ph10 756 const pcre_uchar *current_subject, *end_subject;
3007     const pcre_uint8 *lcc;
3008 nigel 77
3009     pcre_study_data internal_study;
3010     const pcre_study_data *study = NULL;
3011     real_pcre internal_re;
3012    
3013 zherczeg 774 const pcre_uchar *req_char_ptr; /* where req_char was last found by the search below */
3014 ph10 756 const pcre_uint8 *start_bits = NULL; /* study bitmap of possible first chars; NULL if unused */
3015 zherczeg 774 BOOL has_first_char = FALSE;
3016     BOOL has_req_char = FALSE;
3017     pcre_uchar first_char = 0;
3018     pcre_uchar first_char2 = 0;
3019     pcre_uchar req_char = 0;
3020     pcre_uchar req_char2 = 0;
3021 nigel 91 int newline;
3022 nigel 77
3023     /* Plausibility checks */
3024    
3025     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3026     if (re == NULL || subject == NULL || workspace == NULL ||
3027     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3028     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3029     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE; /* workspace must hold at least 20 ints */
3030 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET; /* start_offset == length is accepted */
3031 nigel 77
3032     /* We need to find the pointer to any study data before we test for byte
3033     flipping, so we scan the extra_data block first. This may set two fields in the
3034     match block, so we must initialize them beforehand. However, the other fields
3035     in the match block must not be set until after the byte flipping. */
3036    
3037 nigel 91 md->tables = re->tables;
3038     md->callout_data = NULL;
3039 nigel 77
3040     if (extra_data != NULL)
3041     {
3042     unsigned int flags = extra_data->flags;
3043     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3044     study = (const pcre_study_data *)extra_data->study_data;
3045     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT; /* match limits are rejected by this function */
3046 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3047     return PCRE_ERROR_DFA_UMLIMIT; /* same error code as for the plain match limit */
3048 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3049 nigel 91 md->callout_data = extra_data->callout_data;
3050 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3051 nigel 91 md->tables = extra_data->tables; /* overrides the tables taken from re above */
3052 nigel 77 }
3053 ph10 461
3054 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
3055     test for a regex that was compiled on a host of opposite endianness. If this is
3056     the case, flipped values are put in internal_re and internal_study if there was
3057     study data too. */
3058    
3059     if (re->magic_number != MAGIC_NUMBER)
3060     {
3061 zherczeg 764 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
3062 nigel 77 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3063     if (study != NULL) study = &internal_study;
3064     }
3065    
3066     /* Set some local values */
3067    
3068     current_subject = (const unsigned char *)subject + start_offset; /* NOTE(review): cast is to unsigned char although the variable is pcre_uchar - confirm for builds where the two types differ */
3069     end_subject = (const unsigned char *)subject + length; /* NOTE(review): same cast/type mismatch as the line above */
3070 zherczeg 774 req_char_ptr = current_subject - 1; /* start value makes the first "p > req_char_ptr" test below succeed; NOTE(review): with start_offset == 0 this points one element before the subject */
3071 nigel 77
3072 nigel 91 #ifdef SUPPORT_UTF8
3073 zherczeg 781 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3074     utf = (re->options & PCRE_UTF8) != 0; /* NOTE(review): guarded by SUPPORT_UTF8 here, but the scanning loops below test SUPPORT_UTF - confirm the two macros are always defined together */
3075 nigel 91 #else
3076 zherczeg 781 utf = FALSE;
3077 nigel 91 #endif
3078 nigel 77
3079 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 || /* a restart is always treated as anchored */
3080     (re->options & PCRE_ANCHORED) != 0;
3081    
3082 nigel 77 /* The remaining fixed data for passing around. */
3083    
3084 ph10 756 md->start_code = (const pcre_uchar *)argument_re +
3085 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3086 nigel 91 md->start_subject = (const unsigned char *)subject; /* NOTE(review): unsigned char cast again - see current_subject above */
3087     md->end_subject = end_subject;
3088 ph10 442 md->start_offset = start_offset;
3089 nigel 91 md->moptions = options;
3090     md->poptions = re->options;
3091 nigel 77
3092 ph10 231 /* If the BSR option is not set at match time, copy what was set
3093     at compile time. */
3094    
3095     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3096     {
3097     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3098     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3099     #ifdef BSR_ANYCRLF
3100     else md->moptions |= PCRE_BSR_ANYCRLF;
3101 ph10 243 #endif
3102     }
3103 ph10 231
3104 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3105     nothing is set at run time, whatever was used at compile time applies. */
3106 nigel 91
3107 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3108 nigel 93 PCRE_NEWLINE_BITS)
3109 nigel 91 {
3110 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3111 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3112     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3113 nigel 91 case PCRE_NEWLINE_CR+
3114 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break; /* two characters packed into one int; unpacked below */
3115 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3116 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3117 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3118 nigel 91 }
3119    
3120 ph10 149 if (newline == -2)
3121 nigel 91 {
3122 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3123     }
3124     else if (newline < 0)
3125     {
3126 nigel 93 md->nltype = NLTYPE_ANY;
3127 nigel 91 }
3128     else
3129     {
3130 nigel 93 md->nltype = NLTYPE_FIXED;
3131     if (newline > 255) /* packed two-character newline: high byte first, low byte second */
3132     {
3133     md->nllen = 2;
3134     md->nl[0] = (newline >> 8) & 255;
3135     md->nl[1] = newline & 255;
3136     }
3137     else
3138     {
3139     md->nllen = 1;
3140     md->nl[0] = newline;
3141     }
3142 nigel 91 }
3143    
3144 nigel 77 /* Check a UTF-8 string if required. If it is invalid, the error offset and
3145     reason code are passed back in offsets[0] and offsets[1] when there is room. */
3146    
3147     #ifdef SUPPORT_UTF8
3148 zherczeg 781 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3149 nigel 77 {
3150 ph10 654 int erroroffset;
3151 zherczeg 781 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3152 ph10 606 if (errorcode != 0)
3153 ph10 598 {
3154     if (offsetcount >= 2)
3155     {
3156 ph10 606 offsets[0] = erroroffset;
3157 ph10 598 offsets[1] = errorcode;
3158 ph10 654 }
3159 ph10 598 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)? /* low error codes under hard partial matching report SHORTUTF8 instead of BADUTF8 */
3160 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3161 ph10 654 }
3162 zherczeg 782 #ifdef COMPILE_PCRE8
3163 ph10 606 if (start_offset > 0 && start_offset < length &&
3164 ph10 756 (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80) /* top two bits 10: rejected as a bad starting offset */
3165 ph10 606 return PCRE_ERROR_BADUTF8_OFFSET;
3166 zherczeg 782 #else
3167     #ifdef COMPILE_PCRE16
3168     if (start_offset > 0 && start_offset < length &&
3169     (((PCRE_PUCHAR)subject)[start_offset] & 0xfc00) == 0xdc00) /* unit in the range 0xdc00-0xdfff: rejected as a bad starting offset */
3170     return PCRE_ERROR_BADUTF8_OFFSET;
3171     #endif /* COMPILE_PCRE16 */
3172     #endif /* COMPILE_PCRE8 */
3173 nigel 77 }
3174     #endif
3175    
3176     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3177     is a feature that makes it possible to save compiled regex and re-use them
3178     in other programs later. */
3179    
3180 zherczeg 764 if (md->tables == NULL) md->tables = PRIV(default_tables);
3181 nigel 77
3182     /* The lower casing table and the "must be at the start of a line" flag are
3183     used in a loop when finding where to start. */
3184    
3185 nigel 91 lcc = md->tables + lcc_offset; /* NOTE(review): lcc does not appear to be read again in this function (case flipping below uses fcc_offset) - possibly a leftover; confirm no macro depends on it */
3186 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3187 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3188    
3189     /* Set up the first character to match, if available. The first_byte value is
3190     never set for an anchored regular expression, but the anchoring may be forced
3191     at run time, so we have to test for anchoring. The first char may be unset for
3192     an unanchored pattern, of course. If there's no first char and the pattern was
3193     studied, there may be a bitmap of possible first characters. */
3194    
3195     if (!anchored)
3196     {
3197 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3198 nigel 77 {
3199 zherczeg 774 has_first_char = TRUE;
3200     first_char = first_char2 = re->first_char;
3201     if ((re->flags & PCRE_FCH_CASELESS) != 0)
3202     first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char); /* presumably the other-case form of first_char - TABLE_GET is defined elsewhere */
3203 nigel 77 }
3204     else
3205     {
3206 ph10 455 if (!startline && study != NULL &&
3207     (study->flags & PCRE_STUDY_MAPPED) != 0)
3208 nigel 77 start_bits = study->start_bits;
3209     }
3210     }
3211    
3212     /* For anchored or unanchored matches, there may be a "last known required
3213     character" set. */
3214    
3215 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3216 nigel 77 {
3217 zherczeg 774 has_req_char = TRUE;
3218     req_char = req_char2 = re->req_char;
3219     if ((re->flags & PCRE_RCH_CASELESS) != 0)
3220     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char); /* presumably the other-case form of req_char - TABLE_GET is defined elsewhere */
3221 nigel 77 }
3222    
3223     /* Call the main matching function, looping for a non-anchored regex after a
3224 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3225     a match. */
3226 nigel 77
3227     for (;;)
3228     {
3229     int rc;
3230    
3231     if ((options & PCRE_DFA_RESTART) == 0)
3232     {
3233 ph10 756 const pcre_uchar *save_end_subject = end_subject; /* restored after the first-line fudge below */
3234 nigel 77
3235 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3236     line of a multiline string. Implement this by temporarily adjusting
3237     end_subject so that we stop scanning at a newline. If the match fails at
3238     the newline, later code breaks this loop. */
3239 nigel 77
3240     if (firstline)
3241     {
3242 ph10 756 PCRE_PUCHAR t = current_subject;
3243 zherczeg 782 #ifdef SUPPORT_UTF
3244 zherczeg 781 if (utf)
3245 ph10 371 {
3246     while (t < md->end_subject && !IS_NEWLINE(t))
3247 ph10 365 {
3248     t++;
3249 zherczeg 782 INTERNALCHAR(t < end_subject, *t, t++);
3250 ph10 371 }
3251 ph10 365 }
3252     else
3253 ph10 371 #endif
3254 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3255 nigel 77 end_subject = t;
3256     }
3257 ph10 392
3258 ph10 389 /* There are some optimizations that avoid running the match if a known
3259 ph10 455 starting point is not found. However, there is an option that disables
3260 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3261 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3262     match-time options. */
3263 nigel 77
3264 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3265 ph10 392 {
3266 zherczeg 774 /* Advance to a known first char. */
3267 ph10 392
3268 zherczeg 774 if (has_first_char)
3269 nigel 77 {
3270 zherczeg 774 if (first_char != first_char2) /* two candidate values to look for */
3271 ph10 389 while (current_subject < end_subject &&
3272 zherczeg 774 *current_subject != first_char && *current_subject != first_char2)
3273 ph10 389 current_subject++;
3274     else
3275 ph10 392 while (current_subject < end_subject &&
3276 zherczeg 774 *current_subject != first_char)
3277 ph10 389 current_subject++;
3278     }
3279 ph10 392
3280 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3281 ph10 392
3282 ph10 389 else if (startline)
3283     {
3284     if (current_subject > md->start_subject + start_offset) /* false on the first attempt, which begins exactly at start_offset */
3285     {
3286 zherczeg 782 #ifdef SUPPORT_UTF
3287 zherczeg 781 if (utf)
3288 ph10 365 {
3289 ph10 392 while (current_subject < end_subject &&
3290 ph10 389 !WAS_NEWLINE(current_subject))
3291     {
3292 ph10 365 current_subject++;
3293 zherczeg 782 INTERNALCHAR(current_subject < end_subject, *current_subject,
3294     current_subject++);
3295 ph10 389 }
3296 ph10 371 }
3297 ph10 389 else
3298     #endif
3299     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3300     current_subject++;
3301 ph10 392
3302 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3303     ANYCRLF, and we are now at a LF, advance the match position by one
3304     more character. */
3305 ph10 392
3306 ph10 391 if (current_subject[-1] == CHAR_CR &&
3307 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3308     current_subject < end_subject &&
3309 ph10 391 *current_subject == CHAR_NL)
3310 ph10 389 current_subject++;
3311 ph10 365 }
3312 nigel 77 }
3313 ph10 392
3314 ph10 389 /* Or to a non-unique first char after study */
3315 ph10 392
3316 ph10 389 else if (start_bits != NULL)
3317 nigel 77 {
3318 ph10 389 while (current_subject < end_subject)
3319     {
3320     register unsigned int c = *current_subject;
3321 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0) /* NOTE(review): c is not range-checked before indexing; if pcre_uchar can exceed 255 this may read past the bitmap - confirm */
3322 ph10 538 {
3323     current_subject++;
3324 zherczeg 782 #ifdef SUPPORT_UTF
3325 zherczeg 781 if (utf)
3326 zherczeg 782 INTERNALCHAR(current_subject < end_subject, *current_subject,
3327     current_subject++);
3328 ph10 545 #endif
3329 ph10 538 }
3330     else break;
3331 ph10 389 }
3332 nigel 77 }
3333 ph10 392 }
3334 nigel 77
3335     /* Restore fudged end_subject */
3336    
3337     end_subject = save_end_subject;
3338    
3339 ph10 461 /* The following two optimizations are disabled for partial matching or if
3340     disabling is explicitly requested (and of course, by the test above, this
3341 ph10 455 code is not obeyed when restarting after a partial match). */
3342 ph10 461
3343 ph10 728 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3344 ph10 455 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3345 ph10 461 {
3346 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3347     is a lower bound; no actual string of that length may actually match the
3348     pattern. Although the value is, strictly, in characters, we treat it as
3349     bytes to avoid spending too much time in this optimization. */
3350 nigel 77
3351 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3352 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3353 ph10 455 return PCRE_ERROR_NOMATCH;
3354 ph10 461
3355 zherczeg 774 /* If req_char is set, we know that that character must appear in the
3356     subject for the match to succeed. If the first character is set, req_char
3357 ph10 455 must be later in the subject; otherwise the test starts at the match
3358     point. This optimization can save a huge amount of work in patterns with
3359     nested unlimited repeats that aren't going to match. Writing separate
3360     code for cased/caseless versions makes it go faster, as does using an
3361     autoincrement and backing off on a match.
3362 ph10 461
3363 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3364     can take a long time, and give bad performance on quite ordinary
3365     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3366     string... so we don't do this when the string is sufficiently long. */
3367 ph10 461
3368 zherczeg 774 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3369 nigel 77 {
3370 zherczeg 774 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3371 ph10 461
3372 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3373     place we found it at last time. */
3374 ph10 461
3375 zherczeg 774 if (p > req_char_ptr)
3376 nigel 77 {
3377 zherczeg 774 if (req_char != req_char2)
3378 ph10 455 {
3379     while (p < end_subject)
3380     {
3381     register int pp = *p++;
3382 zherczeg 774 if (pp == req_char || pp == req_char2) { p--; break; } /* back off so p points at the found character */
3383 ph10 455 }
3384     }
3385     else
3386     {
3387     while (p < end_subject)
3388     {
3389 zherczeg 774 if (*p++ == req_char) { p--; break; } /* back off so p points at the found character */
3390 ph10 455 }
3391     }
3392 ph10 461
3393 ph10 455 /* If we can't find the required character, break the matching loop,
3394     which will cause a return of PCRE_ERROR_NOMATCH. */
3395 ph10 461
3396 ph10 455 if (p >= end_subject) break;
3397 ph10 461
3398 ph10 455 /* If we have found the required character, save the point where we
3399     found it, so that we don't search again next time round the loop if
3400     the start hasn't passed this character yet. */
3401 ph10 461
3402 zherczeg 774 req_char_ptr = p;
3403 nigel 77 }
3404 ph10 461 }
3405 nigel 77 }
3406 ph10 455 } /* End of optimizations that are done when not restarting */
3407 nigel 77
3408     /* OK, now we can do the business */
3409    
3410 ph10 435 md->start_used_ptr = current_subject;
3411 ph10 654 md->recursive = NULL; /* no recursion is in progress at the start of each attempt */
3412 ph10 461
3413 nigel 77 rc = internal_dfa_exec(
3414 nigel 91 md, /* fixed match data */
3415     md->start_code, /* this subexpression's code */
3416     current_subject, /* where we currently are */
3417     start_offset, /* start offset in subject */
3418     offsets, /* offset vector */
3419     offsetcount, /* size of same */
3420     workspace, /* workspace vector */
3421     wscount, /* size of same */
3422 ph10 642 0); /* function recurse level */
3423 nigel 77
3424     /* Anything other than "no match" means we are done, always; otherwise, carry
3425     on only if not anchored. */
3426    
3427     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3428    
3429     /* Advance to the next subject character unless we are at the end of a line
3430     and firstline is set. */
3431    
3432 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3433 nigel 77 current_subject++;
3434 zherczeg 782 #ifdef SUPPORT_UTF
3435 zherczeg 781 if (utf)
3436 nigel 77 {
3437 zherczeg 782 INTERNALCHAR(current_subject < end_subject, *current_subject,
3438     current_subject++);
3439 nigel 77 }
3440 zherczeg 782 #endif
3441 nigel 77 if (current_subject > end_subject) break; /* '>' not '>=': one more attempt is made with current_subject == end_subject */
3442    
3443 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3444 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3445     or ANY or ANYCRLF, advance the match position by one more character. */
3446 nigel 93
3447 ph10 391 if (current_subject[-1] == CHAR_CR &&
3448 ph10 226 current_subject < end_subject &&
3449 ph10 391 *current_subject == CHAR_NL &&
3450 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3451 ph10 226 (md->nltype == NLTYPE_ANY ||
3452     md->nltype == NLTYPE_ANYCRLF ||
3453     md->nllen == 2))
3454 nigel 93 current_subject++;
3455    
3456     } /* "Bumpalong" loop */
3457    
3458 nigel 77 return PCRE_ERROR_NOMATCH; /* reached only through one of the "break" statements in the loop above */
3459     }
3460    
3461     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12