/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 774 - (hide annotations) (download)
Thu Dec 1 06:08:45 2011 UTC (2 years, 10 months ago) by zherczeg
File MIME type: text/plain
File size: 120231 byte(s)
better digit parsing, first_byte, req_byte are renamed to first_char req_char respectively
1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 598 Copyright (c) 1997-2011 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49     the performance of his patterns greatly. I could not use it as it stood, as it
50     was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
52    
53     The issue is the check for duplicate states, which is done by a simple linear
54     search up the state list. (Grep for "duplicate" below to find the code.) For
55     many patterns, there will never be many states active at one time, so a simple
56     linear search is fine. In patterns that have many active states, it might be a
57     bottleneck. The suggested code used an indexing scheme to remember which states
58     had previously been used for each character, and avoided the linear search when
59     it knew there was no chance of a duplicate. This was implemented when adding
60     states to the state lists.
61    
62     I wrote some thread-safe, not-limited code to try something similar at the time
63     of checking for duplicates (instead of when adding states), using index vectors
64     on the stack. It did give a 13% improvement with one specially constructed
65     pattern for certain subject strings, but on other strings and on many of the
66     simpler patterns in the test suite it did worse. The major problem, I think,
67     was the extra time to initialize the index. This had to be done for each call
68     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69     only once - I suspect this was the cause of the problems with the tests.)
70    
71 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
72 ph10 439 in others, so I abandoned this code. */
73    
74    
75    
76 ph10 200 #ifdef HAVE_CONFIG_H
77 ph10 236 #include "config.h"
78 ph10 200 #endif
79 ph10 199
80 nigel 93 #define NLBLOCK md /* Block containing newline information */
81     #define PSSTART start_subject /* Field containing processed string start */
82     #define PSEND end_subject /* Field containing processed string end */
83    
84 nigel 77 #include "pcre_internal.h"
85    
86    
87     /* For use to indent debugging output */
88    
89     #define SP " "
90    
91    
92     /*************************************************
93     * Code parameters and static tables *
94     *************************************************/
95    
96     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
97 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
98 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
99 ph10 178 never stored, so we push them well clear of the normal opcodes. */
100 nigel 77
/* Each value below is added to codevalue inside internal_dfa_exec() when the
opcode is OP_TYPESTAR or later and its operand is one of the items named on the
line. The values step by 20, matching the "gap of 20" described above. */
101 ph10 178 #define OP_PROP_EXTRA 300 /* operand is OP_PROP or OP_NOTPROP */
102     #define OP_EXTUNI_EXTRA 320 /* operand is OP_EXTUNI */
103     #define OP_ANYNL_EXTRA 340 /* operand is OP_ANYNL */
104     #define OP_HSPACE_EXTRA 360 /* operand is OP_HSPACE or OP_NOT_HSPACE */
105     #define OP_VSPACE_EXTRA 380 /* operand is OP_VSPACE or OP_NOT_VSPACE */
106 nigel 77
107    
108     /* This table identifies those opcodes that are followed immediately by a
109 ph10 510 character that is to be tested in some way. This makes it possible to
110 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
111     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
113 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
114     modified, the three tables that follow must also be modified. */
115 nigel 77
/* coptable is indexed by opcode value; there is one entry per opcode, in
opcode order, as labelled on each row. An entry of 0 means that no inline
character follows the opcode. A non-zero entry is the offset from the opcode at
which internal_dfa_exec() loads that character into "d". The upto, minupto,
exact and upto+ forms use offset 3 instead of 1 (presumably because a repeat
count sits between the opcode and the character - TODO confirm against the
compiler). The switch in internal_dfa_exec() contains a duplicate-case check
that breaks the build unless sizeof(coptable) == OP_TABLE_LENGTH, so rows must
be added or removed in step with the opcode list. */
116 ph10 756 static const pcre_uint8 coptable[] = {
117 nigel 77 0, /* End */
118 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
121 ph10 498 0, 0, /* \P, \p */
122 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 ph10 498 0, /* \X */
124 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
125 nigel 77 1, /* Char */
126 ph10 602 1, /* Chari */
127 nigel 77 1, /* not */
128 ph10 602 1, /* noti */
129 nigel 77 /* Positive single-char repeats */
130     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131     3, 3, 3, /* upto, minupto, exact */
132 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134     3, 3, 3, /* upto I, minupto I, exact I */
135     1, 1, 1, 3, /* *+I, ++I, ?+I, upto+I */
136 nigel 77 /* Negative single-char repeats - only for chars < 256 */
137     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
138     3, 3, 3, /* NOT upto, minupto, exact */
139 ph10 602 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
140     1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
141     3, 3, 3, /* NOT upto I, minupto I, exact I */
142     1, 1, 1, 3, /* NOT *+I, ++I, ?+I, upto+I */
143 nigel 77 /* Positive type repeats */
144     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
145     3, 3, 3, /* Type upto, minupto, exact */
146 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
147 nigel 77 /* Character class & ref repeats */
148     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
149     0, 0, /* CRRANGE, CRMINRANGE */
150     0, /* CLASS */
151     0, /* NCLASS */
152     0, /* XCLASS - variable length */
153     0, /* REF */
154 ph10 602 0, /* REFI */
155 nigel 77 0, /* RECURSE */
156     0, /* CALLOUT */
157     0, /* Alt */
158     0, /* Ket */
159     0, /* KetRmax */
160     0, /* KetRmin */
161 ph10 604 0, /* KetRpos */
162 ph10 637 0, /* Reverse */
163 nigel 77 0, /* Assert */
164     0, /* Assert not */
165     0, /* Assert behind */
166     0, /* Assert behind not */
167 ph10 723 0, 0, /* ONCE, ONCE_NC */
168     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
169 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
170 ph10 498 0, 0, /* CREF, NCREF */
171     0, 0, /* RREF, NRREF */
172 nigel 93 0, /* DEF */
173 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
174 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
175     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
176     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
177     0, 0 /* CLOSE, SKIPZERO */
178 nigel 77 };
179    
180 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
181 ph10 462 remember the fact that a character could have been inspected when the end of
182 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
183     two tables that follow must also be modified. */
184 ph10 462
/* poptable is indexed by opcode value, with one entry per opcode in the order
labelled on each row. A 1 marks an opcode that inspects a subject character; a
0 marks one that does not. internal_dfa_exec() consults it only when the end of
the subject has been reached (clen == 0), and sets could_continue to TRUE if
the current state's opcode has a non-zero entry. The switch in
internal_dfa_exec() contains a duplicate-case check that breaks the build
unless sizeof(poptable) == OP_TABLE_LENGTH. */
185 ph10 756 static const pcre_uint8 poptable[] = {
186 ph10 462 0, /* End */
187 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
188 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
189     1, 1, 1, /* Any, AllAny, Anybyte */
190 ph10 498 1, 1, /* \P, \p */
191 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
192 ph10 498 1, /* \X */
193 ph10 602 0, 0, 0, 0, 0, 0, /* \Z, \z, ^, ^M, $, $M */
194 ph10 462 1, /* Char */
195 ph10 602 1, /* Chari */
196 ph10 462 1, /* not */
197 ph10 602 1, /* noti */
198 ph10 462 /* Positive single-char repeats */
199     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
200     1, 1, 1, /* upto, minupto, exact */
201     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
202 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
203     1, 1, 1, /* upto I, minupto I, exact I */
204     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
205 ph10 462 /* Negative single-char repeats - only for chars < 256 */
206     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
207     1, 1, 1, /* NOT upto, minupto, exact */
208     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
209 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
210     1, 1, 1, /* NOT upto I, minupto I, exact I */
211     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
212 ph10 462 /* Positive type repeats */
213     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
214     1, 1, 1, /* Type upto, minupto, exact */
215     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
216     /* Character class & ref repeats */
217     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
218     1, 1, /* CRRANGE, CRMINRANGE */
219     1, /* CLASS */
220     1, /* NCLASS */
221     1, /* XCLASS - variable length */
222     0, /* REF */
223 ph10 602 0, /* REFI */
224 ph10 462 0, /* RECURSE */
225     0, /* CALLOUT */
226     0, /* Alt */
227     0, /* Ket */
228     0, /* KetRmax */
229     0, /* KetRmin */
230 ph10 604 0, /* KetRpos */
231 ph10 637 0, /* Reverse */
232 ph10 462 0, /* Assert */
233     0, /* Assert not */
234     0, /* Assert behind */
235     0, /* Assert behind not */
236 ph10 723 0, 0, /* ONCE, ONCE_NC */
237     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
238 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
239 ph10 498 0, 0, /* CREF, NCREF */
240     0, 0, /* RREF, NRREF */
241 ph10 462 0, /* DEF */
242 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
243 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
244     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
245     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
246     0, 0 /* CLOSE, SKIPZERO */
247 ph10 462 };
248    
249 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
250     and \w */
251    
/* toptable1 holds one ctype_* bit per entry. The row labels below assume the
table is indexed by opcode value in the same order as the start of coptable
(End, \A, \G, \K, \B, \b, \D, \d, ...) - TODO confirm at the point of use,
which is outside this part of the file. Each negated/plain pair (\D and \d, \S
and \s, \W and \w) carries the same ctype bit here. */
252 ph10 756 static const pcre_uint8 toptable1[] = {
253 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
254 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
255     ctype_space, ctype_space, /* \S, \s */
256     ctype_word, ctype_word, /* \W, \w */
257 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
258 nigel 77 };
259    
/* toptable2 is the companion to toptable1, with the same presumed opcode
indexing (TODO confirm at the point of use). For each pair, the first entry
(presumably the negated type \D, \S, \W) repeats the ctype bit and the second
entry is 0; the OP_ANY and OP_ALLANY entries are 1. NOTE(review): this looks
like a mask combined with the toptable1 result to flip the sense for negated
types - verify against the code that reads both tables. */
260 ph10 756 static const pcre_uint8 toptable2[] = {
261 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
262 nigel 77 ctype_digit, 0, /* \D, \d */
263     ctype_space, 0, /* \S, \s */
264     ctype_word, 0, /* \W, \w */
265 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
266 nigel 77 };
267    
268    
269     /* Structure for holding data about a particular state, which is in effect the
270     current data for an active path through the match tree. It must consist
271     entirely of ints because the working vector we are passed, and which we put
272     these structures in, is a vector of ints. */
273    
/* One active path through the match. Arrays of these are laid directly over
the caller's int workspace, so every member must remain an int. */
274     typedef struct stateblock {
275     int offset; /* Offset from start_code to the opcode; stored negated to mean "hold off until data characters have been skipped" */
276     int count; /* Count for repeats; compared together with offset when checking for duplicate states */
277     int data; /* Extra data for some states; for a negative offset, the number of characters still to skip */
278     } stateblock;
279    
/* Number of ints occupied by one stateblock. internal_dfa_exec() uses it to
convert the workspace size in ints into the number of stateblocks available to
each of its two state lists. */
280     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
281    
282    
283 ph10 475 #ifdef PCRE_DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Each byte that isprint()
accepts is written as itself; every other byte is written as a \xhh escape
(two lower-case hex digits).

Arguments:
  p           points to string; it is only read, never modified
  length      number of bytes to print; nothing is printed if it is <= 0
  f           where to print

Returns:     nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;   /* unsigned char value, so always a valid isprint() argument */
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
310     #endif
311    
312    
313    
314     /*************************************************
315     * Execute a Regular Expression - DFA engine *
316     *************************************************/
317    
318     /* This internal function applies a compiled pattern to a subject string,
319     starting at a given point, using a DFA engine. This function is called from the
320     external one, possibly multiple times if the pattern is not anchored. The
321     function calls itself recursively for some kinds of subpattern.
322    
323     Arguments:
324     md the match_data block with fixed information
325     this_start_code the opening bracket of this subexpression's code
326     current_subject where we currently are in the subject string
327     start_offset start offset in the subject string
328     offsets vector to contain the matching string offsets
329     offsetcount size of same
330     workspace vector of workspace
331     wscount size of same
332     rlevel function call recursion level
333    
334 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
335 ph10 341 = 0 => offsets overflowed; longest matches are present
336 nigel 77 -1 => failed to match
337     < -1 => some kind of unexpected problem
338    
339     The following macros are used for adding states to the two state vectors (one
340     for the current character, one for the following character). */
341    
/* State-list helpers for use inside internal_dfa_exec(). They rely on that
function's locals: active_count / new_count (entries used so far), wscount (the
capacity of each list, in stateblocks), next_active_state / next_new_state (the
next free slot) and rlevel (debug indentation only).

Each macro appends one state. If the list is already full, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. The counter is incremented either way,
exactly as before. ADD_ACTIVE and ADD_NEW leave the "data" member of the new
entry untouched; the _DATA variants set it.

The bodies are wrapped in do { ... } while (0) so that each call is a single
statement that must be terminated by its own semicolon. It is therefore safe as
the unbraced body of an if/else, and a forgotten semicolon is always a compile
error rather than being folded into the return expression. Arguments may be
evaluated more than once (again when DPRINTF is active), so they must be free
of side effects. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
383    
384     /* And now, here is the code */
385    
386     static int
387     internal_dfa_exec(
388     dfa_match_data *md,
389 ph10 756 const pcre_uchar *this_start_code,
390     const pcre_uchar *current_subject,
391 nigel 77 int start_offset,
392     int *offsets,
393     int offsetcount,
394     int *workspace,
395     int wscount,
396 ph10 642 int rlevel)
397 nigel 77 {
398     stateblock *active_states, *new_states, *temp_states;
399     stateblock *next_active_state, *next_new_state;
400    
401 ph10 756 const pcre_uint8 *ctypes, *lcc, *fcc;
402     const pcre_uchar *ptr;
403     const pcre_uchar *end_code, *first_op;
404 nigel 77
405 ph10 642 dfa_recursion_info new_recursive;
406    
407 nigel 77 int active_count, new_count, match_count;
408    
409     /* Some fields in the md block are frequently referenced, so we load them into
410     independent variables in the hope that this will perform better. */
411    
412 ph10 756 const pcre_uchar *start_subject = md->start_subject;
413     const pcre_uchar *end_subject = md->end_subject;
414     const pcre_uchar *start_code = md->start_code;
415 nigel 77
416 nigel 87 #ifdef SUPPORT_UTF8
417 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
418 nigel 93 #else
419     BOOL utf8 = FALSE;
420 nigel 87 #endif
421 nigel 77
422     rlevel++;
423     offsetcount &= (-2);
424    
425     wscount -= 2;
426     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
427     (2 * INTS_PER_STATEBLOCK);
428    
429     DPRINTF(("\n%.*s---------------------\n"
430 ph10 642 "%.*sCall to internal_dfa_exec f=%d\n",
431     rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
432 nigel 77
433     ctypes = md->tables + ctypes_offset;
434     lcc = md->tables + lcc_offset;
435     fcc = md->tables + fcc_offset;
436    
437     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
438    
439     active_states = (stateblock *)(workspace + 2);
440     next_new_state = new_states = active_states + wscount;
441     new_count = 0;
442    
443 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
444 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
445 zherczeg 769 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
446     ? IMM2_SIZE:0);
447 nigel 93
448 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
449     the alternative states onto the list, and find out where the end is. This
450     makes it possible to use this function recursively, when we want to stop at a
451     matching internal ket rather than at the end.
452    
453     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
454     a backward assertion. In that case, we have to find out the maximum amount to
455     move back, and set up each alternative appropriately. */
456    
457 nigel 93 if (*first_op == OP_REVERSE)
458 nigel 77 {
459     int max_back = 0;
460     int gone_back;
461    
462     end_code = this_start_code;
463     do
464     {
465     int back = GET(end_code, 2+LINK_SIZE);
466     if (back > max_back) max_back = back;
467     end_code += GET(end_code, 1);
468     }
469     while (*end_code == OP_ALT);
470    
471     /* If we can't go back the amount required for the longest lookbehind
472     pattern, go back as far as we can; some alternatives may still be viable. */
473    
474     #ifdef SUPPORT_UTF8
475     /* In character mode we have to step back character by character */
476    
477     if (utf8)
478     {
479     for (gone_back = 0; gone_back < max_back; gone_back++)
480     {
481     if (current_subject <= start_subject) break;
482     current_subject--;
483     while (current_subject > start_subject &&
484     (*current_subject & 0xc0) == 0x80)
485     current_subject--;
486     }
487     }
488     else
489     #endif
490    
491     /* In byte-mode we can do this quickly. */
492    
493     {
494     gone_back = (current_subject - max_back < start_subject)?
495 ph10 530 (int)(current_subject - start_subject) : max_back;
496 nigel 77 current_subject -= gone_back;
497     }
498 ph10 461
499 ph10 435 /* Save the earliest consulted character */
500 nigel 77
501 ph10 461 if (current_subject < md->start_used_ptr)
502     md->start_used_ptr = current_subject;
503    
504 nigel 77 /* Now we can process the individual branches. */
505    
506     end_code = this_start_code;
507     do
508     {
509     int back = GET(end_code, 2+LINK_SIZE);
510     if (back <= gone_back)
511     {
512 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
513 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
514     }
515     end_code += GET(end_code, 1);
516     }
517     while (*end_code == OP_ALT);
518     }
519    
520     /* This is the code for a "normal" subpattern (not a backward assertion). The
521     start of a whole pattern is always one of these. If we are at the top level,
522     we may be asked to restart matching from the same point that we reached for a
523     previous partial match. We still have to scan through the top-level branches to
524     find the end state. */
525    
526     else
527     {
528     end_code = this_start_code;
529    
530     /* Restarting */
531    
532     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
533     {
534     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
535     new_count = workspace[1];
536     if (!workspace[0])
537     memcpy(new_states, active_states, new_count * sizeof(stateblock));
538     }
539    
540     /* Not restarting */
541    
542     else
543     {
544 nigel 93 int length = 1 + LINK_SIZE +
545 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
546 zherczeg 769 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
547     ? IMM2_SIZE:0);
548 nigel 77 do
549     {
550 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
551 nigel 77 end_code += GET(end_code, 1);
552 nigel 93 length = 1 + LINK_SIZE;
553 nigel 77 }
554     while (*end_code == OP_ALT);
555     }
556     }
557    
558     workspace[0] = 0; /* Bit indicating which vector is current */
559    
560 zherczeg 769 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
561 nigel 77
562     /* Loop for scanning the subject */
563    
564     ptr = current_subject;
565     for (;;)
566     {
567     int i, j;
568 nigel 91 int clen, dlen;
569     unsigned int c, d;
570 ph10 428 int forced_fail = 0;
571 ph10 462 BOOL could_continue = FALSE;
572 nigel 77
573     /* Make the new state list into the active state list and empty the
574     new state list. */
575    
576     temp_states = active_states;
577     active_states = new_states;
578     new_states = temp_states;
579     active_count = new_count;
580     new_count = 0;
581    
582     workspace[0] ^= 1; /* Remember for the restarting feature */
583     workspace[1] = active_count;
584    
585 ph10 475 #ifdef PCRE_DEBUG
586 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
587 ph10 756 pchars((pcre_uchar *)ptr, strlen((char *)ptr), stdout);
588 nigel 77 printf("\"\n");
589    
590     printf("%.*sActive states: ", rlevel*2-2, SP);
591     for (i = 0; i < active_count; i++)
592     printf("%d/%d ", active_states[i].offset, active_states[i].count);
593     printf("\n");
594     #endif
595    
596     /* Set the pointers for adding new states */
597    
598     next_active_state = active_states + active_count;
599     next_new_state = new_states;
600    
601     /* Load the current character from the subject outside the loop, as many
602     different states may want to look at it, and we assume that at least one
603     will. */
604    
605     if (ptr < end_subject)
606     {
607 nigel 93 clen = 1; /* Number of bytes in the character */
608 nigel 77 #ifdef SUPPORT_UTF8
609     if (utf8) { GETCHARLEN(c, ptr, clen); } else
610     #endif /* SUPPORT_UTF8 */
611     c = *ptr;
612     }
613     else
614     {
615 nigel 93 clen = 0; /* This indicates the end of the subject */
616     c = NOTACHAR; /* This value should never actually be used */
617 nigel 77 }
618    
619     /* Scan up the active states and act on each one. The result of an action
620     may be to add more states to the currently active list (e.g. on hitting a
621     parenthesis) or it may be to put states on the new list, for considering
622     when we move the character pointer on. */
623    
624     for (i = 0; i < active_count; i++)
625     {
626     stateblock *current_state = active_states + i;
627 ph10 654 BOOL caseless = FALSE;
628 ph10 756 const pcre_uchar *code;
629 nigel 77 int state_offset = current_state->offset;
630 ph10 397 int count, codevalue, rrc;
631 nigel 77
632 ph10 475 #ifdef PCRE_DEBUG
633 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
634 nigel 93 if (clen == 0) printf("EOL\n");
635 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
636     else printf("0x%02x\n", c);
637     #endif
638    
639     /* A negative offset is a special case meaning "hold off going to this
640     (negated) state until the number of characters in the data field have
641     been skipped". */
642    
643     if (state_offset < 0)
644     {
645     if (current_state->data > 0)
646     {
647     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
648     ADD_NEW_DATA(state_offset, current_state->count,
649     current_state->data - 1);
650     continue;
651     }
652     else
653     {
654     current_state->offset = state_offset = -state_offset;
655     }
656     }
657    
658 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
659 ph10 439 See the note at the head of this module about the possibility of improving
660     performance here. */
661 nigel 77
662     for (j = 0; j < i; j++)
663     {
664     if (active_states[j].offset == state_offset &&
665     active_states[j].count == current_state->count)
666     {
667     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
668     goto NEXT_ACTIVE_STATE;
669     }
670     }
671    
672     /* The state offset is the offset to the opcode */
673    
674     code = start_code + state_offset;
675     codevalue = *code;
676    
677 ph10 463 /* If this opcode inspects a character, but we are at the end of the
678     subject, remember the fact for use when testing for a partial match. */
679    
680 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
681 ph10 463 could_continue = TRUE;
682 ph10 462
683 nigel 77 /* If this opcode is followed by an inline character, load it. It is
684     tempting to test for the presence of a subject character here, but that
685     is wrong, because sometimes zero repetitions of the subject are
686     permitted.
687    
688     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
689 ph10 178 argument that is not a data character - but is always one byte long. We
690     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
691     this case. To keep the other cases fast, convert these ones to new opcodes.
692     */
693 nigel 77
694     if (coptable[codevalue] > 0)
695     {
696     dlen = 1;
697     #ifdef SUPPORT_UTF8
698     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
699     #endif /* SUPPORT_UTF8 */
700     d = code[coptable[codevalue]];
701     if (codevalue >= OP_TYPESTAR)
702     {
703 nigel 93 switch(d)
704     {
705     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
706     case OP_NOTPROP:
707     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
708     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
709     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
710 ph10 178 case OP_NOT_HSPACE:
711 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
712 ph10 178 case OP_NOT_VSPACE:
713 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
714 nigel 93 default: break;
715     }
716 nigel 77 }
717     }
718     else
719     {
720     dlen = 0; /* Not strictly necessary, but compilers moan */
721 nigel 93 d = NOTACHAR; /* if these variables are not set. */
722 nigel 77 }
723    
724    
725     /* Now process the individual opcodes */
726    
727     switch (codevalue)
728     {
729 ph10 498 /* ========================================================================== */
730     /* These cases are never obeyed. This is a fudge that causes a compile-
731     time error if the vectors coptable or poptable, which are indexed by
732     opcode, are not the correct length. It seems to be the only way to do
733     such a check at compile time, as the sizeof() operator does not work
734     in the C preprocessor. */
735 ph10 507
736 ph10 498 case OP_TABLE_LENGTH:
737 ph10 507 case OP_TABLE_LENGTH +
738 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
739     (sizeof(poptable) == OP_TABLE_LENGTH)):
740 ph10 507 break;
741 nigel 77
742     /* ========================================================================== */
743     /* Reached a closing bracket. If not at the end of the pattern, carry
744 ph10 654 on with the next opcode. For repeating opcodes, also add the repeat
745     state. Note that KETRPOS will always be encountered at the end of the
746     subpattern, because the possessive subpattern repeats are always handled
747 ph10 604 using recursive calls. Thus, it never adds any new states.
748 ph10 654
749 ph10 604 At the end of the (sub)pattern, unless we have an empty string and
750 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
751 ph10 442 start of the subject, save the match data, shifting up all previous
752 nigel 77 matches so we always have the longest first. */
753    
754     case OP_KET:
755     case OP_KETRMIN:
756     case OP_KETRMAX:
757 ph10 654 case OP_KETRPOS:
758 nigel 77 if (code != end_code)
759     {
760     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
761     if (codevalue != OP_KET)
762     {
763     ADD_ACTIVE(state_offset - GET(code, 1), 0);
764     }
765     }
766 ph10 461 else
767 nigel 77 {
768 ph10 461 if (ptr > current_subject ||
769 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
770     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
771     current_subject > start_subject + md->start_offset)))
772 nigel 77 {
773 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
774 ph10 680 else if (match_count > 0 && ++match_count * 2 > offsetcount)
775 ph10 428 match_count = 0;
776     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
777     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
778     if (offsetcount >= 2)
779     {
780 ph10 530 offsets[0] = (int)(current_subject - start_subject);
781     offsets[1] = (int)(ptr - start_subject);
782 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
783     offsets[1] - offsets[0], current_subject));
784     }
785     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
786     {
787     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
788     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
789     match_count, rlevel*2-2, SP));
790     return match_count;
791     }
792 ph10 461 }
793 nigel 77 }
794     break;
795    
796     /* ========================================================================== */
797     /* These opcodes add to the current list of states without looking
798     at the current character. */
799    
800     /*-----------------------------------------------------------------*/
801     case OP_ALT:
802     do { code += GET(code, 1); } while (*code == OP_ALT);
803 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
804 nigel 77 break;
805    
806     /*-----------------------------------------------------------------*/
807     case OP_BRA:
808 nigel 93 case OP_SBRA:
809 nigel 77 do
810     {
811 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
812 nigel 77 code += GET(code, 1);
813     }
814     while (*code == OP_ALT);
815     break;
816    
817     /*-----------------------------------------------------------------*/
818 nigel 93 case OP_CBRA:
819     case OP_SCBRA:
820 zherczeg 769 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
821 nigel 93 code += GET(code, 1);
822     while (*code == OP_ALT)
823     {
824 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
825 nigel 93 code += GET(code, 1);
826     }
827     break;
828    
829     /*-----------------------------------------------------------------*/
830 nigel 77 case OP_BRAZERO:
831     case OP_BRAMINZERO:
832     ADD_ACTIVE(state_offset + 1, 0);
833     code += 1 + GET(code, 2);
834     while (*code == OP_ALT) code += GET(code, 1);
835 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
836 nigel 77 break;
837    
838     /*-----------------------------------------------------------------*/
839 ph10 335 case OP_SKIPZERO:
840     code += 1 + GET(code, 2);
841     while (*code == OP_ALT) code += GET(code, 1);
842 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
843 ph10 335 break;
844    
845     /*-----------------------------------------------------------------*/
846 nigel 77 case OP_CIRC:
847 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
848     { ADD_ACTIVE(state_offset + 1, 0); }
849     break;
850    
851     /*-----------------------------------------------------------------*/
852     case OP_CIRCM:
853 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
854 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
855 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
856     break;
857    
858     /*-----------------------------------------------------------------*/
859     case OP_EOD:
860 ph10 579 if (ptr >= end_subject)
861     {
862 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
863     could_continue = TRUE;
864     else { ADD_ACTIVE(state_offset + 1, 0); }
865     }
866 nigel 77 break;
867    
868     /*-----------------------------------------------------------------*/
869     case OP_SOD:
870     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
871     break;
872    
873     /*-----------------------------------------------------------------*/
874     case OP_SOM:
875     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
876     break;
877    
878    
879     /* ========================================================================== */
880     /* These opcodes inspect the next subject character, and sometimes
881     the previous one as well, but do not have an argument. The variable
882     clen contains the length of the current character and is zero if we are
883     at the end of the subject. */
884    
885     /*-----------------------------------------------------------------*/
886     case OP_ANY:
887 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
888 nigel 77 { ADD_NEW(state_offset + 1, 0); }
889     break;
890    
891     /*-----------------------------------------------------------------*/
892 ph10 341 case OP_ALLANY:
893     if (clen > 0)
894     { ADD_NEW(state_offset + 1, 0); }
895     break;
896    
897     /*-----------------------------------------------------------------*/
898 nigel 77 case OP_EODN:
899 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
900     could_continue = TRUE;
901     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
902 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
903     break;
904    
905     /*-----------------------------------------------------------------*/
906     case OP_DOLL:
907     if ((md->moptions & PCRE_NOTEOL) == 0)
908     {
909 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
910     could_continue = TRUE;
911     else if (clen == 0 ||
912 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
913 ph10 602 (ptr == end_subject - md->nllen)
914 nigel 91 ))
915 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
916     }
917 ph10 602 break;
918    
919     /*-----------------------------------------------------------------*/
920     case OP_DOLLM:
921     if ((md->moptions & PCRE_NOTEOL) == 0)
922     {
923     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
924     could_continue = TRUE;
925     else if (clen == 0 ||
926     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
927     { ADD_ACTIVE(state_offset + 1, 0); }
928     }
929     else if (IS_NEWLINE(ptr))
930 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
931     break;
932    
933     /*-----------------------------------------------------------------*/
934    
935     case OP_DIGIT:
936     case OP_WHITESPACE:
937     case OP_WORDCHAR:
938     if (clen > 0 && c < 256 &&
939     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
940     { ADD_NEW(state_offset + 1, 0); }
941     break;
942    
943     /*-----------------------------------------------------------------*/
944     case OP_NOT_DIGIT:
945     case OP_NOT_WHITESPACE:
946     case OP_NOT_WORDCHAR:
947     if (clen > 0 && (c >= 256 ||
948     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
949     { ADD_NEW(state_offset + 1, 0); }
950     break;
951    
952     /*-----------------------------------------------------------------*/
953     case OP_WORD_BOUNDARY:
954     case OP_NOT_WORD_BOUNDARY:
955     {
956     int left_word, right_word;
957    
958     if (ptr > start_subject)
959     {
960 ph10 756 const pcre_uchar *temp = ptr - 1;
961 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
962 nigel 77 #ifdef SUPPORT_UTF8
963     if (utf8) BACKCHAR(temp);
964     #endif
965     GETCHARTEST(d, temp);
966 ph10 535 #ifdef SUPPORT_UCP
967 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
968     {
969     if (d == '_') left_word = TRUE; else
970 ph10 535 {
971 ph10 518 int cat = UCD_CATEGORY(d);
972     left_word = (cat == ucp_L || cat == ucp_N);
973 ph10 535 }
974     }
975     else
976     #endif
977 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
978     }
979 ph10 518 else left_word = FALSE;
980 nigel 77
981 ph10 461 if (clen > 0)
982 ph10 535 {
983     #ifdef SUPPORT_UCP
984 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
985     {
986     if (c == '_') right_word = TRUE; else
987 ph10 535 {
988 ph10 518 int cat = UCD_CATEGORY(c);
989     right_word = (cat == ucp_L || cat == ucp_N);
990 ph10 535 }
991     }
992     else
993     #endif
994 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
995 ph10 535 }
996 ph10 518 else right_word = FALSE;
997 nigel 77
998     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
999     { ADD_ACTIVE(state_offset + 1, 0); }
1000     }
1001     break;
1002    
1003    
1004     /*-----------------------------------------------------------------*/
1005     /* Check the next character by Unicode property. We will get here only
1006     if the support is in the binary; otherwise a compile-time error occurs.
1007     */
1008    
1009 ph10 151 #ifdef SUPPORT_UCP
1010 nigel 77 case OP_PROP:
1011     case OP_NOTPROP:
1012     if (clen > 0)
1013     {
1014 nigel 87 BOOL OK;
1015 ph10 349 const ucd_record * prop = GET_UCD(c);
1016 nigel 87 switch(code[1])
1017 nigel 77 {
1018 nigel 87 case PT_ANY:
1019     OK = TRUE;
1020     break;
1021    
1022     case PT_LAMP:
1023 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1024 ph10 517 prop->chartype == ucp_Lt;
1025 nigel 87 break;
1026    
1027     case PT_GC:
1028 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1029 nigel 87 break;
1030    
1031     case PT_PC:
1032 ph10 349 OK = prop->chartype == code[2];
1033 nigel 87 break;
1034    
1035     case PT_SC:
1036 ph10 349 OK = prop->script == code[2];
1037 nigel 87 break;
1038 ph10 535
1039 ph10 517 /* These are specials for combination cases. */
1040 ph10 535
1041 ph10 517 case PT_ALNUM:
1042 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1043     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1044 ph10 535 break;
1045    
1046 ph10 517 case PT_SPACE: /* Perl space */
1047 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1048 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1049 ph10 535 break;
1050    
1051 ph10 517 case PT_PXSPACE: /* POSIX space */
1052 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1053 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1054     c == CHAR_FF || c == CHAR_CR;
1055 ph10 535 break;
1056    
1057 ph10 517 case PT_WORD:
1058 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1059     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1060 ph10 517 c == CHAR_UNDERSCORE;
1061 ph10 535 break;
1062 nigel 87
1063     /* Should never occur, but keep compilers from grumbling. */
1064    
1065     default:
1066     OK = codevalue != OP_PROP;
1067     break;
1068 nigel 77 }
1069 nigel 87
1070     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1071 nigel 77 }
1072     break;
1073     #endif
1074    
1075    
1076    
1077     /* ========================================================================== */
1078     /* These opcodes likewise inspect the subject character, but have an
1079     argument that is not a data character. It is one of these opcodes:
1080 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1081     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1082 nigel 77
1083     case OP_TYPEPLUS:
1084     case OP_TYPEMINPLUS:
1085 nigel 93 case OP_TYPEPOSPLUS:
1086 nigel 77 count = current_state->count; /* Already matched */
1087     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1088     if (clen > 0)
1089     {
1090     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1091     (c < 256 &&
1092 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1093 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1094     {
1095 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1096     {
1097     active_count--; /* Remove non-match possibility */
1098     next_active_state--;
1099     }
1100 nigel 77 count++;
1101     ADD_NEW(state_offset, count);
1102     }
1103     }
1104     break;
1105    
1106     /*-----------------------------------------------------------------*/
1107     case OP_TYPEQUERY:
1108     case OP_TYPEMINQUERY:
1109 nigel 93 case OP_TYPEPOSQUERY:
1110 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1111     if (clen > 0)
1112     {
1113     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1114     (c < 256 &&
1115 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1116 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1117     {
1118 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1119     {
1120     active_count--; /* Remove non-match possibility */
1121     next_active_state--;
1122     }
1123 nigel 77 ADD_NEW(state_offset + 2, 0);
1124     }
1125     }
1126     break;
1127    
1128     /*-----------------------------------------------------------------*/
1129     case OP_TYPESTAR:
1130     case OP_TYPEMINSTAR:
1131 nigel 93 case OP_TYPEPOSSTAR:
1132 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1133     if (clen > 0)
1134     {
1135     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1136     (c < 256 &&
1137 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1138 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1139     {
1140 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1141     {
1142     active_count--; /* Remove non-match possibility */
1143     next_active_state--;
1144     }
1145 nigel 77 ADD_NEW(state_offset, 0);
1146     }
1147     }
1148     break;
1149    
1150     /*-----------------------------------------------------------------*/
1151     case OP_TYPEEXACT:
1152 nigel 93 count = current_state->count; /* Number already matched */
1153     if (clen > 0)
1154     {
1155     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1156     (c < 256 &&
1157 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1158 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1159     {
1160     if (++count >= GET2(code, 1))
1161 zherczeg 769 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1162 nigel 93 else
1163     { ADD_NEW(state_offset, count); }
1164     }
1165     }
1166     break;
1167    
1168     /*-----------------------------------------------------------------*/
1169 nigel 77 case OP_TYPEUPTO:
1170     case OP_TYPEMINUPTO:
1171 nigel 93 case OP_TYPEPOSUPTO:
1172 zherczeg 769 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1173 nigel 77 count = current_state->count; /* Number already matched */
1174     if (clen > 0)
1175     {
1176     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1177     (c < 256 &&
1178 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1179 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1180     {
1181 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1182     {
1183     active_count--; /* Remove non-match possibility */
1184     next_active_state--;
1185     }
1186 nigel 77 if (++count >= GET2(code, 1))
1187 zherczeg 769 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1188 nigel 77 else
1189     { ADD_NEW(state_offset, count); }
1190     }
1191     }
1192     break;
1193    
1194     /* ========================================================================== */
1195     /* These are virtual opcodes that are used when something like
1196 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical or
1197     horizontal whitespace type (tested against OP_VSPACE / OP_HSPACE) as its
1198     argument. It keeps the code above fast for the other cases. The argument is in the d variable. */
1199 nigel 77
1200 ph10 151 #ifdef SUPPORT_UCP
1201 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1202     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1203 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1204 nigel 77 count = current_state->count; /* Already matched */
1205 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1206 nigel 77 if (clen > 0)
1207     {
1208 nigel 87 BOOL OK;
1209 ph10 349 const ucd_record * prop = GET_UCD(c);
1210 nigel 87 switch(code[2])
1211     {
1212     case PT_ANY:
1213     OK = TRUE;
1214     break;
1215    
1216     case PT_LAMP:
1217 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1218 ph10 517 prop->chartype == ucp_Lt;
1219 nigel 87 break;
1220    
1221     case PT_GC:
1222 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1223 nigel 87 break;
1224    
1225     case PT_PC:
1226 ph10 349 OK = prop->chartype == code[3];
1227 nigel 87 break;
1228    
1229     case PT_SC:
1230 ph10 349 OK = prop->script == code[3];
1231 nigel 87 break;
1232    
1233 ph10 517 /* These are specials for combination cases. */
1234 ph10 535
1235 ph10 517 case PT_ALNUM:
1236 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1237     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1238 ph10 535 break;
1239    
1240 ph10 517 case PT_SPACE: /* Perl space */
1241 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1242 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1243 ph10 535 break;
1244    
1245 ph10 517 case PT_PXSPACE: /* POSIX space */
1246 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1247 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1248     c == CHAR_FF || c == CHAR_CR;
1249 ph10 535 break;
1250    
1251 ph10 517 case PT_WORD:
1252 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1253     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1254 ph10 517 c == CHAR_UNDERSCORE;
1255 ph10 535 break;
1256 ph10 517
1257 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1258    
1259     default:
1260     OK = codevalue != OP_PROP;
1261     break;
1262     }
1263    
1264 nigel 93 if (OK == (d == OP_PROP))
1265     {
1266     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1267     {
1268     active_count--; /* Remove non-match possibility */
1269     next_active_state--;
1270     }
1271     count++;
1272     ADD_NEW(state_offset, count);
1273     }
1274 nigel 77 }
1275     break;
1276    
1277     /*-----------------------------------------------------------------*/
1278     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1279     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1280 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1281 nigel 77 count = current_state->count; /* Already matched */
1282     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1283 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1284 nigel 77 {
1285 ph10 756 const pcre_uchar *nptr = ptr + clen;
1286 nigel 77 int ncount = 0;
1287 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1288     {
1289     active_count--; /* Remove non-match possibility */
1290     next_active_state--;
1291     }
1292 nigel 77 while (nptr < end_subject)
1293     {
1294     int nd;
1295     int ndlen = 1;
1296     GETCHARLEN(nd, nptr, ndlen);
1297 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1298 nigel 77 ncount++;
1299     nptr += ndlen;
1300     }
1301     count++;
1302     ADD_NEW_DATA(-state_offset, count, ncount);
1303     }
1304     break;
1305 ph10 151 #endif
1306 nigel 77
1307     /*-----------------------------------------------------------------*/
1308 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1309     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1310     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1311     count = current_state->count; /* Already matched */
1312     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1313     if (clen > 0)
1314     {
1315     int ncount = 0;
1316     switch (c)
1317     {
1318     case 0x000b:
1319     case 0x000c:
1320     case 0x0085:
1321     case 0x2028:
1322     case 0x2029:
1323 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1324     goto ANYNL01;
1325    
1326     case 0x000d:
1327     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1328     /* Fall through */
1329    
1330     ANYNL01:
1331     case 0x000a:
1332 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1333     {
1334     active_count--; /* Remove non-match possibility */
1335     next_active_state--;
1336     }
1337     count++;
1338     ADD_NEW_DATA(-state_offset, count, ncount);
1339     break;
1340 ph10 231
1341 nigel 93 default:
1342     break;
1343     }
1344     }
1345     break;
1346    
1347     /*-----------------------------------------------------------------*/
1348 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1349     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1350     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1351     count = current_state->count; /* Already matched */
1352     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1353     if (clen > 0)
1354     {
1355 ph10 182 BOOL OK;
1356 ph10 178 switch (c)
1357     {
1358     case 0x000a:
1359     case 0x000b:
1360     case 0x000c:
1361     case 0x000d:
1362     case 0x0085:
1363     case 0x2028:
1364     case 0x2029:
1365     OK = TRUE;
1366 ph10 182 break;
1367 ph10 178
1368     default:
1369     OK = FALSE;
1370 ph10 182 break;
1371 ph10 178 }
1372    
1373     if (OK == (d == OP_VSPACE))
1374 ph10 182 {
1375 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1376     {
1377     active_count--; /* Remove non-match possibility */
1378     next_active_state--;
1379     }
1380     count++;
1381     ADD_NEW_DATA(-state_offset, count, 0);
1382     }
1383     }
1384     break;
1385    
1386     /*-----------------------------------------------------------------*/
1387     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1388     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1389     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1390     count = current_state->count; /* Already matched */
1391     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1392     if (clen > 0)
1393     {
1394 ph10 182 BOOL OK;
1395 ph10 178 switch (c)
1396     {
1397     case 0x09: /* HT */
1398     case 0x20: /* SPACE */
1399     case 0xa0: /* NBSP */
1400     case 0x1680: /* OGHAM SPACE MARK */
1401     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1402     case 0x2000: /* EN QUAD */
1403     case 0x2001: /* EM QUAD */
1404     case 0x2002: /* EN SPACE */
1405     case 0x2003: /* EM SPACE */
1406     case 0x2004: /* THREE-PER-EM SPACE */
1407     case 0x2005: /* FOUR-PER-EM SPACE */
1408     case 0x2006: /* SIX-PER-EM SPACE */
1409     case 0x2007: /* FIGURE SPACE */
1410     case 0x2008: /* PUNCTUATION SPACE */
1411     case 0x2009: /* THIN SPACE */
1412     case 0x200A: /* HAIR SPACE */
1413     case 0x202f: /* NARROW NO-BREAK SPACE */
1414     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1415     case 0x3000: /* IDEOGRAPHIC SPACE */
1416     OK = TRUE;
1417     break;
1418 ph10 182
1419 ph10 178 default:
1420     OK = FALSE;
1421     break;
1422     }
1423 ph10 182
1424 ph10 178 if (OK == (d == OP_HSPACE))
1425 ph10 182 {
1426 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1427     {
1428     active_count--; /* Remove non-match possibility */
1429     next_active_state--;
1430     }
1431     count++;
1432     ADD_NEW_DATA(-state_offset, count, 0);
1433     }
1434     }
1435     break;
1436    
1437     /*-----------------------------------------------------------------*/
1438 ph10 151 #ifdef SUPPORT_UCP
1439 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1440     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1441 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1442 nigel 87 count = 4;
1443 nigel 77 goto QS1;
1444    
1445     case OP_PROP_EXTRA + OP_TYPESTAR:
1446     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1447 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1448 nigel 77 count = 0;
1449    
1450     QS1:
1451    
1452 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1453 nigel 77 if (clen > 0)
1454     {
1455 nigel 87 BOOL OK;
1456 ph10 349 const ucd_record * prop = GET_UCD(c);
1457 nigel 87 switch(code[2])
1458     {
1459     case PT_ANY:
1460     OK = TRUE;
1461     break;
1462    
1463     case PT_LAMP:
1464 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1465 ph10 517 prop->chartype == ucp_Lt;
1466 nigel 87 break;
1467    
1468     case PT_GC:
1469 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1470 nigel 87 break;
1471    
1472     case PT_PC:
1473 ph10 349 OK = prop->chartype == code[3];
1474 nigel 87 break;
1475    
1476     case PT_SC:
1477 ph10 349 OK = prop->script == code[3];
1478 nigel 87 break;
1479 ph10 535
1480 ph10 517 /* These are specials for combination cases. */
1481 ph10 535
1482 ph10 517 case PT_ALNUM:
1483 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1484     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1485 ph10 535 break;
1486    
1487 ph10 517 case PT_SPACE: /* Perl space */
1488 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1489 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1490 ph10 535 break;
1491    
1492 ph10 517 case PT_PXSPACE: /* POSIX space */
1493 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1494 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1495     c == CHAR_FF || c == CHAR_CR;
1496 ph10 535 break;
1497    
1498 ph10 517 case PT_WORD:
1499 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1500     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1501 ph10 517 c == CHAR_UNDERSCORE;
1502 ph10 535 break;
1503 nigel 87
1504     /* Should never occur, but keep compilers from grumbling. */
1505    
1506     default:
1507     OK = codevalue != OP_PROP;
1508     break;
1509     }
1510    
1511 nigel 93 if (OK == (d == OP_PROP))
1512     {
1513     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1514     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1515     {
1516     active_count--; /* Remove non-match possibility */
1517     next_active_state--;
1518     }
1519     ADD_NEW(state_offset + count, 0);
1520     }
1521 nigel 77 }
1522     break;
1523    
1524     /*-----------------------------------------------------------------*/
1525     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1526     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1527 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1528 nigel 77 count = 2;
1529     goto QS2;
1530    
1531     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1532     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1533 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1534 nigel 77 count = 0;
1535    
1536     QS2:
1537    
1538     ADD_ACTIVE(state_offset + 2, 0);
1539 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1540 nigel 77 {
1541 ph10 756 const pcre_uchar *nptr = ptr + clen;
1542 nigel 77 int ncount = 0;
1543 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1544     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1545     {
1546     active_count--; /* Remove non-match possibility */
1547     next_active_state--;
1548     }
1549 nigel 77 while (nptr < end_subject)
1550     {
1551     int nd;
1552     int ndlen = 1;
1553     GETCHARLEN(nd, nptr, ndlen);
1554 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1555 nigel 77 ncount++;
1556     nptr += ndlen;
1557     }
1558     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1559     }
1560     break;
1561 ph10 151 #endif
1562 nigel 77
1563     /*-----------------------------------------------------------------*/
1564 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1565     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1566     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1567     count = 2;
1568     goto QS3;
1569    
1570     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1571     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1572     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1573     count = 0;
1574    
1575     QS3:
1576     ADD_ACTIVE(state_offset + 2, 0);
1577     if (clen > 0)
1578     {
1579     int ncount = 0;
1580     switch (c)
1581     {
1582     case 0x000b:
1583     case 0x000c:
1584     case 0x0085:
1585     case 0x2028:
1586     case 0x2029:
1587 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1588     goto ANYNL02;
1589    
1590     case 0x000d:
1591     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1592     /* Fall through */
1593    
1594     ANYNL02:
1595     case 0x000a:
1596 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1597     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1598     {
1599     active_count--; /* Remove non-match possibility */
1600     next_active_state--;
1601     }
1602     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1603     break;
1604 ph10 231
1605 nigel 93 default:
1606     break;
1607     }
1608     }
1609     break;
1610    
1611     /*-----------------------------------------------------------------*/
1612 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1613     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1614     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1615     count = 2;
1616     goto QS4;
1617    
1618     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1619     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1620     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1621     count = 0;
1622    
1623     QS4:
1624     ADD_ACTIVE(state_offset + 2, 0);
1625     if (clen > 0)
1626     {
1627 ph10 182 BOOL OK;
1628 ph10 178 switch (c)
1629     {
1630     case 0x000a:
1631     case 0x000b:
1632     case 0x000c:
1633     case 0x000d:
1634     case 0x0085:
1635     case 0x2028:
1636     case 0x2029:
1637     OK = TRUE;
1638     break;
1639 ph10 182
1640 ph10 178 default:
1641     OK = FALSE;
1642     break;
1643     }
1644     if (OK == (d == OP_VSPACE))
1645 ph10 182 {
1646 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1647     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1648     {
1649     active_count--; /* Remove non-match possibility */
1650     next_active_state--;
1651     }
1652     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1653     }
1654     }
1655     break;
1656    
1657     /*-----------------------------------------------------------------*/
1658     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1659     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1660     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1661     count = 2;
1662     goto QS5;
1663    
1664     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1665     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1666     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1667     count = 0;
1668    
1669     QS5:
1670     ADD_ACTIVE(state_offset + 2, 0);
1671     if (clen > 0)
1672     {
1673 ph10 182 BOOL OK;
1674 ph10 178 switch (c)
1675     {
1676     case 0x09: /* HT */
1677     case 0x20: /* SPACE */
1678     case 0xa0: /* NBSP */
1679     case 0x1680: /* OGHAM SPACE MARK */
1680     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1681     case 0x2000: /* EN QUAD */
1682     case 0x2001: /* EM QUAD */
1683     case 0x2002: /* EN SPACE */
1684     case 0x2003: /* EM SPACE */
1685     case 0x2004: /* THREE-PER-EM SPACE */
1686     case 0x2005: /* FOUR-PER-EM SPACE */
1687     case 0x2006: /* SIX-PER-EM SPACE */
1688     case 0x2007: /* FIGURE SPACE */
1689     case 0x2008: /* PUNCTUATION SPACE */
1690     case 0x2009: /* THIN SPACE */
1691     case 0x200A: /* HAIR SPACE */
1692     case 0x202f: /* NARROW NO-BREAK SPACE */
1693     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1694     case 0x3000: /* IDEOGRAPHIC SPACE */
1695     OK = TRUE;
1696     break;
1697 ph10 182
1698 ph10 178 default:
1699     OK = FALSE;
1700     break;
1701     }
1702 ph10 182
1703 ph10 178 if (OK == (d == OP_HSPACE))
1704 ph10 182 {
1705 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1706     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1707     {
1708     active_count--; /* Remove non-match possibility */
1709     next_active_state--;
1710     }
1711     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1712     }
1713     }
1714     break;
1715    
1716     /*-----------------------------------------------------------------*/
1717 ph10 151 #ifdef SUPPORT_UCP
1718 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1719     case OP_PROP_EXTRA + OP_TYPEUPTO:
1720     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1721 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1722 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1723 zherczeg 769 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1724 nigel 77 count = current_state->count; /* Number already matched */
1725     if (clen > 0)
1726     {
1727 nigel 87 BOOL OK;
1728 ph10 349 const ucd_record * prop = GET_UCD(c);
1729 zherczeg 769 switch(code[1 + IMM2_SIZE + 1])
1730 nigel 77 {
1731 nigel 87 case PT_ANY:
1732     OK = TRUE;
1733     break;
1734    
1735     case PT_LAMP:
1736 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1737 ph10 517 prop->chartype == ucp_Lt;
1738 nigel 87 break;
1739    
1740     case PT_GC:
1741 zherczeg 769 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1742 nigel 87 break;
1743    
1744     case PT_PC:
1745 zherczeg 769 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1746 nigel 87 break;
1747    
1748     case PT_SC:
1749 zherczeg 769 OK = prop->script == code[1 + IMM2_SIZE + 2];
1750 nigel 87 break;
1751 ph10 535
1752 ph10 517 /* These are specials for combination cases. */
1753 ph10 535
1754 ph10 517 case PT_ALNUM:
1755 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1756     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1757 ph10 535 break;
1758    
1759 ph10 517 case PT_SPACE: /* Perl space */
1760 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1761 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR;
1762 ph10 535 break;
1763    
1764 ph10 517 case PT_PXSPACE: /* POSIX space */
1765 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1766 ph10 517 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
1767     c == CHAR_FF || c == CHAR_CR;
1768 ph10 535 break;
1769    
1770 ph10 517 case PT_WORD:
1771 zherczeg 764 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1772     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1773 ph10 517 c == CHAR_UNDERSCORE;
1774 ph10 535 break;
1775 nigel 87
1776     /* Should never occur, but keep compilers from grumbling. */
1777    
1778     default:
1779     OK = codevalue != OP_PROP;
1780     break;
1781     }
1782    
1783     if (OK == (d == OP_PROP))
1784     {
1785 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1786     {
1787     active_count--; /* Remove non-match possibility */
1788     next_active_state--;
1789     }
1790 nigel 77 if (++count >= GET2(code, 1))
1791 zherczeg 769 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1792 nigel 77 else
1793     { ADD_NEW(state_offset, count); }
1794     }
1795     }
1796     break;
1797    
1798     /*-----------------------------------------------------------------*/
1799     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1800     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1801     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1802 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1803 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1804 zherczeg 769 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1805 nigel 77 count = current_state->count; /* Number already matched */
1806 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1807 nigel 77 {
1808 ph10 756 const pcre_uchar *nptr = ptr + clen;
1809 nigel 77 int ncount = 0;
1810 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1811     {
1812     active_count--; /* Remove non-match possibility */
1813     next_active_state--;
1814     }
1815 nigel 77 while (nptr < end_subject)
1816     {
1817     int nd;
1818     int ndlen = 1;
1819     GETCHARLEN(nd, nptr, ndlen);
1820 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1821 nigel 77 ncount++;
1822     nptr += ndlen;
1823     }
1824     if (++count >= GET2(code, 1))
1825 zherczeg 769 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1826 nigel 77 else
1827     { ADD_NEW_DATA(-state_offset, count, ncount); }
1828     }
1829     break;
1830 ph10 151 #endif
1831 nigel 77
1832 nigel 93 /*-----------------------------------------------------------------*/
1833     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1834     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1835     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1836     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1837     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1838 zherczeg 769 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1839 nigel 93 count = current_state->count; /* Number already matched */
1840     if (clen > 0)
1841     {
1842     int ncount = 0;
1843     switch (c)
1844     {
1845     case 0x000b:
1846     case 0x000c:
1847     case 0x0085:
1848     case 0x2028:
1849     case 0x2029:
1850 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1851     goto ANYNL03;
1852    
1853     case 0x000d:
1854     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1855     /* Fall through */
1856    
1857     ANYNL03:
1858     case 0x000a:
1859 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1860     {
1861     active_count--; /* Remove non-match possibility */
1862     next_active_state--;
1863     }
1864     if (++count >= GET2(code, 1))
1865 zherczeg 769 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1866 nigel 93 else
1867     { ADD_NEW_DATA(-state_offset, count, ncount); }
1868     break;
1869 ph10 231
1870 nigel 93 default:
1871     break;
1872     }
1873     }
1874     break;
1875    
1876 ph10 178 /*-----------------------------------------------------------------*/
1877     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1878     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1879     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1880     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1881     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1882 zherczeg 769 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1883 ph10 178 count = current_state->count; /* Number already matched */
1884     if (clen > 0)
1885     {
1886 ph10 182 BOOL OK;
1887 ph10 178 switch (c)
1888     {
1889     case 0x000a:
1890     case 0x000b:
1891     case 0x000c:
1892     case 0x000d:
1893     case 0x0085:
1894     case 0x2028:
1895     case 0x2029:
1896     OK = TRUE;
1897     break;
1898 ph10 182
1899 ph10 178 default:
1900     OK = FALSE;
1901     }
1902 ph10 182
1903 ph10 178 if (OK == (d == OP_VSPACE))
1904 ph10 182 {
1905 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1906     {
1907     active_count--; /* Remove non-match possibility */
1908     next_active_state--;
1909     }
1910     if (++count >= GET2(code, 1))
1911 zherczeg 769 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1912 ph10 178 else
1913     { ADD_NEW_DATA(-state_offset, count, 0); }
1914     }
1915     }
1916     break;
1917    
1918     /*-----------------------------------------------------------------*/
1919     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1920     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1921     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1922     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1923     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1924 zherczeg 769 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1925 ph10 178 count = current_state->count; /* Number already matched */
1926     if (clen > 0)
1927     {
1928 ph10 182 BOOL OK;
1929 ph10 178 switch (c)
1930     {
1931     case 0x09: /* HT */
1932     case 0x20: /* SPACE */
1933     case 0xa0: /* NBSP */
1934     case 0x1680: /* OGHAM SPACE MARK */
1935     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1936     case 0x2000: /* EN QUAD */
1937     case 0x2001: /* EM QUAD */
1938     case 0x2002: /* EN SPACE */
1939     case 0x2003: /* EM SPACE */
1940     case 0x2004: /* THREE-PER-EM SPACE */
1941     case 0x2005: /* FOUR-PER-EM SPACE */
1942     case 0x2006: /* SIX-PER-EM SPACE */
1943     case 0x2007: /* FIGURE SPACE */
1944     case 0x2008: /* PUNCTUATION SPACE */
1945     case 0x2009: /* THIN SPACE */
1946     case 0x200A: /* HAIR SPACE */
1947     case 0x202f: /* NARROW NO-BREAK SPACE */
1948     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1949     case 0x3000: /* IDEOGRAPHIC SPACE */
1950     OK = TRUE;
1951     break;
1952 ph10 182
1953 ph10 178 default:
1954     OK = FALSE;
1955     break;
1956     }
1957 ph10 182
1958 ph10 178 if (OK == (d == OP_HSPACE))
1959 ph10 182 {
1960 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1961     {
1962     active_count--; /* Remove non-match possibility */
1963     next_active_state--;
1964     }
1965     if (++count >= GET2(code, 1))
1966 zherczeg 769 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
1967 ph10 178 else
1968     { ADD_NEW_DATA(-state_offset, count, 0); }
1969     }
1970     }
1971     break;
1972    
1973 nigel 77 /* ========================================================================== */
1974     /* These opcodes are followed by a character that is usually compared
1975     to the current subject character; it is loaded into d. We still get
1976     here even if there is no subject character, because in some cases zero
1977     repetitions are permitted. */
1978    
1979     /*-----------------------------------------------------------------*/
1980     case OP_CHAR:
1981     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1982     break;
1983    
1984     /*-----------------------------------------------------------------*/
1985 ph10 602 case OP_CHARI:
1986 nigel 77 if (clen == 0) break;
1987    
1988     #ifdef SUPPORT_UTF8
1989     if (utf8)
1990     {
1991     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1992     {
1993 nigel 93 unsigned int othercase;
1994 nigel 77 if (c < 128) othercase = fcc[c]; else
1995    
1996     /* If we have Unicode property support, we can use it to test the
1997 nigel 87 other case of the character. */
1998 nigel 77
1999     #ifdef SUPPORT_UCP
2000 ph10 349 othercase = UCD_OTHERCASE(c);
2001 nigel 87 #else
2002 nigel 93 othercase = NOTACHAR;
2003 nigel 77 #endif
2004    
2005     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2006     }
2007     }
2008     else
2009     #endif /* SUPPORT_UTF8 */
2010    
2011     /* Non-UTF-8 mode */
2012     {
2013     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
2014     }
2015     break;
2016    
2017    
2018     #ifdef SUPPORT_UCP
2019     /*-----------------------------------------------------------------*/
2020     /* This is a tricky one because it can match more than one character.
2021     Find out how many characters to skip, and then set up a negative state
2022     to wait for them to pass before continuing. */
2023    
2024     case OP_EXTUNI:
2025 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
2026 nigel 77 {
2027 ph10 756 const pcre_uchar *nptr = ptr + clen;
2028 nigel 77 int ncount = 0;
2029     while (nptr < end_subject)
2030     {
2031     int nclen = 1;
2032     GETCHARLEN(c, nptr, nclen);
2033 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
2034 nigel 77 ncount++;
2035     nptr += nclen;
2036     }
2037     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2038     }
2039     break;
2040     #endif
2041    
2042     /*-----------------------------------------------------------------*/
2043 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
2044     character (when CR is followed by LF). In this case, set up a negative
2045     state to wait for one character to pass before continuing. */
2046    
2047     case OP_ANYNL:
2048     if (clen > 0) switch(c)
2049     {
2050     case 0x000b:
2051     case 0x000c:
2052     case 0x0085:
2053     case 0x2028:
2054     case 0x2029:
2055 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2056    
2057     case 0x000a:
2058 nigel 93 ADD_NEW(state_offset + 1, 0);
2059     break;
2060 ph10 231
2061 nigel 93 case 0x000d:
2062     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
2063     {
2064     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2065     }
2066     else
2067     {
2068     ADD_NEW(state_offset + 1, 0);
2069     }
2070     break;
2071     }
2072     break;
2073    
2074     /*-----------------------------------------------------------------*/
2075 ph10 178 case OP_NOT_VSPACE:
2076     if (clen > 0) switch(c)
2077     {
2078     case 0x000a:
2079     case 0x000b:
2080     case 0x000c:
2081     case 0x000d:
2082     case 0x0085:
2083     case 0x2028:
2084     case 0x2029:
2085     break;
2086 ph10 182
2087     default:
2088 ph10 178 ADD_NEW(state_offset + 1, 0);
2089     break;
2090     }
2091     break;
2092    
2093     /*-----------------------------------------------------------------*/
2094     case OP_VSPACE:
2095     if (clen > 0) switch(c)
2096     {
2097     case 0x000a:
2098     case 0x000b:
2099     case 0x000c:
2100     case 0x000d:
2101     case 0x0085:
2102     case 0x2028:
2103     case 0x2029:
2104     ADD_NEW(state_offset + 1, 0);
2105     break;
2106 ph10 182
2107 ph10 178 default: break;
2108     }
2109     break;
2110    
2111     /*-----------------------------------------------------------------*/
2112     case OP_NOT_HSPACE:
2113     if (clen > 0) switch(c)
2114     {
2115     case 0x09: /* HT */
2116     case 0x20: /* SPACE */
2117     case 0xa0: /* NBSP */
2118     case 0x1680: /* OGHAM SPACE MARK */
2119     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2120     case 0x2000: /* EN QUAD */
2121     case 0x2001: /* EM QUAD */
2122     case 0x2002: /* EN SPACE */
2123     case 0x2003: /* EM SPACE */
2124     case 0x2004: /* THREE-PER-EM SPACE */
2125     case 0x2005: /* FOUR-PER-EM SPACE */
2126     case 0x2006: /* SIX-PER-EM SPACE */
2127     case 0x2007: /* FIGURE SPACE */
2128     case 0x2008: /* PUNCTUATION SPACE */
2129     case 0x2009: /* THIN SPACE */
2130     case 0x200A: /* HAIR SPACE */
2131     case 0x202f: /* NARROW NO-BREAK SPACE */
2132     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2133     case 0x3000: /* IDEOGRAPHIC SPACE */
2134     break;
2135 ph10 182
2136     default:
2137 ph10 178 ADD_NEW(state_offset + 1, 0);
2138     break;
2139     }
2140     break;
2141    
2142     /*-----------------------------------------------------------------*/
2143     case OP_HSPACE:
2144     if (clen > 0) switch(c)
2145     {
2146     case 0x09: /* HT */
2147     case 0x20: /* SPACE */
2148     case 0xa0: /* NBSP */
2149     case 0x1680: /* OGHAM SPACE MARK */
2150     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2151     case 0x2000: /* EN QUAD */
2152     case 0x2001: /* EM QUAD */
2153     case 0x2002: /* EN SPACE */
2154     case 0x2003: /* EM SPACE */
2155     case 0x2004: /* THREE-PER-EM SPACE */
2156     case 0x2005: /* FOUR-PER-EM SPACE */
2157     case 0x2006: /* SIX-PER-EM SPACE */
2158     case 0x2007: /* FIGURE SPACE */
2159     case 0x2008: /* PUNCTUATION SPACE */
2160     case 0x2009: /* THIN SPACE */
2161     case 0x200A: /* HAIR SPACE */
2162     case 0x202f: /* NARROW NO-BREAK SPACE */
2163     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2164     case 0x3000: /* IDEOGRAPHIC SPACE */
2165     ADD_NEW(state_offset + 1, 0);
2166     break;
2167     }
2168     break;
2169    
2170     /*-----------------------------------------------------------------*/
2171 ph10 602 /* Match a negated single character casefully. This is only used for
2172     one-byte characters, that is, we know that d < 256. The character we are
2173 nigel 77 checking (c) can be multibyte. */
2174    
2175     case OP_NOT:
2176 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2177 nigel 77 break;
2178    
2179     /*-----------------------------------------------------------------*/
2180 ph10 602 /* Match a negated single character caselessly. This is only used for
2181     one-byte characters, that is, we know that d < 256. The character we are
2182     checking (c) can be multibyte. */
2183    
2184     case OP_NOTI:
2185 ph10 654 if (clen > 0 && c != d && c != fcc[d])
2186 ph10 602 { ADD_NEW(state_offset + dlen + 1, 0); }
2187     break;
2188    
2189     /*-----------------------------------------------------------------*/
2190     case OP_PLUSI:
2191     case OP_MINPLUSI:
2192     case OP_POSPLUSI:
2193     case OP_NOTPLUSI:
2194     case OP_NOTMINPLUSI:
2195     case OP_NOTPOSPLUSI:
2196     caseless = TRUE;
2197     codevalue -= OP_STARI - OP_STAR;
2198 ph10 654
2199 ph10 602 /* Fall through */
2200 nigel 77 case OP_PLUS:
2201     case OP_MINPLUS:
2202 nigel 93 case OP_POSPLUS:
2203 nigel 77 case OP_NOTPLUS:
2204     case OP_NOTMINPLUS:
2205 nigel 93 case OP_NOTPOSPLUS:
2206 nigel 77 count = current_state->count; /* Already matched */
2207     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2208     if (clen > 0)
2209     {
2210 nigel 93 unsigned int otherd = NOTACHAR;
2211 ph10 602 if (caseless)
2212 nigel 77 {
2213     #ifdef SUPPORT_UTF8
2214 nigel 87 if (utf8 && d >= 128)
2215 nigel 77 {
2216     #ifdef SUPPORT_UCP
2217 ph10 349 otherd = UCD_OTHERCASE(d);
2218 nigel 77 #endif /* SUPPORT_UCP */
2219     }
2220     else
2221     #endif /* SUPPORT_UTF8 */
2222     otherd = fcc[d];
2223     }
2224     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2225 nigel 93 {
2226     if (count > 0 &&
2227     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2228     {
2229     active_count--; /* Remove non-match possibility */
2230     next_active_state--;
2231     }
2232     count++;
2233     ADD_NEW(state_offset, count);
2234     }
2235 nigel 77 }
2236     break;
2237    
2238     /*-----------------------------------------------------------------*/
2239 ph10 602 case OP_QUERYI:
2240     case OP_MINQUERYI:
2241     case OP_POSQUERYI:
2242     case OP_NOTQUERYI:
2243     case OP_NOTMINQUERYI:
2244     case OP_NOTPOSQUERYI:
2245     caseless = TRUE;
2246     codevalue -= OP_STARI - OP_STAR;
2247     /* Fall through */
2248 nigel 77 case OP_QUERY:
2249     case OP_MINQUERY:
2250 nigel 93 case OP_POSQUERY:
2251 nigel 77 case OP_NOTQUERY:
2252     case OP_NOTMINQUERY:
2253 nigel 93 case OP_NOTPOSQUERY:
2254 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2255     if (clen > 0)
2256     {
2257 nigel 93 unsigned int otherd = NOTACHAR;
2258 ph10 602 if (caseless)
2259 nigel 77 {
2260     #ifdef SUPPORT_UTF8
2261 nigel 87 if (utf8 && d >= 128)
2262 nigel 77 {
2263     #ifdef SUPPORT_UCP
2264 ph10 349 otherd = UCD_OTHERCASE(d);
2265 nigel 77 #endif /* SUPPORT_UCP */
2266     }
2267     else
2268     #endif /* SUPPORT_UTF8 */
2269     otherd = fcc[d];
2270     }
2271     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2272 nigel 93 {
2273     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2274     {
2275     active_count--; /* Remove non-match possibility */
2276     next_active_state--;
2277     }
2278     ADD_NEW(state_offset + dlen + 1, 0);
2279     }
2280 nigel 77 }
2281     break;
2282    
2283     /*-----------------------------------------------------------------*/
2284 ph10 602 case OP_STARI:
2285     case OP_MINSTARI:
2286     case OP_POSSTARI:
2287     case OP_NOTSTARI:
2288     case OP_NOTMINSTARI:
2289     case OP_NOTPOSSTARI:
2290     caseless = TRUE;
2291     codevalue -= OP_STARI - OP_STAR;
2292     /* Fall through */
2293 nigel 77 case OP_STAR:
2294     case OP_MINSTAR:
2295 nigel 93 case OP_POSSTAR:
2296 nigel 77 case OP_NOTSTAR:
2297     case OP_NOTMINSTAR:
2298 nigel 93 case OP_NOTPOSSTAR:
2299 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2300     if (clen > 0)
2301     {
2302 nigel 93 unsigned int otherd = NOTACHAR;
2303 ph10 602 if (caseless)
2304 nigel 77 {
2305     #ifdef SUPPORT_UTF8
2306 nigel 87 if (utf8 && d >= 128)
2307 nigel 77 {
2308     #ifdef SUPPORT_UCP
2309 ph10 349 otherd = UCD_OTHERCASE(d);
2310 nigel 77 #endif /* SUPPORT_UCP */
2311     }
2312     else
2313     #endif /* SUPPORT_UTF8 */
2314     otherd = fcc[d];
2315     }
2316     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2317 nigel 93 {
2318     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2319     {
2320     active_count--; /* Remove non-match possibility */
2321     next_active_state--;
2322     }
2323     ADD_NEW(state_offset, 0);
2324     }
2325 nigel 77 }
2326     break;
2327    
2328     /*-----------------------------------------------------------------*/
2329 ph10 602 case OP_EXACTI:
2330     case OP_NOTEXACTI:
2331     caseless = TRUE;
2332     codevalue -= OP_STARI - OP_STAR;
2333     /* Fall through */
2334 nigel 77 case OP_EXACT:
2335 nigel 93 case OP_NOTEXACT:
2336     count = current_state->count; /* Number already matched */
2337     if (clen > 0)
2338     {
2339     unsigned int otherd = NOTACHAR;
2340 ph10 602 if (caseless)
2341 nigel 93 {
2342     #ifdef SUPPORT_UTF8
2343     if (utf8 && d >= 128)
2344     {
2345     #ifdef SUPPORT_UCP
2346 ph10 349 otherd = UCD_OTHERCASE(d);
2347 nigel 93 #endif /* SUPPORT_UCP */
2348     }
2349     else
2350     #endif /* SUPPORT_UTF8 */
2351     otherd = fcc[d];
2352     }
2353     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2354     {
2355     if (++count >= GET2(code, 1))
2356 zherczeg 769 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2357 nigel 93 else
2358     { ADD_NEW(state_offset, count); }
2359     }
2360     }
2361     break;
2362    
2363     /*-----------------------------------------------------------------*/
2364 ph10 602 case OP_UPTOI:
2365     case OP_MINUPTOI:
2366     case OP_POSUPTOI:
2367     case OP_NOTUPTOI:
2368     case OP_NOTMINUPTOI:
2369     case OP_NOTPOSUPTOI:
2370     caseless = TRUE;
2371     codevalue -= OP_STARI - OP_STAR;
2372     /* Fall through */
2373 nigel 77 case OP_UPTO:
2374     case OP_MINUPTO:
2375 nigel 93 case OP_POSUPTO:
2376 nigel 77 case OP_NOTUPTO:
2377     case OP_NOTMINUPTO:
2378 nigel 93 case OP_NOTPOSUPTO:
2379 zherczeg 769 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2380 nigel 77 count = current_state->count; /* Number already matched */
2381     if (clen > 0)
2382     {
2383 nigel 93 unsigned int otherd = NOTACHAR;
2384 ph10 602 if (caseless)
2385 nigel 77 {
2386     #ifdef SUPPORT_UTF8
2387 nigel 87 if (utf8 && d >= 128)
2388 nigel 77 {
2389     #ifdef SUPPORT_UCP
2390 ph10 349 otherd = UCD_OTHERCASE(d);
2391 nigel 77 #endif /* SUPPORT_UCP */
2392     }
2393     else
2394     #endif /* SUPPORT_UTF8 */
2395     otherd = fcc[d];
2396     }
2397     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2398     {
2399 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2400     {
2401     active_count--; /* Remove non-match possibility */
2402     next_active_state--;
2403     }
2404 nigel 77 if (++count >= GET2(code, 1))
2405 zherczeg 769 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2406 nigel 77 else
2407     { ADD_NEW(state_offset, count); }
2408     }
2409     }
2410     break;
2411    
2412    
2413     /* ========================================================================== */
2414     /* These are the class-handling opcodes */
2415    
2416     case OP_CLASS:
2417     case OP_NCLASS:
2418     case OP_XCLASS:
2419     {
2420     BOOL isinclass = FALSE;
2421     int next_state_offset;
2422 ph10 756 const pcre_uchar *ecode;
2423 nigel 77
2424     /* For a simple class, there is always just a 32-byte table, and we
2425     can set isinclass from it. */
2426    
2427     if (codevalue != OP_XCLASS)
2428     {
2429 zherczeg 770 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2430 nigel 77 if (clen > 0)
2431     {
2432     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2433     ((code[1 + c/8] & (1 << (c&7))) != 0);
2434     }
2435     }
2436    
2437     /* An extended class may have a table or a list of single characters,
2438     ranges, or both, and it may be positive or negative. There's a
2439     function that sorts all this out. */
2440    
2441     else
2442     {
2443     ecode = code + GET(code, 1);
2444 zherczeg 764 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE);
2445 nigel 77 }
2446    
2447     /* At this point, isinclass is set for all kinds of class, and ecode
2448     points to the byte after the end of the class. If there is a
2449     quantifier, this is where it will be. */
2450    
2451 ph10 530 next_state_offset = (int)(ecode - start_code);
2452 nigel 77
2453     switch (*ecode)
2454     {
2455     case OP_CRSTAR:
2456     case OP_CRMINSTAR:
2457     ADD_ACTIVE(next_state_offset + 1, 0);
2458     if (isinclass) { ADD_NEW(state_offset, 0); }
2459     break;
2460    
2461     case OP_CRPLUS:
2462     case OP_CRMINPLUS:
2463     count = current_state->count; /* Already matched */
2464     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2465     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2466     break;
2467    
2468     case OP_CRQUERY:
2469     case OP_CRMINQUERY:
2470     ADD_ACTIVE(next_state_offset + 1, 0);
2471     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2472     break;
2473    
2474     case OP_CRRANGE:
2475     case OP_CRMINRANGE:
2476     count = current_state->count; /* Already matched */
2477     if (count >= GET2(ecode, 1))
2478 zherczeg 769 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2479 nigel 77 if (isinclass)
2480     {
2481 nigel 91 int max = GET2(ecode, 3);
2482     if (++count >= max && max != 0) /* Max 0 => no limit */
2483 zherczeg 769 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2484 nigel 77 else
2485     { ADD_NEW(state_offset, count); }
2486     }
2487     break;
2488    
2489     default:
2490     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2491     break;
2492     }
2493     }
2494     break;
2495    
2496     /* ========================================================================== */
2497     /* These are the opcodes for fancy brackets of various kinds. We have
2498 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2499     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2500 ph10 341 though the other "backtracking verbs" are not supported. */
2501 ph10 345
2502 ph10 341 case OP_FAIL:
2503 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2504 ph10 345 break;
2505 nigel 77
2506     case OP_ASSERT:
2507     case OP_ASSERT_NOT:
2508     case OP_ASSERTBACK:
2509     case OP_ASSERTBACK_NOT:
2510     {
2511     int rc;
2512     int local_offsets[2];
2513     int local_workspace[1000];
2514 ph10 756 const pcre_uchar *endasscode = code + GET(code, 1);
2515 nigel 77
2516     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2517    
2518     rc = internal_dfa_exec(
2519     md, /* static match data */
2520     code, /* this subexpression's code */
2521     ptr, /* where we currently are */
2522 ph10 530 (int)(ptr - start_subject), /* start offset */
2523 nigel 77 local_offsets, /* offset vector */
2524     sizeof(local_offsets)/sizeof(int), /* size of same */
2525     local_workspace, /* workspace vector */
2526     sizeof(local_workspace)/sizeof(int), /* size of same */
2527 ph10 642 rlevel); /* function recursion level */
2528 ph10 487
2529 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2530 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2531 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2532 nigel 77 }
2533     break;
2534    
2535     /*-----------------------------------------------------------------*/
2536     case OP_COND:
2537 nigel 93 case OP_SCOND:
2538 nigel 77 {
2539     int local_offsets[1000];
2540     int local_workspace[1000];
2541 ph10 406 int codelink = GET(code, 1);
2542 ph10 397 int condcode;
2543 ph10 406
2544 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2545 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2546 ph10 398 happen for the other conditions. */
2547 nigel 77
2548 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2549 ph10 406 {
2550     rrc = 0;
2551 ph10 397 if (pcre_callout != NULL)
2552     {
2553     pcre_callout_block cb;
2554     cb.version = 1; /* Version 1 of the callout block */
2555     cb.callout_number = code[LINK_SIZE+2];
2556     cb.offset_vector = offsets;
2557     cb.subject = (PCRE_SPTR)start_subject;
2558 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2559     cb.start_match = (int)(current_subject - start_subject);
2560     cb.current_position = (int)(ptr - start_subject);
2561 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2562     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2563     cb.capture_top = 1;
2564     cb.capture_last = -1;
2565     cb.callout_data = md->callout_data;
2566 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2567 ph10 397 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2568     }
2569 ph10 398 if (rrc > 0) break; /* Fail this thread */
2570 zherczeg 764 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2571 ph10 406 }
2572 ph10 398
2573 ph10 397 condcode = code[LINK_SIZE+1];
2574 ph10 406
2575 nigel 93 /* Back reference conditions are not supported */
2576 nigel 77
2577 ph10 461 if (condcode == OP_CREF || condcode == OP_NCREF)
2578 ph10 459 return PCRE_ERROR_DFA_UCOND;
2579 nigel 93
2580     /* The DEFINE condition is always false */
2581    
2582     if (condcode == OP_DEF)
2583 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2584 nigel 93
2585     /* The only supported version of OP_RREF is for the value RREF_ANY,
2586     which means "test if in any recursion". We can't test for specifically
2587     recursed groups. */
2588    
2589 ph10 459 else if (condcode == OP_RREF || condcode == OP_NRREF)
2590 nigel 93 {
2591 nigel 77 int value = GET2(code, LINK_SIZE+2);
2592 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2593 ph10 654 if (md->recursive != NULL)
2594 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2595     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2596 nigel 77 }
2597    
2598     /* Otherwise, the condition is an assertion */
2599    
2600     else
2601     {
2602     int rc;
2603 ph10 756 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2604     const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2605 nigel 77
2606     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2607    
2608     rc = internal_dfa_exec(
2609     md, /* fixed match data */
2610     asscode, /* this subexpression's code */
2611     ptr, /* where we currently are */
2612 ph10 530 (int)(ptr - start_subject), /* start offset */
2613 nigel 77 local_offsets, /* offset vector */
2614     sizeof(local_offsets)/sizeof(int), /* size of same */
2615     local_workspace, /* workspace vector */
2616     sizeof(local_workspace)/sizeof(int), /* size of same */
2617 ph10 642 rlevel); /* function recursion level */
2618 nigel 77
2619 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2620 nigel 77 if ((rc >= 0) ==
2621     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2622 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2623 nigel 77 else
2624 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2625 nigel 77 }
2626     }
2627     break;
2628    
2629     /*-----------------------------------------------------------------*/
2630     case OP_RECURSE:
2631     {
2632 ph10 654 dfa_recursion_info *ri;
2633 nigel 77 int local_offsets[1000];
2634     int local_workspace[1000];
2635 ph10 756 const pcre_uchar *callpat = start_code + GET(code, 1);
2636 ph10 654 int recno = (callpat == md->start_code)? 0 :
2637     GET2(callpat, 1 + LINK_SIZE);
2638 nigel 77 int rc;
2639    
2640 ph10 642 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2641 ph10 654
2642 ph10 642 /* Check for repeating a recursion without advancing the subject
2643     pointer. This should catch convoluted mutual recursions. (Some simple
2644     cases are caught at compile time.) */
2645 nigel 77
2646 ph10 654 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2647     if (recno == ri->group_num && ptr == ri->subject_position)
2648     return PCRE_ERROR_RECURSELOOP;
2649    
2650     /* Remember this recursion and where we started it so as to
2651 ph10 642 catch infinite loops. */
2652 ph10 654
2653 ph10 642 new_recursive.group_num = recno;
2654     new_recursive.subject_position = ptr;
2655     new_recursive.prevrec = md->recursive;
2656 ph10 654 md->recursive = &new_recursive;
2657 ph10 642
2658 nigel 77 rc = internal_dfa_exec(
2659     md, /* fixed match data */
2660 ph10 642 callpat, /* this subexpression's code */
2661 nigel 77 ptr, /* where we currently are */
2662 ph10 530 (int)(ptr - start_subject), /* start offset */
2663 nigel 77 local_offsets, /* offset vector */
2664     sizeof(local_offsets)/sizeof(int), /* size of same */
2665     local_workspace, /* workspace vector */
2666     sizeof(local_workspace)/sizeof(int), /* size of same */
2667 ph10 642 rlevel); /* function recursion level */
2668 nigel 77
2669 ph10 642 md->recursive = new_recursive.prevrec; /* Done this recursion */
2670 nigel 77
2671 ph10 654 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2672 ph10 642 rc));
2673    
2674 nigel 77 /* Ran out of internal offsets */
2675    
2676     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2677    
2678     /* For each successful matched substring, set up the next state with a
2679     count of characters to skip before trying it. Note that the count is in
2680     characters, not bytes. */
2681    
2682     if (rc > 0)
2683     {
2684     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2685     {
2686 ph10 756 const pcre_uchar *p = start_subject + local_offsets[rc];
2687     const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2688 nigel 77 int charcount = local_offsets[rc+1] - local_offsets[rc];
2689     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2690     if (charcount > 0)
2691     {
2692     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2693     }
2694     else
2695     {
2696     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2697     }
2698     }
2699     }
2700     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2701     }
2702     break;
2703    
2704     /*-----------------------------------------------------------------*/
2705 ph10 604 case OP_BRAPOS:
2706     case OP_SBRAPOS:
2707     case OP_CBRAPOS:
2708     case OP_SCBRAPOS:
2709 ph10 654 case OP_BRAPOSZERO:
2710 ph10 604 {
2711     int charcount, matched_count;
2712 ph10 756 const pcre_uchar *local_ptr = ptr;
2713 ph10 604 BOOL allow_zero;
2714 ph10 654
2715 ph10 604 if (codevalue == OP_BRAPOSZERO)
2716     {
2717     allow_zero = TRUE;
2718     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2719     }
2720 ph10 654 else allow_zero = FALSE;
2721    
2722     /* Loop to match the subpattern as many times as possible as if it were
2723     a complete pattern. */
2724    
2725 ph10 604 for (matched_count = 0;; matched_count++)
2726     {
2727     int local_offsets[2];
2728     int local_workspace[1000];
2729 ph10 654
2730 ph10 604 int rc = internal_dfa_exec(
2731     md, /* fixed match data */
2732     code, /* this subexpression's code */
2733     local_ptr, /* where we currently are */
2734     (int)(ptr - start_subject), /* start offset */
2735     local_offsets, /* offset vector */
2736     sizeof(local_offsets)/sizeof(int), /* size of same */
2737     local_workspace, /* workspace vector */
2738     sizeof(local_workspace)/sizeof(int), /* size of same */
2739 ph10 642 rlevel); /* function recursion level */
2740 ph10 654
2741 ph10 604 /* Failed to match */
2742 ph10 654
2743     if (rc < 0)
2744 ph10 604 {
2745     if (rc != PCRE_ERROR_NOMATCH) return rc;
2746     break;
2747 ph10 654 }
2748    
2749 ph10 604 /* Matched: break the loop if zero characters matched. */
2750 ph10 654
2751 ph10 604 charcount = local_offsets[1] - local_offsets[0];
2752 ph10 654 if (charcount == 0) break;
2753 ph10 604 local_ptr += charcount; /* Advance temporary position ptr */
2754 ph10 654 }
2755 ph10 604
2756     /* At this point we have matched the subpattern matched_count
2757 ph10 654 times, and local_ptr is pointing to the character after the end of the
2758     last match. */
2759 ph10 604
2760     if (matched_count > 0 || allow_zero)
2761 ph10 654 {
2762 ph10 756 const pcre_uchar *end_subpattern = code;
2763 ph10 604 int next_state_offset;
2764 ph10 654
2765 ph10 604 do { end_subpattern += GET(end_subpattern, 1); }
2766     while (*end_subpattern == OP_ALT);
2767     next_state_offset =
2768     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2769    
2770     /* Optimization: if there are no more active states, and there
2771     are no new states yet set up, then skip over the subject string
2772     right here, to save looping. Otherwise, set up the new state to swing
2773     into action when the end of the matched substring is reached. */
2774    
2775     if (i + 1 >= active_count && new_count == 0)
2776     {
2777     ptr = local_ptr;
2778     clen = 0;
2779     ADD_NEW(next_state_offset, 0);
2780     }
2781     else
2782     {
2783 ph10 756 const pcre_uchar *p = ptr;
2784     const pcre_uchar *pp = local_ptr;
2785 ph10 654 charcount = pp - p;
2786 ph10 604 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2787     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2788     }
2789 ph10 654 }
2790     }
2791 ph10 604 break;
2792 ph10 654
2793 ph10 604 /*-----------------------------------------------------------------*/
2794 nigel 77 case OP_ONCE:
2795 ph10 733 case OP_ONCE_NC:
2796 nigel 77 {
2797     int local_offsets[2];
2798     int local_workspace[1000];
2799    
2800     int rc = internal_dfa_exec(
2801     md, /* fixed match data */
2802     code, /* this subexpression's code */
2803     ptr, /* where we currently are */
2804 ph10 530 (int)(ptr - start_subject), /* start offset */
2805 nigel 77 local_offsets, /* offset vector */
2806     sizeof(local_offsets)/sizeof(int), /* size of same */
2807     local_workspace, /* workspace vector */
2808     sizeof(local_workspace)/sizeof(int), /* size of same */
2809 ph10 642 rlevel); /* function recursion level */
2810 nigel 77
2811     if (rc >= 0)
2812     {
2813 ph10 756 const pcre_uchar *end_subpattern = code;
2814 nigel 77 int charcount = local_offsets[1] - local_offsets[0];
2815     int next_state_offset, repeat_state_offset;
2816    
2817     do { end_subpattern += GET(end_subpattern, 1); }
2818     while (*end_subpattern == OP_ALT);
2819 ph10 535 next_state_offset =
2820 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2821 nigel 77
2822     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2823     arrange for the repeat state also to be added to the relevant list.
2824     Calculate the offset, or set -1 for no repeat. */
2825    
2826     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2827     *end_subpattern == OP_KETRMIN)?
2828 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2829 nigel 77
2830     /* If we have matched an empty string, add the next state at the
2831     current character pointer. This is important so that the duplicate
2832     checking kicks in, which is what breaks infinite loops that match an
2833     empty string. */
2834    
2835     if (charcount == 0)
2836     {
2837     ADD_ACTIVE(next_state_offset, 0);
2838     }
2839    
2840     /* Optimization: if there are no more active states, and there
2841     are no new states yet set up, then skip over the subject string
2842     right here, to save looping. Otherwise, set up the new state to swing
2843 ph10 604 into action when the end of the matched substring is reached. */
2844 nigel 77
2845     else if (i + 1 >= active_count && new_count == 0)
2846     {
2847     ptr += charcount;
2848     clen = 0;
2849     ADD_NEW(next_state_offset, 0);
2850    
2851     /* If we are adding a repeat state at the new character position,
2852     we must fudge things so that it is the only current state.
2853     Otherwise, it might be a duplicate of one we processed before, and
2854     that would cause it to be skipped. */
2855    
2856     if (repeat_state_offset >= 0)
2857     {
2858     next_active_state = active_states;
2859     active_count = 0;
2860     i = -1;
2861     ADD_ACTIVE(repeat_state_offset, 0);
2862     }
2863     }
2864     else
2865     {
2866 ph10 756 const pcre_uchar *p = start_subject + local_offsets[0];
2867     const pcre_uchar *pp = start_subject + local_offsets[1];
2868 nigel 77 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2869     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2870     if (repeat_state_offset >= 0)
2871     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2872     }
2873     }
2874     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2875     }
2876     break;
2877    
2878    
2879     /* ========================================================================== */
2880     /* Handle callouts */
2881    
2882     case OP_CALLOUT:
2883 ph10 406 rrc = 0;
2884 nigel 77 if (pcre_callout != NULL)
2885     {
2886     pcre_callout_block cb;
2887     cb.version = 1; /* Version 1 of the callout block */
2888     cb.callout_number = code[1];
2889     cb.offset_vector = offsets;
2890 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2891 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2892     cb.start_match = (int)(current_subject - start_subject);
2893     cb.current_position = (int)(ptr - start_subject);
2894 nigel 77 cb.pattern_position = GET(code, 2);
2895     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2896     cb.capture_top = 1;
2897     cb.capture_last = -1;
2898     cb.callout_data = md->callout_data;
2899 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2900 nigel 77 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2901 ph10 406 }
2902     if (rrc == 0)
2903 zherczeg 764 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
2904 nigel 77 break;
2905    
2906    
2907     /* ========================================================================== */
2908     default: /* Unsupported opcode */
2909     return PCRE_ERROR_DFA_UITEM;
2910     }
2911    
2912     NEXT_ACTIVE_STATE: continue;
2913    
2914     } /* End of loop scanning active states */
2915    
2916     /* We have finished the processing at the current subject character. If no
2917     new states have been set for the next character, we have found all the
2918     matches that we are going to find. If we are at the top level and partial
2919 ph10 463 matching has been requested, check for appropriate conditions.
2920    
2921 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
2922     character. If it is equal to the original active_count (saved in
2923     workspace[1]) it means that (*F) was found on every active state. In this
2924 ph10 463 case we don't want to give a partial match.
2925 nigel 77
2926 ph10 463 The "could_continue" variable is true if a state could have continued but
2927     for the fact that the end of the subject was reached. */
2928    
2929 nigel 77 if (new_count <= 0)
2930     {
2931 ph10 427 if (rlevel == 1 && /* Top level, and */
2932 ph10 463 could_continue && /* Some could go on */
2933 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
2934 ph10 427 ( /* either... */
2935     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2936     || /* or... */
2937     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2938     match_count < 0) /* no matches */
2939     ) && /* And... */
2940 ph10 553 ptr >= end_subject && /* Reached end of subject */
2941     ptr > md->start_used_ptr) /* Inspected non-empty string */
2942 nigel 77 {
2943     if (offsetcount >= 2)
2944     {
2945 ph10 530 offsets[0] = (int)(md->start_used_ptr - start_subject);
2946     offsets[1] = (int)(end_subject - start_subject);
2947 nigel 77 }
2948     match_count = PCRE_ERROR_PARTIAL;
2949     }
2950    
2951     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2952     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2953     rlevel*2-2, SP));
2954 nigel 91 break; /* In effect, "return", but see the comment below */
2955 nigel 77 }
2956    
2957     /* One or more states are active for the next character. */
2958    
2959     ptr += clen; /* Advance to next subject character */
2960     } /* Loop to move along the subject string */
2961    
2962 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2963     if we use "return" above, we have compiler trouble. Some compilers warn if
2964     there's nothing here because they think the function doesn't return a value. On
2965     the other hand, if we put a dummy statement here, some more clever compilers
2966     complain that it can't be reached. Sigh. */
2967 nigel 77
2968 nigel 91 return match_count;
2969 nigel 77 }
2970    
2971    
2972    
2973    
2974     /*************************************************
2975     * Execute a Regular Expression - DFA engine *
2976     *************************************************/
2977    
2978     /* This external function applies a compiled re to a subject string using a DFA
2979     engine. This function calls the internal function multiple times if the pattern
2980     is not anchored.
2981    
2982     Arguments:
2983     argument_re points to the compiled expression
2984 ph10 97 extra_data points to extra data or is NULL
2985 nigel 77 subject points to the subject string
2986     length length of subject string (may contain binary zeros)
2987     start_offset where to start in the subject string
2988     options option bits
2989     offsets vector of match offsets
2990     offsetcount size of same
2991     workspace workspace vector
2992     wscount size of same
2993    
2994     Returns: > 0 => number of match offset pairs placed in offsets
2995     = 0 => offsets overflowed; longest matches are present
2996     -1 => failed to match
2997     < -1 => some kind of unexpected problem
Problems with the arguments are reported before any matching is attempted:
PCRE_ERROR_BADOPTION (option bits outside PUBLIC_DFA_EXEC_OPTIONS),
PCRE_ERROR_NULL (re, subject or workspace is NULL, or offsets is NULL while
offsetcount > 0), PCRE_ERROR_BADCOUNT (offsetcount < 0), PCRE_ERROR_DFA_WSSIZE
(wscount < 20), PCRE_ERROR_BADOFFSET (start_offset < 0 or > length),
PCRE_ERROR_DFA_UMLIMIT (extra_data carries a match limit or a recursion
limit), PCRE_ERROR_BADMAGIC and PCRE_ERROR_BADNEWLINE. When UTF-8 support is
compiled in and the subject is checked, PCRE_ERROR_BADUTF8,
PCRE_ERROR_SHORTUTF8 or PCRE_ERROR_BADUTF8_OFFSET may be returned; for the
first two, if offsetcount >= 2, offsets[0] holds the error offset and
offsets[1] the error code. Any result from the internal function other than
PCRE_ERROR_NOMATCH is returned to the caller unchanged.
2998     */
2999    
3000 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3001 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3002     const char *subject, int length, int start_offset, int options, int *offsets,
3003     int offsetcount, int *workspace, int wscount)
3004     {
3005     real_pcre *re = (real_pcre *)argument_re;
3006     dfa_match_data match_block;
3007 nigel 91 dfa_match_data *md = &match_block;
3008 nigel 77 BOOL utf8, anchored, startline, firstline;
3009 ph10 756 const pcre_uchar *current_subject, *end_subject;
3010     const pcre_uint8 *lcc;
3011 nigel 77
3012     pcre_study_data internal_study;
3013     const pcre_study_data *study = NULL;
3014     real_pcre internal_re;
3015    
3016 zherczeg 774 const pcre_uchar *req_char_ptr;
3017 ph10 756 const pcre_uint8 *start_bits = NULL;
3018 zherczeg 774 BOOL has_first_char = FALSE;
3019     BOOL has_req_char = FALSE;
3020     pcre_uchar first_char = 0;
3021     pcre_uchar first_char2 = 0;
3022     pcre_uchar req_char = 0;
3023     pcre_uchar req_char2 = 0;
3024 nigel 91 int newline;
3025 nigel 77
3026     /* Plausibility checks */
3027    
3028     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3029     if (re == NULL || subject == NULL || workspace == NULL ||
3030     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3031     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3032     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3033 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3034 nigel 77
3035     /* We need to find the pointer to any study data before we test for byte
3036     flipping, so we scan the extra_data block first. This may set two fields in the
3037     match block, so we must initialize them beforehand. However, the other fields
3038     in the match block must not be set until after the byte flipping. */
3039    
3040 nigel 91 md->tables = re->tables;
3041     md->callout_data = NULL;
3042 nigel 77
3043     if (extra_data != NULL)
3044     {
3045     unsigned int flags = extra_data->flags;
3046     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3047     study = (const pcre_study_data *)extra_data->study_data;
/* Match limits are not supported by this function: either limit flag in
extra_data is reported as an error. */
3048     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3049 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3050     return PCRE_ERROR_DFA_UMLIMIT;
3051 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3052 nigel 91 md->callout_data = extra_data->callout_data;
3053 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3054 nigel 91 md->tables = extra_data->tables;
3055 nigel 77 }
3056 ph10 461
3057 nigel 77 /* Check that the first field in the block is the magic number. If it is not,
3058     test for a regex that was compiled on a host of opposite endianness. If this is
3059     the case, flipped values are put in internal_re and internal_study if there was
3060     study data too. */
3061    
3062     if (re->magic_number != MAGIC_NUMBER)
3063     {
3064 zherczeg 764 re = PRIV(try_flipped)(re, &internal_re, study, &internal_study);
3065 nigel 77 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3066     if (study != NULL) study = &internal_study;
3067     }
3068    
3069     /* Set some local values */
3070    
/* NOTE(review): these casts use unsigned char while the variables are declared
as pcre_uchar pointers; presumably the two are the same type in an 8-bit build
- confirm if pcre_uchar can be wider. */
3071     current_subject = (const unsigned char *)subject + start_offset;
3072     end_subject = (const unsigned char *)subject + length;
/* Start one position before the first start point: the required-character
search below is skipped unless its start is beyond req_char_ptr, so this makes
sure the first search is always done. */
3073 zherczeg 774 req_char_ptr = current_subject - 1;
3074 nigel 77
3075 nigel 91 #ifdef SUPPORT_UTF8
3076 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
3077 nigel 91 #else
3078     utf8 = FALSE;
3079     #endif
3080 nigel 77
/* The match is treated as anchored if anchoring was requested at compile time
or at match time, and also whenever this call is a restart. */
3081 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3082     (re->options & PCRE_ANCHORED) != 0;
3083    
3084 nigel 77 /* The remaining fixed data for passing around. */
3085    
/* The base pointer for the compiled code is the caller's block (argument_re),
whereas the name table offset, count and entry size are read through re, which
by now may be the result of the byte-flipping call above. */
3086 ph10 756 md->start_code = (const pcre_uchar *)argument_re +
3087 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3088 nigel 91 md->start_subject = (const unsigned char *)subject;
3089     md->end_subject = end_subject;
3090 ph10 442 md->start_offset = start_offset;
3091 nigel 91 md->moptions = options;
3092     md->poptions = re->options;
3093 nigel 77
3094 ph10 231 /* If the BSR option is not set at match time, copy what was set
3095     at compile time. */
3096    
3097     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3098     {
3099     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3100     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3101     #ifdef BSR_ANYCRLF
3102     else md->moptions |= PCRE_BSR_ANYCRLF;
3103 ph10 243 #endif
3104     }
3105 ph10 231
3106 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3107     nothing is set at run time, whatever was used at compile time applies. */
3108 nigel 91
3109 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3110 nigel 93 PCRE_NEWLINE_BITS)
3111 nigel 91 {
3112 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3113 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3114     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3115 nigel 91 case PCRE_NEWLINE_CR+
3116 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3117 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3118 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3119 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3120 nigel 91 }
3121    
/* Translate the value chosen above into the match block: -2 means ANYCRLF, any
other negative value means ANY, and a non-negative value is a fixed newline of
one character, or of two characters when the value is greater than 255 (the
first character is then in the high-order byte). */
3122 ph10 149 if (newline == -2)
3123 nigel 91 {
3124 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3125     }
3126     else if (newline < 0)
3127     {
3128 nigel 93 md->nltype = NLTYPE_ANY;
3129 nigel 91 }
3130     else
3131     {
3132 nigel 93 md->nltype = NLTYPE_FIXED;
3133     if (newline > 255)
3134     {
3135     md->nllen = 2;
3136     md->nl[0] = (newline >> 8) & 255;
3137     md->nl[1] = newline & 255;
3138     }
3139     else
3140     {
3141     md->nllen = 1;
3142     md->nl[0] = newline;
3143     }
3144 nigel 91 }
3145    
3146 nigel 77 /* Check a UTF-8 string if required. If the string is invalid and there is room
3147     (offsetcount >= 2), the error offset and error code are passed back in
offsets[0] and offsets[1]. A start offset that points into the middle of a
character (at a byte of the form 10xxxxxx) is also rejected. */
3148    
3149     #ifdef SUPPORT_UTF8
3150     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3151     {
3152 ph10 654 int erroroffset;
3153 zherczeg 764 int errorcode = PRIV(valid_utf8)((pcre_uchar *)subject, length, &erroroffset);
3154 ph10 606 if (errorcode != 0)
3155 ph10 598 {
3156     if (offsetcount >= 2)
3157     {
3158 ph10 606 offsets[0] = erroroffset;
3159 ph10 598 offsets[1] = errorcode;
3160 ph10 654 }
3161 ph10 598 return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0)?
3162 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3163 ph10 654 }
3164 ph10 606 if (start_offset > 0 && start_offset < length &&
3165 ph10 756 (((PCRE_PUCHAR)subject)[start_offset] & 0xc0) == 0x80)
3166 ph10 606 return PCRE_ERROR_BADUTF8_OFFSET;
3167 nigel 77 }
3168     #endif
3169    
3170     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3171     is a feature that makes it possible to save compiled regex and re-use them
3172     in other programs later. */
3173    
3174 zherczeg 764 if (md->tables == NULL) md->tables = PRIV(default_tables);
3175 nigel 77
3176     /* The lower casing table and the "must be at the start of a line" flag are
3177     used in a loop when finding where to start. */
3178    
/* NOTE(review): lcc is assigned here, but no later use of it is visible in
this function - confirm whether it is still needed. */
3179 nigel 91 lcc = md->tables + lcc_offset;
3180 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3181 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3182    
3183     /* Set up the first character to match, if available. The first_char value is
3184     never set for an anchored regular expression, but the anchoring may be forced
3185     at run time, so we have to test for anchoring. The first char may be unset for
3186     an unanchored pattern, of course. If there's no first char and the pattern was
3187     studied, there may be a bitmap of possible first characters. */
3188    
3189     if (!anchored)
3190     {
3191 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3192 nigel 77 {
3193 zherczeg 774 has_first_char = TRUE;
3194     first_char = first_char2 = re->first_char;
3195     if ((re->flags & PCRE_FCH_CASELESS) != 0)
3196     first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3197 nigel 77 }
3198     else
3199     {
3200 ph10 455 if (!startline && study != NULL &&
3201     (study->flags & PCRE_STUDY_MAPPED) != 0)
3202 nigel 77 start_bits = study->start_bits;
3203     }
3204     }
3205    
3206     /* For anchored or unanchored matches, there may be a "last known required
3207     character" set. */
3208    
3209 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3210 nigel 77 {
3211 zherczeg 774 has_req_char = TRUE;
3212     req_char = req_char2 = re->req_char;
3213     if ((re->flags & PCRE_RCH_CASELESS) != 0)
3214     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3215 nigel 77 }
3216    
3217     /* Call the main matching function, looping for a non-anchored regex after a
3218 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3219     a match. */
3220 nigel 77
3221     for (;;)
3222     {
3223     int rc;
3224    
3225     if ((options & PCRE_DFA_RESTART) == 0)
3226     {
3227 ph10 756 const pcre_uchar *save_end_subject = end_subject;
3228 nigel 77
3229 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3230     line of a multiline string. Implement this by temporarily adjusting
3231     end_subject so that we stop scanning at a newline. If the match fails at
3232     the newline, later code breaks this loop. */
3233 nigel 77
3234     if (firstline)
3235     {
3236 ph10 756 PCRE_PUCHAR t = current_subject;
3237 ph10 365 #ifdef SUPPORT_UTF8
3238     if (utf8)
3239 ph10 371 {
3240     while (t < md->end_subject && !IS_NEWLINE(t))
3241 ph10 365 {
3242     t++;
3243     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
3244 ph10 371 }
3245 ph10 365 }
3246     else
3247 ph10 371 #endif
3248 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3249 nigel 77 end_subject = t;
3250     }
3251 ph10 392
3252 ph10 389 /* There are some optimizations that avoid running the match if a known
3253 ph10 455 starting point is not found. However, there is an option that disables
3254 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3255 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3256     match-time options. */
3257 nigel 77
3258 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3259 ph10 392 {
3260 zherczeg 774 /* Advance to a known first char. */
3261 ph10 392
3262 zherczeg 774 if (has_first_char)
3263 nigel 77 {
3264 zherczeg 774 if (first_char != first_char2)
3265 ph10 389 while (current_subject < end_subject &&
3266 zherczeg 774 *current_subject != first_char && *current_subject != first_char2)
3267 ph10 389 current_subject++;
3268     else
3269 ph10 392 while (current_subject < end_subject &&
3270 zherczeg 774 *current_subject != first_char)
3271 ph10 389 current_subject++;
3272     }
3273 ph10 392
3274 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3275 ph10 392
3276 ph10 389 else if (startline)
3277     {
3278     if (current_subject > md->start_subject + start_offset)
3279     {
3280 ph10 365 #ifdef SUPPORT_UTF8
3281 ph10 389 if (utf8)
3282 ph10 365 {
3283 ph10 392 while (current_subject < end_subject &&
3284 ph10 389 !WAS_NEWLINE(current_subject))
3285     {
3286 ph10 365 current_subject++;
3287 ph10 389 while(current_subject < end_subject &&
3288     (*current_subject & 0xc0) == 0x80)
3289     current_subject++;
3290     }
3291 ph10 371 }
3292 ph10 389 else
3293     #endif
3294     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3295     current_subject++;
3296 ph10 392
3297 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3298     ANYCRLF, and we are now at a LF, advance the match position by one
3299     more character. */
3300 ph10 392
3301 ph10 391 if (current_subject[-1] == CHAR_CR &&
3302 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3303     current_subject < end_subject &&
3304 ph10 391 *current_subject == CHAR_NL)
3305 ph10 389 current_subject++;
3306 ph10 365 }
3307 nigel 77 }
3308 ph10 392
3309 ph10 389 /* Or to a non-unique first char after study */
3310 ph10 392
3311 ph10 389 else if (start_bits != NULL)
3312 nigel 77 {
3313 ph10 389 while (current_subject < end_subject)
3314     {
3315     register unsigned int c = *current_subject;
/* NOTE(review): c indexes the start_bits bitmap directly; this assumes the
character value is within the range the bitmap covers - confirm if pcre_uchar
can be wider than 8 bits. */
3316 ph10 545 if ((start_bits[c/8] & (1 << (c&7))) == 0)
3317 ph10 538 {
3318     current_subject++;
3319     #ifdef SUPPORT_UTF8
3320     if (utf8)
3321 ph10 545 while(current_subject < end_subject &&
3322 ph10 538 (*current_subject & 0xc0) == 0x80) current_subject++;
3323 ph10 545 #endif
3324 ph10 538 }
3325     else break;
3326 ph10 389 }
3327 nigel 77 }
3328 ph10 392 }
3329 nigel 77
3330     /* Restore fudged end_subject */
3331    
3332     end_subject = save_end_subject;
3333    
3334 ph10 461 /* The following two optimizations are disabled for partial matching or if
3335     disabling is explicitly requested (and of course, by the test above, this
3336 ph10 455 code is not obeyed when restarting after a partial match). */
3337 ph10 461
3338 ph10 728 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3339 ph10 455 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3340 ph10 461 {
3341 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3342     is a lower bound; no actual string of that length may actually match the
3343     pattern. Although the value is, strictly, in characters, we treat it as
3344     bytes to avoid spending too much time in this optimization. */
3345 nigel 77
3346 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3347 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3348 ph10 455 return PCRE_ERROR_NOMATCH;
3349 ph10 461
3350 zherczeg 774 /* If req_char is set, we know that that character must appear in the
3351     subject for the match to succeed. If the first character is set, req_char
3352 ph10 455 must be later in the subject; otherwise the test starts at the match
3353     point. This optimization can save a huge amount of work in patterns with
3354     nested unlimited repeats that aren't going to match. Writing separate
3355     code for cased/caseless versions makes it go faster, as does using an
3356     autoincrement and backing off on a match.
3357 ph10 461
3358 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3359     can take a long time, and give bad performance on quite ordinary
3360     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3361     string... so we don't do this when the string is sufficiently long. */
3362 ph10 461
3363 zherczeg 774 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3364 nigel 77 {
3365 zherczeg 774 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3366 ph10 461
3367 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3368     place we found it at last time. */
3369 ph10 461
3370 zherczeg 774 if (p > req_char_ptr)
3371 nigel 77 {
3372 zherczeg 774 if (req_char != req_char2)
3373 ph10 455 {
3374     while (p < end_subject)
3375     {
3376     register int pp = *p++;
3377 zherczeg 774 if (pp == req_char || pp == req_char2) { p--; break; }
3378 ph10 455 }
3379     }
3380     else
3381     {
3382     while (p < end_subject)
3383     {
3384 zherczeg 774 if (*p++ == req_char) { p--; break; }
3385 ph10 455 }
3386     }
3387 ph10 461
3388 ph10 455 /* If we can't find the required character, break the matching loop,
3389     which will cause a return of PCRE_ERROR_NOMATCH. */
3390 ph10 461
3391 ph10 455 if (p >= end_subject) break;
3392 ph10 461
3393 ph10 455 /* If we have found the required character, save the point where we
3394     found it, so that we don't search again next time round the loop if
3395     the start hasn't passed this character yet. */
3396 ph10 461
3397 zherczeg 774 req_char_ptr = p;
3398 nigel 77 }
3399 ph10 461 }
3400 nigel 77 }
3401 ph10 455 } /* End of optimizations that are done when not restarting */
3402 nigel 77
3403     /* OK, now we can do the business */
3404    
3405 ph10 435 md->start_used_ptr = current_subject;
3406 ph10 654 md->recursive = NULL;
3407 ph10 461
3408 nigel 77 rc = internal_dfa_exec(
3409 nigel 91 md, /* fixed match data */
3410     md->start_code, /* this subexpression's code */
3411     current_subject, /* where we currently are */
3412     start_offset, /* start offset in subject */
3413     offsets, /* offset vector */
3414     offsetcount, /* size of same */
3415     workspace, /* workspace vector */
3416     wscount, /* size of same */
3417 ph10 642 0); /* function recurse level */
3418 nigel 77
3419     /* Anything other than "no match" means we are done, always; otherwise, carry
3420     on only if not anchored. */
3421    
3422     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3423    
3424     /* Advance to the next subject character unless we are at the end of a line
3425     and firstline is set. */
3426    
3427 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3428 nigel 77 current_subject++;
3429     if (utf8)
3430     {
3431     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3432     current_subject++;
3433     }
/* Give up only when the start point has moved beyond the end of the subject; a
start point exactly at the end is still tried. */
3434     if (current_subject > end_subject) break;
3435    
3436 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3437 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3438     or ANY or ANYCRLF, advance the match position by one more character. */
3439 nigel 93
3440 ph10 391 if (current_subject[-1] == CHAR_CR &&
3441 ph10 226 current_subject < end_subject &&
3442 ph10 391 *current_subject == CHAR_NL &&
3443 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3444 ph10 226 (md->nltype == NLTYPE_ANY ||
3445     md->nltype == NLTYPE_ANYCRLF ||
3446     md->nllen == 2))
3447 nigel 93 current_subject++;
3448    
3449     } /* "Bumpalong" loop */
3450    
3451 nigel 77 return PCRE_ERROR_NOMATCH;
3452     }
3453    
3454     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12