/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1477 - (hide annotations) (download)
Wed May 21 17:53:49 2014 UTC (6 months, 1 week ago) by ph10
File MIME type: text/plain
File size: 126629 byte(s)
Casts and type changes for compiler warnings.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 1431 Copyright (c) 1997-2014 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43 ph10 960 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 nigel 93 applications. */
45 nigel 77
46    
47 ph10 461 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
48     the performance of his patterns greatly. I could not use it as it stood, as it
49     was not thread safe, and made assumptions about pattern sizes. Also, it caused
50 ph10 439 test 7 to loop, and test 9 to crash with a segfault.
51    
52     The issue is the check for duplicate states, which is done by a simple linear
53     search up the state list. (Grep for "duplicate" below to find the code.) For
54     many patterns, there will never be many states active at one time, so a simple
55     linear search is fine. In patterns that have many active states, it might be a
56     bottleneck. The suggested code used an indexing scheme to remember which states
57     had previously been used for each character, and avoided the linear search when
58     it knew there was no chance of a duplicate. This was implemented when adding
59     states to the state lists.
60    
61     I wrote some thread-safe, not-limited code to try something similar at the time
62     of checking for duplicates (instead of when adding states), using index vectors
63     on the stack. It did give a 13% improvement with one specially constructed
64     pattern for certain subject strings, but on other strings and on many of the
65     simpler patterns in the test suite it did worse. The major problem, I think,
66     was the extra time to initialize the index. This had to be done for each call
67     of internal_dfa_exec(). (The supplied patch used a static vector, initialized
68     only once - I suspect this was the cause of the problems with the tests.)
69    
70 ph10 461 Overall, I concluded that the gains in some cases did not outweigh the losses
71 ph10 439 in others, so I abandoned this code. */
72    
73    
74    
75 ph10 200 #ifdef HAVE_CONFIG_H
76 ph10 236 #include "config.h"
77 ph10 200 #endif
78 ph10 199
/* These three names are defined before pcre_internal.h is included.
NOTE(review): presumably macros in that header expand NLBLOCK, PSSTART and
PSEND, so the identifiers on the right-hand side must be in scope wherever
those macros are used - confirm against pcre_internal.h. internal_dfa_exec()
in this file does have locals called md, start_subject and end_subject. */
79 nigel 93 #define NLBLOCK md /* Block containing newline information */
80     #define PSSTART start_subject /* Field containing processed string start */
81     #define PSEND end_subject /* Field containing processed string end */
82    
83 nigel 77 #include "pcre_internal.h"
84    
85    
86     /* For use to indent debugging output */
87    
88     #define SP " "
89    
90    
91     /*************************************************
92     * Code parameters and static tables *
93     *************************************************/
94    
95     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
96 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
97 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
98 ph10 178 never stored, so we push them well clear of the normal opcodes. */
99 nigel 77
/* Each offset is added to codevalue in internal_dfa_exec() when the opcode is
OP_TYPESTAR or higher and its operand is one of the opcodes named on the
right. The sum is only ever used as a switch value. */
100 ph10 178 #define OP_PROP_EXTRA 300 /* operand is OP_PROP or OP_NOTPROP */
101     #define OP_EXTUNI_EXTRA 320 /* operand is OP_EXTUNI */
102     #define OP_ANYNL_EXTRA 340 /* operand is OP_ANYNL */
103     #define OP_HSPACE_EXTRA 360 /* operand is OP_HSPACE or OP_NOT_HSPACE */
104     #define OP_VSPACE_EXTRA 380 /* operand is OP_VSPACE or OP_NOT_VSPACE */
105 nigel 77
106    
107     /* This table identifies those opcodes that are followed immediately by a
108 ph10 510 character that is to be tested in some way. This makes it possible to
109 nigel 77 centralize the loading of these characters. In the case of Type * etc, the
110     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
111 ph10 463 small value. Non-zero values in the table are the offsets from the opcode where
112 ph10 462 the character is to be found. ***NOTE*** If the start of this table is
113     modified, the three tables that follow must also be modified. */
114 nigel 77
/* coptable is indexed by opcode value. A zero entry means no inline character
is loaded for that opcode; a non-zero entry is the offset from the opcode at
which internal_dfa_exec() loads the value it calls d. Entries of 1+IMM2_SIZE
belong to the upto/minupto/exact/upto+ forms (presumably skipping an
IMM2_SIZE-wide repeat count - confirm against the opcode layout). The table
must have exactly OP_TABLE_LENGTH entries: a pair of case labels inside
internal_dfa_exec() compares sizeof(coptable) with OP_TABLE_LENGTH so that a
mismatch fails at compile time. */
115 ph10 836 static const pcre_uint8 coptable[] = {
116 nigel 77 0, /* End */
117 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
118     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
119 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
120 ph10 498 0, 0, /* \P, \p */
121 ph10 178 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
122 ph10 498 0, /* \X */
123 ph10 1363 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
124 nigel 77 1, /* Char */
125 ph10 602 1, /* Chari */
126 nigel 77 1, /* not */
127 ph10 602 1, /* noti */
128 nigel 77 /* Positive single-char repeats */
129     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
130 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
131     1+IMM2_SIZE, /* exact */
132     1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
133 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
134 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
135     1+IMM2_SIZE, /* exact I */
136     1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
137 nigel 77 /* Negative single-char repeats - only for chars < 256 */
138     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
139 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
140     1+IMM2_SIZE, /* NOT exact */
141     1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
142 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
143 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
144     1+IMM2_SIZE, /* NOT exact I */
145     1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
146 nigel 77 /* Positive type repeats */
147     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
148 ph10 836 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
149     1+IMM2_SIZE, /* Type exact */
150     1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
151 nigel 77 /* Character class & ref repeats */
152     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
153     0, 0, /* CRRANGE, CRMINRANGE */
154 ph10 1379 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
155 nigel 77 0, /* CLASS */
156     0, /* NCLASS */
157     0, /* XCLASS - variable length */
158     0, /* REF */
159 ph10 602 0, /* REFI */
160 ph10 1361 0, /* DNREF */
161     0, /* DNREFI */
162 nigel 77 0, /* RECURSE */
163     0, /* CALLOUT */
164     0, /* Alt */
165     0, /* Ket */
166     0, /* KetRmax */
167     0, /* KetRmin */
168 ph10 604 0, /* KetRpos */
169 ph10 637 0, /* Reverse */
170 nigel 77 0, /* Assert */
171     0, /* Assert not */
172     0, /* Assert behind */
173     0, /* Assert behind not */
174 ph10 723 0, 0, /* ONCE, ONCE_NC */
175     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
176 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
177 ph10 1365 0, 0, /* CREF, DNCREF */
178     0, 0, /* RREF, DNRREF */
179 nigel 93 0, /* DEF */
180 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
181 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
182     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
183     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
184     0, 0 /* CLOSE, SKIPZERO */
185 nigel 77 };
186    
187 ph10 463 /* This table identifies those opcodes that inspect a character. It is used to
188 ph10 462 remember the fact that a character could have been inspected when the end of
189 ph10 463 the subject is reached. ***NOTE*** If the start of this table is modified, the
190     two tables that follow must also be modified. */
191 ph10 462
/* poptable is indexed by opcode value. internal_dfa_exec() consults it only
when the end of the subject has been reached (clen == 0): a non-zero entry for
the current opcode sets could_continue. Like coptable, it must have exactly
OP_TABLE_LENGTH entries; a pair of case labels inside internal_dfa_exec()
compares sizeof(poptable) with OP_TABLE_LENGTH so that a mismatch fails at
compile time. */
192 ph10 836 static const pcre_uint8 poptable[] = {
193 ph10 462 0, /* End */
194 ph10 463 0, 0, 0, 1, 1, /* \A, \G, \K, \B, \b */
195 ph10 462 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
196     1, 1, 1, /* Any, AllAny, Anybyte */
197 ph10 498 1, 1, /* \P, \p */
198 ph10 462 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
199 ph10 498 1, /* \X */
200 ph10 1363 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
201 ph10 462 1, /* Char */
202 ph10 602 1, /* Chari */
203 ph10 462 1, /* not */
204 ph10 602 1, /* noti */
205 ph10 462 /* Positive single-char repeats */
206     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
207     1, 1, 1, /* upto, minupto, exact */
208     1, 1, 1, 1, /* *+, ++, ?+, upto+ */
209 ph10 602 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
210     1, 1, 1, /* upto I, minupto I, exact I */
211     1, 1, 1, 1, /* *+I, ++I, ?+I, upto+I */
212 ph10 462 /* Negative single-char repeats - only for chars < 256 */
213     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
214     1, 1, 1, /* NOT upto, minupto, exact */
215     1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
216 ph10 602 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
217     1, 1, 1, /* NOT upto I, minupto I, exact I */
218     1, 1, 1, 1, /* NOT *+I, ++I, ?+I, upto+I */
219 ph10 462 /* Positive type repeats */
220     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
221     1, 1, 1, /* Type upto, minupto, exact */
222     1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
223     /* Character class & ref repeats */
224     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
225     1, 1, /* CRRANGE, CRMINRANGE */
226 ph10 1379 1, 1, 1, 1, /* Possessive *+, ++, ?+, CRPOSRANGE */
227 ph10 462 1, /* CLASS */
228     1, /* NCLASS */
229     1, /* XCLASS - variable length */
230     0, /* REF */
231 ph10 602 0, /* REFI */
232 ph10 1361 0, /* DNREF */
233     0, /* DNREFI */
234 ph10 462 0, /* RECURSE */
235     0, /* CALLOUT */
236     0, /* Alt */
237     0, /* Ket */
238     0, /* KetRmax */
239     0, /* KetRmin */
240 ph10 604 0, /* KetRpos */
241 ph10 637 0, /* Reverse */
242 ph10 462 0, /* Assert */
243     0, /* Assert not */
244     0, /* Assert behind */
245     0, /* Assert behind not */
246 ph10 723 0, 0, /* ONCE, ONCE_NC */
247     0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
248 ph10 604 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
249 ph10 1365 0, 0, /* CREF, DNCREF */
250     0, 0, /* RREF, DNRREF */
251 ph10 462 0, /* DEF */
252 ph10 604 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
253 ph10 613 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
254     0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
255     0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
256     0, 0 /* CLOSE, SKIPZERO */
257 ph10 462 };
258    
259 nigel 77 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
260     and \w */
261    
/* toptable1 holds a ctype_* value per opcode for the opcodes from End up to
OP_ALLANY. Entries follow the opcode order documented in coptable: the six
leading zeros cover End, \A, \G, \K, \B and \b, and each character-type pair
shares the same ctype bit.
NOTE(review): presumably this is ANDed with a character's ctypes entry and the
result combined with toptable2 - the use is outside this view, confirm. */
262 ph10 836 static const pcre_uint8 toptable1[] = {
263 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
264 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
265     ctype_space, ctype_space, /* \S, \s */
266     ctype_word, ctype_word, /* \W, \w */
267 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
268 nigel 77 };
269    
/* toptable2 is the companion of toptable1, with the same per-opcode layout
(End up to OP_ALLANY). Here only the first entry of each character-type pair
(\D, \S, \W) carries the ctype bit, and OP_ANY and OP_ALLANY carry 1.
NOTE(review): presumably this value flips the sense of the toptable1 test for
the negated types - the use is outside this view, confirm. */
270 ph10 836 static const pcre_uint8 toptable2[] = {
271 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
272 nigel 77 ctype_digit, 0, /* \D, \d */
273     ctype_space, 0, /* \S, \s */
274     ctype_word, 0, /* \W, \w */
275 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
276 nigel 77 };
277    
278    
279     /* Structure for holding data about a particular state, which is in effect the
280     current data for an active path through the match tree. It must consist
281     entirely of ints because the working vector we are passed, and which we put
282     these structures in, is a vector of ints. */
283    
/* One entry in the active or new state list used by internal_dfa_exec(). */
284     typedef struct stateblock {
285     int offset; /* Offset to opcode from start_code; a negative value means "hold off entering state -offset" */
286     int count; /* Count for repeats */
287     int data; /* Extra data for some states, e.g. characters still to skip while offset is negative */
288     } stateblock;
289    
/* Number of ints occupied by one stateblock. internal_dfa_exec() uses it to
turn the workspace size (given in ints) into the number of states that fit in
each of the two state lists. */
290 ph10 960 #define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))
291 nigel 77
292    
#ifdef PCRE_DEBUG
/*************************************************
*             Print character string             *
*************************************************/

/* Character string printing function for debugging. Items that are printable
according to isprint() are written as themselves; every other item is written
as a \x{..} hexadecimal escape.

Arguments:
  p           points to string
  length      number of pcre_uchar items to print (not necessarily bytes)
  f           where to print

Returns:      nothing
*/

static void
pchars(const pcre_uchar *p, int length, FILE *f)
{
pcre_uint32 c;
while (length-- > 0)
  {
  c = *(p++);

  /* isprint() is only defined for arguments representable as unsigned char
  (or EOF). The item is held in a 32-bit variable and pcre_uchar is presumably
  wider than 8 bits in some builds, so range-check before calling it. */

  if (c < 256 && isprint((int)c))
    fprintf(f, "%c", (int)c);
  else
    fprintf(f, "\\x{%02x}", (unsigned int)c);
  }
}
#endif
321    
322    
323    
324     /*************************************************
325     * Execute a Regular Expression - DFA engine *
326     *************************************************/
327    
328     /* This internal function applies a compiled pattern to a subject string,
329     starting at a given point, using a DFA engine. This function is called from the
330     external one, possibly multiple times if the pattern is not anchored. The
331     function calls itself recursively for some kinds of subpattern.
332    
333     Arguments:
334     md the match_data block with fixed information
335     this_start_code the opening bracket of this subexpression's code
336     current_subject where we currently are in the subject string
337     start_offset start offset in the subject string
338     offsets vector to contain the matching string offsets
339     offsetcount size of same
340     workspace vector of workspace
341     wscount size of same
342     rlevel function call recursion level
343    
344 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
345 ph10 341 = 0 => offsets overflowed; longest matches are present
346 nigel 77 -1 => failed to match
347     < -1 => some kind of unexpected problem
348    
349     The following macros are used for adding states to the two state vectors (one
350     for the current character, one for the following character). */
351    
/* Append a state with opcode offset (x) and repeat count (y) to the list of
active states. If the list is already full, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. The data field of the new entry is not written. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
361    
/* Append a state with opcode offset (x), repeat count (y) and extra data (z)
to the list of active states. If the list is already full, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
372    
/* Append a state with opcode offset (x) and repeat count (y) to the list of
states for the following character. If the list is already full, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. The data field of the new entry is not
written. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
382    
/* Append a state with opcode offset (x), repeat count (y) and extra data (z)
to the list of states for the following character. If the list is already
full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The debug trace
also records the source line of the invocation. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d) line %d\n", rlevel*2-2, SP, \
      (x), (y), (z), __LINE__)); \
    } \
  while (0)
394    
395     /* And now, here is the code */
396    
397     static int
398     internal_dfa_exec(
399     dfa_match_data *md,
400 ph10 836 const pcre_uchar *this_start_code,
401     const pcre_uchar *current_subject,
402 nigel 77 int start_offset,
403     int *offsets,
404     int offsetcount,
405     int *workspace,
406     int wscount,
407 ph10 642 int rlevel)
408 nigel 77 {
409     stateblock *active_states, *new_states, *temp_states;
410     stateblock *next_active_state, *next_new_state;
411    
412 ph10 836 const pcre_uint8 *ctypes, *lcc, *fcc;
413     const pcre_uchar *ptr;
414     const pcre_uchar *end_code, *first_op;
415 nigel 77
416 ph10 642 dfa_recursion_info new_recursive;
417    
418 nigel 77 int active_count, new_count, match_count;
419    
420     /* Some fields in the md block are frequently referenced, so we load them into
421     independent variables in the hope that this will perform better. */
422    
423 ph10 836 const pcre_uchar *start_subject = md->start_subject;
424     const pcre_uchar *end_subject = md->end_subject;
425     const pcre_uchar *start_code = md->start_code;
426 nigel 77
427 ph10 836 #ifdef SUPPORT_UTF
428     BOOL utf = (md->poptions & PCRE_UTF8) != 0;
429 nigel 93 #else
430 ph10 836 BOOL utf = FALSE;
431 nigel 87 #endif
432 nigel 77
433 ph10 916 BOOL reset_could_continue = FALSE;
434    
435 nigel 77 rlevel++;
436     offsetcount &= (-2);
437    
438     wscount -= 2;
439     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
440     (2 * INTS_PER_STATEBLOCK);
441    
442     DPRINTF(("\n%.*s---------------------\n"
443 ph10 642 "%.*sCall to internal_dfa_exec f=%d\n",
444     rlevel*2-2, SP, rlevel*2-2, SP, rlevel));
445 nigel 77
446     ctypes = md->tables + ctypes_offset;
447     lcc = md->tables + lcc_offset;
448     fcc = md->tables + fcc_offset;
449    
450     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
451    
452     active_states = (stateblock *)(workspace + 2);
453     next_new_state = new_states = active_states + wscount;
454     new_count = 0;
455    
456 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
457 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
458 ph10 836 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
459     ? IMM2_SIZE:0);
460 nigel 93
461 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
462     the alternative states onto the list, and find out where the end is. This
463     makes it possible to use this function recursively, when we want to stop at a
464     matching internal ket rather than at the end.
465    
466     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
467     a backward assertion. In that case, we have to find out the maximum amount to
468     move back, and set up each alternative appropriately. */
469    
470 nigel 93 if (*first_op == OP_REVERSE)
471 nigel 77 {
472     int max_back = 0;
473     int gone_back;
474    
475     end_code = this_start_code;
476     do
477     {
478     int back = GET(end_code, 2+LINK_SIZE);
479     if (back > max_back) max_back = back;
480     end_code += GET(end_code, 1);
481     }
482     while (*end_code == OP_ALT);
483    
484     /* If we can't go back the amount required for the longest lookbehind
485     pattern, go back as far as we can; some alternatives may still be viable. */
486    
487 ph10 836 #ifdef SUPPORT_UTF
488 nigel 77 /* In character mode we have to step back character by character */
489    
490 ph10 836 if (utf)
491 nigel 77 {
492     for (gone_back = 0; gone_back < max_back; gone_back++)
493     {
494     if (current_subject <= start_subject) break;
495     current_subject--;
496 ph10 836 ACROSSCHAR(current_subject > start_subject, *current_subject, current_subject--);
497 nigel 77 }
498     }
499     else
500     #endif
501    
502     /* In byte-mode we can do this quickly. */
503    
504     {
505     gone_back = (current_subject - max_back < start_subject)?
506 ph10 530 (int)(current_subject - start_subject) : max_back;
507 nigel 77 current_subject -= gone_back;
508     }
509 ph10 461
510 ph10 435 /* Save the earliest consulted character */
511 nigel 77
512 ph10 461 if (current_subject < md->start_used_ptr)
513     md->start_used_ptr = current_subject;
514    
515 nigel 77 /* Now we can process the individual branches. */
516    
517     end_code = this_start_code;
518     do
519     {
520     int back = GET(end_code, 2+LINK_SIZE);
521     if (back <= gone_back)
522     {
523 ph10 530 int bstate = (int)(end_code - start_code + 2 + 2*LINK_SIZE);
524 nigel 77 ADD_NEW_DATA(-bstate, 0, gone_back - back);
525     }
526     end_code += GET(end_code, 1);
527     }
528     while (*end_code == OP_ALT);
529     }
530    
531     /* This is the code for a "normal" subpattern (not a backward assertion). The
532     start of a whole pattern is always one of these. If we are at the top level,
533     we may be asked to restart matching from the same point that we reached for a
534     previous partial match. We still have to scan through the top-level branches to
535     find the end state. */
536    
537     else
538     {
539     end_code = this_start_code;
540    
541     /* Restarting */
542    
543     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
544     {
545     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
546     new_count = workspace[1];
547     if (!workspace[0])
548     memcpy(new_states, active_states, new_count * sizeof(stateblock));
549     }
550    
551     /* Not restarting */
552    
553     else
554     {
555 nigel 93 int length = 1 + LINK_SIZE +
556 ph10 604 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
557 ph10 836 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
558     ? IMM2_SIZE:0);
559 nigel 77 do
560     {
561 ph10 530 ADD_NEW((int)(end_code - start_code + length), 0);
562 nigel 77 end_code += GET(end_code, 1);
563 nigel 93 length = 1 + LINK_SIZE;
564 nigel 77 }
565     while (*end_code == OP_ALT);
566     }
567     }
568    
569     workspace[0] = 0; /* Bit indicating which vector is current */
570    
571 ph10 836 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, (int)(end_code - start_code)));
572 nigel 77
573     /* Loop for scanning the subject */
574    
575     ptr = current_subject;
576     for (;;)
577     {
578     int i, j;
579 nigel 91 int clen, dlen;
580 chpe 1084 pcre_uint32 c, d;
581 ph10 428 int forced_fail = 0;
582 ph10 975 BOOL partial_newline = FALSE;
583 ph10 916 BOOL could_continue = reset_could_continue;
584 ph10 975 reset_could_continue = FALSE;
585    
586 nigel 77 /* Make the new state list into the active state list and empty the
587     new state list. */
588    
589     temp_states = active_states;
590     active_states = new_states;
591     new_states = temp_states;
592     active_count = new_count;
593     new_count = 0;
594    
595     workspace[0] ^= 1; /* Remember for the restarting feature */
596     workspace[1] = active_count;
597    
598 ph10 475 #ifdef PCRE_DEBUG
599 nigel 77 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
600 ph10 836 pchars(ptr, STRLEN_UC(ptr), stdout);
601 nigel 77 printf("\"\n");
602    
603     printf("%.*sActive states: ", rlevel*2-2, SP);
604     for (i = 0; i < active_count; i++)
605     printf("%d/%d ", active_states[i].offset, active_states[i].count);
606     printf("\n");
607     #endif
608    
609     /* Set the pointers for adding new states */
610    
611     next_active_state = active_states + active_count;
612     next_new_state = new_states;
613    
614     /* Load the current character from the subject outside the loop, as many
615     different states may want to look at it, and we assume that at least one
616     will. */
617    
618     if (ptr < end_subject)
619     {
620 ph10 979 clen = 1; /* Number of data items in the character */
621 ph10 836 #ifdef SUPPORT_UTF
622 chpe 1100 GETCHARLENTEST(c, ptr, clen);
623     #else
624     c = *ptr;
625 ph10 836 #endif /* SUPPORT_UTF */
626 nigel 77 }
627     else
628     {
629 nigel 93 clen = 0; /* This indicates the end of the subject */
630     c = NOTACHAR; /* This value should never actually be used */
631 nigel 77 }
632    
633     /* Scan up the active states and act on each one. The result of an action
634     may be to add more states to the currently active list (e.g. on hitting a
635     parenthesis) or it may be to put states on the new list, for considering
636     when we move the character pointer on. */
637    
638     for (i = 0; i < active_count; i++)
639     {
640     stateblock *current_state = active_states + i;
641 ph10 654 BOOL caseless = FALSE;
642 ph10 836 const pcre_uchar *code;
643 nigel 77 int state_offset = current_state->offset;
644 ph10 1144 int codevalue, rrc;
645 ph10 1334 int count;
646 nigel 77
647 ph10 475 #ifdef PCRE_DEBUG
648 nigel 77 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
649 nigel 93 if (clen == 0) printf("EOL\n");
650 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
651     else printf("0x%02x\n", c);
652     #endif
653    
654     /* A negative offset is a special case meaning "hold off going to this
655     (negated) state until the number of characters in the data field have
656 ph10 975 been skipped". If the could_continue flag was passed over from a previous
657 ph10 916 state, arrange for it to be passed on. */
658 nigel 77
659     if (state_offset < 0)
660     {
661     if (current_state->data > 0)
662     {
663     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
664     ADD_NEW_DATA(state_offset, current_state->count,
665     current_state->data - 1);
666 ph10 916 if (could_continue) reset_could_continue = TRUE;
667 nigel 77 continue;
668     }
669     else
670     {
671     current_state->offset = state_offset = -state_offset;
672     }
673     }
674    
675 ph10 461 /* Check for a duplicate state with the same count, and skip if found.
676 ph10 439 See the note at the head of this module about the possibility of improving
677     performance here. */
678 nigel 77
679     for (j = 0; j < i; j++)
680     {
681     if (active_states[j].offset == state_offset &&
682     active_states[j].count == current_state->count)
683     {
684     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
685     goto NEXT_ACTIVE_STATE;
686     }
687     }
688    
689     /* The state offset is the offset to the opcode */
690    
691     code = start_code + state_offset;
692     codevalue = *code;
693    
694 ph10 463 /* If this opcode inspects a character, but we are at the end of the
695     subject, remember the fact for use when testing for a partial match. */
696    
697 ph10 462 if (clen == 0 && poptable[codevalue] != 0)
698 ph10 463 could_continue = TRUE;
699 ph10 462
700 nigel 77 /* If this opcode is followed by an inline character, load it. It is
701     tempting to test for the presence of a subject character here, but that
702     is wrong, because sometimes zero repetitions of the subject are
703     permitted.
704    
705     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
706 ph10 975 argument that is not a data character - but is always one byte long because
707 ph10 925 the values are small. We have to take special action to deal with \P, \p,
708     \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
709     these ones to new opcodes. */
710 nigel 77
711     if (coptable[codevalue] > 0)
712     {
713     dlen = 1;
714 ph10 836 #ifdef SUPPORT_UTF
715     if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
716     #endif /* SUPPORT_UTF */
717 nigel 77 d = code[coptable[codevalue]];
718     if (codevalue >= OP_TYPESTAR)
719     {
720 nigel 93 switch(d)
721     {
722     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
723     case OP_NOTPROP:
724     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
725     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
726     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
727 ph10 178 case OP_NOT_HSPACE:
728 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
729 ph10 178 case OP_NOT_VSPACE:
730 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
731 nigel 93 default: break;
732     }
733 nigel 77 }
734     }
735     else
736     {
737     dlen = 0; /* Not strictly necessary, but compilers moan */
738 nigel 93 d = NOTACHAR; /* if these variables are not set. */
739 nigel 77 }
740    
741    
742     /* Now process the individual opcodes */
743    
744     switch (codevalue)
745     {
746 ph10 498 /* ========================================================================== */
747     /* These cases are never obeyed. This is a fudge that causes a compile-
748     time error if the vectors coptable or poptable, which are indexed by
749     opcode, are not the correct length. It seems to be the only way to do
750     such a check at compile time, as the sizeof() operator does not work
751     in the C preprocessor. */
752 ph10 507
753 ph10 498 case OP_TABLE_LENGTH:
754 ph10 507 case OP_TABLE_LENGTH +
755 ph10 498 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
756     (sizeof(poptable) == OP_TABLE_LENGTH)):
757 ph10 507 break;
758 nigel 77
759     /* ========================================================================== */
760     /* Reached a closing bracket. If not at the end of the pattern, carry
761 ph10 654 on with the next opcode. For repeating opcodes, also add the repeat
762     state. Note that KETRPOS will always be encountered at the end of the
763     subpattern, because the possessive subpattern repeats are always handled
764 ph10 604 using recursive calls. Thus, it never adds any new states.
765 ph10 654
766 ph10 604 At the end of the (sub)pattern, unless we have an empty string and
767 ph10 461 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
768 ph10 442 start of the subject, save the match data, shifting up all previous
769 nigel 77 matches so we always have the longest first. */
770    
771     case OP_KET:
772     case OP_KETRMIN:
773     case OP_KETRMAX:
774 ph10 654 case OP_KETRPOS:
775 nigel 77 if (code != end_code)
776     {
777     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
778     if (codevalue != OP_KET)
779     {
780     ADD_ACTIVE(state_offset - GET(code, 1), 0);
781     }
782     }
783 ph10 461 else
784 nigel 77 {
785 ph10 461 if (ptr > current_subject ||
786 ph10 442 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
787     ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
788     current_subject > start_subject + md->start_offset)))
789 nigel 77 {
790 ph10 428 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
791 ph10 680 else if (match_count > 0 && ++match_count * 2 > offsetcount)
792 ph10 428 match_count = 0;
793     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
794     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
795     if (offsetcount >= 2)
796     {
797 ph10 530 offsets[0] = (int)(current_subject - start_subject);
798     offsets[1] = (int)(ptr - start_subject);
799 ph10 428 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
800 ph10 979 offsets[1] - offsets[0], (char *)current_subject));
801 ph10 428 }
802     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
803     {
804     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
805     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
806     match_count, rlevel*2-2, SP));
807     return match_count;
808     }
809 ph10 461 }
810 nigel 77 }
811     break;
812    
813     /* ========================================================================== */
814     /* These opcodes add to the current list of states without looking
815     at the current character. */
816    
817     /*-----------------------------------------------------------------*/
818     case OP_ALT:
819     do { code += GET(code, 1); } while (*code == OP_ALT);
820 ph10 530 ADD_ACTIVE((int)(code - start_code), 0);
821 nigel 77 break;
822    
823     /*-----------------------------------------------------------------*/
824     case OP_BRA:
825 nigel 93 case OP_SBRA:
826 nigel 77 do
827     {
828 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
829 nigel 77 code += GET(code, 1);
830     }
831     while (*code == OP_ALT);
832     break;
833    
834     /*-----------------------------------------------------------------*/
835 nigel 93 case OP_CBRA:
836     case OP_SCBRA:
837 ph10 836 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
838 nigel 93 code += GET(code, 1);
839     while (*code == OP_ALT)
840     {
841 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
842 nigel 93 code += GET(code, 1);
843     }
844     break;
845    
846     /*-----------------------------------------------------------------*/
847 nigel 77 case OP_BRAZERO:
848     case OP_BRAMINZERO:
849     ADD_ACTIVE(state_offset + 1, 0);
850     code += 1 + GET(code, 2);
851     while (*code == OP_ALT) code += GET(code, 1);
852 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
853 nigel 77 break;
854    
855     /*-----------------------------------------------------------------*/
856 ph10 335 case OP_SKIPZERO:
857     code += 1 + GET(code, 2);
858     while (*code == OP_ALT) code += GET(code, 1);
859 ph10 530 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
860 ph10 335 break;
861    
862     /*-----------------------------------------------------------------*/
863 nigel 77 case OP_CIRC:
864 ph10 602 if (ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0)
865     { ADD_ACTIVE(state_offset + 1, 0); }
866     break;
867    
868     /*-----------------------------------------------------------------*/
869     case OP_CIRCM:
870 nigel 77 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
871 ph10 602 (ptr != end_subject && WAS_NEWLINE(ptr)))
872 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
873     break;
874    
875     /*-----------------------------------------------------------------*/
876     case OP_EOD:
877 ph10 579 if (ptr >= end_subject)
878     {
879 ph10 553 if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
880     could_continue = TRUE;
881     else { ADD_ACTIVE(state_offset + 1, 0); }
882     }
883 nigel 77 break;
884    
885     /*-----------------------------------------------------------------*/
886     case OP_SOD:
887     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
888     break;
889    
890     /*-----------------------------------------------------------------*/
891     case OP_SOM:
892     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
893     break;
894    
895    
896     /* ========================================================================== */
897     /* These opcodes inspect the next subject character, and sometimes
898     the previous one as well, but do not have an argument. The variable
899     clen contains the length of the current character and is zero if we are
900     at the end of the subject. */
901    
902     /*-----------------------------------------------------------------*/
903     case OP_ANY:
904 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
905 ph10 975 {
906 ph10 919 if (ptr + 1 >= md->end_subject &&
907     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
908     NLBLOCK->nltype == NLTYPE_FIXED &&
909 ph10 975 NLBLOCK->nllen == 2 &&
910 ph10 919 c == NLBLOCK->nl[0])
911     {
912 ph10 975 could_continue = partial_newline = TRUE;
913     }
914 ph10 919 else
915 ph10 975 {
916     ADD_NEW(state_offset + 1, 0);
917     }
918 ph10 919 }
919 nigel 77 break;
920    
921     /*-----------------------------------------------------------------*/
922 ph10 341 case OP_ALLANY:
923     if (clen > 0)
924     { ADD_NEW(state_offset + 1, 0); }
925     break;
926    
927     /*-----------------------------------------------------------------*/
928 nigel 77 case OP_EODN:
929 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
930     could_continue = TRUE;
931     else if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
932 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
933     break;
934    
935     /*-----------------------------------------------------------------*/
936     case OP_DOLL:
937     if ((md->moptions & PCRE_NOTEOL) == 0)
938     {
939 ph10 553 if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
940     could_continue = TRUE;
941     else if (clen == 0 ||
942 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
943 ph10 602 (ptr == end_subject - md->nllen)
944 nigel 91 ))
945 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
946 ph10 916 else if (ptr + 1 >= md->end_subject &&
947     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
948     NLBLOCK->nltype == NLTYPE_FIXED &&
949 ph10 975 NLBLOCK->nllen == 2 &&
950 ph10 916 c == NLBLOCK->nl[0])
951     {
952     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
953     {
954     reset_could_continue = TRUE;
955 ph10 975 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
956     }
957     else could_continue = partial_newline = TRUE;
958     }
959 nigel 77 }
960 ph10 602 break;
961    
962     /*-----------------------------------------------------------------*/
963     case OP_DOLLM:
964     if ((md->moptions & PCRE_NOTEOL) == 0)
965     {
966     if (clen == 0 && (md->moptions & PCRE_PARTIAL_HARD) != 0)
967     could_continue = TRUE;
968     else if (clen == 0 ||
969     ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
970     { ADD_ACTIVE(state_offset + 1, 0); }
971 ph10 916 else if (ptr + 1 >= md->end_subject &&
972     (md->moptions & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) != 0 &&
973     NLBLOCK->nltype == NLTYPE_FIXED &&
974 ph10 975 NLBLOCK->nllen == 2 &&
975 ph10 916 c == NLBLOCK->nl[0])
976     {
977     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
978     {
979     reset_could_continue = TRUE;
980 ph10 975 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
981     }
982     else could_continue = partial_newline = TRUE;
983     }
984 ph10 602 }
985     else if (IS_NEWLINE(ptr))
986 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
987     break;
988    
989     /*-----------------------------------------------------------------*/
990    
991     case OP_DIGIT:
992     case OP_WHITESPACE:
993     case OP_WORDCHAR:
994     if (clen > 0 && c < 256 &&
995     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
996     { ADD_NEW(state_offset + 1, 0); }
997     break;
998    
999     /*-----------------------------------------------------------------*/
1000     case OP_NOT_DIGIT:
1001     case OP_NOT_WHITESPACE:
1002     case OP_NOT_WORDCHAR:
1003     if (clen > 0 && (c >= 256 ||
1004     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1005     { ADD_NEW(state_offset + 1, 0); }
1006     break;
1007    
1008     /*-----------------------------------------------------------------*/
1009     case OP_WORD_BOUNDARY:
1010     case OP_NOT_WORD_BOUNDARY:
1011     {
1012     int left_word, right_word;
1013    
1014     if (ptr > start_subject)
1015     {
1016 ph10 836 const pcre_uchar *temp = ptr - 1;
1017 ph10 461 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
1018 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
1019 ph10 836 if (utf) { BACKCHAR(temp); }
1020 nigel 77 #endif
1021     GETCHARTEST(d, temp);
1022 ph10 535 #ifdef SUPPORT_UCP
1023 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
1024     {
1025     if (d == '_') left_word = TRUE; else
1026 ph10 535 {
1027 ph10 518 int cat = UCD_CATEGORY(d);
1028     left_word = (cat == ucp_L || cat == ucp_N);
1029 ph10 535 }
1030     }
1031     else
1032     #endif
1033 nigel 77 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1034     }
1035 ph10 518 else left_word = FALSE;
1036 nigel 77
1037 ph10 461 if (clen > 0)
1038 ph10 535 {
1039     #ifdef SUPPORT_UCP
1040 ph10 518 if ((md->poptions & PCRE_UCP) != 0)
1041     {
1042     if (c == '_') right_word = TRUE; else
1043 ph10 535 {
1044 ph10 518 int cat = UCD_CATEGORY(c);
1045     right_word = (cat == ucp_L || cat == ucp_N);
1046 ph10 535 }
1047     }
1048     else
1049     #endif
1050 ph10 428 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1051 ph10 535 }
1052 ph10 518 else right_word = FALSE;
1053 nigel 77
1054     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
1055     { ADD_ACTIVE(state_offset + 1, 0); }
1056     }
1057     break;
1058    
1059    
1060     /*-----------------------------------------------------------------*/
1061     /* Check the next character by Unicode property. We will get here only
1062     if the support is in the binary; otherwise a compile-time error occurs.
1063     */
1064    
1065 ph10 151 #ifdef SUPPORT_UCP
1066 nigel 77 case OP_PROP:
1067     case OP_NOTPROP:
1068     if (clen > 0)
1069     {
1070 nigel 87 BOOL OK;
1071 ph10 1221 const pcre_uint32 *cp;
1072 ph10 349 const ucd_record * prop = GET_UCD(c);
1073 nigel 87 switch(code[1])
1074 nigel 77 {
1075 nigel 87 case PT_ANY:
1076     OK = TRUE;
1077     break;
1078    
1079     case PT_LAMP:
1080 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1081 ph10 517 prop->chartype == ucp_Lt;
1082 nigel 87 break;
1083    
1084     case PT_GC:
1085 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1086 nigel 87 break;
1087    
1088     case PT_PC:
1089 ph10 349 OK = prop->chartype == code[2];
1090 nigel 87 break;
1091    
1092     case PT_SC:
1093 ph10 349 OK = prop->script == code[2];
1094 nigel 87 break;
1095 ph10 535
1096 ph10 517 /* These are specials for combination cases. */
1097 ph10 535
1098 ph10 517 case PT_ALNUM:
1099 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1100     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1101 ph10 535 break;
1102    
1103 ph10 1364 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1104     which means that Perl space and POSIX space are now identical. PCRE
1105     was changed at release 8.34. */
1106 ph10 1379
1107 ph10 517 case PT_SPACE: /* Perl space */
1108     case PT_PXSPACE: /* POSIX space */
1109 ph10 1376 switch(c)
1110     {
1111     HSPACE_CASES:
1112     VSPACE_CASES:
1113     OK = TRUE;
1114     break;
1115 ph10 1379
1116     default:
1117 ph10 1376 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1118     break;
1119 ph10 1379 }
1120 ph10 535 break;
1121    
1122 ph10 517 case PT_WORD:
1123 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1124     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1125 ph10 517 c == CHAR_UNDERSCORE;
1126 ph10 535 break;
1127 ph10 1221
1128 ph10 1046 case PT_CLIST:
1129 ph10 1218 cp = PRIV(ucd_caseless_sets) + code[2];
1130 ph10 1046 for (;;)
1131     {
1132     if (c < *cp) { OK = FALSE; break; }
1133     if (c == *cp++) { OK = TRUE; break; }
1134 ph10 1221 }
1135     break;
1136 ph10 1320
1137 ph10 1260 case PT_UCNC:
1138     OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1139 ph10 1320 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1140 ph10 1260 c >= 0xe000;
1141 ph10 1320 break;
1142 nigel 87
1143     /* Should never occur, but keep compilers from grumbling. */
1144    
1145     default:
1146     OK = codevalue != OP_PROP;
1147     break;
1148 nigel 77 }
1149 nigel 87
1150     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1151 nigel 77 }
1152     break;
1153     #endif
1154    
1155    
1156    
1157     /* ========================================================================== */
1158     /* These opcodes likewise inspect the subject character, but have an
1159     argument that is not a data character. It is one of these opcodes:
1160 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1161     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1162 nigel 77
1163     case OP_TYPEPLUS:
1164     case OP_TYPEMINPLUS:
1165 nigel 93 case OP_TYPEPOSPLUS:
1166 nigel 77 count = current_state->count; /* Already matched */
1167     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1168     if (clen > 0)
1169     {
1170 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1171     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1172     NLBLOCK->nltype == NLTYPE_FIXED &&
1173 ph10 975 NLBLOCK->nllen == 2 &&
1174 ph10 919 c == NLBLOCK->nl[0])
1175     {
1176 ph10 975 could_continue = partial_newline = TRUE;
1177     }
1178 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1179 nigel 77 (c < 256 &&
1180 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1181 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1182     {
1183 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1184     {
1185     active_count--; /* Remove non-match possibility */
1186     next_active_state--;
1187     }
1188 nigel 77 count++;
1189     ADD_NEW(state_offset, count);
1190     }
1191     }
1192     break;
1193    
1194     /*-----------------------------------------------------------------*/
1195     case OP_TYPEQUERY:
1196     case OP_TYPEMINQUERY:
1197 nigel 93 case OP_TYPEPOSQUERY:
1198 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1199     if (clen > 0)
1200     {
1201 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1202     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1203     NLBLOCK->nltype == NLTYPE_FIXED &&
1204 ph10 975 NLBLOCK->nllen == 2 &&
1205 ph10 919 c == NLBLOCK->nl[0])
1206     {
1207 ph10 975 could_continue = partial_newline = TRUE;
1208     }
1209 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1210 nigel 77 (c < 256 &&
1211 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1212 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1213     {
1214 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
1215     {
1216     active_count--; /* Remove non-match possibility */
1217     next_active_state--;
1218     }
1219 nigel 77 ADD_NEW(state_offset + 2, 0);
1220     }
1221     }
1222     break;
1223    
1224     /*-----------------------------------------------------------------*/
1225     case OP_TYPESTAR:
1226     case OP_TYPEMINSTAR:
1227 nigel 93 case OP_TYPEPOSSTAR:
1228 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
1229     if (clen > 0)
1230     {
1231 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1232     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1233     NLBLOCK->nltype == NLTYPE_FIXED &&
1234 ph10 975 NLBLOCK->nllen == 2 &&
1235 ph10 919 c == NLBLOCK->nl[0])
1236     {
1237 ph10 975 could_continue = partial_newline = TRUE;
1238     }
1239 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1240 nigel 77 (c < 256 &&
1241 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1242 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1243     {
1244 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
1245     {
1246     active_count--; /* Remove non-match possibility */
1247     next_active_state--;
1248     }
1249 nigel 77 ADD_NEW(state_offset, 0);
1250     }
1251     }
1252     break;
1253    
1254     /*-----------------------------------------------------------------*/
1255     case OP_TYPEEXACT:
1256 nigel 93 count = current_state->count; /* Number already matched */
1257     if (clen > 0)
1258     {
1259 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1260     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1261     NLBLOCK->nltype == NLTYPE_FIXED &&
1262 ph10 975 NLBLOCK->nllen == 2 &&
1263 ph10 919 c == NLBLOCK->nl[0])
1264     {
1265 ph10 975 could_continue = partial_newline = TRUE;
1266     }
1267 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1268 nigel 93 (c < 256 &&
1269 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1270 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1271     {
1272 ph10 1334 if (++count >= (int)GET2(code, 1))
1273 ph10 836 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1274 nigel 93 else
1275     { ADD_NEW(state_offset, count); }
1276     }
1277     }
1278     break;
1279    
1280     /*-----------------------------------------------------------------*/
1281 nigel 77 case OP_TYPEUPTO:
1282     case OP_TYPEMINUPTO:
1283 nigel 93 case OP_TYPEPOSUPTO:
1284 ph10 836 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1285 nigel 77 count = current_state->count; /* Number already matched */
1286     if (clen > 0)
1287     {
1288 ph10 919 if (d == OP_ANY && ptr + 1 >= md->end_subject &&
1289     (md->moptions & (PCRE_PARTIAL_HARD)) != 0 &&
1290     NLBLOCK->nltype == NLTYPE_FIXED &&
1291 ph10 975 NLBLOCK->nllen == 2 &&
1292 ph10 919 c == NLBLOCK->nl[0])
1293     {
1294 ph10 975 could_continue = partial_newline = TRUE;
1295     }
1296 ph10 919 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1297 nigel 77 (c < 256 &&
1298 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1299 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1300     {
1301 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
1302     {
1303     active_count--; /* Remove non-match possibility */
1304     next_active_state--;
1305     }
1306 ph10 1334 if (++count >= (int)GET2(code, 1))
1307 ph10 836 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1308 nigel 77 else
1309     { ADD_NEW(state_offset, count); }
1310     }
1311     }
1312     break;
1313    
1314     /* ========================================================================== */
1315     /* These are virtual opcodes that are used when something like
1316 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
1317     argument. It keeps the code above fast for the other cases. The argument
1318     is in the d variable. */
1319 nigel 77
1320 ph10 151 #ifdef SUPPORT_UCP
1321 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1322     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1323 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1324 nigel 77 count = current_state->count; /* Already matched */
1325 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1326 nigel 77 if (clen > 0)
1327     {
1328 nigel 87 BOOL OK;
1329 ph10 1221 const pcre_uint32 *cp;
1330 ph10 349 const ucd_record * prop = GET_UCD(c);
1331 nigel 87 switch(code[2])
1332     {
1333     case PT_ANY:
1334     OK = TRUE;
1335     break;
1336    
1337     case PT_LAMP:
1338 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1339 ph10 517 prop->chartype == ucp_Lt;
1340 nigel 87 break;
1341    
1342     case PT_GC:
1343 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1344 nigel 87 break;
1345    
1346     case PT_PC:
1347 ph10 349 OK = prop->chartype == code[3];
1348 nigel 87 break;
1349    
1350     case PT_SC:
1351 ph10 349 OK = prop->script == code[3];
1352 nigel 87 break;
1353    
1354 ph10 517 /* These are specials for combination cases. */
1355 ph10 535
1356 ph10 517 case PT_ALNUM:
1357 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1358     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1359 ph10 535 break;
1360    
1361 ph10 1364 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1362     which means that Perl space and POSIX space are now identical. PCRE
1363     was changed at release 8.34. */
1364 ph10 1379
1365 ph10 517 case PT_SPACE: /* Perl space */
1366     case PT_PXSPACE: /* POSIX space */
1367 ph10 1376 switch(c)
1368     {
1369     HSPACE_CASES:
1370     VSPACE_CASES:
1371     OK = TRUE;
1372     break;
1373 ph10 1379
1374     default:
1375 ph10 1376 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1376     break;
1377 ph10 1379 }
1378 ph10 535 break;
1379    
1380 ph10 517 case PT_WORD:
1381 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1382     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1383 ph10 517 c == CHAR_UNDERSCORE;
1384 ph10 535 break;
1385 ph10 517
1386 ph10 1046 case PT_CLIST:
1387 ph10 1218 cp = PRIV(ucd_caseless_sets) + code[3];
1388 ph10 1046 for (;;)
1389     {
1390     if (c < *cp) { OK = FALSE; break; }
1391     if (c == *cp++) { OK = TRUE; break; }
1392 ph10 1221 }
1393     break;
1394 ph10 1046
1395 ph10 1260 case PT_UCNC:
1396     OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1397 ph10 1320 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1398 ph10 1260 c >= 0xe000;
1399 ph10 1320 break;
1400 ph10 1260
1401 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1402    
1403     default:
1404     OK = codevalue != OP_PROP;
1405     break;
1406     }
1407    
1408 nigel 93 if (OK == (d == OP_PROP))
1409     {
1410     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1411     {
1412     active_count--; /* Remove non-match possibility */
1413     next_active_state--;
1414     }
1415     count++;
1416     ADD_NEW(state_offset, count);
1417     }
1418 nigel 77 }
1419     break;
1420    
1421     /*-----------------------------------------------------------------*/
1422     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1423     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1424 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1425 nigel 77 count = current_state->count; /* Already matched */
1426     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1427 ph10 1011 if (clen > 0)
1428 nigel 77 {
1429 ph10 1033 int lgb, rgb;
1430 ph10 836 const pcre_uchar *nptr = ptr + clen;
1431 nigel 77 int ncount = 0;
1432 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1433     {
1434     active_count--; /* Remove non-match possibility */
1435     next_active_state--;
1436     }
1437 ph10 1033 lgb = UCD_GRAPHBREAK(c);
1438 nigel 77 while (nptr < end_subject)
1439     {
1440 ph10 1011 dlen = 1;
1441     if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1442 ph10 1033 rgb = UCD_GRAPHBREAK(d);
1443 ph10 1015 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1444 nigel 77 ncount++;
1445 ph10 1033 lgb = rgb;
1446 ph10 1011 nptr += dlen;
1447 nigel 77 }
1448     count++;
1449     ADD_NEW_DATA(-state_offset, count, ncount);
1450     }
1451     break;
1452 ph10 151 #endif
1453 nigel 77
1454     /*-----------------------------------------------------------------*/
1455 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1456     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1457     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1458     count = current_state->count; /* Already matched */
1459     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1460     if (clen > 0)
1461     {
1462     int ncount = 0;
1463     switch (c)
1464     {
1465 ph10 1033 case CHAR_VT:
1466     case CHAR_FF:
1467     case CHAR_NEL:
1468     #ifndef EBCDIC
1469 nigel 93 case 0x2028:
1470     case 0x2029:
1471 ph10 1033 #endif /* Not EBCDIC */
1472 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1473     goto ANYNL01;
1474    
1475 ph10 1033 case CHAR_CR:
1476 ph10 1431 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1477 ph10 231 /* Fall through */
1478    
1479     ANYNL01:
1480 ph10 1033 case CHAR_LF:
1481 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1482     {
1483     active_count--; /* Remove non-match possibility */
1484     next_active_state--;
1485     }
1486     count++;
1487     ADD_NEW_DATA(-state_offset, count, ncount);
1488     break;
1489 ph10 231
1490 nigel 93 default:
1491     break;
1492     }
1493     }
1494     break;
1495    
1496     /*-----------------------------------------------------------------*/
1497 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1498     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1499     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1500     count = current_state->count; /* Already matched */
1501     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1502     if (clen > 0)
1503     {
1504 ph10 182 BOOL OK;
1505 ph10 178 switch (c)
1506     {
1507 ph10 1221 VSPACE_CASES:
1508 ph10 178 OK = TRUE;
1509 ph10 182 break;
1510 ph10 178
1511     default:
1512     OK = FALSE;
1513 ph10 182 break;
1514 ph10 178 }
1515    
1516     if (OK == (d == OP_VSPACE))
1517 ph10 182 {
1518 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1519     {
1520     active_count--; /* Remove non-match possibility */
1521     next_active_state--;
1522     }
1523     count++;
1524     ADD_NEW_DATA(-state_offset, count, 0);
1525     }
1526     }
1527     break;
1528    
1529     /*-----------------------------------------------------------------*/
1530     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1531     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1532     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1533     count = current_state->count; /* Already matched */
1534     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1535     if (clen > 0)
1536     {
1537 ph10 182 BOOL OK;
1538 ph10 178 switch (c)
1539     {
1540 ph10 1221 HSPACE_CASES:
1541 ph10 178 OK = TRUE;
1542     break;
1543 ph10 182
1544 ph10 178 default:
1545     OK = FALSE;
1546     break;
1547     }
1548 ph10 182
1549 ph10 178 if (OK == (d == OP_HSPACE))
1550 ph10 182 {
1551 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1552     {
1553     active_count--; /* Remove non-match possibility */
1554     next_active_state--;
1555     }
1556     count++;
1557     ADD_NEW_DATA(-state_offset, count, 0);
1558     }
1559     }
1560     break;
1561    
1562     /*-----------------------------------------------------------------*/
1563 ph10 151 #ifdef SUPPORT_UCP
1564 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1565     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1566 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1567 nigel 87 count = 4;
1568 nigel 77 goto QS1;
1569    
1570     case OP_PROP_EXTRA + OP_TYPESTAR:
1571     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1572 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1573 nigel 77 count = 0;
1574    
1575     QS1:
1576    
1577 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1578 nigel 77 if (clen > 0)
1579     {
1580 nigel 87 BOOL OK;
1581 ph10 1221 const pcre_uint32 *cp;
1582 ph10 349 const ucd_record * prop = GET_UCD(c);
1583 nigel 87 switch(code[2])
1584     {
1585     case PT_ANY:
1586     OK = TRUE;
1587     break;
1588    
1589     case PT_LAMP:
1590 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1591 ph10 517 prop->chartype == ucp_Lt;
1592 nigel 87 break;
1593    
1594     case PT_GC:
1595 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1596 nigel 87 break;
1597    
1598     case PT_PC:
1599 ph10 349 OK = prop->chartype == code[3];
1600 nigel 87 break;
1601    
1602     case PT_SC:
1603 ph10 349 OK = prop->script == code[3];
1604 nigel 87 break;
1605 ph10 535
1606 ph10 517 /* These are specials for combination cases. */
1607 ph10 535
1608 ph10 517 case PT_ALNUM:
1609 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1610     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1611 ph10 535 break;
1612    
1613 ph10 1364 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1614     which means that Perl space and POSIX space are now identical. PCRE
1615     was changed at release 8.34. */
1616 ph10 1379
1617 ph10 517 case PT_SPACE: /* Perl space */
1618     case PT_PXSPACE: /* POSIX space */
1619 ph10 1376 switch(c)
1620     {
1621     HSPACE_CASES:
1622     VSPACE_CASES:
1623     OK = TRUE;
1624     break;
1625 ph10 1379
1626     default:
1627 ph10 1376 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1628     break;
1629 ph10 1379 }
1630 ph10 535 break;
1631    
1632 ph10 517 case PT_WORD:
1633 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1634     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1635 ph10 517 c == CHAR_UNDERSCORE;
1636 ph10 535 break;
1637 nigel 87
1638 ph10 1046 case PT_CLIST:
1639 ph10 1218 cp = PRIV(ucd_caseless_sets) + code[3];
1640 ph10 1046 for (;;)
1641     {
1642     if (c < *cp) { OK = FALSE; break; }
1643     if (c == *cp++) { OK = TRUE; break; }
1644 ph10 1221 }
1645     break;
1646 ph10 1046
1647 ph10 1260 case PT_UCNC:
1648     OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1649 ph10 1320 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1650 ph10 1260 c >= 0xe000;
1651 ph10 1320 break;
1652 ph10 1260
1653 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1654    
1655     default:
1656     OK = codevalue != OP_PROP;
1657     break;
1658     }
1659    
1660 nigel 93 if (OK == (d == OP_PROP))
1661     {
1662     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1663     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1664     {
1665     active_count--; /* Remove non-match possibility */
1666     next_active_state--;
1667     }
1668     ADD_NEW(state_offset + count, 0);
1669     }
1670 nigel 77 }
1671     break;
1672    
1673     /*-----------------------------------------------------------------*/
1674     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1675     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1676 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1677 nigel 77 count = 2;
1678     goto QS2;
1679    
1680     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1681     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1682 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1683 nigel 77 count = 0;
1684    
1685     QS2:
1686    
1687     ADD_ACTIVE(state_offset + 2, 0);
1688 ph10 1011 if (clen > 0)
1689 nigel 77 {
1690 ph10 1033 int lgb, rgb;
1691 ph10 836 const pcre_uchar *nptr = ptr + clen;
1692 nigel 77 int ncount = 0;
1693 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1694     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1695     {
1696     active_count--; /* Remove non-match possibility */
1697     next_active_state--;
1698     }
1699 ph10 1033 lgb = UCD_GRAPHBREAK(c);
1700 nigel 77 while (nptr < end_subject)
1701     {
1702 ph10 1011 dlen = 1;
1703     if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1704 ph10 1033 rgb = UCD_GRAPHBREAK(d);
1705 ph10 1015 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1706 nigel 77 ncount++;
1707 ph10 1033 lgb = rgb;
1708 ph10 1011 nptr += dlen;
1709 nigel 77 }
1710     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1711     }
1712     break;
1713 ph10 151 #endif
1714 nigel 77
1715     /*-----------------------------------------------------------------*/
1716 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1717     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1718     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1719     count = 2;
1720     goto QS3;
1721    
1722     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1723     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1724     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1725     count = 0;
1726    
1727     QS3:
1728     ADD_ACTIVE(state_offset + 2, 0);
1729     if (clen > 0)
1730     {
1731     int ncount = 0;
1732     switch (c)
1733     {
1734 ph10 1033 case CHAR_VT:
1735     case CHAR_FF:
1736     case CHAR_NEL:
1737     #ifndef EBCDIC
1738 nigel 93 case 0x2028:
1739     case 0x2029:
1740 ph10 1033 #endif /* Not EBCDIC */
1741 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1742     goto ANYNL02;
1743    
1744 ph10 1033 case CHAR_CR:
1745 ph10 1431 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1746 ph10 231 /* Fall through */
1747    
1748     ANYNL02:
1749 ph10 1033 case CHAR_LF:
1750 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1751     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1752     {
1753     active_count--; /* Remove non-match possibility */
1754     next_active_state--;
1755     }
1756 ph10 1233 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1757 nigel 93 break;
1758 ph10 231
1759 nigel 93 default:
1760     break;
1761     }
1762     }
1763     break;
1764    
1765     /*-----------------------------------------------------------------*/
1766 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1767     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1768     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1769     count = 2;
1770     goto QS4;
1771    
1772     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1773     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1774     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1775     count = 0;
1776    
1777     QS4:
1778     ADD_ACTIVE(state_offset + 2, 0);
1779     if (clen > 0)
1780     {
1781 ph10 182 BOOL OK;
1782 ph10 178 switch (c)
1783     {
1784 ph10 1221 VSPACE_CASES:
1785 ph10 178 OK = TRUE;
1786     break;
1787 ph10 182
1788 ph10 178 default:
1789     OK = FALSE;
1790     break;
1791     }
1792     if (OK == (d == OP_VSPACE))
1793 ph10 182 {
1794 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1795     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1796     {
1797     active_count--; /* Remove non-match possibility */
1798     next_active_state--;
1799     }
1800 ph10 1233 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1801 ph10 178 }
1802     }
1803     break;
1804    
1805     /*-----------------------------------------------------------------*/
1806     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1807     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1808     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1809     count = 2;
1810     goto QS5;
1811    
1812     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1813     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1814     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1815     count = 0;
1816    
1817     QS5:
1818     ADD_ACTIVE(state_offset + 2, 0);
1819     if (clen > 0)
1820     {
1821 ph10 182 BOOL OK;
1822 ph10 178 switch (c)
1823     {
1824 ph10 1221 HSPACE_CASES:
1825 ph10 178 OK = TRUE;
1826     break;
1827 ph10 182
1828 ph10 178 default:
1829     OK = FALSE;
1830     break;
1831     }
1832 ph10 182
1833 ph10 178 if (OK == (d == OP_HSPACE))
1834 ph10 182 {
1835 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1836     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1837     {
1838     active_count--; /* Remove non-match possibility */
1839     next_active_state--;
1840     }
1841 ph10 1233 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1842 ph10 178 }
1843     }
1844     break;
1845    
1846     /*-----------------------------------------------------------------*/
1847 ph10 151 #ifdef SUPPORT_UCP
1848 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1849     case OP_PROP_EXTRA + OP_TYPEUPTO:
1850     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1851 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1852 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1853 ph10 836 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
1854 nigel 77 count = current_state->count; /* Number already matched */
1855     if (clen > 0)
1856     {
1857 nigel 87 BOOL OK;
1858 ph10 1221 const pcre_uint32 *cp;
1859 ph10 349 const ucd_record * prop = GET_UCD(c);
1860 ph10 836 switch(code[1 + IMM2_SIZE + 1])
1861 nigel 77 {
1862 nigel 87 case PT_ANY:
1863     OK = TRUE;
1864     break;
1865    
1866     case PT_LAMP:
1867 ph10 535 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll ||
1868 ph10 517 prop->chartype == ucp_Lt;
1869 nigel 87 break;
1870    
1871     case PT_GC:
1872 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
1873 nigel 87 break;
1874    
1875     case PT_PC:
1876 ph10 836 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
1877 nigel 87 break;
1878    
1879     case PT_SC:
1880 ph10 836 OK = prop->script == code[1 + IMM2_SIZE + 2];
1881 nigel 87 break;
1882 ph10 535
1883 ph10 517 /* These are specials for combination cases. */
1884 ph10 535
1885 ph10 517 case PT_ALNUM:
1886 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1887     PRIV(ucp_gentype)[prop->chartype] == ucp_N;
1888 ph10 535 break;
1889    
1890 ph10 1364 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1891     which means that Perl space and POSIX space are now identical. PCRE
1892     was changed at release 8.34. */
1893 ph10 1379
1894 ph10 517 case PT_SPACE: /* Perl space */
1895     case PT_PXSPACE: /* POSIX space */
1896 ph10 1376 switch(c)
1897     {
1898     HSPACE_CASES:
1899     VSPACE_CASES:
1900     OK = TRUE;
1901     break;
1902 ph10 1379
1903     default:
1904 ph10 1376 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1905     break;
1906 ph10 1379 }
1907 ph10 535 break;
1908    
1909 ph10 517 case PT_WORD:
1910 ph10 836 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
1911     PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1912 ph10 517 c == CHAR_UNDERSCORE;
1913 ph10 535 break;
1914 nigel 87
1915 ph10 1046 case PT_CLIST:
1916 ph10 1218 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
1917 ph10 1046 for (;;)
1918     {
1919     if (c < *cp) { OK = FALSE; break; }
1920     if (c == *cp++) { OK = TRUE; break; }
1921 ph10 1221 }
1922     break;
1923 ph10 1046
1924 ph10 1260 case PT_UCNC:
1925     OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1926 ph10 1320 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1927 ph10 1260 c >= 0xe000;
1928 ph10 1320 break;
1929 ph10 1260
1930 nigel 87 /* Should never occur, but keep compilers from grumbling. */
1931    
1932     default:
1933     OK = codevalue != OP_PROP;
1934     break;
1935     }
1936    
1937     if (OK == (d == OP_PROP))
1938     {
1939 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1940     {
1941     active_count--; /* Remove non-match possibility */
1942     next_active_state--;
1943     }
1944 ph10 1334 if (++count >= (int)GET2(code, 1))
1945 ph10 836 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
1946 nigel 77 else
1947     { ADD_NEW(state_offset, count); }
1948     }
1949     }
1950     break;
1951    
1952     /*-----------------------------------------------------------------*/
1953     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1954     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1955     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1956 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1957 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1958 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1959 nigel 77 count = current_state->count; /* Number already matched */
1960 ph10 1011 if (clen > 0)
1961 nigel 77 {
1962 ph10 1033 int lgb, rgb;
1963 ph10 836 const pcre_uchar *nptr = ptr + clen;
1964 nigel 77 int ncount = 0;
1965 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1966     {
1967     active_count--; /* Remove non-match possibility */
1968     next_active_state--;
1969     }
1970 ph10 1033 lgb = UCD_GRAPHBREAK(c);
1971 nigel 77 while (nptr < end_subject)
1972     {
1973 ph10 1011 dlen = 1;
1974     if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
1975 ph10 1033 rgb = UCD_GRAPHBREAK(d);
1976 ph10 1015 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
1977 nigel 77 ncount++;
1978 ph10 1033 lgb = rgb;
1979 ph10 1011 nptr += dlen;
1980 nigel 77 }
1981 ph10 975 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
1982     reset_could_continue = TRUE;
1983 ph10 1334 if (++count >= (int)GET2(code, 1))
1984 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
1985 nigel 77 else
1986     { ADD_NEW_DATA(-state_offset, count, ncount); }
1987     }
1988     break;
1989 ph10 151 #endif
1990 nigel 77
1991 nigel 93 /*-----------------------------------------------------------------*/
1992     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1993     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1994     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1995     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1996     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1997 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
1998 nigel 93 count = current_state->count; /* Number already matched */
1999     if (clen > 0)
2000     {
2001     int ncount = 0;
2002     switch (c)
2003     {
2004 ph10 1033 case CHAR_VT:
2005     case CHAR_FF:
2006     case CHAR_NEL:
2007     #ifndef EBCDIC
2008 nigel 93 case 0x2028:
2009     case 0x2029:
2010 ph10 1033 #endif /* Not EBCDIC */
2011 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2012     goto ANYNL03;
2013    
2014 ph10 1033 case CHAR_CR:
2015 ph10 1431 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2016 ph10 231 /* Fall through */
2017    
2018     ANYNL03:
2019 ph10 1033 case CHAR_LF:
2020 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2021     {
2022     active_count--; /* Remove non-match possibility */
2023     next_active_state--;
2024     }
2025 ph10 1334 if (++count >= (int)GET2(code, 1))
2026 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2027 nigel 93 else
2028     { ADD_NEW_DATA(-state_offset, count, ncount); }
2029     break;
2030 ph10 231
2031 nigel 93 default:
2032     break;
2033     }
2034     }
2035     break;
2036    
2037 ph10 178 /*-----------------------------------------------------------------*/
2038     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
2039     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
2040     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
2041     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
2042     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2043 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2044 ph10 178 count = current_state->count; /* Number already matched */
2045     if (clen > 0)
2046     {
2047 ph10 182 BOOL OK;
2048 ph10 178 switch (c)
2049     {
2050 ph10 1221 VSPACE_CASES:
2051 ph10 178 OK = TRUE;
2052     break;
2053 ph10 182
2054 ph10 178 default:
2055     OK = FALSE;
2056     }
2057 ph10 182
2058 ph10 178 if (OK == (d == OP_VSPACE))
2059 ph10 182 {
2060 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2061     {
2062     active_count--; /* Remove non-match possibility */
2063     next_active_state--;
2064     }
2065 ph10 1334 if (++count >= (int)GET2(code, 1))
2066 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2067 ph10 178 else
2068     { ADD_NEW_DATA(-state_offset, count, 0); }
2069     }
2070     }
2071     break;
2072    
2073     /*-----------------------------------------------------------------*/
2074     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
2075     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
2076     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
2077     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
2078     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2079 ph10 836 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2080 ph10 178 count = current_state->count; /* Number already matched */
2081     if (clen > 0)
2082     {
2083 ph10 182 BOOL OK;
2084 ph10 178 switch (c)
2085     {
2086 ph10 1221 HSPACE_CASES:
2087 ph10 178 OK = TRUE;
2088     break;
2089 ph10 182
2090 ph10 178 default:
2091     OK = FALSE;
2092     break;
2093     }
2094 ph10 182
2095 ph10 178 if (OK == (d == OP_HSPACE))
2096 ph10 182 {
2097 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2098     {
2099     active_count--; /* Remove non-match possibility */
2100     next_active_state--;
2101     }
2102 ph10 1334 if (++count >= (int)GET2(code, 1))
2103 ph10 836 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2104 ph10 178 else
2105     { ADD_NEW_DATA(-state_offset, count, 0); }
2106     }
2107     }
2108     break;
2109    
2110 nigel 77 /* ========================================================================== */
2111     /* These opcodes are followed by a character that is usually compared
2112     to the current subject character; it is loaded into d. We still get
2113     here even if there is no subject character, because in some cases zero
2114     repetitions are permitted. */
2115    
2116     /*-----------------------------------------------------------------*/
2117     case OP_CHAR:
2118     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2119     break;
2120    
2121     /*-----------------------------------------------------------------*/
2122 ph10 602 case OP_CHARI:
2123 nigel 77 if (clen == 0) break;
2124    
2125 ph10 836 #ifdef SUPPORT_UTF
2126     if (utf)
2127 nigel 77 {
2128     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2129     {
2130 nigel 93 unsigned int othercase;
2131 ph10 836 if (c < 128)
2132     othercase = fcc[c];
2133     else
2134     /* If we have Unicode property support, we can use it to test the
2135     other case of the character. */
2136 nigel 77 #ifdef SUPPORT_UCP
2137 ph10 836 othercase = UCD_OTHERCASE(c);
2138 nigel 87 #else
2139 ph10 836 othercase = NOTACHAR;
2140 nigel 77 #endif
2141    
2142     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2143     }
2144     }
2145     else
2146 ph10 836 #endif /* SUPPORT_UTF */
2147     /* Not UTF mode */
2148 nigel 77 {
2149 ph10 836 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2150     { ADD_NEW(state_offset + 2, 0); }
2151 nigel 77 }
2152     break;
2153    
2154    
2155     #ifdef SUPPORT_UCP
2156     /*-----------------------------------------------------------------*/
2157     /* This is a tricky one because it can match more than one character.
2158     Find out how many characters to skip, and then set up a negative state
2159     to wait for them to pass before continuing. */
2160    
2161     case OP_EXTUNI:
2162 ph10 1011 if (clen > 0)
2163 nigel 77 {
2164 ph10 1033 int lgb, rgb;
2165 ph10 836 const pcre_uchar *nptr = ptr + clen;
2166 nigel 77 int ncount = 0;
2167 ph10 1033 lgb = UCD_GRAPHBREAK(c);
2168 nigel 77 while (nptr < end_subject)
2169     {
2170 ph10 1011 dlen = 1;
2171     if (!utf) d = *nptr; else { GETCHARLEN(d, nptr, dlen); }
2172 ph10 1033 rgb = UCD_GRAPHBREAK(d);
2173 ph10 1015 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2174 nigel 77 ncount++;
2175 ph10 1033 lgb = rgb;
2176 ph10 1011 nptr += dlen;
2177 nigel 77 }
2178 ph10 975 if (nptr >= end_subject && (md->moptions & PCRE_PARTIAL_HARD) != 0)
2179     reset_could_continue = TRUE;
2180 nigel 77 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2181     }
2182     break;
2183     #endif
2184    
2185     /*-----------------------------------------------------------------*/
2186 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
2187     character (when CR is followed by LF). In this case, set up a negative
2188     state to wait for one character to pass before continuing. */
2189    
2190     case OP_ANYNL:
2191     if (clen > 0) switch(c)
2192     {
2193 ph10 1033 case CHAR_VT:
2194     case CHAR_FF:
2195     case CHAR_NEL:
2196     #ifndef EBCDIC
2197 nigel 93 case 0x2028:
2198     case 0x2029:
2199 ph10 1033 #endif /* Not EBCDIC */
2200 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
2201    
2202 ph10 1033 case CHAR_LF:
2203 nigel 93 ADD_NEW(state_offset + 1, 0);
2204     break;
2205 ph10 231
2206 ph10 1033 case CHAR_CR:
2207 ph10 975 if (ptr + 1 >= end_subject)
2208 nigel 93 {
2209 ph10 975 ADD_NEW(state_offset + 1, 0);
2210     if ((md->moptions & PCRE_PARTIAL_HARD) != 0)
2211     reset_could_continue = TRUE;
2212     }
2213 ph10 1431 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2214 ph10 916 {
2215 nigel 93 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2216     }
2217     else
2218 ph10 975 {
2219 nigel 93 ADD_NEW(state_offset + 1, 0);
2220 ph10 975 }
2221 nigel 93 break;
2222     }
2223     break;
2224    
2225     /*-----------------------------------------------------------------*/
2226 ph10 178 case OP_NOT_VSPACE:
2227     if (clen > 0) switch(c)
2228     {
2229 ph10 1221 VSPACE_CASES:
2230 ph10 178 break;
2231 ph10 182
2232     default:
2233 ph10 178 ADD_NEW(state_offset + 1, 0);
2234     break;
2235     }
2236     break;
2237    
2238     /*-----------------------------------------------------------------*/
2239     case OP_VSPACE:
2240     if (clen > 0) switch(c)
2241     {
2242 ph10 1221 VSPACE_CASES:
2243 ph10 178 ADD_NEW(state_offset + 1, 0);
2244     break;
2245 ph10 182
2246 ph10 1221 default:
2247 ph10 1041 break;
2248 ph10 178 }
2249     break;
2250    
2251     /*-----------------------------------------------------------------*/
2252     case OP_NOT_HSPACE:
2253     if (clen > 0) switch(c)
2254     {
2255 ph10 1221 HSPACE_CASES:
2256 ph10 178 break;
2257 ph10 182
2258     default:
2259 ph10 178 ADD_NEW(state_offset + 1, 0);
2260     break;
2261     }
2262     break;
2263    
2264     /*-----------------------------------------------------------------*/
2265     case OP_HSPACE:
2266     if (clen > 0) switch(c)
2267     {
2268 ph10 1221 HSPACE_CASES:
2269 ph10 178 ADD_NEW(state_offset + 1, 0);
2270     break;
2271 ph10 1221
2272 ph10 1041 default:
2273 ph10 1221 break;
2274 ph10 178 }
2275     break;
2276    
2277     /*-----------------------------------------------------------------*/
2278 ph10 925 /* Match a negated single character casefully. */
2279 nigel 77
2280     case OP_NOT:
2281 ph10 602 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2282 nigel 77 break;
2283    
2284     /*-----------------------------------------------------------------*/
2285 ph10 925 /* Match a negated single character caselessly. */
2286 ph10 602
2287     case OP_NOTI:
2288 ph10 925 if (clen > 0)
2289 ph10 975 {
2290 ph10 925 unsigned int otherd;
2291     #ifdef SUPPORT_UTF
2292     if (utf && d >= 128)
2293     {
2294     #ifdef SUPPORT_UCP
2295     otherd = UCD_OTHERCASE(d);
2296     #endif /* SUPPORT_UCP */
2297     }
2298     else
2299     #endif /* SUPPORT_UTF */
2300     otherd = TABLE_GET(d, fcc, d);
2301     if (c != d && c != otherd)
2302     { ADD_NEW(state_offset + dlen + 1, 0); }
2303 ph10 975 }
2304 ph10 602 break;
2305    
2306     /*-----------------------------------------------------------------*/
2307     case OP_PLUSI:
2308     case OP_MINPLUSI:
2309     case OP_POSPLUSI:
2310     case OP_NOTPLUSI:
2311     case OP_NOTMINPLUSI:
2312     case OP_NOTPOSPLUSI:
2313     caseless = TRUE;
2314     codevalue -= OP_STARI - OP_STAR;
2315 ph10 654
2316 ph10 602 /* Fall through */
2317 nigel 77 case OP_PLUS:
2318     case OP_MINPLUS:
2319 nigel 93 case OP_POSPLUS:
2320 nigel 77 case OP_NOTPLUS:
2321     case OP_NOTMINPLUS:
2322 nigel 93 case OP_NOTPOSPLUS:
2323 nigel 77 count = current_state->count; /* Already matched */
2324     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2325     if (clen > 0)
2326     {
2327 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2328 ph10 602 if (caseless)
2329 nigel 77 {
2330 ph10 836 #ifdef SUPPORT_UTF
2331     if (utf && d >= 128)
2332 nigel 77 {
2333     #ifdef SUPPORT_UCP
2334 ph10 349 otherd = UCD_OTHERCASE(d);
2335 nigel 77 #endif /* SUPPORT_UCP */
2336     }
2337     else
2338 ph10 836 #endif /* SUPPORT_UTF */
2339     otherd = TABLE_GET(d, fcc, d);
2340 nigel 77 }
2341     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2342 nigel 93 {
2343     if (count > 0 &&
2344     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2345     {
2346     active_count--; /* Remove non-match possibility */
2347     next_active_state--;
2348     }
2349     count++;
2350     ADD_NEW(state_offset, count);
2351     }
2352 nigel 77 }
2353     break;
2354    
2355     /*-----------------------------------------------------------------*/
2356 ph10 602 case OP_QUERYI:
2357     case OP_MINQUERYI:
2358     case OP_POSQUERYI:
2359     case OP_NOTQUERYI:
2360     case OP_NOTMINQUERYI:
2361     case OP_NOTPOSQUERYI:
2362     caseless = TRUE;
2363     codevalue -= OP_STARI - OP_STAR;
2364     /* Fall through */
2365 nigel 77 case OP_QUERY:
2366     case OP_MINQUERY:
2367 nigel 93 case OP_POSQUERY:
2368 nigel 77 case OP_NOTQUERY:
2369     case OP_NOTMINQUERY:
2370 nigel 93 case OP_NOTPOSQUERY:
2371 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2372     if (clen > 0)
2373     {
2374 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2375 ph10 602 if (caseless)
2376 nigel 77 {
2377 ph10 836 #ifdef SUPPORT_UTF
2378     if (utf && d >= 128)
2379 nigel 77 {
2380     #ifdef SUPPORT_UCP
2381 ph10 349 otherd = UCD_OTHERCASE(d);
2382 nigel 77 #endif /* SUPPORT_UCP */
2383     }
2384     else
2385 ph10 836 #endif /* SUPPORT_UTF */
2386     otherd = TABLE_GET(d, fcc, d);
2387 nigel 77 }
2388     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2389 nigel 93 {
2390     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2391     {
2392     active_count--; /* Remove non-match possibility */
2393     next_active_state--;
2394     }
2395     ADD_NEW(state_offset + dlen + 1, 0);
2396     }
2397 nigel 77 }
2398     break;
2399    
2400     /*-----------------------------------------------------------------*/
2401 ph10 602 case OP_STARI:
2402     case OP_MINSTARI:
2403     case OP_POSSTARI:
2404     case OP_NOTSTARI:
2405     case OP_NOTMINSTARI:
2406     case OP_NOTPOSSTARI:
2407     caseless = TRUE;
2408     codevalue -= OP_STARI - OP_STAR;
2409     /* Fall through */
2410 nigel 77 case OP_STAR:
2411     case OP_MINSTAR:
2412 nigel 93 case OP_POSSTAR:
2413 nigel 77 case OP_NOTSTAR:
2414     case OP_NOTMINSTAR:
2415 nigel 93 case OP_NOTPOSSTAR:
2416 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
2417     if (clen > 0)
2418     {
2419 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2420 ph10 602 if (caseless)
2421 nigel 77 {
2422 ph10 836 #ifdef SUPPORT_UTF
2423     if (utf && d >= 128)
2424 nigel 77 {
2425     #ifdef SUPPORT_UCP
2426 ph10 349 otherd = UCD_OTHERCASE(d);
2427 nigel 77 #endif /* SUPPORT_UCP */
2428     }
2429     else
2430 ph10 836 #endif /* SUPPORT_UTF */
2431     otherd = TABLE_GET(d, fcc, d);
2432 nigel 77 }
2433     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2434 nigel 93 {
2435     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2436     {
2437     active_count--; /* Remove non-match possibility */
2438     next_active_state--;
2439     }
2440     ADD_NEW(state_offset, 0);
2441     }
2442 nigel 77 }
2443     break;
2444    
2445     /*-----------------------------------------------------------------*/
2446 ph10 602 case OP_EXACTI:
2447     case OP_NOTEXACTI:
2448     caseless = TRUE;
2449     codevalue -= OP_STARI - OP_STAR;
2450     /* Fall through */
2451 nigel 77 case OP_EXACT:
2452 nigel 93 case OP_NOTEXACT:
2453     count = current_state->count; /* Number already matched */
2454     if (clen > 0)
2455     {
2456 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2457 ph10 602 if (caseless)
2458 nigel 93 {
2459 ph10 836 #ifdef SUPPORT_UTF
2460     if (utf && d >= 128)
2461 nigel 93 {
2462     #ifdef SUPPORT_UCP
2463 ph10 349 otherd = UCD_OTHERCASE(d);
2464 nigel 93 #endif /* SUPPORT_UCP */
2465     }
2466     else
2467 ph10 836 #endif /* SUPPORT_UTF */
2468     otherd = TABLE_GET(d, fcc, d);
2469 nigel 93 }
2470     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2471     {
2472 ph10 1334 if (++count >= (int)GET2(code, 1))
2473 ph10 836 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2474 nigel 93 else
2475     { ADD_NEW(state_offset, count); }
2476     }
2477     }
2478     break;
2479    
2480     /*-----------------------------------------------------------------*/
2481 ph10 602 case OP_UPTOI:
2482     case OP_MINUPTOI:
2483     case OP_POSUPTOI:
2484     case OP_NOTUPTOI:
2485     case OP_NOTMINUPTOI:
2486     case OP_NOTPOSUPTOI:
2487     caseless = TRUE;
2488     codevalue -= OP_STARI - OP_STAR;
2489     /* Fall through */
2490 nigel 77 case OP_UPTO:
2491     case OP_MINUPTO:
2492 nigel 93 case OP_POSUPTO:
2493 nigel 77 case OP_NOTUPTO:
2494     case OP_NOTMINUPTO:
2495 nigel 93 case OP_NOTPOSUPTO:
2496 ph10 836 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2497 nigel 77 count = current_state->count; /* Number already matched */
2498     if (clen > 0)
2499     {
2500 chpe 1100 pcre_uint32 otherd = NOTACHAR;
2501 ph10 602 if (caseless)
2502 nigel 77 {
2503 ph10 836 #ifdef SUPPORT_UTF
2504     if (utf && d >= 128)
2505 nigel 77 {
2506     #ifdef SUPPORT_UCP
2507 ph10 349 otherd = UCD_OTHERCASE(d);
2508 nigel 77 #endif /* SUPPORT_UCP */
2509     }
2510     else
2511 ph10 836 #endif /* SUPPORT_UTF */
2512     otherd = TABLE_GET(d, fcc, d);
2513 nigel 77 }
2514     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2515     {
2516 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2517     {
2518     active_count--; /* Remove non-match possibility */
2519     next_active_state--;
2520     }
2521 ph10 1334 if (++count >= (int)GET2(code, 1))
2522 ph10 836 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2523 nigel 77 else
2524     { ADD_NEW(state_offset, count); }
2525     }
2526     }
2527     break;
2528    
2529    
2530     /* ========================================================================== */
2531     /* These are the class-handling opcodes */
2532    
2533     case OP_CLASS:
2534     case OP_NCLASS:
2535     case OP_XCLASS:
2536     {
2537     BOOL isinclass = FALSE;
2538     int next_state_offset;
2539 ph10 836 const pcre_uchar *ecode;
2540 nigel 77
2541     /* For a simple class, there is always just a 32-byte table, and we
2542     can set isinclass from it. */
2543    
2544     if (codevalue != OP_XCLASS)
2545     {
2546 ph10 836 ecode = code + 1 + (32 / sizeof(pcre_uchar));
2547 nigel 77 if (clen > 0)
2548     {
2549     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2550 ph10 836 ((((pcre_uint8 *)(code + 1))[c/8] & (1 << (c&7))) != 0);
2551 nigel 77 }
2552     }
2553    
2554     /* An extended class may have a table or a list of single characters,
2555     ranges, or both, and it may be positive or negative. There's a
2556     function that sorts all this out. */
2557    
2558     else
2559     {
2560     ecode = code + GET(code, 1);
2561 ph10 836 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2562 nigel 77 }
2563    
2564     /* At this point, isinclass is set for all kinds of class, and ecode
2565     points to the byte after the end of the class. If there is a
2566     quantifier, this is where it will be. */
2567    
2568 ph10 530 next_state_offset = (int)(ecode - start_code);
2569 nigel 77
2570     switch (*ecode)
2571     {
2572     case OP_CRSTAR:
2573     case OP_CRMINSTAR:
2574 ph10 1379 case OP_CRPOSSTAR:
2575 nigel 77 ADD_ACTIVE(next_state_offset + 1, 0);
2576 ph10 1379 if (isinclass)
2577     {
2578     if (*ecode == OP_CRPOSSTAR)
2579     {
2580     active_count--; /* Remove non-match possibility */
2581     next_active_state--;
2582     }
2583     ADD_NEW(state_offset, 0);
2584     }
2585 nigel 77 break;
2586    
2587     case OP_CRPLUS:
2588     case OP_CRMINPLUS:
2589 ph10 1379 case OP_CRPOSPLUS:
2590 nigel 77 count = current_state->count; /* Already matched */
2591     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2592 ph10 1379 if (isinclass)
2593     {
2594     if (count > 0 && *ecode == OP_CRPOSPLUS)
2595     {
2596     active_count--; /* Remove non-match possibility */
2597     next_active_state--;
2598     }
2599     count++;
2600     ADD_NEW(state_offset, count);
2601     }
2602 nigel 77 break;
2603    
2604     case OP_CRQUERY:
2605     case OP_CRMINQUERY:
2606 ph10 1379 case OP_CRPOSQUERY:
2607 nigel 77 ADD_ACTIVE(next_state_offset + 1, 0);
2608 ph10 1379 if (isinclass)
2609     {
2610     if (*ecode == OP_CRPOSQUERY)
2611     {
2612     active_count--; /* Remove non-match possibility */
2613     next_active_state--;
2614     }
2615     ADD_NEW(next_state_offset + 1, 0);
2616     }
2617 nigel 77 break;
2618    
2619     case OP_CRRANGE:
2620     case OP_CRMINRANGE:
2621 ph10 1404 case OP_CRPOSRANGE:
2622 nigel 77 count = current_state->count; /* Already matched */
2623 ph10 1334 if (count >= (int)GET2(ecode, 1))
2624 ph10 836 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2625 nigel 77 if (isinclass)
2626     {
2627 ph10 1334 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2628 ph10 1379 if (*ecode == OP_CRPOSRANGE)
2629     {
2630     active_count--; /* Remove non-match possibility */
2631     next_active_state--;
2632 ph10 1404 }
2633 nigel 91 if (++count >= max && max != 0) /* Max 0 => no limit */
2634 ph10 836 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2635 nigel 77 else
2636     { ADD_NEW(state_offset, count); }
2637     }
2638     break;
2639    
2640     default:
2641     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2642     break;
2643     }
2644     }
2645     break;
2646    
2647     /* ========================================================================== */
2648     /* These are the opcodes for fancy brackets of various kinds. We have
2649 ph10 426 to use recursion in order to handle them. The "always failing" assertion
2650     (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2651 ph10 341 though the other "backtracking verbs" are not supported. */
2652 ph10 345
2653 ph10 341 case OP_FAIL:
2654 ph10 428 forced_fail++; /* Count FAILs for multiple states */
2655 ph10 345 break;
2656 nigel 77
2657     case OP_ASSERT:
2658     case OP_ASSERT_NOT:
2659     case OP_ASSERTBACK:
2660     case OP_ASSERTBACK_NOT:
2661     {
2662     int rc;
2663     int local_offsets[2];
2664     int local_workspace[1000];
2665 ph10 836 const pcre_uchar *endasscode = code + GET(code, 1);
2666 nigel 77
2667     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2668    
2669     rc = internal_dfa_exec(
2670     md, /* static match data */
2671     code, /* this subexpression's code */
2672     ptr, /* where we currently are */
2673 ph10 530 (int)(ptr - start_subject), /* start offset */
2674 nigel 77 local_offsets, /* offset vector */
2675     sizeof(local_offsets)/sizeof(int), /* size of same */
2676     local_workspace, /* workspace vector */
2677     sizeof(local_workspace)/sizeof(int), /* size of same */
2678 ph10 642 rlevel); /* function recursion level */
2679 ph10 487
2680 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2681 nigel 77 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2682 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2683 nigel 77 }
2684     break;
2685    
2686     /*-----------------------------------------------------------------*/
2687     case OP_COND:
2688 nigel 93 case OP_SCOND:
2689 nigel 77 {
2690     int local_offsets[1000];
2691     int local_workspace[1000];
2692 ph10 406 int codelink = GET(code, 1);
2693 ph10 397 int condcode;
2694 ph10 406
2695 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2696 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2697 ph10 398 happen for the other conditions. */
2698 nigel 77
2699 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2700 ph10 406 {
2701     rrc = 0;
2702 ph10 836 if (PUBL(callout) != NULL)
2703 ph10 397 {
2704 zherczeg 850 PUBL(callout_block) cb;
2705 ph10 397 cb.version = 1; /* Version 1 of the callout block */
2706     cb.callout_number = code[LINK_SIZE+2];
2707     cb.offset_vector = offsets;
2708 chpe 1055 #if defined COMPILE_PCRE8
2709 ph10 397 cb.subject = (PCRE_SPTR)start_subject;
2710 chpe 1055 #elif defined COMPILE_PCRE16
2711 zherczeg 852 cb.subject = (PCRE_SPTR16)start_subject;
2712 chpe 1055 #elif defined COMPILE_PCRE32
2713     cb.subject = (PCRE_SPTR32)start_subject;
2714 zherczeg 852 #endif
2715 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
2716     cb.start_match = (int)(current_subject - start_subject);
2717     cb.current_position = (int)(ptr - start_subject);
2718 ph10 397 cb.pattern_position = GET(code, LINK_SIZE + 3);
2719     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2720     cb.capture_top = 1;
2721     cb.capture_last = -1;
2722     cb.callout_data = md->callout_data;
2723 ph10 654 cb.mark = NULL; /* No (*MARK) support */
2724 ph10 836 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
2725 ph10 397 }
2726 ph10 398 if (rrc > 0) break; /* Fail this thread */
2727 ph10 836 code += PRIV(OP_lengths)[OP_CALLOUT]; /* Skip callout data */
2728 ph10 406 }
2729 ph10 398
2730 ph10 397 condcode = code[LINK_SIZE+1];
2731 ph10 406
2732 ph10 1365 /* Back reference conditions and duplicate named recursion conditions
2733     are not supported */
2734 nigel 77
2735 ph10 1379 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2736 ph10 1365 condcode == OP_DNRREF)
2737 ph10 459 return PCRE_ERROR_DFA_UCOND;
2738 nigel 93
2739     /* The DEFINE condition is always false */
2740    
2741     if (condcode == OP_DEF)
2742 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2743 nigel 93
2744     /* The only supported version of OP_RREF is for the value RREF_ANY,
2745     which means "test if in any recursion". We can't test for specifically
2746     recursed groups. */
2747    
2748 ph10 1365 else if (condcode == OP_RREF)
2749 nigel 93 {
2750 ph10 836 int value = GET2(code, LINK_SIZE + 2);
2751 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2752 ph10 654 if (md->recursive != NULL)
2753 ph10 836 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2754 ph10 398 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2755 nigel 77 }
2756    
2757     /* Otherwise, the condition is an assertion */
2758    
2759     else
2760     {
2761     int rc;
2762 ph10 836 const pcre_uchar *asscode = code + LINK_SIZE + 1;
2763     const pcre_uchar *endasscode = asscode + GET(asscode, 1);
2764 nigel 77
2765     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2766    
2767     rc = internal_dfa_exec(
2768     md, /* fixed match data */
2769     asscode, /* this subexpression's code */
2770     ptr, /* where we currently are */
2771 ph10 530 (int)(ptr - start_subject), /* start offset */
2772 nigel 77 local_offsets, /* offset vector */
2773     sizeof(local_offsets)/sizeof(int), /* size of same */
2774     local_workspace, /* workspace vector */
2775     sizeof(local_workspace)/sizeof(int), /* size of same */
2776 ph10 642 rlevel); /* function recursion level */
2777 nigel 77
2778 ph10 473 if (rc == PCRE_ERROR_DFA_UITEM) return rc;
2779 nigel 77 if ((rc >= 0) ==
2780     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2781 ph10 530 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2782 nigel 77 else
2783 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2784 nigel 77 }
2785     }
2786     break;
2787    
2788     /*-----------------------------------------------------------------*/
2789     case OP_RECURSE:
2790     {
2791 ph10 654 dfa_recursion_info *ri;
2792 nigel 77 int local_offsets[1000];
2793     int local_workspace[1000];
2794 ph10 836 const pcre_uchar *callpat = start_code + GET(code, 1);
2795 ph10 654 int recno = (callpat == md->start_code)? 0 :
2796     GET2(callpat, 1 + LINK_SIZE);
2797 nigel 77 int rc;
2798    
2799 ph10 642 DPRINTF(("%.*sStarting regex recursion\n", rlevel*2-2, SP));
2800 ph10 654
2801 ph10 642 /* Check for repeating a recursion without advancing the subject
2802     pointer. This should catch convoluted mutual recursions. (Some simple
2803     cases are caught at compile time.) */
2804 nigel 77
2805 ph10 654 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
2806     if (recno == ri->group_num && ptr == ri->subject_position)
2807     return PCRE_ERROR_RECURSELOOP;
2808    
2809     /* Remember this recursion and where we started it so as to
2810 ph10 642 catch infinite loops. */
2811 ph10 654
2812 ph10 642 new_recursive.group_num = recno;
2813     new_recursive.subject_position = ptr;
2814     new_recursive.prevrec = md->recursive;
2815 ph10 654 md->recursive = &new_recursive;
2816 ph10 642
2817 nigel 77 rc = internal_dfa_exec(
2818     md, /* fixed match data */
2819 ph10 642 callpat, /* this subexpression's code */
2820 nigel 77 ptr, /* where we currently are */
2821 ph10 530 (int)(ptr - start_subject), /* start offset */
2822 nigel 77 local_offsets, /* offset vector */
2823     sizeof(local_offsets)/sizeof(int), /* size of same */
2824     local_workspace, /* workspace vector */
2825     sizeof(local_workspace)/sizeof(int), /* size of same */
2826 ph10 642 rlevel); /* function recursion level */
2827 nigel 77
2828 ph10 642 md->recursive = new_recursive.prevrec; /* Done this recursion */
2829 nigel 77
2830 ph10 654 DPRINTF(("%.*sReturn from regex recursion: rc=%d\n", rlevel*2-2, SP,
2831 ph10 642 rc));
2832    
2833 nigel 77 /* Ran out of internal offsets */
2834    
2835     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2836    
2837     /* For each successful matched substring, set up the next state with a
2838     count of characters to skip before trying it. Note that the count is in
2839     characters, not bytes. */
2840    
2841     if (rc > 0)
2842     {
2843     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2844     {
2845 ph10 894 int charcount = local_offsets[rc+1] - local_offsets[rc];
2846 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2847 ph10 979 if (utf)
2848 ph10 982 {
2849 ph10 979 const pcre_uchar *p = start_subject + local_offsets[rc];
2850     const pcre_uchar *pp = start_subject + local_offsets[rc+1];
2851     while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2852 ph10 982 }
2853 ph10 836 #endif
2854 nigel 77 if (charcount > 0)
2855     {
2856     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2857     }
2858     else
2859     {
2860     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2861     }
2862     }
2863     }
2864     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2865     }
2866     break;
2867    
2868     /*-----------------------------------------------------------------*/
2869 ph10 604 case OP_BRAPOS:
2870     case OP_SBRAPOS:
2871     case OP_CBRAPOS:
2872     case OP_SCBRAPOS:
2873 ph10 654 case OP_BRAPOSZERO:
2874 ph10 604 {
2875     int charcount, matched_count;
2876 ph10 836 const pcre_uchar *local_ptr = ptr;
2877 ph10 604 BOOL allow_zero;
2878 ph10 654
2879 ph10 604 if (codevalue == OP_BRAPOSZERO)
2880     {
2881     allow_zero = TRUE;
2882     codevalue = *(++code); /* Codevalue will be one of above BRAs */
2883     }
2884 ph10 654 else allow_zero = FALSE;
2885    
2886     /* Loop to match the subpattern as many times as possible as if it were
2887     a complete pattern. */
2888    
2889 ph10 604 for (matched_count = 0;; matched_count++)
2890     {
2891     int local_offsets[2];
2892     int local_workspace[1000];
2893 ph10 654
2894 ph10 604 int rc = internal_dfa_exec(
2895     md, /* fixed match data */
2896     code, /* this subexpression's code */
2897     local_ptr, /* where we currently are */
2898     (int)(ptr - start_subject), /* start offset */
2899     local_offsets, /* offset vector */
2900     sizeof(local_offsets)/sizeof(int), /* size of same */
2901     local_workspace, /* workspace vector */
2902     sizeof(local_workspace)/sizeof(int), /* size of same */
2903 ph10 642 rlevel); /* function recursion level */
2904 ph10 654
2905 ph10 604 /* Failed to match */
2906 ph10 654
2907     if (rc < 0)
2908 ph10 604 {
2909     if (rc != PCRE_ERROR_NOMATCH) return rc;
2910     break;
2911 ph10 654 }
2912    
2913 ph10 604 /* Matched: break the loop if zero characters matched. */
2914 ph10 654
2915 ph10 604 charcount = local_offsets[1] - local_offsets[0];
2916 ph10 654 if (charcount == 0) break;
2917 ph10 604 local_ptr += charcount; /* Advance temporary position ptr */
2918 ph10 654 }
2919 ph10 604
2920     /* At this point we have matched the subpattern matched_count
2921 ph10 654 times, and local_ptr is pointing to the character after the end of the
2922     last match. */
2923 ph10 604
2924     if (matched_count > 0 || allow_zero)
2925 ph10 654 {
2926 ph10 836 const pcre_uchar *end_subpattern = code;
2927 ph10 604 int next_state_offset;
2928 ph10 654
2929 ph10 604 do { end_subpattern += GET(end_subpattern, 1); }
2930     while (*end_subpattern == OP_ALT);
2931     next_state_offset =
2932     (int)(end_subpattern - start_code + LINK_SIZE + 1);
2933    
2934     /* Optimization: if there are no more active states, and there
2935     are no new states yet set up, then skip over the subject string
2936     right here, to save looping. Otherwise, set up the new state to swing
2937     into action when the end of the matched substring is reached. */
2938    
2939     if (i + 1 >= active_count && new_count == 0)
2940     {
2941     ptr = local_ptr;
2942     clen = 0;
2943     ADD_NEW(next_state_offset, 0);
2944     }
2945     else
2946     {
2947 ph10 836 const pcre_uchar *p = ptr;
2948     const pcre_uchar *pp = local_ptr;
2949     charcount = (int)(pp - p);
2950 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
2951 ph10 979 if (utf) while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
2952 ph10 836 #endif
2953 ph10 604 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2954     }
2955 ph10 654 }
2956     }
2957 ph10 604 break;
2958 ph10 654
2959 ph10 604 /*-----------------------------------------------------------------*/
2960 nigel 77 case OP_ONCE:
2961 ph10 733 case OP_ONCE_NC:
2962 nigel 77 {
2963     int local_offsets[2];
2964     int local_workspace[1000];
2965    
2966     int rc = internal_dfa_exec(
2967     md, /* fixed match data */
2968     code, /* this subexpression's code */
2969     ptr, /* where we currently are */
2970 ph10 530 (int)(ptr - start_subject), /* start offset */
2971 nigel 77 local_offsets, /* offset vector */
2972     sizeof(local_offsets)/sizeof(int), /* size of same */
2973     local_workspace, /* workspace vector */
2974     sizeof(local_workspace)/sizeof(int), /* size of same */
2975 ph10 642 rlevel); /* function recursion level */
2976 nigel 77
2977     if (rc >= 0)
2978     {
2979 ph10 836 const pcre_uchar *end_subpattern = code;
2980 nigel 77 int charcount = local_offsets[1] - local_offsets[0];
2981     int next_state_offset, repeat_state_offset;
2982    
2983     do { end_subpattern += GET(end_subpattern, 1); }
2984     while (*end_subpattern == OP_ALT);
2985 ph10 535 next_state_offset =
2986 ph10 530 (int)(end_subpattern - start_code + LINK_SIZE + 1);
2987 nigel 77
2988     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2989     arrange for the repeat state also to be added to the relevant list.
2990     Calculate the offset, or set -1 for no repeat. */
2991    
2992     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2993     *end_subpattern == OP_KETRMIN)?
2994 ph10 530 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
2995 nigel 77
2996     /* If we have matched an empty string, add the next state at the
2997     current character pointer. This is important so that the duplicate
2998     checking kicks in, which is what breaks infinite loops that match an
2999     empty string. */
3000    
3001     if (charcount == 0)
3002     {
3003     ADD_ACTIVE(next_state_offset, 0);
3004     }
3005    
3006     /* Optimization: if there are no more active states, and there
3007     are no new states yet set up, then skip over the subject string
3008     right here, to save looping. Otherwise, set up the new state to swing
3009 ph10 604 into action when the end of the matched substring is reached. */
3010 nigel 77
3011     else if (i + 1 >= active_count && new_count == 0)
3012     {
3013     ptr += charcount;
3014     clen = 0;
3015     ADD_NEW(next_state_offset, 0);
3016    
3017     /* If we are adding a repeat state at the new character position,
3018     we must fudge things so that it is the only current state.
3019     Otherwise, it might be a duplicate of one we processed before, and
3020     that would cause it to be skipped. */
3021    
3022     if (repeat_state_offset >= 0)
3023     {
3024     next_active_state = active_states;
3025     active_count = 0;
3026     i = -1;
3027     ADD_ACTIVE(repeat_state_offset, 0);
3028     }
3029     }
3030     else
3031     {
3032 chpe 1055 #if defined SUPPORT_UTF && !defined COMPILE_PCRE32
3033 ph10 979 if (utf)
3034 ph10 982 {
3035 ph10 979 const pcre_uchar *p = start_subject + local_offsets[0];
3036     const pcre_uchar *pp = start_subject + local_offsets[1];
3037     while (p < pp) if (NOT_FIRSTCHAR(*p++)) charcount--;
3038 ph10 982 }
3039 ph10 836 #endif
3040 nigel 77 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
3041     if (repeat_state_offset >= 0)
3042     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
3043     }
3044     }
3045     else if (rc != PCRE_ERROR_NOMATCH) return rc;
3046     }
3047     break;
3048    
3049    
3050     /* ========================================================================== */
3051     /* Handle callouts */
3052    
3053     case OP_CALLOUT:
3054 ph10 406 rrc = 0;
3055 ph10 836 if (PUBL(callout) != NULL)
3056 nigel 77 {
3057 zherczeg 850 PUBL(callout_block) cb;
3058 nigel 77 cb.version = 1; /* Version 1 of the callout block */
3059     cb.callout_number = code[1];
3060     cb.offset_vector = offsets;
3061 chpe 1055 #if defined COMPILE_PCRE8
3062 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
3063 chpe 1055 #elif defined COMPILE_PCRE16
3064 zherczeg 852 cb.subject = (PCRE_SPTR16)start_subject;
3065 chpe 1055 #elif defined COMPILE_PCRE32
3066     cb.subject = (PCRE_SPTR32)start_subject;
3067 zherczeg 852 #endif
3068 ph10 530 cb.subject_length = (int)(end_subject - start_subject);
3069     cb.start_match = (int)(current_subject - start_subject);
3070     cb.current_position = (int)(ptr - start_subject);
3071 nigel 77 cb.pattern_position = GET(code, 2);
3072     cb.next_item_length = GET(code, 2 + LINK_SIZE);
3073     cb.capture_top = 1;
3074     cb.capture_last = -1;
3075     cb.callout_data = md->callout_data;
3076 ph10 654 cb.mark = NULL; /* No (*MARK) support */
3077 ph10 836 if ((rrc = (*PUBL(callout))(&cb)) < 0) return rrc; /* Abandon */
3078 ph10 406 }
3079     if (rrc == 0)
3080 ph10 836 { ADD_ACTIVE(state_offset + PRIV(OP_lengths)[OP_CALLOUT], 0); }
3081 nigel 77 break;
3082    
3083    
3084     /* ========================================================================== */
3085     default: /* Unsupported opcode */
3086     return PCRE_ERROR_DFA_UITEM;
3087     }
3088    
3089     NEXT_ACTIVE_STATE: continue;
3090    
3091     } /* End of loop scanning active states */
3092    
3093     /* We have finished the processing at the current subject character. If no
3094     new states have been set for the next character, we have found all the
3095     matches that we are going to find. If we are at the top level and partial
3096 ph10 463 matching has been requested, check for appropriate conditions.
3097    
3098 ph10 462 The "forced_fail" variable counts the number of (*F) encountered for the
3099     character. If it is equal to the original active_count (saved in
3100     workspace[1]) it means that (*F) was found on every active state. In this
3101 ph10 463 case we don't want to give a partial match.
3102 nigel 77
3103 ph10 463 The "could_continue" variable is true if a state could have continued but
3104     for the fact that the end of the subject was reached. */
3105 ph10 975
3106 nigel 77 if (new_count <= 0)
3107     {
3108 ph10 427 if (rlevel == 1 && /* Top level, and */
3109 ph10 919 could_continue && /* Some could go on, and */
3110 ph10 428 forced_fail != workspace[1] && /* Not all forced fail & */
3111 ph10 427 ( /* either... */
3112     (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
3113     || /* or... */
3114     ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
3115     match_count < 0) /* no matches */
3116     ) && /* And... */
3117 ph10 916 (
3118 ph10 919 partial_newline || /* Either partial NL */
3119     ( /* or ... */
3120     ptr >= end_subject && /* End of subject and */
3121     ptr > md->start_used_ptr) /* Inspected non-empty string */
3122 ph10 975 )
3123     )
3124 nigel 77 match_count = PCRE_ERROR_PARTIAL;
3125     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
3126     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
3127     rlevel*2-2, SP));
3128 nigel 91 break; /* In effect, "return", but see the comment below */
3129 nigel 77 }
3130    
3131     /* One or more states are active for the next character. */
3132    
3133     ptr += clen; /* Advance to next subject character */
3134     } /* Loop to move along the subject string */
3135    
3136 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
3137     if we use "return" above, we have compiler trouble. Some compilers warn if
3138     there's nothing here because they think the function doesn't return a value. On
3139     the other hand, if we put a dummy statement here, some more clever compilers
3140     complain that it can't be reached. Sigh. */
3141 nigel 77
3142 nigel 91 return match_count;
3143 nigel 77 }
3144    
3145    
3146    
3147    
3148     /*************************************************
3149     * Execute a Regular Expression - DFA engine *
3150     *************************************************/
3151    
3152     /* This external function applies a compiled re to a subject string using a DFA
3153     engine. This function calls the internal function multiple times if the pattern
3154     is not anchored.
3155    
3156     Arguments:
3157     argument_re points to the compiled expression
3158 ph10 97 extra_data points to extra data or is NULL
3159 nigel 77 subject points to the subject string
3160     length length of subject string (may contain binary zeros)
3161     start_offset where to start in the subject string
3162     options option bits
3163     offsets vector of match offsets
3164     offsetcount size of same
3165     workspace workspace vector
3166     wscount size of same
3167    
3168     Returns: > 0 => number of match offset pairs placed in offsets
3169     = 0 => offsets overflowed; longest matches are present
3170     -1 => failed to match
3171     < -1 => some kind of unexpected problem
3172     */
3173    
3174 chpe 1055 #if defined COMPILE_PCRE8
3175 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3176 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
3177     const char *subject, int length, int start_offset, int options, int *offsets,
3178     int offsetcount, int *workspace, int wscount)
3179 chpe 1055 #elif defined COMPILE_PCRE16
3180 ph10 836 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3181 zherczeg 852 pcre16_dfa_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
3182 ph10 836 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
3183     int offsetcount, int *workspace, int wscount)
3184 chpe 1055 #elif defined COMPILE_PCRE32
3185     PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
3186     pcre32_dfa_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
3187     PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
3188     int offsetcount, int *workspace, int wscount)
3189 ph10 836 #endif
3190 nigel 77 {
3191 zherczeg 852 REAL_PCRE *re = (REAL_PCRE *)argument_re;
3192 nigel 77 dfa_match_data match_block;
3193 nigel 91 dfa_match_data *md = &match_block;
3194 ph10 836 BOOL utf, anchored, startline, firstline;
3195     const pcre_uchar *current_subject, *end_subject;
3196 nigel 77 const pcre_study_data *study = NULL;
3197    
3198 ph10 836 const pcre_uchar *req_char_ptr;
3199     const pcre_uint8 *start_bits = NULL;
3200     BOOL has_first_char = FALSE;
3201     BOOL has_req_char = FALSE;
3202     pcre_uchar first_char = 0;
3203     pcre_uchar first_char2 = 0;
3204     pcre_uchar req_char = 0;
3205     pcre_uchar req_char2 = 0;
3206 nigel 91 int newline;
3207 nigel 77
3208     /* Plausibility checks */
3209    
3210     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3211     if (re == NULL || subject == NULL || workspace == NULL ||
3212     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3213     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3214     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
3215 ph10 1189 if (length < 0) return PCRE_ERROR_BADLENGTH;
3216 ph10 567 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
3217 nigel 77
3218 ph10 960 /* Check that the first field in the block is the magic number. If it is not,
3219     return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
3220     REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
3221     means that the pattern is likely compiled with different endianness. */
3222 nigel 77
3223 ph10 960 if (re->magic_number != MAGIC_NUMBER)
3224     return re->magic_number == REVERSED_MAGIC_NUMBER?
3225     PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
3226     if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
3227    
3228 ph10 975 /* If restarting after a partial match, do some sanity checks on the contents
3229 ph10 960 of the workspace. */
3230    
3231     if ((options & PCRE_DFA_RESTART) != 0)
3232     {
3233 ph10 975 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3234 ph10 960 workspace[1] > (wscount - 2)/INTS_PER_STATEBLOCK)
3235 ph10 975 return PCRE_ERROR_DFA_BADRESTART;
3236     }
3237 ph10 960
3238     /* Set up study, callout, and table data */
3239    
3240 nigel 91 md->tables = re->tables;
3241     md->callout_data = NULL;
3242 nigel 77
3243     if (extra_data != NULL)
3244     {
3245 ph10 1477 unsigned long int flags = extra_data->flags;
3246 nigel 77 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3247     study = (const pcre_study_data *)extra_data->study_data;
3248     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
3249 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3250     return PCRE_ERROR_DFA_UMLIMIT;
3251 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3252 nigel 91 md->callout_data = extra_data->callout_data;
3253 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
3254 nigel 91 md->tables = extra_data->tables;
3255 nigel 77 }
3256 ph10 461
3257 nigel 77 /* Set some local values */
3258    
3259 ph10 836 current_subject = (const pcre_uchar *)subject + start_offset;
3260     end_subject = (const pcre_uchar *)subject + length;
3261     req_char_ptr = current_subject - 1;
3262 nigel 77
3263 ph10 836 #ifdef SUPPORT_UTF
3264 chpe 1055 /* PCRE_UTF(16|32) have the same value as PCRE_UTF8. */
3265 ph10 836 utf = (re->options & PCRE_UTF8) != 0;
3266 nigel 91 #else
3267 ph10 836 utf = FALSE;
3268 nigel 91 #endif
3269 nigel 77
3270 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
3271     (re->options & PCRE_ANCHORED) != 0;
3272    
3273 nigel 77 /* The remaining fixed data for passing around. */
3274    
3275 ph10 836 md->start_code = (const pcre_uchar *)argument_re +
3276 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
3277 ph10 836 md->start_subject = (const pcre_uchar *)subject;
3278 nigel 91 md->end_subject = end_subject;
3279 ph10 442 md->start_offset = start_offset;
3280 nigel 91 md->moptions = options;
3281     md->poptions = re->options;
3282 nigel 77
3283 ph10 231 /* If the BSR option is not set at match time, copy what was set
3284     at compile time. */
3285    
3286     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
3287     {
3288     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
3289     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
3290     #ifdef BSR_ANYCRLF
3291     else md->moptions |= PCRE_BSR_ANYCRLF;
3292 ph10 243 #endif
3293     }
3294 ph10 231
3295 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3296     nothing is set at run time, whatever was used at compile time applies. */
3297 nigel 91
3298 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3299 nigel 93 PCRE_NEWLINE_BITS)
3300 nigel 91 {
3301 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3302 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
3303     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
3304 nigel 91 case PCRE_NEWLINE_CR+
3305 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
3306 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3307 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3308 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3309 nigel 91 }
3310    
3311 ph10 149 if (newline == -2)
3312 nigel 91 {
3313 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3314     }
3315     else if (newline < 0)
3316     {
3317 nigel 93 md->nltype = NLTYPE_ANY;
3318 nigel 91 }
3319     else
3320     {
3321 nigel 93 md->nltype = NLTYPE_FIXED;
3322     if (newline > 255)
3323     {
3324     md->nllen = 2;
3325     md->nl[0] = (newline >> 8) & 255;
3326     md->nl[1] = newline & 255;
3327     }
3328     else
3329     {
3330     md->nllen = 1;
3331     md->nl[0] = newline;
3332     }
3333 nigel 91 }
3334    
3335 nigel 77 /* Check a UTF string if required. If it is invalid and the offsets vector has
3336     room for two items, the error offset and error code are passed back in it. */
3337    
3338 ph10 836 #ifdef SUPPORT_UTF
3339     if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
3340 nigel 77 {
3341 ph10 654 int erroroffset;
3342 ph10 836 int errorcode = PRIV(valid_utf)((pcre_uchar *)subject, length, &erroroffset);
3343 ph10 606 if (errorcode != 0)
3344 ph10 598 {
3345     if (offsetcount >= 2)
3346     {
3347 ph10 606 offsets[0] = erroroffset;
3348 ph10 598 offsets[1] = errorcode;
3349 ph10 654 }
3350 chpe 1055 #if defined COMPILE_PCRE8
3351     return (errorcode <= PCRE_UTF8_ERR5 && (options & PCRE_PARTIAL_HARD) != 0) ?
3352 ph10 569 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
3353 chpe 1055 #elif defined COMPILE_PCRE16
3354     return (errorcode <= PCRE_UTF16_ERR1 && (options & PCRE_PARTIAL_HARD) != 0) ?
3355     PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
3356     #elif defined COMPILE_PCRE32
3357     return PCRE_ERROR_BADUTF32;
3358     #endif
3359 ph10 654 }
3360 chpe 1055 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
3361 ph10 606 if (start_offset > 0 && start_offset < length &&
3362 ph10 836 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
3363 ph10 606 return PCRE_ERROR_BADUTF8_OFFSET;
3364 chpe 1055 #endif
3365 nigel 77 }
3366     #endif
3367    
3368     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3369     is a feature that makes it possible to save compiled regex and re-use them
3370     in other programs later. */
3371    
3372 ph10 836 if (md->tables == NULL) md->tables = PRIV(default_tables);
3373 nigel 77
3374 ph10 881 /* The "must be at the start of a line" flags are used in a loop when finding
3375     where to start. */
3376 nigel 77
3377 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
3378 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3379    
3380     /* Set up the first character to match, if available. The first_char value is
3381     never set for an anchored regular expression, but the anchoring may be forced
3382     at run time, so we have to test for anchoring. The first char may be unset for
3383     an unanchored pattern, of course. If there's no first char and the pattern was
3384     studied, there may be a bitmap of possible first characters. */
3385    
3386     if (!anchored)
3387     {
3388 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
3389 nigel 77 {
3390 ph10 836 has_first_char = TRUE;
3391 ph10 904 first_char = first_char2 = (pcre_uchar)(re->first_char);
3392 ph10 836 if ((re->flags & PCRE_FCH_CASELESS) != 0)
3393     {
3394     first_char2 = TABLE_GET(first_char, md->tables + fcc_offset, first_char);
3395     #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3396     if (utf && first_char > 127)
3397     first_char2 = UCD_OTHERCASE(first_char);
3398     #endif
3399     }
3400 nigel 77 }
3401     else
3402     {
3403 ph10 455 if (!startline && study != NULL &&
3404     (study->flags & PCRE_STUDY_MAPPED) != 0)
3405 nigel 77 start_bits = study->start_bits;
3406     }
3407     }
3408    
3409     /* For anchored or unanchored matches, there may be a "last known required
3410     character" set. */
3411    
3412 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
3413 nigel 77 {
3414 ph10 836 has_req_char = TRUE;
3415 ph10 904 req_char = req_char2 = (pcre_uchar)(re->req_char);
3416 ph10 836 if ((re->flags & PCRE_RCH_CASELESS) != 0)
3417     {
3418     req_char2 = TABLE_GET(req_char, md->tables + fcc_offset, req_char);
3419     #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
3420     if (utf && req_char > 127)
3421     req_char2 = UCD_OTHERCASE(req_char);
3422     #endif
3423     }
3424 nigel 77 }
3425    
3426     /* Call the main matching function, looping for a non-anchored regex after a
3427 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
3428     a match. */
3429 nigel 77
3430     for (;;)
3431     {
3432     int rc;
3433    
3434     if ((options & PCRE_DFA_RESTART) == 0)
3435     {
3436 ph10 836 const pcre_uchar *save_end_subject = end_subject;
3437 nigel 77
3438 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
3439     line of a multiline string. Implement this by temporarily adjusting
3440     end_subject so that we stop scanning at a newline. If the match fails at
3441     the newline, later code breaks this loop. */
3442 nigel 77
3443     if (firstline)
3444     {
3445 ph10 836 PCRE_PUCHAR t = current_subject;
3446     #ifdef SUPPORT_UTF
3447     if (utf)
3448 ph10 371 {
3449     while (t < md->end_subject && !IS_NEWLINE(t))
3450 ph10 365 {
3451     t++;
3452 ph10 836 ACROSSCHAR(t < end_subject, *t, t++);
3453 ph10 371 }
3454 ph10 365 }
3455     else
3456 ph10 371 #endif
3457 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3458 nigel 77 end_subject = t;
3459     }
3460 ph10 392
3461 ph10 389 /* There are some optimizations that avoid running the match if a known
3462 ph10 455 starting point is not found. However, there is an option that disables
3463 ph10 579 these, for testing and for ensuring that all callouts do actually occur.
3464 ph10 576 The option can be set in the regex by (*NO_START_OPT) or passed in
3465     match-time options. */
3466 nigel 77
3467 ph10 576 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
3468 ph10 392 {
3469 ph10 1430 /* Advance to a known first pcre_uchar (i.e. data item) */
3470 ph10 392
3471 ph10 836 if (has_first_char)
3472 nigel 77 {
3473 ph10 836 if (first_char != first_char2)
3474 chpe 1100 {
3475     pcre_uchar csc;
3476 ph10 389 while (current_subject < end_subject &&
3477 ph10 1431 (csc = UCHAR21TEST(current_subject)) != first_char && csc != first_char2)
3478 ph10 389 current_subject++;
3479 chpe 1100 }
3480 ph10 389 else
3481 ph10 392 while (current_subject < end_subject &&
3482 ph10 1431 UCHAR21TEST(current_subject) != first_char)
3483 ph10 389 current_subject++;
3484     }
3485 ph10 392
3486 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
3487 ph10 392
3488 ph10 389 else if (startline)
3489     {
3490     if (current_subject > md->start_subject + start_offset)
3491     {
3492 ph10 836 #ifdef SUPPORT_UTF
3493     if (utf)
3494 ph10 365 {
3495 ph10 392 while (current_subject < end_subject &&
3496 ph10 389 !WAS_NEWLINE(current_subject))
3497     {
3498 ph10 365 current_subject++;
3499 ph10 836 ACROSSCHAR(current_subject < end_subject, *current_subject,
3500     current_subject++);
3501 ph10 389 }
3502 ph10 371 }
3503 ph10 389 else
3504     #endif
3505     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
3506     current_subject++;
3507 ph10 392
3508 ph10 389 /* If we have just passed a CR and the newline option is ANY or
3509     ANYCRLF, and we are now at a LF, advance the match position by one
3510     more character. */
3511 ph10 392
3512 ph10 1431 if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
3513 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
3514 ph10 1431 current_subject < end_subject &&
3515     UCHAR21TEST(current_subject) == CHAR_NL)
3516 ph10 389 current_subject++;
3517 ph10 365 }
3518 nigel 77 }
3519 ph10 392
3520 ph10 1430 /* Advance to a non-unique first pcre_uchar after study */
3521 ph10 392
3522 ph10 389 else if (start_bits != NULL)
3523 nigel 77 {
3524 ph10 389 while (current_subject < end_subject)
3525     {
3526 ph10 1431 register pcre_uint32 c = UCHAR21TEST(current_subject);
3527 ph10 836 #ifndef COMPILE_PCRE8
3528     if (c > 255) c = 255;
3529     #endif
3530 ph10 1430 if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
3531     current_subject++;
3532 ph10 389 }
3533 nigel 77 }
3534 ph10 392 }
3535 nigel 77
3536     /* Restore fudged end_subject */
3537    
3538     end_subject = save_end_subject;
3539    
3540 ph10 461 /* The following two optimizations are disabled for partial matching or if
3541     disabling is explicitly requested (and of course, by the test above, this
3542 ph10 455 code is not obeyed when restarting after a partial match). */
3543 ph10 461
3544 ph10 728 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 &&
3545 ph10 455 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
3546 ph10 461 {
3547 ph10 455 /* If the pattern was studied, a minimum subject length may be set. This
3548     is a lower bound; no actual string of that length may actually match the
3549     pattern. Although the value is, strictly, in characters, we treat it as
3550 ph10 1430 in pcre_uchar units to avoid spending too much time in this optimization.
3551     */
3552 nigel 77
3553 ph10 455 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3554 ph10 476 (pcre_uint32)(end_subject - current_subject) < study->minlength)
3555 ph10 455 return PCRE_ERROR_NOMATCH;
3556 ph10 461
3557 ph10 1430 /* If req_char is set, we know that that pcre_uchar must appear in the
3558     subject for the match to succeed. If the first pcre_uchar is set,
3559     req_char must be later in the subject; otherwise the test starts at the
3560     match point. This optimization can save a huge amount of work in patterns
3561     with nested unlimited repeats that aren't going to match. Writing
3562     separate code for cased/caseless versions makes it go faster, as does
3563     using an autoincrement and backing off on a match.
3564 ph10 461
3565 ph10 455 HOWEVER: when the subject string is very, very long, searching to its end
3566     can take a long time, and give bad performance on quite ordinary
3567     patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3568     string... so we don't do this when the string is sufficiently long. */
3569 ph10 461
3570 ph10 836 if (has_req_char && end_subject - current_subject < REQ_BYTE_MAX)
3571 nigel 77 {
3572 ph10 836 register PCRE_PUCHAR p = current_subject + (has_first_char? 1:0);
3573 ph10 461
3574 ph10 455 /* We don't need to repeat the search if we haven't yet reached the
3575     place we found it at last time. */
3576 ph10 461
3577 ph10 836 if (p > req_char_ptr)
3578 nigel 77 {
3579 ph10 836 if (req_char != req_char2)
3580 ph10 455 {
3581     while (p < end_subject)
3582     {
3583 ph10 1431 register pcre_uint32 pp = UCHAR21INCTEST(p);
3584 ph10 836 if (pp == req_char || pp == req_char2) { p--; break; }
3585 ph10 455 }
3586     }
3587     else
3588     {
3589     while (p < end_subject)
3590     {
3591 ph10 1431 if (UCHAR21INCTEST(p) == req_char) { p--; break; }
3592 ph10 455 }
3593     }
3594 ph10 461
3595 ph10 1430 /* If we can't find the required pcre_uchar, break the matching loop,
3596 ph10 455 which will cause a return of PCRE_ERROR_NOMATCH. */
3597 ph10 461
3598 ph10 455 if (p >= end_subject) break;
3599 ph10 461
3600 ph10 1430 /* If we have found the required pcre_uchar, save the point where we
3601 ph10 455 found it, so that we don't search again next time round the loop if
3602 ph10 1430 the start hasn't passed this point yet. */
3603 ph10 461
3604 ph10 836 req_char_ptr = p;
3605 nigel 77 }
3606 ph10 461 }
3607 nigel 77 }
3608 ph10 455 } /* End of optimizations that are done when not restarting */
3609 nigel 77
3610     /* OK, now we can do the business */
3611    
3612 ph10 435 md->start_used_ptr = current_subject;
3613 ph10 654 md->recursive = NULL;
3614 ph10 461
3615 nigel 77 rc = internal_dfa_exec(
3616 nigel 91 md, /* fixed match data */
3617     md->start_code, /* this subexpression's code */
3618     current_subject, /* where we currently are */
3619     start_offset, /* start offset in subject */
3620     offsets, /* offset vector */
3621     offsetcount, /* size of same */
3622     workspace, /* workspace vector */
3623     wscount, /* size of same */
3624 ph10 642 0); /* function recurse level */
3625 nigel 77
3626     /* Anything other than "no match" means we are done, always; otherwise, carry
3627     on only if not anchored. */
3628    
3629 ph10 1320 if (rc != PCRE_ERROR_NOMATCH || anchored)
3630 ph10 1251 {
3631     if (rc == PCRE_ERROR_PARTIAL && offsetcount >= 2)
3632     {
3633     offsets[0] = (int)(md->start_used_ptr - (PCRE_PUCHAR)subject);
3634     offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
3635 ph10 1320 if (offsetcount > 2)
3636 ph10 1251 offsets[2] = (int)(current_subject - (PCRE_PUCHAR)subject);
3637     }
3638     return rc;
3639 ph10 1320 }
3640 nigel 77
3641     /* Advance to the next subject character unless we are at the end of a line
3642     and firstline is set. */
3643    
3644 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
3645 nigel 77 current_subject++;
3646 ph10 836 #ifdef SUPPORT_UTF
3647     if (utf)
3648 nigel 77 {
3649 ph10 836 ACROSSCHAR(current_subject < end_subject, *current_subject,
3650     current_subject++);
3651 nigel 77 }
3652 ph10 836 #endif
3653 nigel 77 if (current_subject > end_subject) break;
3654    
3655 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
3656 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
3657     or ANY or ANYCRLF, advance the match position by one more character. */
3658 nigel 93
3659 ph10 1431 if (UCHAR21TEST(current_subject - 1) == CHAR_CR &&
3660 ph10 226 current_subject < end_subject &&
3661 ph10 1431 UCHAR21TEST(current_subject) == CHAR_NL &&
3662 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
3663 ph10 226 (md->nltype == NLTYPE_ANY ||
3664     md->nltype == NLTYPE_ANYCRLF ||
3665     md->nllen == 2))
3666 nigel 93 current_subject++;
3667    
3668     } /* "Bumpalong" loop */
3669    
3670 nigel 77 return PCRE_ERROR_NOMATCH;
3671     }
3672    
3673     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12