/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 230 - (hide annotations) (download)
Mon Sep 10 13:23:56 2007 UTC (7 years, 1 month ago) by ph10
Original Path: code/trunk/pcre_dfa_exec.c
File MIME type: text/plain
File size: 94416 byte(s)
(1) Move internal flags out of the options field, to make room.
(2) \r and \n must be explicit to trigger the special CRLF handling exception.
(3) (?J) at the start now sets JCHANGED as well as DUPNAMES.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 ph10 200 #ifdef HAVE_CONFIG_H
48 ph10 199 #include <config.h>
49 ph10 200 #endif
50 ph10 199
51 nigel 93 #define NLBLOCK md /* Block containing newline information */
52     #define PSSTART start_subject /* Field containing processed string start */
53     #define PSEND end_subject /* Field containing processed string end */
54    
55 nigel 77 #include "pcre_internal.h"
56    
57    
58     /* For use to indent debugging output */
59    
60     #define SP " "
61    
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
68     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
70 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
71 ph10 178 never stored, so we push them well clear of the normal opcodes. */
72 nigel 77
73 ph10 178 #define OP_PROP_EXTRA 300
74     #define OP_EXTUNI_EXTRA 320
75     #define OP_ANYNL_EXTRA 340
76     #define OP_HSPACE_EXTRA 360
77     #define OP_VSPACE_EXTRA 380
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes it possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified. */
86 nigel 77
87     static uschar coptable[] = {
88     0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 nigel 77 0, 0, /* Any, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
134     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 ph10 211 0, 0 /* FAIL, ACCEPT */
136 nigel 77 };
137    
138     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139     and \w */
140    
141     static uschar toptable1[] = {
142 ph10 168 0, 0, 0, 0, 0, 0,
143 nigel 77 ctype_digit, ctype_digit,
144     ctype_space, ctype_space,
145     ctype_word, ctype_word,
146     0 /* OP_ANY */
147     };
148    
149     static uschar toptable2[] = {
150 ph10 168 0, 0, 0, 0, 0, 0,
151 nigel 77 ctype_digit, 0,
152     ctype_space, 0,
153     ctype_word, 0,
154     1 /* OP_ANY */
155     };
156    
157    
158     /* Structure for holding data about a particular state, which is in effect the
159     current data for an active path through the match tree. It must consist
160     entirely of ints because the working vector we are passed, and which we put
161     these structures in, is a vector of ints. */
162    
/* Data for one active path through the match tree. Every member is an int
because these blocks are overlaid on a workspace that is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The result of the sizeof
division has type size_t; it is cast to int so that arithmetic which mixes it
with signed int counts (such as the workspace size) is carried out in signed
arithmetic instead of being silently converted to unsigned. The whole
expansion is parenthesised so that it is safe inside larger expressions. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
171    
172    
173     #ifdef DEBUG
174     /*************************************************
175     * Print character string *
176     *************************************************/
177    
178     /* Character string printing function for debugging.
179    
180     Arguments:
181     p points to string
182     length number of bytes
183     f where to print
184    
185     Returns: nothing
186     */
187    
/* Debugging aid: write length bytes starting at p to the stream f. A byte for
which isprint() is true is written as itself; any other byte is written as
\x followed by two lower-case hexadecimal digits. Nothing is written when
length is zero or negative. The string is only read, so p points to const
data; callers holding a const pointer no longer need to cast it away. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
int c;
while (length-- > 0)
  {
  c = *p++;                       /* unsigned char value: safe for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
200     #endif
201    
202    
203    
204     /*************************************************
205     * Execute a Regular Expression - DFA engine *
206     *************************************************/
207    
208     /* This internal function applies a compiled pattern to a subject string,
209     starting at a given point, using a DFA engine. This function is called from the
210     external one, possibly multiple times if the pattern is not anchored. The
211     function calls itself recursively for some kinds of subpattern.
212    
213     Arguments:
214     md the match_data block with fixed information
215     this_start_code the opening bracket of this subexpression's code
216     current_subject where we currently are in the subject string
217     start_offset start offset in the subject string
218     offsets vector to contain the matching string offsets
219     offsetcount size of same
220     workspace vector of workspace
221     wscount size of same
222     ims the current ims flags
223     rlevel function call recursion level
224     recursing regex recursive call level
225    
226     Returns: > 0 => number of match offset pairs placed in offsets
227     = 0 => offsets overflowed; longest matches are present
228     -1 => failed to match
229     < -1 => some kind of unexpected problem
230    
231     The following macros are used for adding states to the two state vectors (one
232     for the current character, one for the following character). */
233    
/* ADD_ACTIVE(x,y): append a state with opcode offset x and repeat count y,
tagged with the current ims value, to the active state list. If the list
already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE instead. The data member of the new state is not
written. The macro relies on active_count, wscount, next_active_state, ims and
rlevel being in scope where it is used. It is wrapped in do { } while (0) so
that it expands to exactly one statement and can be used safely as the
unbraced body of an if/else. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
244    
/* ADD_ACTIVE_DATA(x,y,z): append a state with opcode offset x, repeat count
y and extra data z, tagged with the current ims value, to the active state
list. If the list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE instead. The macro relies on active_count,
wscount, next_active_state, ims and rlevel being in scope where it is used. It
is wrapped in do { } while (0) so that it expands to exactly one statement. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
256    
/* ADD_NEW(x,y): append a state with opcode offset x and repeat count y,
tagged with the current ims value, to the new state list (the list for the
following character). If the list already holds wscount states, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE instead. The data member of the new
state is not written. The macro relies on new_count, wscount, next_new_state,
ims and rlevel being in scope where it is used. It is wrapped in
do { } while (0) so that it expands to exactly one statement. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
267    
/* ADD_NEW_DATA(x,y,z): append a state with opcode offset x, repeat count y
and extra data z, tagged with the current ims value, to the new state list
(the list for the following character). If the list already holds wscount
states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE instead. The
macro relies on new_count, wscount, next_new_state, ims and rlevel being in
scope where it is used. It is wrapped in do { } while (0) so that it expands
to exactly one statement. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
279    
280     /* And now, here is the code */
281    
282     static int
283     internal_dfa_exec(
284     dfa_match_data *md,
285     const uschar *this_start_code,
286     const uschar *current_subject,
287     int start_offset,
288     int *offsets,
289     int offsetcount,
290     int *workspace,
291     int wscount,
292     int ims,
293     int rlevel,
294     int recursing)
295     {
296     stateblock *active_states, *new_states, *temp_states;
297     stateblock *next_active_state, *next_new_state;
298    
299     const uschar *ctypes, *lcc, *fcc;
300     const uschar *ptr;
301 nigel 93 const uschar *end_code, *first_op;
302 nigel 77
303     int active_count, new_count, match_count;
304    
305     /* Some fields in the md block are frequently referenced, so we load them into
306     independent variables in the hope that this will perform better. */
307    
308     const uschar *start_subject = md->start_subject;
309     const uschar *end_subject = md->end_subject;
310     const uschar *start_code = md->start_code;
311    
312 nigel 87 #ifdef SUPPORT_UTF8
313 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 nigel 93 #else
315     BOOL utf8 = FALSE;
316 nigel 87 #endif
317 nigel 77
318     rlevel++;
319     offsetcount &= (-2);
320    
321     wscount -= 2;
322     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323     (2 * INTS_PER_STATEBLOCK);
324    
325     DPRINTF(("\n%.*s---------------------\n"
326     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328    
329     ctypes = md->tables + ctypes_offset;
330     lcc = md->tables + lcc_offset;
331     fcc = md->tables + fcc_offset;
332    
333     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334    
335     active_states = (stateblock *)(workspace + 2);
336     next_new_state = new_states = active_states + wscount;
337     new_count = 0;
338    
339 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
340     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343     the alternative states onto the list, and find out where the end is. This
344     makes it possible to use this function recursively, when we want to stop at a
345     matching internal ket rather than at the end.
346    
347     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348     a backward assertion. In that case, we have to find out the maximum amount to
349     move back, and set up each alternative appropriately. */
350    
351 nigel 93 if (*first_op == OP_REVERSE)
352 nigel 77 {
353     int max_back = 0;
354     int gone_back;
355    
356     end_code = this_start_code;
357     do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363     while (*end_code == OP_ALT);
364    
365     /* If we can't go back the amount required for the longest lookbehind
366     pattern, go back as far as we can; some alternatives may still be viable. */
367    
368     #ifdef SUPPORT_UTF8
369     /* In character mode we have to step back character by character */
370    
371     if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374     {
375     if (current_subject <= start_subject) break;
376     current_subject--;
377     while (current_subject > start_subject &&
378     (*current_subject & 0xc0) == 0x80)
379     current_subject--;
380     }
381     }
382     else
383     #endif
384    
385     /* In byte-mode we can do this quickly. */
386    
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389     current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392    
393     /* Now we can process the individual branches. */
394    
395     end_code = this_start_code;
396     do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400     {
401     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402     ADD_NEW_DATA(-bstate, 0, gone_back - back);
403     }
404     end_code += GET(end_code, 1);
405     }
406     while (*end_code == OP_ALT);
407     }
408    
409     /* This is the code for a "normal" subpattern (not a backward assertion). The
410     start of a whole pattern is always one of these. If we are at the top level,
411     we may be asked to restart matching from the same point that we reached for a
412     previous partial match. We still have to scan through the top-level branches to
413     find the end state. */
414    
415     else
416     {
417     end_code = this_start_code;
418    
419     /* Restarting */
420    
421     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426     memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428    
429     /* Not restarting */
430    
431     else
432     {
433 nigel 93 int length = 1 + LINK_SIZE +
434     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 nigel 77 do
436     {
437 nigel 93 ADD_NEW(end_code - start_code + length, 0);
438 nigel 77 end_code += GET(end_code, 1);
439 nigel 93 length = 1 + LINK_SIZE;
440 nigel 77 }
441     while (*end_code == OP_ALT);
442     }
443     }
444    
445     workspace[0] = 0; /* Bit indicating which vector is current */
446    
447     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448    
449     /* Loop for scanning the subject */
450    
451     ptr = current_subject;
452     for (;;)
453     {
454     int i, j;
455 nigel 91 int clen, dlen;
456     unsigned int c, d;
457 nigel 77
458     /* Make the new state list into the active state list and empty the
459     new state list. */
460    
461     temp_states = active_states;
462     active_states = new_states;
463     new_states = temp_states;
464     active_count = new_count;
465     new_count = 0;
466    
467     workspace[0] ^= 1; /* Remember for the restarting feature */
468     workspace[1] = active_count;
469    
470     #ifdef DEBUG
471     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473     printf("\"\n");
474    
475     printf("%.*sActive states: ", rlevel*2-2, SP);
476     for (i = 0; i < active_count; i++)
477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
478     printf("\n");
479     #endif
480    
481     /* Set the pointers for adding new states */
482    
483     next_active_state = active_states + active_count;
484     next_new_state = new_states;
485    
486     /* Load the current character from the subject outside the loop, as many
487     different states may want to look at it, and we assume that at least one
488     will. */
489    
490     if (ptr < end_subject)
491     {
492 nigel 93 clen = 1; /* Number of bytes in the character */
493 nigel 77 #ifdef SUPPORT_UTF8
494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
495     #endif /* SUPPORT_UTF8 */
496     c = *ptr;
497     }
498     else
499     {
500 nigel 93 clen = 0; /* This indicates the end of the subject */
501     c = NOTACHAR; /* This value should never actually be used */
502 nigel 77 }
503    
504     /* Scan up the active states and act on each one. The result of an action
505     may be to add more states to the currently active list (e.g. on hitting a
506     parenthesis) or it may be to put states on the new list, for considering
507     when we move the character pointer on. */
508    
509     for (i = 0; i < active_count; i++)
510     {
511     stateblock *current_state = active_states + i;
512     const uschar *code;
513     int state_offset = current_state->offset;
514     int count, codevalue;
515 ph10 152 #ifdef SUPPORT_UCP
516 nigel 87 int chartype, script;
517 ph10 152 #endif
518 nigel 77
519     #ifdef DEBUG
520     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
521 nigel 93 if (clen == 0) printf("EOL\n");
522 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
523     else printf("0x%02x\n", c);
524     #endif
525    
526     /* This variable is referred to implicitly in the ADD_xxx macros. */
527    
528     ims = current_state->ims;
529    
530     /* A negative offset is a special case meaning "hold off going to this
531     (negated) state until the number of characters in the data field have
532     been skipped". */
533    
534     if (state_offset < 0)
535     {
536     if (current_state->data > 0)
537     {
538     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
539     ADD_NEW_DATA(state_offset, current_state->count,
540     current_state->data - 1);
541     continue;
542     }
543     else
544     {
545     current_state->offset = state_offset = -state_offset;
546     }
547     }
548    
549     /* Check for a duplicate state with the same count, and skip if found. */
550    
551     for (j = 0; j < i; j++)
552     {
553     if (active_states[j].offset == state_offset &&
554     active_states[j].count == current_state->count)
555     {
556     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
557     goto NEXT_ACTIVE_STATE;
558     }
559     }
560    
561     /* The state offset is the offset to the opcode */
562    
563     code = start_code + state_offset;
564     codevalue = *code;
565    
566     /* If this opcode is followed by an inline character, load it. It is
567     tempting to test for the presence of a subject character here, but that
568     is wrong, because sometimes zero repetitions of the subject are
569     permitted.
570    
571     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572 ph10 178 argument that is not a data character - but is always one byte long. We
573     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
574     this case. To keep the other cases fast, convert these ones to new opcodes.
575     */
576 nigel 77
577     if (coptable[codevalue] > 0)
578     {
579     dlen = 1;
580     #ifdef SUPPORT_UTF8
581     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
582     #endif /* SUPPORT_UTF8 */
583     d = code[coptable[codevalue]];
584     if (codevalue >= OP_TYPESTAR)
585     {
586 nigel 93 switch(d)
587     {
588     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
589     case OP_NOTPROP:
590     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
593 ph10 178 case OP_NOT_HSPACE:
594 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
595 ph10 178 case OP_NOT_VSPACE:
596 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597 nigel 93 default: break;
598     }
599 nigel 77 }
600     }
601     else
602     {
603     dlen = 0; /* Not strictly necessary, but compilers moan */
604 nigel 93 d = NOTACHAR; /* if these variables are not set. */
605 nigel 77 }
606    
607    
608     /* Now process the individual opcodes */
609    
610     switch (codevalue)
611     {
612    
613     /* ========================================================================== */
614     /* Reached a closing bracket. If not at the end of the pattern, carry
615     on with the next opcode. Otherwise, unless we have an empty string and
616     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
617     matches so we always have the longest first. */
618    
619     case OP_KET:
620     case OP_KETRMIN:
621     case OP_KETRMAX:
622     if (code != end_code)
623     {
624     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
625     if (codevalue != OP_KET)
626     {
627     ADD_ACTIVE(state_offset - GET(code, 1), 0);
628     }
629     }
630     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
631     {
632     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
633     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
634     match_count = 0;
635     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
636     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
637     if (offsetcount >= 2)
638     {
639     offsets[0] = current_subject - start_subject;
640     offsets[1] = ptr - start_subject;
641     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
642     offsets[1] - offsets[0], current_subject));
643     }
644     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
645     {
646     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
647     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
648     match_count, rlevel*2-2, SP));
649     return match_count;
650     }
651     }
652     break;
653    
654     /* ========================================================================== */
655     /* These opcodes add to the current list of states without looking
656     at the current character. */
657    
658     /*-----------------------------------------------------------------*/
659     case OP_ALT:
660     do { code += GET(code, 1); } while (*code == OP_ALT);
661     ADD_ACTIVE(code - start_code, 0);
662     break;
663    
664     /*-----------------------------------------------------------------*/
665     case OP_BRA:
666 nigel 93 case OP_SBRA:
667 nigel 77 do
668     {
669     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
670     code += GET(code, 1);
671     }
672     while (*code == OP_ALT);
673     break;
674    
675     /*-----------------------------------------------------------------*/
676 nigel 93 case OP_CBRA:
677     case OP_SCBRA:
678     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
679     code += GET(code, 1);
680     while (*code == OP_ALT)
681     {
682     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
683     code += GET(code, 1);
684     }
685     break;
686    
687     /*-----------------------------------------------------------------*/
688 nigel 77 case OP_BRAZERO:
689     case OP_BRAMINZERO:
690     ADD_ACTIVE(state_offset + 1, 0);
691     code += 1 + GET(code, 2);
692     while (*code == OP_ALT) code += GET(code, 1);
693     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
694     break;
695    
696     /*-----------------------------------------------------------------*/
697     case OP_CIRC:
698     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
699 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
700     ptr != end_subject &&
701 nigel 93 WAS_NEWLINE(ptr)))
702 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
703     break;
704    
705     /*-----------------------------------------------------------------*/
706     case OP_EOD:
707     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
708     break;
709    
710     /*-----------------------------------------------------------------*/
711     case OP_OPT:
712     ims = code[1];
713     ADD_ACTIVE(state_offset + 2, 0);
714     break;
715    
716     /*-----------------------------------------------------------------*/
717     case OP_SOD:
718     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
719     break;
720    
721     /*-----------------------------------------------------------------*/
722     case OP_SOM:
723     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
724     break;
725    
726    
727     /* ========================================================================== */
728     /* These opcodes inspect the next subject character, and sometimes
729     the previous one as well, but do not have an argument. The variable
730     clen contains the length of the current character and is zero if we are
731     at the end of the subject. */
732    
733     /*-----------------------------------------------------------------*/
734     case OP_ANY:
735 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
736 nigel 77 { ADD_NEW(state_offset + 1, 0); }
737     break;
738    
739     /*-----------------------------------------------------------------*/
740     case OP_EODN:
741 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
742 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
743     break;
744    
745     /*-----------------------------------------------------------------*/
746     case OP_DOLL:
747     if ((md->moptions & PCRE_NOTEOL) == 0)
748     {
749 nigel 91 if (clen == 0 ||
750 nigel 93 (IS_NEWLINE(ptr) &&
751 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
752     ))
753 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
754     }
755 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
756 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
757     break;
758    
759     /*-----------------------------------------------------------------*/
760    
761     case OP_DIGIT:
762     case OP_WHITESPACE:
763     case OP_WORDCHAR:
764     if (clen > 0 && c < 256 &&
765     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
766     { ADD_NEW(state_offset + 1, 0); }
767     break;
768    
769     /*-----------------------------------------------------------------*/
770     case OP_NOT_DIGIT:
771     case OP_NOT_WHITESPACE:
772     case OP_NOT_WORDCHAR:
773     if (clen > 0 && (c >= 256 ||
774     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
775     { ADD_NEW(state_offset + 1, 0); }
776     break;
777    
778     /*-----------------------------------------------------------------*/
779     case OP_WORD_BOUNDARY:
780     case OP_NOT_WORD_BOUNDARY:
781     {
782     int left_word, right_word;
783    
784     if (ptr > start_subject)
785     {
786     const uschar *temp = ptr - 1;
787     #ifdef SUPPORT_UTF8
788     if (utf8) BACKCHAR(temp);
789     #endif
790     GETCHARTEST(d, temp);
791     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
792     }
793     else left_word = 0;
794    
795     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
796     else right_word = 0;
797    
798     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
799     { ADD_ACTIVE(state_offset + 1, 0); }
800     }
801     break;
802    
803    
804     /*-----------------------------------------------------------------*/
805     /* Check the next character by Unicode property. We will get here only
806     if the support is in the binary; otherwise a compile-time error occurs.
807     */
808    
809 ph10 151 #ifdef SUPPORT_UCP
810 nigel 77 case OP_PROP:
811     case OP_NOTPROP:
812     if (clen > 0)
813     {
814 nigel 87 BOOL OK;
815     int category = _pcre_ucp_findprop(c, &chartype, &script);
816     switch(code[1])
817 nigel 77 {
818 nigel 87 case PT_ANY:
819     OK = TRUE;
820     break;
821    
822     case PT_LAMP:
823     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
824     break;
825    
826     case PT_GC:
827     OK = category == code[2];
828     break;
829    
830     case PT_PC:
831     OK = chartype == code[2];
832     break;
833    
834     case PT_SC:
835     OK = script == code[2];
836     break;
837    
838     /* Should never occur, but keep compilers from grumbling. */
839    
840     default:
841     OK = codevalue != OP_PROP;
842     break;
843 nigel 77 }
844 nigel 87
845     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
846 nigel 77 }
847     break;
848     #endif
849    
850    
851    
852     /* ========================================================================== */
853     /* These opcodes likewise inspect the subject character, but have an
854     argument that is not a data character. It is one of these opcodes:
855     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
856     OP_NOT_WORDCHAR. The value is loaded into d. */
857    
858     case OP_TYPEPLUS:
859     case OP_TYPEMINPLUS:
860 nigel 93 case OP_TYPEPOSPLUS:
861 nigel 77 count = current_state->count; /* Already matched */
862     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
863     if (clen > 0)
864     {
865     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
866     (c < 256 &&
867 nigel 91 (d != OP_ANY ||
868     (ims & PCRE_DOTALL) != 0 ||
869     !IS_NEWLINE(ptr)
870     ) &&
871 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
872     {
873 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
874     {
875     active_count--; /* Remove non-match possibility */
876     next_active_state--;
877     }
878 nigel 77 count++;
879     ADD_NEW(state_offset, count);
880     }
881     }
882     break;
883    
884     /*-----------------------------------------------------------------*/
885     case OP_TYPEQUERY:
886     case OP_TYPEMINQUERY:
887 nigel 93 case OP_TYPEPOSQUERY:
888 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
889     if (clen > 0)
890     {
891     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
892     (c < 256 &&
893 nigel 91 (d != OP_ANY ||
894     (ims & PCRE_DOTALL) != 0 ||
895     !IS_NEWLINE(ptr)
896     ) &&
897 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
898     {
899 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
900     {
901     active_count--; /* Remove non-match possibility */
902     next_active_state--;
903     }
904 nigel 77 ADD_NEW(state_offset + 2, 0);
905     }
906     }
907     break;
908    
909     /*-----------------------------------------------------------------*/
910     case OP_TYPESTAR:
911     case OP_TYPEMINSTAR:
912 nigel 93 case OP_TYPEPOSSTAR:
913 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
914     if (clen > 0)
915     {
916     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
917     (c < 256 &&
918 nigel 91 (d != OP_ANY ||
919     (ims & PCRE_DOTALL) != 0 ||
920     !IS_NEWLINE(ptr)
921     ) &&
922 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
923     {
924 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
925     {
926     active_count--; /* Remove non-match possibility */
927     next_active_state--;
928     }
929 nigel 77 ADD_NEW(state_offset, 0);
930     }
931     }
932     break;
933    
934     /*-----------------------------------------------------------------*/
935     case OP_TYPEEXACT:
936 nigel 93 count = current_state->count; /* Number already matched */
937     if (clen > 0)
938     {
939     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
940     (c < 256 &&
941     (d != OP_ANY ||
942     (ims & PCRE_DOTALL) != 0 ||
943     !IS_NEWLINE(ptr)
944     ) &&
945     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
946     {
947     if (++count >= GET2(code, 1))
948     { ADD_NEW(state_offset + 4, 0); }
949     else
950     { ADD_NEW(state_offset, count); }
951     }
952     }
953     break;
954    
955     /*-----------------------------------------------------------------*/
956 nigel 77 case OP_TYPEUPTO:
957     case OP_TYPEMINUPTO:
958 nigel 93 case OP_TYPEPOSUPTO:
959     ADD_ACTIVE(state_offset + 4, 0);
960 nigel 77 count = current_state->count; /* Number already matched */
961     if (clen > 0)
962     {
963     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
964     (c < 256 &&
965 nigel 91 (d != OP_ANY ||
966     (ims & PCRE_DOTALL) != 0 ||
967     !IS_NEWLINE(ptr)
968     ) &&
969 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
970     {
971 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
972     {
973     active_count--; /* Remove non-match possibility */
974     next_active_state--;
975     }
976 nigel 77 if (++count >= GET2(code, 1))
977     { ADD_NEW(state_offset + 4, 0); }
978     else
979     { ADD_NEW(state_offset, count); }
980     }
981     }
982     break;
983    
984     /* ========================================================================== */
985     /* These are virtual opcodes that are used when something like
986 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its
987     argument. It keeps the code above fast for the other cases. The argument
988     is in the d variable. */
989 nigel 77
990 ph10 151 #ifdef SUPPORT_UCP
991 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
992     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
993 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
994 nigel 77 count = current_state->count; /* Already matched */
995 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
996 nigel 77 if (clen > 0)
997     {
998 nigel 87 BOOL OK;
999     int category = _pcre_ucp_findprop(c, &chartype, &script);
1000     switch(code[2])
1001     {
1002     case PT_ANY:
1003     OK = TRUE;
1004     break;
1005    
1006     case PT_LAMP:
1007     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1008     break;
1009    
1010     case PT_GC:
1011     OK = category == code[3];
1012     break;
1013    
1014     case PT_PC:
1015     OK = chartype == code[3];
1016     break;
1017    
1018     case PT_SC:
1019     OK = script == code[3];
1020     break;
1021    
1022     /* Should never occur, but keep compilers from grumbling. */
1023    
1024     default:
1025     OK = codevalue != OP_PROP;
1026     break;
1027     }
1028    
1029 nigel 93 if (OK == (d == OP_PROP))
1030     {
1031     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1032     {
1033     active_count--; /* Remove non-match possibility */
1034     next_active_state--;
1035     }
1036     count++;
1037     ADD_NEW(state_offset, count);
1038     }
1039 nigel 77 }
1040     break;
1041    
1042     /*-----------------------------------------------------------------*/
1043     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1044     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1045 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1046 nigel 77 count = current_state->count; /* Already matched */
1047     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1048 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1049 nigel 77 {
1050     const uschar *nptr = ptr + clen;
1051     int ncount = 0;
1052 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1053     {
1054     active_count--; /* Remove non-match possibility */
1055     next_active_state--;
1056     }
1057 nigel 77 while (nptr < end_subject)
1058     {
1059     int nd;
1060     int ndlen = 1;
1061     GETCHARLEN(nd, nptr, ndlen);
1062 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1063 nigel 77 ncount++;
1064     nptr += ndlen;
1065     }
1066     count++;
1067     ADD_NEW_DATA(-state_offset, count, ncount);
1068     }
1069     break;
1070 ph10 151 #endif
1071 nigel 77
1072     /*-----------------------------------------------------------------*/
1073 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1074     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1075     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1076     count = current_state->count; /* Already matched */
1077     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1078     if (clen > 0)
1079     {
1080     int ncount = 0;
1081     switch (c)
1082     {
1083     case 0x000d:
1084     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1085     /* Fall through */
1086     case 0x000a:
1087     case 0x000b:
1088     case 0x000c:
1089     case 0x0085:
1090     case 0x2028:
1091     case 0x2029:
1092     if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1093     {
1094     active_count--; /* Remove non-match possibility */
1095     next_active_state--;
1096     }
1097     count++;
1098     ADD_NEW_DATA(-state_offset, count, ncount);
1099     break;
1100     default:
1101     break;
1102     }
1103     }
1104     break;
1105    
1106     /*-----------------------------------------------------------------*/
1107 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1108     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1109     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1110     count = current_state->count; /* Already matched */
1111     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1112     if (clen > 0)
1113     {
1114 ph10 182 BOOL OK;
1115 ph10 178 switch (c)
1116     {
1117     case 0x000a:
1118     case 0x000b:
1119     case 0x000c:
1120     case 0x000d:
1121     case 0x0085:
1122     case 0x2028:
1123     case 0x2029:
1124     OK = TRUE;
1125 ph10 182 break;
1126 ph10 178
1127     default:
1128     OK = FALSE;
1129 ph10 182 break;
1130 ph10 178 }
1131    
1132     if (OK == (d == OP_VSPACE))
1133 ph10 182 {
1134 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1135     {
1136     active_count--; /* Remove non-match possibility */
1137     next_active_state--;
1138     }
1139     count++;
1140     ADD_NEW_DATA(-state_offset, count, 0);
1141     }
1142     }
1143     break;
1144    
1145     /*-----------------------------------------------------------------*/
1146     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1147     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1148     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1149     count = current_state->count; /* Already matched */
1150     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1151     if (clen > 0)
1152     {
1153 ph10 182 BOOL OK;
1154 ph10 178 switch (c)
1155     {
1156     case 0x09: /* HT */
1157     case 0x20: /* SPACE */
1158     case 0xa0: /* NBSP */
1159     case 0x1680: /* OGHAM SPACE MARK */
1160     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1161     case 0x2000: /* EN QUAD */
1162     case 0x2001: /* EM QUAD */
1163     case 0x2002: /* EN SPACE */
1164     case 0x2003: /* EM SPACE */
1165     case 0x2004: /* THREE-PER-EM SPACE */
1166     case 0x2005: /* FOUR-PER-EM SPACE */
1167     case 0x2006: /* SIX-PER-EM SPACE */
1168     case 0x2007: /* FIGURE SPACE */
1169     case 0x2008: /* PUNCTUATION SPACE */
1170     case 0x2009: /* THIN SPACE */
1171     case 0x200A: /* HAIR SPACE */
1172     case 0x202f: /* NARROW NO-BREAK SPACE */
1173     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1174     case 0x3000: /* IDEOGRAPHIC SPACE */
1175     OK = TRUE;
1176     break;
1177 ph10 182
1178 ph10 178 default:
1179     OK = FALSE;
1180     break;
1181     }
1182 ph10 182
1183 ph10 178 if (OK == (d == OP_HSPACE))
1184 ph10 182 {
1185 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1186     {
1187     active_count--; /* Remove non-match possibility */
1188     next_active_state--;
1189     }
1190     count++;
1191     ADD_NEW_DATA(-state_offset, count, 0);
1192     }
1193     }
1194     break;
1195    
1196     /*-----------------------------------------------------------------*/
1197 ph10 151 #ifdef SUPPORT_UCP
1198 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1199     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1200 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1201 nigel 87 count = 4;
1202 nigel 77 goto QS1;
1203    
1204     case OP_PROP_EXTRA + OP_TYPESTAR:
1205     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1206 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1207 nigel 77 count = 0;
1208    
1209     QS1:
1210    
1211 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1212 nigel 77 if (clen > 0)
1213     {
1214 nigel 87 BOOL OK;
1215     int category = _pcre_ucp_findprop(c, &chartype, &script);
1216     switch(code[2])
1217     {
1218     case PT_ANY:
1219     OK = TRUE;
1220     break;
1221    
1222     case PT_LAMP:
1223     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1224     break;
1225    
1226     case PT_GC:
1227     OK = category == code[3];
1228     break;
1229    
1230     case PT_PC:
1231     OK = chartype == code[3];
1232     break;
1233    
1234     case PT_SC:
1235     OK = script == code[3];
1236     break;
1237    
1238     /* Should never occur, but keep compilers from grumbling. */
1239    
1240     default:
1241     OK = codevalue != OP_PROP;
1242     break;
1243     }
1244    
1245 nigel 93 if (OK == (d == OP_PROP))
1246     {
1247     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1248     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1249     {
1250     active_count--; /* Remove non-match possibility */
1251     next_active_state--;
1252     }
1253     ADD_NEW(state_offset + count, 0);
1254     }
1255 nigel 77 }
1256     break;
1257    
1258     /*-----------------------------------------------------------------*/
1259     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1260     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1261 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1262 nigel 77 count = 2;
1263     goto QS2;
1264    
1265     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1266     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1267 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1268 nigel 77 count = 0;
1269    
1270     QS2:
1271    
1272     ADD_ACTIVE(state_offset + 2, 0);
1273 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1274 nigel 77 {
1275     const uschar *nptr = ptr + clen;
1276     int ncount = 0;
1277 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1278     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1279     {
1280     active_count--; /* Remove non-match possibility */
1281     next_active_state--;
1282     }
1283 nigel 77 while (nptr < end_subject)
1284     {
1285     int nd;
1286     int ndlen = 1;
1287     GETCHARLEN(nd, nptr, ndlen);
1288 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1289 nigel 77 ncount++;
1290     nptr += ndlen;
1291     }
1292     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1293     }
1294     break;
1295 ph10 151 #endif
1296 nigel 77
1297     /*-----------------------------------------------------------------*/
1298 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1299     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1300     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1301     count = 2;
1302     goto QS3;
1303    
1304     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1305     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1306     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1307     count = 0;
1308    
1309     QS3:
1310     ADD_ACTIVE(state_offset + 2, 0);
1311     if (clen > 0)
1312     {
1313     int ncount = 0;
1314     switch (c)
1315     {
1316     case 0x000d:
1317     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1318     /* Fall through */
1319     case 0x000a:
1320     case 0x000b:
1321     case 0x000c:
1322     case 0x0085:
1323     case 0x2028:
1324     case 0x2029:
1325     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1326     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1327     {
1328     active_count--; /* Remove non-match possibility */
1329     next_active_state--;
1330     }
1331     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1332     break;
1333     default:
1334     break;
1335     }
1336     }
1337     break;
1338    
1339     /*-----------------------------------------------------------------*/
1340 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1341     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1342     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1343     count = 2;
1344     goto QS4;
1345    
1346     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1347     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1348     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1349     count = 0;
1350    
1351     QS4:
1352     ADD_ACTIVE(state_offset + 2, 0);
1353     if (clen > 0)
1354     {
1355 ph10 182 BOOL OK;
1356 ph10 178 switch (c)
1357     {
1358     case 0x000a:
1359     case 0x000b:
1360     case 0x000c:
1361     case 0x000d:
1362     case 0x0085:
1363     case 0x2028:
1364     case 0x2029:
1365     OK = TRUE;
1366     break;
1367 ph10 182
1368 ph10 178 default:
1369     OK = FALSE;
1370     break;
1371     }
1372     if (OK == (d == OP_VSPACE))
1373 ph10 182 {
1374 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1375     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1376     {
1377     active_count--; /* Remove non-match possibility */
1378     next_active_state--;
1379     }
1380     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1381     }
1382     }
1383     break;
1384    
1385     /*-----------------------------------------------------------------*/
1386     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1387     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1388     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1389     count = 2;
1390     goto QS5;
1391    
1392     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1393     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1394     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1395     count = 0;
1396    
1397     QS5:
1398     ADD_ACTIVE(state_offset + 2, 0);
1399     if (clen > 0)
1400     {
1401 ph10 182 BOOL OK;
1402 ph10 178 switch (c)
1403     {
1404     case 0x09: /* HT */
1405     case 0x20: /* SPACE */
1406     case 0xa0: /* NBSP */
1407     case 0x1680: /* OGHAM SPACE MARK */
1408     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1409     case 0x2000: /* EN QUAD */
1410     case 0x2001: /* EM QUAD */
1411     case 0x2002: /* EN SPACE */
1412     case 0x2003: /* EM SPACE */
1413     case 0x2004: /* THREE-PER-EM SPACE */
1414     case 0x2005: /* FOUR-PER-EM SPACE */
1415     case 0x2006: /* SIX-PER-EM SPACE */
1416     case 0x2007: /* FIGURE SPACE */
1417     case 0x2008: /* PUNCTUATION SPACE */
1418     case 0x2009: /* THIN SPACE */
1419     case 0x200A: /* HAIR SPACE */
1420     case 0x202f: /* NARROW NO-BREAK SPACE */
1421     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1422     case 0x3000: /* IDEOGRAPHIC SPACE */
1423     OK = TRUE;
1424     break;
1425 ph10 182
1426 ph10 178 default:
1427     OK = FALSE;
1428     break;
1429     }
1430 ph10 182
1431 ph10 178 if (OK == (d == OP_HSPACE))
1432 ph10 182 {
1433 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1434     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1435     {
1436     active_count--; /* Remove non-match possibility */
1437     next_active_state--;
1438     }
1439     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1440     }
1441     }
1442     break;
1443    
1444     /*-----------------------------------------------------------------*/
1445 ph10 151 #ifdef SUPPORT_UCP
1446 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1447     case OP_PROP_EXTRA + OP_TYPEUPTO:
1448     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1449 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1450 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1451 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1452 nigel 77 count = current_state->count; /* Number already matched */
1453     if (clen > 0)
1454     {
1455 nigel 87 BOOL OK;
1456     int category = _pcre_ucp_findprop(c, &chartype, &script);
1457     switch(code[4])
1458 nigel 77 {
1459 nigel 87 case PT_ANY:
1460     OK = TRUE;
1461     break;
1462    
1463     case PT_LAMP:
1464     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1465     break;
1466    
1467     case PT_GC:
1468     OK = category == code[5];
1469     break;
1470    
1471     case PT_PC:
1472     OK = chartype == code[5];
1473     break;
1474    
1475     case PT_SC:
1476     OK = script == code[5];
1477     break;
1478    
1479     /* Should never occur, but keep compilers from grumbling. */
1480    
1481     default:
1482     OK = codevalue != OP_PROP;
1483     break;
1484     }
1485    
1486     if (OK == (d == OP_PROP))
1487     {
1488 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1489     {
1490     active_count--; /* Remove non-match possibility */
1491     next_active_state--;
1492     }
1493 nigel 77 if (++count >= GET2(code, 1))
1494 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1495 nigel 77 else
1496     { ADD_NEW(state_offset, count); }
1497     }
1498     }
1499     break;
1500    
1501     /*-----------------------------------------------------------------*/
1502     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1503     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1504     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1505 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1506 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1507     { ADD_ACTIVE(state_offset + 4, 0); }
1508     count = current_state->count; /* Number already matched */
1509 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1510 nigel 77 {
1511     const uschar *nptr = ptr + clen;
1512     int ncount = 0;
1513 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1514     {
1515     active_count--; /* Remove non-match possibility */
1516     next_active_state--;
1517     }
1518 nigel 77 while (nptr < end_subject)
1519     {
1520     int nd;
1521     int ndlen = 1;
1522     GETCHARLEN(nd, nptr, ndlen);
1523 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1524 nigel 77 ncount++;
1525     nptr += ndlen;
1526     }
1527     if (++count >= GET2(code, 1))
1528     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1529     else
1530     { ADD_NEW_DATA(-state_offset, count, ncount); }
1531     }
1532     break;
1533 ph10 151 #endif
1534 nigel 77
1535 nigel 93 /*-----------------------------------------------------------------*/
1536     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1537     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1538     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1539     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1540     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1541     { ADD_ACTIVE(state_offset + 4, 0); }
1542     count = current_state->count; /* Number already matched */
1543     if (clen > 0)
1544     {
1545     int ncount = 0;
1546     switch (c)
1547     {
1548     case 0x000d:
1549     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1550     /* Fall through */
1551     case 0x000a:
1552     case 0x000b:
1553     case 0x000c:
1554     case 0x0085:
1555     case 0x2028:
1556     case 0x2029:
1557     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1558     {
1559     active_count--; /* Remove non-match possibility */
1560     next_active_state--;
1561     }
1562     if (++count >= GET2(code, 1))
1563     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1564     else
1565     { ADD_NEW_DATA(-state_offset, count, ncount); }
1566     break;
1567     default:
1568     break;
1569     }
1570     }
1571     break;
1572    
1573 ph10 178 /*-----------------------------------------------------------------*/
1574     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1575     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1576     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1577     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1578     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1579     { ADD_ACTIVE(state_offset + 4, 0); }
1580     count = current_state->count; /* Number already matched */
1581     if (clen > 0)
1582     {
1583 ph10 182 BOOL OK;
1584 ph10 178 switch (c)
1585     {
1586     case 0x000a:
1587     case 0x000b:
1588     case 0x000c:
1589     case 0x000d:
1590     case 0x0085:
1591     case 0x2028:
1592     case 0x2029:
1593     OK = TRUE;
1594     break;
1595 ph10 182
1596 ph10 178 default:
1597     OK = FALSE;
1598     }
1599 ph10 182
1600 ph10 178 if (OK == (d == OP_VSPACE))
1601 ph10 182 {
1602 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1603     {
1604     active_count--; /* Remove non-match possibility */
1605     next_active_state--;
1606     }
1607     if (++count >= GET2(code, 1))
1608     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1609     else
1610     { ADD_NEW_DATA(-state_offset, count, 0); }
1611     }
1612     }
1613     break;
1614    
1615     /*-----------------------------------------------------------------*/
1616     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1617     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1618     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1619     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1620     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1621     { ADD_ACTIVE(state_offset + 4, 0); }
1622     count = current_state->count; /* Number already matched */
1623     if (clen > 0)
1624     {
1625 ph10 182 BOOL OK;
1626 ph10 178 switch (c)
1627     {
1628     case 0x09: /* HT */
1629     case 0x20: /* SPACE */
1630     case 0xa0: /* NBSP */
1631     case 0x1680: /* OGHAM SPACE MARK */
1632     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1633     case 0x2000: /* EN QUAD */
1634     case 0x2001: /* EM QUAD */
1635     case 0x2002: /* EN SPACE */
1636     case 0x2003: /* EM SPACE */
1637     case 0x2004: /* THREE-PER-EM SPACE */
1638     case 0x2005: /* FOUR-PER-EM SPACE */
1639     case 0x2006: /* SIX-PER-EM SPACE */
1640     case 0x2007: /* FIGURE SPACE */
1641     case 0x2008: /* PUNCTUATION SPACE */
1642     case 0x2009: /* THIN SPACE */
1643     case 0x200A: /* HAIR SPACE */
1644     case 0x202f: /* NARROW NO-BREAK SPACE */
1645     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1646     case 0x3000: /* IDEOGRAPHIC SPACE */
1647     OK = TRUE;
1648     break;
1649 ph10 182
1650 ph10 178 default:
1651     OK = FALSE;
1652     break;
1653     }
1654 ph10 182
1655 ph10 178 if (OK == (d == OP_HSPACE))
1656 ph10 182 {
1657 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1658     {
1659     active_count--; /* Remove non-match possibility */
1660     next_active_state--;
1661     }
1662     if (++count >= GET2(code, 1))
1663     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1664     else
1665     { ADD_NEW_DATA(-state_offset, count, 0); }
1666     }
1667     }
1668     break;
1669    
1670 nigel 77 /* ========================================================================== */
1671     /* These opcodes are followed by a character that is usually compared
1672     to the current subject character; it is loaded into d. We still get
1673     here even if there is no subject character, because in some cases zero
1674     repetitions are permitted. */
1675    
1676     /*-----------------------------------------------------------------*/
1677     case OP_CHAR:
1678     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1679     break;
1680    
1681     /*-----------------------------------------------------------------*/
1682     case OP_CHARNC:
1683     if (clen == 0) break;
1684    
1685     #ifdef SUPPORT_UTF8
1686     if (utf8)
1687     {
1688     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1689     {
1690 nigel 93 unsigned int othercase;
1691 nigel 77 if (c < 128) othercase = fcc[c]; else
1692    
1693     /* If we have Unicode property support, we can use it to test the
1694 nigel 87 other case of the character. */
1695 nigel 77
1696     #ifdef SUPPORT_UCP
1697 nigel 87 othercase = _pcre_ucp_othercase(c);
1698     #else
1699 nigel 93 othercase = NOTACHAR;
1700 nigel 77 #endif
1701    
1702     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1703     }
1704     }
1705     else
1706     #endif /* SUPPORT_UTF8 */
1707    
1708     /* Non-UTF-8 mode */
1709     {
1710     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1711     }
1712     break;
1713    
1714    
1715     #ifdef SUPPORT_UCP
1716     /*-----------------------------------------------------------------*/
1717     /* This is a tricky one because it can match more than one character.
1718     Find out how many characters to skip, and then set up a negative state
1719     to wait for them to pass before continuing. */
1720    
1721     case OP_EXTUNI:
1722 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1723 nigel 77 {
1724     const uschar *nptr = ptr + clen;
1725     int ncount = 0;
1726     while (nptr < end_subject)
1727     {
1728     int nclen = 1;
1729     GETCHARLEN(c, nptr, nclen);
1730 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1731 nigel 77 ncount++;
1732     nptr += nclen;
1733     }
1734     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1735     }
1736     break;
1737     #endif
1738    
1739     /*-----------------------------------------------------------------*/
1740 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1741     character (when CR is followed by LF). In this case, set up a negative
1742     state to wait for one character to pass before continuing. */
1743    
1744     case OP_ANYNL:
1745     if (clen > 0) switch(c)
1746     {
1747     case 0x000a:
1748     case 0x000b:
1749     case 0x000c:
1750     case 0x0085:
1751     case 0x2028:
1752     case 0x2029:
1753     ADD_NEW(state_offset + 1, 0);
1754     break;
1755     case 0x000d:
1756     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1757     {
1758     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1759     }
1760     else
1761     {
1762     ADD_NEW(state_offset + 1, 0);
1763     }
1764     break;
1765     }
1766     break;
1767    
1768     /*-----------------------------------------------------------------*/
1769 ph10 178 case OP_NOT_VSPACE:
1770     if (clen > 0) switch(c)
1771     {
1772     case 0x000a:
1773     case 0x000b:
1774     case 0x000c:
1775     case 0x000d:
1776     case 0x0085:
1777     case 0x2028:
1778     case 0x2029:
1779     break;
1780 ph10 182
1781     default:
1782 ph10 178 ADD_NEW(state_offset + 1, 0);
1783     break;
1784     }
1785     break;
1786    
1787     /*-----------------------------------------------------------------*/
1788     case OP_VSPACE:
1789     if (clen > 0) switch(c)
1790     {
1791     case 0x000a:
1792     case 0x000b:
1793     case 0x000c:
1794     case 0x000d:
1795     case 0x0085:
1796     case 0x2028:
1797     case 0x2029:
1798     ADD_NEW(state_offset + 1, 0);
1799     break;
1800 ph10 182
1801 ph10 178 default: break;
1802     }
1803     break;
1804    
1805     /*-----------------------------------------------------------------*/
1806     case OP_NOT_HSPACE:
1807     if (clen > 0) switch(c)
1808     {
1809     case 0x09: /* HT */
1810     case 0x20: /* SPACE */
1811     case 0xa0: /* NBSP */
1812     case 0x1680: /* OGHAM SPACE MARK */
1813     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1814     case 0x2000: /* EN QUAD */
1815     case 0x2001: /* EM QUAD */
1816     case 0x2002: /* EN SPACE */
1817     case 0x2003: /* EM SPACE */
1818     case 0x2004: /* THREE-PER-EM SPACE */
1819     case 0x2005: /* FOUR-PER-EM SPACE */
1820     case 0x2006: /* SIX-PER-EM SPACE */
1821     case 0x2007: /* FIGURE SPACE */
1822     case 0x2008: /* PUNCTUATION SPACE */
1823     case 0x2009: /* THIN SPACE */
1824     case 0x200A: /* HAIR SPACE */
1825     case 0x202f: /* NARROW NO-BREAK SPACE */
1826     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1827     case 0x3000: /* IDEOGRAPHIC SPACE */
1828     break;
1829 ph10 182
1830     default:
1831 ph10 178 ADD_NEW(state_offset + 1, 0);
1832     break;
1833     }
1834     break;
1835    
1836     /*-----------------------------------------------------------------*/
1837     case OP_HSPACE:
1838     if (clen > 0) switch(c)
1839     {
1840     case 0x09: /* HT */
1841     case 0x20: /* SPACE */
1842     case 0xa0: /* NBSP */
1843     case 0x1680: /* OGHAM SPACE MARK */
1844     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1845     case 0x2000: /* EN QUAD */
1846     case 0x2001: /* EM QUAD */
1847     case 0x2002: /* EN SPACE */
1848     case 0x2003: /* EM SPACE */
1849     case 0x2004: /* THREE-PER-EM SPACE */
1850     case 0x2005: /* FOUR-PER-EM SPACE */
1851     case 0x2006: /* SIX-PER-EM SPACE */
1852     case 0x2007: /* FIGURE SPACE */
1853     case 0x2008: /* PUNCTUATION SPACE */
1854     case 0x2009: /* THIN SPACE */
1855     case 0x200A: /* HAIR SPACE */
1856     case 0x202f: /* NARROW NO-BREAK SPACE */
1857     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1858     case 0x3000: /* IDEOGRAPHIC SPACE */
1859     ADD_NEW(state_offset + 1, 0);
1860     break;
1861     }
1862     break;
1863    
1864     /*-----------------------------------------------------------------*/
1865 nigel 77 /* Match a negated single character. This is only used for one-byte
1866     characters, that is, we know that d < 256. The character we are
1867     checking (c) can be multibyte. */
1868    
1869     case OP_NOT:
1870     if (clen > 0)
1871     {
1872 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1873 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1874     }
1875     break;
1876    
1877     /*-----------------------------------------------------------------*/
1878     case OP_PLUS:
1879     case OP_MINPLUS:
1880 nigel 93 case OP_POSPLUS:
1881 nigel 77 case OP_NOTPLUS:
1882     case OP_NOTMINPLUS:
1883 nigel 93 case OP_NOTPOSPLUS:
1884 nigel 77 count = current_state->count; /* Already matched */
1885     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1886     if (clen > 0)
1887     {
1888 nigel 93 unsigned int otherd = NOTACHAR;
1889 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1890     {
1891     #ifdef SUPPORT_UTF8
1892 nigel 87 if (utf8 && d >= 128)
1893 nigel 77 {
1894     #ifdef SUPPORT_UCP
1895 nigel 87 otherd = _pcre_ucp_othercase(d);
1896 nigel 77 #endif /* SUPPORT_UCP */
1897     }
1898     else
1899     #endif /* SUPPORT_UTF8 */
1900     otherd = fcc[d];
1901     }
1902     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1903 nigel 93 {
1904     if (count > 0 &&
1905     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1906     {
1907     active_count--; /* Remove non-match possibility */
1908     next_active_state--;
1909     }
1910     count++;
1911     ADD_NEW(state_offset, count);
1912     }
1913 nigel 77 }
1914     break;
1915    
1916     /*-----------------------------------------------------------------*/
1917     case OP_QUERY:
1918     case OP_MINQUERY:
1919 nigel 93 case OP_POSQUERY:
1920 nigel 77 case OP_NOTQUERY:
1921     case OP_NOTMINQUERY:
1922 nigel 93 case OP_NOTPOSQUERY:
1923 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1924     if (clen > 0)
1925     {
1926 nigel 93 unsigned int otherd = NOTACHAR;
1927 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1928 nigel 77 {
1929     #ifdef SUPPORT_UTF8
1930 nigel 87 if (utf8 && d >= 128)
1931 nigel 77 {
1932     #ifdef SUPPORT_UCP
1933 nigel 87 otherd = _pcre_ucp_othercase(d);
1934 nigel 77 #endif /* SUPPORT_UCP */
1935     }
1936     else
1937     #endif /* SUPPORT_UTF8 */
1938     otherd = fcc[d];
1939     }
1940     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1941 nigel 93 {
1942     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1943     {
1944     active_count--; /* Remove non-match possibility */
1945     next_active_state--;
1946     }
1947     ADD_NEW(state_offset + dlen + 1, 0);
1948     }
1949 nigel 77 }
1950     break;
1951    
1952     /*-----------------------------------------------------------------*/
1953     case OP_STAR:
1954     case OP_MINSTAR:
1955 nigel 93 case OP_POSSTAR:
1956 nigel 77 case OP_NOTSTAR:
1957     case OP_NOTMINSTAR:
1958 nigel 93 case OP_NOTPOSSTAR:
1959 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1960     if (clen > 0)
1961     {
1962 nigel 93 unsigned int otherd = NOTACHAR;
1963 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1964 nigel 77 {
1965     #ifdef SUPPORT_UTF8
1966 nigel 87 if (utf8 && d >= 128)
1967 nigel 77 {
1968     #ifdef SUPPORT_UCP
1969 nigel 87 otherd = _pcre_ucp_othercase(d);
1970 nigel 77 #endif /* SUPPORT_UCP */
1971     }
1972     else
1973     #endif /* SUPPORT_UTF8 */
1974     otherd = fcc[d];
1975     }
1976     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1977 nigel 93 {
1978     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1979     {
1980     active_count--; /* Remove non-match possibility */
1981     next_active_state--;
1982     }
1983     ADD_NEW(state_offset, 0);
1984     }
1985 nigel 77 }
1986     break;
1987    
1988     /*-----------------------------------------------------------------*/
1989     case OP_EXACT:
1990 nigel 93 case OP_NOTEXACT:
1991     count = current_state->count; /* Number already matched */
1992     if (clen > 0)
1993     {
1994     unsigned int otherd = NOTACHAR;
1995     if ((ims & PCRE_CASELESS) != 0)
1996     {
1997     #ifdef SUPPORT_UTF8
1998     if (utf8 && d >= 128)
1999     {
2000     #ifdef SUPPORT_UCP
2001     otherd = _pcre_ucp_othercase(d);
2002     #endif /* SUPPORT_UCP */
2003     }
2004     else
2005     #endif /* SUPPORT_UTF8 */
2006     otherd = fcc[d];
2007     }
2008     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2009     {
2010     if (++count >= GET2(code, 1))
2011     { ADD_NEW(state_offset + dlen + 3, 0); }
2012     else
2013     { ADD_NEW(state_offset, count); }
2014     }
2015     }
2016     break;
2017    
2018     /*-----------------------------------------------------------------*/
2019 nigel 77 case OP_UPTO:
2020     case OP_MINUPTO:
2021 nigel 93 case OP_POSUPTO:
2022 nigel 77 case OP_NOTUPTO:
2023     case OP_NOTMINUPTO:
2024 nigel 93 case OP_NOTPOSUPTO:
2025     ADD_ACTIVE(state_offset + dlen + 3, 0);
2026 nigel 77 count = current_state->count; /* Number already matched */
2027     if (clen > 0)
2028     {
2029 nigel 93 unsigned int otherd = NOTACHAR;
2030 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2031     {
2032     #ifdef SUPPORT_UTF8
2033 nigel 87 if (utf8 && d >= 128)
2034 nigel 77 {
2035     #ifdef SUPPORT_UCP
2036 nigel 87 otherd = _pcre_ucp_othercase(d);
2037 nigel 77 #endif /* SUPPORT_UCP */
2038     }
2039     else
2040     #endif /* SUPPORT_UTF8 */
2041     otherd = fcc[d];
2042     }
2043     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2044     {
2045 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2046     {
2047     active_count--; /* Remove non-match possibility */
2048     next_active_state--;
2049     }
2050 nigel 77 if (++count >= GET2(code, 1))
2051     { ADD_NEW(state_offset + dlen + 3, 0); }
2052     else
2053     { ADD_NEW(state_offset, count); }
2054     }
2055     }
2056     break;
2057    
2058    
2059     /* ========================================================================== */
2060     /* These are the class-handling opcodes */
2061    
2062     case OP_CLASS:
2063     case OP_NCLASS:
2064     case OP_XCLASS:
2065     {
2066     BOOL isinclass = FALSE;
2067     int next_state_offset;
2068     const uschar *ecode;
2069    
2070     /* For a simple class, there is always just a 32-byte table, and we
2071     can set isinclass from it. */
2072    
2073     if (codevalue != OP_XCLASS)
2074     {
2075     ecode = code + 33;
2076     if (clen > 0)
2077     {
2078     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2079     ((code[1 + c/8] & (1 << (c&7))) != 0);
2080     }
2081     }
2082    
2083     /* An extended class may have a table or a list of single characters,
2084     ranges, or both, and it may be positive or negative. There's a
2085     function that sorts all this out. */
2086    
2087     else
2088     {
2089     ecode = code + GET(code, 1);
2090     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2091     }
2092    
2093     /* At this point, isinclass is set for all kinds of class, and ecode
2094     points to the byte after the end of the class. If there is a
2095     quantifier, this is where it will be. */
2096    
2097     next_state_offset = ecode - start_code;
2098    
2099     switch (*ecode)
2100     {
2101     case OP_CRSTAR:
2102     case OP_CRMINSTAR:
2103     ADD_ACTIVE(next_state_offset + 1, 0);
2104     if (isinclass) { ADD_NEW(state_offset, 0); }
2105     break;
2106    
2107     case OP_CRPLUS:
2108     case OP_CRMINPLUS:
2109     count = current_state->count; /* Already matched */
2110     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2111     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2112     break;
2113    
2114     case OP_CRQUERY:
2115     case OP_CRMINQUERY:
2116     ADD_ACTIVE(next_state_offset + 1, 0);
2117     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2118     break;
2119    
2120     case OP_CRRANGE:
2121     case OP_CRMINRANGE:
2122     count = current_state->count; /* Already matched */
2123     if (count >= GET2(ecode, 1))
2124     { ADD_ACTIVE(next_state_offset + 5, 0); }
2125     if (isinclass)
2126     {
2127 nigel 91 int max = GET2(ecode, 3);
2128     if (++count >= max && max != 0) /* Max 0 => no limit */
2129 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2130     else
2131     { ADD_NEW(state_offset, count); }
2132     }
2133     break;
2134    
2135     default:
2136     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2137     break;
2138     }
2139     }
2140     break;
2141    
2142     /* ========================================================================== */
2143     /* These are the opcodes for fancy brackets of various kinds. We have
2144     to use recursion in order to handle them. */
2145    
2146     case OP_ASSERT:
2147     case OP_ASSERT_NOT:
2148     case OP_ASSERTBACK:
2149     case OP_ASSERTBACK_NOT:
2150     {
2151     int rc;
2152     int local_offsets[2];
2153     int local_workspace[1000];
2154     const uschar *endasscode = code + GET(code, 1);
2155    
2156     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2157    
2158     rc = internal_dfa_exec(
2159     md, /* static match data */
2160     code, /* this subexpression's code */
2161     ptr, /* where we currently are */
2162     ptr - start_subject, /* start offset */
2163     local_offsets, /* offset vector */
2164     sizeof(local_offsets)/sizeof(int), /* size of same */
2165     local_workspace, /* workspace vector */
2166     sizeof(local_workspace)/sizeof(int), /* size of same */
2167     ims, /* the current ims flags */
2168     rlevel, /* function recursion level */
2169     recursing); /* pass on regex recursion */
2170    
2171     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2172     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2173     }
2174     break;
2175    
2176     /*-----------------------------------------------------------------*/
2177     case OP_COND:
2178 nigel 93 case OP_SCOND:
2179 nigel 77 {
2180     int local_offsets[1000];
2181     int local_workspace[1000];
2182     int condcode = code[LINK_SIZE+1];
2183    
2184 nigel 93 /* Back reference conditions are not supported */
2185 nigel 77
2186 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2187    
2188     /* The DEFINE condition is always false */
2189    
2190     if (condcode == OP_DEF)
2191 nigel 77 {
2192 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2193     }
2194    
2195     /* The only supported version of OP_RREF is for the value RREF_ANY,
2196     which means "test if in any recursion". We can't test for specifically
2197     recursed groups. */
2198    
2199     else if (condcode == OP_RREF)
2200     {
2201 nigel 77 int value = GET2(code, LINK_SIZE+2);
2202 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2203 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2204     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2205     }
2206    
2207     /* Otherwise, the condition is an assertion */
2208    
2209     else
2210     {
2211     int rc;
2212     const uschar *asscode = code + LINK_SIZE + 1;
2213     const uschar *endasscode = asscode + GET(asscode, 1);
2214    
2215     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2216    
2217     rc = internal_dfa_exec(
2218     md, /* fixed match data */
2219     asscode, /* this subexpression's code */
2220     ptr, /* where we currently are */
2221     ptr - start_subject, /* start offset */
2222     local_offsets, /* offset vector */
2223     sizeof(local_offsets)/sizeof(int), /* size of same */
2224     local_workspace, /* workspace vector */
2225     sizeof(local_workspace)/sizeof(int), /* size of same */
2226     ims, /* the current ims flags */
2227     rlevel, /* function recursion level */
2228     recursing); /* pass on regex recursion */
2229    
2230     if ((rc >= 0) ==
2231     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2232     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2233     else
2234     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2235     }
2236     }
2237     break;
2238    
2239     /*-----------------------------------------------------------------*/
2240     case OP_RECURSE:
2241     {
2242     int local_offsets[1000];
2243     int local_workspace[1000];
2244     int rc;
2245    
2246     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2247     recursing + 1));
2248    
2249     rc = internal_dfa_exec(
2250     md, /* fixed match data */
2251     start_code + GET(code, 1), /* this subexpression's code */
2252     ptr, /* where we currently are */
2253     ptr - start_subject, /* start offset */
2254     local_offsets, /* offset vector */
2255     sizeof(local_offsets)/sizeof(int), /* size of same */
2256     local_workspace, /* workspace vector */
2257     sizeof(local_workspace)/sizeof(int), /* size of same */
2258     ims, /* the current ims flags */
2259     rlevel, /* function recursion level */
2260     recursing + 1); /* regex recurse level */
2261    
2262     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2263     recursing + 1, rc));
2264    
2265     /* Ran out of internal offsets */
2266    
2267     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2268    
2269     /* For each successful matched substring, set up the next state with a
2270     count of characters to skip before trying it. Note that the count is in
2271     characters, not bytes. */
2272    
2273     if (rc > 0)
2274     {
2275     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2276     {
2277     const uschar *p = start_subject + local_offsets[rc];
2278     const uschar *pp = start_subject + local_offsets[rc+1];
2279     int charcount = local_offsets[rc+1] - local_offsets[rc];
2280     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2281     if (charcount > 0)
2282     {
2283     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2284     }
2285     else
2286     {
2287     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2288     }
2289     }
2290     }
2291     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2292     }
2293     break;
2294    
2295     /*-----------------------------------------------------------------*/
2296     case OP_ONCE:
2297     {
2298     int local_offsets[2];
2299     int local_workspace[1000];
2300    
2301     int rc = internal_dfa_exec(
2302     md, /* fixed match data */
2303     code, /* this subexpression's code */
2304     ptr, /* where we currently are */
2305     ptr - start_subject, /* start offset */
2306     local_offsets, /* offset vector */
2307     sizeof(local_offsets)/sizeof(int), /* size of same */
2308     local_workspace, /* workspace vector */
2309     sizeof(local_workspace)/sizeof(int), /* size of same */
2310     ims, /* the current ims flags */
2311     rlevel, /* function recursion level */
2312     recursing); /* pass on regex recursion */
2313    
2314     if (rc >= 0)
2315     {
2316     const uschar *end_subpattern = code;
2317     int charcount = local_offsets[1] - local_offsets[0];
2318     int next_state_offset, repeat_state_offset;
2319    
2320     do { end_subpattern += GET(end_subpattern, 1); }
2321     while (*end_subpattern == OP_ALT);
2322     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2323    
2324     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2325     arrange for the repeat state also to be added to the relevant list.
2326     Calculate the offset, or set -1 for no repeat. */
2327    
2328     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2329     *end_subpattern == OP_KETRMIN)?
2330     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2331    
2332     /* If we have matched an empty string, add the next state at the
2333     current character pointer. This is important so that the duplicate
2334     checking kicks in, which is what breaks infinite loops that match an
2335     empty string. */
2336    
2337     if (charcount == 0)
2338     {
2339     ADD_ACTIVE(next_state_offset, 0);
2340     }
2341    
2342     /* Optimization: if there are no more active states, and there
2343     are no new states yet set up, then skip over the subject string
2344     right here, to save looping. Otherwise, set up the new state to swing
2345     into action when the end of the substring is reached. */
2346    
2347     else if (i + 1 >= active_count && new_count == 0)
2348     {
2349     ptr += charcount;
2350     clen = 0;
2351     ADD_NEW(next_state_offset, 0);
2352    
2353     /* If we are adding a repeat state at the new character position,
2354     we must fudge things so that it is the only current state.
2355     Otherwise, it might be a duplicate of one we processed before, and
2356     that would cause it to be skipped. */
2357    
2358     if (repeat_state_offset >= 0)
2359     {
2360     next_active_state = active_states;
2361     active_count = 0;
2362     i = -1;
2363     ADD_ACTIVE(repeat_state_offset, 0);
2364     }
2365     }
2366     else
2367     {
2368     const uschar *p = start_subject + local_offsets[0];
2369     const uschar *pp = start_subject + local_offsets[1];
2370     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2371     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2372     if (repeat_state_offset >= 0)
2373     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2374     }
2375    
2376     }
2377     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2378     }
2379     break;
2380    
2381    
2382     /* ========================================================================== */
2383     /* Handle callouts */
2384    
2385     case OP_CALLOUT:
2386     if (pcre_callout != NULL)
2387     {
2388     int rrc;
2389     pcre_callout_block cb;
2390     cb.version = 1; /* Version 1 of the callout block */
2391     cb.callout_number = code[1];
2392     cb.offset_vector = offsets;
2393 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2394 nigel 77 cb.subject_length = end_subject - start_subject;
2395     cb.start_match = current_subject - start_subject;
2396     cb.current_position = ptr - start_subject;
2397     cb.pattern_position = GET(code, 2);
2398     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2399     cb.capture_top = 1;
2400     cb.capture_last = -1;
2401     cb.callout_data = md->callout_data;
2402     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2403     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2404     }
2405     break;
2406    
2407    
2408     /* ========================================================================== */
2409     default: /* Unsupported opcode */
2410     return PCRE_ERROR_DFA_UITEM;
2411     }
2412    
2413     NEXT_ACTIVE_STATE: continue;
2414    
2415     } /* End of loop scanning active states */
2416    
2417     /* We have finished the processing at the current subject character. If no
2418     new states have been set for the next character, we have found all the
2419     matches that we are going to find. If we are at the top level and partial
2420     matching has been requested, check for appropriate conditions. */
2421    
2422     if (new_count <= 0)
2423     {
2424     if (match_count < 0 && /* No matches found */
2425     rlevel == 1 && /* Top level match function */
2426     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2427     ptr >= end_subject && /* Reached end of subject */
2428     ptr > current_subject) /* Matched non-empty string */
2429     {
2430     if (offsetcount >= 2)
2431     {
2432     offsets[0] = current_subject - start_subject;
2433     offsets[1] = end_subject - start_subject;
2434     }
2435     match_count = PCRE_ERROR_PARTIAL;
2436     }
2437    
2438     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2439     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2440     rlevel*2-2, SP));
2441 nigel 91 break; /* In effect, "return", but see the comment below */
2442 nigel 77 }
2443    
2444     /* One or more states are active for the next character. */
2445    
2446     ptr += clen; /* Advance to next subject character */
2447     } /* Loop to move along the subject string */
2448    
2449 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2450     if we use "return" above, we have compiler trouble. Some compilers warn if
2451     there's nothing here because they think the function doesn't return a value. On
2452     the other hand, if we put a dummy statement here, some more clever compilers
2453     complain that it can't be reached. Sigh. */
2454 nigel 77
2455 nigel 91 return match_count;
2456 nigel 77 }
2457    
2458    
2459    
2460    
2461     /*************************************************
2462     * Execute a Regular Expression - DFA engine *
2463     *************************************************/
2464    
2465     /* This external function applies a compiled re to a subject string using a DFA
2466     engine. This function calls the internal function multiple times if the pattern
2467     is not anchored.
2468    
2469     Arguments:
2470     argument_re points to the compiled expression
2471 ph10 97 extra_data points to extra data or is NULL
2472 nigel 77 subject points to the subject string
2473     length length of subject string (may contain binary zeros)
2474     start_offset where to start in the subject string
2475     options option bits
2476     offsets vector of match offsets
2477     offsetcount size of same
2478     workspace workspace vector
2479     wscount size of same
2480    
2481     Returns: > 0 => number of match offset pairs placed in offsets
2482     = 0 => offsets overflowed; longest matches are present
2483     -1 => failed to match
2484     < -1 => some kind of unexpected problem
2485     */
2486    
2487 ph10 145 PCRE_EXP_DEFN int
2488 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2489     const char *subject, int length, int start_offset, int options, int *offsets,
2490     int offsetcount, int *workspace, int wscount)
2491     {
2492     real_pcre *re = (real_pcre *)argument_re;
2493     dfa_match_data match_block;
2494 nigel 91 dfa_match_data *md = &match_block;
2495 nigel 77 BOOL utf8, anchored, startline, firstline;
2496     const uschar *current_subject, *end_subject, *lcc;
2497    
2498     pcre_study_data internal_study;
2499     const pcre_study_data *study = NULL;
2500     real_pcre internal_re;
2501    
2502     const uschar *req_byte_ptr;
2503     const uschar *start_bits = NULL;
2504     BOOL first_byte_caseless = FALSE;
2505     BOOL req_byte_caseless = FALSE;
2506     int first_byte = -1;
2507     int req_byte = -1;
2508     int req_byte2 = -1;
2509 nigel 91 int newline;
2510 nigel 77
2511     /* Plausibility checks */
2512    
2513     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2514     if (re == NULL || subject == NULL || workspace == NULL ||
2515     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2516     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2517     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2518    
2519     /* We need to find the pointer to any study data before we test for byte
2520     flipping, so we scan the extra_data block first. This may set two fields in the
2521     match block, so we must initialize them beforehand. However, the other fields
2522     in the match block must not be set until after the byte flipping. */
2523    
2524 nigel 91 md->tables = re->tables;
2525     md->callout_data = NULL;
2526 nigel 77
2527     if (extra_data != NULL)
2528     {
2529     unsigned int flags = extra_data->flags;
2530     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2531     study = (const pcre_study_data *)extra_data->study_data;
2532     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2533 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2534     return PCRE_ERROR_DFA_UMLIMIT;
2535 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2536 nigel 91 md->callout_data = extra_data->callout_data;
2537 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2538 nigel 91 md->tables = extra_data->tables;
2539 nigel 77 }
2540    
2541     /* Check that the first field in the block is the magic number. If it is not,
2542     test for a regex that was compiled on a host of opposite endianness. If this is
2543     the case, flipped values are put in internal_re and internal_study if there was
2544     study data too. */
2545    
2546     if (re->magic_number != MAGIC_NUMBER)
2547     {
2548     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2549     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2550     if (study != NULL) study = &internal_study;
2551     }
2552    
2553     /* Set some local values */
2554    
2555     current_subject = (const unsigned char *)subject + start_offset;
2556     end_subject = (const unsigned char *)subject + length;
2557     req_byte_ptr = current_subject - 1;
2558    
2559 nigel 91 #ifdef SUPPORT_UTF8
2560 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2561 nigel 91 #else
2562     utf8 = FALSE;
2563     #endif
2564 nigel 77
2565 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2566     (re->options & PCRE_ANCHORED) != 0;
2567    
2568 nigel 77 /* The remaining fixed data for passing around. */
2569    
2570 nigel 91 md->start_code = (const uschar *)argument_re +
2571 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2572 nigel 91 md->start_subject = (const unsigned char *)subject;
2573     md->end_subject = end_subject;
2574     md->moptions = options;
2575     md->poptions = re->options;
2576 nigel 77
2577 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2578     nothing is set at run time, whatever was used at compile time applies. */
2579 nigel 91
2580 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2581 nigel 93 PCRE_NEWLINE_BITS)
2582 nigel 91 {
2583 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2584 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2585     case PCRE_NEWLINE_LF: newline = '\n'; break;
2586     case PCRE_NEWLINE_CR+
2587     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2588 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2589 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2590 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2591 nigel 91 }
2592    
2593 ph10 149 if (newline == -2)
2594 nigel 91 {
2595 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2596     }
2597     else if (newline < 0)
2598     {
2599 nigel 93 md->nltype = NLTYPE_ANY;
2600 nigel 91 }
2601     else
2602     {
2603 nigel 93 md->nltype = NLTYPE_FIXED;
2604     if (newline > 255)
2605     {
2606     md->nllen = 2;
2607     md->nl[0] = (newline >> 8) & 255;
2608     md->nl[1] = newline & 255;
2609     }
2610     else
2611     {
2612     md->nllen = 1;
2613     md->nl[0] = newline;
2614     }
2615 nigel 91 }
2616    
2617 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2618     back the character offset. */
2619    
2620     #ifdef SUPPORT_UTF8
2621     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2622     {
2623     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2624     return PCRE_ERROR_BADUTF8;
2625     if (start_offset > 0 && start_offset < length)
2626     {
2627     int tb = ((uschar *)subject)[start_offset];
2628     if (tb > 127)
2629     {
2630     tb &= 0xc0;
2631     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2632     }
2633     }
2634     }
2635     #endif
2636    
2637     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2638     is a feature that makes it possible to save compiled regex and re-use them
2639     in other programs later. */
2640    
2641 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2642 nigel 77
2643     /* The lower casing table and the "must be at the start of a line" flag are
2644     used in a loop when finding where to start. */
2645    
2646 nigel 91 lcc = md->tables + lcc_offset;
2647 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2648 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2649    
2650     /* Set up the first character to match, if available. The first_byte value is
2651     never set for an anchored regular expression, but the anchoring may be forced
2652     at run time, so we have to test for anchoring. The first char may be unset for
2653     an unanchored pattern, of course. If there's no first char and the pattern was
2654     studied, there may be a bitmap of possible first characters. */
2655    
2656     if (!anchored)
2657     {
2658 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2659 nigel 77 {
2660     first_byte = re->first_byte & 255;
2661     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2662     first_byte = lcc[first_byte];
2663     }
2664     else
2665     {
2666     if (startline && study != NULL &&
2667     (study->options & PCRE_STUDY_MAPPED) != 0)
2668     start_bits = study->start_bits;
2669     }
2670     }
2671    
2672     /* For anchored or unanchored matches, there may be a "last known required
2673     character" set. */
2674    
2675 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2676 nigel 77 {
2677     req_byte = re->req_byte & 255;
2678     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2679 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2680 nigel 77 }
2681    
2682     /* Call the main matching function, looping for a non-anchored regex after a
2683     failed match. Unless restarting, optimize by moving to the first match
2684     character if possible, when not anchored. Then unless wanting a partial match,
2685     check for a required later character. */
2686    
2687     for (;;)
2688     {
2689     int rc;
2690    
2691     if ((options & PCRE_DFA_RESTART) == 0)
2692     {
2693     const uschar *save_end_subject = end_subject;
2694    
2695     /* Advance to a unique first char if possible. If firstline is TRUE, the
2696     start of the match is constrained to the first line of a multiline string.
2697 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2698     scanning at a newline. If the match fails at the newline, later code breaks
2699     this loop. */
2700 nigel 77
2701     if (firstline)
2702     {
2703     const uschar *t = current_subject;
2704 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2705 nigel 77 end_subject = t;
2706     }
2707    
2708     if (first_byte >= 0)
2709     {
2710     if (first_byte_caseless)
2711     while (current_subject < end_subject &&
2712     lcc[*current_subject] != first_byte)
2713     current_subject++;
2714     else
2715     while (current_subject < end_subject && *current_subject != first_byte)
2716     current_subject++;
2717     }
2718    
2719 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2720 nigel 77
2721     else if (startline)
2722     {
2723 nigel 93 if (current_subject > md->start_subject + start_offset)
2724 nigel 77 {
2725 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2726 nigel 77 current_subject++;
2727 ph10 130
2728 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2729     ANYCRLF, and we are now at a LF, advance the match position by one more
2730     character. */
2731 ph10 134
2732 ph10 130 if (current_subject[-1] == '\r' &&
2733 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2734 ph10 130 current_subject < end_subject &&
2735     *current_subject == '\n')
2736     current_subject++;
2737 nigel 77 }
2738     }
2739    
2740     /* Or to a non-unique first char after study */
2741    
2742     else if (start_bits != NULL)
2743     {
2744     while (current_subject < end_subject)
2745     {
2746     register unsigned int c = *current_subject;
2747     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2748     else break;
2749     }
2750     }
2751    
2752     /* Restore fudged end_subject */
2753    
2754     end_subject = save_end_subject;
2755     }
2756    
2757     /* If req_byte is set, we know that that character must appear in the subject
2758     for the match to succeed. If the first character is set, req_byte must be
2759     later in the subject; otherwise the test starts at the match point. This
2760     optimization can save a huge amount of work in patterns with nested unlimited
2761     repeats that aren't going to match. Writing separate code for cased/caseless
2762     versions makes it go faster, as does using an autoincrement and backing off
2763     on a match.
2764    
2765     HOWEVER: when the subject string is very, very long, searching to its end can
2766     take a long time, and give bad performance on quite ordinary patterns. This
2767     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2768     don't do this when the string is sufficiently long.
2769    
2770     ALSO: this processing is disabled when partial matching is requested.
2771     */
2772    
2773     if (req_byte >= 0 &&
2774     end_subject - current_subject < REQ_BYTE_MAX &&
2775     (options & PCRE_PARTIAL) == 0)
2776     {
2777     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2778    
2779     /* We don't need to repeat the search if we haven't yet reached the
2780     place we found it at last time. */
2781    
2782     if (p > req_byte_ptr)
2783     {
2784     if (req_byte_caseless)
2785     {
2786     while (p < end_subject)
2787     {
2788     register int pp = *p++;
2789     if (pp == req_byte || pp == req_byte2) { p--; break; }
2790     }
2791     }
2792     else
2793     {
2794     while (p < end_subject)
2795     {
2796     if (*p++ == req_byte) { p--; break; }
2797     }
2798     }
2799    
2800     /* If we can't find the required character, break the matching loop,
2801     which will cause a return of PCRE_ERROR_NOMATCH. */
2802    
2803     if (p >= end_subject) break;
2804    
2805     /* If we have found the required character, save the point where we
2806     found it, so that we don't search again next time round the loop if
2807     the start hasn't passed this character yet. */
2808    
2809     req_byte_ptr = p;
2810     }
2811     }
2812    
2813     /* OK, now we can do the business */
2814    
2815     rc = internal_dfa_exec(
2816 nigel 91 md, /* fixed match data */
2817     md->start_code, /* this subexpression's code */
2818     current_subject, /* where we currently are */
2819     start_offset, /* start offset in subject */
2820     offsets, /* offset vector */
2821     offsetcount, /* size of same */
2822     workspace, /* workspace vector */
2823     wscount, /* size of same */
2824 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2825 nigel 91 0, /* function recurse level */
2826     0); /* regex recurse level */
2827 nigel 77
2828     /* Anything other than "no match" means we are done, always; otherwise, carry
2829     on only if not anchored. */
2830    
2831     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2832    
2833     /* Advance to the next subject character unless we are at the end of a line
2834     and firstline is set. */
2835    
2836 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2837 nigel 77 current_subject++;
2838     if (utf8)
2839     {
2840     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2841     current_subject++;
2842     }
2843     if (current_subject > end_subject) break;
2844    
2845 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2846 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2847     or ANY or ANYCRLF, advance the match position by one more character. */
2848 nigel 93
2849     if (current_subject[-1] == '\r' &&
2850 ph10 226 current_subject < end_subject &&
2851     *current_subject == '\n' &&
2852 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2853 ph10 226 (md->nltype == NLTYPE_ANY ||
2854     md->nltype == NLTYPE_ANYCRLF ||
2855     md->nllen == 2))
2856 nigel 93 current_subject++;
2857    
2858     } /* "Bumpalong" loop */
2859    
2860 nigel 77 return PCRE_ERROR_NOMATCH;
2861     }
2862    
2863     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12