/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 231 - (hide annotations) (download)
Tue Sep 11 11:15:33 2007 UTC (7 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 95163 byte(s)
Add facility to make \R match only CR, LF, or CRLF.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 ph10 200 #ifdef HAVE_CONFIG_H
48 ph10 199 #include <config.h>
49 ph10 200 #endif
50 ph10 199
/* Local names for the match-data block and its subject-bounds fields. They
are defined ahead of the inclusion of pcre_internal.h (presumably so that the
shared macros in that header can refer to them - confirm against the header). */

#define NLBLOCK  md             /* Block containing newline information */
#define PSSTART  start_subject  /* Field containing processed string start */
#define PSEND    end_subject    /* Field containing processed string end */
54    
55 nigel 77 #include "pcre_internal.h"
56    
57    
/* Run of blanks used to indent debugging output. The debug calls print it
with "%.*s" and a precision of rlevel*2-2, so the string has to be at least
that long for nested calls to be indented correctly; a one-character string
would cap every indent at a single column. */

#define SP "                                "
61    
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
68     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
70 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
71 ph10 178 never stored, so we push them well clear of the normal opcodes. */
72 nigel 77
/* Offsets added to the OP_TYPExxx opcodes to form the synthetic opcodes for
\p/\P, \X, \R, \h/\H and \v/\V. The first block starts at 300, clear of the
real opcodes, and each later block sits 20 above the one before it. */

#define OP_PROP_EXTRA    (300)
#define OP_EXTUNI_EXTRA  (OP_PROP_EXTRA   + 20)
#define OP_ANYNL_EXTRA   (OP_EXTUNI_EXTRA + 20)
#define OP_HSPACE_EXTRA  (OP_ANYNL_EXTRA  + 20)
#define OP_VSPACE_EXTRA  (OP_HSPACE_EXTRA + 20)
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes it possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified. */
86 nigel 77
87     static uschar coptable[] = {
88     0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 nigel 77 0, 0, /* Any, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
134     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 ph10 211 0, 0 /* FAIL, ACCEPT */
136 nigel 77 };
137    
138     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139     and \w */
140    
141     static uschar toptable1[] = {
142 ph10 168 0, 0, 0, 0, 0, 0,
143 nigel 77 ctype_digit, ctype_digit,
144     ctype_space, ctype_space,
145     ctype_word, ctype_word,
146     0 /* OP_ANY */
147     };
148    
149     static uschar toptable2[] = {
150 ph10 168 0, 0, 0, 0, 0, 0,
151 nigel 77 ctype_digit, 0,
152     ctype_space, 0,
153     ctype_word, 0,
154     1 /* OP_ANY */
155     };
156    
157    
158     /* Structure for holding data about a particular state, which is in effect the
159     current data for an active path through the match tree. It must consist
160     entirely of ints because the working vector we are passed, and which we put
161     these structures in, is a vector of ints. */
162    
/* One active path through the match tree. Every member is an int because
these blocks are laid out directly in the caller's int workspace vector. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means */
                                  /* "hold off" until data characters have    */
                                  /* been skipped                             */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The value is cast to int so that
arithmetic mixing it with the signed workspace count stays signed instead of
being silently converted to size_t. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
171    
172    
173     #ifdef DEBUG
174     /*************************************************
175     * Print character string *
176     *************************************************/
177    
178     /* Character string printing function for debugging.
179    
180     Arguments:
181     p points to string
182     length number of bytes
183     f where to print
184    
185     Returns: nothing
186     */
187    
/* Write length bytes starting at p to the stream f. Bytes for which isprint()
is true are written as themselves; every other byte is written as \x followed
by two lower-case hex digits. The data is never modified, so p points to
const. Nothing is written when length is zero or negative. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
200     #endif
201    
202    
203    
204     /*************************************************
205     * Execute a Regular Expression - DFA engine *
206     *************************************************/
207    
208     /* This internal function applies a compiled pattern to a subject string,
209     starting at a given point, using a DFA engine. This function is called from the
210     external one, possibly multiple times if the pattern is not anchored. The
211     function calls itself recursively for some kinds of subpattern.
212    
213     Arguments:
214     md the match_data block with fixed information
215     this_start_code the opening bracket of this subexpression's code
216     current_subject where we currently are in the subject string
217     start_offset start offset in the subject string
218     offsets vector to contain the matching string offsets
219     offsetcount size of same
220     workspace vector of workspace
221     wscount size of same
222     ims the current ims flags
223     rlevel function call recursion level
224     recursing regex recursive call level
225    
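226     Returns: > 0 => number of matched strings recorded in the offsets vector
227     = 0 => matched, but the offsets vector was too small for every match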
228     -1 => failed to match
229     < -1 => some kind of unexpected problem
230    
231     The following macros are used for adding states to the two state vectors (one
232     for the current character, one for the following character). */
233    
/* Each macro appends one state to a state list, using the local variables of
the function that invokes it (the list's count and next-slot pointer, wscount,
ims and rlevel). If the list already holds wscount entries the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. The plain forms leave the data field of
the new entry untouched; the _DATA forms store z in it. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
279    
280     /* And now, here is the code */
281    
282     static int
283     internal_dfa_exec(
284     dfa_match_data *md,
285     const uschar *this_start_code,
286     const uschar *current_subject,
287     int start_offset,
288     int *offsets,
289     int offsetcount,
290     int *workspace,
291     int wscount,
292     int ims,
293     int rlevel,
294     int recursing)
295     {
296     stateblock *active_states, *new_states, *temp_states;
297     stateblock *next_active_state, *next_new_state;
298    
299     const uschar *ctypes, *lcc, *fcc;
300     const uschar *ptr;
301 nigel 93 const uschar *end_code, *first_op;
302 nigel 77
303     int active_count, new_count, match_count;
304    
305     /* Some fields in the md block are frequently referenced, so we load them into
306     independent variables in the hope that this will perform better. */
307    
308     const uschar *start_subject = md->start_subject;
309     const uschar *end_subject = md->end_subject;
310     const uschar *start_code = md->start_code;
311    
312 nigel 87 #ifdef SUPPORT_UTF8
313 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 nigel 93 #else
315     BOOL utf8 = FALSE;
316 nigel 87 #endif
317 nigel 77
318     rlevel++;
319     offsetcount &= (-2);
320    
321     wscount -= 2;
322     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323     (2 * INTS_PER_STATEBLOCK);
324    
325     DPRINTF(("\n%.*s---------------------\n"
326     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328    
329     ctypes = md->tables + ctypes_offset;
330     lcc = md->tables + lcc_offset;
331     fcc = md->tables + fcc_offset;
332    
333     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334    
335     active_states = (stateblock *)(workspace + 2);
336     next_new_state = new_states = active_states + wscount;
337     new_count = 0;
338    
339 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
340     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343     the alternative states onto the list, and find out where the end is. This
344     makes it possible to use this function recursively, when we want to stop at a
345     matching internal ket rather than at the end.
346    
347     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348     a backward assertion. In that case, we have to find out the maximum amount to
349     move back, and set up each alternative appropriately. */
350    
351 nigel 93 if (*first_op == OP_REVERSE)
352 nigel 77 {
353     int max_back = 0;
354     int gone_back;
355    
356     end_code = this_start_code;
357     do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363     while (*end_code == OP_ALT);
364    
365     /* If we can't go back the amount required for the longest lookbehind
366     pattern, go back as far as we can; some alternatives may still be viable. */
367    
368     #ifdef SUPPORT_UTF8
369     /* In character mode we have to step back character by character */
370    
371     if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374     {
375     if (current_subject <= start_subject) break;
376     current_subject--;
377     while (current_subject > start_subject &&
378     (*current_subject & 0xc0) == 0x80)
379     current_subject--;
380     }
381     }
382     else
383     #endif
384    
385     /* In byte-mode we can do this quickly. */
386    
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389     current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392    
393     /* Now we can process the individual branches. */
394    
395     end_code = this_start_code;
396     do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400     {
401     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402     ADD_NEW_DATA(-bstate, 0, gone_back - back);
403     }
404     end_code += GET(end_code, 1);
405     }
406     while (*end_code == OP_ALT);
407     }
408    
409     /* This is the code for a "normal" subpattern (not a backward assertion). The
410     start of a whole pattern is always one of these. If we are at the top level,
411     we may be asked to restart matching from the same point that we reached for a
412     previous partial match. We still have to scan through the top-level branches to
413     find the end state. */
414    
415     else
416     {
417     end_code = this_start_code;
418    
419     /* Restarting */
420    
421     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426     memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428    
429     /* Not restarting */
430    
431     else
432     {
433 nigel 93 int length = 1 + LINK_SIZE +
434     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 nigel 77 do
436     {
437 nigel 93 ADD_NEW(end_code - start_code + length, 0);
438 nigel 77 end_code += GET(end_code, 1);
439 nigel 93 length = 1 + LINK_SIZE;
440 nigel 77 }
441     while (*end_code == OP_ALT);
442     }
443     }
444    
445     workspace[0] = 0; /* Bit indicating which vector is current */
446    
447     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448    
449     /* Loop for scanning the subject */
450    
451     ptr = current_subject;
452     for (;;)
453     {
454     int i, j;
455 nigel 91 int clen, dlen;
456     unsigned int c, d;
457 nigel 77
458     /* Make the new state list into the active state list and empty the
459     new state list. */
460    
461     temp_states = active_states;
462     active_states = new_states;
463     new_states = temp_states;
464     active_count = new_count;
465     new_count = 0;
466    
467     workspace[0] ^= 1; /* Remember for the restarting feature */
468     workspace[1] = active_count;
469    
470     #ifdef DEBUG
471     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473     printf("\"\n");
474    
475     printf("%.*sActive states: ", rlevel*2-2, SP);
476     for (i = 0; i < active_count; i++)
477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
478     printf("\n");
479     #endif
480    
481     /* Set the pointers for adding new states */
482    
483     next_active_state = active_states + active_count;
484     next_new_state = new_states;
485    
486     /* Load the current character from the subject outside the loop, as many
487     different states may want to look at it, and we assume that at least one
488     will. */
489    
490     if (ptr < end_subject)
491     {
492 nigel 93 clen = 1; /* Number of bytes in the character */
493 nigel 77 #ifdef SUPPORT_UTF8
494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
495     #endif /* SUPPORT_UTF8 */
496     c = *ptr;
497     }
498     else
499     {
500 nigel 93 clen = 0; /* This indicates the end of the subject */
501     c = NOTACHAR; /* This value should never actually be used */
502 nigel 77 }
503    
504     /* Scan up the active states and act on each one. The result of an action
505     may be to add more states to the currently active list (e.g. on hitting a
506     parenthesis) or it may be to put states on the new list, for considering
507     when we move the character pointer on. */
508    
509     for (i = 0; i < active_count; i++)
510     {
511     stateblock *current_state = active_states + i;
512     const uschar *code;
513     int state_offset = current_state->offset;
514     int count, codevalue;
515 ph10 152 #ifdef SUPPORT_UCP
516 nigel 87 int chartype, script;
517 ph10 152 #endif
518 nigel 77
519     #ifdef DEBUG
520     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
521 nigel 93 if (clen == 0) printf("EOL\n");
522 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
523     else printf("0x%02x\n", c);
524     #endif
525    
526     /* This variable is referred to implicitly in the ADD_xxx macros. */
527    
528     ims = current_state->ims;
529    
530     /* A negative offset is a special case meaning "hold off going to this
531     (negated) state until the number of characters in the data field have
532     been skipped". */
533    
534     if (state_offset < 0)
535     {
536     if (current_state->data > 0)
537     {
538     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
539     ADD_NEW_DATA(state_offset, current_state->count,
540     current_state->data - 1);
541     continue;
542     }
543     else
544     {
545     current_state->offset = state_offset = -state_offset;
546     }
547     }
548    
549     /* Check for a duplicate state with the same count, and skip if found. */
550    
551     for (j = 0; j < i; j++)
552     {
553     if (active_states[j].offset == state_offset &&
554     active_states[j].count == current_state->count)
555     {
556     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
557     goto NEXT_ACTIVE_STATE;
558     }
559     }
560    
561     /* The state offset is the offset to the opcode */
562    
563     code = start_code + state_offset;
564     codevalue = *code;
565    
566     /* If this opcode is followed by an inline character, load it. It is
567     tempting to test for the presence of a subject character here, but that
568     is wrong, because sometimes zero repetitions of the subject are
569     permitted.
570    
571     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572 ph10 178 argument that is not a data character - but is always one byte long. We
573     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
574     this case. To keep the other cases fast, convert these ones to new opcodes.
575     */
576 nigel 77
577     if (coptable[codevalue] > 0)
578     {
579     dlen = 1;
580     #ifdef SUPPORT_UTF8
581     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
582     #endif /* SUPPORT_UTF8 */
583     d = code[coptable[codevalue]];
584     if (codevalue >= OP_TYPESTAR)
585     {
586 nigel 93 switch(d)
587     {
588     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
589     case OP_NOTPROP:
590     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
593 ph10 178 case OP_NOT_HSPACE:
594 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
595 ph10 178 case OP_NOT_VSPACE:
596 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597 nigel 93 default: break;
598     }
599 nigel 77 }
600     }
601     else
602     {
603     dlen = 0; /* Not strictly necessary, but compilers moan */
604 nigel 93 d = NOTACHAR; /* if these variables are not set. */
605 nigel 77 }
606    
607    
608     /* Now process the individual opcodes */
609    
610     switch (codevalue)
611     {
612    
613     /* ========================================================================== */
614     /* Reached a closing bracket. If not at the end of the pattern, carry
615     on with the next opcode. Otherwise, unless we have an empty string and
616     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
617     matches so we always have the longest first. */
618    
619     case OP_KET:
620     case OP_KETRMIN:
621     case OP_KETRMAX:
622     if (code != end_code)
623     {
624     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
625     if (codevalue != OP_KET)
626     {
627     ADD_ACTIVE(state_offset - GET(code, 1), 0);
628     }
629     }
630     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
631     {
632     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
633     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
634     match_count = 0;
635     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
636     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
637     if (offsetcount >= 2)
638     {
639     offsets[0] = current_subject - start_subject;
640     offsets[1] = ptr - start_subject;
641     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
642     offsets[1] - offsets[0], current_subject));
643     }
644     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
645     {
646     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
647     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
648     match_count, rlevel*2-2, SP));
649     return match_count;
650     }
651     }
652     break;
653    
654     /* ========================================================================== */
655     /* These opcodes add to the current list of states without looking
656     at the current character. */
657    
658     /*-----------------------------------------------------------------*/
659     case OP_ALT:
660     do { code += GET(code, 1); } while (*code == OP_ALT);
661     ADD_ACTIVE(code - start_code, 0);
662     break;
663    
664     /*-----------------------------------------------------------------*/
665     case OP_BRA:
666 nigel 93 case OP_SBRA:
667 nigel 77 do
668     {
669     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
670     code += GET(code, 1);
671     }
672     while (*code == OP_ALT);
673     break;
674    
675     /*-----------------------------------------------------------------*/
676 nigel 93 case OP_CBRA:
677     case OP_SCBRA:
678     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
679     code += GET(code, 1);
680     while (*code == OP_ALT)
681     {
682     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
683     code += GET(code, 1);
684     }
685     break;
686    
687     /*-----------------------------------------------------------------*/
688 nigel 77 case OP_BRAZERO:
689     case OP_BRAMINZERO:
690     ADD_ACTIVE(state_offset + 1, 0);
691     code += 1 + GET(code, 2);
692     while (*code == OP_ALT) code += GET(code, 1);
693     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
694     break;
695    
696     /*-----------------------------------------------------------------*/
697     case OP_CIRC:
698     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
699 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
700     ptr != end_subject &&
701 nigel 93 WAS_NEWLINE(ptr)))
702 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
703     break;
704    
705     /*-----------------------------------------------------------------*/
706     case OP_EOD:
707     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
708     break;
709    
710     /*-----------------------------------------------------------------*/
711     case OP_OPT:
712     ims = code[1];
713     ADD_ACTIVE(state_offset + 2, 0);
714     break;
715    
716     /*-----------------------------------------------------------------*/
717     case OP_SOD:
718     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
719     break;
720    
721     /*-----------------------------------------------------------------*/
722     case OP_SOM:
723     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
724     break;
725    
726    
727     /* ========================================================================== */
728     /* These opcodes inspect the next subject character, and sometimes
729     the previous one as well, but do not have an argument. The variable
730     clen contains the length of the current character and is zero if we are
731     at the end of the subject. */
732    
733     /*-----------------------------------------------------------------*/
734     case OP_ANY:
735 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
736 nigel 77 { ADD_NEW(state_offset + 1, 0); }
737     break;
738    
739     /*-----------------------------------------------------------------*/
740     case OP_EODN:
741 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
742 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
743     break;
744    
745     /*-----------------------------------------------------------------*/
746     case OP_DOLL:
747     if ((md->moptions & PCRE_NOTEOL) == 0)
748     {
749 nigel 91 if (clen == 0 ||
750 nigel 93 (IS_NEWLINE(ptr) &&
751 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
752     ))
753 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
754     }
755 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
756 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
757     break;
758    
759     /*-----------------------------------------------------------------*/
760    
761     case OP_DIGIT:
762     case OP_WHITESPACE:
763     case OP_WORDCHAR:
764     if (clen > 0 && c < 256 &&
765     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
766     { ADD_NEW(state_offset + 1, 0); }
767     break;
768    
769     /*-----------------------------------------------------------------*/
770     case OP_NOT_DIGIT:
771     case OP_NOT_WHITESPACE:
772     case OP_NOT_WORDCHAR:
773     if (clen > 0 && (c >= 256 ||
774     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
775     { ADD_NEW(state_offset + 1, 0); }
776     break;
777    
778     /*-----------------------------------------------------------------*/
779     case OP_WORD_BOUNDARY:
780     case OP_NOT_WORD_BOUNDARY:
781     {
782     int left_word, right_word;
783    
784     if (ptr > start_subject)
785     {
786     const uschar *temp = ptr - 1;
787     #ifdef SUPPORT_UTF8
788     if (utf8) BACKCHAR(temp);
789     #endif
790     GETCHARTEST(d, temp);
791     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
792     }
793     else left_word = 0;
794    
795     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
796     else right_word = 0;
797    
798     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
799     { ADD_ACTIVE(state_offset + 1, 0); }
800     }
801     break;
802    
803    
804     /*-----------------------------------------------------------------*/
805     /* Check the next character by Unicode property. We will get here only
806     if the support is in the binary; otherwise a compile-time error occurs.
807     */
808    
809 ph10 151 #ifdef SUPPORT_UCP
810 nigel 77 case OP_PROP:
811     case OP_NOTPROP:
812     if (clen > 0)
813     {
814 nigel 87 BOOL OK;
815     int category = _pcre_ucp_findprop(c, &chartype, &script);
816     switch(code[1])
817 nigel 77 {
818 nigel 87 case PT_ANY:
819     OK = TRUE;
820     break;
821    
822     case PT_LAMP:
823     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
824     break;
825    
826     case PT_GC:
827     OK = category == code[2];
828     break;
829    
830     case PT_PC:
831     OK = chartype == code[2];
832     break;
833    
834     case PT_SC:
835     OK = script == code[2];
836     break;
837    
838     /* Should never occur, but keep compilers from grumbling. */
839    
840     default:
841     OK = codevalue != OP_PROP;
842     break;
843 nigel 77 }
844 nigel 87
845     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
846 nigel 77 }
847     break;
848     #endif
849    
850    
851    
852     /* ========================================================================== */
853     /* These opcodes likewise inspect the subject character, but have an
854     argument that is not a data character. It is one of these opcodes:
855     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
856     OP_NOT_WORDCHAR. The value is loaded into d. */
857    
858     case OP_TYPEPLUS:
859     case OP_TYPEMINPLUS:
860 nigel 93 case OP_TYPEPOSPLUS:
861 nigel 77 count = current_state->count; /* Already matched */
862     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
863     if (clen > 0)
864     {
865     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
866     (c < 256 &&
867 nigel 91 (d != OP_ANY ||
868     (ims & PCRE_DOTALL) != 0 ||
869     !IS_NEWLINE(ptr)
870     ) &&
871 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
872     {
873 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
874     {
875     active_count--; /* Remove non-match possibility */
876     next_active_state--;
877     }
878 nigel 77 count++;
879     ADD_NEW(state_offset, count);
880     }
881     }
882     break;
883    
884     /*-----------------------------------------------------------------*/
885     case OP_TYPEQUERY:
886     case OP_TYPEMINQUERY:
887 nigel 93 case OP_TYPEPOSQUERY:
888 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
889     if (clen > 0)
890     {
891     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
892     (c < 256 &&
893 nigel 91 (d != OP_ANY ||
894     (ims & PCRE_DOTALL) != 0 ||
895     !IS_NEWLINE(ptr)
896     ) &&
897 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
898     {
899 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
900     {
901     active_count--; /* Remove non-match possibility */
902     next_active_state--;
903     }
904 nigel 77 ADD_NEW(state_offset + 2, 0);
905     }
906     }
907     break;
908    
909     /*-----------------------------------------------------------------*/
910     case OP_TYPESTAR:
911     case OP_TYPEMINSTAR:
912 nigel 93 case OP_TYPEPOSSTAR:
913 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
914     if (clen > 0)
915     {
916     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
917     (c < 256 &&
918 nigel 91 (d != OP_ANY ||
919     (ims & PCRE_DOTALL) != 0 ||
920     !IS_NEWLINE(ptr)
921     ) &&
922 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
923     {
924 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
925     {
926     active_count--; /* Remove non-match possibility */
927     next_active_state--;
928     }
929 nigel 77 ADD_NEW(state_offset, 0);
930     }
931     }
932     break;
933    
934     /*-----------------------------------------------------------------*/
935     case OP_TYPEEXACT:
936 nigel 93 count = current_state->count; /* Number already matched */
937     if (clen > 0)
938     {
939     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
940     (c < 256 &&
941     (d != OP_ANY ||
942     (ims & PCRE_DOTALL) != 0 ||
943     !IS_NEWLINE(ptr)
944     ) &&
945     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
946     {
947     if (++count >= GET2(code, 1))
948     { ADD_NEW(state_offset + 4, 0); }
949     else
950     { ADD_NEW(state_offset, count); }
951     }
952     }
953     break;
954    
955     /*-----------------------------------------------------------------*/
956 nigel 77 case OP_TYPEUPTO:
957     case OP_TYPEMINUPTO:
958 nigel 93 case OP_TYPEPOSUPTO:
959     ADD_ACTIVE(state_offset + 4, 0);
960 nigel 77 count = current_state->count; /* Number already matched */
961     if (clen > 0)
962     {
963     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
964     (c < 256 &&
965 nigel 91 (d != OP_ANY ||
966     (ims & PCRE_DOTALL) != 0 ||
967     !IS_NEWLINE(ptr)
968     ) &&
969 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
970     {
971 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
972     {
973     active_count--; /* Remove non-match possibility */
974     next_active_state--;
975     }
976 nigel 77 if (++count >= GET2(code, 1))
977     { ADD_NEW(state_offset + 4, 0); }
978     else
979     { ADD_NEW(state_offset, count); }
980     }
981     }
982     break;
983    
984     /* ========================================================================== */
985     /* These are virtual opcodes that are used when something like
986 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
987     argument. It keeps the code above fast for the other cases. The argument
988     is in the d variable. */
989 nigel 77
990 ph10 151 #ifdef SUPPORT_UCP
991 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
992     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
993 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
994 nigel 77 count = current_state->count; /* Already matched */
995 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
996 nigel 77 if (clen > 0)
997     {
998 nigel 87 BOOL OK;
999     int category = _pcre_ucp_findprop(c, &chartype, &script);
1000     switch(code[2])
1001     {
1002     case PT_ANY:
1003     OK = TRUE;
1004     break;
1005    
1006     case PT_LAMP:
1007     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1008     break;
1009    
1010     case PT_GC:
1011     OK = category == code[3];
1012     break;
1013    
1014     case PT_PC:
1015     OK = chartype == code[3];
1016     break;
1017    
1018     case PT_SC:
1019     OK = script == code[3];
1020     break;
1021    
1022     /* Should never occur, but keep compilers from grumbling. */
1023    
1024     default:
1025     OK = codevalue != OP_PROP;
1026     break;
1027     }
1028    
1029 nigel 93 if (OK == (d == OP_PROP))
1030     {
1031     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1032     {
1033     active_count--; /* Remove non-match possibility */
1034     next_active_state--;
1035     }
1036     count++;
1037     ADD_NEW(state_offset, count);
1038     }
1039 nigel 77 }
1040     break;
1041    
1042     /*-----------------------------------------------------------------*/
1043     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1044     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1045 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1046 nigel 77 count = current_state->count; /* Already matched */
1047     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1048 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1049 nigel 77 {
1050     const uschar *nptr = ptr + clen;
1051     int ncount = 0;
1052 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1053     {
1054     active_count--; /* Remove non-match possibility */
1055     next_active_state--;
1056     }
1057 nigel 77 while (nptr < end_subject)
1058     {
1059     int nd;
1060     int ndlen = 1;
1061     GETCHARLEN(nd, nptr, ndlen);
1062 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1063 nigel 77 ncount++;
1064     nptr += ndlen;
1065     }
1066     count++;
1067     ADD_NEW_DATA(-state_offset, count, ncount);
1068     }
1069     break;
1070 ph10 151 #endif
1071 nigel 77
1072     /*-----------------------------------------------------------------*/
1073 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1074     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1075     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1076     count = current_state->count; /* Already matched */
1077     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1078     if (clen > 0)
1079     {
1080     int ncount = 0;
1081     switch (c)
1082     {
1083     case 0x000b:
1084     case 0x000c:
1085     case 0x0085:
1086     case 0x2028:
1087     case 0x2029:
1088 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1089     goto ANYNL01;
1090    
1091     case 0x000d:
1092     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1093     /* Fall through */
1094    
1095     ANYNL01:
1096     case 0x000a:
1097 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1098     {
1099     active_count--; /* Remove non-match possibility */
1100     next_active_state--;
1101     }
1102     count++;
1103     ADD_NEW_DATA(-state_offset, count, ncount);
1104     break;
1105 ph10 231
1106 nigel 93 default:
1107     break;
1108     }
1109     }
1110     break;
1111    
1112     /*-----------------------------------------------------------------*/
1113 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1114     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1115     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1116     count = current_state->count; /* Already matched */
1117     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1118     if (clen > 0)
1119     {
1120 ph10 182 BOOL OK;
1121 ph10 178 switch (c)
1122     {
1123     case 0x000a:
1124     case 0x000b:
1125     case 0x000c:
1126     case 0x000d:
1127     case 0x0085:
1128     case 0x2028:
1129     case 0x2029:
1130     OK = TRUE;
1131 ph10 182 break;
1132 ph10 178
1133     default:
1134     OK = FALSE;
1135 ph10 182 break;
1136 ph10 178 }
1137    
1138     if (OK == (d == OP_VSPACE))
1139 ph10 182 {
1140 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1141     {
1142     active_count--; /* Remove non-match possibility */
1143     next_active_state--;
1144     }
1145     count++;
1146     ADD_NEW_DATA(-state_offset, count, 0);
1147     }
1148     }
1149     break;
1150    
1151     /*-----------------------------------------------------------------*/
1152     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1153     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1154     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1155     count = current_state->count; /* Already matched */
1156     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1157     if (clen > 0)
1158     {
1159 ph10 182 BOOL OK;
1160 ph10 178 switch (c)
1161     {
1162     case 0x09: /* HT */
1163     case 0x20: /* SPACE */
1164     case 0xa0: /* NBSP */
1165     case 0x1680: /* OGHAM SPACE MARK */
1166     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1167     case 0x2000: /* EN QUAD */
1168     case 0x2001: /* EM QUAD */
1169     case 0x2002: /* EN SPACE */
1170     case 0x2003: /* EM SPACE */
1171     case 0x2004: /* THREE-PER-EM SPACE */
1172     case 0x2005: /* FOUR-PER-EM SPACE */
1173     case 0x2006: /* SIX-PER-EM SPACE */
1174     case 0x2007: /* FIGURE SPACE */
1175     case 0x2008: /* PUNCTUATION SPACE */
1176     case 0x2009: /* THIN SPACE */
1177     case 0x200A: /* HAIR SPACE */
1178     case 0x202f: /* NARROW NO-BREAK SPACE */
1179     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1180     case 0x3000: /* IDEOGRAPHIC SPACE */
1181     OK = TRUE;
1182     break;
1183 ph10 182
1184 ph10 178 default:
1185     OK = FALSE;
1186     break;
1187     }
1188 ph10 182
1189 ph10 178 if (OK == (d == OP_HSPACE))
1190 ph10 182 {
1191 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1192     {
1193     active_count--; /* Remove non-match possibility */
1194     next_active_state--;
1195     }
1196     count++;
1197     ADD_NEW_DATA(-state_offset, count, 0);
1198     }
1199     }
1200     break;
1201    
1202     /*-----------------------------------------------------------------*/
1203 ph10 151 #ifdef SUPPORT_UCP
1204 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1205     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1206 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1207 nigel 87 count = 4;
1208 nigel 77 goto QS1;
1209    
1210     case OP_PROP_EXTRA + OP_TYPESTAR:
1211     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1212 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1213 nigel 77 count = 0;
1214    
1215     QS1:
1216    
1217 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1218 nigel 77 if (clen > 0)
1219     {
1220 nigel 87 BOOL OK;
1221     int category = _pcre_ucp_findprop(c, &chartype, &script);
1222     switch(code[2])
1223     {
1224     case PT_ANY:
1225     OK = TRUE;
1226     break;
1227    
1228     case PT_LAMP:
1229     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1230     break;
1231    
1232     case PT_GC:
1233     OK = category == code[3];
1234     break;
1235    
1236     case PT_PC:
1237     OK = chartype == code[3];
1238     break;
1239    
1240     case PT_SC:
1241     OK = script == code[3];
1242     break;
1243    
1244     /* Should never occur, but keep compilers from grumbling. */
1245    
1246     default:
1247     OK = codevalue != OP_PROP;
1248     break;
1249     }
1250    
1251 nigel 93 if (OK == (d == OP_PROP))
1252     {
1253     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1254     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1255     {
1256     active_count--; /* Remove non-match possibility */
1257     next_active_state--;
1258     }
1259     ADD_NEW(state_offset + count, 0);
1260     }
1261 nigel 77 }
1262     break;
1263    
1264     /*-----------------------------------------------------------------*/
1265     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1266     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1267 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1268 nigel 77 count = 2;
1269     goto QS2;
1270    
1271     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1272     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1273 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1274 nigel 77 count = 0;
1275    
1276     QS2:
1277    
1278     ADD_ACTIVE(state_offset + 2, 0);
1279 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1280 nigel 77 {
1281     const uschar *nptr = ptr + clen;
1282     int ncount = 0;
1283 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1284     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1285     {
1286     active_count--; /* Remove non-match possibility */
1287     next_active_state--;
1288     }
1289 nigel 77 while (nptr < end_subject)
1290     {
1291     int nd;
1292     int ndlen = 1;
1293     GETCHARLEN(nd, nptr, ndlen);
1294 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1295 nigel 77 ncount++;
1296     nptr += ndlen;
1297     }
1298     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1299     }
1300     break;
1301 ph10 151 #endif
1302 nigel 77
1303     /*-----------------------------------------------------------------*/
1304 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1305     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1306     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1307     count = 2;
1308     goto QS3;
1309    
1310     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1311     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1312     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1313     count = 0;
1314    
1315     QS3:
1316     ADD_ACTIVE(state_offset + 2, 0);
1317     if (clen > 0)
1318     {
1319     int ncount = 0;
1320     switch (c)
1321     {
1322     case 0x000b:
1323     case 0x000c:
1324     case 0x0085:
1325     case 0x2028:
1326     case 0x2029:
1327 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1328     goto ANYNL02;
1329    
1330     case 0x000d:
1331     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1332     /* Fall through */
1333    
1334     ANYNL02:
1335     case 0x000a:
1336 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1337     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1338     {
1339     active_count--; /* Remove non-match possibility */
1340     next_active_state--;
1341     }
1342     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1343     break;
1344 ph10 231
1345 nigel 93 default:
1346     break;
1347     }
1348     }
1349     break;
1350    
1351     /*-----------------------------------------------------------------*/
1352 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1353     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1354     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1355     count = 2;
1356     goto QS4;
1357    
1358     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1359     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1360     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1361     count = 0;
1362    
1363     QS4:
1364     ADD_ACTIVE(state_offset + 2, 0);
1365     if (clen > 0)
1366     {
1367 ph10 182 BOOL OK;
1368 ph10 178 switch (c)
1369     {
1370     case 0x000a:
1371     case 0x000b:
1372     case 0x000c:
1373     case 0x000d:
1374     case 0x0085:
1375     case 0x2028:
1376     case 0x2029:
1377     OK = TRUE;
1378     break;
1379 ph10 182
1380 ph10 178 default:
1381     OK = FALSE;
1382     break;
1383     }
1384     if (OK == (d == OP_VSPACE))
1385 ph10 182 {
1386 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1387     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1388     {
1389     active_count--; /* Remove non-match possibility */
1390     next_active_state--;
1391     }
1392     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1393     }
1394     }
1395     break;
1396    
1397     /*-----------------------------------------------------------------*/
1398     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1399     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1400     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1401     count = 2;
1402     goto QS5;
1403    
1404     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1405     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1406     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1407     count = 0;
1408    
1409     QS5:
1410     ADD_ACTIVE(state_offset + 2, 0);
1411     if (clen > 0)
1412     {
1413 ph10 182 BOOL OK;
1414 ph10 178 switch (c)
1415     {
1416     case 0x09: /* HT */
1417     case 0x20: /* SPACE */
1418     case 0xa0: /* NBSP */
1419     case 0x1680: /* OGHAM SPACE MARK */
1420     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1421     case 0x2000: /* EN QUAD */
1422     case 0x2001: /* EM QUAD */
1423     case 0x2002: /* EN SPACE */
1424     case 0x2003: /* EM SPACE */
1425     case 0x2004: /* THREE-PER-EM SPACE */
1426     case 0x2005: /* FOUR-PER-EM SPACE */
1427     case 0x2006: /* SIX-PER-EM SPACE */
1428     case 0x2007: /* FIGURE SPACE */
1429     case 0x2008: /* PUNCTUATION SPACE */
1430     case 0x2009: /* THIN SPACE */
1431     case 0x200A: /* HAIR SPACE */
1432     case 0x202f: /* NARROW NO-BREAK SPACE */
1433     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1434     case 0x3000: /* IDEOGRAPHIC SPACE */
1435     OK = TRUE;
1436     break;
1437 ph10 182
1438 ph10 178 default:
1439     OK = FALSE;
1440     break;
1441     }
1442 ph10 182
1443 ph10 178 if (OK == (d == OP_HSPACE))
1444 ph10 182 {
1445 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1446     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1447     {
1448     active_count--; /* Remove non-match possibility */
1449     next_active_state--;
1450     }
1451     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1452     }
1453     }
1454     break;
1455    
1456     /*-----------------------------------------------------------------*/
1457 ph10 151 #ifdef SUPPORT_UCP
1458 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1459     case OP_PROP_EXTRA + OP_TYPEUPTO:
1460     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1461 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1462 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1463 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1464 nigel 77 count = current_state->count; /* Number already matched */
1465     if (clen > 0)
1466     {
1467 nigel 87 BOOL OK;
1468     int category = _pcre_ucp_findprop(c, &chartype, &script);
1469     switch(code[4])
1470 nigel 77 {
1471 nigel 87 case PT_ANY:
1472     OK = TRUE;
1473     break;
1474    
1475     case PT_LAMP:
1476     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1477     break;
1478    
1479     case PT_GC:
1480     OK = category == code[5];
1481     break;
1482    
1483     case PT_PC:
1484     OK = chartype == code[5];
1485     break;
1486    
1487     case PT_SC:
1488     OK = script == code[5];
1489     break;
1490    
1491     /* Should never occur, but keep compilers from grumbling. */
1492    
1493     default:
1494     OK = codevalue != OP_PROP;
1495     break;
1496     }
1497    
1498     if (OK == (d == OP_PROP))
1499     {
1500 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1501     {
1502     active_count--; /* Remove non-match possibility */
1503     next_active_state--;
1504     }
1505 nigel 77 if (++count >= GET2(code, 1))
1506 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1507 nigel 77 else
1508     { ADD_NEW(state_offset, count); }
1509     }
1510     }
1511     break;
1512    
1513     /*-----------------------------------------------------------------*/
1514     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1515     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1516     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1517 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1518 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1519     { ADD_ACTIVE(state_offset + 4, 0); }
1520     count = current_state->count; /* Number already matched */
1521 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1522 nigel 77 {
1523     const uschar *nptr = ptr + clen;
1524     int ncount = 0;
1525 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1526     {
1527     active_count--; /* Remove non-match possibility */
1528     next_active_state--;
1529     }
1530 nigel 77 while (nptr < end_subject)
1531     {
1532     int nd;
1533     int ndlen = 1;
1534     GETCHARLEN(nd, nptr, ndlen);
1535 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1536 nigel 77 ncount++;
1537     nptr += ndlen;
1538     }
1539     if (++count >= GET2(code, 1))
1540     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1541     else
1542     { ADD_NEW_DATA(-state_offset, count, ncount); }
1543     }
1544     break;
1545 ph10 151 #endif
1546 nigel 77
1547 nigel 93 /*-----------------------------------------------------------------*/
1548     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1549     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1550     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1551     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1552     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1553     { ADD_ACTIVE(state_offset + 4, 0); }
1554     count = current_state->count; /* Number already matched */
1555     if (clen > 0)
1556     {
1557     int ncount = 0;
1558     switch (c)
1559     {
1560     case 0x000b:
1561     case 0x000c:
1562     case 0x0085:
1563     case 0x2028:
1564     case 0x2029:
1565 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1566     goto ANYNL03;
1567    
1568     case 0x000d:
1569     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1570     /* Fall through */
1571    
1572     ANYNL03:
1573     case 0x000a:
1574 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1575     {
1576     active_count--; /* Remove non-match possibility */
1577     next_active_state--;
1578     }
1579     if (++count >= GET2(code, 1))
1580     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1581     else
1582     { ADD_NEW_DATA(-state_offset, count, ncount); }
1583     break;
1584 ph10 231
1585 nigel 93 default:
1586     break;
1587     }
1588     }
1589     break;
1590    
1591 ph10 178 /*-----------------------------------------------------------------*/
1592     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1593     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1594     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1595     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1596     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1597     { ADD_ACTIVE(state_offset + 4, 0); }
1598     count = current_state->count; /* Number already matched */
1599     if (clen > 0)
1600     {
1601 ph10 182 BOOL OK;
1602 ph10 178 switch (c)
1603     {
1604     case 0x000a:
1605     case 0x000b:
1606     case 0x000c:
1607     case 0x000d:
1608     case 0x0085:
1609     case 0x2028:
1610     case 0x2029:
1611     OK = TRUE;
1612     break;
1613 ph10 182
1614 ph10 178 default:
1615     OK = FALSE;
1616     }
1617 ph10 182
1618 ph10 178 if (OK == (d == OP_VSPACE))
1619 ph10 182 {
1620 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1621     {
1622     active_count--; /* Remove non-match possibility */
1623     next_active_state--;
1624     }
1625     if (++count >= GET2(code, 1))
1626     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1627     else
1628     { ADD_NEW_DATA(-state_offset, count, 0); }
1629     }
1630     }
1631     break;
1632    
1633     /*-----------------------------------------------------------------*/
1634     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1635     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1636     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1637     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1638     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1639     { ADD_ACTIVE(state_offset + 4, 0); }
1640     count = current_state->count; /* Number already matched */
1641     if (clen > 0)
1642     {
1643 ph10 182 BOOL OK;
1644 ph10 178 switch (c)
1645     {
1646     case 0x09: /* HT */
1647     case 0x20: /* SPACE */
1648     case 0xa0: /* NBSP */
1649     case 0x1680: /* OGHAM SPACE MARK */
1650     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1651     case 0x2000: /* EN QUAD */
1652     case 0x2001: /* EM QUAD */
1653     case 0x2002: /* EN SPACE */
1654     case 0x2003: /* EM SPACE */
1655     case 0x2004: /* THREE-PER-EM SPACE */
1656     case 0x2005: /* FOUR-PER-EM SPACE */
1657     case 0x2006: /* SIX-PER-EM SPACE */
1658     case 0x2007: /* FIGURE SPACE */
1659     case 0x2008: /* PUNCTUATION SPACE */
1660     case 0x2009: /* THIN SPACE */
1661     case 0x200A: /* HAIR SPACE */
1662     case 0x202f: /* NARROW NO-BREAK SPACE */
1663     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1664     case 0x3000: /* IDEOGRAPHIC SPACE */
1665     OK = TRUE;
1666     break;
1667 ph10 182
1668 ph10 178 default:
1669     OK = FALSE;
1670     break;
1671     }
1672 ph10 182
1673 ph10 178 if (OK == (d == OP_HSPACE))
1674 ph10 182 {
1675 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1676     {
1677     active_count--; /* Remove non-match possibility */
1678     next_active_state--;
1679     }
1680     if (++count >= GET2(code, 1))
1681     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1682     else
1683     { ADD_NEW_DATA(-state_offset, count, 0); }
1684     }
1685     }
1686     break;
1687    
1688 nigel 77 /* ========================================================================== */
1689     /* These opcodes are followed by a character that is usually compared
1690     to the current subject character; it is loaded into d. We still get
1691     here even if there is no subject character, because in some cases zero
1692     repetitions are permitted. */
1693    
1694     /*-----------------------------------------------------------------*/
1695     case OP_CHAR:
1696     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1697     break;
1698    
1699     /*-----------------------------------------------------------------*/
1700     case OP_CHARNC:
1701     if (clen == 0) break;
1702    
1703     #ifdef SUPPORT_UTF8
1704     if (utf8)
1705     {
1706     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1707     {
1708 nigel 93 unsigned int othercase;
1709 nigel 77 if (c < 128) othercase = fcc[c]; else
1710    
1711     /* If we have Unicode property support, we can use it to test the
1712 nigel 87 other case of the character. */
1713 nigel 77
1714     #ifdef SUPPORT_UCP
1715 nigel 87 othercase = _pcre_ucp_othercase(c);
1716     #else
1717 nigel 93 othercase = NOTACHAR;
1718 nigel 77 #endif
1719    
1720     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1721     }
1722     }
1723     else
1724     #endif /* SUPPORT_UTF8 */
1725    
1726     /* Non-UTF-8 mode */
1727     {
1728     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1729     }
1730     break;
1731    
1732    
1733     #ifdef SUPPORT_UCP
1734     /*-----------------------------------------------------------------*/
1735     /* This is a tricky one because it can match more than one character.
1736     Find out how many characters to skip, and then set up a negative state
1737     to wait for them to pass before continuing. */
1738    
1739     case OP_EXTUNI:
1740 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1741 nigel 77 {
1742     const uschar *nptr = ptr + clen;
1743     int ncount = 0;
1744     while (nptr < end_subject)
1745     {
1746     int nclen = 1;
1747     GETCHARLEN(c, nptr, nclen);
1748 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1749 nigel 77 ncount++;
1750     nptr += nclen;
1751     }
1752     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1753     }
1754     break;
1755     #endif
1756    
1757     /*-----------------------------------------------------------------*/
1758 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
1759     character (when CR is followed by LF). In this case, set up a negative
1760     state to wait for one character to pass before continuing. */
1761    
1762     case OP_ANYNL:
1763     if (clen > 0) switch(c)
1764     {
1765     case 0x000b:
1766     case 0x000c:
1767     case 0x0085:
1768     case 0x2028:
1769     case 0x2029:
1770 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1771    
1772     case 0x000a:
1773 nigel 93 ADD_NEW(state_offset + 1, 0);
1774     break;
1775 ph10 231
1776 nigel 93 case 0x000d:
1777     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1778     {
1779     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1780     }
1781     else
1782     {
1783     ADD_NEW(state_offset + 1, 0);
1784     }
1785     break;
1786     }
1787     break;
1788    
1789     /*-----------------------------------------------------------------*/
1790 ph10 178 case OP_NOT_VSPACE:
1791     if (clen > 0) switch(c)
1792     {
1793     case 0x000a:
1794     case 0x000b:
1795     case 0x000c:
1796     case 0x000d:
1797     case 0x0085:
1798     case 0x2028:
1799     case 0x2029:
1800     break;
1801 ph10 182
1802     default:
1803 ph10 178 ADD_NEW(state_offset + 1, 0);
1804     break;
1805     }
1806     break;
1807    
1808     /*-----------------------------------------------------------------*/
1809     case OP_VSPACE:
1810     if (clen > 0) switch(c)
1811     {
1812     case 0x000a:
1813     case 0x000b:
1814     case 0x000c:
1815     case 0x000d:
1816     case 0x0085:
1817     case 0x2028:
1818     case 0x2029:
1819     ADD_NEW(state_offset + 1, 0);
1820     break;
1821 ph10 182
1822 ph10 178 default: break;
1823     }
1824     break;
1825    
1826     /*-----------------------------------------------------------------*/
1827     case OP_NOT_HSPACE:
1828     if (clen > 0) switch(c)
1829     {
1830     case 0x09: /* HT */
1831     case 0x20: /* SPACE */
1832     case 0xa0: /* NBSP */
1833     case 0x1680: /* OGHAM SPACE MARK */
1834     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1835     case 0x2000: /* EN QUAD */
1836     case 0x2001: /* EM QUAD */
1837     case 0x2002: /* EN SPACE */
1838     case 0x2003: /* EM SPACE */
1839     case 0x2004: /* THREE-PER-EM SPACE */
1840     case 0x2005: /* FOUR-PER-EM SPACE */
1841     case 0x2006: /* SIX-PER-EM SPACE */
1842     case 0x2007: /* FIGURE SPACE */
1843     case 0x2008: /* PUNCTUATION SPACE */
1844     case 0x2009: /* THIN SPACE */
1845     case 0x200A: /* HAIR SPACE */
1846     case 0x202f: /* NARROW NO-BREAK SPACE */
1847     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1848     case 0x3000: /* IDEOGRAPHIC SPACE */
1849     break;
1850 ph10 182
1851     default:
1852 ph10 178 ADD_NEW(state_offset + 1, 0);
1853     break;
1854     }
1855     break;
1856    
1857     /*-----------------------------------------------------------------*/
1858     case OP_HSPACE:
1859     if (clen > 0) switch(c)
1860     {
1861     case 0x09: /* HT */
1862     case 0x20: /* SPACE */
1863     case 0xa0: /* NBSP */
1864     case 0x1680: /* OGHAM SPACE MARK */
1865     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1866     case 0x2000: /* EN QUAD */
1867     case 0x2001: /* EM QUAD */
1868     case 0x2002: /* EN SPACE */
1869     case 0x2003: /* EM SPACE */
1870     case 0x2004: /* THREE-PER-EM SPACE */
1871     case 0x2005: /* FOUR-PER-EM SPACE */
1872     case 0x2006: /* SIX-PER-EM SPACE */
1873     case 0x2007: /* FIGURE SPACE */
1874     case 0x2008: /* PUNCTUATION SPACE */
1875     case 0x2009: /* THIN SPACE */
1876     case 0x200A: /* HAIR SPACE */
1877     case 0x202f: /* NARROW NO-BREAK SPACE */
1878     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1879     case 0x3000: /* IDEOGRAPHIC SPACE */
1880     ADD_NEW(state_offset + 1, 0);
1881     break;
1882     }
1883     break;
1884    
1885     /*-----------------------------------------------------------------*/
1886 nigel 77 /* Match a negated single character. This is only used for one-byte
1887     characters, that is, we know that d < 256. The character we are
1888     checking (c) can be multibyte. */
1889    
1890     case OP_NOT:
1891     if (clen > 0)
1892     {
1893 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1894 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1895     }
1896     break;
1897    
1898     /*-----------------------------------------------------------------*/
1899     case OP_PLUS:
1900     case OP_MINPLUS:
1901 nigel 93 case OP_POSPLUS:
1902 nigel 77 case OP_NOTPLUS:
1903     case OP_NOTMINPLUS:
1904 nigel 93 case OP_NOTPOSPLUS:
1905 nigel 77 count = current_state->count; /* Already matched */
1906     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1907     if (clen > 0)
1908     {
1909 nigel 93 unsigned int otherd = NOTACHAR;
1910 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1911     {
1912     #ifdef SUPPORT_UTF8
1913 nigel 87 if (utf8 && d >= 128)
1914 nigel 77 {
1915     #ifdef SUPPORT_UCP
1916 nigel 87 otherd = _pcre_ucp_othercase(d);
1917 nigel 77 #endif /* SUPPORT_UCP */
1918     }
1919     else
1920     #endif /* SUPPORT_UTF8 */
1921     otherd = fcc[d];
1922     }
1923     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1924 nigel 93 {
1925     if (count > 0 &&
1926     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1927     {
1928     active_count--; /* Remove non-match possibility */
1929     next_active_state--;
1930     }
1931     count++;
1932     ADD_NEW(state_offset, count);
1933     }
1934 nigel 77 }
1935     break;
1936    
1937     /*-----------------------------------------------------------------*/
1938     case OP_QUERY:
1939     case OP_MINQUERY:
1940 nigel 93 case OP_POSQUERY:
1941 nigel 77 case OP_NOTQUERY:
1942     case OP_NOTMINQUERY:
1943 nigel 93 case OP_NOTPOSQUERY:
1944 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1945     if (clen > 0)
1946     {
1947 nigel 93 unsigned int otherd = NOTACHAR;
1948 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1949 nigel 77 {
1950     #ifdef SUPPORT_UTF8
1951 nigel 87 if (utf8 && d >= 128)
1952 nigel 77 {
1953     #ifdef SUPPORT_UCP
1954 nigel 87 otherd = _pcre_ucp_othercase(d);
1955 nigel 77 #endif /* SUPPORT_UCP */
1956     }
1957     else
1958     #endif /* SUPPORT_UTF8 */
1959     otherd = fcc[d];
1960     }
1961     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1962 nigel 93 {
1963     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1964     {
1965     active_count--; /* Remove non-match possibility */
1966     next_active_state--;
1967     }
1968     ADD_NEW(state_offset + dlen + 1, 0);
1969     }
1970 nigel 77 }
1971     break;
1972    
1973     /*-----------------------------------------------------------------*/
1974     case OP_STAR:
1975     case OP_MINSTAR:
1976 nigel 93 case OP_POSSTAR:
1977 nigel 77 case OP_NOTSTAR:
1978     case OP_NOTMINSTAR:
1979 nigel 93 case OP_NOTPOSSTAR:
1980 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1981     if (clen > 0)
1982     {
1983 nigel 93 unsigned int otherd = NOTACHAR;
1984 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1985 nigel 77 {
1986     #ifdef SUPPORT_UTF8
1987 nigel 87 if (utf8 && d >= 128)
1988 nigel 77 {
1989     #ifdef SUPPORT_UCP
1990 nigel 87 otherd = _pcre_ucp_othercase(d);
1991 nigel 77 #endif /* SUPPORT_UCP */
1992     }
1993     else
1994     #endif /* SUPPORT_UTF8 */
1995     otherd = fcc[d];
1996     }
1997     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1998 nigel 93 {
1999     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2000     {
2001     active_count--; /* Remove non-match possibility */
2002     next_active_state--;
2003     }
2004     ADD_NEW(state_offset, 0);
2005     }
2006 nigel 77 }
2007     break;
2008    
2009     /*-----------------------------------------------------------------*/
2010     case OP_EXACT:
2011 nigel 93 case OP_NOTEXACT:
2012     count = current_state->count; /* Number already matched */
2013     if (clen > 0)
2014     {
2015     unsigned int otherd = NOTACHAR;
2016     if ((ims & PCRE_CASELESS) != 0)
2017     {
2018     #ifdef SUPPORT_UTF8
2019     if (utf8 && d >= 128)
2020     {
2021     #ifdef SUPPORT_UCP
2022     otherd = _pcre_ucp_othercase(d);
2023     #endif /* SUPPORT_UCP */
2024     }
2025     else
2026     #endif /* SUPPORT_UTF8 */
2027     otherd = fcc[d];
2028     }
2029     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2030     {
2031     if (++count >= GET2(code, 1))
2032     { ADD_NEW(state_offset + dlen + 3, 0); }
2033     else
2034     { ADD_NEW(state_offset, count); }
2035     }
2036     }
2037     break;
2038    
2039     /*-----------------------------------------------------------------*/
2040 nigel 77 case OP_UPTO:
2041     case OP_MINUPTO:
2042 nigel 93 case OP_POSUPTO:
2043 nigel 77 case OP_NOTUPTO:
2044     case OP_NOTMINUPTO:
2045 nigel 93 case OP_NOTPOSUPTO:
2046     ADD_ACTIVE(state_offset + dlen + 3, 0);
2047 nigel 77 count = current_state->count; /* Number already matched */
2048     if (clen > 0)
2049     {
2050 nigel 93 unsigned int otherd = NOTACHAR;
2051 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2052     {
2053     #ifdef SUPPORT_UTF8
2054 nigel 87 if (utf8 && d >= 128)
2055 nigel 77 {
2056     #ifdef SUPPORT_UCP
2057 nigel 87 otherd = _pcre_ucp_othercase(d);
2058 nigel 77 #endif /* SUPPORT_UCP */
2059     }
2060     else
2061     #endif /* SUPPORT_UTF8 */
2062     otherd = fcc[d];
2063     }
2064     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2065     {
2066 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2067     {
2068     active_count--; /* Remove non-match possibility */
2069     next_active_state--;
2070     }
2071 nigel 77 if (++count >= GET2(code, 1))
2072     { ADD_NEW(state_offset + dlen + 3, 0); }
2073     else
2074     { ADD_NEW(state_offset, count); }
2075     }
2076     }
2077     break;
2078    
2079    
2080     /* ========================================================================== */
2081     /* These are the class-handling opcodes */
2082    
2083     case OP_CLASS:
2084     case OP_NCLASS:
2085     case OP_XCLASS:
2086     {
2087     BOOL isinclass = FALSE;
2088     int next_state_offset;
2089     const uschar *ecode;
2090    
2091     /* For a simple class, there is always just a 32-byte table, and we
2092     can set isinclass from it. */
2093    
2094     if (codevalue != OP_XCLASS)
2095     {
2096     ecode = code + 33;
2097     if (clen > 0)
2098     {
2099     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2100     ((code[1 + c/8] & (1 << (c&7))) != 0);
2101     }
2102     }
2103    
2104     /* An extended class may have a table or a list of single characters,
2105     ranges, or both, and it may be positive or negative. There's a
2106     function that sorts all this out. */
2107    
2108     else
2109     {
2110     ecode = code + GET(code, 1);
2111     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2112     }
2113    
2114     /* At this point, isinclass is set for all kinds of class, and ecode
2115     points to the byte after the end of the class. If there is a
2116     quantifier, this is where it will be. */
2117    
2118     next_state_offset = ecode - start_code;
2119    
2120     switch (*ecode)
2121     {
2122     case OP_CRSTAR:
2123     case OP_CRMINSTAR:
2124     ADD_ACTIVE(next_state_offset + 1, 0);
2125     if (isinclass) { ADD_NEW(state_offset, 0); }
2126     break;
2127    
2128     case OP_CRPLUS:
2129     case OP_CRMINPLUS:
2130     count = current_state->count; /* Already matched */
2131     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2132     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2133     break;
2134    
2135     case OP_CRQUERY:
2136     case OP_CRMINQUERY:
2137     ADD_ACTIVE(next_state_offset + 1, 0);
2138     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2139     break;
2140    
2141     case OP_CRRANGE:
2142     case OP_CRMINRANGE:
2143     count = current_state->count; /* Already matched */
2144     if (count >= GET2(ecode, 1))
2145     { ADD_ACTIVE(next_state_offset + 5, 0); }
2146     if (isinclass)
2147     {
2148 nigel 91 int max = GET2(ecode, 3);
2149     if (++count >= max && max != 0) /* Max 0 => no limit */
2150 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2151     else
2152     { ADD_NEW(state_offset, count); }
2153     }
2154     break;
2155    
2156     default:
2157     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2158     break;
2159     }
2160     }
2161     break;
2162    
2163     /* ========================================================================== */
2164     /* These are the opcodes for fancy brackets of various kinds. We have
2165     to use recursion in order to handle them. */
2166    
2167     case OP_ASSERT:
2168     case OP_ASSERT_NOT:
2169     case OP_ASSERTBACK:
2170     case OP_ASSERTBACK_NOT:
2171     {
2172     int rc;
2173     int local_offsets[2];
2174     int local_workspace[1000];
2175     const uschar *endasscode = code + GET(code, 1);
2176    
2177     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2178    
2179     rc = internal_dfa_exec(
2180     md, /* static match data */
2181     code, /* this subexpression's code */
2182     ptr, /* where we currently are */
2183     ptr - start_subject, /* start offset */
2184     local_offsets, /* offset vector */
2185     sizeof(local_offsets)/sizeof(int), /* size of same */
2186     local_workspace, /* workspace vector */
2187     sizeof(local_workspace)/sizeof(int), /* size of same */
2188     ims, /* the current ims flags */
2189     rlevel, /* function recursion level */
2190     recursing); /* pass on regex recursion */
2191    
2192     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2193     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2194     }
2195     break;
2196    
2197     /*-----------------------------------------------------------------*/
2198     case OP_COND:
2199 nigel 93 case OP_SCOND:
2200 nigel 77 {
2201     int local_offsets[1000];
2202     int local_workspace[1000];
2203     int condcode = code[LINK_SIZE+1];
2204    
2205 nigel 93 /* Back reference conditions are not supported */
2206 nigel 77
2207 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2208    
2209     /* The DEFINE condition is always false */
2210    
2211     if (condcode == OP_DEF)
2212 nigel 77 {
2213 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2214     }
2215    
2216     /* The only supported version of OP_RREF is for the value RREF_ANY,
2217     which means "test if in any recursion". We can't test for specifically
2218     recursed groups. */
2219    
2220     else if (condcode == OP_RREF)
2221     {
2222 nigel 77 int value = GET2(code, LINK_SIZE+2);
2223 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2224 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2225     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2226     }
2227    
2228     /* Otherwise, the condition is an assertion */
2229    
2230     else
2231     {
2232     int rc;
2233     const uschar *asscode = code + LINK_SIZE + 1;
2234     const uschar *endasscode = asscode + GET(asscode, 1);
2235    
2236     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2237    
2238     rc = internal_dfa_exec(
2239     md, /* fixed match data */
2240     asscode, /* this subexpression's code */
2241     ptr, /* where we currently are */
2242     ptr - start_subject, /* start offset */
2243     local_offsets, /* offset vector */
2244     sizeof(local_offsets)/sizeof(int), /* size of same */
2245     local_workspace, /* workspace vector */
2246     sizeof(local_workspace)/sizeof(int), /* size of same */
2247     ims, /* the current ims flags */
2248     rlevel, /* function recursion level */
2249     recursing); /* pass on regex recursion */
2250    
2251     if ((rc >= 0) ==
2252     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2253     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2254     else
2255     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2256     }
2257     }
2258     break;
2259    
2260     /*-----------------------------------------------------------------*/
2261     case OP_RECURSE:
2262     {
2263     int local_offsets[1000];
2264     int local_workspace[1000];
2265     int rc;
2266    
2267     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2268     recursing + 1));
2269    
2270     rc = internal_dfa_exec(
2271     md, /* fixed match data */
2272     start_code + GET(code, 1), /* this subexpression's code */
2273     ptr, /* where we currently are */
2274     ptr - start_subject, /* start offset */
2275     local_offsets, /* offset vector */
2276     sizeof(local_offsets)/sizeof(int), /* size of same */
2277     local_workspace, /* workspace vector */
2278     sizeof(local_workspace)/sizeof(int), /* size of same */
2279     ims, /* the current ims flags */
2280     rlevel, /* function recursion level */
2281     recursing + 1); /* regex recurse level */
2282    
2283     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2284     recursing + 1, rc));
2285    
2286     /* Ran out of internal offsets */
2287    
2288     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2289    
2290     /* For each successful matched substring, set up the next state with a
2291     count of characters to skip before trying it. Note that the count is in
2292     characters, not bytes. */
2293    
2294     if (rc > 0)
2295     {
2296     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2297     {
2298     const uschar *p = start_subject + local_offsets[rc];
2299     const uschar *pp = start_subject + local_offsets[rc+1];
2300     int charcount = local_offsets[rc+1] - local_offsets[rc];
2301     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2302     if (charcount > 0)
2303     {
2304     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2305     }
2306     else
2307     {
2308     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2309     }
2310     }
2311     }
2312     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2313     }
2314     break;
2315    
2316     /*-----------------------------------------------------------------*/
2317     case OP_ONCE:
2318     {
2319     int local_offsets[2];
2320     int local_workspace[1000];
2321    
2322     int rc = internal_dfa_exec(
2323     md, /* fixed match data */
2324     code, /* this subexpression's code */
2325     ptr, /* where we currently are */
2326     ptr - start_subject, /* start offset */
2327     local_offsets, /* offset vector */
2328     sizeof(local_offsets)/sizeof(int), /* size of same */
2329     local_workspace, /* workspace vector */
2330     sizeof(local_workspace)/sizeof(int), /* size of same */
2331     ims, /* the current ims flags */
2332     rlevel, /* function recursion level */
2333     recursing); /* pass on regex recursion */
2334    
2335     if (rc >= 0)
2336     {
2337     const uschar *end_subpattern = code;
2338     int charcount = local_offsets[1] - local_offsets[0];
2339     int next_state_offset, repeat_state_offset;
2340    
2341     do { end_subpattern += GET(end_subpattern, 1); }
2342     while (*end_subpattern == OP_ALT);
2343     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2344    
2345     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2346     arrange for the repeat state also to be added to the relevant list.
2347     Calculate the offset, or set -1 for no repeat. */
2348    
2349     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2350     *end_subpattern == OP_KETRMIN)?
2351     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2352    
2353     /* If we have matched an empty string, add the next state at the
2354     current character pointer. This is important so that the duplicate
2355     checking kicks in, which is what breaks infinite loops that match an
2356     empty string. */
2357    
2358     if (charcount == 0)
2359     {
2360     ADD_ACTIVE(next_state_offset, 0);
2361     }
2362    
2363     /* Optimization: if there are no more active states, and there
2364     are no new states yet set up, then skip over the subject string
2365     right here, to save looping. Otherwise, set up the new state to swing
2366     into action when the end of the substring is reached. */
2367    
2368     else if (i + 1 >= active_count && new_count == 0)
2369     {
2370     ptr += charcount;
2371     clen = 0;
2372     ADD_NEW(next_state_offset, 0);
2373    
2374     /* If we are adding a repeat state at the new character position,
2375     we must fudge things so that it is the only current state.
2376     Otherwise, it might be a duplicate of one we processed before, and
2377     that would cause it to be skipped. */
2378    
2379     if (repeat_state_offset >= 0)
2380     {
2381     next_active_state = active_states;
2382     active_count = 0;
2383     i = -1;
2384     ADD_ACTIVE(repeat_state_offset, 0);
2385     }
2386     }
2387     else
2388     {
2389     const uschar *p = start_subject + local_offsets[0];
2390     const uschar *pp = start_subject + local_offsets[1];
2391     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2392     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2393     if (repeat_state_offset >= 0)
2394     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2395     }
2396    
2397     }
2398     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2399     }
2400     break;
2401    
2402    
2403     /* ========================================================================== */
2404     /* Handle callouts */
2405    
2406     case OP_CALLOUT:
2407     if (pcre_callout != NULL)
2408     {
2409     int rrc;
2410     pcre_callout_block cb;
2411     cb.version = 1; /* Version 1 of the callout block */
2412     cb.callout_number = code[1];
2413     cb.offset_vector = offsets;
2414 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2415 nigel 77 cb.subject_length = end_subject - start_subject;
2416     cb.start_match = current_subject - start_subject;
2417     cb.current_position = ptr - start_subject;
2418     cb.pattern_position = GET(code, 2);
2419     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2420     cb.capture_top = 1;
2421     cb.capture_last = -1;
2422     cb.callout_data = md->callout_data;
2423     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2424     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2425     }
2426     break;
2427    
2428    
2429     /* ========================================================================== */
2430     default: /* Unsupported opcode */
2431     return PCRE_ERROR_DFA_UITEM;
2432     }
2433    
2434     NEXT_ACTIVE_STATE: continue;
2435    
2436     } /* End of loop scanning active states */
2437    
2438     /* We have finished the processing at the current subject character. If no
2439     new states have been set for the next character, we have found all the
2440     matches that we are going to find. If we are at the top level and partial
2441     matching has been requested, check for appropriate conditions. */
2442    
2443     if (new_count <= 0)
2444     {
2445     if (match_count < 0 && /* No matches found */
2446     rlevel == 1 && /* Top level match function */
2447     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2448     ptr >= end_subject && /* Reached end of subject */
2449     ptr > current_subject) /* Matched non-empty string */
2450     {
2451     if (offsetcount >= 2)
2452     {
2453     offsets[0] = current_subject - start_subject;
2454     offsets[1] = end_subject - start_subject;
2455     }
2456     match_count = PCRE_ERROR_PARTIAL;
2457     }
2458    
2459     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2460     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2461     rlevel*2-2, SP));
2462 nigel 91 break; /* In effect, "return", but see the comment below */
2463 nigel 77 }
2464    
2465     /* One or more states are active for the next character. */
2466    
2467     ptr += clen; /* Advance to next subject character */
2468     } /* Loop to move along the subject string */
2469    
2470 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2471     if we use "return" above, we have compiler trouble. Some compilers warn if
2472     there's nothing here because they think the function doesn't return a value. On
2473     the other hand, if we put a dummy statement here, some more clever compilers
2474     complain that it can't be reached. Sigh. */
2475 nigel 77
2476 nigel 91 return match_count;
2477 nigel 77 }
2478    
2479    
2480    
2481    
2482     /*************************************************
2483     * Execute a Regular Expression - DFA engine *
2484     *************************************************/
2485    
2486     /* This external function applies a compiled re to a subject string using a DFA
2487     engine. This function calls the internal function multiple times if the pattern
2488     is not anchored.
2489    
2490     Arguments:
2491     argument_re points to the compiled expression
2492 ph10 97 extra_data points to extra data or is NULL
2493 nigel 77 subject points to the subject string
2494     length length of subject string (may contain binary zeros)
2495     start_offset where to start in the subject string
2496     options option bits
2497     offsets vector of match offsets
2498     offsetcount size of same
2499     workspace workspace vector
2500     wscount size of same
2501    
2502     Returns: > 0 => number of match offset pairs placed in offsets
2503     = 0 => offsets overflowed; longest matches are present
2504     -1 => failed to match
2505     < -1 => some kind of unexpected problem
2506     */
2507    
2508 ph10 145 PCRE_EXP_DEFN int
2509 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2510     const char *subject, int length, int start_offset, int options, int *offsets,
2511     int offsetcount, int *workspace, int wscount)
2512     {
2513     real_pcre *re = (real_pcre *)argument_re;
2514     dfa_match_data match_block;
2515 nigel 91 dfa_match_data *md = &match_block;
2516 nigel 77 BOOL utf8, anchored, startline, firstline;
2517     const uschar *current_subject, *end_subject, *lcc;
2518    
2519     pcre_study_data internal_study;
2520     const pcre_study_data *study = NULL;
2521     real_pcre internal_re;
2522    
2523     const uschar *req_byte_ptr;
2524     const uschar *start_bits = NULL;
2525     BOOL first_byte_caseless = FALSE;
2526     BOOL req_byte_caseless = FALSE;
2527     int first_byte = -1;
2528     int req_byte = -1;
2529     int req_byte2 = -1;
2530 nigel 91 int newline;
2531 nigel 77
2532     /* Plausibility checks */
2533    
2534     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2535     if (re == NULL || subject == NULL || workspace == NULL ||
2536     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2537     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2538     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2539    
2540     /* We need to find the pointer to any study data before we test for byte
2541     flipping, so we scan the extra_data block first. This may set two fields in the
2542     match block, so we must initialize them beforehand. However, the other fields
2543     in the match block must not be set until after the byte flipping. */
2544    
2545 nigel 91 md->tables = re->tables;
2546     md->callout_data = NULL;
2547 nigel 77
2548     if (extra_data != NULL)
2549     {
2550     unsigned int flags = extra_data->flags;
2551     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2552     study = (const pcre_study_data *)extra_data->study_data;
2553     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2554 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2555     return PCRE_ERROR_DFA_UMLIMIT;
2556 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2557 nigel 91 md->callout_data = extra_data->callout_data;
2558 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2559 nigel 91 md->tables = extra_data->tables;
2560 nigel 77 }
2561    
2562     /* Check that the first field in the block is the magic number. If it is not,
2563     test for a regex that was compiled on a host of opposite endianness. If this is
2564     the case, flipped values are put in internal_re and internal_study if there was
2565     study data too. */
2566    
2567     if (re->magic_number != MAGIC_NUMBER)
2568     {
2569     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2570     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2571     if (study != NULL) study = &internal_study;
2572     }
2573    
2574     /* Set some local values */
2575    
2576     current_subject = (const unsigned char *)subject + start_offset;
2577     end_subject = (const unsigned char *)subject + length;
2578     req_byte_ptr = current_subject - 1;
2579    
2580 nigel 91 #ifdef SUPPORT_UTF8
2581 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2582 nigel 91 #else
2583     utf8 = FALSE;
2584     #endif
2585 nigel 77
2586 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2587     (re->options & PCRE_ANCHORED) != 0;
2588    
2589 nigel 77 /* The remaining fixed data for passing around. */
2590    
2591 nigel 91 md->start_code = (const uschar *)argument_re +
2592 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2593 nigel 91 md->start_subject = (const unsigned char *)subject;
2594     md->end_subject = end_subject;
2595     md->moptions = options;
2596     md->poptions = re->options;
2597 nigel 77
2598 ph10 231 /* If the BSR option is not set at match time, copy what was set
2599     at compile time. */
2600    
2601     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2602     {
2603     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2604     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2605     #ifdef BSR_ANYCRLF
2606     else md->moptions |= PCRE_BSR_ANYCRLF;
2607     #endif
2608     }
2609    
2610 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2611     nothing is set at run time, whatever was used at compile time applies. */
2612 nigel 91
2613 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2614 nigel 93 PCRE_NEWLINE_BITS)
2615 nigel 91 {
2616 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2617 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2618     case PCRE_NEWLINE_LF: newline = '\n'; break;
2619     case PCRE_NEWLINE_CR+
2620     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2621 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2622 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2623 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2624 nigel 91 }
2625    
2626 ph10 149 if (newline == -2)
2627 nigel 91 {
2628 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2629     }
2630     else if (newline < 0)
2631     {
2632 nigel 93 md->nltype = NLTYPE_ANY;
2633 nigel 91 }
2634     else
2635     {
2636 nigel 93 md->nltype = NLTYPE_FIXED;
2637     if (newline > 255)
2638     {
2639     md->nllen = 2;
2640     md->nl[0] = (newline >> 8) & 255;
2641     md->nl[1] = newline & 255;
2642     }
2643     else
2644     {
2645     md->nllen = 1;
2646     md->nl[0] = newline;
2647     }
2648 nigel 91 }
2649    
2650 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2651     back the character offset. */
2652    
2653     #ifdef SUPPORT_UTF8
2654     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2655     {
2656     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2657     return PCRE_ERROR_BADUTF8;
2658     if (start_offset > 0 && start_offset < length)
2659     {
2660     int tb = ((uschar *)subject)[start_offset];
2661     if (tb > 127)
2662     {
2663     tb &= 0xc0;
2664     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2665     }
2666     }
2667     }
2668     #endif
2669    
2670     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2671     is a feature that makes it possible to save compiled regex and re-use them
2672     in other programs later. */
2673    
2674 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2675 nigel 77
2676     /* The lower casing table and the "must be at the start of a line" flag are
2677     used in a loop when finding where to start. */
2678    
2679 nigel 91 lcc = md->tables + lcc_offset;
2680 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2681 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2682    
2683     /* Set up the first character to match, if available. The first_byte value is
2684     never set for an anchored regular expression, but the anchoring may be forced
2685     at run time, so we have to test for anchoring. The first char may be unset for
2686     an unanchored pattern, of course. If there's no first char and the pattern was
2687     studied, there may be a bitmap of possible first characters. */
2688    
2689     if (!anchored)
2690     {
2691 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2692 nigel 77 {
2693     first_byte = re->first_byte & 255;
2694     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2695     first_byte = lcc[first_byte];
2696     }
2697     else
2698     {
2699     if (startline && study != NULL &&
2700     (study->options & PCRE_STUDY_MAPPED) != 0)
2701     start_bits = study->start_bits;
2702     }
2703     }
2704    
2705     /* For anchored or unanchored matches, there may be a "last known required
2706     character" set. */
2707    
2708 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2709 nigel 77 {
2710     req_byte = re->req_byte & 255;
2711     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2712 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2713 nigel 77 }
2714    
2715     /* Call the main matching function, looping for a non-anchored regex after a
2716     failed match. Unless restarting, optimize by moving to the first match
2717     character if possible, when not anchored. Then unless wanting a partial match,
2718     check for a required later character. */
2719    
2720     for (;;)
2721     {
2722     int rc;
2723    
2724     if ((options & PCRE_DFA_RESTART) == 0)
2725     {
2726     const uschar *save_end_subject = end_subject;
2727    
2728     /* Advance to a unique first char if possible. If firstline is TRUE, the
2729     start of the match is constrained to the first line of a multiline string.
2730 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2731     scanning at a newline. If the match fails at the newline, later code breaks
2732     this loop. */
2733 nigel 77
2734     if (firstline)
2735     {
2736     const uschar *t = current_subject;
2737 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2738 nigel 77 end_subject = t;
2739     }
2740    
2741     if (first_byte >= 0)
2742     {
2743     if (first_byte_caseless)
2744     while (current_subject < end_subject &&
2745     lcc[*current_subject] != first_byte)
2746     current_subject++;
2747     else
2748     while (current_subject < end_subject && *current_subject != first_byte)
2749     current_subject++;
2750     }
2751    
2752 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2753 nigel 77
2754     else if (startline)
2755     {
2756 nigel 93 if (current_subject > md->start_subject + start_offset)
2757 nigel 77 {
2758 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2759 nigel 77 current_subject++;
2760 ph10 130
2761 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2762     ANYCRLF, and we are now at a LF, advance the match position by one more
2763     character. */
2764 ph10 134
2765 ph10 130 if (current_subject[-1] == '\r' &&
2766 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2767 ph10 130 current_subject < end_subject &&
2768     *current_subject == '\n')
2769     current_subject++;
2770 nigel 77 }
2771     }
2772    
2773     /* Or to a non-unique first char after study */
2774    
2775     else if (start_bits != NULL)
2776     {
2777     while (current_subject < end_subject)
2778     {
2779     register unsigned int c = *current_subject;
2780     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2781     else break;
2782     }
2783     }
2784    
2785     /* Restore fudged end_subject */
2786    
2787     end_subject = save_end_subject;
2788     }
2789    
2790     /* If req_byte is set, we know that that character must appear in the subject
2791     for the match to succeed. If the first character is set, req_byte must be
2792     later in the subject; otherwise the test starts at the match point. This
2793     optimization can save a huge amount of work in patterns with nested unlimited
2794     repeats that aren't going to match. Writing separate code for cased/caseless
2795     versions makes it go faster, as does using an autoincrement and backing off
2796     on a match.
2797    
2798     HOWEVER: when the subject string is very, very long, searching to its end can
2799     take a long time, and give bad performance on quite ordinary patterns. This
2800     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2801     don't do this when the string is sufficiently long.
2802    
2803     ALSO: this processing is disabled when partial matching is requested.
2804     */
2805    
2806     if (req_byte >= 0 &&
2807     end_subject - current_subject < REQ_BYTE_MAX &&
2808     (options & PCRE_PARTIAL) == 0)
2809     {
2810     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2811    
2812     /* We don't need to repeat the search if we haven't yet reached the
2813     place we found it at last time. */
2814    
2815     if (p > req_byte_ptr)
2816     {
2817     if (req_byte_caseless)
2818     {
2819     while (p < end_subject)
2820     {
2821     register int pp = *p++;
2822     if (pp == req_byte || pp == req_byte2) { p--; break; }
2823     }
2824     }
2825     else
2826     {
2827     while (p < end_subject)
2828     {
2829     if (*p++ == req_byte) { p--; break; }
2830     }
2831     }
2832    
2833     /* If we can't find the required character, break the matching loop,
2834     which will cause a return of PCRE_ERROR_NOMATCH. */
2835    
2836     if (p >= end_subject) break;
2837    
2838     /* If we have found the required character, save the point where we
2839     found it, so that we don't search again next time round the loop if
2840     the start hasn't passed this character yet. */
2841    
2842     req_byte_ptr = p;
2843     }
2844     }
2845    
2846     /* OK, now we can do the business */
2847    
2848     rc = internal_dfa_exec(
2849 nigel 91 md, /* fixed match data */
2850     md->start_code, /* this subexpression's code */
2851     current_subject, /* where we currently are */
2852     start_offset, /* start offset in subject */
2853     offsets, /* offset vector */
2854     offsetcount, /* size of same */
2855     workspace, /* workspace vector */
2856     wscount, /* size of same */
2857 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2858 nigel 91 0, /* function recurse level */
2859     0); /* regex recurse level */
2860 nigel 77
2861     /* Anything other than "no match" means we are done, always; otherwise, carry
2862     on only if not anchored. */
2863    
2864     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2865    
2866     /* Advance to the next subject character unless we are at the end of a line
2867     and firstline is set. */
2868    
2869 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2870 nigel 77 current_subject++;
2871     if (utf8)
2872     {
2873     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2874     current_subject++;
2875     }
2876     if (current_subject > end_subject) break;
2877    
2878 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2879 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2880     or ANY or ANYCRLF, advance the match position by one more character. */
2881 nigel 93
2882     if (current_subject[-1] == '\r' &&
2883 ph10 226 current_subject < end_subject &&
2884     *current_subject == '\n' &&
2885 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2886 ph10 226 (md->nltype == NLTYPE_ANY ||
2887     md->nltype == NLTYPE_ANYCRLF ||
2888     md->nllen == 2))
2889 nigel 93 current_subject++;
2890    
2891     } /* "Bumpalong" loop */
2892    
2893 nigel 77 return PCRE_ERROR_NOMATCH;
2894     }
2895    
2896     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12