/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 199 - (hide annotations) (download)
Tue Jul 31 14:39:09 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 94242 byte(s)
Daniel's patch for config.h and Windows DLL declarations (not fully working).

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 ph10 199 #ifdef HAVE_CONFIG_H
48     #include <config.h>
49     #endif
50    
51 nigel 93 #define NLBLOCK md /* Block containing newline information */
52     #define PSSTART start_subject /* Field containing processed string start */
53     #define PSEND end_subject /* Field containing processed string end */
54    
55 nigel 77 #include "pcre_internal.h"
56    
57    
58     /* For use to indent debugging output */
59    
/* Passed as the string argument of "%.*s" with precision rlevel*2-2 in the
debug printing calls, so that output is indented by recursion level.
NOTE(review): the literal shown here is a single space, which would cap the
indent at one column; the original run of spaces may have been collapsed when
this listing was extracted - confirm against the repository copy. */
60     #define SP " "
61    
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
68     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
70 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
71 ph10 178 never stored, so we push them well clear of the normal opcodes. */
72 nigel 77
/* One offset per kind of "special" type argument. When an opcode at or above
OP_TYPESTAR has a type argument of \P or \p, \X, \R, \H or \h, or \V or \v,
the matcher adds the corresponding offset to its local copy of the opcode
before dispatching on it. The compiled pattern itself is not modified. */
73 ph10 178 #define OP_PROP_EXTRA 300
74     #define OP_EXTUNI_EXTRA 320
75     #define OP_ANYNL_EXTRA 340
76     #define OP_HSPACE_EXTRA 360
77     #define OP_VSPACE_EXTRA 380
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes it possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified. */
86 nigel 77
/* Indexed by opcode. Each entry is both a flag and a byte offset: 0 means the
opcode has no inline character; a non-zero value is the distance from the
opcode to the inline character (or type code), which the matcher loads into d
before dispatching on the opcode. The offset of 3 used by the upto, minupto,
exact and upto+ forms presumably skips a 2-byte repeat count - TODO confirm
against the compiler. */
87     static uschar coptable[] = {
88     0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 nigel 77 0, 0, /* Any, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133     0, 0 /* BRAZERO, BRAMINZERO */
134 nigel 77 };
135    
136     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
137     and \w */
138    
/* Mask table, indexed by opcode: the ctypes[] bit that is tested for each of
\D, \d, \S, \s, \W and \w. The six leading zeros fill the slots of the opcodes
that come before \D (End, \A, \G, \K, \B, \b). The masked value is then XORed
with the matching toptable2[] entry to give the test result. */
139     static uschar toptable1[] = {
140 ph10 168 0, 0, 0, 0, 0, 0,
141 nigel 77 ctype_digit, ctype_digit,
142     ctype_space, ctype_space,
143     ctype_word, ctype_word,
144     0 /* OP_ANY */
145     };
146    
/* XOR table paired with toptable1[]. A type test succeeds when
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0. For the negated types
(\D, \S, \W) the entry equals the mask, so the result is non-zero only when
the bit is clear in ctypes[c]; for the positive types (\d, \s, \w) it is 0, so
the result is non-zero only when the bit is set. OP_ANY has mask 0 and XOR
value 1, so this part of the test always succeeds for it. */
147     static uschar toptable2[] = {
148 ph10 168 0, 0, 0, 0, 0, 0,
149 nigel 77 ctype_digit, 0,
150     ctype_space, 0,
151     ctype_word, 0,
152     1 /* OP_ANY */
153     };
154    
155    
156     /* Structure for holding data about a particular state, which is in effect the
157     current data for an active path through the match tree. It must consist
158     entirely of ints because the working vector we are passed, and which we put
159     these structures in, is a vector of ints. */
160    
/* One entry in the active or new state list. */
161     typedef struct stateblock {
162     int offset; /* Offset to opcode from start_code; a negative value means a deferred state whose real offset is the negation */
163     int count; /* Count for repeats */
164     int ims; /* ims flag bits in force for this state */
165     int data; /* Extra data; for a negative offset, the number of characters still to be skipped */
166     } stateblock;
167    
/* Number of ints occupied by one stateblock. Used to convert the size of the
int workspace vector into the number of stateblocks each of the two state
lists can hold. */
168     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
169    
170    
171     #ifdef DEBUG
172     /*************************************************
173     * Print character string *
174     *************************************************/
175    
176     /* Character string printing function for debugging.
177    
178     Arguments:
179     p points to string
180     length number of bytes
181     f where to print
182    
183     Returns: nothing
184     */
185    
/* Debug helper: writes exactly length bytes starting at p to f. The data is
not treated as NUL-terminated. */
186     static void
187     pchars(unsigned char *p, int length, FILE *f)
188     {
189     int c;
190     while (length-- > 0)
191     {
/* Printable bytes are written unchanged; anything else is written as a
two-digit \xHH hexadecimal escape. */
192     if (isprint(c = *(p++)))
193     fprintf(f, "%c", c);
194     else
195     fprintf(f, "\\x%02x", c);
196     }
197     }
198     #endif
199    
200    
201    
202     /*************************************************
203     * Execute a Regular Expression - DFA engine *
204     *************************************************/
205    
206     /* This internal function applies a compiled pattern to a subject string,
207     starting at a given point, using a DFA engine. This function is called from the
208     external one, possibly multiple times if the pattern is not anchored. The
209     function calls itself recursively for some kinds of subpattern.
210    
211     Arguments:
212     md the match_data block with fixed information
213     this_start_code the opening bracket of this subexpression's code
214     current_subject where we currently are in the subject string
215     start_offset start offset in the subject string
216     offsets vector to contain the matching string offsets
217     offsetcount size of same
218     workspace vector of workspace
219     wscount size of same
220     ims the current ims flags
221     rlevel function call recursion level
222     recursing regex recursive call level
223    
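224     Returns: > 0 => number of matched strings recorded in offsets
225     = 0 => matched, but offsets is too small to hold every match
226     -1 => failed to match
227     < -1 => some kind of unexpected problem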
228    
229     The following macros are used for adding states to the two state vectors (one
230     for the current character, one for the following character). */
231    
/* Append a state with opcode offset x and repeat count y to the active list
(the list for the current character). The macro relies on these names being
in scope at the point of use: active_count, wscount, next_active_state, ims
and rlevel. If the list already holds wscount entries, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. The data field of the new entry is not written.
The expansion is a bare if/else, so the macro must be used as a complete
statement of its own (for example inside braces). */
232     #define ADD_ACTIVE(x,y) \
233     if (active_count++ < wscount) \
234     { \
235     next_active_state->offset = (x); \
236     next_active_state->count = (y); \
237     next_active_state->ims = ims; \
238     next_active_state++; \
239     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
240     } \
241     else return PCRE_ERROR_DFA_WSSIZE
242    
/* Append a state with opcode offset x, repeat count y and extra data z to the
active list (the list for the current character). Relies on active_count,
wscount, next_active_state, ims and rlevel being in scope. If the list already
holds wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
The expansion is a bare if/else, so the macro must be used as a complete
statement of its own. */
243     #define ADD_ACTIVE_DATA(x,y,z) \
244     if (active_count++ < wscount) \
245     { \
246     next_active_state->offset = (x); \
247     next_active_state->count = (y); \
248     next_active_state->ims = ims; \
249     next_active_state->data = (z); \
250     next_active_state++; \
251     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
252     } \
253     else return PCRE_ERROR_DFA_WSSIZE
254    
/* Append a state with opcode offset x and repeat count y to the new list (the
list for the following character). Relies on new_count, wscount,
next_new_state, ims and rlevel being in scope. If the list already holds
wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The
data field of the new entry is not written. The expansion is a bare if/else,
so the macro must be used as a complete statement of its own. */
255     #define ADD_NEW(x,y) \
256     if (new_count++ < wscount) \
257     { \
258     next_new_state->offset = (x); \
259     next_new_state->count = (y); \
260     next_new_state->ims = ims; \
261     next_new_state++; \
262     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
263     } \
264     else return PCRE_ERROR_DFA_WSSIZE
265    
/* Append a state with opcode offset x, repeat count y and extra data z to the
new list (the list for the following character). Relies on new_count, wscount,
next_new_state, ims and rlevel being in scope. If the list already holds
wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. The
expansion is a bare if/else, so the macro must be used as a complete statement
of its own. */
266     #define ADD_NEW_DATA(x,y,z) \
267     if (new_count++ < wscount) \
268     { \
269     next_new_state->offset = (x); \
270     next_new_state->count = (y); \
271     next_new_state->ims = ims; \
272     next_new_state->data = (z); \
273     next_new_state++; \
274     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
275     } \
276     else return PCRE_ERROR_DFA_WSSIZE
277    
278     /* And now, here is the code */
279    
280     static int
281     internal_dfa_exec(
282     dfa_match_data *md,
283     const uschar *this_start_code,
284     const uschar *current_subject,
285     int start_offset,
286     int *offsets,
287     int offsetcount,
288     int *workspace,
289     int wscount,
290     int ims,
291     int rlevel,
292     int recursing)
293     {
294     stateblock *active_states, *new_states, *temp_states;
295     stateblock *next_active_state, *next_new_state;
296    
297     const uschar *ctypes, *lcc, *fcc;
298     const uschar *ptr;
299 nigel 93 const uschar *end_code, *first_op;
300 nigel 77
301     int active_count, new_count, match_count;
302    
303     /* Some fields in the md block are frequently referenced, so we load them into
304     independent variables in the hope that this will perform better. */
305    
306     const uschar *start_subject = md->start_subject;
307     const uschar *end_subject = md->end_subject;
308     const uschar *start_code = md->start_code;
309    
310 nigel 87 #ifdef SUPPORT_UTF8
311 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
312 nigel 93 #else
313     BOOL utf8 = FALSE;
314 nigel 87 #endif
315 nigel 77
316     rlevel++;
317     offsetcount &= (-2);
318    
319     wscount -= 2;
320     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
321     (2 * INTS_PER_STATEBLOCK);
322    
323     DPRINTF(("\n%.*s---------------------\n"
324     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
325     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
326    
327     ctypes = md->tables + ctypes_offset;
328     lcc = md->tables + lcc_offset;
329     fcc = md->tables + fcc_offset;
330    
331     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
332    
333     active_states = (stateblock *)(workspace + 2);
334     next_new_state = new_states = active_states + wscount;
335     new_count = 0;
336    
337 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
338     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
339    
340 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
341     the alternative states onto the list, and find out where the end is. This
342     makes it possible to use this function recursively, when we want to stop at a
343     matching internal ket rather than at the end.
344    
345     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
346     a backward assertion. In that case, we have to find out the maximum amount to
347     move back, and set up each alternative appropriately. */
348    
349 nigel 93 if (*first_op == OP_REVERSE)
350 nigel 77 {
351     int max_back = 0;
352     int gone_back;
353    
354     end_code = this_start_code;
355     do
356     {
357     int back = GET(end_code, 2+LINK_SIZE);
358     if (back > max_back) max_back = back;
359     end_code += GET(end_code, 1);
360     }
361     while (*end_code == OP_ALT);
362    
363     /* If we can't go back the amount required for the longest lookbehind
364     pattern, go back as far as we can; some alternatives may still be viable. */
365    
366     #ifdef SUPPORT_UTF8
367     /* In character mode we have to step back character by character */
368    
369     if (utf8)
370     {
371     for (gone_back = 0; gone_back < max_back; gone_back++)
372     {
373     if (current_subject <= start_subject) break;
374     current_subject--;
375     while (current_subject > start_subject &&
376     (*current_subject & 0xc0) == 0x80)
377     current_subject--;
378     }
379     }
380     else
381     #endif
382    
383     /* In byte-mode we can do this quickly. */
384    
385     {
386     gone_back = (current_subject - max_back < start_subject)?
387     current_subject - start_subject : max_back;
388     current_subject -= gone_back;
389     }
390    
391     /* Now we can process the individual branches. */
392    
393     end_code = this_start_code;
394     do
395     {
396     int back = GET(end_code, 2+LINK_SIZE);
397     if (back <= gone_back)
398     {
399     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
400     ADD_NEW_DATA(-bstate, 0, gone_back - back);
401     }
402     end_code += GET(end_code, 1);
403     }
404     while (*end_code == OP_ALT);
405     }
406    
407     /* This is the code for a "normal" subpattern (not a backward assertion). The
408     start of a whole pattern is always one of these. If we are at the top level,
409     we may be asked to restart matching from the same point that we reached for a
410     previous partial match. We still have to scan through the top-level branches to
411     find the end state. */
412    
413     else
414     {
415     end_code = this_start_code;
416    
417     /* Restarting */
418    
419     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
420     {
421     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
422     new_count = workspace[1];
423     if (!workspace[0])
424     memcpy(new_states, active_states, new_count * sizeof(stateblock));
425     }
426    
427     /* Not restarting */
428    
429     else
430     {
431 nigel 93 int length = 1 + LINK_SIZE +
432     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
433 nigel 77 do
434     {
435 nigel 93 ADD_NEW(end_code - start_code + length, 0);
436 nigel 77 end_code += GET(end_code, 1);
437 nigel 93 length = 1 + LINK_SIZE;
438 nigel 77 }
439     while (*end_code == OP_ALT);
440     }
441     }
442    
443     workspace[0] = 0; /* Bit indicating which vector is current */
444    
445     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
446    
447     /* Loop for scanning the subject */
448    
449     ptr = current_subject;
450     for (;;)
451     {
452     int i, j;
453 nigel 91 int clen, dlen;
454     unsigned int c, d;
455 nigel 77
456     /* Make the new state list into the active state list and empty the
457     new state list. */
458    
459     temp_states = active_states;
460     active_states = new_states;
461     new_states = temp_states;
462     active_count = new_count;
463     new_count = 0;
464    
465     workspace[0] ^= 1; /* Remember for the restarting feature */
466     workspace[1] = active_count;
467    
468     #ifdef DEBUG
469     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
470     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
471     printf("\"\n");
472    
473     printf("%.*sActive states: ", rlevel*2-2, SP);
474     for (i = 0; i < active_count; i++)
475     printf("%d/%d ", active_states[i].offset, active_states[i].count);
476     printf("\n");
477     #endif
478    
479     /* Set the pointers for adding new states */
480    
481     next_active_state = active_states + active_count;
482     next_new_state = new_states;
483    
484     /* Load the current character from the subject outside the loop, as many
485     different states may want to look at it, and we assume that at least one
486     will. */
487    
488     if (ptr < end_subject)
489     {
490 nigel 93 clen = 1; /* Number of bytes in the character */
491 nigel 77 #ifdef SUPPORT_UTF8
492     if (utf8) { GETCHARLEN(c, ptr, clen); } else
493     #endif /* SUPPORT_UTF8 */
494     c = *ptr;
495     }
496     else
497     {
498 nigel 93 clen = 0; /* This indicates the end of the subject */
499     c = NOTACHAR; /* This value should never actually be used */
500 nigel 77 }
501    
502     /* Scan up the active states and act on each one. The result of an action
503     may be to add more states to the currently active list (e.g. on hitting a
504     parenthesis) or it may be to put states on the new list, for considering
505     when we move the character pointer on. */
506    
507     for (i = 0; i < active_count; i++)
508     {
509     stateblock *current_state = active_states + i;
510     const uschar *code;
511     int state_offset = current_state->offset;
512     int count, codevalue;
513 ph10 152 #ifdef SUPPORT_UCP
514 nigel 87 int chartype, script;
515 ph10 152 #endif
516 nigel 77
517     #ifdef DEBUG
518     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
519 nigel 93 if (clen == 0) printf("EOL\n");
520 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
521     else printf("0x%02x\n", c);
522     #endif
523    
524     /* This variable is referred to implicitly in the ADD_xxx macros. */
525    
526     ims = current_state->ims;
527    
528     /* A negative offset is a special case meaning "hold off going to this
529     (negated) state until the number of characters in the data field have
530     been skipped". */
531    
532     if (state_offset < 0)
533     {
534     if (current_state->data > 0)
535     {
536     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
537     ADD_NEW_DATA(state_offset, current_state->count,
538     current_state->data - 1);
539     continue;
540     }
541     else
542     {
543     current_state->offset = state_offset = -state_offset;
544     }
545     }
546    
547     /* Check for a duplicate state with the same count, and skip if found. */
548    
549     for (j = 0; j < i; j++)
550     {
551     if (active_states[j].offset == state_offset &&
552     active_states[j].count == current_state->count)
553     {
554     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
555     goto NEXT_ACTIVE_STATE;
556     }
557     }
558    
559     /* The state offset is the offset to the opcode */
560    
561     code = start_code + state_offset;
562     codevalue = *code;
563    
564     /* If this opcode is followed by an inline character, load it. It is
565     tempting to test for the presence of a subject character here, but that
566     is wrong, because sometimes zero repetitions of the subject are
567     permitted.
568    
569     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570 ph10 178 argument that is not a data character - but is always one byte long. We
571     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
572     this case. To keep the other cases fast, convert these ones to new opcodes.
573     */
574 nigel 77
575     if (coptable[codevalue] > 0)
576     {
577     dlen = 1;
578     #ifdef SUPPORT_UTF8
579     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
580     #endif /* SUPPORT_UTF8 */
581     d = code[coptable[codevalue]];
582     if (codevalue >= OP_TYPESTAR)
583     {
584 nigel 93 switch(d)
585     {
586     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
587     case OP_NOTPROP:
588     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591 ph10 178 case OP_NOT_HSPACE:
592 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593 ph10 178 case OP_NOT_VSPACE:
594 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595 nigel 93 default: break;
596     }
597 nigel 77 }
598     }
599     else
600     {
601     dlen = 0; /* Not strictly necessary, but compilers moan */
602 nigel 93 d = NOTACHAR; /* if these variables are not set. */
603 nigel 77 }
604    
605    
606     /* Now process the individual opcodes */
607    
608     switch (codevalue)
609     {
610    
611     /* ========================================================================== */
612     /* Reached a closing bracket. If not at the end of the pattern, carry
613     on with the next opcode. Otherwise, unless we have an empty string and
614     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
615     matches so we always have the longest first. */
616    
617     case OP_KET:
618     case OP_KETRMIN:
619     case OP_KETRMAX:
620     if (code != end_code)
621     {
622     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
623     if (codevalue != OP_KET)
624     {
625     ADD_ACTIVE(state_offset - GET(code, 1), 0);
626     }
627     }
628     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
629     {
630     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
631     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
632     match_count = 0;
633     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
634     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
635     if (offsetcount >= 2)
636     {
637     offsets[0] = current_subject - start_subject;
638     offsets[1] = ptr - start_subject;
639     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
640     offsets[1] - offsets[0], current_subject));
641     }
642     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
643     {
644     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
645     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
646     match_count, rlevel*2-2, SP));
647     return match_count;
648     }
649     }
650     break;
651    
652     /* ========================================================================== */
653     /* These opcodes add to the current list of states without looking
654     at the current character. */
655    
656     /*-----------------------------------------------------------------*/
657     case OP_ALT:
658     do { code += GET(code, 1); } while (*code == OP_ALT);
659     ADD_ACTIVE(code - start_code, 0);
660     break;
661    
662     /*-----------------------------------------------------------------*/
663     case OP_BRA:
664 nigel 93 case OP_SBRA:
665 nigel 77 do
666     {
667     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
668     code += GET(code, 1);
669     }
670     while (*code == OP_ALT);
671     break;
672    
673     /*-----------------------------------------------------------------*/
674 nigel 93 case OP_CBRA:
675     case OP_SCBRA:
676     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
677     code += GET(code, 1);
678     while (*code == OP_ALT)
679     {
680     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
681     code += GET(code, 1);
682     }
683     break;
684    
685     /*-----------------------------------------------------------------*/
686 nigel 77 case OP_BRAZERO:
687     case OP_BRAMINZERO:
688     ADD_ACTIVE(state_offset + 1, 0);
689     code += 1 + GET(code, 2);
690     while (*code == OP_ALT) code += GET(code, 1);
691     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
692     break;
693    
694     /*-----------------------------------------------------------------*/
695     case OP_CIRC:
696     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
697 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
698     ptr != end_subject &&
699 nigel 93 WAS_NEWLINE(ptr)))
700 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
701     break;
702    
703     /*-----------------------------------------------------------------*/
704     case OP_EOD:
705     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
706     break;
707    
708     /*-----------------------------------------------------------------*/
709     case OP_OPT:
710     ims = code[1];
711     ADD_ACTIVE(state_offset + 2, 0);
712     break;
713    
714     /*-----------------------------------------------------------------*/
715     case OP_SOD:
716     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
717     break;
718    
719     /*-----------------------------------------------------------------*/
720     case OP_SOM:
721     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
722     break;
723    
724    
725     /* ========================================================================== */
726     /* These opcodes inspect the next subject character, and sometimes
727     the previous one as well, but do not have an argument. The variable
728     clen contains the length of the current character and is zero if we are
729     at the end of the subject. */
730    
731     /*-----------------------------------------------------------------*/
732     case OP_ANY:
733 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
734 nigel 77 { ADD_NEW(state_offset + 1, 0); }
735     break;
736    
737     /*-----------------------------------------------------------------*/
738     case OP_EODN:
739 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
740 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
741     break;
742    
743     /*-----------------------------------------------------------------*/
744     case OP_DOLL:
745     if ((md->moptions & PCRE_NOTEOL) == 0)
746     {
747 nigel 91 if (clen == 0 ||
748 nigel 93 (IS_NEWLINE(ptr) &&
749 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
750     ))
751 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
752     }
753 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
754 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
755     break;
756    
757     /*-----------------------------------------------------------------*/
758    
759     case OP_DIGIT:
760     case OP_WHITESPACE:
761     case OP_WORDCHAR:
762     if (clen > 0 && c < 256 &&
763     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
764     { ADD_NEW(state_offset + 1, 0); }
765     break;
766    
767     /*-----------------------------------------------------------------*/
768     case OP_NOT_DIGIT:
769     case OP_NOT_WHITESPACE:
770     case OP_NOT_WORDCHAR:
771     if (clen > 0 && (c >= 256 ||
772     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
773     { ADD_NEW(state_offset + 1, 0); }
774     break;
775    
776     /*-----------------------------------------------------------------*/
777     case OP_WORD_BOUNDARY:
778     case OP_NOT_WORD_BOUNDARY:
779     {
780     int left_word, right_word;
781    
782     if (ptr > start_subject)
783     {
784     const uschar *temp = ptr - 1;
785     #ifdef SUPPORT_UTF8
786     if (utf8) BACKCHAR(temp);
787     #endif
788     GETCHARTEST(d, temp);
789     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
790     }
791     else left_word = 0;
792    
793     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
794     else right_word = 0;
795    
796     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
797     { ADD_ACTIVE(state_offset + 1, 0); }
798     }
799     break;
800    
801    
802     /*-----------------------------------------------------------------*/
803     /* Check the next character by Unicode property. We will get here only
804     if the support is in the binary; otherwise a compile-time error occurs.
805     */
806    
807 ph10 151 #ifdef SUPPORT_UCP
808 nigel 77 case OP_PROP:
809     case OP_NOTPROP:
810     if (clen > 0)
811     {
812 nigel 87 BOOL OK;
813     int category = _pcre_ucp_findprop(c, &chartype, &script);
814     switch(code[1])
815 nigel 77 {
816 nigel 87 case PT_ANY:
817     OK = TRUE;
818     break;
819    
820     case PT_LAMP:
821     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
822     break;
823    
824     case PT_GC:
825     OK = category == code[2];
826     break;
827    
828     case PT_PC:
829     OK = chartype == code[2];
830     break;
831    
832     case PT_SC:
833     OK = script == code[2];
834     break;
835    
836     /* Should never occur, but keep compilers from grumbling. */
837    
838     default:
839     OK = codevalue != OP_PROP;
840     break;
841 nigel 77 }
842 nigel 87
843     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
844 nigel 77 }
845     break;
846     #endif
847    
848    
849    
850     /* ========================================================================== */
851     /* These opcodes likewise inspect the subject character, but have an
852     argument that is not a data character. It is one of these opcodes:
853     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE, OP_WORDCHAR,
854     OP_NOT_WORDCHAR. The value is loaded into d. */
855    
856     case OP_TYPEPLUS:
857     case OP_TYPEMINPLUS:
858 nigel 93 case OP_TYPEPOSPLUS:
859 nigel 77 count = current_state->count; /* Already matched */
860     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
861     if (clen > 0)
862     {
863     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
864     (c < 256 &&
865 nigel 91 (d != OP_ANY ||
866     (ims & PCRE_DOTALL) != 0 ||
867     !IS_NEWLINE(ptr)
868     ) &&
869 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
870     {
871 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
872     {
873     active_count--; /* Remove non-match possibility */
874     next_active_state--;
875     }
876 nigel 77 count++;
877     ADD_NEW(state_offset, count);
878     }
879     }
880     break;
881    
882     /*-----------------------------------------------------------------*/
883     case OP_TYPEQUERY:
884     case OP_TYPEMINQUERY:
885 nigel 93 case OP_TYPEPOSQUERY:
886 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
887     if (clen > 0)
888     {
889     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
890     (c < 256 &&
891 nigel 91 (d != OP_ANY ||
892     (ims & PCRE_DOTALL) != 0 ||
893     !IS_NEWLINE(ptr)
894     ) &&
895 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
896     {
897 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
898     {
899     active_count--; /* Remove non-match possibility */
900     next_active_state--;
901     }
902 nigel 77 ADD_NEW(state_offset + 2, 0);
903     }
904     }
905     break;
906    
907     /*-----------------------------------------------------------------*/
908     case OP_TYPESTAR:
909     case OP_TYPEMINSTAR:
910 nigel 93 case OP_TYPEPOSSTAR:
911 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
912     if (clen > 0)
913     {
914     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
915     (c < 256 &&
916 nigel 91 (d != OP_ANY ||
917     (ims & PCRE_DOTALL) != 0 ||
918     !IS_NEWLINE(ptr)
919     ) &&
920 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
921     {
922 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
923     {
924     active_count--; /* Remove non-match possibility */
925     next_active_state--;
926     }
927 nigel 77 ADD_NEW(state_offset, 0);
928     }
929     }
930     break;
931    
932     /*-----------------------------------------------------------------*/
933     case OP_TYPEEXACT:
934 nigel 93 count = current_state->count; /* Number already matched */
935     if (clen > 0)
936     {
937     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
938     (c < 256 &&
939     (d != OP_ANY ||
940     (ims & PCRE_DOTALL) != 0 ||
941     !IS_NEWLINE(ptr)
942     ) &&
943     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944     {
945     if (++count >= GET2(code, 1))
946     { ADD_NEW(state_offset + 4, 0); }
947     else
948     { ADD_NEW(state_offset, count); }
949     }
950     }
951     break;
952    
953     /*-----------------------------------------------------------------*/
954 nigel 77 case OP_TYPEUPTO:
955     case OP_TYPEMINUPTO:
956 nigel 93 case OP_TYPEPOSUPTO:
957     ADD_ACTIVE(state_offset + 4, 0);
958 nigel 77 count = current_state->count; /* Number already matched */
959     if (clen > 0)
960     {
961     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962     (c < 256 &&
963 nigel 91 (d != OP_ANY ||
964     (ims & PCRE_DOTALL) != 0 ||
965     !IS_NEWLINE(ptr)
966     ) &&
967 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
968     {
969 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
970     {
971     active_count--; /* Remove non-match possibility */
972     next_active_state--;
973     }
974 nigel 77 if (++count >= GET2(code, 1))
975     { ADD_NEW(state_offset + 4, 0); }
976     else
977     { ADD_NEW(state_offset, count); }
978     }
979     }
980     break;
981    
982     /* ========================================================================== */
983     /* These are virtual opcodes that are used when something like
984 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_EXTUNI, OP_ANYNL, OP_VSPACE,
985     OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its argument. This keeps the
986     code above fast for the other cases. The argument is in the d variable. */
987 nigel 77
988 ph10 151 #ifdef SUPPORT_UCP
989 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
990     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
991 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
992 nigel 77 count = current_state->count; /* Already matched */
993 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
994 nigel 77 if (clen > 0)
995     {
996 nigel 87 BOOL OK;
997     int category = _pcre_ucp_findprop(c, &chartype, &script);
998     switch(code[2])
999     {
1000     case PT_ANY:
1001     OK = TRUE;
1002     break;
1003    
1004     case PT_LAMP:
1005     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1006     break;
1007    
1008     case PT_GC:
1009     OK = category == code[3];
1010     break;
1011    
1012     case PT_PC:
1013     OK = chartype == code[3];
1014     break;
1015    
1016     case PT_SC:
1017     OK = script == code[3];
1018     break;
1019    
1020     /* Should never occur, but keep compilers from grumbling. */
1021    
1022     default:
1023     OK = codevalue != OP_PROP;
1024     break;
1025     }
1026    
1027 nigel 93 if (OK == (d == OP_PROP))
1028     {
1029     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1030     {
1031     active_count--; /* Remove non-match possibility */
1032     next_active_state--;
1033     }
1034     count++;
1035     ADD_NEW(state_offset, count);
1036     }
1037 nigel 77 }
1038     break;
1039    
1040     /*-----------------------------------------------------------------*/
1041     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1042     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1043 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1044 nigel 77 count = current_state->count; /* Already matched */
1045     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1046 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1047 nigel 77 {
1048     const uschar *nptr = ptr + clen;
1049     int ncount = 0;
1050 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1051     {
1052     active_count--; /* Remove non-match possibility */
1053     next_active_state--;
1054     }
1055 nigel 77 while (nptr < end_subject)
1056     {
1057     int nd;
1058     int ndlen = 1;
1059     GETCHARLEN(nd, nptr, ndlen);
1060 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1061 nigel 77 ncount++;
1062     nptr += ndlen;
1063     }
1064     count++;
1065     ADD_NEW_DATA(-state_offset, count, ncount);
1066     }
1067     break;
1068 ph10 151 #endif
1069 nigel 77
1070     /*-----------------------------------------------------------------*/
1071 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1072     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1073     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1074     count = current_state->count; /* Already matched */
1075     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1076     if (clen > 0)
1077     {
1078     int ncount = 0;
1079     switch (c)
1080     {
1081     case 0x000d:
1082     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1083     /* Fall through */
1084     case 0x000a:
1085     case 0x000b:
1086     case 0x000c:
1087     case 0x0085:
1088     case 0x2028:
1089     case 0x2029:
1090     if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1091     {
1092     active_count--; /* Remove non-match possibility */
1093     next_active_state--;
1094     }
1095     count++;
1096     ADD_NEW_DATA(-state_offset, count, ncount);
1097     break;
1098     default:
1099     break;
1100     }
1101     }
1102     break;
1103    
1104     /*-----------------------------------------------------------------*/
1105 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1106     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1107     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1108     count = current_state->count; /* Already matched */
1109     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1110     if (clen > 0)
1111     {
1112 ph10 182 BOOL OK;
1113 ph10 178 switch (c)
1114     {
1115     case 0x000a:
1116     case 0x000b:
1117     case 0x000c:
1118     case 0x000d:
1119     case 0x0085:
1120     case 0x2028:
1121     case 0x2029:
1122     OK = TRUE;
1123 ph10 182 break;
1124 ph10 178
1125     default:
1126     OK = FALSE;
1127 ph10 182 break;
1128 ph10 178 }
1129    
1130     if (OK == (d == OP_VSPACE))
1131 ph10 182 {
1132 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1133     {
1134     active_count--; /* Remove non-match possibility */
1135     next_active_state--;
1136     }
1137     count++;
1138     ADD_NEW_DATA(-state_offset, count, 0);
1139     }
1140     }
1141     break;
1142    
1143     /*-----------------------------------------------------------------*/
1144     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1145     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1146     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1147     count = current_state->count; /* Already matched */
1148     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1149     if (clen > 0)
1150     {
1151 ph10 182 BOOL OK;
1152 ph10 178 switch (c)
1153     {
1154     case 0x09: /* HT */
1155     case 0x20: /* SPACE */
1156     case 0xa0: /* NBSP */
1157     case 0x1680: /* OGHAM SPACE MARK */
1158     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1159     case 0x2000: /* EN QUAD */
1160     case 0x2001: /* EM QUAD */
1161     case 0x2002: /* EN SPACE */
1162     case 0x2003: /* EM SPACE */
1163     case 0x2004: /* THREE-PER-EM SPACE */
1164     case 0x2005: /* FOUR-PER-EM SPACE */
1165     case 0x2006: /* SIX-PER-EM SPACE */
1166     case 0x2007: /* FIGURE SPACE */
1167     case 0x2008: /* PUNCTUATION SPACE */
1168     case 0x2009: /* THIN SPACE */
1169     case 0x200A: /* HAIR SPACE */
1170     case 0x202f: /* NARROW NO-BREAK SPACE */
1171     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1172     case 0x3000: /* IDEOGRAPHIC SPACE */
1173     OK = TRUE;
1174     break;
1175 ph10 182
1176 ph10 178 default:
1177     OK = FALSE;
1178     break;
1179     }
1180 ph10 182
1181 ph10 178 if (OK == (d == OP_HSPACE))
1182 ph10 182 {
1183 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1184     {
1185     active_count--; /* Remove non-match possibility */
1186     next_active_state--;
1187     }
1188     count++;
1189     ADD_NEW_DATA(-state_offset, count, 0);
1190     }
1191     }
1192     break;
1193    
1194     /*-----------------------------------------------------------------*/
1195 ph10 151 #ifdef SUPPORT_UCP
1196 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1197     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1198 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1199 nigel 87 count = 4;
1200 nigel 77 goto QS1;
1201    
1202     case OP_PROP_EXTRA + OP_TYPESTAR:
1203     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1204 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1205 nigel 77 count = 0;
1206    
1207     QS1:
1208    
1209 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1210 nigel 77 if (clen > 0)
1211     {
1212 nigel 87 BOOL OK;
1213     int category = _pcre_ucp_findprop(c, &chartype, &script);
1214     switch(code[2])
1215     {
1216     case PT_ANY:
1217     OK = TRUE;
1218     break;
1219    
1220     case PT_LAMP:
1221     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1222     break;
1223    
1224     case PT_GC:
1225     OK = category == code[3];
1226     break;
1227    
1228     case PT_PC:
1229     OK = chartype == code[3];
1230     break;
1231    
1232     case PT_SC:
1233     OK = script == code[3];
1234     break;
1235    
1236     /* Should never occur, but keep compilers from grumbling. */
1237    
1238     default:
1239     OK = codevalue != OP_PROP;
1240     break;
1241     }
1242    
1243 nigel 93 if (OK == (d == OP_PROP))
1244     {
1245     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1246     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1247     {
1248     active_count--; /* Remove non-match possibility */
1249     next_active_state--;
1250     }
1251     ADD_NEW(state_offset + count, 0);
1252     }
1253 nigel 77 }
1254     break;
1255    
1256     /*-----------------------------------------------------------------*/
1257     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1258     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1259 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1260 nigel 77 count = 2;
1261     goto QS2;
1262    
1263     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1264     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1265 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1266 nigel 77 count = 0;
1267    
1268     QS2:
1269    
1270     ADD_ACTIVE(state_offset + 2, 0);
1271 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1272 nigel 77 {
1273     const uschar *nptr = ptr + clen;
1274     int ncount = 0;
1275 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1276     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1277     {
1278     active_count--; /* Remove non-match possibility */
1279     next_active_state--;
1280     }
1281 nigel 77 while (nptr < end_subject)
1282     {
1283     int nd;
1284     int ndlen = 1;
1285     GETCHARLEN(nd, nptr, ndlen);
1286 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1287 nigel 77 ncount++;
1288     nptr += ndlen;
1289     }
1290     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1291     }
1292     break;
1293 ph10 151 #endif
1294 nigel 77
1295     /*-----------------------------------------------------------------*/
1296 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1297     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1298     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1299     count = 2;
1300     goto QS3;
1301    
1302     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1303     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1304     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1305     count = 0;
1306    
1307     QS3:
1308     ADD_ACTIVE(state_offset + 2, 0);
1309     if (clen > 0)
1310     {
1311     int ncount = 0;
1312     switch (c)
1313     {
1314     case 0x000d:
1315     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1316     /* Fall through */
1317     case 0x000a:
1318     case 0x000b:
1319     case 0x000c:
1320     case 0x0085:
1321     case 0x2028:
1322     case 0x2029:
1323     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1324     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1325     {
1326     active_count--; /* Remove non-match possibility */
1327     next_active_state--;
1328     }
1329     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1330     break;
1331     default:
1332     break;
1333     }
1334     }
1335     break;
1336    
1337     /*-----------------------------------------------------------------*/
1338 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1339     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1340     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1341     count = 2;
1342     goto QS4;
1343    
1344     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1345     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1346     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1347     count = 0;
1348    
1349     QS4:
1350     ADD_ACTIVE(state_offset + 2, 0);
1351     if (clen > 0)
1352     {
1353 ph10 182 BOOL OK;
1354 ph10 178 switch (c)
1355     {
1356     case 0x000a:
1357     case 0x000b:
1358     case 0x000c:
1359     case 0x000d:
1360     case 0x0085:
1361     case 0x2028:
1362     case 0x2029:
1363     OK = TRUE;
1364     break;
1365 ph10 182
1366 ph10 178 default:
1367     OK = FALSE;
1368     break;
1369     }
1370     if (OK == (d == OP_VSPACE))
1371 ph10 182 {
1372 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1373     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1374     {
1375     active_count--; /* Remove non-match possibility */
1376     next_active_state--;
1377     }
1378     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1379     }
1380     }
1381     break;
1382    
1383     /*-----------------------------------------------------------------*/
1384     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1385     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1386     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1387     count = 2;
1388     goto QS5;
1389    
1390     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1391     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1392     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1393     count = 0;
1394    
1395     QS5:
1396     ADD_ACTIVE(state_offset + 2, 0);
1397     if (clen > 0)
1398     {
1399 ph10 182 BOOL OK;
1400 ph10 178 switch (c)
1401     {
1402     case 0x09: /* HT */
1403     case 0x20: /* SPACE */
1404     case 0xa0: /* NBSP */
1405     case 0x1680: /* OGHAM SPACE MARK */
1406     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1407     case 0x2000: /* EN QUAD */
1408     case 0x2001: /* EM QUAD */
1409     case 0x2002: /* EN SPACE */
1410     case 0x2003: /* EM SPACE */
1411     case 0x2004: /* THREE-PER-EM SPACE */
1412     case 0x2005: /* FOUR-PER-EM SPACE */
1413     case 0x2006: /* SIX-PER-EM SPACE */
1414     case 0x2007: /* FIGURE SPACE */
1415     case 0x2008: /* PUNCTUATION SPACE */
1416     case 0x2009: /* THIN SPACE */
1417     case 0x200A: /* HAIR SPACE */
1418     case 0x202f: /* NARROW NO-BREAK SPACE */
1419     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1420     case 0x3000: /* IDEOGRAPHIC SPACE */
1421     OK = TRUE;
1422     break;
1423 ph10 182
1424 ph10 178 default:
1425     OK = FALSE;
1426     break;
1427     }
1428 ph10 182
1429 ph10 178 if (OK == (d == OP_HSPACE))
1430 ph10 182 {
1431 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1432     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1433     {
1434     active_count--; /* Remove non-match possibility */
1435     next_active_state--;
1436     }
1437     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1438     }
1439     }
1440     break;
1441    
1442     /*-----------------------------------------------------------------*/
1443 ph10 151 #ifdef SUPPORT_UCP
1444 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1445     case OP_PROP_EXTRA + OP_TYPEUPTO:
1446     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1447 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1448 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1449 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1450 nigel 77 count = current_state->count; /* Number already matched */
1451     if (clen > 0)
1452     {
1453 nigel 87 BOOL OK;
1454     int category = _pcre_ucp_findprop(c, &chartype, &script);
1455     switch(code[4])
1456 nigel 77 {
1457 nigel 87 case PT_ANY:
1458     OK = TRUE;
1459     break;
1460    
1461     case PT_LAMP:
1462     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1463     break;
1464    
1465     case PT_GC:
1466     OK = category == code[5];
1467     break;
1468    
1469     case PT_PC:
1470     OK = chartype == code[5];
1471     break;
1472    
1473     case PT_SC:
1474     OK = script == code[5];
1475     break;
1476    
1477     /* Should never occur, but keep compilers from grumbling. */
1478    
1479     default:
1480     OK = codevalue != OP_PROP;
1481     break;
1482     }
1483    
1484     if (OK == (d == OP_PROP))
1485     {
1486 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1487     {
1488     active_count--; /* Remove non-match possibility */
1489     next_active_state--;
1490     }
1491 nigel 77 if (++count >= GET2(code, 1))
1492 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1493 nigel 77 else
1494     { ADD_NEW(state_offset, count); }
1495     }
1496     }
1497     break;
1498    
1499     /*-----------------------------------------------------------------*/
1500     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1501     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1502     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1503 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1504 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1505     { ADD_ACTIVE(state_offset + 4, 0); }
1506     count = current_state->count; /* Number already matched */
1507 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1508 nigel 77 {
1509     const uschar *nptr = ptr + clen;
1510     int ncount = 0;
1511 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1512     {
1513     active_count--; /* Remove non-match possibility */
1514     next_active_state--;
1515     }
1516 nigel 77 while (nptr < end_subject)
1517     {
1518     int nd;
1519     int ndlen = 1;
1520     GETCHARLEN(nd, nptr, ndlen);
1521 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1522 nigel 77 ncount++;
1523     nptr += ndlen;
1524     }
1525     if (++count >= GET2(code, 1))
1526     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1527     else
1528     { ADD_NEW_DATA(-state_offset, count, ncount); }
1529     }
1530     break;
1531 ph10 151 #endif
1532 nigel 77
1533 nigel 93 /*-----------------------------------------------------------------*/
1534     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1535     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1536     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1537     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1538     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1539     { ADD_ACTIVE(state_offset + 4, 0); }
1540     count = current_state->count; /* Number already matched */
1541     if (clen > 0)
1542     {
1543     int ncount = 0;
1544     switch (c)
1545     {
1546     case 0x000d:
1547     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1548     /* Fall through */
1549     case 0x000a:
1550     case 0x000b:
1551     case 0x000c:
1552     case 0x0085:
1553     case 0x2028:
1554     case 0x2029:
1555     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1556     {
1557     active_count--; /* Remove non-match possibility */
1558     next_active_state--;
1559     }
1560     if (++count >= GET2(code, 1))
1561     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1562     else
1563     { ADD_NEW_DATA(-state_offset, count, ncount); }
1564     break;
1565     default:
1566     break;
1567     }
1568     }
1569     break;
1570    
1571 ph10 178 /*-----------------------------------------------------------------*/
1572     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1573     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1574     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1575     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1576     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1577     { ADD_ACTIVE(state_offset + 4, 0); }
1578     count = current_state->count; /* Number already matched */
1579     if (clen > 0)
1580     {
1581 ph10 182 BOOL OK;
1582 ph10 178 switch (c)
1583     {
1584     case 0x000a:
1585     case 0x000b:
1586     case 0x000c:
1587     case 0x000d:
1588     case 0x0085:
1589     case 0x2028:
1590     case 0x2029:
1591     OK = TRUE;
1592     break;
1593 ph10 182
1594 ph10 178 default:
1595     OK = FALSE;
1596     }
1597 ph10 182
1598 ph10 178 if (OK == (d == OP_VSPACE))
1599 ph10 182 {
1600 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1601     {
1602     active_count--; /* Remove non-match possibility */
1603     next_active_state--;
1604     }
1605     if (++count >= GET2(code, 1))
1606     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1607     else
1608     { ADD_NEW_DATA(-state_offset, count, 0); }
1609     }
1610     }
1611     break;
1612    
1613     /*-----------------------------------------------------------------*/
1614     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1615     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1616     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1617     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1618     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1619     { ADD_ACTIVE(state_offset + 4, 0); }
1620     count = current_state->count; /* Number already matched */
1621     if (clen > 0)
1622     {
1623 ph10 182 BOOL OK;
1624 ph10 178 switch (c)
1625     {
1626     case 0x09: /* HT */
1627     case 0x20: /* SPACE */
1628     case 0xa0: /* NBSP */
1629     case 0x1680: /* OGHAM SPACE MARK */
1630     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1631     case 0x2000: /* EN QUAD */
1632     case 0x2001: /* EM QUAD */
1633     case 0x2002: /* EN SPACE */
1634     case 0x2003: /* EM SPACE */
1635     case 0x2004: /* THREE-PER-EM SPACE */
1636     case 0x2005: /* FOUR-PER-EM SPACE */
1637     case 0x2006: /* SIX-PER-EM SPACE */
1638     case 0x2007: /* FIGURE SPACE */
1639     case 0x2008: /* PUNCTUATION SPACE */
1640     case 0x2009: /* THIN SPACE */
1641     case 0x200A: /* HAIR SPACE */
1642     case 0x202f: /* NARROW NO-BREAK SPACE */
1643     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1644     case 0x3000: /* IDEOGRAPHIC SPACE */
1645     OK = TRUE;
1646     break;
1647 ph10 182
1648 ph10 178 default:
1649     OK = FALSE;
1650     break;
1651     }
1652 ph10 182
1653 ph10 178 if (OK == (d == OP_HSPACE))
1654 ph10 182 {
1655 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1656     {
1657     active_count--; /* Remove non-match possibility */
1658     next_active_state--;
1659     }
1660     if (++count >= GET2(code, 1))
1661     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1662     else
1663     { ADD_NEW_DATA(-state_offset, count, 0); }
1664     }
1665     }
1666     break;
1667    
1668 nigel 77 /* ========================================================================== */
1669     /* These opcodes are followed by a character that is usually compared
1670     to the current subject character; it is loaded into d. We still get
1671     here even if there is no subject character, because in some cases zero
1672     repetitions are permitted. */
1673    
1674     /*-----------------------------------------------------------------*/
1675     case OP_CHAR:
1676     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1677     break;
1678    
1679     /*-----------------------------------------------------------------*/
1680     case OP_CHARNC:
1681     if (clen == 0) break;
1682    
1683     #ifdef SUPPORT_UTF8
1684     if (utf8)
1685     {
1686     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1687     {
1688 nigel 93 unsigned int othercase;
1689 nigel 77 if (c < 128) othercase = fcc[c]; else
1690    
1691     /* If we have Unicode property support, we can use it to test the
1692 nigel 87 other case of the character. */
1693 nigel 77
1694     #ifdef SUPPORT_UCP
1695 nigel 87 othercase = _pcre_ucp_othercase(c);
1696     #else
1697 nigel 93 othercase = NOTACHAR;
1698 nigel 77 #endif
1699    
1700     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1701     }
1702     }
1703     else
1704     #endif /* SUPPORT_UTF8 */
1705    
1706     /* Non-UTF-8 mode */
1707     {
1708     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1709     }
1710     break;
1711    
1712    
1713     #ifdef SUPPORT_UCP
1714     /*-----------------------------------------------------------------*/
1715     /* This is a tricky one because it can match more than one character.
1716     Find out how many characters to skip, and then set up a negative state
1717     to wait for them to pass before continuing. */
1718    
1719     case OP_EXTUNI:
1720 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1721 nigel 77 {
1722     const uschar *nptr = ptr + clen;
1723     int ncount = 0;
1724     while (nptr < end_subject)
1725     {
1726     int nclen = 1;
1727     GETCHARLEN(c, nptr, nclen);
1728 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1729 nigel 77 ncount++;
1730     nptr += nclen;
1731     }
1732     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1733     }
1734     break;
1735     #endif
1736    
1737     /*-----------------------------------------------------------------*/
1738 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1739     character (when CR is followed by LF). In this case, set up a negative
1740     state to wait for one character to pass before continuing. */
1741    
1742     case OP_ANYNL:
1743     if (clen > 0) switch(c)
1744     {
1745     case 0x000a:
1746     case 0x000b:
1747     case 0x000c:
1748     case 0x0085:
1749     case 0x2028:
1750     case 0x2029:
1751     ADD_NEW(state_offset + 1, 0);
1752     break;
1753     case 0x000d:
1754     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1755     {
1756     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1757     }
1758     else
1759     {
1760     ADD_NEW(state_offset + 1, 0);
1761     }
1762     break;
1763     }
1764     break;
1765    
1766     /*-----------------------------------------------------------------*/
1767 ph10 178 case OP_NOT_VSPACE:
1768     if (clen > 0) switch(c)
1769     {
1770     case 0x000a:
1771     case 0x000b:
1772     case 0x000c:
1773     case 0x000d:
1774     case 0x0085:
1775     case 0x2028:
1776     case 0x2029:
1777     break;
1778 ph10 182
1779     default:
1780 ph10 178 ADD_NEW(state_offset + 1, 0);
1781     break;
1782     }
1783     break;
1784    
1785     /*-----------------------------------------------------------------*/
1786     case OP_VSPACE:
1787     if (clen > 0) switch(c)
1788     {
1789     case 0x000a:
1790     case 0x000b:
1791     case 0x000c:
1792     case 0x000d:
1793     case 0x0085:
1794     case 0x2028:
1795     case 0x2029:
1796     ADD_NEW(state_offset + 1, 0);
1797     break;
1798 ph10 182
1799 ph10 178 default: break;
1800     }
1801     break;
1802    
1803     /*-----------------------------------------------------------------*/
1804     case OP_NOT_HSPACE:
1805     if (clen > 0) switch(c)
1806     {
1807     case 0x09: /* HT */
1808     case 0x20: /* SPACE */
1809     case 0xa0: /* NBSP */
1810     case 0x1680: /* OGHAM SPACE MARK */
1811     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1812     case 0x2000: /* EN QUAD */
1813     case 0x2001: /* EM QUAD */
1814     case 0x2002: /* EN SPACE */
1815     case 0x2003: /* EM SPACE */
1816     case 0x2004: /* THREE-PER-EM SPACE */
1817     case 0x2005: /* FOUR-PER-EM SPACE */
1818     case 0x2006: /* SIX-PER-EM SPACE */
1819     case 0x2007: /* FIGURE SPACE */
1820     case 0x2008: /* PUNCTUATION SPACE */
1821     case 0x2009: /* THIN SPACE */
1822     case 0x200A: /* HAIR SPACE */
1823     case 0x202f: /* NARROW NO-BREAK SPACE */
1824     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1825     case 0x3000: /* IDEOGRAPHIC SPACE */
1826     break;
1827 ph10 182
1828     default:
1829 ph10 178 ADD_NEW(state_offset + 1, 0);
1830     break;
1831     }
1832     break;
1833    
1834     /*-----------------------------------------------------------------*/
1835     case OP_HSPACE:
1836     if (clen > 0) switch(c)
1837     {
1838     case 0x09: /* HT */
1839     case 0x20: /* SPACE */
1840     case 0xa0: /* NBSP */
1841     case 0x1680: /* OGHAM SPACE MARK */
1842     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1843     case 0x2000: /* EN QUAD */
1844     case 0x2001: /* EM QUAD */
1845     case 0x2002: /* EN SPACE */
1846     case 0x2003: /* EM SPACE */
1847     case 0x2004: /* THREE-PER-EM SPACE */
1848     case 0x2005: /* FOUR-PER-EM SPACE */
1849     case 0x2006: /* SIX-PER-EM SPACE */
1850     case 0x2007: /* FIGURE SPACE */
1851     case 0x2008: /* PUNCTUATION SPACE */
1852     case 0x2009: /* THIN SPACE */
1853     case 0x200A: /* HAIR SPACE */
1854     case 0x202f: /* NARROW NO-BREAK SPACE */
1855     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1856     case 0x3000: /* IDEOGRAPHIC SPACE */
1857     ADD_NEW(state_offset + 1, 0);
1858     break;
1859     }
1860     break;
1861    
1862     /*-----------------------------------------------------------------*/
1863 nigel 77 /* Match a negated single character. This is only used for one-byte
1864     characters, that is, we know that d < 256. The character we are
1865     checking (c) can be multibyte. */
1866    
1867     case OP_NOT:
1868     if (clen > 0)
1869     {
1870 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1871 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1872     }
1873     break;
1874    
1875     /*-----------------------------------------------------------------*/
1876     case OP_PLUS:
1877     case OP_MINPLUS:
1878 nigel 93 case OP_POSPLUS:
1879 nigel 77 case OP_NOTPLUS:
1880     case OP_NOTMINPLUS:
1881 nigel 93 case OP_NOTPOSPLUS:
1882 nigel 77 count = current_state->count; /* Already matched */
1883     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1884     if (clen > 0)
1885     {
1886 nigel 93 unsigned int otherd = NOTACHAR;
1887 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1888     {
1889     #ifdef SUPPORT_UTF8
1890 nigel 87 if (utf8 && d >= 128)
1891 nigel 77 {
1892     #ifdef SUPPORT_UCP
1893 nigel 87 otherd = _pcre_ucp_othercase(d);
1894 nigel 77 #endif /* SUPPORT_UCP */
1895     }
1896     else
1897     #endif /* SUPPORT_UTF8 */
1898     otherd = fcc[d];
1899     }
1900     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1901 nigel 93 {
1902     if (count > 0 &&
1903     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1904     {
1905     active_count--; /* Remove non-match possibility */
1906     next_active_state--;
1907     }
1908     count++;
1909     ADD_NEW(state_offset, count);
1910     }
1911 nigel 77 }
1912     break;
1913    
1914     /*-----------------------------------------------------------------*/
1915     case OP_QUERY:
1916     case OP_MINQUERY:
1917 nigel 93 case OP_POSQUERY:
1918 nigel 77 case OP_NOTQUERY:
1919     case OP_NOTMINQUERY:
1920 nigel 93 case OP_NOTPOSQUERY:
1921 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1922     if (clen > 0)
1923     {
1924 nigel 93 unsigned int otherd = NOTACHAR;
1925 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1926 nigel 77 {
1927     #ifdef SUPPORT_UTF8
1928 nigel 87 if (utf8 && d >= 128)
1929 nigel 77 {
1930     #ifdef SUPPORT_UCP
1931 nigel 87 otherd = _pcre_ucp_othercase(d);
1932 nigel 77 #endif /* SUPPORT_UCP */
1933     }
1934     else
1935     #endif /* SUPPORT_UTF8 */
1936     otherd = fcc[d];
1937     }
1938     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1939 nigel 93 {
1940     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1941     {
1942     active_count--; /* Remove non-match possibility */
1943     next_active_state--;
1944     }
1945     ADD_NEW(state_offset + dlen + 1, 0);
1946     }
1947 nigel 77 }
1948     break;
1949    
1950     /*-----------------------------------------------------------------*/
1951     case OP_STAR:
1952     case OP_MINSTAR:
1953 nigel 93 case OP_POSSTAR:
1954 nigel 77 case OP_NOTSTAR:
1955     case OP_NOTMINSTAR:
1956 nigel 93 case OP_NOTPOSSTAR:
1957 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1958     if (clen > 0)
1959     {
1960 nigel 93 unsigned int otherd = NOTACHAR;
1961 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1962 nigel 77 {
1963     #ifdef SUPPORT_UTF8
1964 nigel 87 if (utf8 && d >= 128)
1965 nigel 77 {
1966     #ifdef SUPPORT_UCP
1967 nigel 87 otherd = _pcre_ucp_othercase(d);
1968 nigel 77 #endif /* SUPPORT_UCP */
1969     }
1970     else
1971     #endif /* SUPPORT_UTF8 */
1972     otherd = fcc[d];
1973     }
1974     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1975 nigel 93 {
1976     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1977     {
1978     active_count--; /* Remove non-match possibility */
1979     next_active_state--;
1980     }
1981     ADD_NEW(state_offset, 0);
1982     }
1983 nigel 77 }
1984     break;
1985    
1986     /*-----------------------------------------------------------------*/
1987     case OP_EXACT:
1988 nigel 93 case OP_NOTEXACT:
1989     count = current_state->count; /* Number already matched */
1990     if (clen > 0)
1991     {
1992     unsigned int otherd = NOTACHAR;
1993     if ((ims & PCRE_CASELESS) != 0)
1994     {
1995     #ifdef SUPPORT_UTF8
1996     if (utf8 && d >= 128)
1997     {
1998     #ifdef SUPPORT_UCP
1999     otherd = _pcre_ucp_othercase(d);
2000     #endif /* SUPPORT_UCP */
2001     }
2002     else
2003     #endif /* SUPPORT_UTF8 */
2004     otherd = fcc[d];
2005     }
2006     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2007     {
2008     if (++count >= GET2(code, 1))
2009     { ADD_NEW(state_offset + dlen + 3, 0); }
2010     else
2011     { ADD_NEW(state_offset, count); }
2012     }
2013     }
2014     break;
2015    
2016     /*-----------------------------------------------------------------*/
2017 nigel 77 case OP_UPTO:
2018     case OP_MINUPTO:
2019 nigel 93 case OP_POSUPTO:
2020 nigel 77 case OP_NOTUPTO:
2021     case OP_NOTMINUPTO:
2022 nigel 93 case OP_NOTPOSUPTO:
2023     ADD_ACTIVE(state_offset + dlen + 3, 0);
2024 nigel 77 count = current_state->count; /* Number already matched */
2025     if (clen > 0)
2026     {
2027 nigel 93 unsigned int otherd = NOTACHAR;
2028 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2029     {
2030     #ifdef SUPPORT_UTF8
2031 nigel 87 if (utf8 && d >= 128)
2032 nigel 77 {
2033     #ifdef SUPPORT_UCP
2034 nigel 87 otherd = _pcre_ucp_othercase(d);
2035 nigel 77 #endif /* SUPPORT_UCP */
2036     }
2037     else
2038     #endif /* SUPPORT_UTF8 */
2039     otherd = fcc[d];
2040     }
2041     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2042     {
2043 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2044     {
2045     active_count--; /* Remove non-match possibility */
2046     next_active_state--;
2047     }
2048 nigel 77 if (++count >= GET2(code, 1))
2049     { ADD_NEW(state_offset + dlen + 3, 0); }
2050     else
2051     { ADD_NEW(state_offset, count); }
2052     }
2053     }
2054     break;
2055    
2056    
2057     /* ========================================================================== */
2058     /* These are the class-handling opcodes */
2059    
2060     case OP_CLASS:
2061     case OP_NCLASS:
2062     case OP_XCLASS:
2063     {
2064     BOOL isinclass = FALSE;
2065     int next_state_offset;
2066     const uschar *ecode;
2067    
2068     /* For a simple class, there is always just a 32-byte table, and we
2069     can set isinclass from it. */
2070    
2071     if (codevalue != OP_XCLASS)
2072     {
2073     ecode = code + 33;
2074     if (clen > 0)
2075     {
2076     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2077     ((code[1 + c/8] & (1 << (c&7))) != 0);
2078     }
2079     }
2080    
2081     /* An extended class may have a table or a list of single characters,
2082     ranges, or both, and it may be positive or negative. There's a
2083     function that sorts all this out. */
2084    
2085     else
2086     {
2087     ecode = code + GET(code, 1);
2088     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2089     }
2090    
2091     /* At this point, isinclass is set for all kinds of class, and ecode
2092     points to the byte after the end of the class. If there is a
2093     quantifier, this is where it will be. */
2094    
2095     next_state_offset = ecode - start_code;
2096    
2097     switch (*ecode)
2098     {
2099     case OP_CRSTAR:
2100     case OP_CRMINSTAR:
2101     ADD_ACTIVE(next_state_offset + 1, 0);
2102     if (isinclass) { ADD_NEW(state_offset, 0); }
2103     break;
2104    
2105     case OP_CRPLUS:
2106     case OP_CRMINPLUS:
2107     count = current_state->count; /* Already matched */
2108     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2109     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2110     break;
2111    
2112     case OP_CRQUERY:
2113     case OP_CRMINQUERY:
2114     ADD_ACTIVE(next_state_offset + 1, 0);
2115     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2116     break;
2117    
2118     case OP_CRRANGE:
2119     case OP_CRMINRANGE:
2120     count = current_state->count; /* Already matched */
2121     if (count >= GET2(ecode, 1))
2122     { ADD_ACTIVE(next_state_offset + 5, 0); }
2123     if (isinclass)
2124     {
2125 nigel 91 int max = GET2(ecode, 3);
2126     if (++count >= max && max != 0) /* Max 0 => no limit */
2127 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2128     else
2129     { ADD_NEW(state_offset, count); }
2130     }
2131     break;
2132    
2133     default:
2134     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2135     break;
2136     }
2137     }
2138     break;
2139    
2140     /* ========================================================================== */
2141     /* These are the opcodes for fancy brackets of various kinds. We have
2142     to use recursion in order to handle them. */
2143    
2144     case OP_ASSERT:
2145     case OP_ASSERT_NOT:
2146     case OP_ASSERTBACK:
2147     case OP_ASSERTBACK_NOT:
2148     {
2149     int rc;
2150     int local_offsets[2];
2151     int local_workspace[1000];
2152     const uschar *endasscode = code + GET(code, 1);
2153    
2154     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2155    
2156     rc = internal_dfa_exec(
2157     md, /* static match data */
2158     code, /* this subexpression's code */
2159     ptr, /* where we currently are */
2160     ptr - start_subject, /* start offset */
2161     local_offsets, /* offset vector */
2162     sizeof(local_offsets)/sizeof(int), /* size of same */
2163     local_workspace, /* workspace vector */
2164     sizeof(local_workspace)/sizeof(int), /* size of same */
2165     ims, /* the current ims flags */
2166     rlevel, /* function recursion level */
2167     recursing); /* pass on regex recursion */
2168    
2169     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2170     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2171     }
2172     break;
2173    
2174     /*-----------------------------------------------------------------*/
2175     case OP_COND:
2176 nigel 93 case OP_SCOND:
2177 nigel 77 {
2178     int local_offsets[1000];
2179     int local_workspace[1000];
2180     int condcode = code[LINK_SIZE+1];
2181    
2182 nigel 93 /* Back reference conditions are not supported */
2183 nigel 77
2184 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2185    
2186     /* The DEFINE condition is always false */
2187    
2188     if (condcode == OP_DEF)
2189 nigel 77 {
2190 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2191     }
2192    
2193     /* The only supported version of OP_RREF is for the value RREF_ANY,
2194     which means "test if in any recursion". We can't test for specifically
2195     recursed groups. */
2196    
2197     else if (condcode == OP_RREF)
2198     {
2199 nigel 77 int value = GET2(code, LINK_SIZE+2);
2200 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2201 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2202     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2203     }
2204    
2205     /* Otherwise, the condition is an assertion */
2206    
2207     else
2208     {
2209     int rc;
2210     const uschar *asscode = code + LINK_SIZE + 1;
2211     const uschar *endasscode = asscode + GET(asscode, 1);
2212    
2213     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2214    
2215     rc = internal_dfa_exec(
2216     md, /* fixed match data */
2217     asscode, /* this subexpression's code */
2218     ptr, /* where we currently are */
2219     ptr - start_subject, /* start offset */
2220     local_offsets, /* offset vector */
2221     sizeof(local_offsets)/sizeof(int), /* size of same */
2222     local_workspace, /* workspace vector */
2223     sizeof(local_workspace)/sizeof(int), /* size of same */
2224     ims, /* the current ims flags */
2225     rlevel, /* function recursion level */
2226     recursing); /* pass on regex recursion */
2227    
2228     if ((rc >= 0) ==
2229     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2230     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2231     else
2232     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2233     }
2234     }
2235     break;
2236    
2237     /*-----------------------------------------------------------------*/
2238     case OP_RECURSE:
2239     {
2240     int local_offsets[1000];
2241     int local_workspace[1000];
2242     int rc;
2243    
2244     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2245     recursing + 1));
2246    
2247     rc = internal_dfa_exec(
2248     md, /* fixed match data */
2249     start_code + GET(code, 1), /* this subexpression's code */
2250     ptr, /* where we currently are */
2251     ptr - start_subject, /* start offset */
2252     local_offsets, /* offset vector */
2253     sizeof(local_offsets)/sizeof(int), /* size of same */
2254     local_workspace, /* workspace vector */
2255     sizeof(local_workspace)/sizeof(int), /* size of same */
2256     ims, /* the current ims flags */
2257     rlevel, /* function recursion level */
2258     recursing + 1); /* regex recurse level */
2259    
2260     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2261     recursing + 1, rc));
2262    
2263     /* Ran out of internal offsets */
2264    
2265     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2266    
2267     /* For each successful matched substring, set up the next state with a
2268     count of characters to skip before trying it. Note that the count is in
2269     characters, not bytes. */
2270    
2271     if (rc > 0)
2272     {
2273     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2274     {
2275     const uschar *p = start_subject + local_offsets[rc];
2276     const uschar *pp = start_subject + local_offsets[rc+1];
2277     int charcount = local_offsets[rc+1] - local_offsets[rc];
2278     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2279     if (charcount > 0)
2280     {
2281     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2282     }
2283     else
2284     {
2285     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2286     }
2287     }
2288     }
2289     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2290     }
2291     break;
2292    
2293     /*-----------------------------------------------------------------*/
2294     case OP_ONCE:
2295     {
2296     int local_offsets[2];
2297     int local_workspace[1000];
2298    
2299     int rc = internal_dfa_exec(
2300     md, /* fixed match data */
2301     code, /* this subexpression's code */
2302     ptr, /* where we currently are */
2303     ptr - start_subject, /* start offset */
2304     local_offsets, /* offset vector */
2305     sizeof(local_offsets)/sizeof(int), /* size of same */
2306     local_workspace, /* workspace vector */
2307     sizeof(local_workspace)/sizeof(int), /* size of same */
2308     ims, /* the current ims flags */
2309     rlevel, /* function recursion level */
2310     recursing); /* pass on regex recursion */
2311    
2312     if (rc >= 0)
2313     {
2314     const uschar *end_subpattern = code;
2315     int charcount = local_offsets[1] - local_offsets[0];
2316     int next_state_offset, repeat_state_offset;
2317    
2318     do { end_subpattern += GET(end_subpattern, 1); }
2319     while (*end_subpattern == OP_ALT);
2320     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2321    
2322     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2323     arrange for the repeat state also to be added to the relevant list.
2324     Calculate the offset, or set -1 for no repeat. */
2325    
2326     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2327     *end_subpattern == OP_KETRMIN)?
2328     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2329    
2330     /* If we have matched an empty string, add the next state at the
2331     current character pointer. This is important so that the duplicate
2332     checking kicks in, which is what breaks infinite loops that match an
2333     empty string. */
2334    
2335     if (charcount == 0)
2336     {
2337     ADD_ACTIVE(next_state_offset, 0);
2338     }
2339    
2340     /* Optimization: if there are no more active states, and there
2341     are no new states yet set up, then skip over the subject string
2342     right here, to save looping. Otherwise, set up the new state to swing
2343     into action when the end of the substring is reached. */
2344    
2345     else if (i + 1 >= active_count && new_count == 0)
2346     {
2347     ptr += charcount;
2348     clen = 0;
2349     ADD_NEW(next_state_offset, 0);
2350    
2351     /* If we are adding a repeat state at the new character position,
2352     we must fudge things so that it is the only current state.
2353     Otherwise, it might be a duplicate of one we processed before, and
2354     that would cause it to be skipped. */
2355    
2356     if (repeat_state_offset >= 0)
2357     {
2358     next_active_state = active_states;
2359     active_count = 0;
2360     i = -1;
2361     ADD_ACTIVE(repeat_state_offset, 0);
2362     }
2363     }
2364     else
2365     {
2366     const uschar *p = start_subject + local_offsets[0];
2367     const uschar *pp = start_subject + local_offsets[1];
2368     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2369     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2370     if (repeat_state_offset >= 0)
2371     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2372     }
2373    
2374     }
2375     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2376     }
2377     break;
2378    
2379    
2380     /* ========================================================================== */
2381     /* Handle callouts */
2382    
2383     case OP_CALLOUT:
2384     if (pcre_callout != NULL)
2385     {
2386     int rrc;
2387     pcre_callout_block cb;
2388     cb.version = 1; /* Version 1 of the callout block */
2389     cb.callout_number = code[1];
2390     cb.offset_vector = offsets;
2391 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2392 nigel 77 cb.subject_length = end_subject - start_subject;
2393     cb.start_match = current_subject - start_subject;
2394     cb.current_position = ptr - start_subject;
2395     cb.pattern_position = GET(code, 2);
2396     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2397     cb.capture_top = 1;
2398     cb.capture_last = -1;
2399     cb.callout_data = md->callout_data;
2400     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2401     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2402     }
2403     break;
2404    
2405    
2406     /* ========================================================================== */
2407     default: /* Unsupported opcode */
2408     return PCRE_ERROR_DFA_UITEM;
2409     }
2410    
2411     NEXT_ACTIVE_STATE: continue;
2412    
2413     } /* End of loop scanning active states */
2414    
2415     /* We have finished the processing at the current subject character. If no
2416     new states have been set for the next character, we have found all the
2417     matches that we are going to find. If we are at the top level and partial
2418     matching has been requested, check for appropriate conditions. */
2419    
2420     if (new_count <= 0)
2421     {
2422     if (match_count < 0 && /* No matches found */
2423     rlevel == 1 && /* Top level match function */
2424     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2425     ptr >= end_subject && /* Reached end of subject */
2426     ptr > current_subject) /* Matched non-empty string */
2427     {
2428     if (offsetcount >= 2)
2429     {
2430     offsets[0] = current_subject - start_subject;
2431     offsets[1] = end_subject - start_subject;
2432     }
2433     match_count = PCRE_ERROR_PARTIAL;
2434     }
2435    
2436     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2437     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2438     rlevel*2-2, SP));
2439 nigel 91 break; /* In effect, "return", but see the comment below */
2440 nigel 77 }
2441    
2442     /* One or more states are active for the next character. */
2443    
2444     ptr += clen; /* Advance to next subject character */
2445     } /* Loop to move along the subject string */
2446    
2447 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2448     if we use "return" above, we have compiler trouble. Some compilers warn if
2449     there's nothing here because they think the function doesn't return a value. On
2450     the other hand, if we put a dummy statement here, some more clever compilers
2451     complain that it can't be reached. Sigh. */
2452 nigel 77
2453 nigel 91 return match_count;
2454 nigel 77 }
2455    
2456    
2457    
2458    
2459     /*************************************************
2460     * Execute a Regular Expression - DFA engine *
2461     *************************************************/
2462    
2463     /* This external function applies a compiled re to a subject string using a DFA
2464     engine. This function calls the internal function multiple times if the pattern
2465     is not anchored.
2466    
2467     Arguments:
2468     argument_re points to the compiled expression
2469 ph10 97 extra_data points to extra data or is NULL
2470 nigel 77 subject points to the subject string
2471     length length of subject string (may contain binary zeros)
2472     start_offset where to start in the subject string
2473     options option bits
2474     offsets vector of match offsets
2475     offsetcount size of same
2476     workspace workspace vector
2477     wscount size of same
2478    
2479     Returns: > 0 => number of match offset pairs placed in offsets
2480     = 0 => offsets overflowed; longest matches are present
2481     -1 => failed to match
2482     < -1 => some kind of unexpected problem
2483     */
2484    
2485 ph10 145 PCRE_EXP_DEFN int
2486 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2487     const char *subject, int length, int start_offset, int options, int *offsets,
2488     int offsetcount, int *workspace, int wscount)
2489     {
2490     real_pcre *re = (real_pcre *)argument_re;
2491     dfa_match_data match_block;
2492 nigel 91 dfa_match_data *md = &match_block;
2493 nigel 77 BOOL utf8, anchored, startline, firstline;
2494     const uschar *current_subject, *end_subject, *lcc;
2495    
2496     pcre_study_data internal_study;
2497     const pcre_study_data *study = NULL;
2498     real_pcre internal_re;
2499    
2500     const uschar *req_byte_ptr;
2501     const uschar *start_bits = NULL;
2502     BOOL first_byte_caseless = FALSE;
2503     BOOL req_byte_caseless = FALSE;
2504     int first_byte = -1;
2505     int req_byte = -1;
2506     int req_byte2 = -1;
2507 nigel 91 int newline;
2508 nigel 77
2509     /* Plausibility checks */
2510    
2511     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2512     if (re == NULL || subject == NULL || workspace == NULL ||
2513     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2514     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2515     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2516    
2517     /* We need to find the pointer to any study data before we test for byte
2518     flipping, so we scan the extra_data block first. This may set two fields in the
2519     match block, so we must initialize them beforehand. However, the other fields
2520     in the match block must not be set until after the byte flipping. */
2521    
2522 nigel 91 md->tables = re->tables;
2523     md->callout_data = NULL;
2524 nigel 77
2525     if (extra_data != NULL)
2526     {
2527     unsigned int flags = extra_data->flags;
2528     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2529     study = (const pcre_study_data *)extra_data->study_data;
2530     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2531 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2532     return PCRE_ERROR_DFA_UMLIMIT;
2533 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2534 nigel 91 md->callout_data = extra_data->callout_data;
2535 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2536 nigel 91 md->tables = extra_data->tables;
2537 nigel 77 }
2538    
2539     /* Check that the first field in the block is the magic number. If it is not,
2540     test for a regex that was compiled on a host of opposite endianness. If this is
2541     the case, flipped values are put in internal_re and internal_study if there was
2542     study data too. */
2543    
2544     if (re->magic_number != MAGIC_NUMBER)
2545     {
2546     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2547     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2548     if (study != NULL) study = &internal_study;
2549     }
2550    
2551     /* Set some local values */
2552    
2553     current_subject = (const unsigned char *)subject + start_offset;
2554     end_subject = (const unsigned char *)subject + length;
2555     req_byte_ptr = current_subject - 1;
2556    
2557 nigel 91 #ifdef SUPPORT_UTF8
2558 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2559 nigel 91 #else
2560     utf8 = FALSE;
2561     #endif
2562 nigel 77
2563 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2564     (re->options & PCRE_ANCHORED) != 0;
2565    
2566 nigel 77 /* The remaining fixed data for passing around. */
2567    
2568 nigel 91 md->start_code = (const uschar *)argument_re +
2569 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2570 nigel 91 md->start_subject = (const unsigned char *)subject;
2571     md->end_subject = end_subject;
2572     md->moptions = options;
2573     md->poptions = re->options;
2574 nigel 77
2575 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2576     nothing is set at run time, whatever was used at compile time applies. */
2577 nigel 91
2578 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2579 nigel 93 PCRE_NEWLINE_BITS)
2580 nigel 91 {
2581 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2582 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2583     case PCRE_NEWLINE_LF: newline = '\n'; break;
2584     case PCRE_NEWLINE_CR+
2585     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2586 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2587 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2588 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2589 nigel 91 }
2590    
2591 ph10 149 if (newline == -2)
2592 nigel 91 {
2593 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2594     }
2595     else if (newline < 0)
2596     {
2597 nigel 93 md->nltype = NLTYPE_ANY;
2598 nigel 91 }
2599     else
2600     {
2601 nigel 93 md->nltype = NLTYPE_FIXED;
2602     if (newline > 255)
2603     {
2604     md->nllen = 2;
2605     md->nl[0] = (newline >> 8) & 255;
2606     md->nl[1] = newline & 255;
2607     }
2608     else
2609     {
2610     md->nllen = 1;
2611     md->nl[0] = newline;
2612     }
2613 nigel 91 }
2614    
2615 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2616     back the character offset. */
2617    
2618     #ifdef SUPPORT_UTF8
2619     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2620     {
2621     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2622     return PCRE_ERROR_BADUTF8;
2623     if (start_offset > 0 && start_offset < length)
2624     {
2625     int tb = ((uschar *)subject)[start_offset];
2626     if (tb > 127)
2627     {
2628     tb &= 0xc0;
2629     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2630     }
2631     }
2632     }
2633     #endif
2634    
2635     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2636     is a feature that makes it possible to save compiled regex and re-use them
2637     in other programs later. */
2638    
2639 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2640 nigel 77
2641     /* The lower casing table and the "must be at the start of a line" flag are
2642     used in a loop when finding where to start. */
2643    
2644 nigel 91 lcc = md->tables + lcc_offset;
2645 nigel 77 startline = (re->options & PCRE_STARTLINE) != 0;
2646     firstline = (re->options & PCRE_FIRSTLINE) != 0;
2647    
2648     /* Set up the first character to match, if available. The first_byte value is
2649     never set for an anchored regular expression, but the anchoring may be forced
2650     at run time, so we have to test for anchoring. The first char may be unset for
2651     an unanchored pattern, of course. If there's no first char and the pattern was
2652     studied, there may be a bitmap of possible first characters. */
2653    
2654     if (!anchored)
2655     {
2656     if ((re->options & PCRE_FIRSTSET) != 0)
2657     {
2658     first_byte = re->first_byte & 255;
2659     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2660     first_byte = lcc[first_byte];
2661     }
2662     else
2663     {
2664     if (startline && study != NULL &&
2665     (study->options & PCRE_STUDY_MAPPED) != 0)
2666     start_bits = study->start_bits;
2667     }
2668     }
2669    
2670     /* For anchored or unanchored matches, there may be a "last known required
2671     character" set. */
2672    
2673     if ((re->options & PCRE_REQCHSET) != 0)
2674     {
2675     req_byte = re->req_byte & 255;
2676     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2677 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2678 nigel 77 }
2679    
2680     /* Call the main matching function, looping for a non-anchored regex after a
2681     failed match. Unless restarting, optimize by moving to the first match
2682     character if possible, when not anchored. Then unless wanting a partial match,
2683     check for a required later character. */
2684    
2685     for (;;)
2686     {
2687     int rc;
2688    
2689     if ((options & PCRE_DFA_RESTART) == 0)
2690     {
2691     const uschar *save_end_subject = end_subject;
2692    
2693     /* Advance to a unique first char if possible. If firstline is TRUE, the
2694     start of the match is constrained to the first line of a multiline string.
2695 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2696     scanning at a newline. If the match fails at the newline, later code breaks
2697     this loop. */
2698 nigel 77
2699     if (firstline)
2700     {
2701     const uschar *t = current_subject;
2702 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2703 nigel 77 end_subject = t;
2704     }
2705    
2706     if (first_byte >= 0)
2707     {
2708     if (first_byte_caseless)
2709     while (current_subject < end_subject &&
2710     lcc[*current_subject] != first_byte)
2711     current_subject++;
2712     else
2713     while (current_subject < end_subject && *current_subject != first_byte)
2714     current_subject++;
2715     }
2716    
2717 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2718 nigel 77
2719     else if (startline)
2720     {
2721 nigel 93 if (current_subject > md->start_subject + start_offset)
2722 nigel 77 {
2723 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2724 nigel 77 current_subject++;
2725 ph10 130
2726 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2727     ANYCRLF, and we are now at a LF, advance the match position by one more
2728     character. */
2729 ph10 134
2730 ph10 130 if (current_subject[-1] == '\r' &&
2731 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2732 ph10 130 current_subject < end_subject &&
2733     *current_subject == '\n')
2734     current_subject++;
2735 nigel 77 }
2736     }
2737    
2738     /* Or to a non-unique first char after study */
2739    
2740     else if (start_bits != NULL)
2741     {
2742     while (current_subject < end_subject)
2743     {
2744     register unsigned int c = *current_subject;
2745     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2746     else break;
2747     }
2748     }
2749    
2750     /* Restore fudged end_subject */
2751    
2752     end_subject = save_end_subject;
2753     }
2754    
2755     /* If req_byte is set, we know that that character must appear in the subject
2756     for the match to succeed. If the first character is set, req_byte must be
2757     later in the subject; otherwise the test starts at the match point. This
2758     optimization can save a huge amount of work in patterns with nested unlimited
2759     repeats that aren't going to match. Writing separate code for cased/caseless
2760     versions makes it go faster, as does using an autoincrement and backing off
2761     on a match.
2762    
2763     HOWEVER: when the subject string is very, very long, searching to its end can
2764     take a long time, and give bad performance on quite ordinary patterns. This
2765     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2766     don't do this when the string is sufficiently long.
2767    
2768     ALSO: this processing is disabled when partial matching is requested.
2769     */
2770    
2771     if (req_byte >= 0 &&
2772     end_subject - current_subject < REQ_BYTE_MAX &&
2773     (options & PCRE_PARTIAL) == 0)
2774     {
2775     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2776    
2777     /* We don't need to repeat the search if we haven't yet reached the
2778     place we found it at last time. */
2779    
2780     if (p > req_byte_ptr)
2781     {
2782     if (req_byte_caseless)
2783     {
2784     while (p < end_subject)
2785     {
2786     register int pp = *p++;
2787     if (pp == req_byte || pp == req_byte2) { p--; break; }
2788     }
2789     }
2790     else
2791     {
2792     while (p < end_subject)
2793     {
2794     if (*p++ == req_byte) { p--; break; }
2795     }
2796     }
2797    
2798     /* If we can't find the required character, break the matching loop,
2799     which will cause a return of PCRE_ERROR_NOMATCH. */
2800    
2801     if (p >= end_subject) break;
2802    
2803     /* If we have found the required character, save the point where we
2804     found it, so that we don't search again next time round the loop if
2805     the start hasn't passed this character yet. */
2806    
2807     req_byte_ptr = p;
2808     }
2809     }
2810    
2811     /* OK, now we can do the business */
2812    
2813     rc = internal_dfa_exec(
2814 nigel 91 md, /* fixed match data */
2815     md->start_code, /* this subexpression's code */
2816     current_subject, /* where we currently are */
2817     start_offset, /* start offset in subject */
2818     offsets, /* offset vector */
2819     offsetcount, /* size of same */
2820     workspace, /* workspace vector */
2821     wscount, /* size of same */
2822 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2823 nigel 91 0, /* function recurse level */
2824     0); /* regex recurse level */
2825 nigel 77
2826     /* Anything other than "no match" means we are done, always; otherwise, carry
2827     on only if not anchored. */
2828    
2829     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2830    
2831     /* Advance to the next subject character unless we are at the end of a line
2832     and firstline is set. */
2833    
2834 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2835 nigel 77 current_subject++;
2836     if (utf8)
2837     {
2838     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2839     current_subject++;
2840     }
2841     if (current_subject > end_subject) break;
2842    
2843 ph10 150 /* If we have just passed a CR and the newline option is CRLF or ANY or
2844 ph10 149 ANYCRLF, and we are now at a LF, advance the match position by one more
2845     character. */
2846 nigel 93
2847     if (current_subject[-1] == '\r' &&
2848 ph10 150 (md->nltype == NLTYPE_ANY ||
2849     md->nltype == NLTYPE_ANYCRLF ||
2850 ph10 149 md->nllen == 2) &&
2851 nigel 93 current_subject < end_subject &&
2852     *current_subject == '\n')
2853     current_subject++;
2854    
2855     } /* "Bumpalong" loop */
2856    
2857 nigel 77 return PCRE_ERROR_NOMATCH;
2858     }
2859    
2860     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12