/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91 - (hide annotations) (download)
Sat Feb 24 21:41:34 2007 UTC (7 years, 7 months ago) by nigel
File MIME type: text/plain
File size: 70023 byte(s)
Load pcre-6.7 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42     alternative matching function that uses a DFA algorithm. This is NOT Perl-
43     compatible, but it has advantages in certain applications. */
44    
45    
46 nigel 91 #define NLBLOCK md /* The block containing newline information */
47 nigel 77 #include "pcre_internal.h"
48    
49    
/* For use to indent debugging output. The debug printf calls in this file use
the form "%.*s" with a precision of rlevel*2-2 and SP as the string, so SP is
the pool of blanks that the indentation is cut from. A precision larger than
the length of SP prints only the blanks that are available, so this string
must be at least as long as the deepest indentation that is wanted (32 columns
here). */

#define SP "                                "
53    
54    
55    
/*************************************************
*      Code parameters and static tables         *
*************************************************/

/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 10 between the blocks should be
enough.

When a type-repeat opcode has OP_PROP or OP_NOTPROP as its argument,
OP_PROP_EXTRA is added to the opcode value before the main switch; when the
argument is OP_EXTUNI, OP_EXTUNI_EXTRA is added instead. The resulting
"virtual" opcodes get their own case labels, which keeps the ordinary
type-repeat cases fast. Both offsets are based on EXTRACT_BASIC_MAX, which is
defined outside this file. */

#define OP_PROP_EXTRA    (EXTRACT_BASIC_MAX+1)
#define OP_EXTUNI_EXTRA  (EXTRACT_BASIC_MAX+11)
66    
67    
68     /* This table identifies those opcodes that are followed immediately by a
69     character that is to be tested in some way. This makes is possible to
70     centralize the loading of these characters. In the case of Type * etc, the
71     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
72     small value. */
73    
74     static uschar coptable[] = {
75     0, /* End */
76     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* \A, \G, \B, \b, \D, \d, \S, \s, \W, \w */
77     0, 0, /* Any, Anybyte */
78     0, 0, 0, /* NOTPROP, PROP, EXTUNI */
79     0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
80     1, /* Char */
81     1, /* Charnc */
82     1, /* not */
83     /* Positive single-char repeats */
84     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
85     3, 3, 3, /* upto, minupto, exact */
86     /* Negative single-char repeats - only for chars < 256 */
87     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
88     3, 3, 3, /* NOT upto, minupto, exact */
89     /* Positive type repeats */
90     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
91     3, 3, 3, /* Type upto, minupto, exact */
92     /* Character class & ref repeats */
93     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
94     0, 0, /* CRRANGE, CRMINRANGE */
95     0, /* CLASS */
96     0, /* NCLASS */
97     0, /* XCLASS - variable length */
98     0, /* REF */
99     0, /* RECURSE */
100     0, /* CALLOUT */
101     0, /* Alt */
102     0, /* Ket */
103     0, /* KetRmax */
104     0, /* KetRmin */
105     0, /* Assert */
106     0, /* Assert not */
107     0, /* Assert behind */
108     0, /* Assert behind not */
109     0, /* Reverse */
110     0, /* Once */
111     0, /* COND */
112     0, /* CREF */
113     0, 0, /* BRAZERO, BRAMINZERO */
114     0, /* BRANUMBER */
115     0 /* BRA */
116     };
117    
118     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
119     and \w */
120    
121     static uschar toptable1[] = {
122     0, 0, 0, 0, 0,
123     ctype_digit, ctype_digit,
124     ctype_space, ctype_space,
125     ctype_word, ctype_word,
126     0 /* OP_ANY */
127     };
128    
129     static uschar toptable2[] = {
130     0, 0, 0, 0, 0,
131     ctype_digit, 0,
132     ctype_space, 0,
133     ctype_word, 0,
134     1 /* OP_ANY */
135     };
136    
137    
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means */
                                  /*   "hold off" until data characters have  */
                                  /*   been skipped                           */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* The number of ints occupied by one stateblock; used to work out how many
states fit in the int workspace vector. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
151    
152    
153     #ifdef DEBUG
/*************************************************
*           Print character string               *
*************************************************/

/* Character string printing function for debugging. Bytes for which
isprint() is true are written unchanged; every other byte is written as a
backslash, the letter x, and two lower-case hex digits. The string is only
read, never modified, so the pointer is const-qualified and callers need not
cast constness away.

Arguments:
  p           points to string
  length      number of bytes; nothing is written if this is not positive
  f           where to print

Returns:     nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;                   /* unsigned char value, so safe for isprint */
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
180     #endif
181    
182    
183    
/*************************************************
*     Execute a Regular Expression - DFA engine  *
*************************************************/

/* This internal function applies a compiled pattern to a subject string,
starting at a given point, using a DFA engine. This function is called from the
external one, possibly multiple times if the pattern is not anchored. The
function calls itself recursively for some kinds of subpattern.

Arguments:
  md                the match_data block with fixed information
  this_start_code   the opening bracket of this subexpression's code
  current_subject   where we currently are in the subject string
  start_offset      start offset in the subject string
  offsets           vector to contain the matching string offsets
  offsetcount       size of same
  workspace         vector of workspace
  wscount           size of same
  ims               the current ims flags
  rlevel            function call recursion level
  recursing         regex recursive call level

Returns:            > 0 => the number of matches whose offsets were stored,
                             longest first
                    = 0 => there was a match, but the offsets vector is too
                             small to hold all the matches
                     -1 => failed to match
                   < -1 => some kind of unexpected problem

The following macros are used for adding states to the two state vectors (one
for the current character, one for the following character). */
213    
/* Append a state with opcode offset x and repeat count y to the active state
list (the list for the current character). The macro uses these names from the
scope in which it is expanded: active_count, wscount, next_active_state and
ims, plus rlevel for the debug trace. The data field of the new state is not
set. If the list is already full, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE; note that active_count is incremented even in that
case. The expansion is an if/else whose final statement is unterminated, so
every use must be followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
224    
/* Append a state with opcode offset x, repeat count y and extra data z to the
active state list (the list for the current character). The macro uses these
names from the scope in which it is expanded: active_count, wscount,
next_active_state and ims, plus rlevel for the debug trace. If the list is
already full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE; note that
active_count is incremented even in that case. The expansion is an if/else
whose final statement is unterminated, so every use must be followed by a
semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
236    
/* Append a state with opcode offset x and repeat count y to the new state
list (the list for the following character). The macro uses these names from
the scope in which it is expanded: new_count, wscount, next_new_state and ims,
plus rlevel for the debug trace. The data field of the new state is not set.
If the list is already full, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE; note that new_count is incremented even in that case.
The expansion is an if/else whose final statement is unterminated, so every
use must be followed by a semicolon. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
247    
/* Append a state with opcode offset x, repeat count y and extra data z to the
new state list (the list for the following character). The macro uses these
names from the scope in which it is expanded: new_count, wscount,
next_new_state and ims, plus rlevel for the debug trace. If the list is
already full, the enclosing function returns PCRE_ERROR_DFA_WSSIZE; note that
new_count is incremented even in that case. The expansion is an if/else whose
final statement is unterminated, so every use must be followed by a
semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
259    
260     /* And now, here is the code */
261    
262     static int
263     internal_dfa_exec(
264     dfa_match_data *md,
265     const uschar *this_start_code,
266     const uschar *current_subject,
267     int start_offset,
268     int *offsets,
269     int offsetcount,
270     int *workspace,
271     int wscount,
272     int ims,
273     int rlevel,
274     int recursing)
275     {
276     stateblock *active_states, *new_states, *temp_states;
277     stateblock *next_active_state, *next_new_state;
278    
279     const uschar *ctypes, *lcc, *fcc;
280     const uschar *ptr;
281     const uschar *end_code;
282    
283     int active_count, new_count, match_count;
284    
285     /* Some fields in the md block are frequently referenced, so we load them into
286     independent variables in the hope that this will perform better. */
287    
288     const uschar *start_subject = md->start_subject;
289     const uschar *end_subject = md->end_subject;
290     const uschar *start_code = md->start_code;
291    
292 nigel 87 #ifdef SUPPORT_UTF8
293 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
294 nigel 87 #endif
295 nigel 77
296     rlevel++;
297     offsetcount &= (-2);
298    
299     wscount -= 2;
300     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
301     (2 * INTS_PER_STATEBLOCK);
302    
303     DPRINTF(("\n%.*s---------------------\n"
304     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
305     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
306    
307     ctypes = md->tables + ctypes_offset;
308     lcc = md->tables + lcc_offset;
309     fcc = md->tables + fcc_offset;
310    
311     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
312    
313     active_states = (stateblock *)(workspace + 2);
314     next_new_state = new_states = active_states + wscount;
315     new_count = 0;
316    
317     /* The first thing in any (sub) pattern is a bracket of some sort. Push all
318     the alternative states onto the list, and find out where the end is. This
319     makes is possible to use this function recursively, when we want to stop at a
320     matching internal ket rather than at the end.
321    
322     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
323     a backward assertion. In that case, we have to find out the maximum amount to
324     move back, and set up each alternative appropriately. */
325    
326     if (this_start_code[1+LINK_SIZE] == OP_REVERSE)
327     {
328     int max_back = 0;
329     int gone_back;
330    
331     end_code = this_start_code;
332     do
333     {
334     int back = GET(end_code, 2+LINK_SIZE);
335     if (back > max_back) max_back = back;
336     end_code += GET(end_code, 1);
337     }
338     while (*end_code == OP_ALT);
339    
340     /* If we can't go back the amount required for the longest lookbehind
341     pattern, go back as far as we can; some alternatives may still be viable. */
342    
343     #ifdef SUPPORT_UTF8
344     /* In character mode we have to step back character by character */
345    
346     if (utf8)
347     {
348     for (gone_back = 0; gone_back < max_back; gone_back++)
349     {
350     if (current_subject <= start_subject) break;
351     current_subject--;
352     while (current_subject > start_subject &&
353     (*current_subject & 0xc0) == 0x80)
354     current_subject--;
355     }
356     }
357     else
358     #endif
359    
360     /* In byte-mode we can do this quickly. */
361    
362     {
363     gone_back = (current_subject - max_back < start_subject)?
364     current_subject - start_subject : max_back;
365     current_subject -= gone_back;
366     }
367    
368     /* Now we can process the individual branches. */
369    
370     end_code = this_start_code;
371     do
372     {
373     int back = GET(end_code, 2+LINK_SIZE);
374     if (back <= gone_back)
375     {
376     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
377     ADD_NEW_DATA(-bstate, 0, gone_back - back);
378     }
379     end_code += GET(end_code, 1);
380     }
381     while (*end_code == OP_ALT);
382     }
383    
384     /* This is the code for a "normal" subpattern (not a backward assertion). The
385     start of a whole pattern is always one of these. If we are at the top level,
386     we may be asked to restart matching from the same point that we reached for a
387     previous partial match. We still have to scan through the top-level branches to
388     find the end state. */
389    
390     else
391     {
392     end_code = this_start_code;
393    
394     /* Restarting */
395    
396     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
397     {
398     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
399     new_count = workspace[1];
400     if (!workspace[0])
401     memcpy(new_states, active_states, new_count * sizeof(stateblock));
402     }
403    
404     /* Not restarting */
405    
406     else
407     {
408     do
409     {
410     ADD_NEW(end_code - start_code + 1 + LINK_SIZE, 0);
411     end_code += GET(end_code, 1);
412     }
413     while (*end_code == OP_ALT);
414     }
415     }
416    
417     workspace[0] = 0; /* Bit indicating which vector is current */
418    
419     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
420    
421     /* Loop for scanning the subject */
422    
423     ptr = current_subject;
424     for (;;)
425     {
426     int i, j;
427 nigel 91 int clen, dlen;
428     unsigned int c, d;
429 nigel 77
430     /* Make the new state list into the active state list and empty the
431     new state list. */
432    
433     temp_states = active_states;
434     active_states = new_states;
435     new_states = temp_states;
436     active_count = new_count;
437     new_count = 0;
438    
439     workspace[0] ^= 1; /* Remember for the restarting feature */
440     workspace[1] = active_count;
441    
442     #ifdef DEBUG
443     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
444     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
445     printf("\"\n");
446    
447     printf("%.*sActive states: ", rlevel*2-2, SP);
448     for (i = 0; i < active_count; i++)
449     printf("%d/%d ", active_states[i].offset, active_states[i].count);
450     printf("\n");
451     #endif
452    
453     /* Set the pointers for adding new states */
454    
455     next_active_state = active_states + active_count;
456     next_new_state = new_states;
457    
458     /* Load the current character from the subject outside the loop, as many
459     different states may want to look at it, and we assume that at least one
460     will. */
461    
462     if (ptr < end_subject)
463     {
464     clen = 1;
465     #ifdef SUPPORT_UTF8
466     if (utf8) { GETCHARLEN(c, ptr, clen); } else
467     #endif /* SUPPORT_UTF8 */
468     c = *ptr;
469     }
470     else
471     {
472     clen = 0; /* At end subject */
473     c = -1;
474     }
475    
476     /* Scan up the active states and act on each one. The result of an action
477     may be to add more states to the currently active list (e.g. on hitting a
478     parenthesis) or it may be to put states on the new list, for considering
479     when we move the character pointer on. */
480    
481     for (i = 0; i < active_count; i++)
482     {
483     stateblock *current_state = active_states + i;
484     const uschar *code;
485     int state_offset = current_state->offset;
486     int count, codevalue;
487 nigel 87 int chartype, script;
488 nigel 77
489     #ifdef DEBUG
490     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
491     if (c < 0) printf("-1\n");
492     else if (c > 32 && c < 127) printf("'%c'\n", c);
493     else printf("0x%02x\n", c);
494     #endif
495    
496     /* This variable is referred to implicity in the ADD_xxx macros. */
497    
498     ims = current_state->ims;
499    
500     /* A negative offset is a special case meaning "hold off going to this
501     (negated) state until the number of characters in the data field have
502     been skipped". */
503    
504     if (state_offset < 0)
505     {
506     if (current_state->data > 0)
507     {
508     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
509     ADD_NEW_DATA(state_offset, current_state->count,
510     current_state->data - 1);
511     continue;
512     }
513     else
514     {
515     current_state->offset = state_offset = -state_offset;
516     }
517     }
518    
519     /* Check for a duplicate state with the same count, and skip if found. */
520    
521     for (j = 0; j < i; j++)
522     {
523     if (active_states[j].offset == state_offset &&
524     active_states[j].count == current_state->count)
525     {
526     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
527     goto NEXT_ACTIVE_STATE;
528     }
529     }
530    
531     /* The state offset is the offset to the opcode */
532    
533     code = start_code + state_offset;
534     codevalue = *code;
535     if (codevalue >= OP_BRA) codevalue = OP_BRA; /* All brackets are equal */
536    
537     /* If this opcode is followed by an inline character, load it. It is
538     tempting to test for the presence of a subject character here, but that
539     is wrong, because sometimes zero repetitions of the subject are
540     permitted.
541    
542     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
543     argument that is not a data character - but is always one byte long.
544     Unfortunately, we have to take special action to deal with \P, \p, and
545     \X in this case. To keep the other cases fast, convert these ones to new
546     opcodes. */
547    
548     if (coptable[codevalue] > 0)
549     {
550     dlen = 1;
551     #ifdef SUPPORT_UTF8
552     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
553     #endif /* SUPPORT_UTF8 */
554     d = code[coptable[codevalue]];
555     if (codevalue >= OP_TYPESTAR)
556     {
557     if (d == OP_ANYBYTE) return PCRE_ERROR_DFA_UITEM;
558     if (d >= OP_NOTPROP)
559     codevalue += (d == OP_EXTUNI)? OP_EXTUNI_EXTRA : OP_PROP_EXTRA;
560     }
561     }
562     else
563     {
564     dlen = 0; /* Not strictly necessary, but compilers moan */
565     d = -1; /* if these variables are not set. */
566     }
567    
568    
569     /* Now process the individual opcodes */
570    
571     switch (codevalue)
572     {
573    
574     /* ========================================================================== */
575     /* Reached a closing bracket. If not at the end of the pattern, carry
576     on with the next opcode. Otherwise, unless we have an empty string and
577     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
578     matches so we always have the longest first. */
579    
580     case OP_KET:
581     case OP_KETRMIN:
582     case OP_KETRMAX:
583     if (code != end_code)
584     {
585     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
586     if (codevalue != OP_KET)
587     {
588     ADD_ACTIVE(state_offset - GET(code, 1), 0);
589     }
590     }
591     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
592     {
593     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
594     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
595     match_count = 0;
596     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
597     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
598     if (offsetcount >= 2)
599     {
600     offsets[0] = current_subject - start_subject;
601     offsets[1] = ptr - start_subject;
602     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
603     offsets[1] - offsets[0], current_subject));
604     }
605     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
606     {
607     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
608     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
609     match_count, rlevel*2-2, SP));
610     return match_count;
611     }
612     }
613     break;
614    
615     /* ========================================================================== */
616     /* These opcodes add to the current list of states without looking
617     at the current character. */
618    
619     /*-----------------------------------------------------------------*/
620     case OP_ALT:
621     do { code += GET(code, 1); } while (*code == OP_ALT);
622     ADD_ACTIVE(code - start_code, 0);
623     break;
624    
625     /*-----------------------------------------------------------------*/
626     case OP_BRA:
627     do
628     {
629     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
630     code += GET(code, 1);
631     }
632     while (*code == OP_ALT);
633     break;
634    
635     /*-----------------------------------------------------------------*/
636     case OP_BRAZERO:
637     case OP_BRAMINZERO:
638     ADD_ACTIVE(state_offset + 1, 0);
639     code += 1 + GET(code, 2);
640     while (*code == OP_ALT) code += GET(code, 1);
641     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
642     break;
643    
644     /*-----------------------------------------------------------------*/
645     case OP_BRANUMBER:
646     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
647     break;
648    
649     /*-----------------------------------------------------------------*/
650     case OP_CIRC:
651     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
652 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
653     ptr >= start_subject + md->nllen &&
654     ptr != end_subject &&
655     IS_NEWLINE(ptr - md->nllen)))
656 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
657     break;
658    
659     /*-----------------------------------------------------------------*/
660     case OP_EOD:
661     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
662     break;
663    
664     /*-----------------------------------------------------------------*/
665     case OP_OPT:
666     ims = code[1];
667     ADD_ACTIVE(state_offset + 2, 0);
668     break;
669    
670     /*-----------------------------------------------------------------*/
671     case OP_SOD:
672     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
673     break;
674    
675     /*-----------------------------------------------------------------*/
676     case OP_SOM:
677     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
678     break;
679    
680    
681     /* ========================================================================== */
682     /* These opcodes inspect the next subject character, and sometimes
683     the previous one as well, but do not have an argument. The variable
684     clen contains the length of the current character and is zero if we are
685     at the end of the subject. */
686    
687     /*-----------------------------------------------------------------*/
688     case OP_ANY:
689 nigel 91 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 ||
690     ptr > end_subject - md->nllen ||
691     !IS_NEWLINE(ptr)))
692 nigel 77 { ADD_NEW(state_offset + 1, 0); }
693     break;
694    
695     /*-----------------------------------------------------------------*/
696     case OP_EODN:
697 nigel 91 if (clen == 0 ||
698     (ptr == end_subject - md->nllen && IS_NEWLINE(ptr)))
699 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
700     break;
701    
702     /*-----------------------------------------------------------------*/
703     case OP_DOLL:
704     if ((md->moptions & PCRE_NOTEOL) == 0)
705     {
706 nigel 91 if (clen == 0 ||
707     (ptr <= end_subject - md->nllen && IS_NEWLINE(ptr) &&
708     ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
709     ))
710 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
711     }
712 nigel 91 else if ((ims & PCRE_MULTILINE) != 0 &&
713     ptr <= end_subject - md->nllen && IS_NEWLINE(ptr))
714 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
715     break;
716    
717     /*-----------------------------------------------------------------*/
718    
719     case OP_DIGIT:
720     case OP_WHITESPACE:
721     case OP_WORDCHAR:
722     if (clen > 0 && c < 256 &&
723     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
724     { ADD_NEW(state_offset + 1, 0); }
725     break;
726    
727     /*-----------------------------------------------------------------*/
728     case OP_NOT_DIGIT:
729     case OP_NOT_WHITESPACE:
730     case OP_NOT_WORDCHAR:
731     if (clen > 0 && (c >= 256 ||
732     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
733     { ADD_NEW(state_offset + 1, 0); }
734     break;
735    
736     /*-----------------------------------------------------------------*/
737     case OP_WORD_BOUNDARY:
738     case OP_NOT_WORD_BOUNDARY:
739     {
740     int left_word, right_word;
741    
742     if (ptr > start_subject)
743     {
744     const uschar *temp = ptr - 1;
745     #ifdef SUPPORT_UTF8
746     if (utf8) BACKCHAR(temp);
747     #endif
748     GETCHARTEST(d, temp);
749     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
750     }
751     else left_word = 0;
752    
753     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
754     else right_word = 0;
755    
756     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
757     { ADD_ACTIVE(state_offset + 1, 0); }
758     }
759     break;
760    
761    
762     #ifdef SUPPORT_UCP
763    
764     /*-----------------------------------------------------------------*/
765     /* Check the next character by Unicode property. We will get here only
766     if the support is in the binary; otherwise a compile-time error occurs.
767     */
768    
769     case OP_PROP:
770     case OP_NOTPROP:
771     if (clen > 0)
772     {
773 nigel 87 BOOL OK;
774     int category = _pcre_ucp_findprop(c, &chartype, &script);
775     switch(code[1])
776 nigel 77 {
777 nigel 87 case PT_ANY:
778     OK = TRUE;
779     break;
780    
781     case PT_LAMP:
782     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
783     break;
784    
785     case PT_GC:
786     OK = category == code[2];
787     break;
788    
789     case PT_PC:
790     OK = chartype == code[2];
791     break;
792    
793     case PT_SC:
794     OK = script == code[2];
795     break;
796    
797     /* Should never occur, but keep compilers from grumbling. */
798    
799     default:
800     OK = codevalue != OP_PROP;
801     break;
802 nigel 77 }
803 nigel 87
804     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
805 nigel 77 }
806     break;
807     #endif
808    
809    
810    
811     /* ========================================================================== */
812     /* These opcodes likewise inspect the subject character, but have an
813     argument that is not a data character. It is one of these opcodes:
814     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
815     OP_NOT_WORDCHAR. The value is loaded into d. */
816    
817     case OP_TYPEPLUS:
818     case OP_TYPEMINPLUS:
819     count = current_state->count; /* Already matched */
820     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
821     if (clen > 0)
822     {
823     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
824     (c < 256 &&
825 nigel 91 (d != OP_ANY ||
826     (ims & PCRE_DOTALL) != 0 ||
827     ptr > end_subject - md->nllen ||
828     !IS_NEWLINE(ptr)
829     ) &&
830 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
831     {
832     count++;
833     ADD_NEW(state_offset, count);
834     }
835     }
836     break;
837    
838     /*-----------------------------------------------------------------*/
839     case OP_TYPEQUERY:
840     case OP_TYPEMINQUERY:
841     ADD_ACTIVE(state_offset + 2, 0);
842     if (clen > 0)
843     {
844     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
845     (c < 256 &&
846 nigel 91 (d != OP_ANY ||
847     (ims & PCRE_DOTALL) != 0 ||
848     ptr > end_subject - md->nllen ||
849     !IS_NEWLINE(ptr)
850     ) &&
851 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
852     {
853     ADD_NEW(state_offset + 2, 0);
854     }
855     }
856     break;
857    
858     /*-----------------------------------------------------------------*/
859     case OP_TYPESTAR:
860     case OP_TYPEMINSTAR:
861     ADD_ACTIVE(state_offset + 2, 0);
862     if (clen > 0)
863     {
864     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
865     (c < 256 &&
866 nigel 91 (d != OP_ANY ||
867     (ims & PCRE_DOTALL) != 0 ||
868     ptr > end_subject - md->nllen ||
869     !IS_NEWLINE(ptr)
870     ) &&
871 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
872     {
873     ADD_NEW(state_offset, 0);
874     }
875     }
876     break;
877    
878     /*-----------------------------------------------------------------*/
879     case OP_TYPEEXACT:
880     case OP_TYPEUPTO:
881     case OP_TYPEMINUPTO:
882     if (codevalue != OP_TYPEEXACT)
883     { ADD_ACTIVE(state_offset + 4, 0); }
884     count = current_state->count; /* Number already matched */
885     if (clen > 0)
886     {
887     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
888     (c < 256 &&
889 nigel 91 (d != OP_ANY ||
890     (ims & PCRE_DOTALL) != 0 ||
891     ptr > end_subject - md->nllen ||
892     !IS_NEWLINE(ptr)
893     ) &&
894 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
895     {
896     if (++count >= GET2(code, 1))
897     { ADD_NEW(state_offset + 4, 0); }
898     else
899     { ADD_NEW(state_offset, count); }
900     }
901     }
902     break;
903    
904     /* ========================================================================== */
905     /* These are virtual opcodes that are used when something like
906     OP_TYPEPLUS has OP_PROP, OP_NOTPROP, or OP_EXTUNI as its argument. It
907     keeps the code above fast for the other cases. The argument is in the
908     d variable. */
909    
910     case OP_PROP_EXTRA + OP_TYPEPLUS:
911     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
912     count = current_state->count; /* Already matched */
913 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
914 nigel 77 if (clen > 0)
915     {
916 nigel 87 BOOL OK;
917     int category = _pcre_ucp_findprop(c, &chartype, &script);
918     switch(code[2])
919     {
920     case PT_ANY:
921     OK = TRUE;
922     break;
923    
924     case PT_LAMP:
925     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
926     break;
927    
928     case PT_GC:
929     OK = category == code[3];
930     break;
931    
932     case PT_PC:
933     OK = chartype == code[3];
934     break;
935    
936     case PT_SC:
937     OK = script == code[3];
938     break;
939    
940     /* Should never occur, but keep compilers from grumbling. */
941    
942     default:
943     OK = codevalue != OP_PROP;
944     break;
945     }
946    
947     if (OK == (d == OP_PROP)) { count++; ADD_NEW(state_offset, count); }
948 nigel 77 }
949     break;
950    
951     /*-----------------------------------------------------------------*/
952     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
953     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
954     count = current_state->count; /* Already matched */
955     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
956 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
957 nigel 77 {
958     const uschar *nptr = ptr + clen;
959     int ncount = 0;
960     while (nptr < end_subject)
961     {
962     int nd;
963     int ndlen = 1;
964     GETCHARLEN(nd, nptr, ndlen);
965 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
966 nigel 77 ncount++;
967     nptr += ndlen;
968     }
969     count++;
970     ADD_NEW_DATA(-state_offset, count, ncount);
971     }
972     break;
973    
974     /*-----------------------------------------------------------------*/
975     case OP_PROP_EXTRA + OP_TYPEQUERY:
976     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
977 nigel 87 count = 4;
978 nigel 77 goto QS1;
979    
980     case OP_PROP_EXTRA + OP_TYPESTAR:
981     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
982     count = 0;
983    
984     QS1:
985    
986 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
987 nigel 77 if (clen > 0)
988     {
989 nigel 87 BOOL OK;
990     int category = _pcre_ucp_findprop(c, &chartype, &script);
991     switch(code[2])
992     {
993     case PT_ANY:
994     OK = TRUE;
995     break;
996    
997     case PT_LAMP:
998     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
999     break;
1000    
1001     case PT_GC:
1002     OK = category == code[3];
1003     break;
1004    
1005     case PT_PC:
1006     OK = chartype == code[3];
1007     break;
1008    
1009     case PT_SC:
1010     OK = script == code[3];
1011     break;
1012    
1013     /* Should never occur, but keep compilers from grumbling. */
1014    
1015     default:
1016     OK = codevalue != OP_PROP;
1017     break;
1018     }
1019    
1020     if (OK == (d == OP_PROP)) { ADD_NEW(state_offset + count, 0); }
1021 nigel 77 }
1022     break;
1023    
1024     /*-----------------------------------------------------------------*/
1025     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1026     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1027     count = 2;
1028     goto QS2;
1029    
1030     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1031     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1032     count = 0;
1033    
1034     QS2:
1035    
1036     ADD_ACTIVE(state_offset + 2, 0);
1037 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1038 nigel 77 {
1039     const uschar *nptr = ptr + clen;
1040     int ncount = 0;
1041     while (nptr < end_subject)
1042     {
1043     int nd;
1044     int ndlen = 1;
1045     GETCHARLEN(nd, nptr, ndlen);
1046 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1047 nigel 77 ncount++;
1048     nptr += ndlen;
1049     }
1050     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1051     }
1052     break;
1053    
1054     /*-----------------------------------------------------------------*/
1055     case OP_PROP_EXTRA + OP_TYPEEXACT:
1056     case OP_PROP_EXTRA + OP_TYPEUPTO:
1057     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1058     if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1059 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1060 nigel 77 count = current_state->count; /* Number already matched */
1061     if (clen > 0)
1062     {
1063 nigel 87 BOOL OK;
1064     int category = _pcre_ucp_findprop(c, &chartype, &script);
1065     switch(code[4])
1066 nigel 77 {
1067 nigel 87 case PT_ANY:
1068     OK = TRUE;
1069     break;
1070    
1071     case PT_LAMP:
1072     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1073     break;
1074    
1075     case PT_GC:
1076     OK = category == code[5];
1077     break;
1078    
1079     case PT_PC:
1080     OK = chartype == code[5];
1081     break;
1082    
1083     case PT_SC:
1084     OK = script == code[5];
1085     break;
1086    
1087     /* Should never occur, but keep compilers from grumbling. */
1088    
1089     default:
1090     OK = codevalue != OP_PROP;
1091     break;
1092     }
1093    
1094     if (OK == (d == OP_PROP))
1095     {
1096 nigel 77 if (++count >= GET2(code, 1))
1097 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1098 nigel 77 else
1099     { ADD_NEW(state_offset, count); }
1100     }
1101     }
1102     break;
1103    
1104     /*-----------------------------------------------------------------*/
1105     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1106     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1107     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1108     if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1109     { ADD_ACTIVE(state_offset + 4, 0); }
1110     count = current_state->count; /* Number already matched */
1111 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1112 nigel 77 {
1113     const uschar *nptr = ptr + clen;
1114     int ncount = 0;
1115     while (nptr < end_subject)
1116     {
1117     int nd;
1118     int ndlen = 1;
1119     GETCHARLEN(nd, nptr, ndlen);
1120 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1121 nigel 77 ncount++;
1122     nptr += ndlen;
1123     }
1124     if (++count >= GET2(code, 1))
1125     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1126     else
1127     { ADD_NEW_DATA(-state_offset, count, ncount); }
1128     }
1129     break;
1130    
1131     /* ========================================================================== */
1132     /* These opcodes are followed by a character that is usually compared
1133     to the current subject character; it is loaded into d. We still get
1134     here even if there is no subject character, because in some cases zero
1135     repetitions are permitted. */
1136    
1137     /*-----------------------------------------------------------------*/
1138     case OP_CHAR:
1139     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1140     break;
1141    
1142     /*-----------------------------------------------------------------*/
1143     case OP_CHARNC:
1144     if (clen == 0) break;
1145    
1146     #ifdef SUPPORT_UTF8
1147     if (utf8)
1148     {
1149     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1150     {
1151 nigel 87 int othercase;
1152 nigel 77 if (c < 128) othercase = fcc[c]; else
1153    
1154     /* If we have Unicode property support, we can use it to test the
1155 nigel 87 other case of the character. */
1156 nigel 77
1157     #ifdef SUPPORT_UCP
1158 nigel 87 othercase = _pcre_ucp_othercase(c);
1159     #else
1160     othercase = -1;
1161 nigel 77 #endif
1162    
1163     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1164     }
1165     }
1166     else
1167     #endif /* SUPPORT_UTF8 */
1168    
1169     /* Non-UTF-8 mode */
1170     {
1171     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1172     }
1173     break;
1174    
1175    
1176     #ifdef SUPPORT_UCP
1177     /*-----------------------------------------------------------------*/
1178     /* This is a tricky one because it can match more than one character.
1179     Find out how many characters to skip, and then set up a negative state
1180     to wait for them to pass before continuing. */
1181    
1182     case OP_EXTUNI:
1183 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1184 nigel 77 {
1185     const uschar *nptr = ptr + clen;
1186     int ncount = 0;
1187     while (nptr < end_subject)
1188     {
1189     int nclen = 1;
1190     GETCHARLEN(c, nptr, nclen);
1191 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1192 nigel 77 ncount++;
1193     nptr += nclen;
1194     }
1195     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1196     }
1197     break;
1198     #endif
1199    
1200     /*-----------------------------------------------------------------*/
1201     /* Match a negated single character. This is only used for one-byte
1202     characters, that is, we know that d < 256. The character we are
1203     checking (c) can be multibyte. */
1204    
1205     case OP_NOT:
1206     if (clen > 0)
1207     {
1208     int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1209     if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1210     }
1211     break;
1212    
1213     /*-----------------------------------------------------------------*/
1214     case OP_PLUS:
1215     case OP_MINPLUS:
1216     case OP_NOTPLUS:
1217     case OP_NOTMINPLUS:
1218     count = current_state->count; /* Already matched */
1219     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1220     if (clen > 0)
1221     {
1222     int otherd = -1;
1223     if ((ims & PCRE_CASELESS) != 0)
1224     {
1225     #ifdef SUPPORT_UTF8
1226 nigel 87 if (utf8 && d >= 128)
1227 nigel 77 {
1228     #ifdef SUPPORT_UCP
1229 nigel 87 otherd = _pcre_ucp_othercase(d);
1230 nigel 77 #endif /* SUPPORT_UCP */
1231     }
1232     else
1233     #endif /* SUPPORT_UTF8 */
1234     otherd = fcc[d];
1235     }
1236     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1237     { count++; ADD_NEW(state_offset, count); }
1238     }
1239     break;
1240    
1241     /*-----------------------------------------------------------------*/
1242     case OP_QUERY:
1243     case OP_MINQUERY:
1244     case OP_NOTQUERY:
1245     case OP_NOTMINQUERY:
1246     ADD_ACTIVE(state_offset + dlen + 1, 0);
1247     if (clen > 0)
1248     {
1249     int otherd = -1;
1250 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1251 nigel 77 {
1252     #ifdef SUPPORT_UTF8
1253 nigel 87 if (utf8 && d >= 128)
1254 nigel 77 {
1255     #ifdef SUPPORT_UCP
1256 nigel 87 otherd = _pcre_ucp_othercase(d);
1257 nigel 77 #endif /* SUPPORT_UCP */
1258     }
1259     else
1260     #endif /* SUPPORT_UTF8 */
1261     otherd = fcc[d];
1262     }
1263     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1264     { ADD_NEW(state_offset + dlen + 1, 0); }
1265     }
1266     break;
1267    
1268     /*-----------------------------------------------------------------*/
1269     case OP_STAR:
1270     case OP_MINSTAR:
1271     case OP_NOTSTAR:
1272     case OP_NOTMINSTAR:
1273     ADD_ACTIVE(state_offset + dlen + 1, 0);
1274     if (clen > 0)
1275     {
1276     int otherd = -1;
1277 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1278 nigel 77 {
1279     #ifdef SUPPORT_UTF8
1280 nigel 87 if (utf8 && d >= 128)
1281 nigel 77 {
1282     #ifdef SUPPORT_UCP
1283 nigel 87 otherd = _pcre_ucp_othercase(d);
1284 nigel 77 #endif /* SUPPORT_UCP */
1285     }
1286     else
1287     #endif /* SUPPORT_UTF8 */
1288     otherd = fcc[d];
1289     }
1290     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1291     { ADD_NEW(state_offset, 0); }
1292     }
1293     break;
1294    
1295     /*-----------------------------------------------------------------*/
1296     case OP_EXACT:
1297     case OP_UPTO:
1298     case OP_MINUPTO:
1299     case OP_NOTEXACT:
1300     case OP_NOTUPTO:
1301     case OP_NOTMINUPTO:
1302     if (codevalue != OP_EXACT && codevalue != OP_NOTEXACT)
1303     { ADD_ACTIVE(state_offset + dlen + 3, 0); }
1304     count = current_state->count; /* Number already matched */
1305     if (clen > 0)
1306     {
1307     int otherd = -1;
1308     if ((ims & PCRE_CASELESS) != 0)
1309     {
1310     #ifdef SUPPORT_UTF8
1311 nigel 87 if (utf8 && d >= 128)
1312 nigel 77 {
1313     #ifdef SUPPORT_UCP
1314 nigel 87 otherd = _pcre_ucp_othercase(d);
1315 nigel 77 #endif /* SUPPORT_UCP */
1316     }
1317     else
1318     #endif /* SUPPORT_UTF8 */
1319     otherd = fcc[d];
1320     }
1321     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1322     {
1323     if (++count >= GET2(code, 1))
1324     { ADD_NEW(state_offset + dlen + 3, 0); }
1325     else
1326     { ADD_NEW(state_offset, count); }
1327     }
1328     }
1329     break;
1330    
1331    
1332     /* ========================================================================== */
1333     /* These are the class-handling opcodes */
1334    
1335     case OP_CLASS:
1336     case OP_NCLASS:
1337     case OP_XCLASS:
1338     {
1339     BOOL isinclass = FALSE;
1340     int next_state_offset;
1341     const uschar *ecode;
1342    
1343     /* For a simple class, there is always just a 32-byte table, and we
1344     can set isinclass from it. */
1345    
1346     if (codevalue != OP_XCLASS)
1347     {
1348     ecode = code + 33;
1349     if (clen > 0)
1350     {
1351     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
1352     ((code[1 + c/8] & (1 << (c&7))) != 0);
1353     }
1354     }
1355    
1356     /* An extended class may have a table or a list of single characters,
1357     ranges, or both, and it may be positive or negative. There's a
1358     function that sorts all this out. */
1359    
1360     else
1361     {
1362     ecode = code + GET(code, 1);
1363     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
1364     }
1365    
1366     /* At this point, isinclass is set for all kinds of class, and ecode
1367     points to the byte after the end of the class. If there is a
1368     quantifier, this is where it will be. */
1369    
1370     next_state_offset = ecode - start_code;
1371    
1372     switch (*ecode)
1373     {
1374     case OP_CRSTAR:
1375     case OP_CRMINSTAR:
1376     ADD_ACTIVE(next_state_offset + 1, 0);
1377     if (isinclass) { ADD_NEW(state_offset, 0); }
1378     break;
1379    
1380     case OP_CRPLUS:
1381     case OP_CRMINPLUS:
1382     count = current_state->count; /* Already matched */
1383     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
1384     if (isinclass) { count++; ADD_NEW(state_offset, count); }
1385     break;
1386    
1387     case OP_CRQUERY:
1388     case OP_CRMINQUERY:
1389     ADD_ACTIVE(next_state_offset + 1, 0);
1390     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
1391     break;
1392    
1393     case OP_CRRANGE:
1394     case OP_CRMINRANGE:
1395     count = current_state->count; /* Already matched */
1396     if (count >= GET2(ecode, 1))
1397     { ADD_ACTIVE(next_state_offset + 5, 0); }
1398     if (isinclass)
1399     {
1400 nigel 91 int max = GET2(ecode, 3);
1401     if (++count >= max && max != 0) /* Max 0 => no limit */
1402 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
1403     else
1404     { ADD_NEW(state_offset, count); }
1405     }
1406     break;
1407    
1408     default:
1409     if (isinclass) { ADD_NEW(next_state_offset, 0); }
1410     break;
1411     }
1412     }
1413     break;
1414    
1415     /* ========================================================================== */
1416     /* These are the opcodes for fancy brackets of various kinds. We have
1417     to use recursion in order to handle them. */
1418    
1419     case OP_ASSERT:
1420     case OP_ASSERT_NOT:
1421     case OP_ASSERTBACK:
1422     case OP_ASSERTBACK_NOT:
1423     {
1424     int rc;
1425     int local_offsets[2];
1426     int local_workspace[1000];
1427     const uschar *endasscode = code + GET(code, 1);
1428    
1429     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1430    
1431     rc = internal_dfa_exec(
1432     md, /* static match data */
1433     code, /* this subexpression's code */
1434     ptr, /* where we currently are */
1435     ptr - start_subject, /* start offset */
1436     local_offsets, /* offset vector */
1437     sizeof(local_offsets)/sizeof(int), /* size of same */
1438     local_workspace, /* workspace vector */
1439     sizeof(local_workspace)/sizeof(int), /* size of same */
1440     ims, /* the current ims flags */
1441     rlevel, /* function recursion level */
1442     recursing); /* pass on regex recursion */
1443    
1444     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
1445     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1446     }
1447     break;
1448    
1449     /*-----------------------------------------------------------------*/
1450     case OP_COND:
1451     {
1452     int local_offsets[1000];
1453     int local_workspace[1000];
1454     int condcode = code[LINK_SIZE+1];
1455    
1456     /* The only supported version of OP_CREF is for the value 0xffff, which
1457     means "test if in a recursion". */
1458    
1459     if (condcode == OP_CREF)
1460     {
1461     int value = GET2(code, LINK_SIZE+2);
1462     if (value != 0xffff) return PCRE_ERROR_DFA_UCOND;
1463     if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
1464     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1465     }
1466    
1467     /* Otherwise, the condition is an assertion */
1468    
1469     else
1470     {
1471     int rc;
1472     const uschar *asscode = code + LINK_SIZE + 1;
1473     const uschar *endasscode = asscode + GET(asscode, 1);
1474    
1475     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
1476    
1477     rc = internal_dfa_exec(
1478     md, /* fixed match data */
1479     asscode, /* this subexpression's code */
1480     ptr, /* where we currently are */
1481     ptr - start_subject, /* start offset */
1482     local_offsets, /* offset vector */
1483     sizeof(local_offsets)/sizeof(int), /* size of same */
1484     local_workspace, /* workspace vector */
1485     sizeof(local_workspace)/sizeof(int), /* size of same */
1486     ims, /* the current ims flags */
1487     rlevel, /* function recursion level */
1488     recursing); /* pass on regex recursion */
1489    
1490     if ((rc >= 0) ==
1491     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
1492     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
1493     else
1494     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
1495     }
1496     }
1497     break;
1498    
1499     /*-----------------------------------------------------------------*/
1500     case OP_RECURSE:
1501     {
1502     int local_offsets[1000];
1503     int local_workspace[1000];
1504     int rc;
1505    
1506     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
1507     recursing + 1));
1508    
1509     rc = internal_dfa_exec(
1510     md, /* fixed match data */
1511     start_code + GET(code, 1), /* this subexpression's code */
1512     ptr, /* where we currently are */
1513     ptr - start_subject, /* start offset */
1514     local_offsets, /* offset vector */
1515     sizeof(local_offsets)/sizeof(int), /* size of same */
1516     local_workspace, /* workspace vector */
1517     sizeof(local_workspace)/sizeof(int), /* size of same */
1518     ims, /* the current ims flags */
1519     rlevel, /* function recursion level */
1520     recursing + 1); /* regex recurse level */
1521    
1522     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
1523     recursing + 1, rc));
1524    
1525     /* Ran out of internal offsets */
1526    
1527     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
1528    
1529     /* For each successful matched substring, set up the next state with a
1530     count of characters to skip before trying it. Note that the count is in
1531     characters, not bytes. */
1532    
1533     if (rc > 0)
1534     {
1535     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
1536     {
1537     const uschar *p = start_subject + local_offsets[rc];
1538     const uschar *pp = start_subject + local_offsets[rc+1];
1539     int charcount = local_offsets[rc+1] - local_offsets[rc];
1540     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1541     if (charcount > 0)
1542     {
1543     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
1544     }
1545     else
1546     {
1547     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
1548     }
1549     }
1550     }
1551     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1552     }
1553     break;
1554    
1555     /*-----------------------------------------------------------------*/
1556     case OP_ONCE:
1557     {
1558     int local_offsets[2];
1559     int local_workspace[1000];
1560    
1561     int rc = internal_dfa_exec(
1562     md, /* fixed match data */
1563     code, /* this subexpression's code */
1564     ptr, /* where we currently are */
1565     ptr - start_subject, /* start offset */
1566     local_offsets, /* offset vector */
1567     sizeof(local_offsets)/sizeof(int), /* size of same */
1568     local_workspace, /* workspace vector */
1569     sizeof(local_workspace)/sizeof(int), /* size of same */
1570     ims, /* the current ims flags */
1571     rlevel, /* function recursion level */
1572     recursing); /* pass on regex recursion */
1573    
1574     if (rc >= 0)
1575     {
1576     const uschar *end_subpattern = code;
1577     int charcount = local_offsets[1] - local_offsets[0];
1578     int next_state_offset, repeat_state_offset;
1579    
1580     do { end_subpattern += GET(end_subpattern, 1); }
1581     while (*end_subpattern == OP_ALT);
1582     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
1583    
1584     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
1585     arrange for the repeat state also to be added to the relevant list.
1586     Calculate the offset, or set -1 for no repeat. */
1587    
1588     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
1589     *end_subpattern == OP_KETRMIN)?
1590     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
1591    
1592     /* If we have matched an empty string, add the next state at the
1593     current character pointer. This is important so that the duplicate
1594     checking kicks in, which is what breaks infinite loops that match an
1595     empty string. */
1596    
1597     if (charcount == 0)
1598     {
1599     ADD_ACTIVE(next_state_offset, 0);
1600     }
1601    
1602     /* Optimization: if there are no more active states, and there
1603     are no new states yet set up, then skip over the subject string
1604     right here, to save looping. Otherwise, set up the new state to swing
1605     into action when the end of the substring is reached. */
1606    
1607     else if (i + 1 >= active_count && new_count == 0)
1608     {
1609     ptr += charcount;
1610     clen = 0;
1611     ADD_NEW(next_state_offset, 0);
1612    
1613     /* If we are adding a repeat state at the new character position,
1614     we must fudge things so that it is the only current state.
1615     Otherwise, it might be a duplicate of one we processed before, and
1616     that would cause it to be skipped. */
1617    
1618     if (repeat_state_offset >= 0)
1619     {
1620     next_active_state = active_states;
1621     active_count = 0;
1622     i = -1;
1623     ADD_ACTIVE(repeat_state_offset, 0);
1624     }
1625     }
1626     else
1627     {
1628     const uschar *p = start_subject + local_offsets[0];
1629     const uschar *pp = start_subject + local_offsets[1];
1630     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
1631     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
1632     if (repeat_state_offset >= 0)
1633     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
1634     }
1635    
1636     }
1637     else if (rc != PCRE_ERROR_NOMATCH) return rc;
1638     }
1639     break;
1640    
1641    
1642     /* ========================================================================== */
1643     /* Handle callouts */
1644    
1645     case OP_CALLOUT:
1646     if (pcre_callout != NULL)
1647     {
1648     int rrc;
1649     pcre_callout_block cb;
1650     cb.version = 1; /* Version 1 of the callout block */
1651     cb.callout_number = code[1];
1652     cb.offset_vector = offsets;
1653 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
1654 nigel 77 cb.subject_length = end_subject - start_subject;
1655     cb.start_match = current_subject - start_subject;
1656     cb.current_position = ptr - start_subject;
1657     cb.pattern_position = GET(code, 2);
1658     cb.next_item_length = GET(code, 2 + LINK_SIZE);
1659     cb.capture_top = 1;
1660     cb.capture_last = -1;
1661     cb.callout_data = md->callout_data;
1662     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
1663     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
1664     }
1665     break;
1666    
1667    
1668     /* ========================================================================== */
1669     default: /* Unsupported opcode */
1670     return PCRE_ERROR_DFA_UITEM;
1671     }
1672    
1673     NEXT_ACTIVE_STATE: continue;
1674    
1675     } /* End of loop scanning active states */
1676    
1677     /* We have finished the processing at the current subject character. If no
1678     new states have been set for the next character, we have found all the
1679     matches that we are going to find. If we are at the top level and partial
1680     matching has been requested, check for appropriate conditions. */
1681    
1682     if (new_count <= 0)
1683     {
1684     if (match_count < 0 && /* No matches found */
1685     rlevel == 1 && /* Top level match function */
1686     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
1687     ptr >= end_subject && /* Reached end of subject */
1688     ptr > current_subject) /* Matched non-empty string */
1689     {
1690     if (offsetcount >= 2)
1691     {
1692     offsets[0] = current_subject - start_subject;
1693     offsets[1] = end_subject - start_subject;
1694     }
1695     match_count = PCRE_ERROR_PARTIAL;
1696     }
1697    
1698     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
1699     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
1700     rlevel*2-2, SP));
1701 nigel 91 break; /* In effect, "return", but see the comment below */
1702 nigel 77 }
1703    
1704     /* One or more states are active for the next character. */
1705    
1706     ptr += clen; /* Advance to next subject character */
1707     } /* Loop to move along the subject string */
1708    
1709 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
1710     if we use "return" above, we have compiler trouble. Some compilers warn if
1711     there's nothing here because they think the function doesn't return a value. On
1712     the other hand, if we put a dummy statement here, some more clever compilers
1713     complain that it can't be reached. Sigh. */
1714 nigel 77
1715 nigel 91 return match_count;
1716 nigel 77 }
1717    
1718    
1719    
1720    
1721     /*************************************************
1722     * Execute a Regular Expression - DFA engine *
1723     *************************************************/
1724    
1725     /* This external function applies a compiled re to a subject string using a DFA
1726     engine. This function calls the internal function multiple times if the pattern
1727     is not anchored.
1728    
1729     Arguments:
1730     argument_re points to the compiled expression
1731     extra_data points to extra data (study data, callout data, tables) or is NULL
1732     subject points to the subject string
1733     length length of subject string (may contain binary zeros)
1734     start_offset where to start in the subject string
1735     options option bits
1736     offsets vector of match offsets
1737     offsetcount size of same
1738     workspace workspace vector
1739     wscount size of same
1740    
1741     Returns: > 0 => number of match offset pairs placed in offsets
1742     = 0 => offsets overflowed; longest matches are present
1743     -1 => failed to match
1744     < -1 => some kind of unexpected problem
1745     */
1746    
1747 nigel 87 PCRE_DATA_SCOPE int
1748 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
1749     const char *subject, int length, int start_offset, int options, int *offsets,
1750     int offsetcount, int *workspace, int wscount)
1751     {
1752     real_pcre *re = (real_pcre *)argument_re;
1753     dfa_match_data match_block;
1754 nigel 91 dfa_match_data *md = &match_block;
1755 nigel 77 BOOL utf8, anchored, startline, firstline;
1756     const uschar *current_subject, *end_subject, *lcc;
1757    
1758     pcre_study_data internal_study;
1759     const pcre_study_data *study = NULL;
1760     real_pcre internal_re;
1761    
1762     const uschar *req_byte_ptr;
1763     const uschar *start_bits = NULL;
1764     BOOL first_byte_caseless = FALSE;
1765     BOOL req_byte_caseless = FALSE;
1766     int first_byte = -1;
1767     int req_byte = -1;
1768     int req_byte2 = -1;
1769 nigel 91 int newline;
1770 nigel 77
1771     /* Plausibility checks */
1772    
1773     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1774     if (re == NULL || subject == NULL || workspace == NULL ||
1775     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
1776     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1777     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
1778    
1779     /* We need to find the pointer to any study data before we test for byte
1780     flipping, so we scan the extra_data block first. This may set two fields in the
1781     match block, so we must initialize them beforehand. However, the other fields
1782     in the match block must not be set until after the byte flipping. */
1783    
1784 nigel 91 md->tables = re->tables;
1785     md->callout_data = NULL;
1786 nigel 77
1787     if (extra_data != NULL)
1788     {
1789     unsigned int flags = extra_data->flags;
1790     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
1791     study = (const pcre_study_data *)extra_data->study_data;
1792     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
1793 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
1794     return PCRE_ERROR_DFA_UMLIMIT;
1795 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
1796 nigel 91 md->callout_data = extra_data->callout_data;
1797 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
1798 nigel 91 md->tables = extra_data->tables;
1799 nigel 77 }
1800    
1801     /* Check that the first field in the block is the magic number. If it is not,
1802     test for a regex that was compiled on a host of opposite endianness. If this is
1803     the case, flipped values are put in internal_re and internal_study if there was
1804     study data too. */
1805    
1806     if (re->magic_number != MAGIC_NUMBER)
1807     {
1808     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
1809     if (re == NULL) return PCRE_ERROR_BADMAGIC;
1810     if (study != NULL) study = &internal_study;
1811     }
1812    
1813     /* Set some local values */
1814    
1815     current_subject = (const unsigned char *)subject + start_offset;
1816     end_subject = (const unsigned char *)subject + length;
1817     req_byte_ptr = current_subject - 1;
1818    
1819 nigel 91 #ifdef SUPPORT_UTF8
1820 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
1821 nigel 91 #else
1822     utf8 = FALSE;
1823     #endif
1824 nigel 77
1825 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
1826     (re->options & PCRE_ANCHORED) != 0;
1827    
1828 nigel 77 /* The remaining fixed data for passing around. */
1829    
1830 nigel 91 md->start_code = (const uschar *)argument_re +
1831 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
1832 nigel 91 md->start_subject = (const unsigned char *)subject;
1833     md->end_subject = end_subject;
1834     md->moptions = options;
1835     md->poptions = re->options;
1836 nigel 77
1837 nigel 91 /* Handle different types of newline. The two bits give four cases. If nothing
1838     is set at run time, whatever was used at compile time applies. */
1839    
1840     switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
1841     PCRE_NEWLINE_CRLF)
1842     {
1843     default: newline = NEWLINE; break; /* Compile-time default */
1844     case PCRE_NEWLINE_CR: newline = '\r'; break;
1845     case PCRE_NEWLINE_LF: newline = '\n'; break;
1846     case PCRE_NEWLINE_CR+
1847     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
1848     }
1849    
1850     if (newline > 255)
1851     {
1852     md->nllen = 2;
1853     md->nl[0] = (newline >> 8) & 255;
1854     md->nl[1] = newline & 255;
1855     }
1856     else
1857     {
1858     md->nllen = 1;
1859     md->nl[0] = newline;
1860     }
1861    
1862 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
1863     back the character offset. */
1864    
1865     #ifdef SUPPORT_UTF8
1866     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
1867     {
1868     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
1869     return PCRE_ERROR_BADUTF8;
1870     if (start_offset > 0 && start_offset < length)
1871     {
1872     int tb = ((uschar *)subject)[start_offset];
1873     if (tb > 127)
1874     {
1875     tb &= 0xc0;
1876     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
1877     }
1878     }
1879     }
1880     #endif
1881    
1882     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
1883     is a feature that makes it possible to save compiled regex and re-use them
1884     in other programs later. */
1885    
1886 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
1887 nigel 77
1888     /* The lower casing table and the "must be at the start of a line" flag are
1889     used in a loop when finding where to start. */
1890    
1891 nigel 91 lcc = md->tables + lcc_offset;
1892 nigel 77 startline = (re->options & PCRE_STARTLINE) != 0;
1893     firstline = (re->options & PCRE_FIRSTLINE) != 0;
1894    
1895     /* Set up the first character to match, if available. The first_byte value is
1896     never set for an anchored regular expression, but the anchoring may be forced
1897     at run time, so we have to test for anchoring. The first char may be unset for
1898     an unanchored pattern, of course. If there's no first char and the pattern was
1899     studied, there may be a bitmap of possible first characters. */
1900    
1901     if (!anchored)
1902     {
1903     if ((re->options & PCRE_FIRSTSET) != 0)
1904     {
1905     first_byte = re->first_byte & 255;
1906     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
1907     first_byte = lcc[first_byte];
1908     }
1909     else
1910     {
1911     if (startline && study != NULL &&
1912     (study->options & PCRE_STUDY_MAPPED) != 0)
1913     start_bits = study->start_bits;
1914     }
1915     }
1916    
1917     /* For anchored or unanchored matches, there may be a "last known required
1918     character" set. */
1919    
1920     if ((re->options & PCRE_REQCHSET) != 0)
1921     {
1922     req_byte = re->req_byte & 255;
1923     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
1924 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
1925 nigel 77 }
1926    
1927     /* Call the main matching function, looping for a non-anchored regex after a
1928     failed match. Unless restarting, optimize by moving to the first match
1929     character if possible, when not anchored. Then unless wanting a partial match,
1930     check for a required later character. */
1931    
1932     for (;;)
1933     {
1934     int rc;
1935    
1936     if ((options & PCRE_DFA_RESTART) == 0)
1937     {
1938     const uschar *save_end_subject = end_subject;
1939    
1940     /* Advance to a unique first char if possible. If firstline is TRUE, the
1941     start of the match is constrained to the first line of a multiline string.
1942 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
1943     scanning at a newline. If the match fails at the newline, later code breaks
1944     this loop. */
1945 nigel 77
1946     if (firstline)
1947     {
1948     const uschar *t = current_subject;
1949 nigel 91 while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
1950 nigel 77 end_subject = t;
1951     }
1952    
1953     if (first_byte >= 0)
1954     {
1955     if (first_byte_caseless)
1956     while (current_subject < end_subject &&
1957     lcc[*current_subject] != first_byte)
1958     current_subject++;
1959     else
1960     while (current_subject < end_subject && *current_subject != first_byte)
1961     current_subject++;
1962     }
1963    
1964 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
1965 nigel 77
1966     else if (startline)
1967     {
1968 nigel 91 if (current_subject > md->start_subject + md->nllen +
1969     start_offset)
1970 nigel 77 {
1971 nigel 91 while (current_subject <= end_subject &&
1972     !IS_NEWLINE(current_subject - md->nllen))
1973 nigel 77 current_subject++;
1974     }
1975     }
1976    
1977     /* Or to a non-unique first char after study */
1978    
1979     else if (start_bits != NULL)
1980     {
1981     while (current_subject < end_subject)
1982     {
1983     register unsigned int c = *current_subject;
1984     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
1985     else break;
1986     }
1987     }
1988    
1989     /* Restore fudged end_subject */
1990    
1991     end_subject = save_end_subject;
1992     }
1993    
1994     /* If req_byte is set, we know that that character must appear in the subject
1995     for the match to succeed. If the first character is set, req_byte must be
1996     later in the subject; otherwise the test starts at the match point. This
1997     optimization can save a huge amount of work in patterns with nested unlimited
1998     repeats that aren't going to match. Writing separate code for cased/caseless
1999     versions makes it go faster, as does using an autoincrement and backing off
2000     on a match.
2001    
2002     HOWEVER: when the subject string is very, very long, searching to its end can
2003     take a long time, and give bad performance on quite ordinary patterns. This
2004     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2005     don't do this when the string is sufficiently long.
2006    
2007     ALSO: this processing is disabled when partial matching is requested.
2008     */
2009    
2010     if (req_byte >= 0 &&
2011     end_subject - current_subject < REQ_BYTE_MAX &&
2012     (options & PCRE_PARTIAL) == 0)
2013     {
2014     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2015    
2016     /* We don't need to repeat the search if we haven't yet reached the
2017     place we found it at last time. */
2018    
2019     if (p > req_byte_ptr)
2020     {
2021     if (req_byte_caseless)
2022     {
2023     while (p < end_subject)
2024     {
2025     register int pp = *p++;
2026     if (pp == req_byte || pp == req_byte2) { p--; break; }
2027     }
2028     }
2029     else
2030     {
2031     while (p < end_subject)
2032     {
2033     if (*p++ == req_byte) { p--; break; }
2034     }
2035     }
2036    
2037     /* If we can't find the required character, break the matching loop,
2038     which will cause a return of PCRE_ERROR_NOMATCH. */
2039    
2040     if (p >= end_subject) break;
2041    
2042     /* If we have found the required character, save the point where we
2043     found it, so that we don't search again next time round the loop if
2044     the start hasn't passed this character yet. */
2045    
2046     req_byte_ptr = p;
2047     }
2048     }
2049    
2050     /* OK, now we can do the business */
2051    
2052     rc = internal_dfa_exec(
2053 nigel 91 md, /* fixed match data */
2054     md->start_code, /* this subexpression's code */
2055     current_subject, /* where we currently are */
2056     start_offset, /* start offset in subject */
2057     offsets, /* offset vector */
2058     offsetcount, /* size of same */
2059     workspace, /* workspace vector */
2060     wscount, /* size of same */
2061 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2062 nigel 91 0, /* function recurse level */
2063     0); /* regex recurse level */
2064 nigel 77
2065     /* Anything other than "no match" means we are done, always; otherwise, carry
2066     on only if not anchored. */
2067    
2068     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2069    
2070     /* Advance to the next subject character unless we are at the end of a line
2071     and firstline is set. */
2072    
2073 nigel 91 if (firstline &&
2074     current_subject <= end_subject - md->nllen &&
2075     IS_NEWLINE(current_subject)) break;
2076 nigel 77 current_subject++;
2077     if (utf8)
2078     {
2079     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2080     current_subject++;
2081     }
2082     if (current_subject > end_subject) break;
2083     }
2084    
2085     return PCRE_ERROR_NOMATCH;
2086     }
2087    
2088     /* End of pcre_dfa_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12