/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 178 - (hide annotations) (download)
Wed Jun 13 08:44:34 2007 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 94293 byte(s)
Add support for \h, \H, \v, \V.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 nigel 93 #define NLBLOCK md /* Block containing newline information */
48     #define PSSTART start_subject /* Field containing processed string start */
49     #define PSEND end_subject /* Field containing processed string end */
50    
51 nigel 77 #include "pcre_internal.h"
52    
53    
54     /* For use to indent debugging output */
55    
56     #define SP " "
57    
58    
59    
60     /*************************************************
61     * Code parameters and static tables *
62     *************************************************/
63    
64     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
65 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
66 ph10 178 enough. The resulting opcodes don't have to be less than 256 because they are
67     never stored, so we push them well clear of the normal opcodes. */
68 nigel 77
69 ph10 178 #define OP_PROP_EXTRA 300 /* added when the repeated type is OP_PROP or OP_NOTPROP */
70     #define OP_EXTUNI_EXTRA 320 /* added when the repeated type is OP_EXTUNI */
71     #define OP_ANYNL_EXTRA 340 /* added when the repeated type is OP_ANYNL */
72     #define OP_HSPACE_EXTRA 360 /* added when the repeated type is OP_HSPACE or OP_NOT_HSPACE */
73     #define OP_VSPACE_EXTRA 380 /* added when the repeated type is OP_VSPACE or OP_NOT_VSPACE */
74 nigel 77
75    
76     /* This table identifies those opcodes that are followed immediately by a
77     character that is to be tested in some way. This makes it possible to
78     centralize the loading of these characters. In the case of Type * etc, the
79     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
80 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
81 ph10 168 that follow must also be modified. */
82 nigel 77
83     static uschar coptable[] = {
84     0, /* End */
85 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
86     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
87 nigel 77 0, 0, /* Any, Anybyte */
88 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
89     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
90 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
91     1, /* Char */
92     1, /* Charnc */
93     1, /* not */
94     /* Positive single-char repeats */
95     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
96     3, 3, 3, /* upto, minupto, exact */
97 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
98 nigel 77 /* Negative single-char repeats - only for chars < 256 */
99     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* NOT upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
102 nigel 77 /* Positive type repeats */
103     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* Type upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
106 nigel 77 /* Character class & ref repeats */
107     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
108     0, 0, /* CRRANGE, CRMINRANGE */
109     0, /* CLASS */
110     0, /* NCLASS */
111     0, /* XCLASS - variable length */
112     0, /* REF */
113     0, /* RECURSE */
114     0, /* CALLOUT */
115     0, /* Alt */
116     0, /* Ket */
117     0, /* KetRmax */
118     0, /* KetRmin */
119     0, /* Assert */
120     0, /* Assert not */
121     0, /* Assert behind */
122     0, /* Assert behind not */
123     0, /* Reverse */
124 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
125     0, 0, 0, /* SBRA, SCBRA, SCOND */
126 nigel 77 0, /* CREF */
127 nigel 93 0, /* RREF */
128     0, /* DEF */
129     0, 0 /* BRAZERO, BRAMINZERO */
130 nigel 77 };
131    
132     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
133     and \w */
134    
135     static uschar toptable1[] = {
136 ph10 168 0, 0, 0, 0, 0, 0,
137 nigel 77 ctype_digit, ctype_digit,
138     ctype_space, ctype_space,
139     ctype_word, ctype_word,
140     0 /* OP_ANY */
141     };
142    
143     static uschar toptable2[] = {
144 ph10 168 0, 0, 0, 0, 0, 0,
145 nigel 77 ctype_digit, 0,
146     ctype_space, 0,
147     ctype_word, 0,
148     1 /* OP_ANY */
149     };
150    
151    
152     /* Structure for holding data about a particular state, which is in effect the
153     current data for an active path through the match tree. It must consist
154     entirely of ints because the working vector we are passed, and which we put
155     these structures in, is a vector of ints. */
156    
/* One entry of a state list. Every member is an int because the entries are
laid out directly in the caller's int workspace vector. */

typedef struct stateblock {
  int offset;                    /* Offset to opcode; stored negated while the
                                    state is being held off (see data) */
  int count;                     /* Count for repeats */
  int ims;                       /* ims flag bits */
  int data;                      /* Some use extra data, e.g. the number of
                                    characters still to skip for a held-off
                                    state */
} stateblock;

/* Number of ints occupied by one stateblock. The cast keeps the value a signed
int: the workspace size it is combined with is a signed int, and a bare sizeof
expression would silently turn that arithmetic into unsigned arithmetic. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
165    
166    
167     #ifdef DEBUG
168     /*************************************************
169     * Print character string *
170     *************************************************/
171    
172     /* Character string printing function for debugging.
173    
174     Arguments:
175     p points to string
176     length number of bytes
177     f where to print
178    
179     Returns: nothing
180     */
181    
/* Debug helper: writes length bytes starting at p to f. Bytes that isprint()
accepts are written as themselves, all others as \xhh (two lower-case hex
digits). The string is only read, so the pointer is const-qualified; a
non-positive length writes nothing. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;                  /* unsigned char value, safe for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
194     #endif
195    
196    
197    
198     /*************************************************
199     * Execute a Regular Expression - DFA engine *
200     *************************************************/
201    
202     /* This internal function applies a compiled pattern to a subject string,
203     starting at a given point, using a DFA engine. This function is called from the
204     external one, possibly multiple times if the pattern is not anchored. The
205     function calls itself recursively for some kinds of subpattern.
206    
207     Arguments:
208     md the match_data block with fixed information
209     this_start_code the opening bracket of this subexpression's code
210     current_subject where we currently are in the subject string
211     start_offset start offset in the subject string
212     offsets vector to contain the matching string offsets
213     offsetcount size of same
214     workspace vector of workspace
215     wscount size of same
216     ims the current ims flags
217     rlevel function call recursion level
218     recursing regex recursive call level
219    
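220     Returns: > 0 => number of matches recorded in offsets, longest first
221     = 0 => ran out of room in offsets; any matches recorded are the longest ones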
222     -1 => failed to match
223     < -1 => some kind of unexpected problem
224    
225     The following macros are used for adding states to the two state vectors (one
226     for the current character, one for the following character). */
227    
/* Append a state with opcode offset x and repeat count y to the active list,
stamping it with the current ims flags; the data field is not written. If the
list already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. Relies on active_count, next_active_state, wscount, ims
and rlevel being in scope at the point of use. Wrapped in do/while(0) so that
the expansion is a single statement wherever "ADD_ACTIVE(...);" is written. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
238    
/* Append a state with opcode offset x, repeat count y and extra data z to the
active list, stamping it with the current ims flags. If the list already holds
wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Relies
on active_count, next_active_state, wscount, ims and rlevel being in scope at
the point of use. Wrapped in do/while(0) so that the expansion is a single
statement wherever "ADD_ACTIVE_DATA(...);" is written. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state->data = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
250    
/* Append a state with opcode offset x and repeat count y to the new list (the
states for the following character), stamping it with the current ims flags;
the data field is not written. If the list already holds wscount entries, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. Relies on new_count,
next_new_state, wscount, ims and rlevel being in scope at the point of use.
Wrapped in do/while(0) so that the expansion is a single statement wherever
"ADD_NEW(...);" is written. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
261    
/* Append a state with opcode offset x, repeat count y and extra data z to the
new list (the states for the following character), stamping it with the
current ims flags. If the list already holds wscount entries, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE. Relies on new_count, next_new_state,
wscount, ims and rlevel being in scope at the point of use. Wrapped in
do/while(0) so that the expansion is a single statement wherever
"ADD_NEW_DATA(...);" is written. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state->data = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
273    
274     /* And now, here is the code */
275    
276     static int
277     internal_dfa_exec(
278     dfa_match_data *md,
279     const uschar *this_start_code,
280     const uschar *current_subject,
281     int start_offset,
282     int *offsets,
283     int offsetcount,
284     int *workspace,
285     int wscount,
286     int ims,
287     int rlevel,
288     int recursing)
289     {
290     stateblock *active_states, *new_states, *temp_states;
291     stateblock *next_active_state, *next_new_state;
292    
293     const uschar *ctypes, *lcc, *fcc;
294     const uschar *ptr;
295 nigel 93 const uschar *end_code, *first_op;
296 nigel 77
297     int active_count, new_count, match_count;
298    
299     /* Some fields in the md block are frequently referenced, so we load them into
300     independent variables in the hope that this will perform better. */
301    
302     const uschar *start_subject = md->start_subject;
303     const uschar *end_subject = md->end_subject;
304     const uschar *start_code = md->start_code;
305    
306 nigel 87 #ifdef SUPPORT_UTF8
307 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
308 nigel 93 #else
309     BOOL utf8 = FALSE;
310 nigel 87 #endif
311 nigel 77
312     rlevel++;
313     offsetcount &= (-2);
314    
315     wscount -= 2;
316     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
317     (2 * INTS_PER_STATEBLOCK);
318    
319     DPRINTF(("\n%.*s---------------------\n"
320     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
321     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
322    
323     ctypes = md->tables + ctypes_offset;
324     lcc = md->tables + lcc_offset;
325     fcc = md->tables + fcc_offset;
326    
327     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
328    
329     active_states = (stateblock *)(workspace + 2);
330     next_new_state = new_states = active_states + wscount;
331     new_count = 0;
332    
333 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
334     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
335    
336 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
337     the alternative states onto the list, and find out where the end is. This
338     makes is possible to use this function recursively, when we want to stop at a
339     matching internal ket rather than at the end.
340    
341     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
342     a backward assertion. In that case, we have to find out the maximum amount to
343     move back, and set up each alternative appropriately. */
344    
345 nigel 93 if (*first_op == OP_REVERSE)
346 nigel 77 {
347     int max_back = 0;
348     int gone_back;
349    
350     end_code = this_start_code;
351     do
352     {
353     int back = GET(end_code, 2+LINK_SIZE);
354     if (back > max_back) max_back = back;
355     end_code += GET(end_code, 1);
356     }
357     while (*end_code == OP_ALT);
358    
359     /* If we can't go back the amount required for the longest lookbehind
360     pattern, go back as far as we can; some alternatives may still be viable. */
361    
362     #ifdef SUPPORT_UTF8
363     /* In character mode we have to step back character by character */
364    
365     if (utf8)
366     {
367     for (gone_back = 0; gone_back < max_back; gone_back++)
368     {
369     if (current_subject <= start_subject) break;
370     current_subject--;
371     while (current_subject > start_subject &&
372     (*current_subject & 0xc0) == 0x80)
373     current_subject--;
374     }
375     }
376     else
377     #endif
378    
379     /* In byte-mode we can do this quickly. */
380    
381     {
382     gone_back = (current_subject - max_back < start_subject)?
383     current_subject - start_subject : max_back;
384     current_subject -= gone_back;
385     }
386    
387     /* Now we can process the individual branches. */
388    
389     end_code = this_start_code;
390     do
391     {
392     int back = GET(end_code, 2+LINK_SIZE);
393     if (back <= gone_back)
394     {
395     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
396     ADD_NEW_DATA(-bstate, 0, gone_back - back);
397     }
398     end_code += GET(end_code, 1);
399     }
400     while (*end_code == OP_ALT);
401     }
402    
403     /* This is the code for a "normal" subpattern (not a backward assertion). The
404     start of a whole pattern is always one of these. If we are at the top level,
405     we may be asked to restart matching from the same point that we reached for a
406     previous partial match. We still have to scan through the top-level branches to
407     find the end state. */
408    
409     else
410     {
411     end_code = this_start_code;
412    
413     /* Restarting */
414    
415     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
416     {
417     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
418     new_count = workspace[1];
419     if (!workspace[0])
420     memcpy(new_states, active_states, new_count * sizeof(stateblock));
421     }
422    
423     /* Not restarting */
424    
425     else
426     {
427 nigel 93 int length = 1 + LINK_SIZE +
428     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
429 nigel 77 do
430     {
431 nigel 93 ADD_NEW(end_code - start_code + length, 0);
432 nigel 77 end_code += GET(end_code, 1);
433 nigel 93 length = 1 + LINK_SIZE;
434 nigel 77 }
435     while (*end_code == OP_ALT);
436     }
437     }
438    
439     workspace[0] = 0; /* Bit indicating which vector is current */
440    
441     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
442    
443     /* Loop for scanning the subject */
444    
445     ptr = current_subject;
446     for (;;)
447     {
448     int i, j;
449 nigel 91 int clen, dlen;
450     unsigned int c, d;
451 nigel 77
452     /* Make the new state list into the active state list and empty the
453     new state list. */
454    
455     temp_states = active_states;
456     active_states = new_states;
457     new_states = temp_states;
458     active_count = new_count;
459     new_count = 0;
460    
461     workspace[0] ^= 1; /* Remember for the restarting feature */
462     workspace[1] = active_count;
463    
464     #ifdef DEBUG
465     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
466     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
467     printf("\"\n");
468    
469     printf("%.*sActive states: ", rlevel*2-2, SP);
470     for (i = 0; i < active_count; i++)
471     printf("%d/%d ", active_states[i].offset, active_states[i].count);
472     printf("\n");
473     #endif
474    
475     /* Set the pointers for adding new states */
476    
477     next_active_state = active_states + active_count;
478     next_new_state = new_states;
479    
480     /* Load the current character from the subject outside the loop, as many
481     different states may want to look at it, and we assume that at least one
482     will. */
483    
484     if (ptr < end_subject)
485     {
486 nigel 93 clen = 1; /* Number of bytes in the character */
487 nigel 77 #ifdef SUPPORT_UTF8
488     if (utf8) { GETCHARLEN(c, ptr, clen); } else
489     #endif /* SUPPORT_UTF8 */
490     c = *ptr;
491     }
492     else
493     {
494 nigel 93 clen = 0; /* This indicates the end of the subject */
495     c = NOTACHAR; /* This value should never actually be used */
496 nigel 77 }
497    
498     /* Scan up the active states and act on each one. The result of an action
499     may be to add more states to the currently active list (e.g. on hitting a
500     parenthesis) or it may be to put states on the new list, for considering
501     when we move the character pointer on. */
502    
503     for (i = 0; i < active_count; i++)
504     {
505     stateblock *current_state = active_states + i;
506     const uschar *code;
507     int state_offset = current_state->offset;
508     int count, codevalue;
509 ph10 152 #ifdef SUPPORT_UCP
510 nigel 87 int chartype, script;
511 ph10 152 #endif
512 nigel 77
513     #ifdef DEBUG
514     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
515 nigel 93 if (clen == 0) printf("EOL\n");
516 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
517     else printf("0x%02x\n", c);
518     #endif
519    
520     /* This variable is referred to implicity in the ADD_xxx macros. */
521    
522     ims = current_state->ims;
523    
524     /* A negative offset is a special case meaning "hold off going to this
525     (negated) state until the number of characters in the data field have
526     been skipped". */
527    
528     if (state_offset < 0)
529     {
530     if (current_state->data > 0)
531     {
532     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
533     ADD_NEW_DATA(state_offset, current_state->count,
534     current_state->data - 1);
535     continue;
536     }
537     else
538     {
539     current_state->offset = state_offset = -state_offset;
540     }
541     }
542    
543     /* Check for a duplicate state with the same count, and skip if found. */
544    
545     for (j = 0; j < i; j++)
546     {
547     if (active_states[j].offset == state_offset &&
548     active_states[j].count == current_state->count)
549     {
550     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
551     goto NEXT_ACTIVE_STATE;
552     }
553     }
554    
555     /* The state offset is the offset to the opcode */
556    
557     code = start_code + state_offset;
558     codevalue = *code;
559    
560     /* If this opcode is followed by an inline character, load it. It is
561     tempting to test for the presence of a subject character here, but that
562     is wrong, because sometimes zero repetitions of the subject are
563     permitted.
564    
565     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
566 ph10 178 argument that is not a data character - but is always one byte long. We
567     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
568     this case. To keep the other cases fast, convert these ones to new opcodes.
569     */
570 nigel 77
571     if (coptable[codevalue] > 0)
572     {
573     dlen = 1;
574     #ifdef SUPPORT_UTF8
575     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
576     #endif /* SUPPORT_UTF8 */
577     d = code[coptable[codevalue]];
578     if (codevalue >= OP_TYPESTAR)
579     {
580 nigel 93 switch(d)
581     {
582     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
583     case OP_NOTPROP:
584     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
585     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
586     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
587 ph10 178 case OP_NOT_HSPACE:
588     case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
589     case OP_NOT_VSPACE:
590     case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
591 nigel 93 default: break;
592     }
593 nigel 77 }
594     }
595     else
596     {
597     dlen = 0; /* Not strictly necessary, but compilers moan */
598 nigel 93 d = NOTACHAR; /* if these variables are not set. */
599 nigel 77 }
600    
601    
602     /* Now process the individual opcodes */
603    
604     switch (codevalue)
605     {
606    
607     /* ========================================================================== */
608     /* Reached a closing bracket. If not at the end of the pattern, carry
609     on with the next opcode. Otherwise, unless we have an empty string and
610     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
611     matches so we always have the longest first. */
612    
613     case OP_KET:
614     case OP_KETRMIN:
615     case OP_KETRMAX:
616     if (code != end_code)
617     {
618     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
619     if (codevalue != OP_KET)
620     {
621     ADD_ACTIVE(state_offset - GET(code, 1), 0);
622     }
623     }
624     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
625     {
626     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
627     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
628     match_count = 0;
629     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
630     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
631     if (offsetcount >= 2)
632     {
633     offsets[0] = current_subject - start_subject;
634     offsets[1] = ptr - start_subject;
635     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
636     offsets[1] - offsets[0], current_subject));
637     }
638     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
639     {
640     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
641     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
642     match_count, rlevel*2-2, SP));
643     return match_count;
644     }
645     }
646     break;
647    
648     /* ========================================================================== */
649     /* These opcodes add to the current list of states without looking
650     at the current character. */
651    
652     /*-----------------------------------------------------------------*/
653     case OP_ALT:
654     do { code += GET(code, 1); } while (*code == OP_ALT);
655     ADD_ACTIVE(code - start_code, 0);
656     break;
657    
658     /*-----------------------------------------------------------------*/
659     case OP_BRA:
660 nigel 93 case OP_SBRA:
661 nigel 77 do
662     {
663     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
664     code += GET(code, 1);
665     }
666     while (*code == OP_ALT);
667     break;
668    
669     /*-----------------------------------------------------------------*/
670 nigel 93 case OP_CBRA:
671     case OP_SCBRA:
672     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
673     code += GET(code, 1);
674     while (*code == OP_ALT)
675     {
676     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
677     code += GET(code, 1);
678     }
679     break;
680    
681     /*-----------------------------------------------------------------*/
682 nigel 77 case OP_BRAZERO:
683     case OP_BRAMINZERO:
684     ADD_ACTIVE(state_offset + 1, 0);
685     code += 1 + GET(code, 2);
686     while (*code == OP_ALT) code += GET(code, 1);
687     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
688     break;
689    
690     /*-----------------------------------------------------------------*/
691     case OP_CIRC:
692     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
693 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
694     ptr != end_subject &&
695 nigel 93 WAS_NEWLINE(ptr)))
696 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
697     break;
698    
699     /*-----------------------------------------------------------------*/
700     case OP_EOD:
701     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
702     break;
703    
704     /*-----------------------------------------------------------------*/
705     case OP_OPT:
706     ims = code[1];
707     ADD_ACTIVE(state_offset + 2, 0);
708     break;
709    
710     /*-----------------------------------------------------------------*/
711     case OP_SOD:
712     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
713     break;
714    
715     /*-----------------------------------------------------------------*/
716     case OP_SOM:
717     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
718     break;
719    
720    
721     /* ========================================================================== */
722     /* These opcodes inspect the next subject character, and sometimes
723     the previous one as well, but do not have an argument. The variable
724     clen contains the length of the current character and is zero if we are
725     at the end of the subject. */
726    
727     /*-----------------------------------------------------------------*/
728     case OP_ANY:
729 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
730 nigel 77 { ADD_NEW(state_offset + 1, 0); }
731     break;
732    
733     /*-----------------------------------------------------------------*/
734     case OP_EODN:
735 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
736 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
737     break;
738    
739     /*-----------------------------------------------------------------*/
740     case OP_DOLL:
741     if ((md->moptions & PCRE_NOTEOL) == 0)
742     {
743 nigel 91 if (clen == 0 ||
744 nigel 93 (IS_NEWLINE(ptr) &&
745 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
746     ))
747 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
748     }
749 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
750 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
751     break;
752    
753     /*-----------------------------------------------------------------*/
754    
755     case OP_DIGIT:
756     case OP_WHITESPACE:
757     case OP_WORDCHAR:
758     if (clen > 0 && c < 256 &&
759     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
760     { ADD_NEW(state_offset + 1, 0); }
761     break;
762    
763     /*-----------------------------------------------------------------*/
764     case OP_NOT_DIGIT:
765     case OP_NOT_WHITESPACE:
766     case OP_NOT_WORDCHAR:
767     if (clen > 0 && (c >= 256 ||
768     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
769     { ADD_NEW(state_offset + 1, 0); }
770     break;
771    
772     /*-----------------------------------------------------------------*/
773     case OP_WORD_BOUNDARY:
774     case OP_NOT_WORD_BOUNDARY:
775     {
776     int left_word, right_word;
777    
778     if (ptr > start_subject)
779     {
780     const uschar *temp = ptr - 1;
781     #ifdef SUPPORT_UTF8
782     if (utf8) BACKCHAR(temp);
783     #endif
784     GETCHARTEST(d, temp);
785     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
786     }
787     else left_word = 0;
788    
789     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
790     else right_word = 0;
791    
792     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
793     { ADD_ACTIVE(state_offset + 1, 0); }
794     }
795     break;
796    
797    
798     /*-----------------------------------------------------------------*/
799     /* Check the next character by Unicode property. We will get here only
800     if the support is in the binary; otherwise a compile-time error occurs.
801     */
802    
803 ph10 151 #ifdef SUPPORT_UCP
804 nigel 77 case OP_PROP:
805     case OP_NOTPROP:
806     if (clen > 0)
807     {
808 nigel 87 BOOL OK;
809     int category = _pcre_ucp_findprop(c, &chartype, &script);
810     switch(code[1])
811 nigel 77 {
812 nigel 87 case PT_ANY:
813     OK = TRUE;
814     break;
815    
816     case PT_LAMP:
817     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
818     break;
819    
820     case PT_GC:
821     OK = category == code[2];
822     break;
823    
824     case PT_PC:
825     OK = chartype == code[2];
826     break;
827    
828     case PT_SC:
829     OK = script == code[2];
830     break;
831    
832     /* Should never occur, but keep compilers from grumbling. */
833    
834     default:
835     OK = codevalue != OP_PROP;
836     break;
837 nigel 77 }
838 nigel 87
839     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
840 nigel 77 }
841     break;
842     #endif
843    
844    
845    
846     /* ========================================================================== */
847     /* These opcodes likewise inspect the subject character, but have an
848     argument that is not a data character. It is one of these opcodes:
849     OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
850     OP_NOT_WORDCHAR. The value is loaded into d. */
851    
852     case OP_TYPEPLUS:
853     case OP_TYPEMINPLUS:
854 nigel 93 case OP_TYPEPOSPLUS:
855 nigel 77 count = current_state->count; /* Already matched */
856     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
857     if (clen > 0)
858     {
859     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
860     (c < 256 &&
861 nigel 91 (d != OP_ANY ||
862     (ims & PCRE_DOTALL) != 0 ||
863     !IS_NEWLINE(ptr)
864     ) &&
865 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
866     {
867 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
868     {
869     active_count--; /* Remove non-match possibility */
870     next_active_state--;
871     }
872 nigel 77 count++;
873     ADD_NEW(state_offset, count);
874     }
875     }
876     break;
877    
878     /*-----------------------------------------------------------------*/
879     case OP_TYPEQUERY:
880     case OP_TYPEMINQUERY:
881 nigel 93 case OP_TYPEPOSQUERY:
882 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
883     if (clen > 0)
884     {
885     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
886     (c < 256 &&
887 nigel 91 (d != OP_ANY ||
888     (ims & PCRE_DOTALL) != 0 ||
889     !IS_NEWLINE(ptr)
890     ) &&
891 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
892     {
893 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
894     {
895     active_count--; /* Remove non-match possibility */
896     next_active_state--;
897     }
898 nigel 77 ADD_NEW(state_offset + 2, 0);
899     }
900     }
901     break;
902    
903     /*-----------------------------------------------------------------*/
904     case OP_TYPESTAR:
905     case OP_TYPEMINSTAR:
906 nigel 93 case OP_TYPEPOSSTAR:
907 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
908     if (clen > 0)
909     {
910     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
911     (c < 256 &&
912 nigel 91 (d != OP_ANY ||
913     (ims & PCRE_DOTALL) != 0 ||
914     !IS_NEWLINE(ptr)
915     ) &&
916 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
917     {
918 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
919     {
920     active_count--; /* Remove non-match possibility */
921     next_active_state--;
922     }
923 nigel 77 ADD_NEW(state_offset, 0);
924     }
925     }
926     break;
927    
928     /*-----------------------------------------------------------------*/
929     case OP_TYPEEXACT:
930 nigel 93 count = current_state->count; /* Number already matched */
931     if (clen > 0)
932     {
933     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
934     (c < 256 &&
935     (d != OP_ANY ||
936     (ims & PCRE_DOTALL) != 0 ||
937     !IS_NEWLINE(ptr)
938     ) &&
939     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
940     {
941     if (++count >= GET2(code, 1))
942     { ADD_NEW(state_offset + 4, 0); }
943     else
944     { ADD_NEW(state_offset, count); }
945     }
946     }
947     break;
948    
949     /*-----------------------------------------------------------------*/
950 nigel 77 case OP_TYPEUPTO:
951     case OP_TYPEMINUPTO:
952 nigel 93 case OP_TYPEPOSUPTO:
953     ADD_ACTIVE(state_offset + 4, 0);
954 nigel 77 count = current_state->count; /* Number already matched */
955     if (clen > 0)
956     {
957     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
958     (c < 256 &&
959 nigel 91 (d != OP_ANY ||
960     (ims & PCRE_DOTALL) != 0 ||
961     !IS_NEWLINE(ptr)
962     ) &&
963 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
964     {
965 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
966     {
967     active_count--; /* Remove non-match possibility */
968     next_active_state--;
969     }
970 nigel 77 if (++count >= GET2(code, 1))
971     { ADD_NEW(state_offset + 4, 0); }
972     else
973     { ADD_NEW(state_offset, count); }
974     }
975     }
976     break;
977    
978     /* ========================================================================== */
979     /* These are virtual opcodes that are used when something like
980 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its
981     argument. It keeps the code above fast for the other cases. The argument
982     is in the d variable. */
983 nigel 77
984 ph10 151 #ifdef SUPPORT_UCP
985 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
986     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
987 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
988 nigel 77 count = current_state->count; /* Already matched */
989 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
990 nigel 77 if (clen > 0)
991     {
992 nigel 87 BOOL OK;
993     int category = _pcre_ucp_findprop(c, &chartype, &script);
994     switch(code[2])
995     {
996     case PT_ANY:
997     OK = TRUE;
998     break;
999    
1000     case PT_LAMP:
1001     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1002     break;
1003    
1004     case PT_GC:
1005     OK = category == code[3];
1006     break;
1007    
1008     case PT_PC:
1009     OK = chartype == code[3];
1010     break;
1011    
1012     case PT_SC:
1013     OK = script == code[3];
1014     break;
1015    
1016     /* Should never occur, but keep compilers from grumbling. */
1017    
1018     default:
1019     OK = codevalue != OP_PROP;
1020     break;
1021     }
1022    
1023 nigel 93 if (OK == (d == OP_PROP))
1024     {
1025     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1026     {
1027     active_count--; /* Remove non-match possibility */
1028     next_active_state--;
1029     }
1030     count++;
1031     ADD_NEW(state_offset, count);
1032     }
1033 nigel 77 }
1034     break;
1035    
1036     /*-----------------------------------------------------------------*/
1037     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1038     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1039 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1040 nigel 77 count = current_state->count; /* Already matched */
1041     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1042 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1043 nigel 77 {
1044     const uschar *nptr = ptr + clen;
1045     int ncount = 0;
1046 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1047     {
1048     active_count--; /* Remove non-match possibility */
1049     next_active_state--;
1050     }
1051 nigel 77 while (nptr < end_subject)
1052     {
1053     int nd;
1054     int ndlen = 1;
1055     GETCHARLEN(nd, nptr, ndlen);
1056 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1057 nigel 77 ncount++;
1058     nptr += ndlen;
1059     }
1060     count++;
1061     ADD_NEW_DATA(-state_offset, count, ncount);
1062     }
1063     break;
1064 ph10 151 #endif
1065 nigel 77
1066     /*-----------------------------------------------------------------*/
1067 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1068     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1069     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1070     count = current_state->count; /* Already matched */
1071     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1072     if (clen > 0)
1073     {
1074     int ncount = 0;
1075     switch (c)
1076     {
1077     case 0x000d:
1078     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1079     /* Fall through */
1080     case 0x000a:
1081     case 0x000b:
1082     case 0x000c:
1083     case 0x0085:
1084     case 0x2028:
1085     case 0x2029:
1086     if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1087     {
1088     active_count--; /* Remove non-match possibility */
1089     next_active_state--;
1090     }
1091     count++;
1092     ADD_NEW_DATA(-state_offset, count, ncount);
1093     break;
1094     default:
1095     break;
1096     }
1097     }
1098     break;
1099    
1100     /*-----------------------------------------------------------------*/
1101 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1102     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1103     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1104     count = current_state->count; /* Already matched */
1105     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1106     if (clen > 0)
1107     {
1108     BOOL OK;
1109     switch (c)
1110     {
1111     case 0x000a:
1112     case 0x000b:
1113     case 0x000c:
1114     case 0x000d:
1115     case 0x0085:
1116     case 0x2028:
1117     case 0x2029:
1118     OK = TRUE;
1119     break;
1120    
1121     default:
1122     OK = FALSE;
1123     break;
1124     }
1125    
1126     if (OK == (d == OP_VSPACE))
1127     {
1128     if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1129     {
1130     active_count--; /* Remove non-match possibility */
1131     next_active_state--;
1132     }
1133     count++;
1134     ADD_NEW_DATA(-state_offset, count, 0);
1135     }
1136     }
1137     break;
1138    
1139     /*-----------------------------------------------------------------*/
1140     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1141     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1142     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1143     count = current_state->count; /* Already matched */
1144     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1145     if (clen > 0)
1146     {
1147     BOOL OK;
1148     switch (c)
1149     {
1150     case 0x09: /* HT */
1151     case 0x20: /* SPACE */
1152     case 0xa0: /* NBSP */
1153     case 0x1680: /* OGHAM SPACE MARK */
1154     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1155     case 0x2000: /* EN QUAD */
1156     case 0x2001: /* EM QUAD */
1157     case 0x2002: /* EN SPACE */
1158     case 0x2003: /* EM SPACE */
1159     case 0x2004: /* THREE-PER-EM SPACE */
1160     case 0x2005: /* FOUR-PER-EM SPACE */
1161     case 0x2006: /* SIX-PER-EM SPACE */
1162     case 0x2007: /* FIGURE SPACE */
1163     case 0x2008: /* PUNCTUATION SPACE */
1164     case 0x2009: /* THIN SPACE */
1165     case 0x200A: /* HAIR SPACE */
1166     case 0x202f: /* NARROW NO-BREAK SPACE */
1167     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1168     case 0x3000: /* IDEOGRAPHIC SPACE */
1169     OK = TRUE;
1170     break;
1171    
1172     default:
1173     OK = FALSE;
1174     break;
1175     }
1176    
1177     if (OK == (d == OP_HSPACE))
1178     {
1179     if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1180     {
1181     active_count--; /* Remove non-match possibility */
1182     next_active_state--;
1183     }
1184     count++;
1185     ADD_NEW_DATA(-state_offset, count, 0);
1186     }
1187     }
1188     break;
1189    
1190     /*-----------------------------------------------------------------*/
1191 ph10 151 #ifdef SUPPORT_UCP
1192 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1193     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1194 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1195 nigel 87 count = 4;
1196 nigel 77 goto QS1;
1197    
1198     case OP_PROP_EXTRA + OP_TYPESTAR:
1199     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1200 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1201 nigel 77 count = 0;
1202    
1203     QS1:
1204    
1205 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1206 nigel 77 if (clen > 0)
1207     {
1208 nigel 87 BOOL OK;
1209     int category = _pcre_ucp_findprop(c, &chartype, &script);
1210     switch(code[2])
1211     {
1212     case PT_ANY:
1213     OK = TRUE;
1214     break;
1215    
1216     case PT_LAMP:
1217     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1218     break;
1219    
1220     case PT_GC:
1221     OK = category == code[3];
1222     break;
1223    
1224     case PT_PC:
1225     OK = chartype == code[3];
1226     break;
1227    
1228     case PT_SC:
1229     OK = script == code[3];
1230     break;
1231    
1232     /* Should never occur, but keep compilers from grumbling. */
1233    
1234     default:
1235     OK = codevalue != OP_PROP;
1236     break;
1237     }
1238    
1239 nigel 93 if (OK == (d == OP_PROP))
1240     {
1241     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1242     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1243     {
1244     active_count--; /* Remove non-match possibility */
1245     next_active_state--;
1246     }
1247     ADD_NEW(state_offset + count, 0);
1248     }
1249 nigel 77 }
1250     break;
1251    
1252     /*-----------------------------------------------------------------*/
1253     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1254     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1255 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1256 nigel 77 count = 2;
1257     goto QS2;
1258    
1259     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1260     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1261 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1262 nigel 77 count = 0;
1263    
1264     QS2:
1265    
1266     ADD_ACTIVE(state_offset + 2, 0);
1267 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1268 nigel 77 {
1269     const uschar *nptr = ptr + clen;
1270     int ncount = 0;
1271 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1272     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1273     {
1274     active_count--; /* Remove non-match possibility */
1275     next_active_state--;
1276     }
1277 nigel 77 while (nptr < end_subject)
1278     {
1279     int nd;
1280     int ndlen = 1;
1281     GETCHARLEN(nd, nptr, ndlen);
1282 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1283 nigel 77 ncount++;
1284     nptr += ndlen;
1285     }
1286     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1287     }
1288     break;
1289 ph10 151 #endif
1290 nigel 77
1291     /*-----------------------------------------------------------------*/
1292 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1293     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1294     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1295     count = 2;
1296     goto QS3;
1297    
1298     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1299     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1300     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1301     count = 0;
1302    
1303     QS3:
1304     ADD_ACTIVE(state_offset + 2, 0);
1305     if (clen > 0)
1306     {
1307     int ncount = 0;
1308     switch (c)
1309     {
1310     case 0x000d:
1311     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1312     /* Fall through */
1313     case 0x000a:
1314     case 0x000b:
1315     case 0x000c:
1316     case 0x0085:
1317     case 0x2028:
1318     case 0x2029:
1319     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1320     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1321     {
1322     active_count--; /* Remove non-match possibility */
1323     next_active_state--;
1324     }
1325     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1326     break;
1327     default:
1328     break;
1329     }
1330     }
1331     break;
1332    
1333     /*-----------------------------------------------------------------*/
1334 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1335     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1336     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1337     count = 2;
1338     goto QS4;
1339    
1340     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1341     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1342     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1343     count = 0;
1344    
1345     QS4:
1346     ADD_ACTIVE(state_offset + 2, 0);
1347     if (clen > 0)
1348     {
1349     BOOL OK;
1350     switch (c)
1351     {
1352     case 0x000a:
1353     case 0x000b:
1354     case 0x000c:
1355     case 0x000d:
1356     case 0x0085:
1357     case 0x2028:
1358     case 0x2029:
1359     OK = TRUE;
1360     break;
1361    
1362     default:
1363     OK = FALSE;
1364     break;
1365     }
1366     if (OK == (d == OP_VSPACE))
1367     {
1368     if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1369     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1370     {
1371     active_count--; /* Remove non-match possibility */
1372     next_active_state--;
1373     }
1374     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1375     }
1376     }
1377     break;
1378    
1379     /*-----------------------------------------------------------------*/
1380     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1381     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1382     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1383     count = 2;
1384     goto QS5;
1385    
1386     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1387     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1388     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1389     count = 0;
1390    
1391     QS5:
1392     ADD_ACTIVE(state_offset + 2, 0);
1393     if (clen > 0)
1394     {
1395     BOOL OK;
1396     switch (c)
1397     {
1398     case 0x09: /* HT */
1399     case 0x20: /* SPACE */
1400     case 0xa0: /* NBSP */
1401     case 0x1680: /* OGHAM SPACE MARK */
1402     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1403     case 0x2000: /* EN QUAD */
1404     case 0x2001: /* EM QUAD */
1405     case 0x2002: /* EN SPACE */
1406     case 0x2003: /* EM SPACE */
1407     case 0x2004: /* THREE-PER-EM SPACE */
1408     case 0x2005: /* FOUR-PER-EM SPACE */
1409     case 0x2006: /* SIX-PER-EM SPACE */
1410     case 0x2007: /* FIGURE SPACE */
1411     case 0x2008: /* PUNCTUATION SPACE */
1412     case 0x2009: /* THIN SPACE */
1413     case 0x200A: /* HAIR SPACE */
1414     case 0x202f: /* NARROW NO-BREAK SPACE */
1415     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1416     case 0x3000: /* IDEOGRAPHIC SPACE */
1417     OK = TRUE;
1418     break;
1419    
1420     default:
1421     OK = FALSE;
1422     break;
1423     }
1424    
1425     if (OK == (d == OP_HSPACE))
1426     {
1427     if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1428     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1429     {
1430     active_count--; /* Remove non-match possibility */
1431     next_active_state--;
1432     }
1433     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1434     }
1435     }
1436     break;
1437    
1438     /*-----------------------------------------------------------------*/
1439 ph10 151 #ifdef SUPPORT_UCP
1440 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1441     case OP_PROP_EXTRA + OP_TYPEUPTO:
1442     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1443 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1444 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1445 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1446 nigel 77 count = current_state->count; /* Number already matched */
1447     if (clen > 0)
1448     {
1449 nigel 87 BOOL OK;
1450     int category = _pcre_ucp_findprop(c, &chartype, &script);
1451     switch(code[4])
1452 nigel 77 {
1453 nigel 87 case PT_ANY:
1454     OK = TRUE;
1455     break;
1456    
1457     case PT_LAMP:
1458     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1459     break;
1460    
1461     case PT_GC:
1462     OK = category == code[5];
1463     break;
1464    
1465     case PT_PC:
1466     OK = chartype == code[5];
1467     break;
1468    
1469     case PT_SC:
1470     OK = script == code[5];
1471     break;
1472    
1473     /* Should never occur, but keep compilers from grumbling. */
1474    
1475     default:
1476     OK = codevalue != OP_PROP;
1477     break;
1478     }
1479    
1480     if (OK == (d == OP_PROP))
1481     {
1482 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1483     {
1484     active_count--; /* Remove non-match possibility */
1485     next_active_state--;
1486     }
1487 nigel 77 if (++count >= GET2(code, 1))
1488 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1489 nigel 77 else
1490     { ADD_NEW(state_offset, count); }
1491     }
1492     }
1493     break;
1494    
1495     /*-----------------------------------------------------------------*/
1496     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1497     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1498     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1499 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1500 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1501     { ADD_ACTIVE(state_offset + 4, 0); }
1502     count = current_state->count; /* Number already matched */
1503 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1504 nigel 77 {
1505     const uschar *nptr = ptr + clen;
1506     int ncount = 0;
1507 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1508     {
1509     active_count--; /* Remove non-match possibility */
1510     next_active_state--;
1511     }
1512 nigel 77 while (nptr < end_subject)
1513     {
1514     int nd;
1515     int ndlen = 1;
1516     GETCHARLEN(nd, nptr, ndlen);
1517 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1518 nigel 77 ncount++;
1519     nptr += ndlen;
1520     }
1521     if (++count >= GET2(code, 1))
1522     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1523     else
1524     { ADD_NEW_DATA(-state_offset, count, ncount); }
1525     }
1526     break;
1527 ph10 151 #endif
1528 nigel 77
1529 nigel 93 /*-----------------------------------------------------------------*/
1530     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1531     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1532     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1533     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1534     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1535     { ADD_ACTIVE(state_offset + 4, 0); }
1536     count = current_state->count; /* Number already matched */
1537     if (clen > 0)
1538     {
1539     int ncount = 0;
1540     switch (c)
1541     {
1542     case 0x000d:
1543     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1544     /* Fall through */
1545     case 0x000a:
1546     case 0x000b:
1547     case 0x000c:
1548     case 0x0085:
1549     case 0x2028:
1550     case 0x2029:
1551     if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1552     {
1553     active_count--; /* Remove non-match possibility */
1554     next_active_state--;
1555     }
1556     if (++count >= GET2(code, 1))
1557     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1558     else
1559     { ADD_NEW_DATA(-state_offset, count, ncount); }
1560     break;
1561     default:
1562     break;
1563     }
1564     }
1565     break;
1566    
1567 ph10 178 /*-----------------------------------------------------------------*/
1568     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1569     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1570     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1571     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1572     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1573     { ADD_ACTIVE(state_offset + 4, 0); }
1574     count = current_state->count; /* Number already matched */
1575     if (clen > 0)
1576     {
1577     BOOL OK;
1578     switch (c)
1579     {
1580     case 0x000a:
1581     case 0x000b:
1582     case 0x000c:
1583     case 0x000d:
1584     case 0x0085:
1585     case 0x2028:
1586     case 0x2029:
1587     OK = TRUE;
1588     break;
1589    
1590     default:
1591     OK = FALSE;
1592     }
1593    
1594     if (OK == (d == OP_VSPACE))
1595     {
1596     if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1597     {
1598     active_count--; /* Remove non-match possibility */
1599     next_active_state--;
1600     }
1601     if (++count >= GET2(code, 1))
1602     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1603     else
1604     { ADD_NEW_DATA(-state_offset, count, 0); }
1605     }
1606     }
1607     break;
1608    
1609     /*-----------------------------------------------------------------*/
1610     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1611     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1612     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1613     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1614     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1615     { ADD_ACTIVE(state_offset + 4, 0); }
1616     count = current_state->count; /* Number already matched */
1617     if (clen > 0)
1618     {
1619     BOOL OK;
1620     switch (c)
1621     {
1622     case 0x09: /* HT */
1623     case 0x20: /* SPACE */
1624     case 0xa0: /* NBSP */
1625     case 0x1680: /* OGHAM SPACE MARK */
1626     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1627     case 0x2000: /* EN QUAD */
1628     case 0x2001: /* EM QUAD */
1629     case 0x2002: /* EN SPACE */
1630     case 0x2003: /* EM SPACE */
1631     case 0x2004: /* THREE-PER-EM SPACE */
1632     case 0x2005: /* FOUR-PER-EM SPACE */
1633     case 0x2006: /* SIX-PER-EM SPACE */
1634     case 0x2007: /* FIGURE SPACE */
1635     case 0x2008: /* PUNCTUATION SPACE */
1636     case 0x2009: /* THIN SPACE */
1637     case 0x200A: /* HAIR SPACE */
1638     case 0x202f: /* NARROW NO-BREAK SPACE */
1639     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1640     case 0x3000: /* IDEOGRAPHIC SPACE */
1641     OK = TRUE;
1642     break;
1643    
1644     default:
1645     OK = FALSE;
1646     break;
1647     }
1648    
1649     if (OK == (d == OP_HSPACE))
1650     {
1651     if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1652     {
1653     active_count--; /* Remove non-match possibility */
1654     next_active_state--;
1655     }
1656     if (++count >= GET2(code, 1))
1657     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1658     else
1659     { ADD_NEW_DATA(-state_offset, count, 0); }
1660     }
1661     }
1662     break;
1663    
1664 nigel 77 /* ========================================================================== */
1665     /* These opcodes are followed by a character that is usually compared
1666     to the current subject character; it is loaded into d. We still get
1667     here even if there is no subject character, because in some cases zero
1668     repetitions are permitted. */
1669    
1670     /*-----------------------------------------------------------------*/
1671     case OP_CHAR:
1672     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1673     break;
1674    
1675     /*-----------------------------------------------------------------*/
1676     case OP_CHARNC:
1677     if (clen == 0) break;
1678    
1679     #ifdef SUPPORT_UTF8
1680     if (utf8)
1681     {
1682     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1683     {
1684 nigel 93 unsigned int othercase;
1685 nigel 77 if (c < 128) othercase = fcc[c]; else
1686    
1687     /* If we have Unicode property support, we can use it to test the
1688 nigel 87 other case of the character. */
1689 nigel 77
1690     #ifdef SUPPORT_UCP
1691 nigel 87 othercase = _pcre_ucp_othercase(c);
1692     #else
1693 nigel 93 othercase = NOTACHAR;
1694 nigel 77 #endif
1695    
1696     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1697     }
1698     }
1699     else
1700     #endif /* SUPPORT_UTF8 */
1701    
1702     /* Non-UTF-8 mode */
1703     {
1704     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1705     }
1706     break;
1707    
1708    
1709     #ifdef SUPPORT_UCP
1710     /*-----------------------------------------------------------------*/
1711     /* This is a tricky one because it can match more than one character.
1712     Find out how many characters to skip, and then set up a negative state
1713     to wait for them to pass before continuing. */
1714    
1715     case OP_EXTUNI:
1716 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1717 nigel 77 {
1718     const uschar *nptr = ptr + clen;
1719     int ncount = 0;
1720     while (nptr < end_subject)
1721     {
1722     int nclen = 1;
1723     GETCHARLEN(c, nptr, nclen);
1724 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1725 nigel 77 ncount++;
1726     nptr += nclen;
1727     }
1728     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1729     }
1730     break;
1731     #endif
1732    
1733     /*-----------------------------------------------------------------*/
1734 nigel 93 /* This is a tricky one like EXTUNI because it too can match more than one
1735     character (when CR is followed by LF). In this case, set up a negative
1736     state to wait for one character to pass before continuing. */
1737    
1738     case OP_ANYNL:
1739     if (clen > 0) switch(c)
1740     {
1741     case 0x000a:
1742     case 0x000b:
1743     case 0x000c:
1744     case 0x0085:
1745     case 0x2028:
1746     case 0x2029:
1747     ADD_NEW(state_offset + 1, 0);
1748     break;
1749     case 0x000d:
1750     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1751     {
1752     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1753     }
1754     else
1755     {
1756     ADD_NEW(state_offset + 1, 0);
1757     }
1758     break;
1759     }
1760     break;
1761    
1762     /*-----------------------------------------------------------------*/
1763 ph10 178 case OP_NOT_VSPACE:
1764     if (clen > 0) switch(c)
1765     {
1766     case 0x000a:
1767     case 0x000b:
1768     case 0x000c:
1769     case 0x000d:
1770     case 0x0085:
1771     case 0x2028:
1772     case 0x2029:
1773     break;
1774    
1775     default:
1776     ADD_NEW(state_offset + 1, 0);
1777     break;
1778     }
1779     break;
1780    
1781     /*-----------------------------------------------------------------*/
1782     case OP_VSPACE:
1783     if (clen > 0) switch(c)
1784     {
1785     case 0x000a:
1786     case 0x000b:
1787     case 0x000c:
1788     case 0x000d:
1789     case 0x0085:
1790     case 0x2028:
1791     case 0x2029:
1792     ADD_NEW(state_offset + 1, 0);
1793     break;
1794    
1795     default: break;
1796     }
1797     break;
1798    
1799     /*-----------------------------------------------------------------*/
1800     case OP_NOT_HSPACE:
1801     if (clen > 0) switch(c)
1802     {
1803     case 0x09: /* HT */
1804     case 0x20: /* SPACE */
1805     case 0xa0: /* NBSP */
1806     case 0x1680: /* OGHAM SPACE MARK */
1807     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1808     case 0x2000: /* EN QUAD */
1809     case 0x2001: /* EM QUAD */
1810     case 0x2002: /* EN SPACE */
1811     case 0x2003: /* EM SPACE */
1812     case 0x2004: /* THREE-PER-EM SPACE */
1813     case 0x2005: /* FOUR-PER-EM SPACE */
1814     case 0x2006: /* SIX-PER-EM SPACE */
1815     case 0x2007: /* FIGURE SPACE */
1816     case 0x2008: /* PUNCTUATION SPACE */
1817     case 0x2009: /* THIN SPACE */
1818     case 0x200A: /* HAIR SPACE */
1819     case 0x202f: /* NARROW NO-BREAK SPACE */
1820     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1821     case 0x3000: /* IDEOGRAPHIC SPACE */
1822     break;
1823    
1824     default:
1825     ADD_NEW(state_offset + 1, 0);
1826     break;
1827     }
1828     break;
1829    
1830     /*-----------------------------------------------------------------*/
1831     case OP_HSPACE:
1832     if (clen > 0) switch(c)
1833     {
1834     case 0x09: /* HT */
1835     case 0x20: /* SPACE */
1836     case 0xa0: /* NBSP */
1837     case 0x1680: /* OGHAM SPACE MARK */
1838     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1839     case 0x2000: /* EN QUAD */
1840     case 0x2001: /* EM QUAD */
1841     case 0x2002: /* EN SPACE */
1842     case 0x2003: /* EM SPACE */
1843     case 0x2004: /* THREE-PER-EM SPACE */
1844     case 0x2005: /* FOUR-PER-EM SPACE */
1845     case 0x2006: /* SIX-PER-EM SPACE */
1846     case 0x2007: /* FIGURE SPACE */
1847     case 0x2008: /* PUNCTUATION SPACE */
1848     case 0x2009: /* THIN SPACE */
1849     case 0x200A: /* HAIR SPACE */
1850     case 0x202f: /* NARROW NO-BREAK SPACE */
1851     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1852     case 0x3000: /* IDEOGRAPHIC SPACE */
1853     ADD_NEW(state_offset + 1, 0);
1854     break;
1855     }
1856     break;
1857    
1858     /*-----------------------------------------------------------------*/
1859 nigel 77 /* Match a negated single character. This is only used for one-byte
1860     characters, that is, we know that d < 256. The character we are
1861     checking (c) can be multibyte. */
1862    
1863     case OP_NOT:
1864     if (clen > 0)
1865     {
1866 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1867 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1868     }
1869     break;
1870    
1871     /*-----------------------------------------------------------------*/
1872     case OP_PLUS:
1873     case OP_MINPLUS:
1874 nigel 93 case OP_POSPLUS:
1875 nigel 77 case OP_NOTPLUS:
1876     case OP_NOTMINPLUS:
1877 nigel 93 case OP_NOTPOSPLUS:
1878 nigel 77 count = current_state->count; /* Already matched */
1879     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1880     if (clen > 0)
1881     {
1882 nigel 93 unsigned int otherd = NOTACHAR;
1883 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1884     {
1885     #ifdef SUPPORT_UTF8
1886 nigel 87 if (utf8 && d >= 128)
1887 nigel 77 {
1888     #ifdef SUPPORT_UCP
1889 nigel 87 otherd = _pcre_ucp_othercase(d);
1890 nigel 77 #endif /* SUPPORT_UCP */
1891     }
1892     else
1893     #endif /* SUPPORT_UTF8 */
1894     otherd = fcc[d];
1895     }
1896     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1897 nigel 93 {
1898     if (count > 0 &&
1899     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1900     {
1901     active_count--; /* Remove non-match possibility */
1902     next_active_state--;
1903     }
1904     count++;
1905     ADD_NEW(state_offset, count);
1906     }
1907 nigel 77 }
1908     break;
1909    
1910     /*-----------------------------------------------------------------*/
1911     case OP_QUERY:
1912     case OP_MINQUERY:
1913 nigel 93 case OP_POSQUERY:
1914 nigel 77 case OP_NOTQUERY:
1915     case OP_NOTMINQUERY:
1916 nigel 93 case OP_NOTPOSQUERY:
1917 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1918     if (clen > 0)
1919     {
1920 nigel 93 unsigned int otherd = NOTACHAR;
1921 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1922 nigel 77 {
1923     #ifdef SUPPORT_UTF8
1924 nigel 87 if (utf8 && d >= 128)
1925 nigel 77 {
1926     #ifdef SUPPORT_UCP
1927 nigel 87 otherd = _pcre_ucp_othercase(d);
1928 nigel 77 #endif /* SUPPORT_UCP */
1929     }
1930     else
1931     #endif /* SUPPORT_UTF8 */
1932     otherd = fcc[d];
1933     }
1934     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1935 nigel 93 {
1936     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1937     {
1938     active_count--; /* Remove non-match possibility */
1939     next_active_state--;
1940     }
1941     ADD_NEW(state_offset + dlen + 1, 0);
1942     }
1943 nigel 77 }
1944     break;
1945    
1946     /*-----------------------------------------------------------------*/
1947     case OP_STAR:
1948     case OP_MINSTAR:
1949 nigel 93 case OP_POSSTAR:
1950 nigel 77 case OP_NOTSTAR:
1951     case OP_NOTMINSTAR:
1952 nigel 93 case OP_NOTPOSSTAR:
1953 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1954     if (clen > 0)
1955     {
1956 nigel 93 unsigned int otherd = NOTACHAR;
1957 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1958 nigel 77 {
1959     #ifdef SUPPORT_UTF8
1960 nigel 87 if (utf8 && d >= 128)
1961 nigel 77 {
1962     #ifdef SUPPORT_UCP
1963 nigel 87 otherd = _pcre_ucp_othercase(d);
1964 nigel 77 #endif /* SUPPORT_UCP */
1965     }
1966     else
1967     #endif /* SUPPORT_UTF8 */
1968     otherd = fcc[d];
1969     }
1970     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1971 nigel 93 {
1972     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1973     {
1974     active_count--; /* Remove non-match possibility */
1975     next_active_state--;
1976     }
1977     ADD_NEW(state_offset, 0);
1978     }
1979 nigel 77 }
1980     break;
1981    
1982     /*-----------------------------------------------------------------*/
1983     case OP_EXACT:
1984 nigel 93 case OP_NOTEXACT:
1985     count = current_state->count; /* Number already matched */
1986     if (clen > 0)
1987     {
1988     unsigned int otherd = NOTACHAR;
1989     if ((ims & PCRE_CASELESS) != 0)
1990     {
1991     #ifdef SUPPORT_UTF8
1992     if (utf8 && d >= 128)
1993     {
1994     #ifdef SUPPORT_UCP
1995     otherd = _pcre_ucp_othercase(d);
1996     #endif /* SUPPORT_UCP */
1997     }
1998     else
1999     #endif /* SUPPORT_UTF8 */
2000     otherd = fcc[d];
2001     }
2002     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2003     {
2004     if (++count >= GET2(code, 1))
2005     { ADD_NEW(state_offset + dlen + 3, 0); }
2006     else
2007     { ADD_NEW(state_offset, count); }
2008     }
2009     }
2010     break;
2011    
2012     /*-----------------------------------------------------------------*/
2013 nigel 77 case OP_UPTO:
2014     case OP_MINUPTO:
2015 nigel 93 case OP_POSUPTO:
2016 nigel 77 case OP_NOTUPTO:
2017     case OP_NOTMINUPTO:
2018 nigel 93 case OP_NOTPOSUPTO:
2019     ADD_ACTIVE(state_offset + dlen + 3, 0);
2020 nigel 77 count = current_state->count; /* Number already matched */
2021     if (clen > 0)
2022     {
2023 nigel 93 unsigned int otherd = NOTACHAR;
2024 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2025     {
2026     #ifdef SUPPORT_UTF8
2027 nigel 87 if (utf8 && d >= 128)
2028 nigel 77 {
2029     #ifdef SUPPORT_UCP
2030 nigel 87 otherd = _pcre_ucp_othercase(d);
2031 nigel 77 #endif /* SUPPORT_UCP */
2032     }
2033     else
2034     #endif /* SUPPORT_UTF8 */
2035     otherd = fcc[d];
2036     }
2037     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2038     {
2039 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2040     {
2041     active_count--; /* Remove non-match possibility */
2042     next_active_state--;
2043     }
2044 nigel 77 if (++count >= GET2(code, 1))
2045     { ADD_NEW(state_offset + dlen + 3, 0); }
2046     else
2047     { ADD_NEW(state_offset, count); }
2048     }
2049     }
2050     break;
2051    
2052    
2053     /* ========================================================================== */
2054     /* These are the class-handling opcodes */
2055    
2056     case OP_CLASS:
2057     case OP_NCLASS:
2058     case OP_XCLASS:
2059     {
2060     BOOL isinclass = FALSE;
2061     int next_state_offset;
2062     const uschar *ecode;
2063    
2064     /* For a simple class, there is always just a 32-byte table, and we
2065     can set isinclass from it. */
2066    
2067     if (codevalue != OP_XCLASS)
2068     {
2069     ecode = code + 33;
2070     if (clen > 0)
2071     {
2072     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2073     ((code[1 + c/8] & (1 << (c&7))) != 0);
2074     }
2075     }
2076    
2077     /* An extended class may have a table or a list of single characters,
2078     ranges, or both, and it may be positive or negative. There's a
2079     function that sorts all this out. */
2080    
2081     else
2082     {
2083     ecode = code + GET(code, 1);
2084     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2085     }
2086    
2087     /* At this point, isinclass is set for all kinds of class, and ecode
2088     points to the byte after the end of the class. If there is a
2089     quantifier, this is where it will be. */
2090    
2091     next_state_offset = ecode - start_code;
2092    
2093     switch (*ecode)
2094     {
2095     case OP_CRSTAR:
2096     case OP_CRMINSTAR:
2097     ADD_ACTIVE(next_state_offset + 1, 0);
2098     if (isinclass) { ADD_NEW(state_offset, 0); }
2099     break;
2100    
2101     case OP_CRPLUS:
2102     case OP_CRMINPLUS:
2103     count = current_state->count; /* Already matched */
2104     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2105     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2106     break;
2107    
2108     case OP_CRQUERY:
2109     case OP_CRMINQUERY:
2110     ADD_ACTIVE(next_state_offset + 1, 0);
2111     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2112     break;
2113    
2114     case OP_CRRANGE:
2115     case OP_CRMINRANGE:
2116     count = current_state->count; /* Already matched */
2117     if (count >= GET2(ecode, 1))
2118     { ADD_ACTIVE(next_state_offset + 5, 0); }
2119     if (isinclass)
2120     {
2121 nigel 91 int max = GET2(ecode, 3);
2122     if (++count >= max && max != 0) /* Max 0 => no limit */
2123 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2124     else
2125     { ADD_NEW(state_offset, count); }
2126     }
2127     break;
2128    
2129     default:
2130     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2131     break;
2132     }
2133     }
2134     break;
2135    
2136     /* ========================================================================== */
2137     /* These are the opcodes for fancy brackets of various kinds. We have
2138     to use recursion in order to handle them. */
2139    
2140     case OP_ASSERT:
2141     case OP_ASSERT_NOT:
2142     case OP_ASSERTBACK:
2143     case OP_ASSERTBACK_NOT:
2144     {
2145     int rc;
2146     int local_offsets[2];
2147     int local_workspace[1000];
2148     const uschar *endasscode = code + GET(code, 1);
2149    
2150     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2151    
2152     rc = internal_dfa_exec(
2153     md, /* static match data */
2154     code, /* this subexpression's code */
2155     ptr, /* where we currently are */
2156     ptr - start_subject, /* start offset */
2157     local_offsets, /* offset vector */
2158     sizeof(local_offsets)/sizeof(int), /* size of same */
2159     local_workspace, /* workspace vector */
2160     sizeof(local_workspace)/sizeof(int), /* size of same */
2161     ims, /* the current ims flags */
2162     rlevel, /* function recursion level */
2163     recursing); /* pass on regex recursion */
2164    
2165     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2166     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2167     }
2168     break;
2169    
2170     /*-----------------------------------------------------------------*/
2171     case OP_COND:
2172 nigel 93 case OP_SCOND:
2173 nigel 77 {
2174     int local_offsets[1000];
2175     int local_workspace[1000];
2176     int condcode = code[LINK_SIZE+1];
2177    
2178 nigel 93 /* Back reference conditions are not supported */
2179 nigel 77
2180 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2181    
2182     /* The DEFINE condition is always false */
2183    
2184     if (condcode == OP_DEF)
2185 nigel 77 {
2186 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2187     }
2188    
2189     /* The only supported version of OP_RREF is for the value RREF_ANY,
2190     which means "test if in any recursion". We can't test for specifically
2191     recursed groups. */
2192    
2193     else if (condcode == OP_RREF)
2194     {
2195 nigel 77 int value = GET2(code, LINK_SIZE+2);
2196 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2197 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2198     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2199     }
2200    
2201     /* Otherwise, the condition is an assertion */
2202    
2203     else
2204     {
2205     int rc;
2206     const uschar *asscode = code + LINK_SIZE + 1;
2207     const uschar *endasscode = asscode + GET(asscode, 1);
2208    
2209     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2210    
2211     rc = internal_dfa_exec(
2212     md, /* fixed match data */
2213     asscode, /* this subexpression's code */
2214     ptr, /* where we currently are */
2215     ptr - start_subject, /* start offset */
2216     local_offsets, /* offset vector */
2217     sizeof(local_offsets)/sizeof(int), /* size of same */
2218     local_workspace, /* workspace vector */
2219     sizeof(local_workspace)/sizeof(int), /* size of same */
2220     ims, /* the current ims flags */
2221     rlevel, /* function recursion level */
2222     recursing); /* pass on regex recursion */
2223    
2224     if ((rc >= 0) ==
2225     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2226     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2227     else
2228     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2229     }
2230     }
2231     break;
2232    
2233     /*-----------------------------------------------------------------*/
2234     case OP_RECURSE:
2235     {
2236     int local_offsets[1000];
2237     int local_workspace[1000];
2238     int rc;
2239    
2240     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2241     recursing + 1));
2242    
2243     rc = internal_dfa_exec(
2244     md, /* fixed match data */
2245     start_code + GET(code, 1), /* this subexpression's code */
2246     ptr, /* where we currently are */
2247     ptr - start_subject, /* start offset */
2248     local_offsets, /* offset vector */
2249     sizeof(local_offsets)/sizeof(int), /* size of same */
2250     local_workspace, /* workspace vector */
2251     sizeof(local_workspace)/sizeof(int), /* size of same */
2252     ims, /* the current ims flags */
2253     rlevel, /* function recursion level */
2254     recursing + 1); /* regex recurse level */
2255    
2256     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2257     recursing + 1, rc));
2258    
2259     /* Ran out of internal offsets */
2260    
2261     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2262    
2263     /* For each successful matched substring, set up the next state with a
2264     count of characters to skip before trying it. Note that the count is in
2265     characters, not bytes. */
2266    
2267     if (rc > 0)
2268     {
2269     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2270     {
2271     const uschar *p = start_subject + local_offsets[rc];
2272     const uschar *pp = start_subject + local_offsets[rc+1];
2273     int charcount = local_offsets[rc+1] - local_offsets[rc];
2274     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2275     if (charcount > 0)
2276     {
2277     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2278     }
2279     else
2280     {
2281     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2282     }
2283     }
2284     }
2285     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2286     }
2287     break;
2288    
2289     /*-----------------------------------------------------------------*/
2290     case OP_ONCE:
2291     {
2292     int local_offsets[2];
2293     int local_workspace[1000];
2294    
2295     int rc = internal_dfa_exec(
2296     md, /* fixed match data */
2297     code, /* this subexpression's code */
2298     ptr, /* where we currently are */
2299     ptr - start_subject, /* start offset */
2300     local_offsets, /* offset vector */
2301     sizeof(local_offsets)/sizeof(int), /* size of same */
2302     local_workspace, /* workspace vector */
2303     sizeof(local_workspace)/sizeof(int), /* size of same */
2304     ims, /* the current ims flags */
2305     rlevel, /* function recursion level */
2306     recursing); /* pass on regex recursion */
2307    
2308     if (rc >= 0)
2309     {
2310     const uschar *end_subpattern = code;
2311     int charcount = local_offsets[1] - local_offsets[0];
2312     int next_state_offset, repeat_state_offset;
2313    
2314     do { end_subpattern += GET(end_subpattern, 1); }
2315     while (*end_subpattern == OP_ALT);
2316     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2317    
2318     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2319     arrange for the repeat state also to be added to the relevant list.
2320     Calculate the offset, or set -1 for no repeat. */
2321    
2322     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2323     *end_subpattern == OP_KETRMIN)?
2324     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2325    
2326     /* If we have matched an empty string, add the next state at the
2327     current character pointer. This is important so that the duplicate
2328     checking kicks in, which is what breaks infinite loops that match an
2329     empty string. */
2330    
2331     if (charcount == 0)
2332     {
2333     ADD_ACTIVE(next_state_offset, 0);
2334     }
2335    
2336     /* Optimization: if there are no more active states, and there
2337     are no new states yet set up, then skip over the subject string
2338     right here, to save looping. Otherwise, set up the new state to swing
2339     into action when the end of the substring is reached. */
2340    
2341     else if (i + 1 >= active_count && new_count == 0)
2342     {
2343     ptr += charcount;
2344     clen = 0;
2345     ADD_NEW(next_state_offset, 0);
2346    
2347     /* If we are adding a repeat state at the new character position,
2348     we must fudge things so that it is the only current state.
2349     Otherwise, it might be a duplicate of one we processed before, and
2350     that would cause it to be skipped. */
2351    
2352     if (repeat_state_offset >= 0)
2353     {
2354     next_active_state = active_states;
2355     active_count = 0;
2356     i = -1;
2357     ADD_ACTIVE(repeat_state_offset, 0);
2358     }
2359     }
2360     else
2361     {
2362     const uschar *p = start_subject + local_offsets[0];
2363     const uschar *pp = start_subject + local_offsets[1];
2364     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2365     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2366     if (repeat_state_offset >= 0)
2367     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2368     }
2369    
2370     }
2371     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2372     }
2373     break;
2374    
2375    
2376     /* ========================================================================== */
2377     /* Handle callouts */
2378    
2379     case OP_CALLOUT:
2380     if (pcre_callout != NULL)
2381     {
2382     int rrc;
2383     pcre_callout_block cb;
2384     cb.version = 1; /* Version 1 of the callout block */
2385     cb.callout_number = code[1];
2386     cb.offset_vector = offsets;
2387 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2388 nigel 77 cb.subject_length = end_subject - start_subject;
2389     cb.start_match = current_subject - start_subject;
2390     cb.current_position = ptr - start_subject;
2391     cb.pattern_position = GET(code, 2);
2392     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2393     cb.capture_top = 1;
2394     cb.capture_last = -1;
2395     cb.callout_data = md->callout_data;
2396     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2397     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2398     }
2399     break;
2400    
2401    
2402     /* ========================================================================== */
2403     default: /* Unsupported opcode */
2404     return PCRE_ERROR_DFA_UITEM;
2405     }
2406    
2407     NEXT_ACTIVE_STATE: continue;
2408    
2409     } /* End of loop scanning active states */
2410    
2411     /* We have finished the processing at the current subject character. If no
2412     new states have been set for the next character, we have found all the
2413     matches that we are going to find. If we are at the top level and partial
2414     matching has been requested, check for appropriate conditions. */
2415    
2416     if (new_count <= 0)
2417     {
2418     if (match_count < 0 && /* No matches found */
2419     rlevel == 1 && /* Top level match function */
2420     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2421     ptr >= end_subject && /* Reached end of subject */
2422     ptr > current_subject) /* Matched non-empty string */
2423     {
2424     if (offsetcount >= 2)
2425     {
2426     offsets[0] = current_subject - start_subject;
2427     offsets[1] = end_subject - start_subject;
2428     }
2429     match_count = PCRE_ERROR_PARTIAL;
2430     }
2431    
2432     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2433     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2434     rlevel*2-2, SP));
2435 nigel 91 break; /* In effect, "return", but see the comment below */
2436 nigel 77 }
2437    
2438     /* One or more states are active for the next character. */
2439    
2440     ptr += clen; /* Advance to next subject character */
2441     } /* Loop to move along the subject string */
2442    
2443 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2444     if we use "return" above, we have compiler trouble. Some compilers warn if
2445     there's nothing here because they think the function doesn't return a value. On
2446     the other hand, if we put a dummy statement here, some more clever compilers
2447     complain that it can't be reached. Sigh. */
2448 nigel 77
2449 nigel 91 return match_count;
2450 nigel 77 }
2451    
2452    
2453    
2454    
2455     /*************************************************
2456     * Execute a Regular Expression - DFA engine *
2457     *************************************************/
2458    
2459     /* This external function applies a compiled re to a subject string using a DFA
2460     engine. This function calls the internal function multiple times if the pattern
2461     is not anchored.
2462    
2463     Arguments:
2464     argument_re points to the compiled expression
2465 ph10 97 extra_data points to extra data or is NULL
2466 nigel 77 subject points to the subject string
2467     length length of subject string (may contain binary zeros)
2468     start_offset where to start in the subject string
2469     options option bits
2470     offsets vector of match offsets
2471     offsetcount size of same
2472     workspace workspace vector
2473     wscount size of same
2474    
2475     Returns: > 0 => number of match offset pairs placed in offsets
2476     = 0 => offsets overflowed; longest matches are present
2477     -1 => failed to match
2478     < -1 => some kind of unexpected problem
2479     */
2480    
2481 ph10 145 PCRE_EXP_DEFN int
2482 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2483     const char *subject, int length, int start_offset, int options, int *offsets,
2484     int offsetcount, int *workspace, int wscount)
2485     {
2486     real_pcre *re = (real_pcre *)argument_re;
2487     dfa_match_data match_block;
2488 nigel 91 dfa_match_data *md = &match_block;
2489 nigel 77 BOOL utf8, anchored, startline, firstline;
2490     const uschar *current_subject, *end_subject, *lcc;
2491    
2492     pcre_study_data internal_study;
2493     const pcre_study_data *study = NULL;
2494     real_pcre internal_re;
2495    
2496     const uschar *req_byte_ptr;
2497     const uschar *start_bits = NULL;
2498     BOOL first_byte_caseless = FALSE;
2499     BOOL req_byte_caseless = FALSE;
2500     int first_byte = -1;
2501     int req_byte = -1;
2502     int req_byte2 = -1;
2503 nigel 91 int newline;
2504 nigel 77
2505     /* Plausibility checks */
2506    
2507     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2508     if (re == NULL || subject == NULL || workspace == NULL ||
2509     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2510     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2511     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2512    
2513     /* We need to find the pointer to any study data before we test for byte
2514     flipping, so we scan the extra_data block first. This may set two fields in the
2515     match block, so we must initialize them beforehand. However, the other fields
2516     in the match block must not be set until after the byte flipping. */
2517    
2518 nigel 91 md->tables = re->tables;
2519     md->callout_data = NULL;
2520 nigel 77
2521     if (extra_data != NULL)
2522     {
2523     unsigned int flags = extra_data->flags;
2524     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2525     study = (const pcre_study_data *)extra_data->study_data;
2526     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2527 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2528     return PCRE_ERROR_DFA_UMLIMIT;
2529 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2530 nigel 91 md->callout_data = extra_data->callout_data;
2531 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2532 nigel 91 md->tables = extra_data->tables;
2533 nigel 77 }
2534    
2535     /* Check that the first field in the block is the magic number. If it is not,
2536     test for a regex that was compiled on a host of opposite endianness. If this is
2537     the case, flipped values are put in internal_re and internal_study if there was
2538     study data too. */
2539    
2540     if (re->magic_number != MAGIC_NUMBER)
2541     {
2542     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2543     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2544     if (study != NULL) study = &internal_study;
2545     }
2546    
2547     /* Set some local values */
2548    
2549     current_subject = (const unsigned char *)subject + start_offset;
2550     end_subject = (const unsigned char *)subject + length;
2551     req_byte_ptr = current_subject - 1;
2552    
2553 nigel 91 #ifdef SUPPORT_UTF8
2554 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2555 nigel 91 #else
2556     utf8 = FALSE;
2557     #endif
2558 nigel 77
2559 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2560     (re->options & PCRE_ANCHORED) != 0;
2561    
2562 nigel 77 /* The remaining fixed data for passing around. */
2563    
2564 nigel 91 md->start_code = (const uschar *)argument_re +
2565 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2566 nigel 91 md->start_subject = (const unsigned char *)subject;
2567     md->end_subject = end_subject;
2568     md->moptions = options;
2569     md->poptions = re->options;
2570 nigel 77
2571 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2572     nothing is set at run time, whatever was used at compile time applies. */
2573 nigel 91
2574 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2575 nigel 93 PCRE_NEWLINE_BITS)
2576 nigel 91 {
2577 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2578 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2579     case PCRE_NEWLINE_LF: newline = '\n'; break;
2580     case PCRE_NEWLINE_CR+
2581     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2582 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2583 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2584 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2585 nigel 91 }
2586    
2587 ph10 149 if (newline == -2)
2588 nigel 91 {
2589 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2590     }
2591     else if (newline < 0)
2592     {
2593 nigel 93 md->nltype = NLTYPE_ANY;
2594 nigel 91 }
2595     else
2596     {
2597 nigel 93 md->nltype = NLTYPE_FIXED;
2598     if (newline > 255)
2599     {
2600     md->nllen = 2;
2601     md->nl[0] = (newline >> 8) & 255;
2602     md->nl[1] = newline & 255;
2603     }
2604     else
2605     {
2606     md->nllen = 1;
2607     md->nl[0] = newline;
2608     }
2609 nigel 91 }
2610    
2611 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2612     back the character offset. */
2613    
2614     #ifdef SUPPORT_UTF8
2615     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2616     {
2617     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2618     return PCRE_ERROR_BADUTF8;
2619     if (start_offset > 0 && start_offset < length)
2620     {
2621     int tb = ((uschar *)subject)[start_offset];
2622     if (tb > 127)
2623     {
2624     tb &= 0xc0;
2625     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2626     }
2627     }
2628     }
2629     #endif
2630    
2631     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2632     is a feature that makes it possible to save compiled regex and re-use them
2633     in other programs later. */
2634    
2635 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2636 nigel 77
2637     /* The lower casing table and the "must be at the start of a line" flag are
2638     used in a loop when finding where to start. */
2639    
2640 nigel 91 lcc = md->tables + lcc_offset;
2641 nigel 77 startline = (re->options & PCRE_STARTLINE) != 0;
2642     firstline = (re->options & PCRE_FIRSTLINE) != 0;
2643    
2644     /* Set up the first character to match, if available. The first_byte value is
2645     never set for an anchored regular expression, but the anchoring may be forced
2646     at run time, so we have to test for anchoring. The first char may be unset for
2647     an unanchored pattern, of course. If there's no first char and the pattern was
2648     studied, there may be a bitmap of possible first characters. */
2649    
2650     if (!anchored)
2651     {
2652     if ((re->options & PCRE_FIRSTSET) != 0)
2653     {
2654     first_byte = re->first_byte & 255;
2655     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2656     first_byte = lcc[first_byte];
2657     }
2658     else
2659     {
2660     if (startline && study != NULL &&
2661     (study->options & PCRE_STUDY_MAPPED) != 0)
2662     start_bits = study->start_bits;
2663     }
2664     }
2665    
2666     /* For anchored or unanchored matches, there may be a "last known required
2667     character" set. */
2668    
2669     if ((re->options & PCRE_REQCHSET) != 0)
2670     {
2671     req_byte = re->req_byte & 255;
2672     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2673 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2674 nigel 77 }
2675    
2676     /* Call the main matching function, looping for a non-anchored regex after a
2677     failed match. Unless restarting, optimize by moving to the first match
2678     character if possible, when not anchored. Then unless wanting a partial match,
2679     check for a required later character. */
2680    
2681     for (;;)
2682     {
2683     int rc;
2684    
2685     if ((options & PCRE_DFA_RESTART) == 0)
2686     {
2687     const uschar *save_end_subject = end_subject;
2688    
2689     /* Advance to a unique first char if possible. If firstline is TRUE, the
2690     start of the match is constrained to the first line of a multiline string.
2691 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2692     scanning at a newline. If the match fails at the newline, later code breaks
2693     this loop. */
2694 nigel 77
2695     if (firstline)
2696     {
2697     const uschar *t = current_subject;
2698 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2699 nigel 77 end_subject = t;
2700     }
2701    
2702     if (first_byte >= 0)
2703     {
2704     if (first_byte_caseless)
2705     while (current_subject < end_subject &&
2706     lcc[*current_subject] != first_byte)
2707     current_subject++;
2708     else
2709     while (current_subject < end_subject && *current_subject != first_byte)
2710     current_subject++;
2711     }
2712    
2713 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2714 nigel 77
2715     else if (startline)
2716     {
2717 nigel 93 if (current_subject > md->start_subject + start_offset)
2718 nigel 77 {
2719 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2720 nigel 77 current_subject++;
2721 ph10 130
2722 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2723     ANYCRLF, and we are now at a LF, advance the match position by one more
2724     character. */
2725 ph10 134
2726 ph10 130 if (current_subject[-1] == '\r' &&
2727 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2728 ph10 130 current_subject < end_subject &&
2729     *current_subject == '\n')
2730     current_subject++;
2731 nigel 77 }
2732     }
2733    
2734     /* Or to a non-unique first char after study */
2735    
2736     else if (start_bits != NULL)
2737     {
2738     while (current_subject < end_subject)
2739     {
2740     register unsigned int c = *current_subject;
2741     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2742     else break;
2743     }
2744     }
2745    
2746     /* Restore fudged end_subject */
2747    
2748     end_subject = save_end_subject;
2749     }
2750    
2751     /* If req_byte is set, we know that that character must appear in the subject
2752     for the match to succeed. If the first character is set, req_byte must be
2753     later in the subject; otherwise the test starts at the match point. This
2754     optimization can save a huge amount of work in patterns with nested unlimited
2755     repeats that aren't going to match. Writing separate code for cased/caseless
2756     versions makes it go faster, as does using an autoincrement and backing off
2757     on a match.
2758    
2759     HOWEVER: when the subject string is very, very long, searching to its end can
2760     take a long time, and give bad performance on quite ordinary patterns. This
2761     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2762     don't do this when the string is sufficiently long.
2763    
2764     ALSO: this processing is disabled when partial matching is requested.
2765     */
2766    
2767     if (req_byte >= 0 &&
2768     end_subject - current_subject < REQ_BYTE_MAX &&
2769     (options & PCRE_PARTIAL) == 0)
2770     {
2771     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2772    
2773     /* We don't need to repeat the search if we haven't yet reached the
2774     place we found it at last time. */
2775    
2776     if (p > req_byte_ptr)
2777     {
2778     if (req_byte_caseless)
2779     {
2780     while (p < end_subject)
2781     {
2782     register int pp = *p++;
2783     if (pp == req_byte || pp == req_byte2) { p--; break; }
2784     }
2785     }
2786     else
2787     {
2788     while (p < end_subject)
2789     {
2790     if (*p++ == req_byte) { p--; break; }
2791     }
2792     }
2793    
2794     /* If we can't find the required character, break the matching loop,
2795     which will cause a return of PCRE_ERROR_NOMATCH. */
2796    
2797     if (p >= end_subject) break;
2798    
2799     /* If we have found the required character, save the point where we
2800     found it, so that we don't search again next time round the loop if
2801     the start hasn't passed this character yet. */
2802    
2803     req_byte_ptr = p;
2804     }
2805     }
2806    
2807     /* OK, now we can do the business */
2808    
2809     rc = internal_dfa_exec(
2810 nigel 91 md, /* fixed match data */
2811     md->start_code, /* this subexpression's code */
2812     current_subject, /* where we currently are */
2813     start_offset, /* start offset in subject */
2814     offsets, /* offset vector */
2815     offsetcount, /* size of same */
2816     workspace, /* workspace vector */
2817     wscount, /* size of same */
2818 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2819 nigel 91 0, /* function recurse level */
2820     0); /* regex recurse level */
2821 nigel 77
2822     /* Anything other than "no match" means we are done, always; otherwise, carry
2823     on only if not anchored. */
2824    
2825     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2826    
2827     /* Advance to the next subject character unless we are at the end of a line
2828     and firstline is set. */
2829    
2830 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2831 nigel 77 current_subject++;
2832     if (utf8)
2833     {
2834     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2835     current_subject++;
2836     }
2837     if (current_subject > end_subject) break;
2838    
2839 ph10 150 /* If we have just passed a CR and the newline option is CRLF or ANY or
2840 ph10 149 ANYCRLF, and we are now at a LF, advance the match position by one more
2841     character. */
2842 nigel 93
2843     if (current_subject[-1] == '\r' &&
2844 ph10 150 (md->nltype == NLTYPE_ANY ||
2845     md->nltype == NLTYPE_ANYCRLF ||
2846 ph10 149 md->nllen == 2) &&
2847 nigel 93 current_subject < end_subject &&
2848     *current_subject == '\n')
2849     current_subject++;
2850    
2851     } /* "Bumpalong" loop */
2852    
2853 nigel 77 return PCRE_ERROR_NOMATCH;
2854     }
2855    
2856     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12