/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 341 - (hide annotations) (download)
Sat Apr 19 16:41:04 2008 UTC (6 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 95949 byte(s)
Fix DFA (?!) bug; add support for JavaScript empty classes.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 ph10 200 #ifdef HAVE_CONFIG_H
48 ph10 236 #include "config.h"
49 ph10 200 #endif
50 ph10 199
51 nigel 93 #define NLBLOCK md /* Block containing newline information */
52     #define PSSTART start_subject /* Field containing processed string start */
53     #define PSEND end_subject /* Field containing processed string end */
54    
55 nigel 77 #include "pcre_internal.h"
56    
57    
/* Source of spaces for indenting debugging output. It is printed with "%.*s"
and a precision of rlevel*2-2, and a precision can only shorten a string, never
pad it, so SP must be at least as long as the deepest indent wanted. Forty
spaces cover recursion levels up to 21; deeper levels are simply capped. */

#define SP "                                        "
61    
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
68     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
70 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
71 ph10 178 never stored, so we push them well clear of the normal opcodes. */
72 nigel 77
73 ph10 178 #define OP_PROP_EXTRA 300 /* added when the type argument is OP_PROP or OP_NOTPROP */
74     #define OP_EXTUNI_EXTRA 320 /* added when the type argument is OP_EXTUNI */
75     #define OP_ANYNL_EXTRA 340 /* added when the type argument is OP_ANYNL */
76     #define OP_HSPACE_EXTRA 360 /* added when the type argument is OP_HSPACE or OP_NOT_HSPACE */
77     #define OP_VSPACE_EXTRA 380 /* added when the type argument is OP_VSPACE or OP_NOT_VSPACE */
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes it possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified.

The table is indexed by opcode value. A zero entry means that no inline
character follows the opcode. A non-zero entry is the byte offset from the
opcode to that character: the matching loop loads it from
code + coptable[codevalue]. The entries of 3 belong to the upto, minupto, exact
and upto+ forms - presumably so that a two-byte repeat count is stepped over;
confirm against the opcode layout in pcre_internal.h. */
86 nigel 77
87 ph10 327 static const uschar coptable[] = {
88 nigel 77 0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
134     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136 nigel 77 };
137    
138     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139     and \w */
140    
/* AND mask for the character-type tests. The matching loop evaluates
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0, so this table selects the
ctype bit that opcode op is interested in. It is indexed by opcode value and
follows the same order as the start of coptable. */
141 ph10 327 static const uschar toptable1[] = {
142 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
143 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
144     ctype_space, ctype_space, /* \S, \s */
145     ctype_word, ctype_word, /* \W, \w */
146 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
147 nigel 77 };
148    
/* XOR value applied after the toptable1 mask in the character-type tests. For
the negated types it equals the mask, so a character that has the bit gives
zero (no match) and one that lacks it gives non-zero. For the positive types it
is zero, leaving the masked bit as the result. For OP_ANY and OP_ALLANY the
mask is zero, so the value 1 makes the test always non-zero. Indexed by opcode
value, in the same order as the start of coptable. */
149 ph10 327 static const uschar toptable2[] = {
150 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
151 nigel 77 ctype_digit, 0, /* \D (inverted), \d */
152     ctype_space, 0, /* \S (inverted), \s */
153     ctype_word, 0, /* \W (inverted), \w */
154 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
155 nigel 77 };
156    
157    
/* Structure for holding the data about one particular state, which is in
effect the current data for an active path through the match tree. It must
consist entirely of ints because the working vector we are passed, and in which
these structures are placed, is a vector of ints. */

typedef struct stateblock {
  int offset;   /* Offset of the opcode from the start of the compiled code; a
                   negative value holds the state off until data reaches zero */
  int count;    /* Count for repeats (number already matched) */
  int ims;      /* ims flag bits in force for this state */
  int data;     /* Extra data used by some states, e.g. the number of
                   characters still to be skipped by a held-off state */
} stateblock;

/* Number of ints occupied by one stateblock. The value is cast to int so that
arithmetic mixing it with int workspace sizes stays signed; without the cast
the size_t result of sizeof would convert the int operands to unsigned. */

#define INTS_PER_STATEBLOCK ((int)(sizeof(stateblock)/sizeof(int)))
171    
172    
173     #ifdef DEBUG
174     /*************************************************
175     * Print character string *
176     *************************************************/
177    
/* Debugging helper that writes a byte string to a stream in a readable form.
Bytes for which isprint() is true are written unchanged; every other byte is
written as a backslash, the letter x, and two lower-case hexadecimal digits.

Arguments:
  p           points to the first byte of the string
  length      number of bytes to write; nothing is written if it is <= 0
  f           the stream to write to

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
int i;
for (i = 0; i < length; i++)
  {
  int ch = p[i];
  fprintf(f, isprint(ch)? "%c" : "\\x%02x", ch);
  }
}
200     #endif
201    
202    
203    
204     /*************************************************
205     * Execute a Regular Expression - DFA engine *
206     *************************************************/
207    
208     /* This internal function applies a compiled pattern to a subject string,
209     starting at a given point, using a DFA engine. This function is called from the
210     external one, possibly multiple times if the pattern is not anchored. The
211     function calls itself recursively for some kinds of subpattern.
212    
213     Arguments:
214     md the match_data block with fixed information
215     this_start_code the opening bracket of this subexpression's code
216     current_subject where we currently are in the subject string
217     start_offset start offset in the subject string
218     offsets vector to contain the matching string offsets
219     offsetcount size of same
220     workspace vector of workspace
221     wscount size of same
222     ims the current ims flags
223     rlevel function call recursion level
224     recursing regex recursive call level
225    
226 ph10 341 Returns: > 0 => number of match offset pairs placed in offsets
227     = 0 => offsets overflowed; longest matches are present
228 nigel 77 -1 => failed to match
229     < -1 => some kind of unexpected problem
230    
231     The following macros are used for adding states to the two state vectors (one
232     for the current character, one for the following character). */
233    
/* Append a state with opcode offset x and repeat count y to the active list,
stamping it with the current ims value. The data member is left untouched. If
the list already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. active_count is incremented in either case. The macro
relies on active_count, wscount, next_active_state, ims and rlevel being in
scope at the point of use. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
244    
/* As ADD_ACTIVE, but additionally stores z in the data member of the new
state. If the active list already holds wscount states, the enclosing function
returns PCRE_ERROR_DFA_WSSIZE. active_count is incremented in either case. The
macro relies on active_count, wscount, next_active_state, ims and rlevel being
in scope at the point of use. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
256    
/* Append a state with opcode offset x and repeat count y to the new list (the
one for the following character), stamping it with the current ims value. The
data member is left untouched. If the list already holds wscount states, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. new_count is incremented in
either case. The macro relies on new_count, wscount, next_new_state, ims and
rlevel being in scope at the point of use. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
267    
/* As ADD_NEW, but additionally stores z in the data member of the new state.
If the new list already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. new_count is incremented in either case. The macro
relies on new_count, wscount, next_new_state, ims and rlevel being in scope at
the point of use. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
279    
280     /* And now, here is the code */
281    
282     static int
283     internal_dfa_exec(
284     dfa_match_data *md,
285     const uschar *this_start_code,
286     const uschar *current_subject,
287     int start_offset,
288     int *offsets,
289     int offsetcount,
290     int *workspace,
291     int wscount,
292     int ims,
293     int rlevel,
294     int recursing)
295     {
296     stateblock *active_states, *new_states, *temp_states;
297     stateblock *next_active_state, *next_new_state;
298    
299     const uschar *ctypes, *lcc, *fcc;
300     const uschar *ptr;
301 nigel 93 const uschar *end_code, *first_op;
302 nigel 77
303     int active_count, new_count, match_count;
304    
305     /* Some fields in the md block are frequently referenced, so we load them into
306     independent variables in the hope that this will perform better. */
307    
308     const uschar *start_subject = md->start_subject;
309     const uschar *end_subject = md->end_subject;
310     const uschar *start_code = md->start_code;
311    
312 nigel 87 #ifdef SUPPORT_UTF8
313 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 nigel 93 #else
315     BOOL utf8 = FALSE;
316 nigel 87 #endif
317 nigel 77
318     rlevel++;
319     offsetcount &= (-2);
320    
321     wscount -= 2;
322     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323     (2 * INTS_PER_STATEBLOCK);
324    
325     DPRINTF(("\n%.*s---------------------\n"
326     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328    
329     ctypes = md->tables + ctypes_offset;
330     lcc = md->tables + lcc_offset;
331     fcc = md->tables + fcc_offset;
332    
333     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334    
335     active_states = (stateblock *)(workspace + 2);
336     next_new_state = new_states = active_states + wscount;
337     new_count = 0;
338    
339 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
340     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343     the alternative states onto the list, and find out where the end is. This
344     makes it possible to use this function recursively, when we want to stop at a
345     matching internal ket rather than at the end.
346    
347     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348     a backward assertion. In that case, we have to find out the maximum amount to
349     move back, and set up each alternative appropriately. */
350    
351 nigel 93 if (*first_op == OP_REVERSE)
352 nigel 77 {
353     int max_back = 0;
354     int gone_back;
355    
356     end_code = this_start_code;
357     do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363     while (*end_code == OP_ALT);
364    
365     /* If we can't go back the amount required for the longest lookbehind
366     pattern, go back as far as we can; some alternatives may still be viable. */
367    
368     #ifdef SUPPORT_UTF8
369     /* In character mode we have to step back character by character */
370    
371     if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374     {
375     if (current_subject <= start_subject) break;
376     current_subject--;
377     while (current_subject > start_subject &&
378     (*current_subject & 0xc0) == 0x80)
379     current_subject--;
380     }
381     }
382     else
383     #endif
384    
385     /* In byte-mode we can do this quickly. */
386    
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389     current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392    
393     /* Now we can process the individual branches. */
394    
395     end_code = this_start_code;
396     do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400     {
401     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402     ADD_NEW_DATA(-bstate, 0, gone_back - back);
403     }
404     end_code += GET(end_code, 1);
405     }
406     while (*end_code == OP_ALT);
407     }
408    
409     /* This is the code for a "normal" subpattern (not a backward assertion). The
410     start of a whole pattern is always one of these. If we are at the top level,
411     we may be asked to restart matching from the same point that we reached for a
412     previous partial match. We still have to scan through the top-level branches to
413     find the end state. */
414    
415     else
416     {
417     end_code = this_start_code;
418    
419     /* Restarting */
420    
421     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426     memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428    
429     /* Not restarting */
430    
431     else
432     {
433 nigel 93 int length = 1 + LINK_SIZE +
434     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 nigel 77 do
436     {
437 nigel 93 ADD_NEW(end_code - start_code + length, 0);
438 nigel 77 end_code += GET(end_code, 1);
439 nigel 93 length = 1 + LINK_SIZE;
440 nigel 77 }
441     while (*end_code == OP_ALT);
442     }
443     }
444    
445     workspace[0] = 0; /* Bit indicating which vector is current */
446    
447     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448    
449     /* Loop for scanning the subject */
450    
451     ptr = current_subject;
452     for (;;)
453     {
454     int i, j;
455 nigel 91 int clen, dlen;
456     unsigned int c, d;
457 nigel 77
458     /* Make the new state list into the active state list and empty the
459     new state list. */
460    
461     temp_states = active_states;
462     active_states = new_states;
463     new_states = temp_states;
464     active_count = new_count;
465     new_count = 0;
466    
467     workspace[0] ^= 1; /* Remember for the restarting feature */
468     workspace[1] = active_count;
469    
470     #ifdef DEBUG
471     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473     printf("\"\n");
474    
475     printf("%.*sActive states: ", rlevel*2-2, SP);
476     for (i = 0; i < active_count; i++)
477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
478     printf("\n");
479     #endif
480    
481     /* Set the pointers for adding new states */
482    
483     next_active_state = active_states + active_count;
484     next_new_state = new_states;
485    
486     /* Load the current character from the subject outside the loop, as many
487     different states may want to look at it, and we assume that at least one
488     will. */
489    
490     if (ptr < end_subject)
491     {
492 nigel 93 clen = 1; /* Number of bytes in the character */
493 nigel 77 #ifdef SUPPORT_UTF8
494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
495     #endif /* SUPPORT_UTF8 */
496     c = *ptr;
497     }
498     else
499     {
500 nigel 93 clen = 0; /* This indicates the end of the subject */
501     c = NOTACHAR; /* This value should never actually be used */
502 nigel 77 }
503    
504     /* Scan up the active states and act on each one. The result of an action
505     may be to add more states to the currently active list (e.g. on hitting a
506     parenthesis) or it may be to put states on the new list, for considering
507     when we move the character pointer on. */
508    
509     for (i = 0; i < active_count; i++)
510     {
511     stateblock *current_state = active_states + i;
512     const uschar *code;
513     int state_offset = current_state->offset;
514     int count, codevalue;
515 ph10 152 #ifdef SUPPORT_UCP
516 nigel 87 int chartype, script;
517 ph10 152 #endif
518 nigel 77
519     #ifdef DEBUG
520     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
521 nigel 93 if (clen == 0) printf("EOL\n");
522 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
523     else printf("0x%02x\n", c);
524     #endif
525    
526     /* This variable is referred to implicitly in the ADD_xxx macros. */
527    
528     ims = current_state->ims;
529    
530     /* A negative offset is a special case meaning "hold off going to this
531     (negated) state until the number of characters in the data field have
532     been skipped". */
533    
534     if (state_offset < 0)
535     {
536     if (current_state->data > 0)
537     {
538     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
539     ADD_NEW_DATA(state_offset, current_state->count,
540     current_state->data - 1);
541     continue;
542     }
543     else
544     {
545     current_state->offset = state_offset = -state_offset;
546     }
547     }
548    
549     /* Check for a duplicate state with the same count, and skip if found. */
550    
551     for (j = 0; j < i; j++)
552     {
553     if (active_states[j].offset == state_offset &&
554     active_states[j].count == current_state->count)
555     {
556     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
557     goto NEXT_ACTIVE_STATE;
558     }
559     }
560    
561     /* The state offset is the offset to the opcode */
562    
563     code = start_code + state_offset;
564     codevalue = *code;
565    
566     /* If this opcode is followed by an inline character, load it. It is
567     tempting to test for the presence of a subject character here, but that
568     is wrong, because sometimes zero repetitions of the subject are
569     permitted.
570    
571     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572 ph10 178 argument that is not a data character - but is always one byte long. We
573     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
574     this case. To keep the other cases fast, convert these ones to new opcodes.
575     */
576 nigel 77
577     if (coptable[codevalue] > 0)
578     {
579     dlen = 1;
580     #ifdef SUPPORT_UTF8
581     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
582     #endif /* SUPPORT_UTF8 */
583     d = code[coptable[codevalue]];
584     if (codevalue >= OP_TYPESTAR)
585     {
586 nigel 93 switch(d)
587     {
588     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
589     case OP_NOTPROP:
590     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
593 ph10 178 case OP_NOT_HSPACE:
594 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
595 ph10 178 case OP_NOT_VSPACE:
596 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597 nigel 93 default: break;
598     }
599 nigel 77 }
600     }
601     else
602     {
603     dlen = 0; /* Not strictly necessary, but compilers moan */
604 nigel 93 d = NOTACHAR; /* if these variables are not set. */
605 nigel 77 }
606    
607    
608     /* Now process the individual opcodes */
609    
610     switch (codevalue)
611     {
612    
613     /* ========================================================================== */
614     /* Reached a closing bracket. If not at the end of the pattern, carry
615     on with the next opcode. Otherwise, unless we have an empty string and
616     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
617     matches so we always have the longest first. */
618    
619     case OP_KET:
620     case OP_KETRMIN:
621     case OP_KETRMAX:
622     if (code != end_code)
623     {
624     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
625     if (codevalue != OP_KET)
626     {
627     ADD_ACTIVE(state_offset - GET(code, 1), 0);
628     }
629     }
630     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
631     {
632     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
633     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
634     match_count = 0;
635     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
636     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
637     if (offsetcount >= 2)
638     {
639     offsets[0] = current_subject - start_subject;
640     offsets[1] = ptr - start_subject;
641     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
642     offsets[1] - offsets[0], current_subject));
643     }
644     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
645     {
646     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
647     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
648     match_count, rlevel*2-2, SP));
649     return match_count;
650     }
651     }
652     break;
653    
654     /* ========================================================================== */
655     /* These opcodes add to the current list of states without looking
656     at the current character. */
657    
658     /*-----------------------------------------------------------------*/
659     case OP_ALT:
660     do { code += GET(code, 1); } while (*code == OP_ALT);
661     ADD_ACTIVE(code - start_code, 0);
662     break;
663    
664     /*-----------------------------------------------------------------*/
665     case OP_BRA:
666 nigel 93 case OP_SBRA:
667 nigel 77 do
668     {
669     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
670     code += GET(code, 1);
671     }
672     while (*code == OP_ALT);
673     break;
674    
675     /*-----------------------------------------------------------------*/
676 nigel 93 case OP_CBRA:
677     case OP_SCBRA:
678     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
679     code += GET(code, 1);
680     while (*code == OP_ALT)
681     {
682     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
683     code += GET(code, 1);
684     }
685     break;
686    
687     /*-----------------------------------------------------------------*/
688 nigel 77 case OP_BRAZERO:
689     case OP_BRAMINZERO:
690     ADD_ACTIVE(state_offset + 1, 0);
691     code += 1 + GET(code, 2);
692     while (*code == OP_ALT) code += GET(code, 1);
693     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
694     break;
695    
696     /*-----------------------------------------------------------------*/
697 ph10 335 case OP_SKIPZERO:
698     code += 1 + GET(code, 2);
699     while (*code == OP_ALT) code += GET(code, 1);
700     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
701     break;
702    
703     /*-----------------------------------------------------------------*/
704 nigel 77 case OP_CIRC:
705     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
706 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
707     ptr != end_subject &&
708 nigel 93 WAS_NEWLINE(ptr)))
709 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
710     break;
711    
712     /*-----------------------------------------------------------------*/
713     case OP_EOD:
714     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
715     break;
716    
717     /*-----------------------------------------------------------------*/
718     case OP_OPT:
719     ims = code[1];
720     ADD_ACTIVE(state_offset + 2, 0);
721     break;
722    
723     /*-----------------------------------------------------------------*/
724     case OP_SOD:
725     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
726     break;
727    
728     /*-----------------------------------------------------------------*/
729     case OP_SOM:
730     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
731     break;
732    
733    
734     /* ========================================================================== */
735     /* These opcodes inspect the next subject character, and sometimes
736     the previous one as well, but do not have an argument. The variable
737     clen contains the length of the current character and is zero if we are
738     at the end of the subject. */
739    
740     /*-----------------------------------------------------------------*/
741     case OP_ANY:
742 nigel 93 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
743 nigel 77 { ADD_NEW(state_offset + 1, 0); }
744     break;
745    
746     /*-----------------------------------------------------------------*/
747 ph10 341 case OP_ALLANY:
748     if (clen > 0)
749     { ADD_NEW(state_offset + 1, 0); }
750     break;
751    
752     /*-----------------------------------------------------------------*/
753 nigel 77 case OP_EODN:
754 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
755 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
756     break;
757    
758     /*-----------------------------------------------------------------*/
759     case OP_DOLL:
760     if ((md->moptions & PCRE_NOTEOL) == 0)
761     {
762 nigel 91 if (clen == 0 ||
763 nigel 93 (IS_NEWLINE(ptr) &&
764 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
765     ))
766 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
767     }
768 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
769 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
770     break;
771    
772     /*-----------------------------------------------------------------*/
773    
774     case OP_DIGIT:
775     case OP_WHITESPACE:
776     case OP_WORDCHAR:
777     if (clen > 0 && c < 256 &&
778     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
779     { ADD_NEW(state_offset + 1, 0); }
780     break;
781    
782     /*-----------------------------------------------------------------*/
783     case OP_NOT_DIGIT:
784     case OP_NOT_WHITESPACE:
785     case OP_NOT_WORDCHAR:
786     if (clen > 0 && (c >= 256 ||
787     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
788     { ADD_NEW(state_offset + 1, 0); }
789     break;
790    
791     /*-----------------------------------------------------------------*/
792     case OP_WORD_BOUNDARY:
793     case OP_NOT_WORD_BOUNDARY:
794     {
795     int left_word, right_word;
796    
797     if (ptr > start_subject)
798     {
799     const uschar *temp = ptr - 1;
800     #ifdef SUPPORT_UTF8
801     if (utf8) BACKCHAR(temp);
802     #endif
803     GETCHARTEST(d, temp);
804     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
805     }
806     else left_word = 0;
807    
808     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
809     else right_word = 0;
810    
811     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
812     { ADD_ACTIVE(state_offset + 1, 0); }
813     }
814     break;
815    
816    
817     /*-----------------------------------------------------------------*/
818     /* Check the next character by Unicode property. We will get here only
819     if the support is in the binary; otherwise a compile-time error occurs.
820     */
821    
822 ph10 151 #ifdef SUPPORT_UCP
823 nigel 77 case OP_PROP:
824     case OP_NOTPROP:
825     if (clen > 0)
826     {
827 nigel 87 BOOL OK;
828     int category = _pcre_ucp_findprop(c, &chartype, &script);
829     switch(code[1])
830 nigel 77 {
831 nigel 87 case PT_ANY:
832     OK = TRUE;
833     break;
834    
835     case PT_LAMP:
836     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
837     break;
838    
839     case PT_GC:
840     OK = category == code[2];
841     break;
842    
843     case PT_PC:
844     OK = chartype == code[2];
845     break;
846    
847     case PT_SC:
848     OK = script == code[2];
849     break;
850    
851     /* Should never occur, but keep compilers from grumbling. */
852    
853     default:
854     OK = codevalue != OP_PROP;
855     break;
856 nigel 77 }
857 nigel 87
858     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
859 nigel 77 }
860     break;
861     #endif
862    
863    
864    
865     /* ========================================================================== */
866     /* These opcodes likewise inspect the subject character, but have an
867     argument that is not a data character. It is one of these opcodes:
868 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
869     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
870 nigel 77
871     case OP_TYPEPLUS:
872     case OP_TYPEMINPLUS:
873 nigel 93 case OP_TYPEPOSPLUS:
874 nigel 77 count = current_state->count; /* Already matched */
875     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
876     if (clen > 0)
877     {
878     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
879     (c < 256 &&
880 nigel 91 (d != OP_ANY ||
881     (ims & PCRE_DOTALL) != 0 ||
882     !IS_NEWLINE(ptr)
883     ) &&
884 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
885     {
886 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
887     {
888     active_count--; /* Remove non-match possibility */
889     next_active_state--;
890     }
891 nigel 77 count++;
892     ADD_NEW(state_offset, count);
893     }
894     }
895     break;
896    
897     /*-----------------------------------------------------------------*/
898     case OP_TYPEQUERY:
899     case OP_TYPEMINQUERY:
900 nigel 93 case OP_TYPEPOSQUERY:
901 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
902     if (clen > 0)
903     {
904     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
905     (c < 256 &&
906 nigel 91 (d != OP_ANY ||
907     (ims & PCRE_DOTALL) != 0 ||
908     !IS_NEWLINE(ptr)
909     ) &&
910 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
911     {
912 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
913     {
914     active_count--; /* Remove non-match possibility */
915     next_active_state--;
916     }
917 nigel 77 ADD_NEW(state_offset + 2, 0);
918     }
919     }
920     break;
921    
922     /*-----------------------------------------------------------------*/
923     case OP_TYPESTAR:
924     case OP_TYPEMINSTAR:
925 nigel 93 case OP_TYPEPOSSTAR:
926 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
927     if (clen > 0)
928     {
929     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
930     (c < 256 &&
931 nigel 91 (d != OP_ANY ||
932     (ims & PCRE_DOTALL) != 0 ||
933     !IS_NEWLINE(ptr)
934     ) &&
935 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
936     {
937 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
938     {
939     active_count--; /* Remove non-match possibility */
940     next_active_state--;
941     }
942 nigel 77 ADD_NEW(state_offset, 0);
943     }
944     }
945     break;
946    
947     /*-----------------------------------------------------------------*/
948     case OP_TYPEEXACT:
949 nigel 93 count = current_state->count; /* Number already matched */
950     if (clen > 0)
951     {
952     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
953     (c < 256 &&
954     (d != OP_ANY ||
955     (ims & PCRE_DOTALL) != 0 ||
956     !IS_NEWLINE(ptr)
957     ) &&
958     ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
959     {
960     if (++count >= GET2(code, 1))
961     { ADD_NEW(state_offset + 4, 0); }
962     else
963     { ADD_NEW(state_offset, count); }
964     }
965     }
966     break;
967    
968     /*-----------------------------------------------------------------*/
969 nigel 77 case OP_TYPEUPTO:
970     case OP_TYPEMINUPTO:
971 nigel 93 case OP_TYPEPOSUPTO:
972     ADD_ACTIVE(state_offset + 4, 0);
973 nigel 77 count = current_state->count; /* Number already matched */
974     if (clen > 0)
975     {
976     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
977     (c < 256 &&
978 nigel 91 (d != OP_ANY ||
979     (ims & PCRE_DOTALL) != 0 ||
980     !IS_NEWLINE(ptr)
981     ) &&
982 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
983     {
984 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
985     {
986     active_count--; /* Remove non-match possibility */
987     next_active_state--;
988     }
989 nigel 77 if (++count >= GET2(code, 1))
990     { ADD_NEW(state_offset + 4, 0); }
991     else
992     { ADD_NEW(state_offset, count); }
993     }
994     }
995     break;
996    
997     /* ========================================================================== */
998     /* These are virtual opcodes that are used when something like
999 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its
1000     argument. It keeps the code above fast for the other cases. The argument
1001     is in the d variable. */
1002 nigel 77
1003 ph10 151 #ifdef SUPPORT_UCP
1004 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
1005     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1006 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1007 nigel 77 count = current_state->count; /* Already matched */
1008 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1009 nigel 77 if (clen > 0)
1010     {
1011 nigel 87 BOOL OK;
1012     int category = _pcre_ucp_findprop(c, &chartype, &script);
1013     switch(code[2])
1014     {
1015     case PT_ANY:
1016     OK = TRUE;
1017     break;
1018    
1019     case PT_LAMP:
1020     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1021     break;
1022    
1023     case PT_GC:
1024     OK = category == code[3];
1025     break;
1026    
1027     case PT_PC:
1028     OK = chartype == code[3];
1029     break;
1030    
1031     case PT_SC:
1032     OK = script == code[3];
1033     break;
1034    
1035     /* Should never occur, but keep compilers from grumbling. */
1036    
1037     default:
1038     OK = codevalue != OP_PROP;
1039     break;
1040     }
1041    
1042 nigel 93 if (OK == (d == OP_PROP))
1043     {
1044     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1045     {
1046     active_count--; /* Remove non-match possibility */
1047     next_active_state--;
1048     }
1049     count++;
1050     ADD_NEW(state_offset, count);
1051     }
1052 nigel 77 }
1053     break;
1054    
1055     /*-----------------------------------------------------------------*/
1056     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1057     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1058 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1059 nigel 77 count = current_state->count; /* Already matched */
1060     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1061 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1062 nigel 77 {
1063     const uschar *nptr = ptr + clen;
1064     int ncount = 0;
1065 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1066     {
1067     active_count--; /* Remove non-match possibility */
1068     next_active_state--;
1069     }
1070 nigel 77 while (nptr < end_subject)
1071     {
1072     int nd;
1073     int ndlen = 1;
1074     GETCHARLEN(nd, nptr, ndlen);
1075 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1076 nigel 77 ncount++;
1077     nptr += ndlen;
1078     }
1079     count++;
1080     ADD_NEW_DATA(-state_offset, count, ncount);
1081     }
1082     break;
1083 ph10 151 #endif
1084 nigel 77
1085     /*-----------------------------------------------------------------*/
1086 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1087     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1088     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1089     count = current_state->count; /* Already matched */
1090     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1091     if (clen > 0)
1092     {
1093     int ncount = 0;
1094     switch (c)
1095     {
1096     case 0x000b:
1097     case 0x000c:
1098     case 0x0085:
1099     case 0x2028:
1100     case 0x2029:
1101 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1102     goto ANYNL01;
1103    
1104     case 0x000d:
1105     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1106     /* Fall through */
1107    
1108     ANYNL01:
1109     case 0x000a:
1110 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1111     {
1112     active_count--; /* Remove non-match possibility */
1113     next_active_state--;
1114     }
1115     count++;
1116     ADD_NEW_DATA(-state_offset, count, ncount);
1117     break;
1118 ph10 231
1119 nigel 93 default:
1120     break;
1121     }
1122     }
1123     break;
1124    
1125     /*-----------------------------------------------------------------*/
1126 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1127     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1128     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1129     count = current_state->count; /* Already matched */
1130     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1131     if (clen > 0)
1132     {
1133 ph10 182 BOOL OK;
1134 ph10 178 switch (c)
1135     {
1136     case 0x000a:
1137     case 0x000b:
1138     case 0x000c:
1139     case 0x000d:
1140     case 0x0085:
1141     case 0x2028:
1142     case 0x2029:
1143     OK = TRUE;
1144 ph10 182 break;
1145 ph10 178
1146     default:
1147     OK = FALSE;
1148 ph10 182 break;
1149 ph10 178 }
1150    
1151     if (OK == (d == OP_VSPACE))
1152 ph10 182 {
1153 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1154     {
1155     active_count--; /* Remove non-match possibility */
1156     next_active_state--;
1157     }
1158     count++;
1159     ADD_NEW_DATA(-state_offset, count, 0);
1160     }
1161     }
1162     break;
1163    
1164     /*-----------------------------------------------------------------*/
1165     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1166     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1167     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1168     count = current_state->count; /* Already matched */
1169     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1170     if (clen > 0)
1171     {
1172 ph10 182 BOOL OK;
1173 ph10 178 switch (c)
1174     {
1175     case 0x09: /* HT */
1176     case 0x20: /* SPACE */
1177     case 0xa0: /* NBSP */
1178     case 0x1680: /* OGHAM SPACE MARK */
1179     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1180     case 0x2000: /* EN QUAD */
1181     case 0x2001: /* EM QUAD */
1182     case 0x2002: /* EN SPACE */
1183     case 0x2003: /* EM SPACE */
1184     case 0x2004: /* THREE-PER-EM SPACE */
1185     case 0x2005: /* FOUR-PER-EM SPACE */
1186     case 0x2006: /* SIX-PER-EM SPACE */
1187     case 0x2007: /* FIGURE SPACE */
1188     case 0x2008: /* PUNCTUATION SPACE */
1189     case 0x2009: /* THIN SPACE */
1190     case 0x200A: /* HAIR SPACE */
1191     case 0x202f: /* NARROW NO-BREAK SPACE */
1192     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1193     case 0x3000: /* IDEOGRAPHIC SPACE */
1194     OK = TRUE;
1195     break;
1196 ph10 182
1197 ph10 178 default:
1198     OK = FALSE;
1199     break;
1200     }
1201 ph10 182
1202 ph10 178 if (OK == (d == OP_HSPACE))
1203 ph10 182 {
1204 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1205     {
1206     active_count--; /* Remove non-match possibility */
1207     next_active_state--;
1208     }
1209     count++;
1210     ADD_NEW_DATA(-state_offset, count, 0);
1211     }
1212     }
1213     break;
1214    
1215     /*-----------------------------------------------------------------*/
1216 ph10 151 #ifdef SUPPORT_UCP
1217 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1218     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1219 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1220 nigel 87 count = 4;
1221 nigel 77 goto QS1;
1222    
1223     case OP_PROP_EXTRA + OP_TYPESTAR:
1224     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1225 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1226 nigel 77 count = 0;
1227    
1228     QS1:
1229    
1230 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1231 nigel 77 if (clen > 0)
1232     {
1233 nigel 87 BOOL OK;
1234     int category = _pcre_ucp_findprop(c, &chartype, &script);
1235     switch(code[2])
1236     {
1237     case PT_ANY:
1238     OK = TRUE;
1239     break;
1240    
1241     case PT_LAMP:
1242     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1243     break;
1244    
1245     case PT_GC:
1246     OK = category == code[3];
1247     break;
1248    
1249     case PT_PC:
1250     OK = chartype == code[3];
1251     break;
1252    
1253     case PT_SC:
1254     OK = script == code[3];
1255     break;
1256    
1257     /* Should never occur, but keep compilers from grumbling. */
1258    
1259     default:
1260     OK = codevalue != OP_PROP;
1261     break;
1262     }
1263    
1264 nigel 93 if (OK == (d == OP_PROP))
1265     {
1266     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1267     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1268     {
1269     active_count--; /* Remove non-match possibility */
1270     next_active_state--;
1271     }
1272     ADD_NEW(state_offset + count, 0);
1273     }
1274 nigel 77 }
1275     break;
1276    
1277     /*-----------------------------------------------------------------*/
1278     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1279     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1280 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1281 nigel 77 count = 2;
1282     goto QS2;
1283    
1284     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1285     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1286 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1287 nigel 77 count = 0;
1288    
1289     QS2:
1290    
1291     ADD_ACTIVE(state_offset + 2, 0);
1292 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1293 nigel 77 {
1294     const uschar *nptr = ptr + clen;
1295     int ncount = 0;
1296 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1297     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1298     {
1299     active_count--; /* Remove non-match possibility */
1300     next_active_state--;
1301     }
1302 nigel 77 while (nptr < end_subject)
1303     {
1304     int nd;
1305     int ndlen = 1;
1306     GETCHARLEN(nd, nptr, ndlen);
1307 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1308 nigel 77 ncount++;
1309     nptr += ndlen;
1310     }
1311     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1312     }
1313     break;
1314 ph10 151 #endif
1315 nigel 77
1316     /*-----------------------------------------------------------------*/
1317 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1318     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1319     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1320     count = 2;
1321     goto QS3;
1322    
1323     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1324     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1325     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1326     count = 0;
1327    
1328     QS3:
1329     ADD_ACTIVE(state_offset + 2, 0);
1330     if (clen > 0)
1331     {
1332     int ncount = 0;
1333     switch (c)
1334     {
1335     case 0x000b:
1336     case 0x000c:
1337     case 0x0085:
1338     case 0x2028:
1339     case 0x2029:
1340 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1341     goto ANYNL02;
1342    
1343     case 0x000d:
1344     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1345     /* Fall through */
1346    
1347     ANYNL02:
1348     case 0x000a:
1349 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1350     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1351     {
1352     active_count--; /* Remove non-match possibility */
1353     next_active_state--;
1354     }
1355     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1356     break;
1357 ph10 231
1358 nigel 93 default:
1359     break;
1360     }
1361     }
1362     break;
1363    
1364     /*-----------------------------------------------------------------*/
1365 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1366     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1367     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1368     count = 2;
1369     goto QS4;
1370    
1371     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1372     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1373     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1374     count = 0;
1375    
1376     QS4:
1377     ADD_ACTIVE(state_offset + 2, 0);
1378     if (clen > 0)
1379     {
1380 ph10 182 BOOL OK;
1381 ph10 178 switch (c)
1382     {
1383     case 0x000a:
1384     case 0x000b:
1385     case 0x000c:
1386     case 0x000d:
1387     case 0x0085:
1388     case 0x2028:
1389     case 0x2029:
1390     OK = TRUE;
1391     break;
1392 ph10 182
1393 ph10 178 default:
1394     OK = FALSE;
1395     break;
1396     }
1397     if (OK == (d == OP_VSPACE))
1398 ph10 182 {
1399 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1400     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1401     {
1402     active_count--; /* Remove non-match possibility */
1403     next_active_state--;
1404     }
1405     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1406     }
1407     }
1408     break;
1409    
1410     /*-----------------------------------------------------------------*/
1411     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1412     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1413     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1414     count = 2;
1415     goto QS5;
1416    
1417     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1418     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1419     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1420     count = 0;
1421    
1422     QS5:
1423     ADD_ACTIVE(state_offset + 2, 0);
1424     if (clen > 0)
1425     {
1426 ph10 182 BOOL OK;
1427 ph10 178 switch (c)
1428     {
1429     case 0x09: /* HT */
1430     case 0x20: /* SPACE */
1431     case 0xa0: /* NBSP */
1432     case 0x1680: /* OGHAM SPACE MARK */
1433     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1434     case 0x2000: /* EN QUAD */
1435     case 0x2001: /* EM QUAD */
1436     case 0x2002: /* EN SPACE */
1437     case 0x2003: /* EM SPACE */
1438     case 0x2004: /* THREE-PER-EM SPACE */
1439     case 0x2005: /* FOUR-PER-EM SPACE */
1440     case 0x2006: /* SIX-PER-EM SPACE */
1441     case 0x2007: /* FIGURE SPACE */
1442     case 0x2008: /* PUNCTUATION SPACE */
1443     case 0x2009: /* THIN SPACE */
1444     case 0x200A: /* HAIR SPACE */
1445     case 0x202f: /* NARROW NO-BREAK SPACE */
1446     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1447     case 0x3000: /* IDEOGRAPHIC SPACE */
1448     OK = TRUE;
1449     break;
1450 ph10 182
1451 ph10 178 default:
1452     OK = FALSE;
1453     break;
1454     }
1455 ph10 182
1456 ph10 178 if (OK == (d == OP_HSPACE))
1457 ph10 182 {
1458 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1459     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1460     {
1461     active_count--; /* Remove non-match possibility */
1462     next_active_state--;
1463     }
1464     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1465     }
1466     }
1467     break;
1468    
1469     /*-----------------------------------------------------------------*/
1470 ph10 151 #ifdef SUPPORT_UCP
1471 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1472     case OP_PROP_EXTRA + OP_TYPEUPTO:
1473     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1474 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1475 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1476 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1477 nigel 77 count = current_state->count; /* Number already matched */
1478     if (clen > 0)
1479     {
1480 nigel 87 BOOL OK;
1481     int category = _pcre_ucp_findprop(c, &chartype, &script);
1482     switch(code[4])
1483 nigel 77 {
1484 nigel 87 case PT_ANY:
1485     OK = TRUE;
1486     break;
1487    
1488     case PT_LAMP:
1489     OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1490     break;
1491    
1492     case PT_GC:
1493     OK = category == code[5];
1494     break;
1495    
1496     case PT_PC:
1497     OK = chartype == code[5];
1498     break;
1499    
1500     case PT_SC:
1501     OK = script == code[5];
1502     break;
1503    
1504     /* Should never occur, but keep compilers from grumbling. */
1505    
1506     default:
1507     OK = codevalue != OP_PROP;
1508     break;
1509     }
1510    
1511     if (OK == (d == OP_PROP))
1512     {
1513 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1514     {
1515     active_count--; /* Remove non-match possibility */
1516     next_active_state--;
1517     }
1518 nigel 77 if (++count >= GET2(code, 1))
1519 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1520 nigel 77 else
1521     { ADD_NEW(state_offset, count); }
1522     }
1523     }
1524     break;
1525    
1526     /*-----------------------------------------------------------------*/
1527     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1528     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1529     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1530 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1531 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1532     { ADD_ACTIVE(state_offset + 4, 0); }
1533     count = current_state->count; /* Number already matched */
1534 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1535 nigel 77 {
1536     const uschar *nptr = ptr + clen;
1537     int ncount = 0;
1538 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1539     {
1540     active_count--; /* Remove non-match possibility */
1541     next_active_state--;
1542     }
1543 nigel 77 while (nptr < end_subject)
1544     {
1545     int nd;
1546     int ndlen = 1;
1547     GETCHARLEN(nd, nptr, ndlen);
1548 nigel 87 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1549 nigel 77 ncount++;
1550     nptr += ndlen;
1551     }
1552     if (++count >= GET2(code, 1))
1553     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1554     else
1555     { ADD_NEW_DATA(-state_offset, count, ncount); }
1556     }
1557     break;
1558 ph10 151 #endif
1559 nigel 77
1560 nigel 93 /*-----------------------------------------------------------------*/
1561     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1562     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1563     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1564     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1565     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1566     { ADD_ACTIVE(state_offset + 4, 0); }
1567     count = current_state->count; /* Number already matched */
1568     if (clen > 0)
1569     {
1570     int ncount = 0;
1571     switch (c)
1572     {
1573     case 0x000b:
1574     case 0x000c:
1575     case 0x0085:
1576     case 0x2028:
1577     case 0x2029:
1578 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1579     goto ANYNL03;
1580    
1581     case 0x000d:
1582     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1583     /* Fall through */
1584    
1585     ANYNL03:
1586     case 0x000a:
1587 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1588     {
1589     active_count--; /* Remove non-match possibility */
1590     next_active_state--;
1591     }
1592     if (++count >= GET2(code, 1))
1593     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1594     else
1595     { ADD_NEW_DATA(-state_offset, count, ncount); }
1596     break;
1597 ph10 231
1598 nigel 93 default:
1599     break;
1600     }
1601     }
1602     break;
1603    
1604 ph10 178 /*-----------------------------------------------------------------*/
1605     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1606     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1607     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1608     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1609     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1610     { ADD_ACTIVE(state_offset + 4, 0); }
1611     count = current_state->count; /* Number already matched */
1612     if (clen > 0)
1613     {
1614 ph10 182 BOOL OK;
1615 ph10 178 switch (c)
1616     {
1617     case 0x000a:
1618     case 0x000b:
1619     case 0x000c:
1620     case 0x000d:
1621     case 0x0085:
1622     case 0x2028:
1623     case 0x2029:
1624     OK = TRUE;
1625     break;
1626 ph10 182
1627 ph10 178 default:
1628     OK = FALSE;
1629     }
1630 ph10 182
1631 ph10 178 if (OK == (d == OP_VSPACE))
1632 ph10 182 {
1633 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1634     {
1635     active_count--; /* Remove non-match possibility */
1636     next_active_state--;
1637     }
1638     if (++count >= GET2(code, 1))
1639     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1640     else
1641     { ADD_NEW_DATA(-state_offset, count, 0); }
1642     }
1643     }
1644     break;
1645    
1646     /*-----------------------------------------------------------------*/
1647     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1648     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1649     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1650     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1651     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1652     { ADD_ACTIVE(state_offset + 4, 0); }
1653     count = current_state->count; /* Number already matched */
1654     if (clen > 0)
1655     {
1656 ph10 182 BOOL OK;
1657 ph10 178 switch (c)
1658     {
1659     case 0x09: /* HT */
1660     case 0x20: /* SPACE */
1661     case 0xa0: /* NBSP */
1662     case 0x1680: /* OGHAM SPACE MARK */
1663     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1664     case 0x2000: /* EN QUAD */
1665     case 0x2001: /* EM QUAD */
1666     case 0x2002: /* EN SPACE */
1667     case 0x2003: /* EM SPACE */
1668     case 0x2004: /* THREE-PER-EM SPACE */
1669     case 0x2005: /* FOUR-PER-EM SPACE */
1670     case 0x2006: /* SIX-PER-EM SPACE */
1671     case 0x2007: /* FIGURE SPACE */
1672     case 0x2008: /* PUNCTUATION SPACE */
1673     case 0x2009: /* THIN SPACE */
1674     case 0x200A: /* HAIR SPACE */
1675     case 0x202f: /* NARROW NO-BREAK SPACE */
1676     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1677     case 0x3000: /* IDEOGRAPHIC SPACE */
1678     OK = TRUE;
1679     break;
1680 ph10 182
1681 ph10 178 default:
1682     OK = FALSE;
1683     break;
1684     }
1685 ph10 182
1686 ph10 178 if (OK == (d == OP_HSPACE))
1687 ph10 182 {
1688 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1689     {
1690     active_count--; /* Remove non-match possibility */
1691     next_active_state--;
1692     }
1693     if (++count >= GET2(code, 1))
1694     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1695     else
1696     { ADD_NEW_DATA(-state_offset, count, 0); }
1697     }
1698     }
1699     break;
1700    
1701 nigel 77 /* ========================================================================== */
1702     /* These opcodes are followed by a character that is usually compared
1703     to the current subject character; it is loaded into d. We still get
1704     here even if there is no subject character, because in some cases zero
1705     repetitions are permitted. */
1706    
1707     /*-----------------------------------------------------------------*/
1708     case OP_CHAR:
1709     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1710     break;
1711    
1712     /*-----------------------------------------------------------------*/
1713     case OP_CHARNC:
1714     if (clen == 0) break;
1715    
1716     #ifdef SUPPORT_UTF8
1717     if (utf8)
1718     {
1719     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1720     {
1721 nigel 93 unsigned int othercase;
1722 nigel 77 if (c < 128) othercase = fcc[c]; else
1723    
1724     /* If we have Unicode property support, we can use it to test the
1725 nigel 87 other case of the character. */
1726 nigel 77
1727     #ifdef SUPPORT_UCP
1728 nigel 87 othercase = _pcre_ucp_othercase(c);
1729     #else
1730 nigel 93 othercase = NOTACHAR;
1731 nigel 77 #endif
1732    
1733     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1734     }
1735     }
1736     else
1737     #endif /* SUPPORT_UTF8 */
1738    
1739     /* Non-UTF-8 mode */
1740     {
1741     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1742     }
1743     break;
1744    
1745    
1746     #ifdef SUPPORT_UCP
1747     /*-----------------------------------------------------------------*/
1748     /* This is a tricky one because it can match more than one character.
1749     Find out how many characters to skip, and then set up a negative state
1750     to wait for them to pass before continuing. */
1751    
1752     case OP_EXTUNI:
1753 nigel 87 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1754 nigel 77 {
1755     const uschar *nptr = ptr + clen;
1756     int ncount = 0;
1757     while (nptr < end_subject)
1758     {
1759     int nclen = 1;
1760     GETCHARLEN(c, nptr, nclen);
1761 nigel 87 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1762 nigel 77 ncount++;
1763     nptr += nclen;
1764     }
1765     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1766     }
1767     break;
1768     #endif
1769    
1770     /*-----------------------------------------------------------------*/
1771 nigel 93 /* This is a tricky one like EXTUNI because it too can match more than one
1772     character (when CR is followed by LF). In this case, set up a negative
1773     state to wait for one character to pass before continuing. */
1774    
1775     case OP_ANYNL:
1776     if (clen > 0) switch(c)
1777     {
1778     case 0x000b:
1779     case 0x000c:
1780     case 0x0085:
1781     case 0x2028:
1782     case 0x2029:
1783 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1784    
1785     case 0x000a:
1786 nigel 93 ADD_NEW(state_offset + 1, 0);
1787     break;
1788 ph10 231
1789 nigel 93 case 0x000d:
1790     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1791     {
1792     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1793     }
1794     else
1795     {
1796     ADD_NEW(state_offset + 1, 0);
1797     }
1798     break;
1799     }
1800     break;
1801    
1802     /*-----------------------------------------------------------------*/
1803 ph10 178 case OP_NOT_VSPACE:
1804     if (clen > 0) switch(c)
1805     {
1806     case 0x000a:
1807     case 0x000b:
1808     case 0x000c:
1809     case 0x000d:
1810     case 0x0085:
1811     case 0x2028:
1812     case 0x2029:
1813     break;
1814 ph10 182
1815     default:
1816 ph10 178 ADD_NEW(state_offset + 1, 0);
1817     break;
1818     }
1819     break;
1820    
1821     /*-----------------------------------------------------------------*/
1822     case OP_VSPACE:
1823     if (clen > 0) switch(c)
1824     {
1825     case 0x000a:
1826     case 0x000b:
1827     case 0x000c:
1828     case 0x000d:
1829     case 0x0085:
1830     case 0x2028:
1831     case 0x2029:
1832     ADD_NEW(state_offset + 1, 0);
1833     break;
1834 ph10 182
1835 ph10 178 default: break;
1836     }
1837     break;
1838    
1839     /*-----------------------------------------------------------------*/
1840     case OP_NOT_HSPACE:
1841     if (clen > 0) switch(c)
1842     {
1843     case 0x09: /* HT */
1844     case 0x20: /* SPACE */
1845     case 0xa0: /* NBSP */
1846     case 0x1680: /* OGHAM SPACE MARK */
1847     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1848     case 0x2000: /* EN QUAD */
1849     case 0x2001: /* EM QUAD */
1850     case 0x2002: /* EN SPACE */
1851     case 0x2003: /* EM SPACE */
1852     case 0x2004: /* THREE-PER-EM SPACE */
1853     case 0x2005: /* FOUR-PER-EM SPACE */
1854     case 0x2006: /* SIX-PER-EM SPACE */
1855     case 0x2007: /* FIGURE SPACE */
1856     case 0x2008: /* PUNCTUATION SPACE */
1857     case 0x2009: /* THIN SPACE */
1858     case 0x200A: /* HAIR SPACE */
1859     case 0x202f: /* NARROW NO-BREAK SPACE */
1860     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1861     case 0x3000: /* IDEOGRAPHIC SPACE */
1862     break;
1863 ph10 182
1864     default:
1865 ph10 178 ADD_NEW(state_offset + 1, 0);
1866     break;
1867     }
1868     break;
1869    
1870     /*-----------------------------------------------------------------*/
1871     case OP_HSPACE:
1872     if (clen > 0) switch(c)
1873     {
1874     case 0x09: /* HT */
1875     case 0x20: /* SPACE */
1876     case 0xa0: /* NBSP */
1877     case 0x1680: /* OGHAM SPACE MARK */
1878     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1879     case 0x2000: /* EN QUAD */
1880     case 0x2001: /* EM QUAD */
1881     case 0x2002: /* EN SPACE */
1882     case 0x2003: /* EM SPACE */
1883     case 0x2004: /* THREE-PER-EM SPACE */
1884     case 0x2005: /* FOUR-PER-EM SPACE */
1885     case 0x2006: /* SIX-PER-EM SPACE */
1886     case 0x2007: /* FIGURE SPACE */
1887     case 0x2008: /* PUNCTUATION SPACE */
1888     case 0x2009: /* THIN SPACE */
1889     case 0x200A: /* HAIR SPACE */
1890     case 0x202f: /* NARROW NO-BREAK SPACE */
1891     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1892     case 0x3000: /* IDEOGRAPHIC SPACE */
1893     ADD_NEW(state_offset + 1, 0);
1894     break;
1895     }
1896     break;
1897    
1898     /*-----------------------------------------------------------------*/
1899 nigel 77 /* Match a negated single character. This is only used for one-byte
1900     characters, that is, we know that d < 256. The character we are
1901     checking (c) can be multibyte. */
1902    
1903     case OP_NOT:
1904     if (clen > 0)
1905     {
1906 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1907 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1908     }
1909     break;
1910    
1911     /*-----------------------------------------------------------------*/
1912     case OP_PLUS:
1913     case OP_MINPLUS:
1914 nigel 93 case OP_POSPLUS:
1915 nigel 77 case OP_NOTPLUS:
1916     case OP_NOTMINPLUS:
1917 nigel 93 case OP_NOTPOSPLUS:
1918 nigel 77 count = current_state->count; /* Already matched */
1919     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1920     if (clen > 0)
1921     {
1922 nigel 93 unsigned int otherd = NOTACHAR;
1923 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1924     {
1925     #ifdef SUPPORT_UTF8
1926 nigel 87 if (utf8 && d >= 128)
1927 nigel 77 {
1928     #ifdef SUPPORT_UCP
1929 nigel 87 otherd = _pcre_ucp_othercase(d);
1930 nigel 77 #endif /* SUPPORT_UCP */
1931     }
1932     else
1933     #endif /* SUPPORT_UTF8 */
1934     otherd = fcc[d];
1935     }
1936     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1937 nigel 93 {
1938     if (count > 0 &&
1939     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1940     {
1941     active_count--; /* Remove non-match possibility */
1942     next_active_state--;
1943     }
1944     count++;
1945     ADD_NEW(state_offset, count);
1946     }
1947 nigel 77 }
1948     break;
1949    
1950     /*-----------------------------------------------------------------*/
1951     case OP_QUERY:
1952     case OP_MINQUERY:
1953 nigel 93 case OP_POSQUERY:
1954 nigel 77 case OP_NOTQUERY:
1955     case OP_NOTMINQUERY:
1956 nigel 93 case OP_NOTPOSQUERY:
1957 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1958     if (clen > 0)
1959     {
1960 nigel 93 unsigned int otherd = NOTACHAR;
1961 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1962 nigel 77 {
1963     #ifdef SUPPORT_UTF8
1964 nigel 87 if (utf8 && d >= 128)
1965 nigel 77 {
1966     #ifdef SUPPORT_UCP
1967 nigel 87 otherd = _pcre_ucp_othercase(d);
1968 nigel 77 #endif /* SUPPORT_UCP */
1969     }
1970     else
1971     #endif /* SUPPORT_UTF8 */
1972     otherd = fcc[d];
1973     }
1974     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1975 nigel 93 {
1976     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1977     {
1978     active_count--; /* Remove non-match possibility */
1979     next_active_state--;
1980     }
1981     ADD_NEW(state_offset + dlen + 1, 0);
1982     }
1983 nigel 77 }
1984     break;
1985    
1986     /*-----------------------------------------------------------------*/
1987     case OP_STAR:
1988     case OP_MINSTAR:
1989 nigel 93 case OP_POSSTAR:
1990 nigel 77 case OP_NOTSTAR:
1991     case OP_NOTMINSTAR:
1992 nigel 93 case OP_NOTPOSSTAR:
1993 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1994     if (clen > 0)
1995     {
1996 nigel 93 unsigned int otherd = NOTACHAR;
1997 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1998 nigel 77 {
1999     #ifdef SUPPORT_UTF8
2000 nigel 87 if (utf8 && d >= 128)
2001 nigel 77 {
2002     #ifdef SUPPORT_UCP
2003 nigel 87 otherd = _pcre_ucp_othercase(d);
2004 nigel 77 #endif /* SUPPORT_UCP */
2005     }
2006     else
2007     #endif /* SUPPORT_UTF8 */
2008     otherd = fcc[d];
2009     }
2010     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2011 nigel 93 {
2012     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2013     {
2014     active_count--; /* Remove non-match possibility */
2015     next_active_state--;
2016     }
2017     ADD_NEW(state_offset, 0);
2018     }
2019 nigel 77 }
2020     break;
2021    
2022     /*-----------------------------------------------------------------*/
2023     case OP_EXACT:
2024 nigel 93 case OP_NOTEXACT:
2025     count = current_state->count; /* Number already matched */
2026     if (clen > 0)
2027     {
2028     unsigned int otherd = NOTACHAR;
2029     if ((ims & PCRE_CASELESS) != 0)
2030     {
2031     #ifdef SUPPORT_UTF8
2032     if (utf8 && d >= 128)
2033     {
2034     #ifdef SUPPORT_UCP
2035     otherd = _pcre_ucp_othercase(d);
2036     #endif /* SUPPORT_UCP */
2037     }
2038     else
2039     #endif /* SUPPORT_UTF8 */
2040     otherd = fcc[d];
2041     }
2042     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2043     {
2044     if (++count >= GET2(code, 1))
2045     { ADD_NEW(state_offset + dlen + 3, 0); }
2046     else
2047     { ADD_NEW(state_offset, count); }
2048     }
2049     }
2050     break;
2051    
2052     /*-----------------------------------------------------------------*/
2053 nigel 77 case OP_UPTO:
2054     case OP_MINUPTO:
2055 nigel 93 case OP_POSUPTO:
2056 nigel 77 case OP_NOTUPTO:
2057     case OP_NOTMINUPTO:
2058 nigel 93 case OP_NOTPOSUPTO:
2059     ADD_ACTIVE(state_offset + dlen + 3, 0);
2060 nigel 77 count = current_state->count; /* Number already matched */
2061     if (clen > 0)
2062     {
2063 nigel 93 unsigned int otherd = NOTACHAR;
2064 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2065     {
2066     #ifdef SUPPORT_UTF8
2067 nigel 87 if (utf8 && d >= 128)
2068 nigel 77 {
2069     #ifdef SUPPORT_UCP
2070 nigel 87 otherd = _pcre_ucp_othercase(d);
2071 nigel 77 #endif /* SUPPORT_UCP */
2072     }
2073     else
2074     #endif /* SUPPORT_UTF8 */
2075     otherd = fcc[d];
2076     }
2077     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2078     {
2079 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2080     {
2081     active_count--; /* Remove non-match possibility */
2082     next_active_state--;
2083     }
2084 nigel 77 if (++count >= GET2(code, 1))
2085     { ADD_NEW(state_offset + dlen + 3, 0); }
2086     else
2087     { ADD_NEW(state_offset, count); }
2088     }
2089     }
2090     break;
2091    
2092    
2093     /* ========================================================================== */
2094     /* These are the class-handling opcodes */
2095    
2096     case OP_CLASS:
2097     case OP_NCLASS:
2098     case OP_XCLASS:
2099     {
2100     BOOL isinclass = FALSE;
2101     int next_state_offset;
2102     const uschar *ecode;
2103    
2104     /* For a simple class, there is always just a 32-byte table, and we
2105     can set isinclass from it. */
2106    
2107     if (codevalue != OP_XCLASS)
2108     {
2109     ecode = code + 33;
2110     if (clen > 0)
2111     {
2112     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2113     ((code[1 + c/8] & (1 << (c&7))) != 0);
2114     }
2115     }
2116    
2117     /* An extended class may have a table or a list of single characters,
2118     ranges, or both, and it may be positive or negative. There's a
2119     function that sorts all this out. */
2120    
2121     else
2122     {
2123     ecode = code + GET(code, 1);
2124     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2125     }
2126    
2127     /* At this point, isinclass is set for all kinds of class, and ecode
2128     points to the byte after the end of the class. If there is a
2129     quantifier, this is where it will be. */
2130    
2131     next_state_offset = ecode - start_code;
2132    
2133     switch (*ecode)
2134     {
2135     case OP_CRSTAR:
2136     case OP_CRMINSTAR:
2137     ADD_ACTIVE(next_state_offset + 1, 0);
2138     if (isinclass) { ADD_NEW(state_offset, 0); }
2139     break;
2140    
2141     case OP_CRPLUS:
2142     case OP_CRMINPLUS:
2143     count = current_state->count; /* Already matched */
2144     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2145     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2146     break;
2147    
2148     case OP_CRQUERY:
2149     case OP_CRMINQUERY:
2150     ADD_ACTIVE(next_state_offset + 1, 0);
2151     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2152     break;
2153    
2154     case OP_CRRANGE:
2155     case OP_CRMINRANGE:
2156     count = current_state->count; /* Already matched */
2157     if (count >= GET2(ecode, 1))
2158     { ADD_ACTIVE(next_state_offset + 5, 0); }
2159     if (isinclass)
2160     {
2161 nigel 91 int max = GET2(ecode, 3);
2162     if (++count >= max && max != 0) /* Max 0 => no limit */
2163 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2164     else
2165     { ADD_NEW(state_offset, count); }
2166     }
2167     break;
2168    
2169     default:
2170     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2171     break;
2172     }
2173     }
2174     break;
2175    
2176     /* ========================================================================== */
2177     /* These are the opcodes for fancy brackets of various kinds. We have
2178 ph10 341 to use recursion in order to handle them. The "always failing" assertion
2179     (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2180     though the other "backtracking verbs" are not supported. */
2181    
2182     case OP_FAIL:
2183     break;
2184 nigel 77
2185     case OP_ASSERT:
2186     case OP_ASSERT_NOT:
2187     case OP_ASSERTBACK:
2188     case OP_ASSERTBACK_NOT:
2189     {
2190     int rc;
2191     int local_offsets[2];
2192     int local_workspace[1000];
2193     const uschar *endasscode = code + GET(code, 1);
2194    
2195     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2196    
2197     rc = internal_dfa_exec(
2198     md, /* static match data */
2199     code, /* this subexpression's code */
2200     ptr, /* where we currently are */
2201     ptr - start_subject, /* start offset */
2202     local_offsets, /* offset vector */
2203     sizeof(local_offsets)/sizeof(int), /* size of same */
2204     local_workspace, /* workspace vector */
2205     sizeof(local_workspace)/sizeof(int), /* size of same */
2206     ims, /* the current ims flags */
2207     rlevel, /* function recursion level */
2208     recursing); /* pass on regex recursion */
2209    
2210     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2211     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2212     }
2213     break;
2214    
2215     /*-----------------------------------------------------------------*/
2216     case OP_COND:
2217 nigel 93 case OP_SCOND:
2218 nigel 77 {
2219     int local_offsets[1000];
2220     int local_workspace[1000];
2221     int condcode = code[LINK_SIZE+1];
2222    
2223 nigel 93 /* Back reference conditions are not supported */
2224 nigel 77
2225 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2226    
2227     /* The DEFINE condition is always false */
2228    
2229     if (condcode == OP_DEF)
2230 nigel 77 {
2231 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2232     }
2233    
2234     /* The only supported version of OP_RREF is for the value RREF_ANY,
2235     which means "test if in any recursion". We can't test for specifically
2236     recursed groups. */
2237    
2238     else if (condcode == OP_RREF)
2239     {
2240 nigel 77 int value = GET2(code, LINK_SIZE+2);
2241 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2242 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2243     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2244     }
2245    
2246     /* Otherwise, the condition is an assertion */
2247    
2248     else
2249     {
2250     int rc;
2251     const uschar *asscode = code + LINK_SIZE + 1;
2252     const uschar *endasscode = asscode + GET(asscode, 1);
2253    
2254     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2255    
2256     rc = internal_dfa_exec(
2257     md, /* fixed match data */
2258     asscode, /* this subexpression's code */
2259     ptr, /* where we currently are */
2260     ptr - start_subject, /* start offset */
2261     local_offsets, /* offset vector */
2262     sizeof(local_offsets)/sizeof(int), /* size of same */
2263     local_workspace, /* workspace vector */
2264     sizeof(local_workspace)/sizeof(int), /* size of same */
2265     ims, /* the current ims flags */
2266     rlevel, /* function recursion level */
2267     recursing); /* pass on regex recursion */
2268    
2269     if ((rc >= 0) ==
2270     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2271     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2272     else
2273     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2274     }
2275     }
2276     break;
2277    
2278     /*-----------------------------------------------------------------*/
2279     case OP_RECURSE:
2280     {
2281     int local_offsets[1000];
2282     int local_workspace[1000];
2283     int rc;
2284    
2285     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2286     recursing + 1));
2287    
2288     rc = internal_dfa_exec(
2289     md, /* fixed match data */
2290     start_code + GET(code, 1), /* this subexpression's code */
2291     ptr, /* where we currently are */
2292     ptr - start_subject, /* start offset */
2293     local_offsets, /* offset vector */
2294     sizeof(local_offsets)/sizeof(int), /* size of same */
2295     local_workspace, /* workspace vector */
2296     sizeof(local_workspace)/sizeof(int), /* size of same */
2297     ims, /* the current ims flags */
2298     rlevel, /* function recursion level */
2299     recursing + 1); /* regex recurse level */
2300    
2301     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2302     recursing + 1, rc));
2303    
2304     /* Ran out of internal offsets */
2305    
2306     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2307    
2308     /* For each successful matched substring, set up the next state with a
2309     count of characters to skip before trying it. Note that the count is in
2310     characters, not bytes. */
2311    
2312     if (rc > 0)
2313     {
2314     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2315     {
2316     const uschar *p = start_subject + local_offsets[rc];
2317     const uschar *pp = start_subject + local_offsets[rc+1];
2318     int charcount = local_offsets[rc+1] - local_offsets[rc];
2319     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2320     if (charcount > 0)
2321     {
2322     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2323     }
2324     else
2325     {
2326     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2327     }
2328     }
2329     }
2330     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2331     }
2332     break;
2333    
2334     /*-----------------------------------------------------------------*/
2335     case OP_ONCE:
2336     {
2337     int local_offsets[2];
2338     int local_workspace[1000];
2339    
2340     int rc = internal_dfa_exec(
2341     md, /* fixed match data */
2342     code, /* this subexpression's code */
2343     ptr, /* where we currently are */
2344     ptr - start_subject, /* start offset */
2345     local_offsets, /* offset vector */
2346     sizeof(local_offsets)/sizeof(int), /* size of same */
2347     local_workspace, /* workspace vector */
2348     sizeof(local_workspace)/sizeof(int), /* size of same */
2349     ims, /* the current ims flags */
2350     rlevel, /* function recursion level */
2351     recursing); /* pass on regex recursion */
2352    
2353     if (rc >= 0)
2354     {
2355     const uschar *end_subpattern = code;
2356     int charcount = local_offsets[1] - local_offsets[0];
2357     int next_state_offset, repeat_state_offset;
2358    
2359     do { end_subpattern += GET(end_subpattern, 1); }
2360     while (*end_subpattern == OP_ALT);
2361     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2362    
2363     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2364     arrange for the repeat state also to be added to the relevant list.
2365     Calculate the offset, or set -1 for no repeat. */
2366    
2367     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2368     *end_subpattern == OP_KETRMIN)?
2369     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2370    
2371     /* If we have matched an empty string, add the next state at the
2372     current character pointer. This is important so that the duplicate
2373     checking kicks in, which is what breaks infinite loops that match an
2374     empty string. */
2375    
2376     if (charcount == 0)
2377     {
2378     ADD_ACTIVE(next_state_offset, 0);
2379     }
2380    
2381     /* Optimization: if there are no more active states, and there
2382     are no new states yet set up, then skip over the subject string
2383     right here, to save looping. Otherwise, set up the new state to swing
2384     into action when the end of the substring is reached. */
2385    
2386     else if (i + 1 >= active_count && new_count == 0)
2387     {
2388     ptr += charcount;
2389     clen = 0;
2390     ADD_NEW(next_state_offset, 0);
2391    
2392     /* If we are adding a repeat state at the new character position,
2393     we must fudge things so that it is the only current state.
2394     Otherwise, it might be a duplicate of one we processed before, and
2395     that would cause it to be skipped. */
2396    
2397     if (repeat_state_offset >= 0)
2398     {
2399     next_active_state = active_states;
2400     active_count = 0;
2401     i = -1;
2402     ADD_ACTIVE(repeat_state_offset, 0);
2403     }
2404     }
2405     else
2406     {
2407     const uschar *p = start_subject + local_offsets[0];
2408     const uschar *pp = start_subject + local_offsets[1];
2409     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2410     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2411     if (repeat_state_offset >= 0)
2412     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2413     }
2414    
2415     }
2416     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2417     }
2418     break;
2419    
2420    
2421     /* ========================================================================== */
2422     /* Handle callouts */
2423    
2424     case OP_CALLOUT:
2425     if (pcre_callout != NULL)
2426     {
2427     int rrc;
2428     pcre_callout_block cb;
2429     cb.version = 1; /* Version 1 of the callout block */
2430     cb.callout_number = code[1];
2431     cb.offset_vector = offsets;
2432 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2433 nigel 77 cb.subject_length = end_subject - start_subject;
2434     cb.start_match = current_subject - start_subject;
2435     cb.current_position = ptr - start_subject;
2436     cb.pattern_position = GET(code, 2);
2437     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2438     cb.capture_top = 1;
2439     cb.capture_last = -1;
2440     cb.callout_data = md->callout_data;
2441     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2442     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2443     }
2444     break;
2445    
2446    
2447     /* ========================================================================== */
2448     default: /* Unsupported opcode */
2449     return PCRE_ERROR_DFA_UITEM;
2450     }
2451    
2452     NEXT_ACTIVE_STATE: continue;
2453    
2454     } /* End of loop scanning active states */
2455    
2456     /* We have finished the processing at the current subject character. If no
2457     new states have been set for the next character, we have found all the
2458     matches that we are going to find. If we are at the top level and partial
2459     matching has been requested, check for appropriate conditions. */
2460    
2461     if (new_count <= 0)
2462     {
2463     if (match_count < 0 && /* No matches found */
2464     rlevel == 1 && /* Top level match function */
2465     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2466     ptr >= end_subject && /* Reached end of subject */
2467     ptr > current_subject) /* Matched non-empty string */
2468     {
2469     if (offsetcount >= 2)
2470     {
2471     offsets[0] = current_subject - start_subject;
2472     offsets[1] = end_subject - start_subject;
2473     }
2474     match_count = PCRE_ERROR_PARTIAL;
2475     }
2476    
2477     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2478     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2479     rlevel*2-2, SP));
2480 nigel 91 break; /* In effect, "return", but see the comment below */
2481 nigel 77 }
2482    
2483     /* One or more states are active for the next character. */
2484    
2485     ptr += clen; /* Advance to next subject character */
2486     } /* Loop to move along the subject string */
2487    
2488 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2489     if we use "return" above, we have compiler trouble. Some compilers warn if
2490     there's nothing here because they think the function doesn't return a value. On
2491     the other hand, if we put a dummy statement here, some more clever compilers
2492     complain that it can't be reached. Sigh. */
2493 nigel 77
2494 nigel 91 return match_count;
2495 nigel 77 }
2496    
2497    
2498    
2499    
2500     /*************************************************
2501     * Execute a Regular Expression - DFA engine *
2502     *************************************************/
2503    
2504     /* This external function applies a compiled re to a subject string using a DFA
2505     engine. This function calls the internal function multiple times if the pattern
2506     is not anchored.
2507    
2508     Arguments:
2509     argument_re points to the compiled expression
2510 ph10 97 extra_data points to extra data or is NULL
2511 nigel 77 subject points to the subject string
2512     length length of subject string (may contain binary zeros)
2513     start_offset where to start in the subject string
2514     options option bits
2515     offsets vector of match offsets
2516     offsetcount size of same
2517     workspace workspace vector
2518     wscount size of same
2519    
2520     Returns: > 0 => number of match offset pairs placed in offsets
2521     = 0 => offsets overflowed; longest matches are present
2522     -1 => failed to match
2523     < -1 => some kind of unexpected problem
2524     */
2525    
2526 ph10 145 PCRE_EXP_DEFN int
2527 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2528     const char *subject, int length, int start_offset, int options, int *offsets,
2529     int offsetcount, int *workspace, int wscount)
2530     {
2531     real_pcre *re = (real_pcre *)argument_re;
2532     dfa_match_data match_block;
2533 nigel 91 dfa_match_data *md = &match_block;
2534 nigel 77 BOOL utf8, anchored, startline, firstline;
2535     const uschar *current_subject, *end_subject, *lcc;
2536    
2537     pcre_study_data internal_study;
2538     const pcre_study_data *study = NULL;
2539     real_pcre internal_re;
2540    
2541     const uschar *req_byte_ptr;
2542     const uschar *start_bits = NULL;
2543     BOOL first_byte_caseless = FALSE;
2544     BOOL req_byte_caseless = FALSE;
2545     int first_byte = -1;
2546     int req_byte = -1;
2547     int req_byte2 = -1;
2548 nigel 91 int newline;
2549 nigel 77
2550     /* Plausibility checks */
2551    
2552     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2553     if (re == NULL || subject == NULL || workspace == NULL ||
2554     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2555     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2556     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2557    
2558     /* We need to find the pointer to any study data before we test for byte
2559     flipping, so we scan the extra_data block first. This may set two fields in the
2560     match block, so we must initialize them beforehand. However, the other fields
2561     in the match block must not be set until after the byte flipping. */
2562    
2563 nigel 91 md->tables = re->tables;
2564     md->callout_data = NULL;
2565 nigel 77
2566     if (extra_data != NULL)
2567     {
2568     unsigned int flags = extra_data->flags;
2569     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2570     study = (const pcre_study_data *)extra_data->study_data;
2571     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2572 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2573     return PCRE_ERROR_DFA_UMLIMIT;
2574 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2575 nigel 91 md->callout_data = extra_data->callout_data;
2576 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2577 nigel 91 md->tables = extra_data->tables;
2578 nigel 77 }
2579    
2580     /* Check that the first field in the block is the magic number. If it is not,
2581     test for a regex that was compiled on a host of opposite endianness. If this is
2582     the case, flipped values are put in internal_re and internal_study if there was
2583     study data too. */
2584    
2585     if (re->magic_number != MAGIC_NUMBER)
2586     {
2587     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2588     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2589     if (study != NULL) study = &internal_study;
2590     }
2591    
2592     /* Set some local values */
2593    
2594     current_subject = (const unsigned char *)subject + start_offset;
2595     end_subject = (const unsigned char *)subject + length;
2596     req_byte_ptr = current_subject - 1;
2597    
2598 nigel 91 #ifdef SUPPORT_UTF8
2599 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2600 nigel 91 #else
2601     utf8 = FALSE;
2602     #endif
2603 nigel 77
2604 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2605     (re->options & PCRE_ANCHORED) != 0;
2606    
2607 nigel 77 /* The remaining fixed data for passing around. */
2608    
2609 nigel 91 md->start_code = (const uschar *)argument_re +
2610 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2611 nigel 91 md->start_subject = (const unsigned char *)subject;
2612     md->end_subject = end_subject;
2613     md->moptions = options;
2614     md->poptions = re->options;
2615 nigel 77
2616 ph10 231 /* If the BSR option is not set at match time, copy what was set
2617     at compile time. */
2618    
2619     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2620     {
2621     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2622     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2623     #ifdef BSR_ANYCRLF
2624     else md->moptions |= PCRE_BSR_ANYCRLF;
2625 ph10 243 #endif
2626     }
2627 ph10 231
2628 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2629     nothing is set at run time, whatever was used at compile time applies. */
2630 nigel 91
2631 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2632 nigel 93 PCRE_NEWLINE_BITS)
2633 nigel 91 {
2634 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2635 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2636     case PCRE_NEWLINE_LF: newline = '\n'; break;
2637     case PCRE_NEWLINE_CR+
2638     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2639 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2640 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2641 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2642 nigel 91 }
2643    
2644 ph10 149 if (newline == -2)
2645 nigel 91 {
2646 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2647     }
2648     else if (newline < 0)
2649     {
2650 nigel 93 md->nltype = NLTYPE_ANY;
2651 nigel 91 }
2652     else
2653     {
2654 nigel 93 md->nltype = NLTYPE_FIXED;
2655     if (newline > 255)
2656     {
2657     md->nllen = 2;
2658     md->nl[0] = (newline >> 8) & 255;
2659     md->nl[1] = newline & 255;
2660     }
2661     else
2662     {
2663     md->nllen = 1;
2664     md->nl[0] = newline;
2665     }
2666 nigel 91 }
2667    
2668 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2669     back the character offset. */
2670    
2671     #ifdef SUPPORT_UTF8
2672     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2673     {
2674     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2675     return PCRE_ERROR_BADUTF8;
2676     if (start_offset > 0 && start_offset < length)
2677     {
2678     int tb = ((uschar *)subject)[start_offset];
2679     if (tb > 127)
2680     {
2681     tb &= 0xc0;
2682     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2683     }
2684     }
2685     }
2686     #endif
2687    
2688     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2689     is a feature that makes it possible to save compiled regex and re-use them
2690     in other programs later. */
2691    
2692 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2693 nigel 77
2694     /* The lower casing table and the "must be at the start of a line" flag are
2695     used in a loop when finding where to start. */
2696    
2697 nigel 91 lcc = md->tables + lcc_offset;
2698 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2699 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2700    
2701     /* Set up the first character to match, if available. The first_byte value is
2702     never set for an anchored regular expression, but the anchoring may be forced
2703     at run time, so we have to test for anchoring. The first char may be unset for
2704     an unanchored pattern, of course. If there's no first char and the pattern was
2705     studied, there may be a bitmap of possible first characters. */
2706    
2707     if (!anchored)
2708     {
2709 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2710 nigel 77 {
2711     first_byte = re->first_byte & 255;
2712     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2713     first_byte = lcc[first_byte];
2714     }
2715     else
2716     {
2717     if (startline && study != NULL &&
2718     (study->options & PCRE_STUDY_MAPPED) != 0)
2719     start_bits = study->start_bits;
2720     }
2721     }
2722    
2723     /* For anchored or unanchored matches, there may be a "last known required
2724     character" set. */
2725    
2726 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2727 nigel 77 {
2728     req_byte = re->req_byte & 255;
2729     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2730 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2731 nigel 77 }
2732    
2733     /* Call the main matching function, looping for a non-anchored regex after a
2734     failed match. Unless restarting, optimize by moving to the first match
2735     character if possible, when not anchored. Then unless wanting a partial match,
2736     check for a required later character. */
2737    
2738     for (;;)
2739     {
2740     int rc;
2741    
2742     if ((options & PCRE_DFA_RESTART) == 0)
2743     {
2744     const uschar *save_end_subject = end_subject;
2745    
2746     /* Advance to a unique first char if possible. If firstline is TRUE, the
2747     start of the match is constrained to the first line of a multiline string.
2748 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2749     scanning at a newline. If the match fails at the newline, later code breaks
2750     this loop. */
2751 nigel 77
2752     if (firstline)
2753     {
2754     const uschar *t = current_subject;
2755 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2756 nigel 77 end_subject = t;
2757     }
2758    
2759     if (first_byte >= 0)
2760     {
2761     if (first_byte_caseless)
2762     while (current_subject < end_subject &&
2763     lcc[*current_subject] != first_byte)
2764     current_subject++;
2765     else
2766     while (current_subject < end_subject && *current_subject != first_byte)
2767     current_subject++;
2768     }
2769    
2770 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2771 nigel 77
2772     else if (startline)
2773     {
2774 nigel 93 if (current_subject > md->start_subject + start_offset)
2775 nigel 77 {
2776 nigel 93 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2777 nigel 77 current_subject++;
2778 ph10 130
2779 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2780     ANYCRLF, and we are now at a LF, advance the match position by one more
2781     character. */
2782 ph10 134
2783 ph10 130 if (current_subject[-1] == '\r' &&
2784 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2785 ph10 130 current_subject < end_subject &&
2786     *current_subject == '\n')
2787     current_subject++;
2788 nigel 77 }
2789     }
2790    
2791     /* Or to a non-unique first char after study */
2792    
2793     else if (start_bits != NULL)
2794     {
2795     while (current_subject < end_subject)
2796     {
2797     register unsigned int c = *current_subject;
2798     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2799     else break;
2800     }
2801     }
2802    
2803     /* Restore fudged end_subject */
2804    
2805     end_subject = save_end_subject;
2806     }
2807    
2808     /* If req_byte is set, we know that that character must appear in the subject
2809     for the match to succeed. If the first character is set, req_byte must be
2810     later in the subject; otherwise the test starts at the match point. This
2811     optimization can save a huge amount of work in patterns with nested unlimited
2812     repeats that aren't going to match. Writing separate code for cased/caseless
2813     versions makes it go faster, as does using an autoincrement and backing off
2814     on a match.
2815    
2816     HOWEVER: when the subject string is very, very long, searching to its end can
2817     take a long time, and give bad performance on quite ordinary patterns. This
2818     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2819     don't do this when the string is sufficiently long.
2820    
2821     ALSO: this processing is disabled when partial matching is requested.
2822     */
2823    
2824     if (req_byte >= 0 &&
2825     end_subject - current_subject < REQ_BYTE_MAX &&
2826     (options & PCRE_PARTIAL) == 0)
2827     {
2828     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2829    
2830     /* We don't need to repeat the search if we haven't yet reached the
2831     place we found it at last time. */
2832    
2833     if (p > req_byte_ptr)
2834     {
2835     if (req_byte_caseless)
2836     {
2837     while (p < end_subject)
2838     {
2839     register int pp = *p++;
2840     if (pp == req_byte || pp == req_byte2) { p--; break; }
2841     }
2842     }
2843     else
2844     {
2845     while (p < end_subject)
2846     {
2847     if (*p++ == req_byte) { p--; break; }
2848     }
2849     }
2850    
2851     /* If we can't find the required character, break the matching loop,
2852     which will cause a return of PCRE_ERROR_NOMATCH. */
2853    
2854     if (p >= end_subject) break;
2855    
2856     /* If we have found the required character, save the point where we
2857     found it, so that we don't search again next time round the loop if
2858     the start hasn't passed this character yet. */
2859    
2860     req_byte_ptr = p;
2861     }
2862     }
2863    
2864     /* OK, now we can do the business */
2865    
2866     rc = internal_dfa_exec(
2867 nigel 91 md, /* fixed match data */
2868     md->start_code, /* this subexpression's code */
2869     current_subject, /* where we currently are */
2870     start_offset, /* start offset in subject */
2871     offsets, /* offset vector */
2872     offsetcount, /* size of same */
2873     workspace, /* workspace vector */
2874     wscount, /* size of same */
2875 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2876 nigel 91 0, /* function recurse level */
2877     0); /* regex recurse level */
2878 nigel 77
2879     /* Anything other than "no match" means we are done, always; otherwise, carry
2880     on only if not anchored. */
2881    
2882     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2883    
2884     /* Advance to the next subject character unless we are at the end of a line
2885     and firstline is set. */
2886    
2887 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2888 nigel 77 current_subject++;
2889     if (utf8)
2890     {
2891     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2892     current_subject++;
2893     }
2894     if (current_subject > end_subject) break;
2895    
2896 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2897 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2898     or ANY or ANYCRLF, advance the match position by one more character. */
2899 nigel 93
2900     if (current_subject[-1] == '\r' &&
2901 ph10 226 current_subject < end_subject &&
2902     *current_subject == '\n' &&
2903 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2904 ph10 226 (md->nltype == NLTYPE_ANY ||
2905     md->nltype == NLTYPE_ANYCRLF ||
2906     md->nllen == 2))
2907 nigel 93 current_subject++;
2908    
2909     } /* "Bumpalong" loop */
2910    
2911 nigel 77 return PCRE_ERROR_NOMATCH;
2912     }
2913    
2914     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12