/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 406 - (hide annotations) (download)
Mon Mar 23 12:05:43 2009 UTC (5 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 97910 byte(s)
Trailing space tidies

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 383 Copyright (c) 1997-2009 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 200 #ifdef HAVE_CONFIG_H
49 ph10 236 #include "config.h"
50 ph10 200 #endif
51 ph10 199
52 nigel 93 #define NLBLOCK md /* Block containing newline information */
53     #define PSSTART start_subject /* Field containing processed string start */
54     #define PSEND end_subject /* Field containing processed string end */
55    
56 nigel 77 #include "pcre_internal.h"
57    
58    
59     /* For use to indent debugging output */
60    
/* Source of indentation for debugging output. It is always printed with
"%.*s" and a precision of rlevel*2-2, so only as many spaces as the string
actually holds can ever appear: it has to be a run of spaces, not a single
one, for nested calls to be indented visibly. */

#define SP "                    "
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
68     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
70 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
71 ph10 178 never stored, so we push them well clear of the normal opcodes. */
72 nigel 77
/* Offsets added to a type-repeat opcode (OP_TYPESTAR and above) when its
argument needs special handling. The blocks are 20 apart and start well above
the normal opcode values. */

#define OP_PROP_EXTRA     300   /* argument is OP_PROP or OP_NOTPROP */
#define OP_EXTUNI_EXTRA   320   /* argument is OP_EXTUNI */
#define OP_ANYNL_EXTRA    340   /* argument is OP_ANYNL */
#define OP_HSPACE_EXTRA   360   /* argument is OP_HSPACE or OP_NOT_HSPACE */
#define OP_VSPACE_EXTRA   380   /* argument is OP_VSPACE or OP_NOT_VSPACE */
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes it possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified. */
86 nigel 77
87 ph10 327 static const uschar coptable[] = {
88 nigel 77 0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
134     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136 nigel 77 };
137    
138     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139     and \w */
140    
141 ph10 327 static const uschar toptable1[] = {
142 ph10 168 0, 0, 0, 0, 0, 0,
143 nigel 77 ctype_digit, ctype_digit,
144     ctype_space, ctype_space,
145     ctype_word, ctype_word,
146 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
147 nigel 77 };
148    
149 ph10 327 static const uschar toptable2[] = {
150 ph10 168 0, 0, 0, 0, 0, 0,
151 nigel 77 ctype_digit, 0,
152     ctype_space, 0,
153     ctype_word, 0,
154 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
155 nigel 77 };
156    
157    
158     /* Structure for holding data about a particular state, which is in effect the
159     current data for an active path through the match tree. It must consist
160     entirely of ints because the working vector we are passed, and which we put
161     these structures in, is a vector of ints. */
162    
/* One entry in a state list. These blocks are overlaid on a vector of ints,
which is why every member is an int. */

typedef struct stateblock stateblock;

struct stateblock {
  int offset;     /* Offset to the opcode; held negated while the state is
                     waiting for characters to be skipped */
  int count;      /* Count for repeats */
  int ims;        /* ims flag bits in force for this state */
  int data;       /* Extra data used by some states; for a waiting state, the
                     number of characters still to skip */
};

/* Number of ints taken up by one stateblock */

#define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
171    
172    
173     #ifdef DEBUG
174     /*************************************************
175     * Print character string *
176     *************************************************/
177    
178     /* Character string printing function for debugging.
179    
180     Arguments:
181     p points to string
182     length number of bytes
183     f where to print
184    
185     Returns: nothing
186     */
187    
/* Write length bytes starting at p to f. A byte for which isprint() is true
is written as it stands; any other byte is written as a backslash, 'x' and two
lower-case hex digits. A length of zero or less writes nothing. The bytes are
only read, so the pointer is const-qualified and a caller that holds a const
pointer does not need to cast the qualifier away. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;   /* always 0..255, so it is a valid isprint() argument */
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
200     #endif
201    
202    
203    
204     /*************************************************
205     * Execute a Regular Expression - DFA engine *
206     *************************************************/
207    
208     /* This internal function applies a compiled pattern to a subject string,
209     starting at a given point, using a DFA engine. This function is called from the
210     external one, possibly multiple times if the pattern is not anchored. The
211     function calls itself recursively for some kinds of subpattern.
212    
213     Arguments:
214     md the match_data block with fixed information
215     this_start_code the opening bracket of this subexpression's code
216     current_subject where we currently are in the subject string
217     start_offset start offset in the subject string
218     offsets vector to contain the matching string offsets
219     offsetcount size of same
220     workspace vector of workspace
221     wscount size of same
222     ims the current ims flags
223     rlevel function call recursion level
224     recursing regex recursive call level
225    
226 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
227 ph10 341 = 0 => offsets overflowed; longest matches are present
228 nigel 77 -1 => failed to match
229     < -1 => some kind of unexpected problem
230    
231     The following macros are used for adding states to the two state vectors (one
232     for the current character, one for the following character). */
233    
/* State-adding macros. Each one appends a single stateblock to the active
list (ADD_ACTIVE*) or to the new list (ADD_NEW*). They rely on names from the
function that uses them: the list counter (active_count or new_count), the
write pointer (next_active_state or next_new_state), the per-list capacity
wscount, the current ims value, and rlevel for debug indentation.

  x   offset to store in the new state
  y   repeat count to store in the new state
  z   value for the data field (the _DATA variants only; the plain variants
      leave the data field unwritten)

If the list counter has already reached wscount, nothing is stored and the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. The counter is
post-incremented in the test, so it is bumped on that path as well.

Each macro is wrapped in do { ... } while (0) so that it behaves as exactly one
statement wherever it is used, including as the body of an unbraced if/else. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
279    
280     /* And now, here is the code */
281    
282     static int
283     internal_dfa_exec(
284     dfa_match_data *md,
285     const uschar *this_start_code,
286     const uschar *current_subject,
287     int start_offset,
288     int *offsets,
289     int offsetcount,
290     int *workspace,
291     int wscount,
292     int ims,
293     int rlevel,
294     int recursing)
295     {
296     stateblock *active_states, *new_states, *temp_states;
297     stateblock *next_active_state, *next_new_state;
298    
299     const uschar *ctypes, *lcc, *fcc;
300     const uschar *ptr;
301 nigel 93 const uschar *end_code, *first_op;
302 nigel 77
303     int active_count, new_count, match_count;
304    
305     /* Some fields in the md block are frequently referenced, so we load them into
306     independent variables in the hope that this will perform better. */
307    
308     const uschar *start_subject = md->start_subject;
309     const uschar *end_subject = md->end_subject;
310     const uschar *start_code = md->start_code;
311    
312 nigel 87 #ifdef SUPPORT_UTF8
313 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 nigel 93 #else
315     BOOL utf8 = FALSE;
316 nigel 87 #endif
317 nigel 77
318     rlevel++;
319     offsetcount &= (-2);
320    
321     wscount -= 2;
322     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323     (2 * INTS_PER_STATEBLOCK);
324    
325     DPRINTF(("\n%.*s---------------------\n"
326     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328    
329     ctypes = md->tables + ctypes_offset;
330     lcc = md->tables + lcc_offset;
331     fcc = md->tables + fcc_offset;
332    
333     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334    
335     active_states = (stateblock *)(workspace + 2);
336     next_new_state = new_states = active_states + wscount;
337     new_count = 0;
338    
339 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
340     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343     the alternative states onto the list, and find out where the end is. This
344     makes it possible to use this function recursively, when we want to stop at a
345     matching internal ket rather than at the end.
346    
347     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348     a backward assertion. In that case, we have to find out the maximum amount to
349     move back, and set up each alternative appropriately. */
350    
351 nigel 93 if (*first_op == OP_REVERSE)
352 nigel 77 {
353     int max_back = 0;
354     int gone_back;
355    
356     end_code = this_start_code;
357     do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363     while (*end_code == OP_ALT);
364    
365     /* If we can't go back the amount required for the longest lookbehind
366     pattern, go back as far as we can; some alternatives may still be viable. */
367    
368     #ifdef SUPPORT_UTF8
369     /* In character mode we have to step back character by character */
370    
371     if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374     {
375     if (current_subject <= start_subject) break;
376     current_subject--;
377     while (current_subject > start_subject &&
378     (*current_subject & 0xc0) == 0x80)
379     current_subject--;
380     }
381     }
382     else
383     #endif
384    
385     /* In byte-mode we can do this quickly. */
386    
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389     current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392    
393     /* Now we can process the individual branches. */
394    
395     end_code = this_start_code;
396     do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400     {
401     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402     ADD_NEW_DATA(-bstate, 0, gone_back - back);
403     }
404     end_code += GET(end_code, 1);
405     }
406     while (*end_code == OP_ALT);
407     }
408    
409     /* This is the code for a "normal" subpattern (not a backward assertion). The
410     start of a whole pattern is always one of these. If we are at the top level,
411     we may be asked to restart matching from the same point that we reached for a
412     previous partial match. We still have to scan through the top-level branches to
413     find the end state. */
414    
415     else
416     {
417     end_code = this_start_code;
418    
419     /* Restarting */
420    
421     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426     memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428    
429     /* Not restarting */
430    
431     else
432     {
433 nigel 93 int length = 1 + LINK_SIZE +
434     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 nigel 77 do
436     {
437 nigel 93 ADD_NEW(end_code - start_code + length, 0);
438 nigel 77 end_code += GET(end_code, 1);
439 nigel 93 length = 1 + LINK_SIZE;
440 nigel 77 }
441     while (*end_code == OP_ALT);
442     }
443     }
444    
445     workspace[0] = 0; /* Bit indicating which vector is current */
446    
447     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448    
449     /* Loop for scanning the subject */
450    
451     ptr = current_subject;
452     for (;;)
453     {
454     int i, j;
455 nigel 91 int clen, dlen;
456     unsigned int c, d;
457 nigel 77
458     /* Make the new state list into the active state list and empty the
459     new state list. */
460    
461     temp_states = active_states;
462     active_states = new_states;
463     new_states = temp_states;
464     active_count = new_count;
465     new_count = 0;
466    
467     workspace[0] ^= 1; /* Remember for the restarting feature */
468     workspace[1] = active_count;
469    
470     #ifdef DEBUG
471     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473     printf("\"\n");
474    
475     printf("%.*sActive states: ", rlevel*2-2, SP);
476     for (i = 0; i < active_count; i++)
477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
478     printf("\n");
479     #endif
480    
481     /* Set the pointers for adding new states */
482    
483     next_active_state = active_states + active_count;
484     next_new_state = new_states;
485    
486     /* Load the current character from the subject outside the loop, as many
487     different states may want to look at it, and we assume that at least one
488     will. */
489    
490     if (ptr < end_subject)
491     {
492 nigel 93 clen = 1; /* Number of bytes in the character */
493 nigel 77 #ifdef SUPPORT_UTF8
494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
495     #endif /* SUPPORT_UTF8 */
496     c = *ptr;
497     }
498     else
499     {
500 nigel 93 clen = 0; /* This indicates the end of the subject */
501     c = NOTACHAR; /* This value should never actually be used */
502 nigel 77 }
503    
504     /* Scan up the active states and act on each one. The result of an action
505     may be to add more states to the currently active list (e.g. on hitting a
506     parenthesis) or it may be to put states on the new list, for considering
507     when we move the character pointer on. */
508    
509     for (i = 0; i < active_count; i++)
510     {
511     stateblock *current_state = active_states + i;
512     const uschar *code;
513     int state_offset = current_state->offset;
514 ph10 397 int count, codevalue, rrc;
515 nigel 77
516     #ifdef DEBUG
517     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
518 nigel 93 if (clen == 0) printf("EOL\n");
519 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
520     else printf("0x%02x\n", c);
521     #endif
522    
523     /* This variable is referred to implicitly in the ADD_xxx macros. */
524    
525     ims = current_state->ims;
526    
527     /* A negative offset is a special case meaning "hold off going to this
528     (negated) state until the number of characters in the data field have
529     been skipped". */
530    
531     if (state_offset < 0)
532     {
533     if (current_state->data > 0)
534     {
535     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
536     ADD_NEW_DATA(state_offset, current_state->count,
537     current_state->data - 1);
538     continue;
539     }
540     else
541     {
542     current_state->offset = state_offset = -state_offset;
543     }
544     }
545    
546     /* Check for a duplicate state with the same count, and skip if found. */
547    
548     for (j = 0; j < i; j++)
549     {
550     if (active_states[j].offset == state_offset &&
551     active_states[j].count == current_state->count)
552     {
553     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
554     goto NEXT_ACTIVE_STATE;
555     }
556     }
557    
558     /* The state offset is the offset to the opcode */
559    
560     code = start_code + state_offset;
561     codevalue = *code;
562    
563     /* If this opcode is followed by an inline character, load it. It is
564     tempting to test for the presence of a subject character here, but that
565     is wrong, because sometimes zero repetitions of the subject are
566     permitted.
567    
568     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
569 ph10 178 argument that is not a data character - but is always one byte long. We
570     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
571     this case. To keep the other cases fast, convert these ones to new opcodes.
572     */
573 nigel 77
574     if (coptable[codevalue] > 0)
575     {
576     dlen = 1;
577     #ifdef SUPPORT_UTF8
578     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
579     #endif /* SUPPORT_UTF8 */
580     d = code[coptable[codevalue]];
581     if (codevalue >= OP_TYPESTAR)
582     {
583 nigel 93 switch(d)
584     {
585     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
586     case OP_NOTPROP:
587     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
588     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
589     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
590 ph10 178 case OP_NOT_HSPACE:
591 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
592 ph10 178 case OP_NOT_VSPACE:
593 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
594 nigel 93 default: break;
595     }
596 nigel 77 }
597     }
598     else
599     {
600     dlen = 0; /* Not strictly necessary, but compilers moan */
601 nigel 93 d = NOTACHAR; /* if these variables are not set. */
602 nigel 77 }
603    
604    
605     /* Now process the individual opcodes */
606    
607     switch (codevalue)
608     {
609    
610     /* ========================================================================== */
611     /* Reached a closing bracket. If not at the end of the pattern, carry
612     on with the next opcode. Otherwise, unless we have an empty string and
613     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
614     matches so we always have the longest first. */
615    
616     case OP_KET:
617     case OP_KETRMIN:
618     case OP_KETRMAX:
619     if (code != end_code)
620     {
621     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
622     if (codevalue != OP_KET)
623     {
624     ADD_ACTIVE(state_offset - GET(code, 1), 0);
625     }
626     }
627     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
628     {
629     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
630     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
631     match_count = 0;
632     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
633     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
634     if (offsetcount >= 2)
635     {
636     offsets[0] = current_subject - start_subject;
637     offsets[1] = ptr - start_subject;
638     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
639     offsets[1] - offsets[0], current_subject));
640     }
641     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
642     {
643     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
644     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
645     match_count, rlevel*2-2, SP));
646     return match_count;
647     }
648     }
649     break;
650    
651     /* ========================================================================== */
652     /* These opcodes add to the current list of states without looking
653     at the current character. */
654    
655     /*-----------------------------------------------------------------*/
656     case OP_ALT:
657     do { code += GET(code, 1); } while (*code == OP_ALT);
658     ADD_ACTIVE(code - start_code, 0);
659     break;
660    
661     /*-----------------------------------------------------------------*/
662     case OP_BRA:
663 nigel 93 case OP_SBRA:
664 nigel 77 do
665     {
666     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
667     code += GET(code, 1);
668     }
669     while (*code == OP_ALT);
670     break;
671    
672     /*-----------------------------------------------------------------*/
673 nigel 93 case OP_CBRA:
674     case OP_SCBRA:
675     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
676     code += GET(code, 1);
677     while (*code == OP_ALT)
678     {
679     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
680     code += GET(code, 1);
681     }
682     break;
683    
684     /*-----------------------------------------------------------------*/
685 nigel 77 case OP_BRAZERO:
686     case OP_BRAMINZERO:
687     ADD_ACTIVE(state_offset + 1, 0);
688     code += 1 + GET(code, 2);
689     while (*code == OP_ALT) code += GET(code, 1);
690     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
691     break;
692    
693     /*-----------------------------------------------------------------*/
694 ph10 335 case OP_SKIPZERO:
695     code += 1 + GET(code, 2);
696     while (*code == OP_ALT) code += GET(code, 1);
697     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
698     break;
699    
700     /*-----------------------------------------------------------------*/
701 nigel 77 case OP_CIRC:
702     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
703 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
704     ptr != end_subject &&
705 nigel 93 WAS_NEWLINE(ptr)))
706 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
707     break;
708    
709     /*-----------------------------------------------------------------*/
710     case OP_EOD:
711     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
712     break;
713    
714     /*-----------------------------------------------------------------*/
715     case OP_OPT:
716     ims = code[1];
717     ADD_ACTIVE(state_offset + 2, 0);
718     break;
719    
720     /*-----------------------------------------------------------------*/
721     case OP_SOD:
722     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
723     break;
724    
725     /*-----------------------------------------------------------------*/
726     case OP_SOM:
727     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
728     break;
729    
730    
731     /* ========================================================================== */
732     /* These opcodes inspect the next subject character, and sometimes
733     the previous one as well, but do not have an argument. The variable
734     clen contains the length of the current character and is zero if we are
735     at the end of the subject. */
736    
737     /*-----------------------------------------------------------------*/
738     case OP_ANY:
739 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
740 nigel 77 { ADD_NEW(state_offset + 1, 0); }
741     break;
742    
743     /*-----------------------------------------------------------------*/
744 ph10 341 case OP_ALLANY:
745     if (clen > 0)
746     { ADD_NEW(state_offset + 1, 0); }
747     break;
748    
749     /*-----------------------------------------------------------------*/
750 nigel 77 case OP_EODN:
751 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
752 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
753     break;
754    
755     /*-----------------------------------------------------------------*/
756     case OP_DOLL:
757     if ((md->moptions & PCRE_NOTEOL) == 0)
758     {
759 nigel 91 if (clen == 0 ||
760 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
761 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
762     ))
763 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
764     }
765 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
766 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
767     break;
768    
769     /*-----------------------------------------------------------------*/
770    
771     case OP_DIGIT:
772     case OP_WHITESPACE:
773     case OP_WORDCHAR:
774     if (clen > 0 && c < 256 &&
775     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
776     { ADD_NEW(state_offset + 1, 0); }
777     break;
778    
779     /*-----------------------------------------------------------------*/
780     case OP_NOT_DIGIT:
781     case OP_NOT_WHITESPACE:
782     case OP_NOT_WORDCHAR:
783     if (clen > 0 && (c >= 256 ||
784     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
785     { ADD_NEW(state_offset + 1, 0); }
786     break;
787    
788     /*-----------------------------------------------------------------*/
789     case OP_WORD_BOUNDARY:
790     case OP_NOT_WORD_BOUNDARY:
791     {
792     int left_word, right_word;
793    
794     if (ptr > start_subject)
795     {
796     const uschar *temp = ptr - 1;
797     #ifdef SUPPORT_UTF8
798     if (utf8) BACKCHAR(temp);
799     #endif
800     GETCHARTEST(d, temp);
801     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
802     }
803     else left_word = 0;
804    
805     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
806     else right_word = 0;
807    
808     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
809     { ADD_ACTIVE(state_offset + 1, 0); }
810     }
811     break;
812    
813    
814     /*-----------------------------------------------------------------*/
815     /* Check the next character by Unicode property. We will get here only
816     if the support is in the binary; otherwise a compile-time error occurs.
817     */
818    
819 ph10 151 #ifdef SUPPORT_UCP
820 nigel 77 case OP_PROP:
821     case OP_NOTPROP:
822     if (clen > 0)
823     {
824 nigel 87 BOOL OK;
825 ph10 349 const ucd_record * prop = GET_UCD(c);
826 nigel 87 switch(code[1])
827 nigel 77 {
828 nigel 87 case PT_ANY:
829     OK = TRUE;
830     break;
831    
832     case PT_LAMP:
833 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
834 nigel 87 break;
835    
836     case PT_GC:
837 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
838 nigel 87 break;
839    
840     case PT_PC:
841 ph10 349 OK = prop->chartype == code[2];
842 nigel 87 break;
843    
844     case PT_SC:
845 ph10 349 OK = prop->script == code[2];
846 nigel 87 break;
847    
848     /* Should never occur, but keep compilers from grumbling. */
849    
850     default:
851     OK = codevalue != OP_PROP;
852     break;
853 nigel 77 }
854 nigel 87
855     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
856 nigel 77 }
857     break;
858     #endif
859    
860    
861    
862     /* ========================================================================== */
863     /* These opcodes likewise inspect the subject character, but have an
864     argument that is not a data character. It is one of these opcodes:
865 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
866     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
867 nigel 77
868     case OP_TYPEPLUS:
869     case OP_TYPEMINPLUS:
870 nigel 93 case OP_TYPEPOSPLUS:
871 nigel 77 count = current_state->count; /* Already matched */
872     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
873     if (clen > 0)
874     {
875     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
876     (c < 256 &&
877 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
878 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
879     {
880 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
881     {
882     active_count--; /* Remove non-match possibility */
883     next_active_state--;
884     }
885 nigel 77 count++;
886     ADD_NEW(state_offset, count);
887     }
888     }
889     break;
890    
891     /*-----------------------------------------------------------------*/
892     case OP_TYPEQUERY:
893     case OP_TYPEMINQUERY:
894 nigel 93 case OP_TYPEPOSQUERY:
895 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
896     if (clen > 0)
897     {
898     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
899     (c < 256 &&
900 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
901 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
902     {
903 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
904     {
905     active_count--; /* Remove non-match possibility */
906     next_active_state--;
907     }
908 nigel 77 ADD_NEW(state_offset + 2, 0);
909     }
910     }
911     break;
912    
913     /*-----------------------------------------------------------------*/
914     case OP_TYPESTAR:
915     case OP_TYPEMINSTAR:
916 nigel 93 case OP_TYPEPOSSTAR:
917 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
918     if (clen > 0)
919     {
920     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
921     (c < 256 &&
922 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
923 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
924     {
925 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
926     {
927     active_count--; /* Remove non-match possibility */
928     next_active_state--;
929     }
930 nigel 77 ADD_NEW(state_offset, 0);
931     }
932     }
933     break;
934    
935     /*-----------------------------------------------------------------*/
936     case OP_TYPEEXACT:
937 nigel 93 count = current_state->count; /* Number already matched */
938     if (clen > 0)
939     {
940     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
941     (c < 256 &&
942 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
943 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944     {
945     if (++count >= GET2(code, 1))
946     { ADD_NEW(state_offset + 4, 0); }
947     else
948     { ADD_NEW(state_offset, count); }
949     }
950     }
951     break;
952    
953     /*-----------------------------------------------------------------*/
954 nigel 77 case OP_TYPEUPTO:
955     case OP_TYPEMINUPTO:
956 nigel 93 case OP_TYPEPOSUPTO:
957     ADD_ACTIVE(state_offset + 4, 0);
958 nigel 77 count = current_state->count; /* Number already matched */
959     if (clen > 0)
960     {
961     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962     (c < 256 &&
963 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
964 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
965     {
966 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
967     {
968     active_count--; /* Remove non-match possibility */
969     next_active_state--;
970     }
971 nigel 77 if (++count >= GET2(code, 1))
972     { ADD_NEW(state_offset + 4, 0); }
973     else
974     { ADD_NEW(state_offset, count); }
975     }
976     }
977     break;
978    
979     /* ========================================================================== */
980     /* These are virtual opcodes that are used when something like
981 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_[NOT_]VSPACE, or OP_[NOT_]HSPACE as its
982     argument. It keeps the code above fast for the other cases. The argument
983     is in the d variable. */
984 nigel 77
985 ph10 151 #ifdef SUPPORT_UCP
986 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
987     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
988 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
989 nigel 77 count = current_state->count; /* Already matched */
990 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
991 nigel 77 if (clen > 0)
992     {
993 nigel 87 BOOL OK;
994 ph10 349 const ucd_record * prop = GET_UCD(c);
995 nigel 87 switch(code[2])
996     {
997     case PT_ANY:
998     OK = TRUE;
999     break;
1000    
1001     case PT_LAMP:
1002 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1003 nigel 87 break;
1004    
1005     case PT_GC:
1006 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1007 nigel 87 break;
1008    
1009     case PT_PC:
1010 ph10 349 OK = prop->chartype == code[3];
1011 nigel 87 break;
1012    
1013     case PT_SC:
1014 ph10 349 OK = prop->script == code[3];
1015 nigel 87 break;
1016    
1017     /* Should never occur, but keep compilers from grumbling. */
1018    
1019     default:
1020     OK = codevalue != OP_PROP;
1021     break;
1022     }
1023    
1024 nigel 93 if (OK == (d == OP_PROP))
1025     {
1026     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1027     {
1028     active_count--; /* Remove non-match possibility */
1029     next_active_state--;
1030     }
1031     count++;
1032     ADD_NEW(state_offset, count);
1033     }
1034 nigel 77 }
1035     break;
1036    
1037     /*-----------------------------------------------------------------*/
1038     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1039     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1040 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1041 nigel 77 count = current_state->count; /* Already matched */
1042     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1043 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1044 nigel 77 {
1045     const uschar *nptr = ptr + clen;
1046     int ncount = 0;
1047 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1048     {
1049     active_count--; /* Remove non-match possibility */
1050     next_active_state--;
1051     }
1052 nigel 77 while (nptr < end_subject)
1053     {
1054     int nd;
1055     int ndlen = 1;
1056     GETCHARLEN(nd, nptr, ndlen);
1057 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1058 nigel 77 ncount++;
1059     nptr += ndlen;
1060     }
1061     count++;
1062     ADD_NEW_DATA(-state_offset, count, ncount);
1063     }
1064     break;
1065 ph10 151 #endif
1066 nigel 77
1067     /*-----------------------------------------------------------------*/
1068 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1069     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1070     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1071     count = current_state->count; /* Already matched */
1072     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1073     if (clen > 0)
1074     {
1075     int ncount = 0;
1076     switch (c)
1077     {
1078     case 0x000b:
1079     case 0x000c:
1080     case 0x0085:
1081     case 0x2028:
1082     case 0x2029:
1083 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1084     goto ANYNL01;
1085    
1086     case 0x000d:
1087     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1088     /* Fall through */
1089    
1090     ANYNL01:
1091     case 0x000a:
1092 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1093     {
1094     active_count--; /* Remove non-match possibility */
1095     next_active_state--;
1096     }
1097     count++;
1098     ADD_NEW_DATA(-state_offset, count, ncount);
1099     break;
1100 ph10 231
1101 nigel 93 default:
1102     break;
1103     }
1104     }
1105     break;
1106    
1107     /*-----------------------------------------------------------------*/
1108 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1109     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1110     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1111     count = current_state->count; /* Already matched */
1112     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113     if (clen > 0)
1114     {
1115 ph10 182 BOOL OK;
1116 ph10 178 switch (c)
1117     {
1118     case 0x000a:
1119     case 0x000b:
1120     case 0x000c:
1121     case 0x000d:
1122     case 0x0085:
1123     case 0x2028:
1124     case 0x2029:
1125     OK = TRUE;
1126 ph10 182 break;
1127 ph10 178
1128     default:
1129     OK = FALSE;
1130 ph10 182 break;
1131 ph10 178 }
1132    
1133     if (OK == (d == OP_VSPACE))
1134 ph10 182 {
1135 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1136     {
1137     active_count--; /* Remove non-match possibility */
1138     next_active_state--;
1139     }
1140     count++;
1141     ADD_NEW_DATA(-state_offset, count, 0);
1142     }
1143     }
1144     break;
1145    
1146     /*-----------------------------------------------------------------*/
1147     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1148     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1149     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1150     count = current_state->count; /* Already matched */
1151     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1152     if (clen > 0)
1153     {
1154 ph10 182 BOOL OK;
1155 ph10 178 switch (c)
1156     {
1157     case 0x09: /* HT */
1158     case 0x20: /* SPACE */
1159     case 0xa0: /* NBSP */
1160     case 0x1680: /* OGHAM SPACE MARK */
1161     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1162     case 0x2000: /* EN QUAD */
1163     case 0x2001: /* EM QUAD */
1164     case 0x2002: /* EN SPACE */
1165     case 0x2003: /* EM SPACE */
1166     case 0x2004: /* THREE-PER-EM SPACE */
1167     case 0x2005: /* FOUR-PER-EM SPACE */
1168     case 0x2006: /* SIX-PER-EM SPACE */
1169     case 0x2007: /* FIGURE SPACE */
1170     case 0x2008: /* PUNCTUATION SPACE */
1171     case 0x2009: /* THIN SPACE */
1172     case 0x200A: /* HAIR SPACE */
1173     case 0x202f: /* NARROW NO-BREAK SPACE */
1174     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1175     case 0x3000: /* IDEOGRAPHIC SPACE */
1176     OK = TRUE;
1177     break;
1178 ph10 182
1179 ph10 178 default:
1180     OK = FALSE;
1181     break;
1182     }
1183 ph10 182
1184 ph10 178 if (OK == (d == OP_HSPACE))
1185 ph10 182 {
1186 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1187     {
1188     active_count--; /* Remove non-match possibility */
1189     next_active_state--;
1190     }
1191     count++;
1192     ADD_NEW_DATA(-state_offset, count, 0);
1193     }
1194     }
1195     break;
1196    
1197     /*-----------------------------------------------------------------*/
1198 ph10 151 #ifdef SUPPORT_UCP
1199 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1200     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1201 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1202 nigel 87 count = 4;
1203 nigel 77 goto QS1;
1204    
1205     case OP_PROP_EXTRA + OP_TYPESTAR:
1206     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1207 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1208 nigel 77 count = 0;
1209    
1210     QS1:
1211    
1212 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1213 nigel 77 if (clen > 0)
1214     {
1215 nigel 87 BOOL OK;
1216 ph10 349 const ucd_record * prop = GET_UCD(c);
1217 nigel 87 switch(code[2])
1218     {
1219     case PT_ANY:
1220     OK = TRUE;
1221     break;
1222    
1223     case PT_LAMP:
1224 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1225 nigel 87 break;
1226    
1227     case PT_GC:
1228 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1229 nigel 87 break;
1230    
1231     case PT_PC:
1232 ph10 349 OK = prop->chartype == code[3];
1233 nigel 87 break;
1234    
1235     case PT_SC:
1236 ph10 349 OK = prop->script == code[3];
1237 nigel 87 break;
1238    
1239     /* Should never occur, but keep compilers from grumbling. */
1240    
1241     default:
1242     OK = codevalue != OP_PROP;
1243     break;
1244     }
1245    
1246 nigel 93 if (OK == (d == OP_PROP))
1247     {
1248     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1249     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1250     {
1251     active_count--; /* Remove non-match possibility */
1252     next_active_state--;
1253     }
1254     ADD_NEW(state_offset + count, 0);
1255     }
1256 nigel 77 }
1257     break;
1258    
1259     /*-----------------------------------------------------------------*/
1260     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1261     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1262 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1263 nigel 77 count = 2;
1264     goto QS2;
1265    
1266     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1267     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1268 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1269 nigel 77 count = 0;
1270    
1271     QS2:
1272    
1273     ADD_ACTIVE(state_offset + 2, 0);
1274 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1275 nigel 77 {
1276     const uschar *nptr = ptr + clen;
1277     int ncount = 0;
1278 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1279     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1280     {
1281     active_count--; /* Remove non-match possibility */
1282     next_active_state--;
1283     }
1284 nigel 77 while (nptr < end_subject)
1285     {
1286     int nd;
1287     int ndlen = 1;
1288     GETCHARLEN(nd, nptr, ndlen);
1289 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1290 nigel 77 ncount++;
1291     nptr += ndlen;
1292     }
1293     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1294     }
1295     break;
1296 ph10 151 #endif
1297 nigel 77
1298     /*-----------------------------------------------------------------*/
1299 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1300     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1301     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1302     count = 2;
1303     goto QS3;
1304    
1305     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1306     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1307     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1308     count = 0;
1309    
1310     QS3:
1311     ADD_ACTIVE(state_offset + 2, 0);
1312     if (clen > 0)
1313     {
1314     int ncount = 0;
1315     switch (c)
1316     {
1317     case 0x000b:
1318     case 0x000c:
1319     case 0x0085:
1320     case 0x2028:
1321     case 0x2029:
1322 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323     goto ANYNL02;
1324    
1325     case 0x000d:
1326     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327     /* Fall through */
1328    
1329     ANYNL02:
1330     case 0x000a:
1331 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1332     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1333     {
1334     active_count--; /* Remove non-match possibility */
1335     next_active_state--;
1336     }
1337     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1338     break;
1339 ph10 231
1340 nigel 93 default:
1341     break;
1342     }
1343     }
1344     break;
1345    
1346     /*-----------------------------------------------------------------*/
1347 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1348     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1349     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1350     count = 2;
1351     goto QS4;
1352    
1353     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1354     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1355     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1356     count = 0;
1357    
1358     QS4:
1359     ADD_ACTIVE(state_offset + 2, 0);
1360     if (clen > 0)
1361     {
1362 ph10 182 BOOL OK;
1363 ph10 178 switch (c)
1364     {
1365     case 0x000a:
1366     case 0x000b:
1367     case 0x000c:
1368     case 0x000d:
1369     case 0x0085:
1370     case 0x2028:
1371     case 0x2029:
1372     OK = TRUE;
1373     break;
1374 ph10 182
1375 ph10 178 default:
1376     OK = FALSE;
1377     break;
1378     }
1379     if (OK == (d == OP_VSPACE))
1380 ph10 182 {
1381 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1382     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1383     {
1384     active_count--; /* Remove non-match possibility */
1385     next_active_state--;
1386     }
1387     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1388     }
1389     }
1390     break;
1391    
1392     /*-----------------------------------------------------------------*/
1393     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1394     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1395     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1396     count = 2;
1397     goto QS5;
1398    
1399     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1400     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1401     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1402     count = 0;
1403    
1404     QS5:
1405     ADD_ACTIVE(state_offset + 2, 0);
1406     if (clen > 0)
1407     {
1408 ph10 182 BOOL OK;
1409 ph10 178 switch (c)
1410     {
1411     case 0x09: /* HT */
1412     case 0x20: /* SPACE */
1413     case 0xa0: /* NBSP */
1414     case 0x1680: /* OGHAM SPACE MARK */
1415     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1416     case 0x2000: /* EN QUAD */
1417     case 0x2001: /* EM QUAD */
1418     case 0x2002: /* EN SPACE */
1419     case 0x2003: /* EM SPACE */
1420     case 0x2004: /* THREE-PER-EM SPACE */
1421     case 0x2005: /* FOUR-PER-EM SPACE */
1422     case 0x2006: /* SIX-PER-EM SPACE */
1423     case 0x2007: /* FIGURE SPACE */
1424     case 0x2008: /* PUNCTUATION SPACE */
1425     case 0x2009: /* THIN SPACE */
1426     case 0x200A: /* HAIR SPACE */
1427     case 0x202f: /* NARROW NO-BREAK SPACE */
1428     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1429     case 0x3000: /* IDEOGRAPHIC SPACE */
1430     OK = TRUE;
1431     break;
1432 ph10 182
1433 ph10 178 default:
1434     OK = FALSE;
1435     break;
1436     }
1437 ph10 182
1438 ph10 178 if (OK == (d == OP_HSPACE))
1439 ph10 182 {
1440 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1441     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1442     {
1443     active_count--; /* Remove non-match possibility */
1444     next_active_state--;
1445     }
1446     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1447     }
1448     }
1449     break;
1450    
1451     /*-----------------------------------------------------------------*/
1452 ph10 151 #ifdef SUPPORT_UCP
1453 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1454     case OP_PROP_EXTRA + OP_TYPEUPTO:
1455     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1456 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1457 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1458 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1459 nigel 77 count = current_state->count; /* Number already matched */
1460     if (clen > 0)
1461     {
1462 nigel 87 BOOL OK;
1463 ph10 349 const ucd_record * prop = GET_UCD(c);
1464 nigel 87 switch(code[4])
1465 nigel 77 {
1466 nigel 87 case PT_ANY:
1467     OK = TRUE;
1468     break;
1469    
1470     case PT_LAMP:
1471 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1472 nigel 87 break;
1473    
1474     case PT_GC:
1475 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1476 nigel 87 break;
1477    
1478     case PT_PC:
1479 ph10 349 OK = prop->chartype == code[5];
1480 nigel 87 break;
1481    
1482     case PT_SC:
1483 ph10 349 OK = prop->script == code[5];
1484 nigel 87 break;
1485    
1486     /* Should never occur, but keep compilers from grumbling. */
1487    
1488     default:
1489     OK = codevalue != OP_PROP;
1490     break;
1491     }
1492    
1493     if (OK == (d == OP_PROP))
1494     {
1495 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1496     {
1497     active_count--; /* Remove non-match possibility */
1498     next_active_state--;
1499     }
1500 nigel 77 if (++count >= GET2(code, 1))
1501 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1502 nigel 77 else
1503     { ADD_NEW(state_offset, count); }
1504     }
1505     }
1506     break;
1507    
1508     /*-----------------------------------------------------------------*/
1509     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1510     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1511     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1512 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1513 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1514     { ADD_ACTIVE(state_offset + 4, 0); }
1515     count = current_state->count; /* Number already matched */
1516 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1517 nigel 77 {
1518     const uschar *nptr = ptr + clen;
1519     int ncount = 0;
1520 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1521     {
1522     active_count--; /* Remove non-match possibility */
1523     next_active_state--;
1524     }
1525 nigel 77 while (nptr < end_subject)
1526     {
1527     int nd;
1528     int ndlen = 1;
1529     GETCHARLEN(nd, nptr, ndlen);
1530 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1531 nigel 77 ncount++;
1532     nptr += ndlen;
1533     }
1534     if (++count >= GET2(code, 1))
1535     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1536     else
1537     { ADD_NEW_DATA(-state_offset, count, ncount); }
1538     }
1539     break;
1540 ph10 151 #endif
1541 nigel 77
1542 nigel 93 /*-----------------------------------------------------------------*/
1543     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1544     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1545     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1546     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1547     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1548     { ADD_ACTIVE(state_offset + 4, 0); }
1549     count = current_state->count; /* Number already matched */
1550     if (clen > 0)
1551     {
1552     int ncount = 0;
1553     switch (c)
1554     {
1555     case 0x000b:
1556     case 0x000c:
1557     case 0x0085:
1558     case 0x2028:
1559     case 0x2029:
1560 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1561     goto ANYNL03;
1562    
1563     case 0x000d:
1564     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1565     /* Fall through */
1566    
1567     ANYNL03:
1568     case 0x000a:
1569 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1570     {
1571     active_count--; /* Remove non-match possibility */
1572     next_active_state--;
1573     }
1574     if (++count >= GET2(code, 1))
1575     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1576     else
1577     { ADD_NEW_DATA(-state_offset, count, ncount); }
1578     break;
1579 ph10 231
1580 nigel 93 default:
1581     break;
1582     }
1583     }
1584     break;
1585    
1586 ph10 178 /*-----------------------------------------------------------------*/
1587     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1588     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1589     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1590     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1591     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1592     { ADD_ACTIVE(state_offset + 4, 0); }
1593     count = current_state->count; /* Number already matched */
1594     if (clen > 0)
1595     {
1596 ph10 182 BOOL OK;
1597 ph10 178 switch (c)
1598     {
1599     case 0x000a:
1600     case 0x000b:
1601     case 0x000c:
1602     case 0x000d:
1603     case 0x0085:
1604     case 0x2028:
1605     case 0x2029:
1606     OK = TRUE;
1607     break;
1608 ph10 182
1609 ph10 178 default:
1610     OK = FALSE;
1611     }
1612 ph10 182
1613 ph10 178 if (OK == (d == OP_VSPACE))
1614 ph10 182 {
1615 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1616     {
1617     active_count--; /* Remove non-match possibility */
1618     next_active_state--;
1619     }
1620     if (++count >= GET2(code, 1))
1621     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1622     else
1623     { ADD_NEW_DATA(-state_offset, count, 0); }
1624     }
1625     }
1626     break;
1627    
1628     /*-----------------------------------------------------------------*/
1629     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1630     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1631     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1632     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1633     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1634     { ADD_ACTIVE(state_offset + 4, 0); }
1635     count = current_state->count; /* Number already matched */
1636     if (clen > 0)
1637     {
1638 ph10 182 BOOL OK;
1639 ph10 178 switch (c)
1640     {
1641     case 0x09: /* HT */
1642     case 0x20: /* SPACE */
1643     case 0xa0: /* NBSP */
1644     case 0x1680: /* OGHAM SPACE MARK */
1645     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1646     case 0x2000: /* EN QUAD */
1647     case 0x2001: /* EM QUAD */
1648     case 0x2002: /* EN SPACE */
1649     case 0x2003: /* EM SPACE */
1650     case 0x2004: /* THREE-PER-EM SPACE */
1651     case 0x2005: /* FOUR-PER-EM SPACE */
1652     case 0x2006: /* SIX-PER-EM SPACE */
1653     case 0x2007: /* FIGURE SPACE */
1654     case 0x2008: /* PUNCTUATION SPACE */
1655     case 0x2009: /* THIN SPACE */
1656     case 0x200A: /* HAIR SPACE */
1657     case 0x202f: /* NARROW NO-BREAK SPACE */
1658     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1659     case 0x3000: /* IDEOGRAPHIC SPACE */
1660     OK = TRUE;
1661     break;
1662 ph10 182
1663 ph10 178 default:
1664     OK = FALSE;
1665     break;
1666     }
1667 ph10 182
1668 ph10 178 if (OK == (d == OP_HSPACE))
1669 ph10 182 {
1670 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1671     {
1672     active_count--; /* Remove non-match possibility */
1673     next_active_state--;
1674     }
1675     if (++count >= GET2(code, 1))
1676     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1677     else
1678     { ADD_NEW_DATA(-state_offset, count, 0); }
1679     }
1680     }
1681     break;
1682    
1683 nigel 77 /* ========================================================================== */
1684     /* These opcodes are followed by a character that is usually compared
1685     to the current subject character; it is loaded into d. We still get
1686     here even if there is no subject character, because in some cases zero
1687     repetitions are permitted. */
1688    
1689     /*-----------------------------------------------------------------*/
1690     case OP_CHAR:
1691     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1692     break;
1693    
1694     /*-----------------------------------------------------------------*/
1695     case OP_CHARNC:
1696     if (clen == 0) break;
1697    
1698     #ifdef SUPPORT_UTF8
1699     if (utf8)
1700     {
1701     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1702     {
1703 nigel 93 unsigned int othercase;
1704 nigel 77 if (c < 128) othercase = fcc[c]; else
1705    
1706     /* If we have Unicode property support, we can use it to test the
1707 nigel 87 other case of the character. */
1708 nigel 77
1709     #ifdef SUPPORT_UCP
1710 ph10 349 othercase = UCD_OTHERCASE(c);
1711 nigel 87 #else
1712 nigel 93 othercase = NOTACHAR;
1713 nigel 77 #endif
1714    
1715     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1716     }
1717     }
1718     else
1719     #endif /* SUPPORT_UTF8 */
1720    
1721     /* Non-UTF-8 mode */
1722     {
1723     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1724     }
1725     break;
1726    
1727    
1728     #ifdef SUPPORT_UCP
1729     /*-----------------------------------------------------------------*/
1730     /* This is a tricky one because it can match more than one character.
1731     Find out how many characters to skip, and then set up a negative state
1732     to wait for them to pass before continuing. */
1733    
1734     case OP_EXTUNI:
1735 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1736 nigel 77 {
1737     const uschar *nptr = ptr + clen;
1738     int ncount = 0;
1739     while (nptr < end_subject)
1740     {
1741     int nclen = 1;
1742     GETCHARLEN(c, nptr, nclen);
1743 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1744 nigel 77 ncount++;
1745     nptr += nclen;
1746     }
1747     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1748     }
1749     break;
1750     #endif
1751    
1752     /*-----------------------------------------------------------------*/
1753 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
1754     character (when CR is followed by LF). In this case, set up a negative
1755     state to wait for one character to pass before continuing. */
1756    
1757     case OP_ANYNL:
1758     if (clen > 0) switch(c)
1759     {
1760     case 0x000b:
1761     case 0x000c:
1762     case 0x0085:
1763     case 0x2028:
1764     case 0x2029:
1765 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1766    
1767     case 0x000a:
1768 nigel 93 ADD_NEW(state_offset + 1, 0);
1769     break;
1770 ph10 231
1771 nigel 93 case 0x000d:
1772     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1773     {
1774     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1775     }
1776     else
1777     {
1778     ADD_NEW(state_offset + 1, 0);
1779     }
1780     break;
1781     }
1782     break;
1783    
1784     /*-----------------------------------------------------------------*/
1785 ph10 178 case OP_NOT_VSPACE:
1786     if (clen > 0) switch(c)
1787     {
1788     case 0x000a:
1789     case 0x000b:
1790     case 0x000c:
1791     case 0x000d:
1792     case 0x0085:
1793     case 0x2028:
1794     case 0x2029:
1795     break;
1796 ph10 182
1797     default:
1798 ph10 178 ADD_NEW(state_offset + 1, 0);
1799     break;
1800     }
1801     break;
1802    
1803     /*-----------------------------------------------------------------*/
1804     case OP_VSPACE:
1805     if (clen > 0) switch(c)
1806     {
1807     case 0x000a:
1808     case 0x000b:
1809     case 0x000c:
1810     case 0x000d:
1811     case 0x0085:
1812     case 0x2028:
1813     case 0x2029:
1814     ADD_NEW(state_offset + 1, 0);
1815     break;
1816 ph10 182
1817 ph10 178 default: break;
1818     }
1819     break;
1820    
1821     /*-----------------------------------------------------------------*/
1822     case OP_NOT_HSPACE:
1823     if (clen > 0) switch(c)
1824     {
1825     case 0x09: /* HT */
1826     case 0x20: /* SPACE */
1827     case 0xa0: /* NBSP */
1828     case 0x1680: /* OGHAM SPACE MARK */
1829     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1830     case 0x2000: /* EN QUAD */
1831     case 0x2001: /* EM QUAD */
1832     case 0x2002: /* EN SPACE */
1833     case 0x2003: /* EM SPACE */
1834     case 0x2004: /* THREE-PER-EM SPACE */
1835     case 0x2005: /* FOUR-PER-EM SPACE */
1836     case 0x2006: /* SIX-PER-EM SPACE */
1837     case 0x2007: /* FIGURE SPACE */
1838     case 0x2008: /* PUNCTUATION SPACE */
1839     case 0x2009: /* THIN SPACE */
1840     case 0x200A: /* HAIR SPACE */
1841     case 0x202f: /* NARROW NO-BREAK SPACE */
1842     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1843     case 0x3000: /* IDEOGRAPHIC SPACE */
1844     break;
1845 ph10 182
1846     default:
1847 ph10 178 ADD_NEW(state_offset + 1, 0);
1848     break;
1849     }
1850     break;
1851    
1852     /*-----------------------------------------------------------------*/
1853     case OP_HSPACE:
1854     if (clen > 0) switch(c)
1855     {
1856     case 0x09: /* HT */
1857     case 0x20: /* SPACE */
1858     case 0xa0: /* NBSP */
1859     case 0x1680: /* OGHAM SPACE MARK */
1860     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1861     case 0x2000: /* EN QUAD */
1862     case 0x2001: /* EM QUAD */
1863     case 0x2002: /* EN SPACE */
1864     case 0x2003: /* EM SPACE */
1865     case 0x2004: /* THREE-PER-EM SPACE */
1866     case 0x2005: /* FOUR-PER-EM SPACE */
1867     case 0x2006: /* SIX-PER-EM SPACE */
1868     case 0x2007: /* FIGURE SPACE */
1869     case 0x2008: /* PUNCTUATION SPACE */
1870     case 0x2009: /* THIN SPACE */
1871     case 0x200A: /* HAIR SPACE */
1872     case 0x202f: /* NARROW NO-BREAK SPACE */
1873     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1874     case 0x3000: /* IDEOGRAPHIC SPACE */
1875     ADD_NEW(state_offset + 1, 0);
1876     break;
1877     }
1878     break;
1879    
1880     /*-----------------------------------------------------------------*/
1881 nigel 77 /* Match a negated single character. This is only used for one-byte
1882     characters, that is, we know that d < 256. The character we are
1883     checking (c) can be multibyte. */
1884    
1885     case OP_NOT:
1886     if (clen > 0)
1887     {
1888 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1889 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1890     }
1891     break;
1892    
1893     /*-----------------------------------------------------------------*/
1894     case OP_PLUS:
1895     case OP_MINPLUS:
1896 nigel 93 case OP_POSPLUS:
1897 nigel 77 case OP_NOTPLUS:
1898     case OP_NOTMINPLUS:
1899 nigel 93 case OP_NOTPOSPLUS:
1900 nigel 77 count = current_state->count; /* Already matched */
1901     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1902     if (clen > 0)
1903     {
1904 nigel 93 unsigned int otherd = NOTACHAR;
1905 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1906     {
1907     #ifdef SUPPORT_UTF8
1908 nigel 87 if (utf8 && d >= 128)
1909 nigel 77 {
1910     #ifdef SUPPORT_UCP
1911 ph10 349 otherd = UCD_OTHERCASE(d);
1912 nigel 77 #endif /* SUPPORT_UCP */
1913     }
1914     else
1915     #endif /* SUPPORT_UTF8 */
1916     otherd = fcc[d];
1917     }
1918     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1919 nigel 93 {
1920     if (count > 0 &&
1921     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1922     {
1923     active_count--; /* Remove non-match possibility */
1924     next_active_state--;
1925     }
1926     count++;
1927     ADD_NEW(state_offset, count);
1928     }
1929 nigel 77 }
1930     break;
1931    
1932     /*-----------------------------------------------------------------*/
1933     case OP_QUERY:
1934     case OP_MINQUERY:
1935 nigel 93 case OP_POSQUERY:
1936 nigel 77 case OP_NOTQUERY:
1937     case OP_NOTMINQUERY:
1938 nigel 93 case OP_NOTPOSQUERY:
1939 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1940     if (clen > 0)
1941     {
1942 nigel 93 unsigned int otherd = NOTACHAR;
1943 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1944 nigel 77 {
1945     #ifdef SUPPORT_UTF8
1946 nigel 87 if (utf8 && d >= 128)
1947 nigel 77 {
1948     #ifdef SUPPORT_UCP
1949 ph10 349 otherd = UCD_OTHERCASE(d);
1950 nigel 77 #endif /* SUPPORT_UCP */
1951     }
1952     else
1953     #endif /* SUPPORT_UTF8 */
1954     otherd = fcc[d];
1955     }
1956     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1957 nigel 93 {
1958     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1959     {
1960     active_count--; /* Remove non-match possibility */
1961     next_active_state--;
1962     }
1963     ADD_NEW(state_offset + dlen + 1, 0);
1964     }
1965 nigel 77 }
1966     break;
1967    
1968     /*-----------------------------------------------------------------*/
1969     case OP_STAR:
1970     case OP_MINSTAR:
1971 nigel 93 case OP_POSSTAR:
1972 nigel 77 case OP_NOTSTAR:
1973     case OP_NOTMINSTAR:
1974 nigel 93 case OP_NOTPOSSTAR:
1975 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1976     if (clen > 0)
1977     {
1978 nigel 93 unsigned int otherd = NOTACHAR;
1979 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1980 nigel 77 {
1981     #ifdef SUPPORT_UTF8
1982 nigel 87 if (utf8 && d >= 128)
1983 nigel 77 {
1984     #ifdef SUPPORT_UCP
1985 ph10 349 otherd = UCD_OTHERCASE(d);
1986 nigel 77 #endif /* SUPPORT_UCP */
1987     }
1988     else
1989     #endif /* SUPPORT_UTF8 */
1990     otherd = fcc[d];
1991     }
1992     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1993 nigel 93 {
1994     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1995     {
1996     active_count--; /* Remove non-match possibility */
1997     next_active_state--;
1998     }
1999     ADD_NEW(state_offset, 0);
2000     }
2001 nigel 77 }
2002     break;
2003    
2004     /*-----------------------------------------------------------------*/
2005     case OP_EXACT:
2006 nigel 93 case OP_NOTEXACT:
2007     count = current_state->count; /* Number already matched */
2008     if (clen > 0)
2009     {
2010     unsigned int otherd = NOTACHAR;
2011     if ((ims & PCRE_CASELESS) != 0)
2012     {
2013     #ifdef SUPPORT_UTF8
2014     if (utf8 && d >= 128)
2015     {
2016     #ifdef SUPPORT_UCP
2017 ph10 349 otherd = UCD_OTHERCASE(d);
2018 nigel 93 #endif /* SUPPORT_UCP */
2019     }
2020     else
2021     #endif /* SUPPORT_UTF8 */
2022     otherd = fcc[d];
2023     }
2024     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2025     {
2026     if (++count >= GET2(code, 1))
2027     { ADD_NEW(state_offset + dlen + 3, 0); }
2028     else
2029     { ADD_NEW(state_offset, count); }
2030     }
2031     }
2032     break;
2033    
2034     /*-----------------------------------------------------------------*/
2035 nigel 77 case OP_UPTO:
2036     case OP_MINUPTO:
2037 nigel 93 case OP_POSUPTO:
2038 nigel 77 case OP_NOTUPTO:
2039     case OP_NOTMINUPTO:
2040 nigel 93 case OP_NOTPOSUPTO:
2041     ADD_ACTIVE(state_offset + dlen + 3, 0);
2042 nigel 77 count = current_state->count; /* Number already matched */
2043     if (clen > 0)
2044     {
2045 nigel 93 unsigned int otherd = NOTACHAR;
2046 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2047     {
2048     #ifdef SUPPORT_UTF8
2049 nigel 87 if (utf8 && d >= 128)
2050 nigel 77 {
2051     #ifdef SUPPORT_UCP
2052 ph10 349 otherd = UCD_OTHERCASE(d);
2053 nigel 77 #endif /* SUPPORT_UCP */
2054     }
2055     else
2056     #endif /* SUPPORT_UTF8 */
2057     otherd = fcc[d];
2058     }
2059     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2060     {
2061 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2062     {
2063     active_count--; /* Remove non-match possibility */
2064     next_active_state--;
2065     }
2066 nigel 77 if (++count >= GET2(code, 1))
2067     { ADD_NEW(state_offset + dlen + 3, 0); }
2068     else
2069     { ADD_NEW(state_offset, count); }
2070     }
2071     }
2072     break;
2073    
2074    
2075     /* ========================================================================== */
2076     /* These are the class-handling opcodes */
2077    
2078     case OP_CLASS:
2079     case OP_NCLASS:
2080     case OP_XCLASS:
2081     {
2082     BOOL isinclass = FALSE;
2083     int next_state_offset;
2084     const uschar *ecode;
2085    
2086     /* For a simple class, there is always just a 32-byte table, and we
2087     can set isinclass from it. */
2088    
2089     if (codevalue != OP_XCLASS)
2090     {
2091     ecode = code + 33;
2092     if (clen > 0)
2093     {
2094     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2095     ((code[1 + c/8] & (1 << (c&7))) != 0);
2096     }
2097     }
2098    
2099     /* An extended class may have a table or a list of single characters,
2100     ranges, or both, and it may be positive or negative. There's a
2101     function that sorts all this out. */
2102    
2103     else
2104     {
2105     ecode = code + GET(code, 1);
2106     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2107     }
2108    
2109     /* At this point, isinclass is set for all kinds of class, and ecode
2110     points to the byte after the end of the class. If there is a
2111     quantifier, this is where it will be. */
2112    
2113     next_state_offset = ecode - start_code;
2114    
2115     switch (*ecode)
2116     {
2117     case OP_CRSTAR:
2118     case OP_CRMINSTAR:
2119     ADD_ACTIVE(next_state_offset + 1, 0);
2120     if (isinclass) { ADD_NEW(state_offset, 0); }
2121     break;
2122    
2123     case OP_CRPLUS:
2124     case OP_CRMINPLUS:
2125     count = current_state->count; /* Already matched */
2126     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2127     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2128     break;
2129    
2130     case OP_CRQUERY:
2131     case OP_CRMINQUERY:
2132     ADD_ACTIVE(next_state_offset + 1, 0);
2133     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2134     break;
2135    
2136     case OP_CRRANGE:
2137     case OP_CRMINRANGE:
2138     count = current_state->count; /* Already matched */
2139     if (count >= GET2(ecode, 1))
2140     { ADD_ACTIVE(next_state_offset + 5, 0); }
2141     if (isinclass)
2142     {
2143 nigel 91 int max = GET2(ecode, 3);
2144     if (++count >= max && max != 0) /* Max 0 => no limit */
2145 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2146     else
2147     { ADD_NEW(state_offset, count); }
2148     }
2149     break;
2150    
2151     default:
2152     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2153     break;
2154     }
2155     }
2156     break;
2157    
2158     /* ========================================================================== */
2159     /* These are the opcodes for fancy brackets of various kinds. We have
2160 ph10 345 to use recursion in order to handle them. The "always failing" assertion
2161 ph10 341 (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2162     though the other "backtracking verbs" are not supported. */
2163 ph10 345
2164 ph10 341 case OP_FAIL:
2165 ph10 345 break;
2166 nigel 77
2167     case OP_ASSERT:
2168     case OP_ASSERT_NOT:
2169     case OP_ASSERTBACK:
2170     case OP_ASSERTBACK_NOT:
2171     {
2172     int rc;
2173     int local_offsets[2];
2174     int local_workspace[1000];
2175     const uschar *endasscode = code + GET(code, 1);
2176    
2177     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2178    
2179     rc = internal_dfa_exec(
2180     md, /* static match data */
2181     code, /* this subexpression's code */
2182     ptr, /* where we currently are */
2183     ptr - start_subject, /* start offset */
2184     local_offsets, /* offset vector */
2185     sizeof(local_offsets)/sizeof(int), /* size of same */
2186     local_workspace, /* workspace vector */
2187     sizeof(local_workspace)/sizeof(int), /* size of same */
2188     ims, /* the current ims flags */
2189     rlevel, /* function recursion level */
2190     recursing); /* pass on regex recursion */
2191    
2192     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2193     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2194     }
2195     break;
2196    
2197     /*-----------------------------------------------------------------*/
2198     case OP_COND:
2199 nigel 93 case OP_SCOND:
2200 nigel 77 {
2201     int local_offsets[1000];
2202     int local_workspace[1000];
2203 ph10 406 int codelink = GET(code, 1);
2204 ph10 397 int condcode;
2205 ph10 406
2206 ph10 397 /* Because of the way auto-callout works during compile, a callout item
2207 ph10 406 is inserted between OP_COND and an assertion condition. This does not
2208 ph10 398 happen for the other conditions. */
2209 nigel 77
2210 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2211 ph10 406 {
2212     rrc = 0;
2213 ph10 397 if (pcre_callout != NULL)
2214     {
2215     pcre_callout_block cb;
2216     cb.version = 1; /* Version 1 of the callout block */
2217     cb.callout_number = code[LINK_SIZE+2];
2218     cb.offset_vector = offsets;
2219     cb.subject = (PCRE_SPTR)start_subject;
2220     cb.subject_length = end_subject - start_subject;
2221     cb.start_match = current_subject - start_subject;
2222     cb.current_position = ptr - start_subject;
2223     cb.pattern_position = GET(code, LINK_SIZE + 3);
2224     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2225     cb.capture_top = 1;
2226     cb.capture_last = -1;
2227     cb.callout_data = md->callout_data;
2228     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2229     }
2230 ph10 398 if (rrc > 0) break; /* Fail this thread */
2231     code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2232 ph10 406 }
2233 ph10 398
2234 ph10 397 condcode = code[LINK_SIZE+1];
2235 ph10 406
2236 nigel 93 /* Back reference conditions are not supported */
2237 nigel 77
2238 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2239    
2240     /* The DEFINE condition is always false */
2241    
2242     if (condcode == OP_DEF)
2243 ph10 398 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2244 nigel 93
2245     /* The only supported version of OP_RREF is for the value RREF_ANY,
2246     which means "test if in any recursion". We can't test for specifically
2247     recursed groups. */
2248    
2249     else if (condcode == OP_RREF)
2250     {
2251 nigel 77 int value = GET2(code, LINK_SIZE+2);
2252 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2253 ph10 406 if (recursing > 0)
2254 ph10 398 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2255     else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2256 nigel 77 }
2257    
2258     /* Otherwise, the condition is an assertion */
2259    
2260     else
2261     {
2262     int rc;
2263     const uschar *asscode = code + LINK_SIZE + 1;
2264     const uschar *endasscode = asscode + GET(asscode, 1);
2265    
2266     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2267    
2268     rc = internal_dfa_exec(
2269     md, /* fixed match data */
2270     asscode, /* this subexpression's code */
2271     ptr, /* where we currently are */
2272     ptr - start_subject, /* start offset */
2273     local_offsets, /* offset vector */
2274     sizeof(local_offsets)/sizeof(int), /* size of same */
2275     local_workspace, /* workspace vector */
2276     sizeof(local_workspace)/sizeof(int), /* size of same */
2277     ims, /* the current ims flags */
2278     rlevel, /* function recursion level */
2279     recursing); /* pass on regex recursion */
2280    
2281     if ((rc >= 0) ==
2282     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2283 ph10 398 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2284 nigel 77 else
2285 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2286 nigel 77 }
2287     }
2288     break;
2289    
2290     /*-----------------------------------------------------------------*/
2291     case OP_RECURSE:
2292     {
2293     int local_offsets[1000];
2294     int local_workspace[1000];
2295     int rc;
2296    
2297     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2298     recursing + 1));
2299    
2300     rc = internal_dfa_exec(
2301     md, /* fixed match data */
2302     start_code + GET(code, 1), /* this subexpression's code */
2303     ptr, /* where we currently are */
2304     ptr - start_subject, /* start offset */
2305     local_offsets, /* offset vector */
2306     sizeof(local_offsets)/sizeof(int), /* size of same */
2307     local_workspace, /* workspace vector */
2308     sizeof(local_workspace)/sizeof(int), /* size of same */
2309     ims, /* the current ims flags */
2310     rlevel, /* function recursion level */
2311     recursing + 1); /* regex recurse level */
2312    
2313     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2314     recursing + 1, rc));
2315    
2316     /* Ran out of internal offsets */
2317    
2318     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2319    
2320     /* For each successfully matched substring, set up the next state with a
2321     count of characters to skip before trying it. Note that the count is in
2322     characters, not bytes. */
2323    
2324     if (rc > 0)
2325     {
2326     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2327     {
2328     const uschar *p = start_subject + local_offsets[rc];
2329     const uschar *pp = start_subject + local_offsets[rc+1];
2330     int charcount = local_offsets[rc+1] - local_offsets[rc];
2331     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2332     if (charcount > 0)
2333     {
2334     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2335     }
2336     else
2337     {
2338     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2339     }
2340     }
2341     }
2342     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2343     }
2344     break;
2345    
2346     /*-----------------------------------------------------------------*/
2347     case OP_ONCE:
2348     {
2349     int local_offsets[2];
2350     int local_workspace[1000];
2351    
2352     int rc = internal_dfa_exec(
2353     md, /* fixed match data */
2354     code, /* this subexpression's code */
2355     ptr, /* where we currently are */
2356     ptr - start_subject, /* start offset */
2357     local_offsets, /* offset vector */
2358     sizeof(local_offsets)/sizeof(int), /* size of same */
2359     local_workspace, /* workspace vector */
2360     sizeof(local_workspace)/sizeof(int), /* size of same */
2361     ims, /* the current ims flags */
2362     rlevel, /* function recursion level */
2363     recursing); /* pass on regex recursion */
2364    
2365     if (rc >= 0)
2366     {
2367     const uschar *end_subpattern = code;
2368     int charcount = local_offsets[1] - local_offsets[0];
2369     int next_state_offset, repeat_state_offset;
2370    
2371     do { end_subpattern += GET(end_subpattern, 1); }
2372     while (*end_subpattern == OP_ALT);
2373     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2374    
2375     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2376     arrange for the repeat state also to be added to the relevant list.
2377     Calculate the offset, or set -1 for no repeat. */
2378    
2379     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2380     *end_subpattern == OP_KETRMIN)?
2381     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2382    
2383     /* If we have matched an empty string, add the next state at the
2384     current character pointer. This is important so that the duplicate
2385     checking kicks in, which is what breaks infinite loops that match an
2386     empty string. */
2387    
2388     if (charcount == 0)
2389     {
2390     ADD_ACTIVE(next_state_offset, 0);
2391     }
2392    
2393     /* Optimization: if there are no more active states, and there
2394     are no new states yet set up, then skip over the subject string
2395     right here, to save looping. Otherwise, set up the new state to swing
2396     into action when the end of the substring is reached. */
2397    
2398     else if (i + 1 >= active_count && new_count == 0)
2399     {
2400     ptr += charcount;
2401     clen = 0;
2402     ADD_NEW(next_state_offset, 0);
2403    
2404     /* If we are adding a repeat state at the new character position,
2405     we must fudge things so that it is the only current state.
2406     Otherwise, it might be a duplicate of one we processed before, and
2407     that would cause it to be skipped. */
2408    
2409     if (repeat_state_offset >= 0)
2410     {
2411     next_active_state = active_states;
2412     active_count = 0;
2413     i = -1;
2414     ADD_ACTIVE(repeat_state_offset, 0);
2415     }
2416     }
2417     else
2418     {
2419     const uschar *p = start_subject + local_offsets[0];
2420     const uschar *pp = start_subject + local_offsets[1];
2421     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2422     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2423     if (repeat_state_offset >= 0)
2424     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2425     }
2426    
2427     }
2428     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2429     }
2430     break;
2431    
2432    
2433     /* ========================================================================== */
2434     /* Handle callouts */
2435    
2436     case OP_CALLOUT:
2437 ph10 406 rrc = 0;
2438 nigel 77 if (pcre_callout != NULL)
2439     {
2440     pcre_callout_block cb;
2441     cb.version = 1; /* Version 1 of the callout block */
2442     cb.callout_number = code[1];
2443     cb.offset_vector = offsets;
2444 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2445 nigel 77 cb.subject_length = end_subject - start_subject;
2446     cb.start_match = current_subject - start_subject;
2447     cb.current_position = ptr - start_subject;
2448     cb.pattern_position = GET(code, 2);
2449     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2450     cb.capture_top = 1;
2451     cb.capture_last = -1;
2452     cb.callout_data = md->callout_data;
2453     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2454 ph10 406 }
2455     if (rrc == 0)
2456     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2457 nigel 77 break;
2458    
2459    
2460     /* ========================================================================== */
2461     default: /* Unsupported opcode */
2462     return PCRE_ERROR_DFA_UITEM;
2463     }
2464    
2465     NEXT_ACTIVE_STATE: continue;
2466    
2467     } /* End of loop scanning active states */
2468    
2469     /* We have finished the processing at the current subject character. If no
2470     new states have been set for the next character, we have found all the
2471     matches that we are going to find. If we are at the top level and partial
2472     matching has been requested, check for appropriate conditions. */
2473    
2474     if (new_count <= 0)
2475     {
2476     if (match_count < 0 && /* No matches found */
2477     rlevel == 1 && /* Top level match function */
2478     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2479     ptr >= end_subject && /* Reached end of subject */
2480     ptr > current_subject) /* Matched non-empty string */
2481     {
2482     if (offsetcount >= 2)
2483     {
2484     offsets[0] = current_subject - start_subject;
2485     offsets[1] = end_subject - start_subject;
2486     }
2487     match_count = PCRE_ERROR_PARTIAL;
2488     }
2489    
2490     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2491     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2492     rlevel*2-2, SP));
2493 nigel 91 break; /* In effect, "return", but see the comment below */
2494 nigel 77 }
2495    
2496     /* One or more states are active for the next character. */
2497    
2498     ptr += clen; /* Advance to next subject character */
2499     } /* Loop to move along the subject string */
2500    
2501 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2502     if we use "return" above, we have compiler trouble. Some compilers warn if
2503     there's nothing here because they think the function doesn't return a value. On
2504     the other hand, if we put a dummy statement here, some more clever compilers
2505     complain that it can't be reached. Sigh. */
2506 nigel 77
2507 nigel 91 return match_count;
2508 nigel 77 }
2509    
2510    
2511    
2512    
2513     /*************************************************
2514     * Execute a Regular Expression - DFA engine *
2515     *************************************************/
2516    
2517     /* This external function applies a compiled re to a subject string using a DFA
2518     engine. This function calls the internal function multiple times if the pattern
2519     is not anchored.
2520    
2521     Arguments:
2522     argument_re points to the compiled expression
2523 ph10 97 extra_data points to extra data or is NULL
2524 nigel 77 subject points to the subject string
2525     length length of subject string (may contain binary zeros)
2526     start_offset where to start in the subject string
2527     options option bits
2528     offsets vector of match offsets
2529     offsetcount size of same
2530     workspace workspace vector
2531     wscount size of same
2532    
2533     Returns: > 0 => number of match offset pairs placed in offsets
2534     = 0 => offsets overflowed; longest matches are present
2535     -1 => failed to match
2536     < -1 => some kind of unexpected problem
2537     */
2538    
2539 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2540 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2541     const char *subject, int length, int start_offset, int options, int *offsets,
2542     int offsetcount, int *workspace, int wscount)
2543     {
2544     real_pcre *re = (real_pcre *)argument_re;
2545     dfa_match_data match_block;
2546 nigel 91 dfa_match_data *md = &match_block;
2547 nigel 77 BOOL utf8, anchored, startline, firstline;
2548     const uschar *current_subject, *end_subject, *lcc;
2549    
2550     pcre_study_data internal_study;
2551     const pcre_study_data *study = NULL;
2552     real_pcre internal_re;
2553    
2554     const uschar *req_byte_ptr;
2555     const uschar *start_bits = NULL;
2556     BOOL first_byte_caseless = FALSE;
2557     BOOL req_byte_caseless = FALSE;
2558     int first_byte = -1;
2559     int req_byte = -1;
2560     int req_byte2 = -1;
2561 nigel 91 int newline;
2562 nigel 77
2563     /* Plausibility checks */
2564    
2565     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2566     if (re == NULL || subject == NULL || workspace == NULL ||
2567     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2568     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2569     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2570    
2571     /* We need to find the pointer to any study data before we test for byte
2572     flipping, so we scan the extra_data block first. This may set two fields in the
2573     match block, so we must initialize them beforehand. However, the other fields
2574     in the match block must not be set until after the byte flipping. */
2575    
2576 nigel 91 md->tables = re->tables;
2577     md->callout_data = NULL;
2578 nigel 77
2579     if (extra_data != NULL)
2580     {
2581     unsigned int flags = extra_data->flags;
2582     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2583     study = (const pcre_study_data *)extra_data->study_data;
2584     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2585 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2586     return PCRE_ERROR_DFA_UMLIMIT;
2587 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2588 nigel 91 md->callout_data = extra_data->callout_data;
2589 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2590 nigel 91 md->tables = extra_data->tables;
2591 nigel 77 }
2592    
2593     /* Check that the first field in the block is the magic number. If it is not,
2594     test for a regex that was compiled on a host of opposite endianness. If this is
2595     the case, flipped values are put in internal_re and internal_study if there was
2596     study data too. */
2597    
2598     if (re->magic_number != MAGIC_NUMBER)
2599     {
2600     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2601     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2602     if (study != NULL) study = &internal_study;
2603     }
2604    
2605     /* Set some local values */
2606    
2607     current_subject = (const unsigned char *)subject + start_offset;
2608     end_subject = (const unsigned char *)subject + length;
2609     req_byte_ptr = current_subject - 1;
2610    
2611 nigel 91 #ifdef SUPPORT_UTF8
2612 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2613 nigel 91 #else
2614     utf8 = FALSE;
2615     #endif
2616 nigel 77
2617 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2618     (re->options & PCRE_ANCHORED) != 0;
2619    
2620 nigel 77 /* The remaining fixed data for passing around. */
2621    
2622 nigel 91 md->start_code = (const uschar *)argument_re +
2623 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2624 nigel 91 md->start_subject = (const unsigned char *)subject;
2625     md->end_subject = end_subject;
2626     md->moptions = options;
2627     md->poptions = re->options;
2628 nigel 77
2629 ph10 231 /* If the BSR option is not set at match time, copy what was set
2630     at compile time. */
2631    
2632     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2633     {
2634     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2635     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2636     #ifdef BSR_ANYCRLF
2637     else md->moptions |= PCRE_BSR_ANYCRLF;
2638 ph10 243 #endif
2639     }
2640 ph10 231
2641 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2642     nothing is set at run time, whatever was used at compile time applies. */
2643 nigel 91
2644 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2645 nigel 93 PCRE_NEWLINE_BITS)
2646 nigel 91 {
2647 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2648 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2649     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2650 nigel 91 case PCRE_NEWLINE_CR+
2651 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2652 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2653 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2654 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2655 nigel 91 }
2656    
2657 ph10 149 if (newline == -2)
2658 nigel 91 {
2659 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2660     }
2661     else if (newline < 0)
2662     {
2663 nigel 93 md->nltype = NLTYPE_ANY;
2664 nigel 91 }
2665     else
2666     {
2667 nigel 93 md->nltype = NLTYPE_FIXED;
2668     if (newline > 255)
2669     {
2670     md->nllen = 2;
2671     md->nl[0] = (newline >> 8) & 255;
2672     md->nl[1] = newline & 255;
2673     }
2674     else
2675     {
2676     md->nllen = 1;
2677     md->nl[0] = newline;
2678     }
2679 nigel 91 }
2680    
2681 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2682     back the character offset. */
2683    
2684     #ifdef SUPPORT_UTF8
2685     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2686     {
2687     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2688     return PCRE_ERROR_BADUTF8;
2689     if (start_offset > 0 && start_offset < length)
2690     {
2691     int tb = ((uschar *)subject)[start_offset];
2692     if (tb > 127)
2693     {
2694     tb &= 0xc0;
2695     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2696     }
2697     }
2698     }
2699     #endif
2700    
2701     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2702     is a feature that makes it possible to save compiled regexes and re-use them
2703     in other programs later. */
2704    
2705 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2706 nigel 77
2707     /* The lower casing table and the "must be at the start of a line" flag are
2708     used in a loop when finding where to start. */
2709    
2710 nigel 91 lcc = md->tables + lcc_offset;
2711 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2712 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2713    
2714     /* Set up the first character to match, if available. The first_byte value is
2715     never set for an anchored regular expression, but the anchoring may be forced
2716     at run time, so we have to test for anchoring. The first char may be unset for
2717     an unanchored pattern, of course. If there's no first char and the pattern was
2718     studied, there may be a bitmap of possible first characters. */
2719    
2720     if (!anchored)
2721     {
2722 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2723 nigel 77 {
2724     first_byte = re->first_byte & 255;
2725     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2726     first_byte = lcc[first_byte];
2727     }
2728     else
2729     {
2730     if (startline && study != NULL &&
2731     (study->options & PCRE_STUDY_MAPPED) != 0)
2732     start_bits = study->start_bits;
2733     }
2734     }
2735    
2736     /* For anchored or unanchored matches, there may be a "last known required
2737     character" set. */
2738    
2739 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2740 nigel 77 {
2741     req_byte = re->req_byte & 255;
2742     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2743 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2744 nigel 77 }
2745    
2746     /* Call the main matching function, looping for a non-anchored regex after a
2747 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2748     a match. */
2749 nigel 77
2750     for (;;)
2751     {
2752     int rc;
2753    
2754     if ((options & PCRE_DFA_RESTART) == 0)
2755     {
2756     const uschar *save_end_subject = end_subject;
2757    
2758 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2759     line of a multiline string. Implement this by temporarily adjusting
2760     end_subject so that we stop scanning at a newline. If the match fails at
2761     the newline, later code breaks this loop. */
2762 nigel 77
2763     if (firstline)
2764     {
2765 ph10 365 USPTR t = current_subject;
2766     #ifdef SUPPORT_UTF8
2767     if (utf8)
2768 ph10 371 {
2769     while (t < md->end_subject && !IS_NEWLINE(t))
2770 ph10 365 {
2771     t++;
2772     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2773 ph10 371 }
2774 ph10 365 }
2775     else
2776 ph10 371 #endif
2777 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2778 nigel 77 end_subject = t;
2779     }
2780 ph10 392
2781 ph10 389 /* There are some optimizations that avoid running the match if a known
2782     starting point is not found, or if a known later character is not present.
2783     However, there is an option that disables these, for testing and for
2784     ensuring that all callouts do actually occur. */
2785 nigel 77
2786 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2787 ph10 392 {
2788    
2789 ph10 389 /* Advance to a known first byte. */
2790 ph10 392
2791 ph10 389 if (first_byte >= 0)
2792 nigel 77 {
2793 ph10 389 if (first_byte_caseless)
2794     while (current_subject < end_subject &&
2795     lcc[*current_subject] != first_byte)
2796     current_subject++;
2797     else
2798 ph10 392 while (current_subject < end_subject &&
2799 ph10 389 *current_subject != first_byte)
2800     current_subject++;
2801     }
2802 ph10 392
2803 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2804 ph10 392
2805 ph10 389 else if (startline)
2806     {
2807     if (current_subject > md->start_subject + start_offset)
2808     {
2809 ph10 365 #ifdef SUPPORT_UTF8
2810 ph10 389 if (utf8)
2811 ph10 365 {
2812 ph10 392 while (current_subject < end_subject &&
2813 ph10 389 !WAS_NEWLINE(current_subject))
2814     {
2815 ph10 365 current_subject++;
2816 ph10 389 while(current_subject < end_subject &&
2817     (*current_subject & 0xc0) == 0x80)
2818     current_subject++;
2819     }
2820 ph10 371 }
2821 ph10 389 else
2822     #endif
2823     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2824     current_subject++;
2825 ph10 392
2826 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2827     ANYCRLF, and we are now at a LF, advance the match position by one
2828     more character. */
2829 ph10 392
2830 ph10 391 if (current_subject[-1] == CHAR_CR &&
2831 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2832     current_subject < end_subject &&
2833 ph10 391 *current_subject == CHAR_NL)
2834 ph10 389 current_subject++;
2835 ph10 365 }
2836 nigel 77 }
2837 ph10 392
2838 ph10 389 /* Or to a non-unique first char after study */
2839 ph10 392
2840 ph10 389 else if (start_bits != NULL)
2841 nigel 77 {
2842 ph10 389 while (current_subject < end_subject)
2843     {
2844     register unsigned int c = *current_subject;
2845     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2846     else break;
2847     }
2848 nigel 77 }
2849 ph10 392 }
2850 nigel 77
2851     /* Restore fudged end_subject */
2852    
2853     end_subject = save_end_subject;
2854     }
2855    
2856     /* If req_byte is set, we know that that character must appear in the subject
2857     for the match to succeed. If the first character is set, req_byte must be
2858     later in the subject; otherwise the test starts at the match point. This
2859     optimization can save a huge amount of work in patterns with nested unlimited
2860     repeats that aren't going to match. Writing separate code for cased/caseless
2861     versions makes it go faster, as does using an autoincrement and backing off
2862     on a match.
2863    
2864     HOWEVER: when the subject string is very, very long, searching to its end can
2865     take a long time, and give bad performance on quite ordinary patterns. This
2866     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2867     don't do this when the string is sufficiently long.
2868    
2869 ph10 392 ALSO: this processing is disabled when partial matching is requested, and can
2870 ph10 389 also be explicitly deactivated. */
2871 nigel 77
2872 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2873     req_byte >= 0 &&
2874 nigel 77 end_subject - current_subject < REQ_BYTE_MAX &&
2875     (options & PCRE_PARTIAL) == 0)
2876     {
2877     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2878    
2879     /* We don't need to repeat the search if we haven't yet reached the
2880     place we found it at last time. */
2881    
2882     if (p > req_byte_ptr)
2883     {
2884     if (req_byte_caseless)
2885     {
2886     while (p < end_subject)
2887     {
2888     register int pp = *p++;
2889     if (pp == req_byte || pp == req_byte2) { p--; break; }
2890     }
2891     }
2892     else
2893     {
2894     while (p < end_subject)
2895     {
2896     if (*p++ == req_byte) { p--; break; }
2897     }
2898     }
2899    
2900     /* If we can't find the required character, break the matching loop,
2901     which will cause a return of PCRE_ERROR_NOMATCH. */
2902    
2903     if (p >= end_subject) break;
2904    
2905     /* If we have found the required character, save the point where we
2906     found it, so that we don't search again next time round the loop if
2907     the start hasn't passed this character yet. */
2908    
2909     req_byte_ptr = p;
2910     }
2911     }
2912    
2913     /* OK, now we can do the business */
2914    
2915     rc = internal_dfa_exec(
2916 nigel 91 md, /* fixed match data */
2917     md->start_code, /* this subexpression's code */
2918     current_subject, /* where we currently are */
2919     start_offset, /* start offset in subject */
2920     offsets, /* offset vector */
2921     offsetcount, /* size of same */
2922     workspace, /* workspace vector */
2923     wscount, /* size of same */
2924 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2925 nigel 91 0, /* function recurse level */
2926     0); /* regex recurse level */
2927 nigel 77
2928     /* Anything other than "no match" means we are done, always; otherwise, carry
2929     on only if not anchored. */
2930    
2931     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2932    
2933     /* Advance to the next subject character unless we are at the end of a line
2934     and firstline is set. */
2935    
2936 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2937 nigel 77 current_subject++;
2938     if (utf8)
2939     {
2940     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2941     current_subject++;
2942     }
2943     if (current_subject > end_subject) break;
2944    
2945 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2946 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2947     or ANY or ANYCRLF, advance the match position by one more character. */
2948 nigel 93
2949 ph10 391 if (current_subject[-1] == CHAR_CR &&
2950 ph10 226 current_subject < end_subject &&
2951 ph10 391 *current_subject == CHAR_NL &&
2952 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2953 ph10 226 (md->nltype == NLTYPE_ANY ||
2954     md->nltype == NLTYPE_ANYCRLF ||
2955     md->nllen == 2))
2956 nigel 93 current_subject++;
2957    
2958     } /* "Bumpalong" loop */
2959    
2960 nigel 77 return PCRE_ERROR_NOMATCH;
2961     }
2962    
2963     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12