/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 397 - (hide annotations) (download)
Fri Mar 20 19:40:08 2009 UTC (4 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 97906 byte(s)
Fix non-callout function supplied for pcre_dfa_exec() bug.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 392 and semantics are as close as possible to those of the Perl 5 language (but see
7 ph10 383 below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 383 Copyright (c) 1997-2009 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 200 #ifdef HAVE_CONFIG_H
49 ph10 236 #include "config.h"
50 ph10 200 #endif
51 ph10 199
52 nigel 93 #define NLBLOCK md /* Block containing newline information */
53     #define PSSTART start_subject /* Field containing processed string start */
54     #define PSEND end_subject /* Field containing processed string end */
55    
56 nigel 77 #include "pcre_internal.h"
57    
58    
59     /* For use to indent debugging output */
60    
61     #define SP " "
62    
63    
64    
65     /*************************************************
66     * Code parameters and static tables *
67     *************************************************/
68    
69     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
70 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
71 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
72 ph10 178 never stored, so we push them well clear of the normal opcodes. */
73 nigel 77
/* Each offset is added to the codevalue of a type-repeat opcode (those at or
above OP_TYPESTAR) when its one-byte argument is one of the item types noted
beside it, so that the main opcode switch can dispatch those cases separately. */
74 ph10 178 #define OP_PROP_EXTRA 300 /* argument is OP_PROP or OP_NOTPROP */
75     #define OP_EXTUNI_EXTRA 320 /* argument is OP_EXTUNI */
76     #define OP_ANYNL_EXTRA 340 /* argument is OP_ANYNL */
77     #define OP_HSPACE_EXTRA 360 /* argument is OP_HSPACE or OP_NOT_HSPACE */
78     #define OP_VSPACE_EXTRA 380 /* argument is OP_VSPACE or OP_NOT_VSPACE */
79 nigel 77
80    
81     /* This table identifies those opcodes that are followed immediately by a
82     character that is to be tested in some way. This makes it possible to
83     centralize the loading of these characters. In the case of Type * etc, the
84     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
85 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
86 ph10 168 that follow must also be modified. */
87 nigel 77
/* Indexed by opcode value. A non-zero entry is the byte offset from the opcode
to the inline character (or one-byte type argument) that the matcher loads into
"d" before dispatching on the opcode; zero means the opcode has no such
argument. The entries of 3 belong to the counted repeats (upto, minupto, exact
and their possessive forms) - presumably the argument follows a two-byte repeat
count; confirm against the opcode layout in pcre_internal.h. */
88 ph10 327 static const uschar coptable[] = {
89 nigel 77 0, /* End */
90 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
91     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
92 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
93 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
94     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
95 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
96     1, /* Char */
97     1, /* Charnc */
98     1, /* not */
99     /* Positive single-char repeats */
100     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
101     3, 3, 3, /* upto, minupto, exact */
102 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
103 nigel 77 /* Negative single-char repeats - only for chars < 256 */
104     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
105     3, 3, 3, /* NOT upto, minupto, exact */
106 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
107 nigel 77 /* Positive type repeats */
108     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
109     3, 3, 3, /* Type upto, minupto, exact */
110 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
111 nigel 77 /* Character class & ref repeats */
112     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
113     0, 0, /* CRRANGE, CRMINRANGE */
114     0, /* CLASS */
115     0, /* NCLASS */
116     0, /* XCLASS - variable length */
117     0, /* REF */
118     0, /* RECURSE */
119     0, /* CALLOUT */
120     0, /* Alt */
121     0, /* Ket */
122     0, /* KetRmax */
123     0, /* KetRmin */
124     0, /* Assert */
125     0, /* Assert not */
126     0, /* Assert behind */
127     0, /* Assert behind not */
128     0, /* Reverse */
129 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
130     0, 0, 0, /* SBRA, SCBRA, SCOND */
131 nigel 77 0, /* CREF */
132 nigel 93 0, /* RREF */
133     0, /* DEF */
134 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
135     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
136 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
137 nigel 77 };
138    
139     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
140     and \w */
141    
/* Mask table, indexed by opcode value. The matcher evaluates
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0, so this entry selects which
ctype bit of the subject character is examined. A zero mask means the character
type is ignored and the outcome is decided by toptable2 alone. */
142 ph10 327 static const uschar toptable1[] = {
143 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b - not character types */
144 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
145     ctype_space, ctype_space, /* \S, \s */
146     ctype_word, ctype_word, /* \W, \w */
147 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
148 nigel 77 };
149    
/* XOR table, indexed by opcode value, used together with toptable1 in
((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0. For a negated type the entry
repeats the ctype bit, so the result is non-zero only when the bit is absent
from the character; for a positive type it is zero, so the result is non-zero
only when the bit is present. The final two entries are 1 against a zero mask,
which makes the test succeed for every character. */
150 ph10 327 static const uschar toptable2[] = {
151 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b - not character types */
152 nigel 77 ctype_digit, 0, /* \D, \d */
153     ctype_space, 0, /* \S, \s */
154     ctype_word, 0, /* \W, \w */
155 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
156 nigel 77 };
157    
158    
159     /* Structure for holding data about a particular state, which is in effect the
160     current data for an active path through the match tree. It must consist
161     entirely of ints because the working vector we are passed, and which we put
162     these structures in, is a vector of ints. */
163    
/* One entry in the active or new state list. All members are ints so that an
array of these can be overlaid on the caller's int workspace vector. */
164     typedef struct stateblock {
165     int offset; /* Offset of the opcode from start_code; a negative value marks a state that is held back (see data) */
166     int count; /* Count for repeats */
167     int ims; /* ims flag bits in force for this state */
168     int data; /* Extra data for some states; for a negative offset, the number of characters still to be skipped */
169     } stateblock;
170    
/* Number of ints occupied by one stateblock; used to convert the workspace
size, which is given in ints, into a capacity in states for each of the two
state lists. */
171     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
172    
173    
174     #ifdef DEBUG
175     /*************************************************
176     * Print character string *
177     *************************************************/
178    
179     /* Character string printing function for debugging.
180    
181     Arguments:
182     p points to string
183     length number of bytes
184     f where to print
185    
186     Returns: nothing
187     */
188    
/* Debugging aid: write a byte string to a stream in a readable form. Bytes for
which isprint() is true are written unchanged; every other byte is written as a
backslash, 'x' and two lower-case hex digits.

Arguments:
  p           points to the first byte of the string
  length      number of bytes to write (nothing is written if <= 0)
  f           stream to write to

Returns:      nothing
*/

static void
pchars(unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--)
  {
  int c = *p++;   /* unsigned char value, so always valid for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
201     #endif
202    
203    
204    
205     /*************************************************
206     * Execute a Regular Expression - DFA engine *
207     *************************************************/
208    
209     /* This internal function applies a compiled pattern to a subject string,
210     starting at a given point, using a DFA engine. This function is called from the
211     external one, possibly multiple times if the pattern is not anchored. The
212     function calls itself recursively for some kinds of subpattern.
213    
214     Arguments:
215     md the match_data block with fixed information
216     this_start_code the opening bracket of this subexpression's code
217     current_subject where we currently are in the subject string
218     start_offset start offset in the subject string
219     offsets vector to contain the matching string offsets
220     offsetcount size of same
221     workspace vector of workspace
222     wscount size of same
223     ims the current ims flags
224     rlevel function call recursion level
225     recursing regex recursive call level
226    
227 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
228 ph10 341 = 0 => offsets overflowed; longest matches are present
229 nigel 77 -1 => failed to match
230     < -1 => some kind of unexpected problem
231    
232     The following macros are used for adding states to the two state vectors (one
233     for the current character, one for the following character). */
234    
/* Macros for appending a state to the active list (states still to be examined
for the current character) or to the new list (states for the next character).

They are not self-contained: each expands to code that uses variables from the
expansion site - active_count or new_count, next_active_state or
next_new_state, wscount (the capacity of each list, in states), ims (copied
into the stored state), and rlevel (used only in the debugging output).

If the list is already full, the expansion executes "return
PCRE_ERROR_DFA_WSSIZE" in the enclosing function. Note that the count is
incremented even in that case.

ADD_ACTIVE and ADD_NEW leave the data field of the new state unset; the _DATA
variants store (z) in it.

Each macro is wrapped in do { } while (0) so that a use followed by a semicolon
is exactly one statement. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count = (y); \
      next_active_state->ims = ims; \
      next_active_state->data = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count = (y); \
      next_new_state->ims = ims; \
      next_new_state->data = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
280    
281     /* And now, here is the code */
282    
283     static int
284     internal_dfa_exec(
285     dfa_match_data *md,
286     const uschar *this_start_code,
287     const uschar *current_subject,
288     int start_offset,
289     int *offsets,
290     int offsetcount,
291     int *workspace,
292     int wscount,
293     int ims,
294     int rlevel,
295     int recursing)
296     {
297     stateblock *active_states, *new_states, *temp_states;
298     stateblock *next_active_state, *next_new_state;
299    
300     const uschar *ctypes, *lcc, *fcc;
301     const uschar *ptr;
302 nigel 93 const uschar *end_code, *first_op;
303 nigel 77
304     int active_count, new_count, match_count;
305    
306     /* Some fields in the md block are frequently referenced, so we load them into
307     independent variables in the hope that this will perform better. */
308    
309     const uschar *start_subject = md->start_subject;
310     const uschar *end_subject = md->end_subject;
311     const uschar *start_code = md->start_code;
312    
313 nigel 87 #ifdef SUPPORT_UTF8
314 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
315 nigel 93 #else
316     BOOL utf8 = FALSE;
317 nigel 87 #endif
318 nigel 77
319     rlevel++;
320     offsetcount &= (-2);
321    
322     wscount -= 2;
323     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
324     (2 * INTS_PER_STATEBLOCK);
325    
326     DPRINTF(("\n%.*s---------------------\n"
327     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
328     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
329    
330     ctypes = md->tables + ctypes_offset;
331     lcc = md->tables + lcc_offset;
332     fcc = md->tables + fcc_offset;
333    
334     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
335    
336     active_states = (stateblock *)(workspace + 2);
337     next_new_state = new_states = active_states + wscount;
338     new_count = 0;
339    
340 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
341     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
342    
343 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
344     the alternative states onto the list, and find out where the end is. This
345     makes it possible to use this function recursively, when we want to stop at a
346     matching internal ket rather than at the end.
347    
348     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
349     a backward assertion. In that case, we have to find out the maximum amount to
350     move back, and set up each alternative appropriately. */
351    
352 nigel 93 if (*first_op == OP_REVERSE)
353 nigel 77 {
354     int max_back = 0;
355     int gone_back;
356    
357     end_code = this_start_code;
358     do
359     {
360     int back = GET(end_code, 2+LINK_SIZE);
361     if (back > max_back) max_back = back;
362     end_code += GET(end_code, 1);
363     }
364     while (*end_code == OP_ALT);
365    
366     /* If we can't go back the amount required for the longest lookbehind
367     pattern, go back as far as we can; some alternatives may still be viable. */
368    
369     #ifdef SUPPORT_UTF8
370     /* In character mode we have to step back character by character */
371    
372     if (utf8)
373     {
374     for (gone_back = 0; gone_back < max_back; gone_back++)
375     {
376     if (current_subject <= start_subject) break;
377     current_subject--;
378     while (current_subject > start_subject &&
379     (*current_subject & 0xc0) == 0x80)
380     current_subject--;
381     }
382     }
383     else
384     #endif
385    
386     /* In byte-mode we can do this quickly. */
387    
388     {
389     gone_back = (current_subject - max_back < start_subject)?
390     current_subject - start_subject : max_back;
391     current_subject -= gone_back;
392     }
393    
394     /* Now we can process the individual branches. */
395    
396     end_code = this_start_code;
397     do
398     {
399     int back = GET(end_code, 2+LINK_SIZE);
400     if (back <= gone_back)
401     {
402     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
403     ADD_NEW_DATA(-bstate, 0, gone_back - back);
404     }
405     end_code += GET(end_code, 1);
406     }
407     while (*end_code == OP_ALT);
408     }
409    
410     /* This is the code for a "normal" subpattern (not a backward assertion). The
411     start of a whole pattern is always one of these. If we are at the top level,
412     we may be asked to restart matching from the same point that we reached for a
413     previous partial match. We still have to scan through the top-level branches to
414     find the end state. */
415    
416     else
417     {
418     end_code = this_start_code;
419    
420     /* Restarting */
421    
422     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
423     {
424     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
425     new_count = workspace[1];
426     if (!workspace[0])
427     memcpy(new_states, active_states, new_count * sizeof(stateblock));
428     }
429    
430     /* Not restarting */
431    
432     else
433     {
434 nigel 93 int length = 1 + LINK_SIZE +
435     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
436 nigel 77 do
437     {
438 nigel 93 ADD_NEW(end_code - start_code + length, 0);
439 nigel 77 end_code += GET(end_code, 1);
440 nigel 93 length = 1 + LINK_SIZE;
441 nigel 77 }
442     while (*end_code == OP_ALT);
443     }
444     }
445    
446     workspace[0] = 0; /* Bit indicating which vector is current */
447    
448     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
449    
450     /* Loop for scanning the subject */
451    
452     ptr = current_subject;
453     for (;;)
454     {
455     int i, j;
456 nigel 91 int clen, dlen;
457     unsigned int c, d;
458 nigel 77
459     /* Make the new state list into the active state list and empty the
460     new state list. */
461    
462     temp_states = active_states;
463     active_states = new_states;
464     new_states = temp_states;
465     active_count = new_count;
466     new_count = 0;
467    
468     workspace[0] ^= 1; /* Remember for the restarting feature */
469     workspace[1] = active_count;
470    
471     #ifdef DEBUG
472     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
473     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
474     printf("\"\n");
475    
476     printf("%.*sActive states: ", rlevel*2-2, SP);
477     for (i = 0; i < active_count; i++)
478     printf("%d/%d ", active_states[i].offset, active_states[i].count);
479     printf("\n");
480     #endif
481    
482     /* Set the pointers for adding new states */
483    
484     next_active_state = active_states + active_count;
485     next_new_state = new_states;
486    
487     /* Load the current character from the subject outside the loop, as many
488     different states may want to look at it, and we assume that at least one
489     will. */
490    
491     if (ptr < end_subject)
492     {
493 nigel 93 clen = 1; /* Number of bytes in the character */
494 nigel 77 #ifdef SUPPORT_UTF8
495     if (utf8) { GETCHARLEN(c, ptr, clen); } else
496     #endif /* SUPPORT_UTF8 */
497     c = *ptr;
498     }
499     else
500     {
501 nigel 93 clen = 0; /* This indicates the end of the subject */
502     c = NOTACHAR; /* This value should never actually be used */
503 nigel 77 }
504    
505     /* Scan up the active states and act on each one. The result of an action
506     may be to add more states to the currently active list (e.g. on hitting a
507     parenthesis) or it may be to put states on the new list, for considering
508     when we move the character pointer on. */
509    
510     for (i = 0; i < active_count; i++)
511     {
512     stateblock *current_state = active_states + i;
513     const uschar *code;
514     int state_offset = current_state->offset;
515 ph10 397 int count, codevalue, rrc;
516 nigel 77
517     #ifdef DEBUG
518     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
519 nigel 93 if (clen == 0) printf("EOL\n");
520 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
521     else printf("0x%02x\n", c);
522     #endif
523    
524     /* This variable is referred to implicitly in the ADD_xxx macros. */
525    
526     ims = current_state->ims;
527    
528     /* A negative offset is a special case meaning "hold off going to this
529     (negated) state until the number of characters in the data field have
530     been skipped". */
531    
532     if (state_offset < 0)
533     {
534     if (current_state->data > 0)
535     {
536     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
537     ADD_NEW_DATA(state_offset, current_state->count,
538     current_state->data - 1);
539     continue;
540     }
541     else
542     {
543     current_state->offset = state_offset = -state_offset;
544     }
545     }
546    
547     /* Check for a duplicate state with the same count, and skip if found. */
548    
549     for (j = 0; j < i; j++)
550     {
551     if (active_states[j].offset == state_offset &&
552     active_states[j].count == current_state->count)
553     {
554     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
555     goto NEXT_ACTIVE_STATE;
556     }
557     }
558    
559     /* The state offset is the offset to the opcode */
560    
561     code = start_code + state_offset;
562     codevalue = *code;
563    
564     /* If this opcode is followed by an inline character, load it. It is
565     tempting to test for the presence of a subject character here, but that
566     is wrong, because sometimes zero repetitions of the subject are
567     permitted.
568    
569     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570 ph10 178 argument that is not a data character - but is always one byte long. We
571     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
572     this case. To keep the other cases fast, convert these ones to new opcodes.
573     */
574 nigel 77
575     if (coptable[codevalue] > 0)
576     {
577     dlen = 1;
578     #ifdef SUPPORT_UTF8
579     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
580     #endif /* SUPPORT_UTF8 */
581     d = code[coptable[codevalue]];
582     if (codevalue >= OP_TYPESTAR)
583     {
584 nigel 93 switch(d)
585     {
586     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
587     case OP_NOTPROP:
588     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591 ph10 178 case OP_NOT_HSPACE:
592 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593 ph10 178 case OP_NOT_VSPACE:
594 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595 nigel 93 default: break;
596     }
597 nigel 77 }
598     }
599     else
600     {
601     dlen = 0; /* Not strictly necessary, but compilers moan */
602 nigel 93 d = NOTACHAR; /* if these variables are not set. */
603 nigel 77 }
604    
605    
606     /* Now process the individual opcodes */
607    
608     switch (codevalue)
609     {
610    
611     /* ========================================================================== */
612     /* Reached a closing bracket. If not at the end of the pattern, carry
613     on with the next opcode. Otherwise, unless we have an empty string and
614     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
615     matches so we always have the longest first. */
616    
617     case OP_KET:
618     case OP_KETRMIN:
619     case OP_KETRMAX:
620     if (code != end_code)
621     {
622     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
623     if (codevalue != OP_KET)
624     {
625     ADD_ACTIVE(state_offset - GET(code, 1), 0);
626     }
627     }
628     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
629     {
630     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
631     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
632     match_count = 0;
633     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
634     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
635     if (offsetcount >= 2)
636     {
637     offsets[0] = current_subject - start_subject;
638     offsets[1] = ptr - start_subject;
639     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
640     offsets[1] - offsets[0], current_subject));
641     }
642     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
643     {
644     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
645     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
646     match_count, rlevel*2-2, SP));
647     return match_count;
648     }
649     }
650     break;
651    
652     /* ========================================================================== */
653     /* These opcodes add to the current list of states without looking
654     at the current character. */
655    
656     /*-----------------------------------------------------------------*/
657     case OP_ALT:
658     do { code += GET(code, 1); } while (*code == OP_ALT);
659     ADD_ACTIVE(code - start_code, 0);
660     break;
661    
662     /*-----------------------------------------------------------------*/
663     case OP_BRA:
664 nigel 93 case OP_SBRA:
665 nigel 77 do
666     {
667     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
668     code += GET(code, 1);
669     }
670     while (*code == OP_ALT);
671     break;
672    
673     /*-----------------------------------------------------------------*/
674 nigel 93 case OP_CBRA:
675     case OP_SCBRA:
676     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
677     code += GET(code, 1);
678     while (*code == OP_ALT)
679     {
680     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
681     code += GET(code, 1);
682     }
683     break;
684    
685     /*-----------------------------------------------------------------*/
686 nigel 77 case OP_BRAZERO:
687     case OP_BRAMINZERO:
688     ADD_ACTIVE(state_offset + 1, 0);
689     code += 1 + GET(code, 2);
690     while (*code == OP_ALT) code += GET(code, 1);
691     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
692     break;
693    
694     /*-----------------------------------------------------------------*/
695 ph10 335 case OP_SKIPZERO:
696     code += 1 + GET(code, 2);
697     while (*code == OP_ALT) code += GET(code, 1);
698     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
699     break;
700    
701     /*-----------------------------------------------------------------*/
702 nigel 77 case OP_CIRC:
703     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
704 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
705     ptr != end_subject &&
706 nigel 93 WAS_NEWLINE(ptr)))
707 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
708     break;
709    
710     /*-----------------------------------------------------------------*/
711     case OP_EOD:
712     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
713     break;
714    
715     /*-----------------------------------------------------------------*/
716     case OP_OPT:
717     ims = code[1];
718     ADD_ACTIVE(state_offset + 2, 0);
719     break;
720    
721     /*-----------------------------------------------------------------*/
722     case OP_SOD:
723     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
724     break;
725    
726     /*-----------------------------------------------------------------*/
727     case OP_SOM:
728     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
729     break;
730    
731    
732     /* ========================================================================== */
733     /* These opcodes inspect the next subject character, and sometimes
734     the previous one as well, but do not have an argument. The variable
735     clen contains the length of the current character and is zero if we are
736     at the end of the subject. */
737    
738     /*-----------------------------------------------------------------*/
739     case OP_ANY:
740 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
741 nigel 77 { ADD_NEW(state_offset + 1, 0); }
742     break;
743    
744     /*-----------------------------------------------------------------*/
745 ph10 341 case OP_ALLANY:
746     if (clen > 0)
747     { ADD_NEW(state_offset + 1, 0); }
748     break;
749    
750     /*-----------------------------------------------------------------*/
751 nigel 77 case OP_EODN:
752 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
753 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
754     break;
755    
756     /*-----------------------------------------------------------------*/
757     case OP_DOLL:
758     if ((md->moptions & PCRE_NOTEOL) == 0)
759     {
760 nigel 91 if (clen == 0 ||
761 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
762 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
763     ))
764 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
765     }
766 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
767 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
768     break;
769    
770     /*-----------------------------------------------------------------*/
771    
772     case OP_DIGIT:
773     case OP_WHITESPACE:
774     case OP_WORDCHAR:
775     if (clen > 0 && c < 256 &&
776     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
777     { ADD_NEW(state_offset + 1, 0); }
778     break;
779    
780     /*-----------------------------------------------------------------*/
781     case OP_NOT_DIGIT:
782     case OP_NOT_WHITESPACE:
783     case OP_NOT_WORDCHAR:
784     if (clen > 0 && (c >= 256 ||
785     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
786     { ADD_NEW(state_offset + 1, 0); }
787     break;
788    
789     /*-----------------------------------------------------------------*/
790     case OP_WORD_BOUNDARY:
791     case OP_NOT_WORD_BOUNDARY:
792     {
793     int left_word, right_word;
794    
795     if (ptr > start_subject)
796     {
797     const uschar *temp = ptr - 1;
798     #ifdef SUPPORT_UTF8
799     if (utf8) BACKCHAR(temp);
800     #endif
801     GETCHARTEST(d, temp);
802     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
803     }
804     else left_word = 0;
805    
806     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
807     else right_word = 0;
808    
809     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
810     { ADD_ACTIVE(state_offset + 1, 0); }
811     }
812     break;
813    
814    
815     /*-----------------------------------------------------------------*/
816     /* Check the next character by Unicode property. We will get here only
817     if the support is in the binary; otherwise a compile-time error occurs.
818     */
819    
820 ph10 151 #ifdef SUPPORT_UCP
821 nigel 77 case OP_PROP:
822     case OP_NOTPROP:
823     if (clen > 0)
824     {
825 nigel 87 BOOL OK;
826 ph10 349 const ucd_record * prop = GET_UCD(c);
827 nigel 87 switch(code[1])
828 nigel 77 {
829 nigel 87 case PT_ANY:
830     OK = TRUE;
831     break;
832    
833     case PT_LAMP:
834 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
835 nigel 87 break;
836    
837     case PT_GC:
838 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
839 nigel 87 break;
840    
841     case PT_PC:
842 ph10 349 OK = prop->chartype == code[2];
843 nigel 87 break;
844    
845     case PT_SC:
846 ph10 349 OK = prop->script == code[2];
847 nigel 87 break;
848    
849     /* Should never occur, but keep compilers from grumbling. */
850    
851     default:
852     OK = codevalue != OP_PROP;
853     break;
854 nigel 77 }
855 nigel 87
856     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
857 nigel 77 }
858     break;
859     #endif
860    
861    
862    
863     /* ========================================================================== */
864     /* These opcodes likewise inspect the subject character, but have an
865     argument that is not a data character. It is one of these opcodes:
866 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
867     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
868 nigel 77
869     case OP_TYPEPLUS:
870     case OP_TYPEMINPLUS:
871 nigel 93 case OP_TYPEPOSPLUS:
872 nigel 77 count = current_state->count; /* Already matched */
873     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
874     if (clen > 0)
875     {
876     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
877     (c < 256 &&
878 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
879 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
880     {
881 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
882     {
883     active_count--; /* Remove non-match possibility */
884     next_active_state--;
885     }
886 nigel 77 count++;
887     ADD_NEW(state_offset, count);
888     }
889     }
890     break;
891    
892     /*-----------------------------------------------------------------*/
893     case OP_TYPEQUERY:
894     case OP_TYPEMINQUERY:
895 nigel 93 case OP_TYPEPOSQUERY:
896 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
897     if (clen > 0)
898     {
899     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900     (c < 256 &&
901 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
902 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
903     {
904 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
905     {
906     active_count--; /* Remove non-match possibility */
907     next_active_state--;
908     }
909 nigel 77 ADD_NEW(state_offset + 2, 0);
910     }
911     }
912     break;
913    
914     /*-----------------------------------------------------------------*/
915     case OP_TYPESTAR:
916     case OP_TYPEMINSTAR:
917 nigel 93 case OP_TYPEPOSSTAR:
918 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
919     if (clen > 0)
920     {
921     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
922     (c < 256 &&
923 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
924 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
925     {
926 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
927     {
928     active_count--; /* Remove non-match possibility */
929     next_active_state--;
930     }
931 nigel 77 ADD_NEW(state_offset, 0);
932     }
933     }
934     break;
935    
936     /*-----------------------------------------------------------------*/
937     case OP_TYPEEXACT:
938 nigel 93 count = current_state->count; /* Number already matched */
939     if (clen > 0)
940     {
941     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
942     (c < 256 &&
943 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
944 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
945     {
946     if (++count >= GET2(code, 1))
947     { ADD_NEW(state_offset + 4, 0); }
948     else
949     { ADD_NEW(state_offset, count); }
950     }
951     }
952     break;
953    
954     /*-----------------------------------------------------------------*/
955 nigel 77 case OP_TYPEUPTO:
956     case OP_TYPEMINUPTO:
957 nigel 93 case OP_TYPEPOSUPTO:
958     ADD_ACTIVE(state_offset + 4, 0);
959 nigel 77 count = current_state->count; /* Number already matched */
960     if (clen > 0)
961     {
962     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
963     (c < 256 &&
964 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
965 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
966     {
967 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
968     {
969     active_count--; /* Remove non-match possibility */
970     next_active_state--;
971     }
972 nigel 77 if (++count >= GET2(code, 1))
973     { ADD_NEW(state_offset + 4, 0); }
974     else
975     { ADD_NEW(state_offset, count); }
976     }
977     }
978     break;
979    
980     /* ========================================================================== */
981     /* These are virtual opcodes that are used when something like
982 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
983     argument. It keeps the code above fast for the other cases. The argument
984     is in the d variable. */
985 nigel 77
986 ph10 151 #ifdef SUPPORT_UCP
987 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
988     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
989 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
990 nigel 77 count = current_state->count; /* Already matched */
991 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
992 nigel 77 if (clen > 0)
993     {
994 nigel 87 BOOL OK;
995 ph10 349 const ucd_record * prop = GET_UCD(c);
996 nigel 87 switch(code[2])
997     {
998     case PT_ANY:
999     OK = TRUE;
1000     break;
1001    
1002     case PT_LAMP:
1003 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1004 nigel 87 break;
1005    
1006     case PT_GC:
1007 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1008 nigel 87 break;
1009    
1010     case PT_PC:
1011 ph10 349 OK = prop->chartype == code[3];
1012 nigel 87 break;
1013    
1014     case PT_SC:
1015 ph10 349 OK = prop->script == code[3];
1016 nigel 87 break;
1017    
1018     /* Should never occur, but keep compilers from grumbling. */
1019    
1020     default:
1021     OK = codevalue != OP_PROP;
1022     break;
1023     }
1024    
1025 nigel 93 if (OK == (d == OP_PROP))
1026     {
1027     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1028     {
1029     active_count--; /* Remove non-match possibility */
1030     next_active_state--;
1031     }
1032     count++;
1033     ADD_NEW(state_offset, count);
1034     }
1035 nigel 77 }
1036     break;
1037    
1038     /*-----------------------------------------------------------------*/
1039     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1040     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1041 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1042 nigel 77 count = current_state->count; /* Already matched */
1043     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1044 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1045 nigel 77 {
1046     const uschar *nptr = ptr + clen;
1047     int ncount = 0;
1048 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1049     {
1050     active_count--; /* Remove non-match possibility */
1051     next_active_state--;
1052     }
1053 nigel 77 while (nptr < end_subject)
1054     {
1055     int nd;
1056     int ndlen = 1;
1057     GETCHARLEN(nd, nptr, ndlen);
1058 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1059 nigel 77 ncount++;
1060     nptr += ndlen;
1061     }
1062     count++;
1063     ADD_NEW_DATA(-state_offset, count, ncount);
1064     }
1065     break;
1066 ph10 151 #endif
1067 nigel 77
1068     /*-----------------------------------------------------------------*/
1069 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1070     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1071     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1072     count = current_state->count; /* Already matched */
1073     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1074     if (clen > 0)
1075     {
1076     int ncount = 0;
1077     switch (c)
1078     {
1079     case 0x000b:
1080     case 0x000c:
1081     case 0x0085:
1082     case 0x2028:
1083     case 0x2029:
1084 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1085     goto ANYNL01;
1086    
1087     case 0x000d:
1088     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1089     /* Fall through */
1090    
1091     ANYNL01:
1092     case 0x000a:
1093 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1094     {
1095     active_count--; /* Remove non-match possibility */
1096     next_active_state--;
1097     }
1098     count++;
1099     ADD_NEW_DATA(-state_offset, count, ncount);
1100     break;
1101 ph10 231
1102 nigel 93 default:
1103     break;
1104     }
1105     }
1106     break;
1107    
1108     /*-----------------------------------------------------------------*/
1109 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1110     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1111     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1112     count = current_state->count; /* Already matched */
1113     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1114     if (clen > 0)
1115     {
1116 ph10 182 BOOL OK;
1117 ph10 178 switch (c)
1118     {
1119     case 0x000a:
1120     case 0x000b:
1121     case 0x000c:
1122     case 0x000d:
1123     case 0x0085:
1124     case 0x2028:
1125     case 0x2029:
1126     OK = TRUE;
1127 ph10 182 break;
1128 ph10 178
1129     default:
1130     OK = FALSE;
1131 ph10 182 break;
1132 ph10 178 }
1133    
1134     if (OK == (d == OP_VSPACE))
1135 ph10 182 {
1136 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1137     {
1138     active_count--; /* Remove non-match possibility */
1139     next_active_state--;
1140     }
1141     count++;
1142     ADD_NEW_DATA(-state_offset, count, 0);
1143     }
1144     }
1145     break;
1146    
1147     /*-----------------------------------------------------------------*/
1148     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1149     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1150     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1151     count = current_state->count; /* Already matched */
1152     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1153     if (clen > 0)
1154     {
1155 ph10 182 BOOL OK;
1156 ph10 178 switch (c)
1157     {
1158     case 0x09: /* HT */
1159     case 0x20: /* SPACE */
1160     case 0xa0: /* NBSP */
1161     case 0x1680: /* OGHAM SPACE MARK */
1162     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1163     case 0x2000: /* EN QUAD */
1164     case 0x2001: /* EM QUAD */
1165     case 0x2002: /* EN SPACE */
1166     case 0x2003: /* EM SPACE */
1167     case 0x2004: /* THREE-PER-EM SPACE */
1168     case 0x2005: /* FOUR-PER-EM SPACE */
1169     case 0x2006: /* SIX-PER-EM SPACE */
1170     case 0x2007: /* FIGURE SPACE */
1171     case 0x2008: /* PUNCTUATION SPACE */
1172     case 0x2009: /* THIN SPACE */
1173     case 0x200A: /* HAIR SPACE */
1174     case 0x202f: /* NARROW NO-BREAK SPACE */
1175     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1176     case 0x3000: /* IDEOGRAPHIC SPACE */
1177     OK = TRUE;
1178     break;
1179 ph10 182
1180 ph10 178 default:
1181     OK = FALSE;
1182     break;
1183     }
1184 ph10 182
1185 ph10 178 if (OK == (d == OP_HSPACE))
1186 ph10 182 {
1187 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1188     {
1189     active_count--; /* Remove non-match possibility */
1190     next_active_state--;
1191     }
1192     count++;
1193     ADD_NEW_DATA(-state_offset, count, 0);
1194     }
1195     }
1196     break;
1197    
1198     /*-----------------------------------------------------------------*/
1199 ph10 151 #ifdef SUPPORT_UCP
1200 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1201     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1202 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1203 nigel 87 count = 4;
1204 nigel 77 goto QS1;
1205    
1206     case OP_PROP_EXTRA + OP_TYPESTAR:
1207     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1208 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1209 nigel 77 count = 0;
1210    
1211     QS1:
1212    
1213 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1214 nigel 77 if (clen > 0)
1215     {
1216 nigel 87 BOOL OK;
1217 ph10 349 const ucd_record * prop = GET_UCD(c);
1218 nigel 87 switch(code[2])
1219     {
1220     case PT_ANY:
1221     OK = TRUE;
1222     break;
1223    
1224     case PT_LAMP:
1225 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1226 nigel 87 break;
1227    
1228     case PT_GC:
1229 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1230 nigel 87 break;
1231    
1232     case PT_PC:
1233 ph10 349 OK = prop->chartype == code[3];
1234 nigel 87 break;
1235    
1236     case PT_SC:
1237 ph10 349 OK = prop->script == code[3];
1238 nigel 87 break;
1239    
1240     /* Should never occur, but keep compilers from grumbling. */
1241    
1242     default:
1243     OK = codevalue != OP_PROP;
1244     break;
1245     }
1246    
1247 nigel 93 if (OK == (d == OP_PROP))
1248     {
1249     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1250     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1251     {
1252     active_count--; /* Remove non-match possibility */
1253     next_active_state--;
1254     }
1255     ADD_NEW(state_offset + count, 0);
1256     }
1257 nigel 77 }
1258     break;
1259    
1260     /*-----------------------------------------------------------------*/
1261     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1262     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1263 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1264 nigel 77 count = 2;
1265     goto QS2;
1266    
1267     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1268     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1269 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1270 nigel 77 count = 0;
1271    
1272     QS2:
1273    
1274     ADD_ACTIVE(state_offset + 2, 0);
1275 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1276 nigel 77 {
1277     const uschar *nptr = ptr + clen;
1278     int ncount = 0;
1279 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1280     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1281     {
1282     active_count--; /* Remove non-match possibility */
1283     next_active_state--;
1284     }
1285 nigel 77 while (nptr < end_subject)
1286     {
1287     int nd;
1288     int ndlen = 1;
1289     GETCHARLEN(nd, nptr, ndlen);
1290 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1291 nigel 77 ncount++;
1292     nptr += ndlen;
1293     }
1294     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1295     }
1296     break;
1297 ph10 151 #endif
1298 nigel 77
1299     /*-----------------------------------------------------------------*/
1300 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1301     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1302     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1303     count = 2;
1304     goto QS3;
1305    
1306     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1307     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1308     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1309     count = 0;
1310    
1311     QS3:
1312     ADD_ACTIVE(state_offset + 2, 0);
1313     if (clen > 0)
1314     {
1315     int ncount = 0;
1316     switch (c)
1317     {
1318     case 0x000b:
1319     case 0x000c:
1320     case 0x0085:
1321     case 0x2028:
1322     case 0x2029:
1323 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1324     goto ANYNL02;
1325    
1326     case 0x000d:
1327     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1328     /* Fall through */
1329    
1330     ANYNL02:
1331     case 0x000a:
1332 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1333     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1334     {
1335     active_count--; /* Remove non-match possibility */
1336     next_active_state--;
1337     }
1338     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1339     break;
1340 ph10 231
1341 nigel 93 default:
1342     break;
1343     }
1344     }
1345     break;
1346    
1347     /*-----------------------------------------------------------------*/
1348 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1349     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1350     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1351     count = 2;
1352     goto QS4;
1353    
1354     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1355     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1356     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1357     count = 0;
1358    
1359     QS4:
1360     ADD_ACTIVE(state_offset + 2, 0);
1361     if (clen > 0)
1362     {
1363 ph10 182 BOOL OK;
1364 ph10 178 switch (c)
1365     {
1366     case 0x000a:
1367     case 0x000b:
1368     case 0x000c:
1369     case 0x000d:
1370     case 0x0085:
1371     case 0x2028:
1372     case 0x2029:
1373     OK = TRUE;
1374     break;
1375 ph10 182
1376 ph10 178 default:
1377     OK = FALSE;
1378     break;
1379     }
1380     if (OK == (d == OP_VSPACE))
1381 ph10 182 {
1382 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1383     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1384     {
1385     active_count--; /* Remove non-match possibility */
1386     next_active_state--;
1387     }
1388     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1389     }
1390     }
1391     break;
1392    
1393     /*-----------------------------------------------------------------*/
1394     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1395     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1396     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1397     count = 2;
1398     goto QS5;
1399    
1400     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1401     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1402     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1403     count = 0;
1404    
1405     QS5:
1406     ADD_ACTIVE(state_offset + 2, 0);
1407     if (clen > 0)
1408     {
1409 ph10 182 BOOL OK;
1410 ph10 178 switch (c)
1411     {
1412     case 0x09: /* HT */
1413     case 0x20: /* SPACE */
1414     case 0xa0: /* NBSP */
1415     case 0x1680: /* OGHAM SPACE MARK */
1416     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1417     case 0x2000: /* EN QUAD */
1418     case 0x2001: /* EM QUAD */
1419     case 0x2002: /* EN SPACE */
1420     case 0x2003: /* EM SPACE */
1421     case 0x2004: /* THREE-PER-EM SPACE */
1422     case 0x2005: /* FOUR-PER-EM SPACE */
1423     case 0x2006: /* SIX-PER-EM SPACE */
1424     case 0x2007: /* FIGURE SPACE */
1425     case 0x2008: /* PUNCTUATION SPACE */
1426     case 0x2009: /* THIN SPACE */
1427     case 0x200A: /* HAIR SPACE */
1428     case 0x202f: /* NARROW NO-BREAK SPACE */
1429     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1430     case 0x3000: /* IDEOGRAPHIC SPACE */
1431     OK = TRUE;
1432     break;
1433 ph10 182
1434 ph10 178 default:
1435     OK = FALSE;
1436     break;
1437     }
1438 ph10 182
1439 ph10 178 if (OK == (d == OP_HSPACE))
1440 ph10 182 {
1441 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1442     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1443     {
1444     active_count--; /* Remove non-match possibility */
1445     next_active_state--;
1446     }
1447     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1448     }
1449     }
1450     break;
1451    
1452     /*-----------------------------------------------------------------*/
1453 ph10 151 #ifdef SUPPORT_UCP
1454 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1455     case OP_PROP_EXTRA + OP_TYPEUPTO:
1456     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1457 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1458 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1459 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1460 nigel 77 count = current_state->count; /* Number already matched */
1461     if (clen > 0)
1462     {
1463 nigel 87 BOOL OK;
1464 ph10 349 const ucd_record * prop = GET_UCD(c);
1465 nigel 87 switch(code[4])
1466 nigel 77 {
1467 nigel 87 case PT_ANY:
1468     OK = TRUE;
1469     break;
1470    
1471     case PT_LAMP:
1472 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1473 nigel 87 break;
1474    
1475     case PT_GC:
1476 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1477 nigel 87 break;
1478    
1479     case PT_PC:
1480 ph10 349 OK = prop->chartype == code[5];
1481 nigel 87 break;
1482    
1483     case PT_SC:
1484 ph10 349 OK = prop->script == code[5];
1485 nigel 87 break;
1486    
1487     /* Should never occur, but keep compilers from grumbling. */
1488    
1489     default:
1490     OK = codevalue != OP_PROP;
1491     break;
1492     }
1493    
1494     if (OK == (d == OP_PROP))
1495     {
1496 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1497     {
1498     active_count--; /* Remove non-match possibility */
1499     next_active_state--;
1500     }
1501 nigel 77 if (++count >= GET2(code, 1))
1502 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1503 nigel 77 else
1504     { ADD_NEW(state_offset, count); }
1505     }
1506     }
1507     break;
1508    
1509     /*-----------------------------------------------------------------*/
1510     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1511     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1512     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1513 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1514 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1515     { ADD_ACTIVE(state_offset + 4, 0); }
1516     count = current_state->count; /* Number already matched */
1517 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1518 nigel 77 {
1519     const uschar *nptr = ptr + clen;
1520     int ncount = 0;
1521 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1522     {
1523     active_count--; /* Remove non-match possibility */
1524     next_active_state--;
1525     }
1526 nigel 77 while (nptr < end_subject)
1527     {
1528     int nd;
1529     int ndlen = 1;
1530     GETCHARLEN(nd, nptr, ndlen);
1531 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1532 nigel 77 ncount++;
1533     nptr += ndlen;
1534     }
1535     if (++count >= GET2(code, 1))
1536     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1537     else
1538     { ADD_NEW_DATA(-state_offset, count, ncount); }
1539     }
1540     break;
1541 ph10 151 #endif
1542 nigel 77
1543 nigel 93 /*-----------------------------------------------------------------*/
1544     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1545     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1546     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1547     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1548     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1549     { ADD_ACTIVE(state_offset + 4, 0); }
1550     count = current_state->count; /* Number already matched */
1551     if (clen > 0)
1552     {
1553     int ncount = 0;
1554     switch (c)
1555     {
1556     case 0x000b:
1557     case 0x000c:
1558     case 0x0085:
1559     case 0x2028:
1560     case 0x2029:
1561 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1562     goto ANYNL03;
1563    
1564     case 0x000d:
1565     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1566     /* Fall through */
1567    
1568     ANYNL03:
1569     case 0x000a:
1570 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1571     {
1572     active_count--; /* Remove non-match possibility */
1573     next_active_state--;
1574     }
1575     if (++count >= GET2(code, 1))
1576     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1577     else
1578     { ADD_NEW_DATA(-state_offset, count, ncount); }
1579     break;
1580 ph10 231
1581 nigel 93 default:
1582     break;
1583     }
1584     }
1585     break;
1586    
1587 ph10 178 /*-----------------------------------------------------------------*/
1588     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1589     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1590     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1591     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1592     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1593     { ADD_ACTIVE(state_offset + 4, 0); }
1594     count = current_state->count; /* Number already matched */
1595     if (clen > 0)
1596     {
1597 ph10 182 BOOL OK;
1598 ph10 178 switch (c)
1599     {
1600     case 0x000a:
1601     case 0x000b:
1602     case 0x000c:
1603     case 0x000d:
1604     case 0x0085:
1605     case 0x2028:
1606     case 0x2029:
1607     OK = TRUE;
1608     break;
1609 ph10 182
1610 ph10 178 default:
1611     OK = FALSE;
1612     }
1613 ph10 182
1614 ph10 178 if (OK == (d == OP_VSPACE))
1615 ph10 182 {
1616 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1617     {
1618     active_count--; /* Remove non-match possibility */
1619     next_active_state--;
1620     }
1621     if (++count >= GET2(code, 1))
1622     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1623     else
1624     { ADD_NEW_DATA(-state_offset, count, 0); }
1625     }
1626     }
1627     break;
1628    
1629     /*-----------------------------------------------------------------*/
1630     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1631     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1632     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1633     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1634     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1635     { ADD_ACTIVE(state_offset + 4, 0); }
1636     count = current_state->count; /* Number already matched */
1637     if (clen > 0)
1638     {
1639 ph10 182 BOOL OK;
1640 ph10 178 switch (c)
1641     {
1642     case 0x09: /* HT */
1643     case 0x20: /* SPACE */
1644     case 0xa0: /* NBSP */
1645     case 0x1680: /* OGHAM SPACE MARK */
1646     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1647     case 0x2000: /* EN QUAD */
1648     case 0x2001: /* EM QUAD */
1649     case 0x2002: /* EN SPACE */
1650     case 0x2003: /* EM SPACE */
1651     case 0x2004: /* THREE-PER-EM SPACE */
1652     case 0x2005: /* FOUR-PER-EM SPACE */
1653     case 0x2006: /* SIX-PER-EM SPACE */
1654     case 0x2007: /* FIGURE SPACE */
1655     case 0x2008: /* PUNCTUATION SPACE */
1656     case 0x2009: /* THIN SPACE */
1657     case 0x200A: /* HAIR SPACE */
1658     case 0x202f: /* NARROW NO-BREAK SPACE */
1659     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1660     case 0x3000: /* IDEOGRAPHIC SPACE */
1661     OK = TRUE;
1662     break;
1663 ph10 182
1664 ph10 178 default:
1665     OK = FALSE;
1666     break;
1667     }
1668 ph10 182
1669 ph10 178 if (OK == (d == OP_HSPACE))
1670 ph10 182 {
1671 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1672     {
1673     active_count--; /* Remove non-match possibility */
1674     next_active_state--;
1675     }
1676     if (++count >= GET2(code, 1))
1677     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1678     else
1679     { ADD_NEW_DATA(-state_offset, count, 0); }
1680     }
1681     }
1682     break;
1683    
1684 nigel 77 /* ========================================================================== */
1685     /* These opcodes are followed by a character that is usually compared
1686     to the current subject character; it is loaded into d. We still get
1687     here even if there is no subject character, because in some cases zero
1688     repetitions are permitted. */
1689    
1690     /*-----------------------------------------------------------------*/
1691     case OP_CHAR:
1692     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1693     break;
1694    
1695     /*-----------------------------------------------------------------*/
1696     case OP_CHARNC:
1697     if (clen == 0) break;
1698    
1699     #ifdef SUPPORT_UTF8
1700     if (utf8)
1701     {
1702     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1703     {
1704 nigel 93 unsigned int othercase;
1705 nigel 77 if (c < 128) othercase = fcc[c]; else
1706    
1707     /* If we have Unicode property support, we can use it to test the
1708 nigel 87 other case of the character. */
1709 nigel 77
1710     #ifdef SUPPORT_UCP
1711 ph10 349 othercase = UCD_OTHERCASE(c);
1712 nigel 87 #else
1713 nigel 93 othercase = NOTACHAR;
1714 nigel 77 #endif
1715    
1716     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1717     }
1718     }
1719     else
1720     #endif /* SUPPORT_UTF8 */
1721    
1722     /* Non-UTF-8 mode */
1723     {
1724     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1725     }
1726     break;
1727    
1728    
1729     #ifdef SUPPORT_UCP
1730     /*-----------------------------------------------------------------*/
1731     /* This is a tricky one because it can match more than one character.
1732     Find out how many characters to skip, and then set up a negative state
1733     to wait for them to pass before continuing. */
1734    
1735     case OP_EXTUNI:
1736 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1737 nigel 77 {
1738     const uschar *nptr = ptr + clen;
1739     int ncount = 0;
1740     while (nptr < end_subject)
1741     {
1742     int nclen = 1;
1743     GETCHARLEN(c, nptr, nclen);
1744 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1745 nigel 77 ncount++;
1746     nptr += nclen;
1747     }
1748     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1749     }
1750     break;
1751     #endif
1752    
1753     /*-----------------------------------------------------------------*/
1754 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1755     character (when CR is followed by LF). In this case, set up a negative
1756     state to wait for one character to pass before continuing. */
1757    
1758     case OP_ANYNL:
1759     if (clen > 0) switch(c)
1760     {
1761     case 0x000b:
1762     case 0x000c:
1763     case 0x0085:
1764     case 0x2028:
1765     case 0x2029:
1766 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1767    
1768     case 0x000a:
1769 nigel 93 ADD_NEW(state_offset + 1, 0);
1770     break;
1771 ph10 231
1772 nigel 93 case 0x000d:
1773     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1774     {
1775     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1776     }
1777     else
1778     {
1779     ADD_NEW(state_offset + 1, 0);
1780     }
1781     break;
1782     }
1783     break;
1784    
1785     /*-----------------------------------------------------------------*/
1786 ph10 178 case OP_NOT_VSPACE:
1787     if (clen > 0) switch(c)
1788     {
1789     case 0x000a:
1790     case 0x000b:
1791     case 0x000c:
1792     case 0x000d:
1793     case 0x0085:
1794     case 0x2028:
1795     case 0x2029:
1796     break;
1797 ph10 182
1798     default:
1799 ph10 178 ADD_NEW(state_offset + 1, 0);
1800     break;
1801     }
1802     break;
1803    
1804     /*-----------------------------------------------------------------*/
1805     case OP_VSPACE:
1806     if (clen > 0) switch(c)
1807     {
1808     case 0x000a:
1809     case 0x000b:
1810     case 0x000c:
1811     case 0x000d:
1812     case 0x0085:
1813     case 0x2028:
1814     case 0x2029:
1815     ADD_NEW(state_offset + 1, 0);
1816     break;
1817 ph10 182
1818 ph10 178 default: break;
1819     }
1820     break;
1821    
1822     /*-----------------------------------------------------------------*/
1823     case OP_NOT_HSPACE:
1824     if (clen > 0) switch(c)
1825     {
1826     case 0x09: /* HT */
1827     case 0x20: /* SPACE */
1828     case 0xa0: /* NBSP */
1829     case 0x1680: /* OGHAM SPACE MARK */
1830     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1831     case 0x2000: /* EN QUAD */
1832     case 0x2001: /* EM QUAD */
1833     case 0x2002: /* EN SPACE */
1834     case 0x2003: /* EM SPACE */
1835     case 0x2004: /* THREE-PER-EM SPACE */
1836     case 0x2005: /* FOUR-PER-EM SPACE */
1837     case 0x2006: /* SIX-PER-EM SPACE */
1838     case 0x2007: /* FIGURE SPACE */
1839     case 0x2008: /* PUNCTUATION SPACE */
1840     case 0x2009: /* THIN SPACE */
1841     case 0x200A: /* HAIR SPACE */
1842     case 0x202f: /* NARROW NO-BREAK SPACE */
1843     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1844     case 0x3000: /* IDEOGRAPHIC SPACE */
1845     break;
1846 ph10 182
1847     default:
1848 ph10 178 ADD_NEW(state_offset + 1, 0);
1849     break;
1850     }
1851     break;
1852    
1853     /*-----------------------------------------------------------------*/
1854     case OP_HSPACE:
1855     if (clen > 0) switch(c)
1856     {
1857     case 0x09: /* HT */
1858     case 0x20: /* SPACE */
1859     case 0xa0: /* NBSP */
1860     case 0x1680: /* OGHAM SPACE MARK */
1861     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1862     case 0x2000: /* EN QUAD */
1863     case 0x2001: /* EM QUAD */
1864     case 0x2002: /* EN SPACE */
1865     case 0x2003: /* EM SPACE */
1866     case 0x2004: /* THREE-PER-EM SPACE */
1867     case 0x2005: /* FOUR-PER-EM SPACE */
1868     case 0x2006: /* SIX-PER-EM SPACE */
1869     case 0x2007: /* FIGURE SPACE */
1870     case 0x2008: /* PUNCTUATION SPACE */
1871     case 0x2009: /* THIN SPACE */
1872     case 0x200A: /* HAIR SPACE */
1873     case 0x202f: /* NARROW NO-BREAK SPACE */
1874     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1875     case 0x3000: /* IDEOGRAPHIC SPACE */
1876     ADD_NEW(state_offset + 1, 0);
1877     break;
1878     }
1879     break;
1880    
1881     /*-----------------------------------------------------------------*/
1882 nigel 77 /* Match a negated single character. This is only used for one-byte
1883     characters, that is, we know that d < 256. The character we are
1884     checking (c) can be multibyte. */
1885    
1886     case OP_NOT:
1887     if (clen > 0)
1888     {
1889 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1890 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1891     }
1892     break;
1893    
1894     /*-----------------------------------------------------------------*/
1895     case OP_PLUS:
1896     case OP_MINPLUS:
1897 nigel 93 case OP_POSPLUS:
1898 nigel 77 case OP_NOTPLUS:
1899     case OP_NOTMINPLUS:
1900 nigel 93 case OP_NOTPOSPLUS:
1901 nigel 77 count = current_state->count; /* Already matched */
1902     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1903     if (clen > 0)
1904     {
1905 nigel 93 unsigned int otherd = NOTACHAR;
1906 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1907     {
1908     #ifdef SUPPORT_UTF8
1909 nigel 87 if (utf8 && d >= 128)
1910 nigel 77 {
1911     #ifdef SUPPORT_UCP
1912 ph10 349 otherd = UCD_OTHERCASE(d);
1913 nigel 77 #endif /* SUPPORT_UCP */
1914     }
1915     else
1916     #endif /* SUPPORT_UTF8 */
1917     otherd = fcc[d];
1918     }
1919     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1920 nigel 93 {
1921     if (count > 0 &&
1922     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1923     {
1924     active_count--; /* Remove non-match possibility */
1925     next_active_state--;
1926     }
1927     count++;
1928     ADD_NEW(state_offset, count);
1929     }
1930 nigel 77 }
1931     break;
1932    
1933     /*-----------------------------------------------------------------*/
1934     case OP_QUERY:
1935     case OP_MINQUERY:
1936 nigel 93 case OP_POSQUERY:
1937 nigel 77 case OP_NOTQUERY:
1938     case OP_NOTMINQUERY:
1939 nigel 93 case OP_NOTPOSQUERY:
1940 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1941     if (clen > 0)
1942     {
1943 nigel 93 unsigned int otherd = NOTACHAR;
1944 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1945 nigel 77 {
1946     #ifdef SUPPORT_UTF8
1947 nigel 87 if (utf8 && d >= 128)
1948 nigel 77 {
1949     #ifdef SUPPORT_UCP
1950 ph10 349 otherd = UCD_OTHERCASE(d);
1951 nigel 77 #endif /* SUPPORT_UCP */
1952     }
1953     else
1954     #endif /* SUPPORT_UTF8 */
1955     otherd = fcc[d];
1956     }
1957     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1958 nigel 93 {
1959     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1960     {
1961     active_count--; /* Remove non-match possibility */
1962     next_active_state--;
1963     }
1964     ADD_NEW(state_offset + dlen + 1, 0);
1965     }
1966 nigel 77 }
1967     break;
1968    
1969     /*-----------------------------------------------------------------*/
1970     case OP_STAR:
1971     case OP_MINSTAR:
1972 nigel 93 case OP_POSSTAR:
1973 nigel 77 case OP_NOTSTAR:
1974     case OP_NOTMINSTAR:
1975 nigel 93 case OP_NOTPOSSTAR:
1976 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1977     if (clen > 0)
1978     {
1979 nigel 93 unsigned int otherd = NOTACHAR;
1980 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1981 nigel 77 {
1982     #ifdef SUPPORT_UTF8
1983 nigel 87 if (utf8 && d >= 128)
1984 nigel 77 {
1985     #ifdef SUPPORT_UCP
1986 ph10 349 otherd = UCD_OTHERCASE(d);
1987 nigel 77 #endif /* SUPPORT_UCP */
1988     }
1989     else
1990     #endif /* SUPPORT_UTF8 */
1991     otherd = fcc[d];
1992     }
1993     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1994 nigel 93 {
1995     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1996     {
1997     active_count--; /* Remove non-match possibility */
1998     next_active_state--;
1999     }
2000     ADD_NEW(state_offset, 0);
2001     }
2002 nigel 77 }
2003     break;
2004    
2005     /*-----------------------------------------------------------------*/
2006     case OP_EXACT:
2007 nigel 93 case OP_NOTEXACT:
2008     count = current_state->count; /* Number already matched */
2009     if (clen > 0)
2010     {
2011     unsigned int otherd = NOTACHAR;
2012     if ((ims & PCRE_CASELESS) != 0)
2013     {
2014     #ifdef SUPPORT_UTF8
2015     if (utf8 && d >= 128)
2016     {
2017     #ifdef SUPPORT_UCP
2018 ph10 349 otherd = UCD_OTHERCASE(d);
2019 nigel 93 #endif /* SUPPORT_UCP */
2020     }
2021     else
2022     #endif /* SUPPORT_UTF8 */
2023     otherd = fcc[d];
2024     }
2025     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2026     {
2027     if (++count >= GET2(code, 1))
2028     { ADD_NEW(state_offset + dlen + 3, 0); }
2029     else
2030     { ADD_NEW(state_offset, count); }
2031     }
2032     }
2033     break;
2034    
2035     /*-----------------------------------------------------------------*/
2036 nigel 77 case OP_UPTO:
2037     case OP_MINUPTO:
2038 nigel 93 case OP_POSUPTO:
2039 nigel 77 case OP_NOTUPTO:
2040     case OP_NOTMINUPTO:
2041 nigel 93 case OP_NOTPOSUPTO:
2042     ADD_ACTIVE(state_offset + dlen + 3, 0);
2043 nigel 77 count = current_state->count; /* Number already matched */
2044     if (clen > 0)
2045     {
2046 nigel 93 unsigned int otherd = NOTACHAR;
2047 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2048     {
2049     #ifdef SUPPORT_UTF8
2050 nigel 87 if (utf8 && d >= 128)
2051 nigel 77 {
2052     #ifdef SUPPORT_UCP
2053 ph10 349 otherd = UCD_OTHERCASE(d);
2054 nigel 77 #endif /* SUPPORT_UCP */
2055     }
2056     else
2057     #endif /* SUPPORT_UTF8 */
2058     otherd = fcc[d];
2059     }
2060     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2061     {
2062 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2063     {
2064     active_count--; /* Remove non-match possibility */
2065     next_active_state--;
2066     }
2067 nigel 77 if (++count >= GET2(code, 1))
2068     { ADD_NEW(state_offset + dlen + 3, 0); }
2069     else
2070     { ADD_NEW(state_offset, count); }
2071     }
2072     }
2073     break;
2074    
2075    
2076     /* ========================================================================== */
2077     /* These are the class-handling opcodes */
2078    
2079     case OP_CLASS:
2080     case OP_NCLASS:
2081     case OP_XCLASS:
2082     {
2083     BOOL isinclass = FALSE;
2084     int next_state_offset;
2085     const uschar *ecode;
2086    
2087     /* For a simple class, there is always just a 32-byte table, and we
2088     can set isinclass from it. */
2089    
2090     if (codevalue != OP_XCLASS)
2091     {
2092     ecode = code + 33;
2093     if (clen > 0)
2094     {
2095     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2096     ((code[1 + c/8] & (1 << (c&7))) != 0);
2097     }
2098     }
2099    
2100     /* An extended class may have a table or a list of single characters,
2101     ranges, or both, and it may be positive or negative. There's a
2102     function that sorts all this out. */
2103    
2104     else
2105     {
2106     ecode = code + GET(code, 1);
2107     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2108     }
2109    
2110     /* At this point, isinclass is set for all kinds of class, and ecode
2111     points to the byte after the end of the class. If there is a
2112     quantifier, this is where it will be. */
2113    
2114     next_state_offset = ecode - start_code;
2115    
2116     switch (*ecode)
2117     {
2118     case OP_CRSTAR:
2119     case OP_CRMINSTAR:
2120     ADD_ACTIVE(next_state_offset + 1, 0);
2121     if (isinclass) { ADD_NEW(state_offset, 0); }
2122     break;
2123    
2124     case OP_CRPLUS:
2125     case OP_CRMINPLUS:
2126     count = current_state->count; /* Already matched */
2127     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2128     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2129     break;
2130    
2131     case OP_CRQUERY:
2132     case OP_CRMINQUERY:
2133     ADD_ACTIVE(next_state_offset + 1, 0);
2134     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2135     break;
2136    
2137     case OP_CRRANGE:
2138     case OP_CRMINRANGE:
2139     count = current_state->count; /* Already matched */
2140     if (count >= GET2(ecode, 1))
2141     { ADD_ACTIVE(next_state_offset + 5, 0); }
2142     if (isinclass)
2143     {
2144 nigel 91 int max = GET2(ecode, 3);
2145     if (++count >= max && max != 0) /* Max 0 => no limit */
2146 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2147     else
2148     { ADD_NEW(state_offset, count); }
2149     }
2150     break;
2151    
2152     default:
2153     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2154     break;
2155     }
2156     }
2157     break;
2158    
2159     /* ========================================================================== */
2160     /* These are the opcodes for fancy brackets of various kinds. We have
2161 ph10 345 to use recursion in order to handle them. The "always failing" assertion
2162 ph10 341 (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2163     though the other "backtracking verbs" are not supported. */
2164 ph10 345
2165 ph10 341 case OP_FAIL:
2166 ph10 345 break;
2167 nigel 77
2168     case OP_ASSERT:
2169     case OP_ASSERT_NOT:
2170     case OP_ASSERTBACK:
2171     case OP_ASSERTBACK_NOT:
2172     {
2173     int rc;
2174     int local_offsets[2];
2175     int local_workspace[1000];
2176     const uschar *endasscode = code + GET(code, 1);
2177    
2178     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2179    
2180     rc = internal_dfa_exec(
2181     md, /* static match data */
2182     code, /* this subexpression's code */
2183     ptr, /* where we currently are */
2184     ptr - start_subject, /* start offset */
2185     local_offsets, /* offset vector */
2186     sizeof(local_offsets)/sizeof(int), /* size of same */
2187     local_workspace, /* workspace vector */
2188     sizeof(local_workspace)/sizeof(int), /* size of same */
2189     ims, /* the current ims flags */
2190     rlevel, /* function recursion level */
2191     recursing); /* pass on regex recursion */
2192    
2193     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2194     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2195     }
2196     break;
2197    
2198     /*-----------------------------------------------------------------*/
2199     case OP_COND:
2200 nigel 93 case OP_SCOND:
2201 nigel 77 {
2202     int local_offsets[1000];
2203     int local_workspace[1000];
2204 ph10 397 int codelink = GET(code, 1);
2205     int condcode;
2206    
2207     /* Because of the way auto-callout works during compile, a callout item
2208     is inserted between OP_COND and an assertion condition. */
2209 nigel 77
2210 ph10 397 if (code[LINK_SIZE+1] == OP_CALLOUT)
2211     {
2212     if (pcre_callout != NULL)
2213     {
2214     int rrc;
2215     pcre_callout_block cb;
2216     cb.version = 1; /* Version 1 of the callout block */
2217     cb.callout_number = code[LINK_SIZE+2];
2218     cb.offset_vector = offsets;
2219     cb.subject = (PCRE_SPTR)start_subject;
2220     cb.subject_length = end_subject - start_subject;
2221     cb.start_match = current_subject - start_subject;
2222     cb.current_position = ptr - start_subject;
2223     cb.pattern_position = GET(code, LINK_SIZE + 3);
2224     cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2225     cb.capture_top = 1;
2226     cb.capture_last = -1;
2227     cb.callout_data = md->callout_data;
2228     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2229     if (rrc == 0) { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2230     }
2231     code += _pcre_OP_lengths[OP_CALLOUT];
2232     }
2233    
2234     condcode = code[LINK_SIZE+1];
2235    
2236 nigel 93 /* Back reference conditions are not supported */
2237 nigel 77
2238 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2239    
2240     /* The DEFINE condition is always false */
2241    
2242     if (condcode == OP_DEF)
2243 nigel 77 {
2244 ph10 397 ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0);
2245 nigel 93 }
2246    
2247     /* The only supported version of OP_RREF is for the value RREF_ANY,
2248     which means "test if in any recursion". We can't test for specifically
2249     recursed groups. */
2250    
2251     else if (condcode == OP_RREF)
2252     {
2253 nigel 77 int value = GET2(code, LINK_SIZE+2);
2254 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2255 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2256 ph10 397 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2257 nigel 77 }
2258    
2259     /* Otherwise, the condition is an assertion */
2260    
2261     else
2262     {
2263     int rc;
2264     const uschar *asscode = code + LINK_SIZE + 1;
2265     const uschar *endasscode = asscode + GET(asscode, 1);
2266    
2267     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2268    
2269     rc = internal_dfa_exec(
2270     md, /* fixed match data */
2271     asscode, /* this subexpression's code */
2272     ptr, /* where we currently are */
2273     ptr - start_subject, /* start offset */
2274     local_offsets, /* offset vector */
2275     sizeof(local_offsets)/sizeof(int), /* size of same */
2276     local_workspace, /* workspace vector */
2277     sizeof(local_workspace)/sizeof(int), /* size of same */
2278     ims, /* the current ims flags */
2279     rlevel, /* function recursion level */
2280     recursing); /* pass on regex recursion */
2281    
2282     if ((rc >= 0) ==
2283     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2284 ph10 397 {
2285     ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0);
2286     }
2287 nigel 77 else
2288 ph10 397 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2289 nigel 77 }
2290     }
2291     break;
2292    
2293     /*-----------------------------------------------------------------*/
2294     case OP_RECURSE:
2295     {
2296     int local_offsets[1000];
2297     int local_workspace[1000];
2298     int rc;
2299    
2300     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2301     recursing + 1));
2302    
2303     rc = internal_dfa_exec(
2304     md, /* fixed match data */
2305     start_code + GET(code, 1), /* this subexpression's code */
2306     ptr, /* where we currently are */
2307     ptr - start_subject, /* start offset */
2308     local_offsets, /* offset vector */
2309     sizeof(local_offsets)/sizeof(int), /* size of same */
2310     local_workspace, /* workspace vector */
2311     sizeof(local_workspace)/sizeof(int), /* size of same */
2312     ims, /* the current ims flags */
2313     rlevel, /* function recursion level */
2314     recursing + 1); /* regex recurse level */
2315    
2316     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2317     recursing + 1, rc));
2318    
2319     /* Ran out of internal offsets */
2320    
2321     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2322    
2323     /* For each successfully matched substring, set up the next state with a
2324     count of characters to skip before trying it. Note that the count is in
2325     characters, not bytes. */
2326    
2327     if (rc > 0)
2328     {
2329     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2330     {
2331     const uschar *p = start_subject + local_offsets[rc];
2332     const uschar *pp = start_subject + local_offsets[rc+1];
2333     int charcount = local_offsets[rc+1] - local_offsets[rc];
2334     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2335     if (charcount > 0)
2336     {
2337     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2338     }
2339     else
2340     {
2341     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2342     }
2343     }
2344     }
2345     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2346     }
2347     break;
2348    
2349     /*-----------------------------------------------------------------*/
2350     case OP_ONCE:
2351     {
2352     int local_offsets[2];
2353     int local_workspace[1000];
2354    
2355     int rc = internal_dfa_exec(
2356     md, /* fixed match data */
2357     code, /* this subexpression's code */
2358     ptr, /* where we currently are */
2359     ptr - start_subject, /* start offset */
2360     local_offsets, /* offset vector */
2361     sizeof(local_offsets)/sizeof(int), /* size of same */
2362     local_workspace, /* workspace vector */
2363     sizeof(local_workspace)/sizeof(int), /* size of same */
2364     ims, /* the current ims flags */
2365     rlevel, /* function recursion level */
2366     recursing); /* pass on regex recursion */
2367    
2368     if (rc >= 0)
2369     {
2370     const uschar *end_subpattern = code;
2371     int charcount = local_offsets[1] - local_offsets[0];
2372     int next_state_offset, repeat_state_offset;
2373    
2374     do { end_subpattern += GET(end_subpattern, 1); }
2375     while (*end_subpattern == OP_ALT);
2376     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2377    
2378     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2379     arrange for the repeat state also to be added to the relevant list.
2380     Calculate the offset, or set -1 for no repeat. */
2381    
2382     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2383     *end_subpattern == OP_KETRMIN)?
2384     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2385    
2386     /* If we have matched an empty string, add the next state at the
2387     current character pointer. This is important so that the duplicate
2388     checking kicks in, which is what breaks infinite loops that match an
2389     empty string. */
2390    
2391     if (charcount == 0)
2392     {
2393     ADD_ACTIVE(next_state_offset, 0);
2394     }
2395    
2396     /* Optimization: if there are no more active states, and there
2397     are no new states yet set up, then skip over the subject string
2398     right here, to save looping. Otherwise, set up the new state to swing
2399     into action when the end of the substring is reached. */
2400    
2401     else if (i + 1 >= active_count && new_count == 0)
2402     {
2403     ptr += charcount;
2404     clen = 0;
2405     ADD_NEW(next_state_offset, 0);
2406    
2407     /* If we are adding a repeat state at the new character position,
2408     we must fudge things so that it is the only current state.
2409     Otherwise, it might be a duplicate of one we processed before, and
2410     that would cause it to be skipped. */
2411    
2412     if (repeat_state_offset >= 0)
2413     {
2414     next_active_state = active_states;
2415     active_count = 0;
2416     i = -1;
2417     ADD_ACTIVE(repeat_state_offset, 0);
2418     }
2419     }
2420     else
2421     {
2422     const uschar *p = start_subject + local_offsets[0];
2423     const uschar *pp = start_subject + local_offsets[1];
2424     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2425     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2426     if (repeat_state_offset >= 0)
2427     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2428     }
2429    
2430     }
2431     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2432     }
2433     break;
2434    
2435    
2436     /* ========================================================================== */
2437     /* Handle callouts */
2438    
2439     case OP_CALLOUT:
2440 ph10 397 rrc = 0;
2441 nigel 77 if (pcre_callout != NULL)
2442     {
2443     pcre_callout_block cb;
2444     cb.version = 1; /* Version 1 of the callout block */
2445     cb.callout_number = code[1];
2446     cb.offset_vector = offsets;
2447 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2448 nigel 77 cb.subject_length = end_subject - start_subject;
2449     cb.start_match = current_subject - start_subject;
2450     cb.current_position = ptr - start_subject;
2451     cb.pattern_position = GET(code, 2);
2452     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2453     cb.capture_top = 1;
2454     cb.capture_last = -1;
2455     cb.callout_data = md->callout_data;
2456     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2457 ph10 397 }
2458     if (rrc == 0)
2459     { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2460 nigel 77 break;
2461    
2462    
2463     /* ========================================================================== */
2464     default: /* Unsupported opcode */
2465     return PCRE_ERROR_DFA_UITEM;
2466     }
2467    
2468     NEXT_ACTIVE_STATE: continue;
2469    
2470     } /* End of loop scanning active states */
2471    
2472     /* We have finished the processing at the current subject character. If no
2473     new states have been set for the next character, we have found all the
2474     matches that we are going to find. If we are at the top level and partial
2475     matching has been requested, check for appropriate conditions. */
2476    
2477     if (new_count <= 0)
2478     {
2479     if (match_count < 0 && /* No matches found */
2480     rlevel == 1 && /* Top level match function */
2481     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2482     ptr >= end_subject && /* Reached end of subject */
2483     ptr > current_subject) /* Matched non-empty string */
2484     {
2485     if (offsetcount >= 2)
2486     {
2487     offsets[0] = current_subject - start_subject;
2488     offsets[1] = end_subject - start_subject;
2489     }
2490     match_count = PCRE_ERROR_PARTIAL;
2491     }
2492    
2493     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2494     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2495     rlevel*2-2, SP));
2496 nigel 91 break; /* In effect, "return", but see the comment below */
2497 nigel 77 }
2498    
2499     /* One or more states are active for the next character. */
2500    
2501     ptr += clen; /* Advance to next subject character */
2502     } /* Loop to move along the subject string */
2503    
2504 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2505     if we use "return" above, we have compiler trouble. Some compilers warn if
2506     there's nothing here because they think the function doesn't return a value. On
2507     the other hand, if we put a dummy statement here, some more clever compilers
2508     complain that it can't be reached. Sigh. */
2509 nigel 77
2510 nigel 91 return match_count;
2511 nigel 77 }
2512    
2513    
2514    
2515    
2516     /*************************************************
2517     * Execute a Regular Expression - DFA engine *
2518     *************************************************/
2519    
2520     /* This external function applies a compiled re to a subject string using a DFA
2521     engine. This function calls the internal function multiple times if the pattern
2522     is not anchored.
2523    
2524     Arguments:
2525     argument_re points to the compiled expression
2526 ph10 97 extra_data points to extra data or is NULL
2527 nigel 77 subject points to the subject string
2528     length length of subject string (may contain binary zeros)
2529     start_offset where to start in the subject string
2530     options option bits
2531     offsets vector of match offsets
2532     offsetcount size of same
2533     workspace workspace vector
2534     wscount size of same
2535    
2536     Returns: > 0 => number of match offset pairs placed in offsets
2537     = 0 => offsets overflowed; longest matches are present
2538     -1 => failed to match
2539     < -1 => some kind of unexpected problem
2540     */
2541    
2542 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2543 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2544     const char *subject, int length, int start_offset, int options, int *offsets,
2545     int offsetcount, int *workspace, int wscount)
2546     {
2547     real_pcre *re = (real_pcre *)argument_re;
2548     dfa_match_data match_block;
2549 nigel 91 dfa_match_data *md = &match_block;
2550 nigel 77 BOOL utf8, anchored, startline, firstline;
2551     const uschar *current_subject, *end_subject, *lcc;
2552    
2553     pcre_study_data internal_study;
2554     const pcre_study_data *study = NULL;
2555     real_pcre internal_re;
2556    
2557     const uschar *req_byte_ptr;
2558     const uschar *start_bits = NULL;
2559     BOOL first_byte_caseless = FALSE;
2560     BOOL req_byte_caseless = FALSE;
2561     int first_byte = -1;
2562     int req_byte = -1;
2563     int req_byte2 = -1;
2564 nigel 91 int newline;
2565 nigel 77
2566     /* Plausibility checks */
2567    
2568     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2569     if (re == NULL || subject == NULL || workspace == NULL ||
2570     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2571     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2572     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2573    
2574     /* We need to find the pointer to any study data before we test for byte
2575     flipping, so we scan the extra_data block first. This may set two fields in the
2576     match block, so we must initialize them beforehand. However, the other fields
2577     in the match block must not be set until after the byte flipping. */
2578    
2579 nigel 91 md->tables = re->tables;
2580     md->callout_data = NULL;
2581 nigel 77
2582     if (extra_data != NULL)
2583     {
2584     unsigned int flags = extra_data->flags;
2585     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2586     study = (const pcre_study_data *)extra_data->study_data;
2587     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2588 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2589     return PCRE_ERROR_DFA_UMLIMIT;
2590 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2591 nigel 91 md->callout_data = extra_data->callout_data;
2592 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2593 nigel 91 md->tables = extra_data->tables;
2594 nigel 77 }
2595    
2596     /* Check that the first field in the block is the magic number. If it is not,
2597     test for a regex that was compiled on a host of opposite endianness. If this is
2598     the case, flipped values are put in internal_re and internal_study if there was
2599     study data too. */
2600    
2601     if (re->magic_number != MAGIC_NUMBER)
2602     {
2603     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2604     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2605     if (study != NULL) study = &internal_study;
2606     }
2607    
2608     /* Set some local values */
2609    
2610     current_subject = (const unsigned char *)subject + start_offset;
2611     end_subject = (const unsigned char *)subject + length;
2612     req_byte_ptr = current_subject - 1;
2613    
2614 nigel 91 #ifdef SUPPORT_UTF8
2615 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2616 nigel 91 #else
2617     utf8 = FALSE;
2618     #endif
2619 nigel 77
2620 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2621     (re->options & PCRE_ANCHORED) != 0;
2622    
2623 nigel 77 /* The remaining fixed data for passing around. */
2624    
2625 nigel 91 md->start_code = (const uschar *)argument_re +
2626 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2627 nigel 91 md->start_subject = (const unsigned char *)subject;
2628     md->end_subject = end_subject;
2629     md->moptions = options;
2630     md->poptions = re->options;
2631 nigel 77
2632 ph10 231 /* If the BSR option is not set at match time, copy what was set
2633     at compile time. */
2634    
2635     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2636     {
2637     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2638     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2639     #ifdef BSR_ANYCRLF
2640     else md->moptions |= PCRE_BSR_ANYCRLF;
2641 ph10 243 #endif
2642     }
2643 ph10 231
2644 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2645     nothing is set at run time, whatever was used at compile time applies. */
2646 nigel 91
2647 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2648 nigel 93 PCRE_NEWLINE_BITS)
2649 nigel 91 {
2650 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2651 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2652     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2653 nigel 91 case PCRE_NEWLINE_CR+
2654 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2655 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2656 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2657 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2658 nigel 91 }
2659    
2660 ph10 149 if (newline == -2)
2661 nigel 91 {
2662 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2663     }
2664     else if (newline < 0)
2665     {
2666 nigel 93 md->nltype = NLTYPE_ANY;
2667 nigel 91 }
2668     else
2669     {
2670 nigel 93 md->nltype = NLTYPE_FIXED;
2671     if (newline > 255)
2672     {
2673     md->nllen = 2;
2674     md->nl[0] = (newline >> 8) & 255;
2675     md->nl[1] = newline & 255;
2676     }
2677     else
2678     {
2679     md->nllen = 1;
2680     md->nl[0] = newline;
2681     }
2682 nigel 91 }
2683    
2684 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2685     back the character offset. */
2686    
2687     #ifdef SUPPORT_UTF8
2688     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2689     {
2690     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2691     return PCRE_ERROR_BADUTF8;
2692     if (start_offset > 0 && start_offset < length)
2693     {
2694     int tb = ((uschar *)subject)[start_offset];
2695     if (tb > 127)
2696     {
2697     tb &= 0xc0;
2698     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2699     }
2700     }
2701     }
2702     #endif
2703    
2704     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2705     is a feature that makes it possible to save compiled regex and re-use them
2706     in other programs later. */
2707    
2708 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2709 nigel 77
2710     /* The lower casing table and the "must be at the start of a line" flag are
2711     used in a loop when finding where to start. */
2712    
2713 nigel 91 lcc = md->tables + lcc_offset;
2714 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2715 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2716    
2717     /* Set up the first character to match, if available. The first_byte value is
2718     never set for an anchored regular expression, but the anchoring may be forced
2719     at run time, so we have to test for anchoring. The first char may be unset for
2720     an unanchored pattern, of course. If there's no first char and the pattern was
2721     studied, there may be a bitmap of possible first characters. */
2722    
2723     if (!anchored)
2724     {
2725 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2726 nigel 77 {
2727     first_byte = re->first_byte & 255;
2728     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2729     first_byte = lcc[first_byte];
2730     }
2731     else
2732     {
2733     if (startline && study != NULL &&
2734     (study->options & PCRE_STUDY_MAPPED) != 0)
2735     start_bits = study->start_bits;
2736     }
2737     }
2738    
2739     /* For anchored or unanchored matches, there may be a "last known required
2740     character" set. */
2741    
2742 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2743 nigel 77 {
2744     req_byte = re->req_byte & 255;
2745     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2746 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2747 nigel 77 }
2748    
2749     /* Call the main matching function, looping for a non-anchored regex after a
2750 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2751     a match. */
2752 nigel 77
2753     for (;;)
2754     {
2755     int rc;
2756    
2757     if ((options & PCRE_DFA_RESTART) == 0)
2758     {
2759     const uschar *save_end_subject = end_subject;
2760    
2761 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2762     line of a multiline string. Implement this by temporarily adjusting
2763     end_subject so that we stop scanning at a newline. If the match fails at
2764     the newline, later code breaks this loop. */
2765 nigel 77
2766     if (firstline)
2767     {
2768 ph10 365 USPTR t = current_subject;
2769     #ifdef SUPPORT_UTF8
2770     if (utf8)
2771 ph10 371 {
2772     while (t < md->end_subject && !IS_NEWLINE(t))
2773 ph10 365 {
2774     t++;
2775     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2776 ph10 371 }
2777 ph10 365 }
2778     else
2779 ph10 371 #endif
2780 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2781 nigel 77 end_subject = t;
2782     }
2783 ph10 392
2784 ph10 389 /* There are some optimizations that avoid running the match if a known
2785     starting point is not found, or if a known later character is not present.
2786     However, there is an option that disables these, for testing and for
2787     ensuring that all callouts do actually occur. */
2788 nigel 77
2789 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2790 ph10 392 {
2791    
2792 ph10 389 /* Advance to a known first byte. */
2793 ph10 392
2794 ph10 389 if (first_byte >= 0)
2795 nigel 77 {
2796 ph10 389 if (first_byte_caseless)
2797     while (current_subject < end_subject &&
2798     lcc[*current_subject] != first_byte)
2799     current_subject++;
2800     else
2801 ph10 392 while (current_subject < end_subject &&
2802 ph10 389 *current_subject != first_byte)
2803     current_subject++;
2804     }
2805 ph10 392
2806 ph10 389 /* Or to just after a linebreak for a multiline match if possible */
2807 ph10 392
2808 ph10 389 else if (startline)
2809     {
2810     if (current_subject > md->start_subject + start_offset)
2811     {
2812 ph10 365 #ifdef SUPPORT_UTF8
2813 ph10 389 if (utf8)
2814 ph10 365 {
2815 ph10 392 while (current_subject < end_subject &&
2816 ph10 389 !WAS_NEWLINE(current_subject))
2817     {
2818 ph10 365 current_subject++;
2819 ph10 389 while(current_subject < end_subject &&
2820     (*current_subject & 0xc0) == 0x80)
2821     current_subject++;
2822     }
2823 ph10 371 }
2824 ph10 389 else
2825     #endif
2826     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2827     current_subject++;
2828 ph10 392
2829 ph10 389 /* If we have just passed a CR and the newline option is ANY or
2830     ANYCRLF, and we are now at a LF, advance the match position by one
2831     more character. */
2832 ph10 392
2833 ph10 391 if (current_subject[-1] == CHAR_CR &&
2834 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2835     current_subject < end_subject &&
2836 ph10 391 *current_subject == CHAR_NL)
2837 ph10 389 current_subject++;
2838 ph10 365 }
2839 nigel 77 }
2840 ph10 392
2841 ph10 389 /* Or to a non-unique first char after study */
2842 ph10 392
2843 ph10 389 else if (start_bits != NULL)
2844 nigel 77 {
2845 ph10 389 while (current_subject < end_subject)
2846     {
2847     register unsigned int c = *current_subject;
2848     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2849     else break;
2850     }
2851 nigel 77 }
2852 ph10 392 }
2853 nigel 77
2854     /* Restore fudged end_subject */
2855    
2856     end_subject = save_end_subject;
2857     }
2858    
2859     /* If req_byte is set, we know that that character must appear in the subject
2860     for the match to succeed. If the first character is set, req_byte must be
2861     later in the subject; otherwise the test starts at the match point. This
2862     optimization can save a huge amount of work in patterns with nested unlimited
2863     repeats that aren't going to match. Writing separate code for cased/caseless
2864     versions makes it go faster, as does using an autoincrement and backing off
2865     on a match.
2866    
2867     HOWEVER: when the subject string is very, very long, searching to its end can
2868     take a long time, and give bad performance on quite ordinary patterns. This
2869     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2870     don't do this when the string is sufficiently long.
2871    
2872 ph10 392 ALSO: this processing is disabled when partial matching is requested, and can
2873 ph10 389 also be explicitly deactivated. */
2874 nigel 77
2875 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2876     req_byte >= 0 &&
2877 nigel 77 end_subject - current_subject < REQ_BYTE_MAX &&
2878     (options & PCRE_PARTIAL) == 0)
2879     {
2880     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2881    
2882     /* We don't need to repeat the search if we haven't yet reached the
2883     place we found it at last time. */
2884    
2885     if (p > req_byte_ptr)
2886     {
2887     if (req_byte_caseless)
2888     {
2889     while (p < end_subject)
2890     {
2891     register int pp = *p++;
2892     if (pp == req_byte || pp == req_byte2) { p--; break; }
2893     }
2894     }
2895     else
2896     {
2897     while (p < end_subject)
2898     {
2899     if (*p++ == req_byte) { p--; break; }
2900     }
2901     }
2902    
2903     /* If we can't find the required character, break the matching loop,
2904     which will cause a return of PCRE_ERROR_NOMATCH. */
2905    
2906     if (p >= end_subject) break;
2907    
2908     /* If we have found the required character, save the point where we
2909     found it, so that we don't search again next time round the loop if
2910     the start hasn't passed this character yet. */
2911    
2912     req_byte_ptr = p;
2913     }
2914     }
2915    
2916     /* OK, now we can do the business */
2917    
2918     rc = internal_dfa_exec(
2919 nigel 91 md, /* fixed match data */
2920     md->start_code, /* this subexpression's code */
2921     current_subject, /* where we currently are */
2922     start_offset, /* start offset in subject */
2923     offsets, /* offset vector */
2924     offsetcount, /* size of same */
2925     workspace, /* workspace vector */
2926     wscount, /* size of same */
2927 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2928 nigel 91 0, /* function recurse level */
2929     0); /* regex recurse level */
2930 nigel 77
2931     /* Anything other than "no match" means we are done, always; otherwise, carry
2932     on only if not anchored. */
2933    
2934     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2935    
2936     /* Advance to the next subject character unless we are at the end of a line
2937     and firstline is set. */
2938    
2939 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2940 nigel 77 current_subject++;
2941     if (utf8)
2942     {
2943     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2944     current_subject++;
2945     }
2946     if (current_subject > end_subject) break;
2947    
2948 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2949 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2950     or ANY or ANYCRLF, advance the match position by one more character. */
2951 nigel 93
2952 ph10 391 if (current_subject[-1] == CHAR_CR &&
2953 ph10 226 current_subject < end_subject &&
2954 ph10 391 *current_subject == CHAR_NL &&
2955 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2956 ph10 226 (md->nltype == NLTYPE_ANY ||
2957     md->nltype == NLTYPE_ANYCRLF ||
2958     md->nllen == 2))
2959 nigel 93 current_subject++;
2960    
2961     } /* "Bumpalong" loop */
2962    
2963 nigel 77 return PCRE_ERROR_NOMATCH;
2964     }
2965    
2966     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12