/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 361 - (hide annotations) (download)
Thu Jul 10 16:03:28 2008 UTC (6 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 95408 byte(s)
Fix off-end-of-buffer bug for patterns that match only at start of line.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains the external function pcre_dfa_exec(), which is an
42 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
43     FSM). This is NOT Perl-compatible, but it has advantages in certain
44     applications. */
45 nigel 77
46    
47 ph10 200 #ifdef HAVE_CONFIG_H
48 ph10 236 #include "config.h"
49 ph10 200 #endif
50 ph10 199
51 nigel 93 #define NLBLOCK md /* Block containing newline information */
52     #define PSSTART start_subject /* Field containing processed string start */
53     #define PSEND end_subject /* Field containing processed string end */
54    
55 nigel 77 #include "pcre_internal.h"
56    
57    
58     /* For use to indent debugging output */
59    
60     #define SP " "
61    
62    
63    
64     /*************************************************
65     * Code parameters and static tables *
66     *************************************************/
67    
68     /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 nigel 93 into others, under special conditions. A gap of 20 between the blocks should be
70 ph10 182 enough. The resulting opcodes don't have to be less than 256 because they are
71 ph10 178 never stored, so we push them well clear of the normal opcodes. */
72 nigel 77
/* One of these is added to codevalue when codevalue >= OP_TYPESTAR and the
opcode's argument d is the corresponding item (for example OP_PROP_EXTRA for
OP_PROP and OP_NOTPROP), so that the main opcode switch can give those
combinations their own case labels. */
73 ph10 178 #define OP_PROP_EXTRA 300
74     #define OP_EXTUNI_EXTRA 320
75     #define OP_ANYNL_EXTRA 340
76     #define OP_HSPACE_EXTRA 360
77     #define OP_VSPACE_EXTRA 380
78 nigel 77
79    
80     /* This table identifies those opcodes that are followed immediately by a
81     character that is to be tested in some way. This makes it possible to
82     centralize the loading of these characters. In the case of Type * etc, the
83     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
85 ph10 168 that follow must also be modified. */
86 nigel 77
/* Indexed by opcode value. A non-zero entry is the offset from the opcode to
the byte at which its inline character (or type opcode) starts; the matcher
loads that item into d. Zero means the opcode has no such item. The entries of
3 occur only for the upto/minupto/exact/upto+ forms - presumably because a
2-byte repeat count sits between the opcode and the character (TODO confirm
against the opcode layout in pcre_internal.h). */
87 ph10 327 static const uschar coptable[] = {
88 nigel 77 0, /* End */
89 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
92 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95     1, /* Char */
96     1, /* Charnc */
97     1, /* not */
98     /* Positive single-char repeats */
99     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100     3, 3, 3, /* upto, minupto, exact */
101 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 nigel 77 /* Negative single-char repeats - only for chars < 256 */
103     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104     3, 3, 3, /* NOT upto, minupto, exact */
105 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
106 nigel 77 /* Positive type repeats */
107     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108     3, 3, 3, /* Type upto, minupto, exact */
109 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 nigel 77 /* Character class & ref repeats */
111     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112     0, 0, /* CRRANGE, CRMINRANGE */
113     0, /* CLASS */
114     0, /* NCLASS */
115     0, /* XCLASS - variable length */
116     0, /* REF */
117     0, /* RECURSE */
118     0, /* CALLOUT */
119     0, /* Alt */
120     0, /* Ket */
121     0, /* KetRmax */
122     0, /* KetRmin */
123     0, /* Assert */
124     0, /* Assert not */
125     0, /* Assert behind */
126     0, /* Assert behind not */
127     0, /* Reverse */
128 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129     0, 0, 0, /* SBRA, SCBRA, SCOND */
130 nigel 77 0, /* CREF */
131 nigel 93 0, /* RREF */
132     0, /* DEF */
133 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
134     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136 nigel 77 };
137    
138     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139     and \w */
140    
/* Mask half of the test ((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0:
the ctype bit that is relevant for each type opcode, indexed by opcode. */
141 ph10 327 static const uschar toptable1[] = {
142 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
143 nigel 77 ctype_digit, ctype_digit, /* \D, \d */
144     ctype_space, ctype_space, /* \S, \s */
145     ctype_word, ctype_word, /* \W, \w */
146 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
147 nigel 77 };
148    
/* XOR half of the test ((ctypes[c] & toptable1[op]) ^ toptable2[op]) != 0.
For the negated types the entry repeats the mask, so the result is non-zero
only when the bit is absent; for the positive types it is 0, so the result is
non-zero only when the bit is present. The value 1 for OP_ANY and OP_ALLANY,
combined with a zero mask, makes the test always succeed. */
149 ph10 327 static const uschar toptable2[] = {
150 ph10 168 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
151 nigel 77 ctype_digit, 0, /* \D, \d */
152     ctype_space, 0, /* \S, \s */
153     ctype_word, 0, /* \W, \w */
154 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
155 nigel 77 };
156    
157    
158     /* Structure for holding data about a particular state, which is in effect the
159     current data for an active path through the match tree. It must consist
160     entirely of ints because the working vector we are passed, and which we put
161     these structures in, is a vector of ints. */
162    
163     typedef struct stateblock {
164     int offset; /* Offset to opcode, relative to start_code; a negative value
            marks a state that is held back until "data" more characters
            have been skipped, after which the offset is negated */
165     int count; /* Count for repeats */
166     int ims; /* ims flag bits in force for this state */
167     int data; /* Some use extra data, e.g. the number of characters still to
            skip for a held-back (negative offset) state */
168     } stateblock;
169    
/* Number of ints that one stateblock occupies in the int workspace vector;
used to convert the workspace size in ints into a number of states. */
170     #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int))
171    
172    
173     #ifdef DEBUG
174     /*************************************************
175     * Print character string *
176     *************************************************/
177    
178     /* Character string printing function for debugging.
179    
180     Arguments:
181     p points to string
182     length number of bytes
183     f where to print
184    
185     Returns: nothing
186     */
187    
/* Write length bytes starting at p to the stream f. Bytes for which isprint()
is true are written unchanged; every other byte is written as a \xHH escape
with two lower-case hex digits. A zero or negative length writes nothing.

The buffer is only read, so it is taken as a pointer to const; callers that
hold a const subject pointer no longer need to cast the qualifier away. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;          /* unsigned char value, so always valid for isprint() */
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
200     #endif
201    
202    
203    
204     /*************************************************
205     * Execute a Regular Expression - DFA engine *
206     *************************************************/
207    
208     /* This internal function applies a compiled pattern to a subject string,
209     starting at a given point, using a DFA engine. This function is called from the
210     external one, possibly multiple times if the pattern is not anchored. The
211     function calls itself recursively for some kinds of subpattern.
212    
213     Arguments:
214     md the match_data block with fixed information
215     this_start_code the opening bracket of this subexpression's code
216     current_subject where we currently are in the subject string
217     start_offset start offset in the subject string
218     offsets vector to contain the matching string offsets
219     offsetcount size of same
220     workspace vector of workspace
221     wscount size of same
222     ims the current ims flags
223     rlevel function call recursion level
224     recursing regex recursive call level
225    
226 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
227 ph10 341 = 0 => offsets overflowed; longest matches are present
228 nigel 77 -1 => failed to match
229     < -1 => some kind of unexpected problem
230    
231     The following macros are used for adding states to the two state vectors (one
232     for the current character, one for the following character). */
233    
/* Append a state to the active list. The macro relies on names from the
enclosing function rather than on its arguments: active_count, wscount,
next_active_state, ims and (for debug output only) rlevel. If the list is
already full it returns PCRE_ERROR_DFA_WSSIZE from the enclosing function. The
data field of the appended state is not written. */
234     #define ADD_ACTIVE(x,y) \
235     if (active_count++ < wscount) \
236     { \
237     next_active_state->offset = (x); \
238     next_active_state->count = (y); \
239     next_active_state->ims = ims; \
240     next_active_state++; \
241     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
242     } \
243     else return PCRE_ERROR_DFA_WSSIZE
244    
/* As ADD_ACTIVE, but also stores z in the data field of the new state. */
245     #define ADD_ACTIVE_DATA(x,y,z) \
246     if (active_count++ < wscount) \
247     { \
248     next_active_state->offset = (x); \
249     next_active_state->count = (y); \
250     next_active_state->ims = ims; \
251     next_active_state->data = (z); \
252     next_active_state++; \
253     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
254     } \
255     else return PCRE_ERROR_DFA_WSSIZE
256    
/* Append a state to the new list (the states for the following character).
Uses new_count and next_new_state in place of the active-list variables, and
otherwise behaves like ADD_ACTIVE, including the early return when the list is
full and leaving the data field unwritten. */
257     #define ADD_NEW(x,y) \
258     if (new_count++ < wscount) \
259     { \
260     next_new_state->offset = (x); \
261     next_new_state->count = (y); \
262     next_new_state->ims = ims; \
263     next_new_state++; \
264     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
265     } \
266     else return PCRE_ERROR_DFA_WSSIZE
267    
/* As ADD_NEW, but also stores z in the data field of the new state. */
268     #define ADD_NEW_DATA(x,y,z) \
269     if (new_count++ < wscount) \
270     { \
271     next_new_state->offset = (x); \
272     next_new_state->count = (y); \
273     next_new_state->ims = ims; \
274     next_new_state->data = (z); \
275     next_new_state++; \
276     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
277     } \
278     else return PCRE_ERROR_DFA_WSSIZE
279    
280     /* And now, here is the code */
281    
282     static int
283     internal_dfa_exec(
284     dfa_match_data *md,
285     const uschar *this_start_code,
286     const uschar *current_subject,
287     int start_offset,
288     int *offsets,
289     int offsetcount,
290     int *workspace,
291     int wscount,
292     int ims,
293     int rlevel,
294     int recursing)
295     {
296     stateblock *active_states, *new_states, *temp_states;
297     stateblock *next_active_state, *next_new_state;
298    
299     const uschar *ctypes, *lcc, *fcc;
300     const uschar *ptr;
301 nigel 93 const uschar *end_code, *first_op;
302 nigel 77
303     int active_count, new_count, match_count;
304    
305     /* Some fields in the md block are frequently referenced, so we load them into
306     independent variables in the hope that this will perform better. */
307    
308     const uschar *start_subject = md->start_subject;
309     const uschar *end_subject = md->end_subject;
310     const uschar *start_code = md->start_code;
311    
312 nigel 87 #ifdef SUPPORT_UTF8
313 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 nigel 93 #else
315     BOOL utf8 = FALSE;
316 nigel 87 #endif
317 nigel 77
318     rlevel++;
319     offsetcount &= (-2);
320    
321     wscount -= 2;
322     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323     (2 * INTS_PER_STATEBLOCK);
324    
325     DPRINTF(("\n%.*s---------------------\n"
326     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328    
329     ctypes = md->tables + ctypes_offset;
330     lcc = md->tables + lcc_offset;
331     fcc = md->tables + fcc_offset;
332    
333     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334    
335     active_states = (stateblock *)(workspace + 2);
336     next_new_state = new_states = active_states + wscount;
337     new_count = 0;
338    
339 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
340     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341    
342 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343     the alternative states onto the list, and find out where the end is. This
344     makes it possible to use this function recursively, when we want to stop at a
345     matching internal ket rather than at the end.
346    
347     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348     a backward assertion. In that case, we have to find out the maximum amount to
349     move back, and set up each alternative appropriately. */
350    
351 nigel 93 if (*first_op == OP_REVERSE)
352 nigel 77 {
353     int max_back = 0;
354     int gone_back;
355    
356     end_code = this_start_code;
357     do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363     while (*end_code == OP_ALT);
364    
365     /* If we can't go back the amount required for the longest lookbehind
366     pattern, go back as far as we can; some alternatives may still be viable. */
367    
368     #ifdef SUPPORT_UTF8
369     /* In character mode we have to step back character by character */
370    
371     if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374     {
375     if (current_subject <= start_subject) break;
376     current_subject--;
377     while (current_subject > start_subject &&
378     (*current_subject & 0xc0) == 0x80)
379     current_subject--;
380     }
381     }
382     else
383     #endif
384    
385     /* In byte-mode we can do this quickly. */
386    
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389     current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392    
393     /* Now we can process the individual branches. */
394    
395     end_code = this_start_code;
396     do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400     {
401     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402     ADD_NEW_DATA(-bstate, 0, gone_back - back);
403     }
404     end_code += GET(end_code, 1);
405     }
406     while (*end_code == OP_ALT);
407     }
408    
409     /* This is the code for a "normal" subpattern (not a backward assertion). The
410     start of a whole pattern is always one of these. If we are at the top level,
411     we may be asked to restart matching from the same point that we reached for a
412     previous partial match. We still have to scan through the top-level branches to
413     find the end state. */
414    
415     else
416     {
417     end_code = this_start_code;
418    
419     /* Restarting */
420    
421     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426     memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428    
429     /* Not restarting */
430    
431     else
432     {
433 nigel 93 int length = 1 + LINK_SIZE +
434     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 nigel 77 do
436     {
437 nigel 93 ADD_NEW(end_code - start_code + length, 0);
438 nigel 77 end_code += GET(end_code, 1);
439 nigel 93 length = 1 + LINK_SIZE;
440 nigel 77 }
441     while (*end_code == OP_ALT);
442     }
443     }
444    
445     workspace[0] = 0; /* Bit indicating which vector is current */
446    
447     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448    
449     /* Loop for scanning the subject */
450    
451     ptr = current_subject;
452     for (;;)
453     {
454     int i, j;
455 nigel 91 int clen, dlen;
456     unsigned int c, d;
457 nigel 77
458     /* Make the new state list into the active state list and empty the
459     new state list. */
460    
461     temp_states = active_states;
462     active_states = new_states;
463     new_states = temp_states;
464     active_count = new_count;
465     new_count = 0;
466    
467     workspace[0] ^= 1; /* Remember for the restarting feature */
468     workspace[1] = active_count;
469    
470     #ifdef DEBUG
471     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473     printf("\"\n");
474    
475     printf("%.*sActive states: ", rlevel*2-2, SP);
476     for (i = 0; i < active_count; i++)
477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
478     printf("\n");
479     #endif
480    
481     /* Set the pointers for adding new states */
482    
483     next_active_state = active_states + active_count;
484     next_new_state = new_states;
485    
486     /* Load the current character from the subject outside the loop, as many
487     different states may want to look at it, and we assume that at least one
488     will. */
489    
490     if (ptr < end_subject)
491     {
492 nigel 93 clen = 1; /* Number of bytes in the character */
493 nigel 77 #ifdef SUPPORT_UTF8
494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
495     #endif /* SUPPORT_UTF8 */
496     c = *ptr;
497     }
498     else
499     {
500 nigel 93 clen = 0; /* This indicates the end of the subject */
501     c = NOTACHAR; /* This value should never actually be used */
502 nigel 77 }
503    
504     /* Scan up the active states and act on each one. The result of an action
505     may be to add more states to the currently active list (e.g. on hitting a
506     parenthesis) or it may be to put states on the new list, for considering
507     when we move the character pointer on. */
508    
509     for (i = 0; i < active_count; i++)
510     {
511     stateblock *current_state = active_states + i;
512     const uschar *code;
513     int state_offset = current_state->offset;
514     int count, codevalue;
515    
516     #ifdef DEBUG
517     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
518 nigel 93 if (clen == 0) printf("EOL\n");
519 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
520     else printf("0x%02x\n", c);
521     #endif
522    
523     /* This variable is referred to implicitly in the ADD_xxx macros. */
524    
525     ims = current_state->ims;
526    
527     /* A negative offset is a special case meaning "hold off going to this
528     (negated) state until the number of characters in the data field have
529     been skipped". */
530    
531     if (state_offset < 0)
532     {
533     if (current_state->data > 0)
534     {
535     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
536     ADD_NEW_DATA(state_offset, current_state->count,
537     current_state->data - 1);
538     continue;
539     }
540     else
541     {
542     current_state->offset = state_offset = -state_offset;
543     }
544     }
545    
546     /* Check for a duplicate state with the same count, and skip if found. */
547    
548     for (j = 0; j < i; j++)
549     {
550     if (active_states[j].offset == state_offset &&
551     active_states[j].count == current_state->count)
552     {
553     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
554     goto NEXT_ACTIVE_STATE;
555     }
556     }
557    
558     /* The state offset is the offset to the opcode */
559    
560     code = start_code + state_offset;
561     codevalue = *code;
562    
563     /* If this opcode is followed by an inline character, load it. It is
564     tempting to test for the presence of a subject character here, but that
565     is wrong, because sometimes zero repetitions of the subject are
566     permitted.
567    
568     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
569 ph10 178 argument that is not a data character - but is always one byte long. We
570     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
571     this case. To keep the other cases fast, convert these ones to new opcodes.
572     */
573 nigel 77
574     if (coptable[codevalue] > 0)
575     {
576     dlen = 1;
577     #ifdef SUPPORT_UTF8
578     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
579     #endif /* SUPPORT_UTF8 */
580     d = code[coptable[codevalue]];
581     if (codevalue >= OP_TYPESTAR)
582     {
583 nigel 93 switch(d)
584     {
585     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
586     case OP_NOTPROP:
587     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
588     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
589     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
590 ph10 178 case OP_NOT_HSPACE:
591 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
592 ph10 178 case OP_NOT_VSPACE:
593 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
594 nigel 93 default: break;
595     }
596 nigel 77 }
597     }
598     else
599     {
600     dlen = 0; /* Not strictly necessary, but compilers moan */
601 nigel 93 d = NOTACHAR; /* if these variables are not set. */
602 nigel 77 }
603    
604    
605     /* Now process the individual opcodes */
606    
607     switch (codevalue)
608     {
609    
610     /* ========================================================================== */
611     /* Reached a closing bracket. If not at the end of the pattern, carry
612     on with the next opcode. Otherwise, unless we have an empty string and
613     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
614     matches so we always have the longest first. */
615    
616     case OP_KET:
617     case OP_KETRMIN:
618     case OP_KETRMAX:
619     if (code != end_code)
620     {
621     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
622     if (codevalue != OP_KET)
623     {
624     ADD_ACTIVE(state_offset - GET(code, 1), 0);
625     }
626     }
627     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
628     {
629     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
630     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
631     match_count = 0;
632     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
633     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
634     if (offsetcount >= 2)
635     {
636     offsets[0] = current_subject - start_subject;
637     offsets[1] = ptr - start_subject;
638     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
639     offsets[1] - offsets[0], current_subject));
640     }
641     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
642     {
643     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
644     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
645     match_count, rlevel*2-2, SP));
646     return match_count;
647     }
648     }
649     break;
650    
651     /* ========================================================================== */
652     /* These opcodes add to the current list of states without looking
653     at the current character. */
654    
655     /*-----------------------------------------------------------------*/
656     case OP_ALT:
657     do { code += GET(code, 1); } while (*code == OP_ALT);
658     ADD_ACTIVE(code - start_code, 0);
659     break;
660    
661     /*-----------------------------------------------------------------*/
662     case OP_BRA:
663 nigel 93 case OP_SBRA:
664 nigel 77 do
665     {
666     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
667     code += GET(code, 1);
668     }
669     while (*code == OP_ALT);
670     break;
671    
672     /*-----------------------------------------------------------------*/
673 nigel 93 case OP_CBRA:
674     case OP_SCBRA:
675     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
676     code += GET(code, 1);
677     while (*code == OP_ALT)
678     {
679     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
680     code += GET(code, 1);
681     }
682     break;
683    
684     /*-----------------------------------------------------------------*/
685 nigel 77 case OP_BRAZERO:
686     case OP_BRAMINZERO:
687     ADD_ACTIVE(state_offset + 1, 0);
688     code += 1 + GET(code, 2);
689     while (*code == OP_ALT) code += GET(code, 1);
690     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
691     break;
692    
693     /*-----------------------------------------------------------------*/
694 ph10 335 case OP_SKIPZERO:
695     code += 1 + GET(code, 2);
696     while (*code == OP_ALT) code += GET(code, 1);
697     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
698     break;
699    
700     /*-----------------------------------------------------------------*/
701 nigel 77 case OP_CIRC:
702     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
703 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
704     ptr != end_subject &&
705 nigel 93 WAS_NEWLINE(ptr)))
706 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
707     break;
708    
709     /*-----------------------------------------------------------------*/
710     case OP_EOD:
711     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
712     break;
713    
714     /*-----------------------------------------------------------------*/
715     case OP_OPT:
716     ims = code[1];
717     ADD_ACTIVE(state_offset + 2, 0);
718     break;
719    
720     /*-----------------------------------------------------------------*/
721     case OP_SOD:
722     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
723     break;
724    
725     /*-----------------------------------------------------------------*/
726     case OP_SOM:
727     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
728     break;
729    
730    
731     /* ========================================================================== */
732     /* These opcodes inspect the next subject character, and sometimes
733     the previous one as well, but do not have an argument. The variable
734     clen contains the length of the current character and is zero if we are
735     at the end of the subject. */
736    
737     /*-----------------------------------------------------------------*/
738     case OP_ANY:
739 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
740 nigel 77 { ADD_NEW(state_offset + 1, 0); }
741     break;
742    
743     /*-----------------------------------------------------------------*/
744 ph10 341 case OP_ALLANY:
745     if (clen > 0)
746     { ADD_NEW(state_offset + 1, 0); }
747     break;
748    
749     /*-----------------------------------------------------------------*/
750 nigel 77 case OP_EODN:
751 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
752 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
753     break;
754    
755     /*-----------------------------------------------------------------*/
756     case OP_DOLL:
757     if ((md->moptions & PCRE_NOTEOL) == 0)
758     {
759 nigel 91 if (clen == 0 ||
760 nigel 93 (IS_NEWLINE(ptr) &&
761 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
762     ))
763 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
764     }
765 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
766 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
767     break;
768    
769     /*-----------------------------------------------------------------*/
770    
771     case OP_DIGIT:
772     case OP_WHITESPACE:
773     case OP_WORDCHAR:
774     if (clen > 0 && c < 256 &&
775     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
776     { ADD_NEW(state_offset + 1, 0); }
777     break;
778    
779     /*-----------------------------------------------------------------*/
780     case OP_NOT_DIGIT:
781     case OP_NOT_WHITESPACE:
782     case OP_NOT_WORDCHAR:
783     if (clen > 0 && (c >= 256 ||
784     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
785     { ADD_NEW(state_offset + 1, 0); }
786     break;
787    
788     /*-----------------------------------------------------------------*/
789     case OP_WORD_BOUNDARY:
790     case OP_NOT_WORD_BOUNDARY:
791     {
792     int left_word, right_word;
793    
794     if (ptr > start_subject)
795     {
796     const uschar *temp = ptr - 1;
797     #ifdef SUPPORT_UTF8
798     if (utf8) BACKCHAR(temp);
799     #endif
800     GETCHARTEST(d, temp);
801     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
802     }
803     else left_word = 0;
804    
805     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
806     else right_word = 0;
807    
808     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
809     { ADD_ACTIVE(state_offset + 1, 0); }
810     }
811     break;
812    
813    
814     /*-----------------------------------------------------------------*/
815     /* Check the next character by Unicode property. We will get here only
816     if the support is in the binary; otherwise a compile-time error occurs.
817     */
818    
819 ph10 151 #ifdef SUPPORT_UCP
820 nigel 77 case OP_PROP:
821     case OP_NOTPROP:
822     if (clen > 0)
823     {
824 nigel 87 BOOL OK;
825 ph10 349 const ucd_record * prop = GET_UCD(c);
826 nigel 87 switch(code[1])
827 nigel 77 {
828 nigel 87 case PT_ANY:
829     OK = TRUE;
830     break;
831    
832     case PT_LAMP:
833 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
834 nigel 87 break;
835    
836     case PT_GC:
837 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
838 nigel 87 break;
839    
840     case PT_PC:
841 ph10 349 OK = prop->chartype == code[2];
842 nigel 87 break;
843    
844     case PT_SC:
845 ph10 349 OK = prop->script == code[2];
846 nigel 87 break;
847    
848     /* Should never occur, but keep compilers from grumbling. */
849    
850     default:
851     OK = codevalue != OP_PROP;
852     break;
853 nigel 77 }
854 nigel 87
855     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
856 nigel 77 }
857     break;
858     #endif
859    
860    
861    
862     /* ========================================================================== */
863     /* These opcodes likewise inspect the subject character, but have an
864     argument that is not a data character. It is one of these opcodes:
865 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
866     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
867 nigel 77
868     case OP_TYPEPLUS:
869     case OP_TYPEMINPLUS:
870 nigel 93 case OP_TYPEPOSPLUS:
871 nigel 77 count = current_state->count; /* Already matched */
872     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
873     if (clen > 0)
874     {
875     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
876     (c < 256 &&
877 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
878 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
879     {
880 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
881     {
882     active_count--; /* Remove non-match possibility */
883     next_active_state--;
884     }
885 nigel 77 count++;
886     ADD_NEW(state_offset, count);
887     }
888     }
889     break;
890    
891     /*-----------------------------------------------------------------*/
892     case OP_TYPEQUERY:
893     case OP_TYPEMINQUERY:
894 nigel 93 case OP_TYPEPOSQUERY:
895 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
896     if (clen > 0)
897     {
898     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
899     (c < 256 &&
900 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
901 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
902     {
903 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
904     {
905     active_count--; /* Remove non-match possibility */
906     next_active_state--;
907     }
908 nigel 77 ADD_NEW(state_offset + 2, 0);
909     }
910     }
911     break;
912    
913     /*-----------------------------------------------------------------*/
914     case OP_TYPESTAR:
915     case OP_TYPEMINSTAR:
916 nigel 93 case OP_TYPEPOSSTAR:
917 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
918     if (clen > 0)
919     {
920     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
921     (c < 256 &&
922 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
923 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
924     {
925 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
926     {
927     active_count--; /* Remove non-match possibility */
928     next_active_state--;
929     }
930 nigel 77 ADD_NEW(state_offset, 0);
931     }
932     }
933     break;
934    
935     /*-----------------------------------------------------------------*/
936     case OP_TYPEEXACT:
937 nigel 93 count = current_state->count; /* Number already matched */
938     if (clen > 0)
939     {
940     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
941     (c < 256 &&
942 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
943 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944     {
945     if (++count >= GET2(code, 1))
946     { ADD_NEW(state_offset + 4, 0); }
947     else
948     { ADD_NEW(state_offset, count); }
949     }
950     }
951     break;
952    
953     /*-----------------------------------------------------------------*/
954 nigel 77 case OP_TYPEUPTO:
955     case OP_TYPEMINUPTO:
956 nigel 93 case OP_TYPEPOSUPTO:
957     ADD_ACTIVE(state_offset + 4, 0);
958 nigel 77 count = current_state->count; /* Number already matched */
959     if (clen > 0)
960     {
961     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962     (c < 256 &&
963 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
964 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
965     {
966 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
967     {
968     active_count--; /* Remove non-match possibility */
969     next_active_state--;
970     }
971 nigel 77 if (++count >= GET2(code, 1))
972     { ADD_NEW(state_offset + 4, 0); }
973     else
974     { ADD_NEW(state_offset, count); }
975     }
976     }
977     break;
978    
979     /* ========================================================================== */
980     /* These are virtual opcodes that are used when something like
981 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, OP_VSPACE, OP_NOT_VSPACE, OP_HSPACE, or OP_NOT_HSPACE as its
982     argument. It keeps the code above fast for the other cases. The argument
983     is in the d variable. */
984 nigel 77
985 ph10 151 #ifdef SUPPORT_UCP
986 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
987     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
988 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
989 nigel 77 count = current_state->count; /* Already matched */
990 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
991 nigel 77 if (clen > 0)
992     {
993 nigel 87 BOOL OK;
994 ph10 349 const ucd_record * prop = GET_UCD(c);
995 nigel 87 switch(code[2])
996     {
997     case PT_ANY:
998     OK = TRUE;
999     break;
1000    
1001     case PT_LAMP:
1002 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1003 nigel 87 break;
1004    
1005     case PT_GC:
1006 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1007 nigel 87 break;
1008    
1009     case PT_PC:
1010 ph10 349 OK = prop->chartype == code[3];
1011 nigel 87 break;
1012    
1013     case PT_SC:
1014 ph10 349 OK = prop->script == code[3];
1015 nigel 87 break;
1016    
1017     /* Should never occur, but keep compilers from grumbling. */
1018    
1019     default:
1020     OK = codevalue != OP_PROP;
1021     break;
1022     }
1023    
1024 nigel 93 if (OK == (d == OP_PROP))
1025     {
1026     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1027     {
1028     active_count--; /* Remove non-match possibility */
1029     next_active_state--;
1030     }
1031     count++;
1032     ADD_NEW(state_offset, count);
1033     }
1034 nigel 77 }
1035     break;
1036    
1037     /*-----------------------------------------------------------------*/
1038     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1039     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1040 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1041 nigel 77 count = current_state->count; /* Already matched */
1042     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1043 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1044 nigel 77 {
1045     const uschar *nptr = ptr + clen;
1046     int ncount = 0;
1047 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1048     {
1049     active_count--; /* Remove non-match possibility */
1050     next_active_state--;
1051     }
1052 nigel 77 while (nptr < end_subject)
1053     {
1054     int nd;
1055     int ndlen = 1;
1056     GETCHARLEN(nd, nptr, ndlen);
1057 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1058 nigel 77 ncount++;
1059     nptr += ndlen;
1060     }
1061     count++;
1062     ADD_NEW_DATA(-state_offset, count, ncount);
1063     }
1064     break;
1065 ph10 151 #endif
1066 nigel 77
1067     /*-----------------------------------------------------------------*/
1068 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1069     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1070     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1071     count = current_state->count; /* Already matched */
1072     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1073     if (clen > 0)
1074     {
1075     int ncount = 0;
1076     switch (c)
1077     {
1078     case 0x000b:
1079     case 0x000c:
1080     case 0x0085:
1081     case 0x2028:
1082     case 0x2029:
1083 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1084     goto ANYNL01;
1085    
1086     case 0x000d:
1087     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1088     /* Fall through */
1089    
1090     ANYNL01:
1091     case 0x000a:
1092 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1093     {
1094     active_count--; /* Remove non-match possibility */
1095     next_active_state--;
1096     }
1097     count++;
1098     ADD_NEW_DATA(-state_offset, count, ncount);
1099     break;
1100 ph10 231
1101 nigel 93 default:
1102     break;
1103     }
1104     }
1105     break;
1106    
1107     /*-----------------------------------------------------------------*/
1108 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1109     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1110     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1111     count = current_state->count; /* Already matched */
1112     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113     if (clen > 0)
1114     {
1115 ph10 182 BOOL OK;
1116 ph10 178 switch (c)
1117     {
1118     case 0x000a:
1119     case 0x000b:
1120     case 0x000c:
1121     case 0x000d:
1122     case 0x0085:
1123     case 0x2028:
1124     case 0x2029:
1125     OK = TRUE;
1126 ph10 182 break;
1127 ph10 178
1128     default:
1129     OK = FALSE;
1130 ph10 182 break;
1131 ph10 178 }
1132    
1133     if (OK == (d == OP_VSPACE))
1134 ph10 182 {
1135 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1136     {
1137     active_count--; /* Remove non-match possibility */
1138     next_active_state--;
1139     }
1140     count++;
1141     ADD_NEW_DATA(-state_offset, count, 0);
1142     }
1143     }
1144     break;
1145    
1146     /*-----------------------------------------------------------------*/
1147     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1148     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1149     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1150     count = current_state->count; /* Already matched */
1151     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1152     if (clen > 0)
1153     {
1154 ph10 182 BOOL OK;
1155 ph10 178 switch (c)
1156     {
1157     case 0x09: /* HT */
1158     case 0x20: /* SPACE */
1159     case 0xa0: /* NBSP */
1160     case 0x1680: /* OGHAM SPACE MARK */
1161     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1162     case 0x2000: /* EN QUAD */
1163     case 0x2001: /* EM QUAD */
1164     case 0x2002: /* EN SPACE */
1165     case 0x2003: /* EM SPACE */
1166     case 0x2004: /* THREE-PER-EM SPACE */
1167     case 0x2005: /* FOUR-PER-EM SPACE */
1168     case 0x2006: /* SIX-PER-EM SPACE */
1169     case 0x2007: /* FIGURE SPACE */
1170     case 0x2008: /* PUNCTUATION SPACE */
1171     case 0x2009: /* THIN SPACE */
1172     case 0x200A: /* HAIR SPACE */
1173     case 0x202f: /* NARROW NO-BREAK SPACE */
1174     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1175     case 0x3000: /* IDEOGRAPHIC SPACE */
1176     OK = TRUE;
1177     break;
1178 ph10 182
1179 ph10 178 default:
1180     OK = FALSE;
1181     break;
1182     }
1183 ph10 182
1184 ph10 178 if (OK == (d == OP_HSPACE))
1185 ph10 182 {
1186 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1187     {
1188     active_count--; /* Remove non-match possibility */
1189     next_active_state--;
1190     }
1191     count++;
1192     ADD_NEW_DATA(-state_offset, count, 0);
1193     }
1194     }
1195     break;
1196    
1197     /*-----------------------------------------------------------------*/
1198 ph10 151 #ifdef SUPPORT_UCP
1199 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1200     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1201 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1202 nigel 87 count = 4;
1203 nigel 77 goto QS1;
1204    
1205     case OP_PROP_EXTRA + OP_TYPESTAR:
1206     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1207 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1208 nigel 77 count = 0;
1209    
1210     QS1:
1211    
1212 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1213 nigel 77 if (clen > 0)
1214     {
1215 nigel 87 BOOL OK;
1216 ph10 349 const ucd_record * prop = GET_UCD(c);
1217 nigel 87 switch(code[2])
1218     {
1219     case PT_ANY:
1220     OK = TRUE;
1221     break;
1222    
1223     case PT_LAMP:
1224 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1225 nigel 87 break;
1226    
1227     case PT_GC:
1228 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1229 nigel 87 break;
1230    
1231     case PT_PC:
1232 ph10 349 OK = prop->chartype == code[3];
1233 nigel 87 break;
1234    
1235     case PT_SC:
1236 ph10 349 OK = prop->script == code[3];
1237 nigel 87 break;
1238    
1239     /* Should never occur, but keep compilers from grumbling. */
1240    
1241     default:
1242     OK = codevalue != OP_PROP;
1243     break;
1244     }
1245    
1246 nigel 93 if (OK == (d == OP_PROP))
1247     {
1248     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1249     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1250     {
1251     active_count--; /* Remove non-match possibility */
1252     next_active_state--;
1253     }
1254     ADD_NEW(state_offset + count, 0);
1255     }
1256 nigel 77 }
1257     break;
1258    
1259     /*-----------------------------------------------------------------*/
1260     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1261     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1262 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1263 nigel 77 count = 2;
1264     goto QS2;
1265    
1266     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1267     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1268 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1269 nigel 77 count = 0;
1270    
1271     QS2:
1272    
1273     ADD_ACTIVE(state_offset + 2, 0);
1274 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1275 nigel 77 {
1276     const uschar *nptr = ptr + clen;
1277     int ncount = 0;
1278 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1279     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1280     {
1281     active_count--; /* Remove non-match possibility */
1282     next_active_state--;
1283     }
1284 nigel 77 while (nptr < end_subject)
1285     {
1286     int nd;
1287     int ndlen = 1;
1288     GETCHARLEN(nd, nptr, ndlen);
1289 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1290 nigel 77 ncount++;
1291     nptr += ndlen;
1292     }
1293     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1294     }
1295     break;
1296 ph10 151 #endif
1297 nigel 77
1298     /*-----------------------------------------------------------------*/
1299 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1300     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1301     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1302     count = 2;
1303     goto QS3;
1304    
1305     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1306     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1307     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1308     count = 0;
1309    
1310     QS3:
1311     ADD_ACTIVE(state_offset + 2, 0);
1312     if (clen > 0)
1313     {
1314     int ncount = 0;
1315     switch (c)
1316     {
1317     case 0x000b:
1318     case 0x000c:
1319     case 0x0085:
1320     case 0x2028:
1321     case 0x2029:
1322 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323     goto ANYNL02;
1324    
1325     case 0x000d:
1326     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327     /* Fall through */
1328    
1329     ANYNL02:
1330     case 0x000a:
1331 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1332     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1333     {
1334     active_count--; /* Remove non-match possibility */
1335     next_active_state--;
1336     }
1337     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1338     break;
1339 ph10 231
1340 nigel 93 default:
1341     break;
1342     }
1343     }
1344     break;
1345    
1346     /*-----------------------------------------------------------------*/
1347 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1348     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1349     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1350     count = 2;
1351     goto QS4;
1352    
1353     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1354     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1355     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1356     count = 0;
1357    
1358     QS4:
1359     ADD_ACTIVE(state_offset + 2, 0);
1360     if (clen > 0)
1361     {
1362 ph10 182 BOOL OK;
1363 ph10 178 switch (c)
1364     {
1365     case 0x000a:
1366     case 0x000b:
1367     case 0x000c:
1368     case 0x000d:
1369     case 0x0085:
1370     case 0x2028:
1371     case 0x2029:
1372     OK = TRUE;
1373     break;
1374 ph10 182
1375 ph10 178 default:
1376     OK = FALSE;
1377     break;
1378     }
1379     if (OK == (d == OP_VSPACE))
1380 ph10 182 {
1381 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1382     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1383     {
1384     active_count--; /* Remove non-match possibility */
1385     next_active_state--;
1386     }
1387     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1388     }
1389     }
1390     break;
1391    
1392     /*-----------------------------------------------------------------*/
1393     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1394     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1395     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1396     count = 2;
1397     goto QS5;
1398    
1399     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1400     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1401     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1402     count = 0;
1403    
1404     QS5:
1405     ADD_ACTIVE(state_offset + 2, 0);
1406     if (clen > 0)
1407     {
1408 ph10 182 BOOL OK;
1409 ph10 178 switch (c)
1410     {
1411     case 0x09: /* HT */
1412     case 0x20: /* SPACE */
1413     case 0xa0: /* NBSP */
1414     case 0x1680: /* OGHAM SPACE MARK */
1415     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1416     case 0x2000: /* EN QUAD */
1417     case 0x2001: /* EM QUAD */
1418     case 0x2002: /* EN SPACE */
1419     case 0x2003: /* EM SPACE */
1420     case 0x2004: /* THREE-PER-EM SPACE */
1421     case 0x2005: /* FOUR-PER-EM SPACE */
1422     case 0x2006: /* SIX-PER-EM SPACE */
1423     case 0x2007: /* FIGURE SPACE */
1424     case 0x2008: /* PUNCTUATION SPACE */
1425     case 0x2009: /* THIN SPACE */
1426     case 0x200A: /* HAIR SPACE */
1427     case 0x202f: /* NARROW NO-BREAK SPACE */
1428     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1429     case 0x3000: /* IDEOGRAPHIC SPACE */
1430     OK = TRUE;
1431     break;
1432 ph10 182
1433 ph10 178 default:
1434     OK = FALSE;
1435     break;
1436     }
1437 ph10 182
1438 ph10 178 if (OK == (d == OP_HSPACE))
1439 ph10 182 {
1440 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1441     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1442     {
1443     active_count--; /* Remove non-match possibility */
1444     next_active_state--;
1445     }
1446     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1447     }
1448     }
1449     break;
1450    
1451     /*-----------------------------------------------------------------*/
1452 ph10 151 #ifdef SUPPORT_UCP
1453 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1454     case OP_PROP_EXTRA + OP_TYPEUPTO:
1455     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1456 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1457 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1458 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1459 nigel 77 count = current_state->count; /* Number already matched */
1460     if (clen > 0)
1461     {
1462 nigel 87 BOOL OK;
1463 ph10 349 const ucd_record * prop = GET_UCD(c);
1464 nigel 87 switch(code[4])
1465 nigel 77 {
1466 nigel 87 case PT_ANY:
1467     OK = TRUE;
1468     break;
1469    
1470     case PT_LAMP:
1471 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1472 nigel 87 break;
1473    
1474     case PT_GC:
1475 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1476 nigel 87 break;
1477    
1478     case PT_PC:
1479 ph10 349 OK = prop->chartype == code[5];
1480 nigel 87 break;
1481    
1482     case PT_SC:
1483 ph10 349 OK = prop->script == code[5];
1484 nigel 87 break;
1485    
1486     /* Should never occur, but keep compilers from grumbling. */
1487    
1488     default:
1489     OK = codevalue != OP_PROP;
1490     break;
1491     }
1492    
1493     if (OK == (d == OP_PROP))
1494     {
1495 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1496     {
1497     active_count--; /* Remove non-match possibility */
1498     next_active_state--;
1499     }
1500 nigel 77 if (++count >= GET2(code, 1))
1501 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1502 nigel 77 else
1503     { ADD_NEW(state_offset, count); }
1504     }
1505     }
1506     break;
1507    
1508     /*-----------------------------------------------------------------*/
1509     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1510     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1511     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1512 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1513 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1514     { ADD_ACTIVE(state_offset + 4, 0); }
1515     count = current_state->count; /* Number already matched */
1516 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1517 nigel 77 {
1518     const uschar *nptr = ptr + clen;
1519     int ncount = 0;
1520 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1521     {
1522     active_count--; /* Remove non-match possibility */
1523     next_active_state--;
1524     }
1525 nigel 77 while (nptr < end_subject)
1526     {
1527     int nd;
1528     int ndlen = 1;
1529     GETCHARLEN(nd, nptr, ndlen);
1530 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1531 nigel 77 ncount++;
1532     nptr += ndlen;
1533     }
1534     if (++count >= GET2(code, 1))
1535     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1536     else
1537     { ADD_NEW_DATA(-state_offset, count, ncount); }
1538     }
1539     break;
1540 ph10 151 #endif
1541 nigel 77
1542 nigel 93 /*-----------------------------------------------------------------*/
1543     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1544     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1545     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1546     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1547     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1548     { ADD_ACTIVE(state_offset + 4, 0); }
1549     count = current_state->count; /* Number already matched */
1550     if (clen > 0)
1551     {
1552     int ncount = 0;
1553     switch (c)
1554     {
1555     case 0x000b:
1556     case 0x000c:
1557     case 0x0085:
1558     case 0x2028:
1559     case 0x2029:
1560 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1561     goto ANYNL03;
1562    
1563     case 0x000d:
1564     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1565     /* Fall through */
1566    
1567     ANYNL03:
1568     case 0x000a:
1569 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1570     {
1571     active_count--; /* Remove non-match possibility */
1572     next_active_state--;
1573     }
1574     if (++count >= GET2(code, 1))
1575     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1576     else
1577     { ADD_NEW_DATA(-state_offset, count, ncount); }
1578     break;
1579 ph10 231
1580 nigel 93 default:
1581     break;
1582     }
1583     }
1584     break;
1585    
1586 ph10 178 /*-----------------------------------------------------------------*/
1587     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1588     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1589     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1590     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1591     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1592     { ADD_ACTIVE(state_offset + 4, 0); }
1593     count = current_state->count; /* Number already matched */
1594     if (clen > 0)
1595     {
1596 ph10 182 BOOL OK;
1597 ph10 178 switch (c)
1598     {
1599     case 0x000a:
1600     case 0x000b:
1601     case 0x000c:
1602     case 0x000d:
1603     case 0x0085:
1604     case 0x2028:
1605     case 0x2029:
1606     OK = TRUE;
1607     break;
1608 ph10 182
1609 ph10 178 default:
1610     OK = FALSE;
1611     }
1612 ph10 182
1613 ph10 178 if (OK == (d == OP_VSPACE))
1614 ph10 182 {
1615 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1616     {
1617     active_count--; /* Remove non-match possibility */
1618     next_active_state--;
1619     }
1620     if (++count >= GET2(code, 1))
1621     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1622     else
1623     { ADD_NEW_DATA(-state_offset, count, 0); }
1624     }
1625     }
1626     break;
1627    
1628     /*-----------------------------------------------------------------*/
1629     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1630     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1631     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1632     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1633     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1634     { ADD_ACTIVE(state_offset + 4, 0); }
1635     count = current_state->count; /* Number already matched */
1636     if (clen > 0)
1637     {
1638 ph10 182 BOOL OK;
1639 ph10 178 switch (c)
1640     {
1641     case 0x09: /* HT */
1642     case 0x20: /* SPACE */
1643     case 0xa0: /* NBSP */
1644     case 0x1680: /* OGHAM SPACE MARK */
1645     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1646     case 0x2000: /* EN QUAD */
1647     case 0x2001: /* EM QUAD */
1648     case 0x2002: /* EN SPACE */
1649     case 0x2003: /* EM SPACE */
1650     case 0x2004: /* THREE-PER-EM SPACE */
1651     case 0x2005: /* FOUR-PER-EM SPACE */
1652     case 0x2006: /* SIX-PER-EM SPACE */
1653     case 0x2007: /* FIGURE SPACE */
1654     case 0x2008: /* PUNCTUATION SPACE */
1655     case 0x2009: /* THIN SPACE */
1656     case 0x200A: /* HAIR SPACE */
1657     case 0x202f: /* NARROW NO-BREAK SPACE */
1658     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1659     case 0x3000: /* IDEOGRAPHIC SPACE */
1660     OK = TRUE;
1661     break;
1662 ph10 182
1663 ph10 178 default:
1664     OK = FALSE;
1665     break;
1666     }
1667 ph10 182
1668 ph10 178 if (OK == (d == OP_HSPACE))
1669 ph10 182 {
1670 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1671     {
1672     active_count--; /* Remove non-match possibility */
1673     next_active_state--;
1674     }
1675     if (++count >= GET2(code, 1))
1676     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1677     else
1678     { ADD_NEW_DATA(-state_offset, count, 0); }
1679     }
1680     }
1681     break;
1682    
1683 nigel 77 /* ========================================================================== */
1684     /* These opcodes are followed by a character that is usually compared
1685     to the current subject character; it is loaded into d. We still get
1686     here even if there is no subject character, because in some cases zero
1687     repetitions are permitted. */
1688    
1689     /*-----------------------------------------------------------------*/
1690     case OP_CHAR:
1691     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1692     break;
1693    
1694     /*-----------------------------------------------------------------*/
1695     case OP_CHARNC:
1696     if (clen == 0) break;
1697    
1698     #ifdef SUPPORT_UTF8
1699     if (utf8)
1700     {
1701     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1702     {
1703 nigel 93 unsigned int othercase;
1704 nigel 77 if (c < 128) othercase = fcc[c]; else
1705    
1706     /* If we have Unicode property support, we can use it to test the
1707 nigel 87 other case of the character. */
1708 nigel 77
1709     #ifdef SUPPORT_UCP
1710 ph10 349 othercase = UCD_OTHERCASE(c);
1711 nigel 87 #else
1712 nigel 93 othercase = NOTACHAR;
1713 nigel 77 #endif
1714    
1715     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1716     }
1717     }
1718     else
1719     #endif /* SUPPORT_UTF8 */
1720    
1721     /* Non-UTF-8 mode */
1722     {
1723     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1724     }
1725     break;
1726    
1727    
1728     #ifdef SUPPORT_UCP
1729     /*-----------------------------------------------------------------*/
1730     /* This is a tricky one because it can match more than one character.
1731     Find out how many characters to skip, and then set up a negative state
1732     to wait for them to pass before continuing. */
1733    
1734     case OP_EXTUNI:
1735 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1736 nigel 77 {
1737     const uschar *nptr = ptr + clen;
1738     int ncount = 0;
1739     while (nptr < end_subject)
1740     {
1741     int nclen = 1;
1742     GETCHARLEN(c, nptr, nclen);
1743 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1744 nigel 77 ncount++;
1745     nptr += nclen;
1746     }
1747     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1748     }
1749     break;
1750     #endif
1751    
1752     /*-----------------------------------------------------------------*/
1753 nigel 93 /* This is tricky like EXTUNI because it too can match more than one
1754     character (when CR is followed by LF). In this case, set up a negative
1755     state to wait for one character to pass before continuing. */
1756    
1757     case OP_ANYNL:
1758     if (clen > 0) switch(c)
1759     {
1760     case 0x000b:
1761     case 0x000c:
1762     case 0x0085:
1763     case 0x2028:
1764     case 0x2029:
1765 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1766    
1767     case 0x000a:
1768 nigel 93 ADD_NEW(state_offset + 1, 0);
1769     break;
1770 ph10 231
1771 nigel 93 case 0x000d:
1772     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1773     {
1774     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1775     }
1776     else
1777     {
1778     ADD_NEW(state_offset + 1, 0);
1779     }
1780     break;
1781     }
1782     break;
1783    
1784     /*-----------------------------------------------------------------*/
1785 ph10 178 case OP_NOT_VSPACE:
1786     if (clen > 0) switch(c)
1787     {
1788     case 0x000a:
1789     case 0x000b:
1790     case 0x000c:
1791     case 0x000d:
1792     case 0x0085:
1793     case 0x2028:
1794     case 0x2029:
1795     break;
1796 ph10 182
1797     default:
1798 ph10 178 ADD_NEW(state_offset + 1, 0);
1799     break;
1800     }
1801     break;
1802    
1803     /*-----------------------------------------------------------------*/
1804     case OP_VSPACE:
1805     if (clen > 0) switch(c)
1806     {
1807     case 0x000a:
1808     case 0x000b:
1809     case 0x000c:
1810     case 0x000d:
1811     case 0x0085:
1812     case 0x2028:
1813     case 0x2029:
1814     ADD_NEW(state_offset + 1, 0);
1815     break;
1816 ph10 182
1817 ph10 178 default: break;
1818     }
1819     break;
1820    
1821     /*-----------------------------------------------------------------*/
1822     case OP_NOT_HSPACE:
1823     if (clen > 0) switch(c)
1824     {
1825     case 0x09: /* HT */
1826     case 0x20: /* SPACE */
1827     case 0xa0: /* NBSP */
1828     case 0x1680: /* OGHAM SPACE MARK */
1829     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1830     case 0x2000: /* EN QUAD */
1831     case 0x2001: /* EM QUAD */
1832     case 0x2002: /* EN SPACE */
1833     case 0x2003: /* EM SPACE */
1834     case 0x2004: /* THREE-PER-EM SPACE */
1835     case 0x2005: /* FOUR-PER-EM SPACE */
1836     case 0x2006: /* SIX-PER-EM SPACE */
1837     case 0x2007: /* FIGURE SPACE */
1838     case 0x2008: /* PUNCTUATION SPACE */
1839     case 0x2009: /* THIN SPACE */
1840     case 0x200A: /* HAIR SPACE */
1841     case 0x202f: /* NARROW NO-BREAK SPACE */
1842     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1843     case 0x3000: /* IDEOGRAPHIC SPACE */
1844     break;
1845 ph10 182
1846     default:
1847 ph10 178 ADD_NEW(state_offset + 1, 0);
1848     break;
1849     }
1850     break;
1851    
1852     /*-----------------------------------------------------------------*/
1853     case OP_HSPACE:
1854     if (clen > 0) switch(c)
1855     {
1856     case 0x09: /* HT */
1857     case 0x20: /* SPACE */
1858     case 0xa0: /* NBSP */
1859     case 0x1680: /* OGHAM SPACE MARK */
1860     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1861     case 0x2000: /* EN QUAD */
1862     case 0x2001: /* EM QUAD */
1863     case 0x2002: /* EN SPACE */
1864     case 0x2003: /* EM SPACE */
1865     case 0x2004: /* THREE-PER-EM SPACE */
1866     case 0x2005: /* FOUR-PER-EM SPACE */
1867     case 0x2006: /* SIX-PER-EM SPACE */
1868     case 0x2007: /* FIGURE SPACE */
1869     case 0x2008: /* PUNCTUATION SPACE */
1870     case 0x2009: /* THIN SPACE */
1871     case 0x200A: /* HAIR SPACE */
1872     case 0x202f: /* NARROW NO-BREAK SPACE */
1873     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1874     case 0x3000: /* IDEOGRAPHIC SPACE */
1875     ADD_NEW(state_offset + 1, 0);
1876     break;
1877     }
1878     break;
1879    
1880     /*-----------------------------------------------------------------*/
1881 nigel 77 /* Match a negated single character. This is only used for one-byte
1882     characters, that is, we know that d < 256. The character we are
1883     checking (c) can be multibyte. */
1884    
1885     case OP_NOT:
1886     if (clen > 0)
1887     {
1888 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1889 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1890     }
1891     break;
1892    
1893     /*-----------------------------------------------------------------*/
1894     case OP_PLUS:
1895     case OP_MINPLUS:
1896 nigel 93 case OP_POSPLUS:
1897 nigel 77 case OP_NOTPLUS:
1898     case OP_NOTMINPLUS:
1899 nigel 93 case OP_NOTPOSPLUS:
1900 nigel 77 count = current_state->count; /* Already matched */
1901     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1902     if (clen > 0)
1903     {
1904 nigel 93 unsigned int otherd = NOTACHAR;
1905 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1906     {
1907     #ifdef SUPPORT_UTF8
1908 nigel 87 if (utf8 && d >= 128)
1909 nigel 77 {
1910     #ifdef SUPPORT_UCP
1911 ph10 349 otherd = UCD_OTHERCASE(d);
1912 nigel 77 #endif /* SUPPORT_UCP */
1913     }
1914     else
1915     #endif /* SUPPORT_UTF8 */
1916     otherd = fcc[d];
1917     }
1918     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1919 nigel 93 {
1920     if (count > 0 &&
1921     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1922     {
1923     active_count--; /* Remove non-match possibility */
1924     next_active_state--;
1925     }
1926     count++;
1927     ADD_NEW(state_offset, count);
1928     }
1929 nigel 77 }
1930     break;
1931    
1932     /*-----------------------------------------------------------------*/
1933     case OP_QUERY:
1934     case OP_MINQUERY:
1935 nigel 93 case OP_POSQUERY:
1936 nigel 77 case OP_NOTQUERY:
1937     case OP_NOTMINQUERY:
1938 nigel 93 case OP_NOTPOSQUERY:
1939 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1940     if (clen > 0)
1941     {
1942 nigel 93 unsigned int otherd = NOTACHAR;
1943 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1944 nigel 77 {
1945     #ifdef SUPPORT_UTF8
1946 nigel 87 if (utf8 && d >= 128)
1947 nigel 77 {
1948     #ifdef SUPPORT_UCP
1949 ph10 349 otherd = UCD_OTHERCASE(d);
1950 nigel 77 #endif /* SUPPORT_UCP */
1951     }
1952     else
1953     #endif /* SUPPORT_UTF8 */
1954     otherd = fcc[d];
1955     }
1956     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1957 nigel 93 {
1958     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1959     {
1960     active_count--; /* Remove non-match possibility */
1961     next_active_state--;
1962     }
1963     ADD_NEW(state_offset + dlen + 1, 0);
1964     }
1965 nigel 77 }
1966     break;
1967    
1968     /*-----------------------------------------------------------------*/
1969     case OP_STAR:
1970     case OP_MINSTAR:
1971 nigel 93 case OP_POSSTAR:
1972 nigel 77 case OP_NOTSTAR:
1973     case OP_NOTMINSTAR:
1974 nigel 93 case OP_NOTPOSSTAR:
1975 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1976     if (clen > 0)
1977     {
1978 nigel 93 unsigned int otherd = NOTACHAR;
1979 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1980 nigel 77 {
1981     #ifdef SUPPORT_UTF8
1982 nigel 87 if (utf8 && d >= 128)
1983 nigel 77 {
1984     #ifdef SUPPORT_UCP
1985 ph10 349 otherd = UCD_OTHERCASE(d);
1986 nigel 77 #endif /* SUPPORT_UCP */
1987     }
1988     else
1989     #endif /* SUPPORT_UTF8 */
1990     otherd = fcc[d];
1991     }
1992     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1993 nigel 93 {
1994     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1995     {
1996     active_count--; /* Remove non-match possibility */
1997     next_active_state--;
1998     }
1999     ADD_NEW(state_offset, 0);
2000     }
2001 nigel 77 }
2002     break;
2003    
2004     /*-----------------------------------------------------------------*/
2005     case OP_EXACT:
2006 nigel 93 case OP_NOTEXACT:
2007     count = current_state->count; /* Number already matched */
2008     if (clen > 0)
2009     {
2010     unsigned int otherd = NOTACHAR;
2011     if ((ims & PCRE_CASELESS) != 0)
2012     {
2013     #ifdef SUPPORT_UTF8
2014     if (utf8 && d >= 128)
2015     {
2016     #ifdef SUPPORT_UCP
2017 ph10 349 otherd = UCD_OTHERCASE(d);
2018 nigel 93 #endif /* SUPPORT_UCP */
2019     }
2020     else
2021     #endif /* SUPPORT_UTF8 */
2022     otherd = fcc[d];
2023     }
2024     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2025     {
2026     if (++count >= GET2(code, 1))
2027     { ADD_NEW(state_offset + dlen + 3, 0); }
2028     else
2029     { ADD_NEW(state_offset, count); }
2030     }
2031     }
2032     break;
2033    
2034     /*-----------------------------------------------------------------*/
2035 nigel 77 case OP_UPTO:
2036     case OP_MINUPTO:
2037 nigel 93 case OP_POSUPTO:
2038 nigel 77 case OP_NOTUPTO:
2039     case OP_NOTMINUPTO:
2040 nigel 93 case OP_NOTPOSUPTO:
2041     ADD_ACTIVE(state_offset + dlen + 3, 0);
2042 nigel 77 count = current_state->count; /* Number already matched */
2043     if (clen > 0)
2044     {
2045 nigel 93 unsigned int otherd = NOTACHAR;
2046 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2047     {
2048     #ifdef SUPPORT_UTF8
2049 nigel 87 if (utf8 && d >= 128)
2050 nigel 77 {
2051     #ifdef SUPPORT_UCP
2052 ph10 349 otherd = UCD_OTHERCASE(d);
2053 nigel 77 #endif /* SUPPORT_UCP */
2054     }
2055     else
2056     #endif /* SUPPORT_UTF8 */
2057     otherd = fcc[d];
2058     }
2059     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2060     {
2061 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2062     {
2063     active_count--; /* Remove non-match possibility */
2064     next_active_state--;
2065     }
2066 nigel 77 if (++count >= GET2(code, 1))
2067     { ADD_NEW(state_offset + dlen + 3, 0); }
2068     else
2069     { ADD_NEW(state_offset, count); }
2070     }
2071     }
2072     break;
2073    
2074    
2075     /* ========================================================================== */
2076     /* These are the class-handling opcodes */
2077    
2078     case OP_CLASS:
2079     case OP_NCLASS:
2080     case OP_XCLASS:
2081     {
2082     BOOL isinclass = FALSE;
2083     int next_state_offset;
2084     const uschar *ecode;
2085    
2086     /* For a simple class, there is always just a 32-byte table, and we
2087     can set isinclass from it. */
2088    
2089     if (codevalue != OP_XCLASS)
2090     {
2091     ecode = code + 33;
2092     if (clen > 0)
2093     {
2094     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2095     ((code[1 + c/8] & (1 << (c&7))) != 0);
2096     }
2097     }
2098    
2099     /* An extended class may have a table or a list of single characters,
2100     ranges, or both, and it may be positive or negative. There's a
2101     function that sorts all this out. */
2102    
2103     else
2104     {
2105     ecode = code + GET(code, 1);
2106     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2107     }
2108    
2109     /* At this point, isinclass is set for all kinds of class, and ecode
2110     points to the byte after the end of the class. If there is a
2111     quantifier, this is where it will be. */
2112    
2113     next_state_offset = ecode - start_code;
2114    
2115     switch (*ecode)
2116     {
2117     case OP_CRSTAR:
2118     case OP_CRMINSTAR:
2119     ADD_ACTIVE(next_state_offset + 1, 0);
2120     if (isinclass) { ADD_NEW(state_offset, 0); }
2121     break;
2122    
2123     case OP_CRPLUS:
2124     case OP_CRMINPLUS:
2125     count = current_state->count; /* Already matched */
2126     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2127     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2128     break;
2129    
2130     case OP_CRQUERY:
2131     case OP_CRMINQUERY:
2132     ADD_ACTIVE(next_state_offset + 1, 0);
2133     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2134     break;
2135    
2136     case OP_CRRANGE:
2137     case OP_CRMINRANGE:
2138     count = current_state->count; /* Already matched */
2139     if (count >= GET2(ecode, 1))
2140     { ADD_ACTIVE(next_state_offset + 5, 0); }
2141     if (isinclass)
2142     {
2143 nigel 91 int max = GET2(ecode, 3);
2144     if (++count >= max && max != 0) /* Max 0 => no limit */
2145 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2146     else
2147     { ADD_NEW(state_offset, count); }
2148     }
2149     break;
2150    
2151     default:
2152     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2153     break;
2154     }
2155     }
2156     break;
2157    
2158     /* ========================================================================== */
2159     /* These are the opcodes for fancy brackets of various kinds. We have
2160 ph10 345 to use recursion in order to handle them. The "always failing" assertion
2161 ph10 341 (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2162     though the other "backtracking verbs" are not supported. */
2163 ph10 345
2164 ph10 341 case OP_FAIL:
2165 ph10 345 break;
2166 nigel 77
2167     case OP_ASSERT:
2168     case OP_ASSERT_NOT:
2169     case OP_ASSERTBACK:
2170     case OP_ASSERTBACK_NOT:
2171     {
2172     int rc;
2173     int local_offsets[2];
2174     int local_workspace[1000];
2175     const uschar *endasscode = code + GET(code, 1);
2176    
2177     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2178    
2179     rc = internal_dfa_exec(
2180     md, /* static match data */
2181     code, /* this subexpression's code */
2182     ptr, /* where we currently are */
2183     ptr - start_subject, /* start offset */
2184     local_offsets, /* offset vector */
2185     sizeof(local_offsets)/sizeof(int), /* size of same */
2186     local_workspace, /* workspace vector */
2187     sizeof(local_workspace)/sizeof(int), /* size of same */
2188     ims, /* the current ims flags */
2189     rlevel, /* function recursion level */
2190     recursing); /* pass on regex recursion */
2191    
2192     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2193     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2194     }
2195     break;
2196    
2197     /*-----------------------------------------------------------------*/
2198     case OP_COND:
2199 nigel 93 case OP_SCOND:
2200 nigel 77 {
2201     int local_offsets[1000];
2202     int local_workspace[1000];
2203     int condcode = code[LINK_SIZE+1];
2204    
2205 nigel 93 /* Back reference conditions are not supported */
2206 nigel 77
2207 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2208    
2209     /* The DEFINE condition is always false */
2210    
2211     if (condcode == OP_DEF)
2212 nigel 77 {
2213 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2214     }
2215    
2216     /* The only supported version of OP_RREF is for the value RREF_ANY,
2217     which means "test if in any recursion". We can't test for specifically
2218     recursed groups. */
2219    
2220     else if (condcode == OP_RREF)
2221     {
2222 nigel 77 int value = GET2(code, LINK_SIZE+2);
2223 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2224 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2225     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2226     }
2227    
2228     /* Otherwise, the condition is an assertion */
2229    
2230     else
2231     {
2232     int rc;
2233     const uschar *asscode = code + LINK_SIZE + 1;
2234     const uschar *endasscode = asscode + GET(asscode, 1);
2235    
2236     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2237    
2238     rc = internal_dfa_exec(
2239     md, /* fixed match data */
2240     asscode, /* this subexpression's code */
2241     ptr, /* where we currently are */
2242     ptr - start_subject, /* start offset */
2243     local_offsets, /* offset vector */
2244     sizeof(local_offsets)/sizeof(int), /* size of same */
2245     local_workspace, /* workspace vector */
2246     sizeof(local_workspace)/sizeof(int), /* size of same */
2247     ims, /* the current ims flags */
2248     rlevel, /* function recursion level */
2249     recursing); /* pass on regex recursion */
2250    
2251     if ((rc >= 0) ==
2252     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2253     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2254     else
2255     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2256     }
2257     }
2258     break;
2259    
2260     /*-----------------------------------------------------------------*/
2261     case OP_RECURSE:
2262     {
2263     int local_offsets[1000];
2264     int local_workspace[1000];
2265     int rc;
2266    
2267     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2268     recursing + 1));
2269    
2270     rc = internal_dfa_exec(
2271     md, /* fixed match data */
2272     start_code + GET(code, 1), /* this subexpression's code */
2273     ptr, /* where we currently are */
2274     ptr - start_subject, /* start offset */
2275     local_offsets, /* offset vector */
2276     sizeof(local_offsets)/sizeof(int), /* size of same */
2277     local_workspace, /* workspace vector */
2278     sizeof(local_workspace)/sizeof(int), /* size of same */
2279     ims, /* the current ims flags */
2280     rlevel, /* function recursion level */
2281     recursing + 1); /* regex recurse level */
2282    
2283     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2284     recursing + 1, rc));
2285    
2286     /* Ran out of internal offsets */
2287    
2288     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2289    
2290     /* For each successful matched substring, set up the next state with a
2291     count of characters to skip before trying it. Note that the count is in
2292     characters, not bytes. */
2293    
2294     if (rc > 0)
2295     {
2296     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2297     {
2298     const uschar *p = start_subject + local_offsets[rc];
2299     const uschar *pp = start_subject + local_offsets[rc+1];
2300     int charcount = local_offsets[rc+1] - local_offsets[rc];
2301     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2302     if (charcount > 0)
2303     {
2304     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2305     }
2306     else
2307     {
2308     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2309     }
2310     }
2311     }
2312     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2313     }
2314     break;
2315    
2316     /*-----------------------------------------------------------------*/
2317     case OP_ONCE:
2318     {
2319     int local_offsets[2];
2320     int local_workspace[1000];
2321    
2322     int rc = internal_dfa_exec(
2323     md, /* fixed match data */
2324     code, /* this subexpression's code */
2325     ptr, /* where we currently are */
2326     ptr - start_subject, /* start offset */
2327     local_offsets, /* offset vector */
2328     sizeof(local_offsets)/sizeof(int), /* size of same */
2329     local_workspace, /* workspace vector */
2330     sizeof(local_workspace)/sizeof(int), /* size of same */
2331     ims, /* the current ims flags */
2332     rlevel, /* function recursion level */
2333     recursing); /* pass on regex recursion */
2334    
2335     if (rc >= 0)
2336     {
2337     const uschar *end_subpattern = code;
2338     int charcount = local_offsets[1] - local_offsets[0];
2339     int next_state_offset, repeat_state_offset;
2340    
2341     do { end_subpattern += GET(end_subpattern, 1); }
2342     while (*end_subpattern == OP_ALT);
2343     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2344    
2345     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2346     arrange for the repeat state also to be added to the relevant list.
2347     Calculate the offset, or set -1 for no repeat. */
2348    
2349     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2350     *end_subpattern == OP_KETRMIN)?
2351     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2352    
2353     /* If we have matched an empty string, add the next state at the
2354     current character pointer. This is important so that the duplicate
2355     checking kicks in, which is what breaks infinite loops that match an
2356     empty string. */
2357    
2358     if (charcount == 0)
2359     {
2360     ADD_ACTIVE(next_state_offset, 0);
2361     }
2362    
2363     /* Optimization: if there are no more active states, and there
2364     are no new states yet set up, then skip over the subject string
2365     right here, to save looping. Otherwise, set up the new state to swing
2366     into action when the end of the substring is reached. */
2367    
2368     else if (i + 1 >= active_count && new_count == 0)
2369     {
2370     ptr += charcount;
2371     clen = 0;
2372     ADD_NEW(next_state_offset, 0);
2373    
2374     /* If we are adding a repeat state at the new character position,
2375     we must fudge things so that it is the only current state.
2376     Otherwise, it might be a duplicate of one we processed before, and
2377     that would cause it to be skipped. */
2378    
2379     if (repeat_state_offset >= 0)
2380     {
2381     next_active_state = active_states;
2382     active_count = 0;
2383     i = -1;
2384     ADD_ACTIVE(repeat_state_offset, 0);
2385     }
2386     }
2387     else
2388     {
2389     const uschar *p = start_subject + local_offsets[0];
2390     const uschar *pp = start_subject + local_offsets[1];
2391     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2392     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2393     if (repeat_state_offset >= 0)
2394     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2395     }
2396    
2397     }
2398     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2399     }
2400     break;
2401    
2402    
2403     /* ========================================================================== */
2404     /* Handle callouts */
2405    
2406     case OP_CALLOUT:
2407     if (pcre_callout != NULL)
2408     {
2409     int rrc;
2410     pcre_callout_block cb;
2411     cb.version = 1; /* Version 1 of the callout block */
2412     cb.callout_number = code[1];
2413     cb.offset_vector = offsets;
2414 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2415 nigel 77 cb.subject_length = end_subject - start_subject;
2416     cb.start_match = current_subject - start_subject;
2417     cb.current_position = ptr - start_subject;
2418     cb.pattern_position = GET(code, 2);
2419     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2420     cb.capture_top = 1;
2421     cb.capture_last = -1;
2422     cb.callout_data = md->callout_data;
2423     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2424     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2425     }
2426     break;
2427    
2428    
2429     /* ========================================================================== */
2430     default: /* Unsupported opcode */
2431     return PCRE_ERROR_DFA_UITEM;
2432     }
2433    
2434     NEXT_ACTIVE_STATE: continue;
2435    
2436     } /* End of loop scanning active states */
2437    
2438     /* We have finished the processing at the current subject character. If no
2439     new states have been set for the next character, we have found all the
2440     matches that we are going to find. If we are at the top level and partial
2441     matching has been requested, check for appropriate conditions. */
2442    
2443     if (new_count <= 0)
2444     {
2445     if (match_count < 0 && /* No matches found */
2446     rlevel == 1 && /* Top level match function */
2447     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2448     ptr >= end_subject && /* Reached end of subject */
2449     ptr > current_subject) /* Matched non-empty string */
2450     {
2451     if (offsetcount >= 2)
2452     {
2453     offsets[0] = current_subject - start_subject;
2454     offsets[1] = end_subject - start_subject;
2455     }
2456     match_count = PCRE_ERROR_PARTIAL;
2457     }
2458    
2459     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2460     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2461     rlevel*2-2, SP));
2462 nigel 91 break; /* In effect, "return", but see the comment below */
2463 nigel 77 }
2464    
2465     /* One or more states are active for the next character. */
2466    
2467     ptr += clen; /* Advance to next subject character */
2468     } /* Loop to move along the subject string */
2469    
2470 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2471     if we use "return" above, we have compiler trouble. Some compilers warn if
2472     there's nothing here because they think the function doesn't return a value. On
2473     the other hand, if we put a dummy statement here, some more clever compilers
2474     complain that it can't be reached. Sigh. */
2475 nigel 77
2476 nigel 91 return match_count;
2477 nigel 77 }
2478    
2479    
2480    
2481    
2482     /*************************************************
2483     * Execute a Regular Expression - DFA engine *
2484     *************************************************/
2485    
2486     /* This external function applies a compiled re to a subject string using a DFA
2487     engine. This function calls the internal function multiple times if the pattern
2488     is not anchored.
2489    
2490     Arguments:
2491     argument_re points to the compiled expression
2492 ph10 97 extra_data points to extra data or is NULL
2493 nigel 77 subject points to the subject string
2494     length length of subject string (may contain binary zeros)
2495     start_offset where to start in the subject string
2496     options option bits
2497     offsets vector of match offsets
2498     offsetcount size of same
2499     workspace workspace vector
2500     wscount size of same
2501    
2502     Returns: > 0 => number of match offset pairs placed in offsets
2503     = 0 => offsets overflowed; longest matches are present
2504     -1 => failed to match
2505     < -1 => some kind of unexpected problem
2506     */
2507    
2508 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2509 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2510     const char *subject, int length, int start_offset, int options, int *offsets,
2511     int offsetcount, int *workspace, int wscount)
2512     {
2513     real_pcre *re = (real_pcre *)argument_re;
2514     dfa_match_data match_block;
2515 nigel 91 dfa_match_data *md = &match_block;
2516 nigel 77 BOOL utf8, anchored, startline, firstline;
2517     const uschar *current_subject, *end_subject, *lcc;
2518    
2519     pcre_study_data internal_study;
2520     const pcre_study_data *study = NULL;
2521     real_pcre internal_re;
2522    
2523     const uschar *req_byte_ptr;
2524     const uschar *start_bits = NULL;
2525     BOOL first_byte_caseless = FALSE;
2526     BOOL req_byte_caseless = FALSE;
2527     int first_byte = -1;
2528     int req_byte = -1;
2529     int req_byte2 = -1;
2530 nigel 91 int newline;
2531 nigel 77
2532     /* Plausibility checks */
2533    
2534     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2535     if (re == NULL || subject == NULL || workspace == NULL ||
2536     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2537     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2538     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2539    
2540     /* We need to find the pointer to any study data before we test for byte
2541     flipping, so we scan the extra_data block first. This may set two fields in the
2542     match block, so we must initialize them beforehand. However, the other fields
2543     in the match block must not be set until after the byte flipping. */
2544    
2545 nigel 91 md->tables = re->tables;
2546     md->callout_data = NULL;
2547 nigel 77
2548     if (extra_data != NULL)
2549     {
2550     unsigned int flags = extra_data->flags;
2551     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2552     study = (const pcre_study_data *)extra_data->study_data;
2553     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2554 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2555     return PCRE_ERROR_DFA_UMLIMIT;
2556 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2557 nigel 91 md->callout_data = extra_data->callout_data;
2558 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2559 nigel 91 md->tables = extra_data->tables;
2560 nigel 77 }
2561    
2562     /* Check that the first field in the block is the magic number. If it is not,
2563     test for a regex that was compiled on a host of opposite endianness. If this is
2564     the case, flipped values are put in internal_re and internal_study if there was
2565     study data too. */
2566    
2567     if (re->magic_number != MAGIC_NUMBER)
2568     {
2569     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2570     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2571     if (study != NULL) study = &internal_study;
2572     }
2573    
2574     /* Set some local values */
2575    
2576     current_subject = (const unsigned char *)subject + start_offset;
2577     end_subject = (const unsigned char *)subject + length;
2578     req_byte_ptr = current_subject - 1;
2579    
2580 nigel 91 #ifdef SUPPORT_UTF8
2581 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2582 nigel 91 #else
2583     utf8 = FALSE;
2584     #endif
2585 nigel 77
2586 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2587     (re->options & PCRE_ANCHORED) != 0;
2588    
2589 nigel 77 /* The remaining fixed data for passing around. */
2590    
2591 nigel 91 md->start_code = (const uschar *)argument_re +
2592 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2593 nigel 91 md->start_subject = (const unsigned char *)subject;
2594     md->end_subject = end_subject;
2595     md->moptions = options;
2596     md->poptions = re->options;
2597 nigel 77
2598 ph10 231 /* If the BSR option is not set at match time, copy what was set
2599     at compile time. */
2600    
2601     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2602     {
2603     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2604     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2605     #ifdef BSR_ANYCRLF
2606     else md->moptions |= PCRE_BSR_ANYCRLF;
2607 ph10 243 #endif
2608     }
2609 ph10 231
2610 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2611     nothing is set at run time, whatever was used at compile time applies. */
2612 nigel 91
2613 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2614 nigel 93 PCRE_NEWLINE_BITS)
2615 nigel 91 {
2616 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2617 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
2618     case PCRE_NEWLINE_LF: newline = '\n'; break;
2619     case PCRE_NEWLINE_CR+
2620     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2621 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2622 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2623 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2624 nigel 91 }
2625    
2626 ph10 149 if (newline == -2)
2627 nigel 91 {
2628 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2629     }
2630     else if (newline < 0)
2631     {
2632 nigel 93 md->nltype = NLTYPE_ANY;
2633 nigel 91 }
2634     else
2635     {
2636 nigel 93 md->nltype = NLTYPE_FIXED;
2637     if (newline > 255)
2638     {
2639     md->nllen = 2;
2640     md->nl[0] = (newline >> 8) & 255;
2641     md->nl[1] = newline & 255;
2642     }
2643     else
2644     {
2645     md->nllen = 1;
2646     md->nl[0] = newline;
2647     }
2648 nigel 91 }
2649    
2650 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2651     back the character offset. */
2652    
2653     #ifdef SUPPORT_UTF8
2654     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2655     {
2656     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2657     return PCRE_ERROR_BADUTF8;
2658     if (start_offset > 0 && start_offset < length)
2659     {
2660     int tb = ((uschar *)subject)[start_offset];
2661     if (tb > 127)
2662     {
2663     tb &= 0xc0;
2664     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2665     }
2666     }
2667     }
2668     #endif
2669    
2670     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2671     is a feature that makes it possible to save compiled regex and re-use them
2672     in other programs later. */
2673    
2674 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2675 nigel 77
2676     /* The lower casing table and the "must be at the start of a line" flag are
2677     used in a loop when finding where to start. */
2678    
2679 nigel 91 lcc = md->tables + lcc_offset;
2680 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2681 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2682    
2683     /* Set up the first character to match, if available. The first_byte value is
2684     never set for an anchored regular expression, but the anchoring may be forced
2685     at run time, so we have to test for anchoring. The first char may be unset for
2686     an unanchored pattern, of course. If there's no first char and the pattern was
2687     studied, there may be a bitmap of possible first characters. */
2688    
2689     if (!anchored)
2690     {
2691 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2692 nigel 77 {
2693     first_byte = re->first_byte & 255;
2694     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2695     first_byte = lcc[first_byte];
2696     }
2697     else
2698     {
2699     if (startline && study != NULL &&
2700     (study->options & PCRE_STUDY_MAPPED) != 0)
2701     start_bits = study->start_bits;
2702     }
2703     }
2704    
2705     /* For anchored or unanchored matches, there may be a "last known required
2706     character" set. */
2707    
2708 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2709 nigel 77 {
2710     req_byte = re->req_byte & 255;
2711     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2712 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2713 nigel 77 }
2714    
2715     /* Call the main matching function, looping for a non-anchored regex after a
2716     failed match. Unless restarting, optimize by moving to the first match
2717     character if possible, when not anchored. Then unless wanting a partial match,
2718     check for a required later character. */
2719    
2720     for (;;)
2721     {
2722     int rc;
2723    
2724     if ((options & PCRE_DFA_RESTART) == 0)
2725     {
2726     const uschar *save_end_subject = end_subject;
2727    
2728     /* Advance to a unique first char if possible. If firstline is TRUE, the
2729     start of the match is constrained to the first line of a multiline string.
2730 nigel 87 Implement this by temporarily adjusting end_subject so that we stop
2731     scanning at a newline. If the match fails at the newline, later code breaks
2732     this loop. */
2733 nigel 77
2734     if (firstline)
2735     {
2736     const uschar *t = current_subject;
2737 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2738 nigel 77 end_subject = t;
2739     }
2740    
2741     if (first_byte >= 0)
2742     {
2743     if (first_byte_caseless)
2744     while (current_subject < end_subject &&
2745     lcc[*current_subject] != first_byte)
2746     current_subject++;
2747     else
2748     while (current_subject < end_subject && *current_subject != first_byte)
2749     current_subject++;
2750     }
2751    
2752 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
2753 nigel 77
2754     else if (startline)
2755     {
2756 nigel 93 if (current_subject > md->start_subject + start_offset)
2757 nigel 77 {
2758 ph10 361 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2759 nigel 77 current_subject++;
2760 ph10 130
2761 ph10 149 /* If we have just passed a CR and the newline option is ANY or
2762     ANYCRLF, and we are now at a LF, advance the match position by one more
2763     character. */
2764 ph10 134
2765 ph10 130 if (current_subject[-1] == '\r' &&
2766 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2767 ph10 130 current_subject < end_subject &&
2768     *current_subject == '\n')
2769     current_subject++;
2770 nigel 77 }
2771     }
2772    
2773     /* Or to a non-unique first char after study */
2774    
2775     else if (start_bits != NULL)
2776     {
2777     while (current_subject < end_subject)
2778     {
2779     register unsigned int c = *current_subject;
2780     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2781     else break;
2782     }
2783     }
2784    
2785     /* Restore fudged end_subject */
2786    
2787     end_subject = save_end_subject;
2788     }
2789    
2790     /* If req_byte is set, we know that that character must appear in the subject
2791     for the match to succeed. If the first character is set, req_byte must be
2792     later in the subject; otherwise the test starts at the match point. This
2793     optimization can save a huge amount of work in patterns with nested unlimited
2794     repeats that aren't going to match. Writing separate code for cased/caseless
2795     versions makes it go faster, as does using an autoincrement and backing off
2796     on a match.
2797    
2798     HOWEVER: when the subject string is very, very long, searching to its end can
2799     take a long time, and give bad performance on quite ordinary patterns. This
2800     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2801     don't do this when the string is sufficiently long.
2802    
2803     ALSO: this processing is disabled when partial matching is requested.
2804     */
2805    
2806     if (req_byte >= 0 &&
2807     end_subject - current_subject < REQ_BYTE_MAX &&
2808     (options & PCRE_PARTIAL) == 0)
2809     {
2810     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2811    
2812     /* We don't need to repeat the search if we haven't yet reached the
2813     place we found it at last time. */
2814    
2815     if (p > req_byte_ptr)
2816     {
2817     if (req_byte_caseless)
2818     {
2819     while (p < end_subject)
2820     {
2821     register int pp = *p++;
2822     if (pp == req_byte || pp == req_byte2) { p--; break; }
2823     }
2824     }
2825     else
2826     {
2827     while (p < end_subject)
2828     {
2829     if (*p++ == req_byte) { p--; break; }
2830     }
2831     }
2832    
2833     /* If we can't find the required character, break the matching loop,
2834     which will cause a return of PCRE_ERROR_NOMATCH. */
2835    
2836     if (p >= end_subject) break;
2837    
2838     /* If we have found the required character, save the point where we
2839     found it, so that we don't search again next time round the loop if
2840     the start hasn't passed this character yet. */
2841    
2842     req_byte_ptr = p;
2843     }
2844     }
2845    
2846     /* OK, now we can do the business */
2847    
2848     rc = internal_dfa_exec(
2849 nigel 91 md, /* fixed match data */
2850     md->start_code, /* this subexpression's code */
2851     current_subject, /* where we currently are */
2852     start_offset, /* start offset in subject */
2853     offsets, /* offset vector */
2854     offsetcount, /* size of same */
2855     workspace, /* workspace vector */
2856     wscount, /* size of same */
2857 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2858 nigel 91 0, /* function recurse level */
2859     0); /* regex recurse level */
2860 nigel 77
2861     /* Anything other than "no match" means we are done, always; otherwise, carry
2862     on only if not anchored. */
2863    
2864     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2865    
2866     /* Advance to the next subject character unless we are at the end of a line
2867     and firstline is set. */
2868    
2869 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2870 nigel 77 current_subject++;
2871     if (utf8)
2872     {
2873     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2874     current_subject++;
2875     }
2876     if (current_subject > end_subject) break;
2877    
2878 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2879 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2880     or ANY or ANYCRLF, advance the match position by one more character. */
2881 nigel 93
2882     if (current_subject[-1] == '\r' &&
2883 ph10 226 current_subject < end_subject &&
2884     *current_subject == '\n' &&
2885 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2886 ph10 226 (md->nltype == NLTYPE_ANY ||
2887     md->nltype == NLTYPE_ANYCRLF ||
2888     md->nllen == 2))
2889 nigel 93 current_subject++;
2890    
2891     } /* "Bumpalong" loop */
2892    
2893 nigel 77 return PCRE_ERROR_NOMATCH;
2894     }
2895    
2896     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12