/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 391 - (hide annotations) (download)
Tue Mar 17 21:16:01 2009 UTC (5 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 96628 byte(s)
Add support for UTF-8 in EBCDIC environments.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6 ph10 383 and semantics are as close as possible to those of the Perl 5 language (but see
7     below for why this module is different).
8 nigel 77
9     Written by Philip Hazel
10 ph10 383 Copyright (c) 1997-2009 University of Cambridge
11 nigel 77
12     -----------------------------------------------------------------------------
13     Redistribution and use in source and binary forms, with or without
14     modification, are permitted provided that the following conditions are met:
15    
16     * Redistributions of source code must retain the above copyright notice,
17     this list of conditions and the following disclaimer.
18    
19     * Redistributions in binary form must reproduce the above copyright
20     notice, this list of conditions and the following disclaimer in the
21     documentation and/or other materials provided with the distribution.
22    
23     * Neither the name of the University of Cambridge nor the names of its
24     contributors may be used to endorse or promote products derived from
25     this software without specific prior written permission.
26    
27     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37     POSSIBILITY OF SUCH DAMAGE.
38     -----------------------------------------------------------------------------
39     */
40    
41    
42     /* This module contains the external function pcre_dfa_exec(), which is an
43 nigel 93 alternative matching function that uses a sort of DFA algorithm (not a true
44     FSM). This is NOT Perl-compatible, but it has advantages in certain
45     applications. */
46 nigel 77
47    
48 ph10 200 #ifdef HAVE_CONFIG_H
49 ph10 236 #include "config.h"
50 ph10 200 #endif
51 ph10 199
52 nigel 93 #define NLBLOCK md /* Block containing newline information */
53     #define PSSTART start_subject /* Field containing processed string start */
54     #define PSEND end_subject /* Field containing processed string end */
55    
56 nigel 77 #include "pcre_internal.h"
57    
58    
/* Padding source for indenting debugging output. It is always printed with
"%.*s" and a precision of rlevel*2-2, which emits at most that many characters
and never more than the string holds. The string must therefore be a long run
of spaces: a one-character string would cap every nesting level at a single
column of indentation. The run below is long enough for about twenty levels;
deeper levels are simply indented by the full run. */

#define SP "                                        "
62    
63    
64    
65     /*************************************************
66     * Code parameters and static tables *
67     *************************************************/
68    
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. When such an opcode's one-byte operand
is one of the items below, the matching offset is added to the opcode value
before it is dispatched:

  OP_PROP, OP_NOTPROP            =>  OP_PROP_EXTRA
  OP_EXTUNI                      =>  OP_EXTUNI_EXTRA
  OP_ANYNL                       =>  OP_ANYNL_EXTRA
  OP_HSPACE, OP_NOT_HSPACE       =>  OP_HSPACE_EXTRA
  OP_VSPACE, OP_NOT_VSPACE       =>  OP_VSPACE_EXTRA

A gap of 20 between the blocks should be enough. The resulting opcodes don't
have to be less than 256 because they are never stored, so we push them well
clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
79 nigel 77
80    
81     /* This table identifies those opcodes that are followed immediately by a
82     character that is to be tested in some way. This makes is possible to
83     centralize the loading of these characters. In the case of Type * etc, the
84     "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
85 ph10 172 small value. ***NOTE*** If the start of this table is modified, the two tables
86 ph10 168 that follow must also be modified. */
87 nigel 77
88 ph10 327 static const uschar coptable[] = {
89 nigel 77 0, /* End */
90 ph10 168 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
91     0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
92 ph10 341 0, 0, 0, /* Any, AllAny, Anybyte */
93 ph10 178 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
94     0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
95 nigel 77 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
96     1, /* Char */
97     1, /* Charnc */
98     1, /* not */
99     /* Positive single-char repeats */
100     1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
101     3, 3, 3, /* upto, minupto, exact */
102 nigel 93 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
103 nigel 77 /* Negative single-char repeats - only for chars < 256 */
104     1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
105     3, 3, 3, /* NOT upto, minupto, exact */
106 nigel 93 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
107 nigel 77 /* Positive type repeats */
108     1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
109     3, 3, 3, /* Type upto, minupto, exact */
110 nigel 93 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
111 nigel 77 /* Character class & ref repeats */
112     0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
113     0, 0, /* CRRANGE, CRMINRANGE */
114     0, /* CLASS */
115     0, /* NCLASS */
116     0, /* XCLASS - variable length */
117     0, /* REF */
118     0, /* RECURSE */
119     0, /* CALLOUT */
120     0, /* Alt */
121     0, /* Ket */
122     0, /* KetRmax */
123     0, /* KetRmin */
124     0, /* Assert */
125     0, /* Assert not */
126     0, /* Assert behind */
127     0, /* Assert behind not */
128     0, /* Reverse */
129 nigel 93 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
130     0, 0, 0, /* SBRA, SCBRA, SCOND */
131 nigel 77 0, /* CREF */
132 nigel 93 0, /* RREF */
133     0, /* DEF */
134 ph10 210 0, 0, /* BRAZERO, BRAMINZERO */
135     0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
136 ph10 341 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
137 nigel 77 };
138    
139     /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
140     and \w */
141    
142 ph10 327 static const uschar toptable1[] = {
143 ph10 168 0, 0, 0, 0, 0, 0,
144 nigel 77 ctype_digit, ctype_digit,
145     ctype_space, ctype_space,
146     ctype_word, ctype_word,
147 ph10 341 0, 0 /* OP_ANY, OP_ALLANY */
148 nigel 77 };
149    
150 ph10 327 static const uschar toptable2[] = {
151 ph10 168 0, 0, 0, 0, 0, 0,
152 nigel 77 ctype_digit, 0,
153     ctype_space, 0,
154     ctype_word, 0,
155 ph10 341 1, 1 /* OP_ANY, OP_ALLANY */
156 nigel 77 };
157    
158    
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The value is deliberately cast to
int: sizeof yields an unsigned type, and without the cast any arithmetic that
mixes this macro with a signed workspace count is carried out unsigned. A
workspace count that has gone negative would then be turned into a nonsense
capacity instead of a value that is <= 0. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
172    
173    
174     #ifdef DEBUG
/*************************************************
*            Print character string              *
*************************************************/

/* Character string printing function for debugging. Each byte for which
isprint() is true is written as itself; every other byte is written as a
backslash, "x", and two lower-case hexadecimal digits. Exactly length bytes are
examined, so the string need not be zero-terminated. Nothing is written when
length is zero or negative. The string is only read, hence the const pointer,
which lets callers pass const data without casting the qualifier away.

Arguments:
  p           points to string
  length      number of bytes
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
const unsigned char *end;

if (length <= 0) return;

for (end = p + length; p < end; p++)
  {
  int c = *p;
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
201     #endif
202    
203    
204    
205     /*************************************************
206     * Execute a Regular Expression - DFA engine *
207     *************************************************/
208    
209     /* This internal function applies a compiled pattern to a subject string,
210     starting at a given point, using a DFA engine. This function is called from the
211     external one, possibly multiple times if the pattern is not anchored. The
212     function calls itself recursively for some kinds of subpattern.
213    
214     Arguments:
215     md the match_data block with fixed information
216     this_start_code the opening bracket of this subexpression's code
217     current_subject where we currently are in the subject string
218     start_offset start offset in the subject string
219     offsets vector to contain the matching string offsets
220     offsetcount size of same
221     workspace vector of workspace
222     wscount size of same
223     ims the current ims flags
224     rlevel function call recursion level
225     recursing regex recursive call level
226    
227 ph10 345 Returns: > 0 => number of match offset pairs placed in offsets
228 ph10 341 = 0 => offsets overflowed; longest matches are present
229 nigel 77 -1 => failed to match
230     < -1 => some kind of unexpected problem
231    
232     The following macros are used for adding states to the two state vectors (one
233     for the current character, one for the following character). */
234    
/* State-adding macros. ADD_ACTIVE and ADD_ACTIVE_DATA append a state to the
list for the current character; ADD_NEW and ADD_NEW_DATA append a state to the
list for the following character.

  x   offset to store in the state
  y   repeat count to store in the state
  z   (the _DATA forms only) value for the state's data field

The macros refer implicitly to variables of the function that uses them:
active_count or new_count, next_active_state or next_new_state, wscount, ims,
and (in debugging output only) rlevel. The state's ims field is always taken
from the current value of ims. The forms without _DATA leave the data field of
the slot untouched.

If the relevant list is already full (its count is not less than wscount), the
macro makes the enclosing function return PCRE_ERROR_DFA_WSSIZE; the count is
incremented in that case as well. Each macro is a single do-while statement, so
it can be used, followed by a semicolon, anywhere a statement is allowed
without needing protective braces. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
280    
281     /* And now, here is the code */
282    
283     static int
284     internal_dfa_exec(
285     dfa_match_data *md,
286     const uschar *this_start_code,
287     const uschar *current_subject,
288     int start_offset,
289     int *offsets,
290     int offsetcount,
291     int *workspace,
292     int wscount,
293     int ims,
294     int rlevel,
295     int recursing)
296     {
297     stateblock *active_states, *new_states, *temp_states;
298     stateblock *next_active_state, *next_new_state;
299    
300     const uschar *ctypes, *lcc, *fcc;
301     const uschar *ptr;
302 nigel 93 const uschar *end_code, *first_op;
303 nigel 77
304     int active_count, new_count, match_count;
305    
306     /* Some fields in the md block are frequently referenced, so we load them into
307     independent variables in the hope that this will perform better. */
308    
309     const uschar *start_subject = md->start_subject;
310     const uschar *end_subject = md->end_subject;
311     const uschar *start_code = md->start_code;
312    
313 nigel 87 #ifdef SUPPORT_UTF8
314 nigel 77 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
315 nigel 93 #else
316     BOOL utf8 = FALSE;
317 nigel 87 #endif
318 nigel 77
319     rlevel++;
320     offsetcount &= (-2);
321    
322     wscount -= 2;
323     wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
324     (2 * INTS_PER_STATEBLOCK);
325    
326     DPRINTF(("\n%.*s---------------------\n"
327     "%.*sCall to internal_dfa_exec f=%d r=%d\n",
328     rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
329    
330     ctypes = md->tables + ctypes_offset;
331     lcc = md->tables + lcc_offset;
332     fcc = md->tables + fcc_offset;
333    
334     match_count = PCRE_ERROR_NOMATCH; /* A negative number */
335    
336     active_states = (stateblock *)(workspace + 2);
337     next_new_state = new_states = active_states + wscount;
338     new_count = 0;
339    
340 nigel 93 first_op = this_start_code + 1 + LINK_SIZE +
341     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
342    
343 nigel 77 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
344     the alternative states onto the list, and find out where the end is. This
345     makes it possible to use this function recursively, when we want to stop at a
346     matching internal ket rather than at the end.
347    
348     If the first opcode in the first alternative is OP_REVERSE, we are dealing with
349     a backward assertion. In that case, we have to find out the maximum amount to
350     move back, and set up each alternative appropriately. */
351    
352 nigel 93 if (*first_op == OP_REVERSE)
353 nigel 77 {
354     int max_back = 0;
355     int gone_back;
356    
357     end_code = this_start_code;
358     do
359     {
360     int back = GET(end_code, 2+LINK_SIZE);
361     if (back > max_back) max_back = back;
362     end_code += GET(end_code, 1);
363     }
364     while (*end_code == OP_ALT);
365    
366     /* If we can't go back the amount required for the longest lookbehind
367     pattern, go back as far as we can; some alternatives may still be viable. */
368    
369     #ifdef SUPPORT_UTF8
370     /* In character mode we have to step back character by character */
371    
372     if (utf8)
373     {
374     for (gone_back = 0; gone_back < max_back; gone_back++)
375     {
376     if (current_subject <= start_subject) break;
377     current_subject--;
378     while (current_subject > start_subject &&
379     (*current_subject & 0xc0) == 0x80)
380     current_subject--;
381     }
382     }
383     else
384     #endif
385    
386     /* In byte-mode we can do this quickly. */
387    
388     {
389     gone_back = (current_subject - max_back < start_subject)?
390     current_subject - start_subject : max_back;
391     current_subject -= gone_back;
392     }
393    
394     /* Now we can process the individual branches. */
395    
396     end_code = this_start_code;
397     do
398     {
399     int back = GET(end_code, 2+LINK_SIZE);
400     if (back <= gone_back)
401     {
402     int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
403     ADD_NEW_DATA(-bstate, 0, gone_back - back);
404     }
405     end_code += GET(end_code, 1);
406     }
407     while (*end_code == OP_ALT);
408     }
409    
410     /* This is the code for a "normal" subpattern (not a backward assertion). The
411     start of a whole pattern is always one of these. If we are at the top level,
412     we may be asked to restart matching from the same point that we reached for a
413     previous partial match. We still have to scan through the top-level branches to
414     find the end state. */
415    
416     else
417     {
418     end_code = this_start_code;
419    
420     /* Restarting */
421    
422     if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
423     {
424     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
425     new_count = workspace[1];
426     if (!workspace[0])
427     memcpy(new_states, active_states, new_count * sizeof(stateblock));
428     }
429    
430     /* Not restarting */
431    
432     else
433     {
434 nigel 93 int length = 1 + LINK_SIZE +
435     ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
436 nigel 77 do
437     {
438 nigel 93 ADD_NEW(end_code - start_code + length, 0);
439 nigel 77 end_code += GET(end_code, 1);
440 nigel 93 length = 1 + LINK_SIZE;
441 nigel 77 }
442     while (*end_code == OP_ALT);
443     }
444     }
445    
446     workspace[0] = 0; /* Bit indicating which vector is current */
447    
448     DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
449    
450     /* Loop for scanning the subject */
451    
452     ptr = current_subject;
453     for (;;)
454     {
455     int i, j;
456 nigel 91 int clen, dlen;
457     unsigned int c, d;
458 nigel 77
459     /* Make the new state list into the active state list and empty the
460     new state list. */
461    
462     temp_states = active_states;
463     active_states = new_states;
464     new_states = temp_states;
465     active_count = new_count;
466     new_count = 0;
467    
468     workspace[0] ^= 1; /* Remember for the restarting feature */
469     workspace[1] = active_count;
470    
471     #ifdef DEBUG
472     printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
473     pchars((uschar *)ptr, strlen((char *)ptr), stdout);
474     printf("\"\n");
475    
476     printf("%.*sActive states: ", rlevel*2-2, SP);
477     for (i = 0; i < active_count; i++)
478     printf("%d/%d ", active_states[i].offset, active_states[i].count);
479     printf("\n");
480     #endif
481    
482     /* Set the pointers for adding new states */
483    
484     next_active_state = active_states + active_count;
485     next_new_state = new_states;
486    
487     /* Load the current character from the subject outside the loop, as many
488     different states may want to look at it, and we assume that at least one
489     will. */
490    
491     if (ptr < end_subject)
492     {
493 nigel 93 clen = 1; /* Number of bytes in the character */
494 nigel 77 #ifdef SUPPORT_UTF8
495     if (utf8) { GETCHARLEN(c, ptr, clen); } else
496     #endif /* SUPPORT_UTF8 */
497     c = *ptr;
498     }
499     else
500     {
501 nigel 93 clen = 0; /* This indicates the end of the subject */
502     c = NOTACHAR; /* This value should never actually be used */
503 nigel 77 }
504    
505     /* Scan up the active states and act on each one. The result of an action
506     may be to add more states to the currently active list (e.g. on hitting a
507     parenthesis) or it may be to put states on the new list, for considering
508     when we move the character pointer on. */
509    
510     for (i = 0; i < active_count; i++)
511     {
512     stateblock *current_state = active_states + i;
513     const uschar *code;
514     int state_offset = current_state->offset;
515     int count, codevalue;
516    
517     #ifdef DEBUG
518     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
519 nigel 93 if (clen == 0) printf("EOL\n");
520 nigel 77 else if (c > 32 && c < 127) printf("'%c'\n", c);
521     else printf("0x%02x\n", c);
522     #endif
523    
524     /* This variable is referred to implicitly in the ADD_xxx macros. */
525    
526     ims = current_state->ims;
527    
528     /* A negative offset is a special case meaning "hold off going to this
529     (negated) state until the number of characters in the data field have
530     been skipped". */
531    
532     if (state_offset < 0)
533     {
534     if (current_state->data > 0)
535     {
536     DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
537     ADD_NEW_DATA(state_offset, current_state->count,
538     current_state->data - 1);
539     continue;
540     }
541     else
542     {
543     current_state->offset = state_offset = -state_offset;
544     }
545     }
546    
547     /* Check for a duplicate state with the same count, and skip if found. */
548    
549     for (j = 0; j < i; j++)
550     {
551     if (active_states[j].offset == state_offset &&
552     active_states[j].count == current_state->count)
553     {
554     DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
555     goto NEXT_ACTIVE_STATE;
556     }
557     }
558    
559     /* The state offset is the offset to the opcode */
560    
561     code = start_code + state_offset;
562     codevalue = *code;
563    
564     /* If this opcode is followed by an inline character, load it. It is
565     tempting to test for the presence of a subject character here, but that
566     is wrong, because sometimes zero repetitions of the subject are
567     permitted.
568    
569     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570 ph10 178 argument that is not a data character - but is always one byte long. We
571     have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
572     this case. To keep the other cases fast, convert these ones to new opcodes.
573     */
574 nigel 77
575     if (coptable[codevalue] > 0)
576     {
577     dlen = 1;
578     #ifdef SUPPORT_UTF8
579     if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
580     #endif /* SUPPORT_UTF8 */
581     d = code[coptable[codevalue]];
582     if (codevalue >= OP_TYPESTAR)
583     {
584 nigel 93 switch(d)
585     {
586     case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
587     case OP_NOTPROP:
588     case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589     case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590     case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591 ph10 178 case OP_NOT_HSPACE:
592 ph10 182 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593 ph10 178 case OP_NOT_VSPACE:
594 ph10 182 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595 nigel 93 default: break;
596     }
597 nigel 77 }
598     }
599     else
600     {
601     dlen = 0; /* Not strictly necessary, but compilers moan */
602 nigel 93 d = NOTACHAR; /* if these variables are not set. */
603 nigel 77 }
604    
605    
606     /* Now process the individual opcodes */
607    
608     switch (codevalue)
609     {
610    
611     /* ========================================================================== */
612     /* Reached a closing bracket. If not at the end of the pattern, carry
613     on with the next opcode. Otherwise, unless we have an empty string and
614     PCRE_NOTEMPTY is set, save the match data, shifting up all previous
615     matches so we always have the longest first. */
616    
617     case OP_KET:
618     case OP_KETRMIN:
619     case OP_KETRMAX:
620     if (code != end_code)
621     {
622     ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
623     if (codevalue != OP_KET)
624     {
625     ADD_ACTIVE(state_offset - GET(code, 1), 0);
626     }
627     }
628     else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
629     {
630     if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
631     else if (match_count > 0 && ++match_count * 2 >= offsetcount)
632     match_count = 0;
633     count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
634     if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
635     if (offsetcount >= 2)
636     {
637     offsets[0] = current_subject - start_subject;
638     offsets[1] = ptr - start_subject;
639     DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
640     offsets[1] - offsets[0], current_subject));
641     }
642     if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
643     {
644     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
645     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
646     match_count, rlevel*2-2, SP));
647     return match_count;
648     }
649     }
650     break;
651    
652     /* ========================================================================== */
653     /* These opcodes add to the current list of states without looking
654     at the current character. */
655    
656     /*-----------------------------------------------------------------*/
657     case OP_ALT:
658     do { code += GET(code, 1); } while (*code == OP_ALT);
659     ADD_ACTIVE(code - start_code, 0);
660     break;
661    
662     /*-----------------------------------------------------------------*/
663     case OP_BRA:
664 nigel 93 case OP_SBRA:
665 nigel 77 do
666     {
667     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
668     code += GET(code, 1);
669     }
670     while (*code == OP_ALT);
671     break;
672    
673     /*-----------------------------------------------------------------*/
674 nigel 93 case OP_CBRA:
675     case OP_SCBRA:
676     ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
677     code += GET(code, 1);
678     while (*code == OP_ALT)
679     {
680     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
681     code += GET(code, 1);
682     }
683     break;
684    
685     /*-----------------------------------------------------------------*/
686 nigel 77 case OP_BRAZERO:
687     case OP_BRAMINZERO:
688     ADD_ACTIVE(state_offset + 1, 0);
689     code += 1 + GET(code, 2);
690     while (*code == OP_ALT) code += GET(code, 1);
691     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
692     break;
693    
694     /*-----------------------------------------------------------------*/
695 ph10 335 case OP_SKIPZERO:
696     code += 1 + GET(code, 2);
697     while (*code == OP_ALT) code += GET(code, 1);
698     ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
699     break;
700    
701     /*-----------------------------------------------------------------*/
702 nigel 77 case OP_CIRC:
703     if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
704 nigel 91 ((ims & PCRE_MULTILINE) != 0 &&
705     ptr != end_subject &&
706 nigel 93 WAS_NEWLINE(ptr)))
707 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
708     break;
709    
710     /*-----------------------------------------------------------------*/
711     case OP_EOD:
712     if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
713     break;
714    
715     /*-----------------------------------------------------------------*/
716     case OP_OPT:
717     ims = code[1];
718     ADD_ACTIVE(state_offset + 2, 0);
719     break;
720    
721     /*-----------------------------------------------------------------*/
722     case OP_SOD:
723     if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
724     break;
725    
726     /*-----------------------------------------------------------------*/
727     case OP_SOM:
728     if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
729     break;
730    
731    
732     /* ========================================================================== */
733     /* These opcodes inspect the next subject character, and sometimes
734     the previous one as well, but do not have an argument. The variable
735     clen contains the length of the current character and is zero if we are
736     at the end of the subject. */
737    
738     /*-----------------------------------------------------------------*/
739     case OP_ANY:
740 ph10 342 if (clen > 0 && !IS_NEWLINE(ptr))
741 nigel 77 { ADD_NEW(state_offset + 1, 0); }
742     break;
743    
744     /*-----------------------------------------------------------------*/
745 ph10 341 case OP_ALLANY:
746     if (clen > 0)
747     { ADD_NEW(state_offset + 1, 0); }
748     break;
749    
750     /*-----------------------------------------------------------------*/
751 nigel 77 case OP_EODN:
752 nigel 93 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
753 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
754     break;
755    
756     /*-----------------------------------------------------------------*/
757     case OP_DOLL:
758     if ((md->moptions & PCRE_NOTEOL) == 0)
759     {
760 nigel 91 if (clen == 0 ||
761 ph10 383 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
762 nigel 91 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
763     ))
764 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
765     }
766 nigel 93 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
767 nigel 77 { ADD_ACTIVE(state_offset + 1, 0); }
768     break;
769    
770     /*-----------------------------------------------------------------*/
771    
772     case OP_DIGIT:
773     case OP_WHITESPACE:
774     case OP_WORDCHAR:
775     if (clen > 0 && c < 256 &&
776     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
777     { ADD_NEW(state_offset + 1, 0); }
778     break;
779    
780     /*-----------------------------------------------------------------*/
781     case OP_NOT_DIGIT:
782     case OP_NOT_WHITESPACE:
783     case OP_NOT_WORDCHAR:
784     if (clen > 0 && (c >= 256 ||
785     ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
786     { ADD_NEW(state_offset + 1, 0); }
787     break;
788    
789     /*-----------------------------------------------------------------*/
790     case OP_WORD_BOUNDARY:
791     case OP_NOT_WORD_BOUNDARY:
792     {
793     int left_word, right_word;
794    
795     if (ptr > start_subject)
796     {
797     const uschar *temp = ptr - 1;
798     #ifdef SUPPORT_UTF8
799     if (utf8) BACKCHAR(temp);
800     #endif
801     GETCHARTEST(d, temp);
802     left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
803     }
804     else left_word = 0;
805    
806     if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
807     else right_word = 0;
808    
809     if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
810     { ADD_ACTIVE(state_offset + 1, 0); }
811     }
812     break;
813    
814    
815     /*-----------------------------------------------------------------*/
816     /* Check the next character by Unicode property. We will get here only
817     if the support is in the binary; otherwise a compile-time error occurs.
818     */
819    
820 ph10 151 #ifdef SUPPORT_UCP
821 nigel 77 case OP_PROP:
822     case OP_NOTPROP:
823     if (clen > 0)
824     {
825 nigel 87 BOOL OK;
826 ph10 349 const ucd_record * prop = GET_UCD(c);
827 nigel 87 switch(code[1])
828 nigel 77 {
829 nigel 87 case PT_ANY:
830     OK = TRUE;
831     break;
832    
833     case PT_LAMP:
834 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
835 nigel 87 break;
836    
837     case PT_GC:
838 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
839 nigel 87 break;
840    
841     case PT_PC:
842 ph10 349 OK = prop->chartype == code[2];
843 nigel 87 break;
844    
845     case PT_SC:
846 ph10 349 OK = prop->script == code[2];
847 nigel 87 break;
848    
849     /* Should never occur, but keep compilers from grumbling. */
850    
851     default:
852     OK = codevalue != OP_PROP;
853     break;
854 nigel 77 }
855 nigel 87
856     if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
857 nigel 77 }
858     break;
859     #endif
860    
861    
862    
863     /* ========================================================================== */
864     /* These opcodes likewise inspect the subject character, but have an
865     argument that is not a data character. It is one of these opcodes:
866 ph10 341 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
867     OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
868 nigel 77
869     case OP_TYPEPLUS:
870     case OP_TYPEMINPLUS:
871 nigel 93 case OP_TYPEPOSPLUS:
872 nigel 77 count = current_state->count; /* Already matched */
873     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
874     if (clen > 0)
875     {
876     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
877     (c < 256 &&
878 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
879 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
880     {
881 nigel 93 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
882     {
883     active_count--; /* Remove non-match possibility */
884     next_active_state--;
885     }
886 nigel 77 count++;
887     ADD_NEW(state_offset, count);
888     }
889     }
890     break;
891    
892     /*-----------------------------------------------------------------*/
893     case OP_TYPEQUERY:
894     case OP_TYPEMINQUERY:
895 nigel 93 case OP_TYPEPOSQUERY:
896 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
897     if (clen > 0)
898     {
899     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900     (c < 256 &&
901 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
902 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
903     {
904 nigel 93 if (codevalue == OP_TYPEPOSQUERY)
905     {
906     active_count--; /* Remove non-match possibility */
907     next_active_state--;
908     }
909 nigel 77 ADD_NEW(state_offset + 2, 0);
910     }
911     }
912     break;
913    
914     /*-----------------------------------------------------------------*/
915     case OP_TYPESTAR:
916     case OP_TYPEMINSTAR:
917 nigel 93 case OP_TYPEPOSSTAR:
918 nigel 77 ADD_ACTIVE(state_offset + 2, 0);
919     if (clen > 0)
920     {
921     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
922     (c < 256 &&
923 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
924 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
925     {
926 nigel 93 if (codevalue == OP_TYPEPOSSTAR)
927     {
928     active_count--; /* Remove non-match possibility */
929     next_active_state--;
930     }
931 nigel 77 ADD_NEW(state_offset, 0);
932     }
933     }
934     break;
935    
936     /*-----------------------------------------------------------------*/
937     case OP_TYPEEXACT:
938 nigel 93 count = current_state->count; /* Number already matched */
939     if (clen > 0)
940     {
941     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
942     (c < 256 &&
943 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
944 nigel 93 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
945     {
946     if (++count >= GET2(code, 1))
947     { ADD_NEW(state_offset + 4, 0); }
948     else
949     { ADD_NEW(state_offset, count); }
950     }
951     }
952     break;
953    
954     /*-----------------------------------------------------------------*/
955 nigel 77 case OP_TYPEUPTO:
956     case OP_TYPEMINUPTO:
957 nigel 93 case OP_TYPEPOSUPTO:
958     ADD_ACTIVE(state_offset + 4, 0);
959 nigel 77 count = current_state->count; /* Number already matched */
960     if (clen > 0)
961     {
962     if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
963     (c < 256 &&
964 ph10 342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
965 nigel 77 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
966     {
967 nigel 93 if (codevalue == OP_TYPEPOSUPTO)
968     {
969     active_count--; /* Remove non-match possibility */
970     next_active_state--;
971     }
972 nigel 77 if (++count >= GET2(code, 1))
973     { ADD_NEW(state_offset + 4, 0); }
974     else
975     { ADD_NEW(state_offset, count); }
976     }
977     }
978     break;
979    
980     /* ========================================================================== */
981     /* These are virtual opcodes that are used when something like
982 nigel 93 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical or horizontal space type (OP_VSPACE, OP_HSPACE) as its
983     argument. It keeps the code above fast for the other cases. The argument
984     is in the d variable. */
985 nigel 77
986 ph10 151 #ifdef SUPPORT_UCP
987 nigel 77 case OP_PROP_EXTRA + OP_TYPEPLUS:
988     case OP_PROP_EXTRA + OP_TYPEMINPLUS:
989 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
990 nigel 77 count = current_state->count; /* Already matched */
991 nigel 87 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
992 nigel 77 if (clen > 0)
993     {
994 nigel 87 BOOL OK;
995 ph10 349 const ucd_record * prop = GET_UCD(c);
996 nigel 87 switch(code[2])
997     {
998     case PT_ANY:
999     OK = TRUE;
1000     break;
1001    
1002     case PT_LAMP:
1003 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1004 nigel 87 break;
1005    
1006     case PT_GC:
1007 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1008 nigel 87 break;
1009    
1010     case PT_PC:
1011 ph10 349 OK = prop->chartype == code[3];
1012 nigel 87 break;
1013    
1014     case PT_SC:
1015 ph10 349 OK = prop->script == code[3];
1016 nigel 87 break;
1017    
1018     /* Should never occur, but keep compilers from grumbling. */
1019    
1020     default:
1021     OK = codevalue != OP_PROP;
1022     break;
1023     }
1024    
1025 nigel 93 if (OK == (d == OP_PROP))
1026     {
1027     if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1028     {
1029     active_count--; /* Remove non-match possibility */
1030     next_active_state--;
1031     }
1032     count++;
1033     ADD_NEW(state_offset, count);
1034     }
1035 nigel 77 }
1036     break;
1037    
1038     /*-----------------------------------------------------------------*/
1039     case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1040     case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1041 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1042 nigel 77 count = current_state->count; /* Already matched */
1043     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1044 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1045 nigel 77 {
1046     const uschar *nptr = ptr + clen;
1047     int ncount = 0;
1048 nigel 93 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1049     {
1050     active_count--; /* Remove non-match possibility */
1051     next_active_state--;
1052     }
1053 nigel 77 while (nptr < end_subject)
1054     {
1055     int nd;
1056     int ndlen = 1;
1057     GETCHARLEN(nd, nptr, ndlen);
1058 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1059 nigel 77 ncount++;
1060     nptr += ndlen;
1061     }
1062     count++;
1063     ADD_NEW_DATA(-state_offset, count, ncount);
1064     }
1065     break;
1066 ph10 151 #endif
1067 nigel 77
1068     /*-----------------------------------------------------------------*/
1069 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1070     case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1071     case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1072     count = current_state->count; /* Already matched */
1073     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1074     if (clen > 0)
1075     {
1076     int ncount = 0;
1077     switch (c)
1078     {
1079     case 0x000b:
1080     case 0x000c:
1081     case 0x0085:
1082     case 0x2028:
1083     case 0x2029:
1084 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1085     goto ANYNL01;
1086    
1087     case 0x000d:
1088     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1089     /* Fall through */
1090    
1091     ANYNL01:
1092     case 0x000a:
1093 nigel 93 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1094     {
1095     active_count--; /* Remove non-match possibility */
1096     next_active_state--;
1097     }
1098     count++;
1099     ADD_NEW_DATA(-state_offset, count, ncount);
1100     break;
1101 ph10 231
1102 nigel 93 default:
1103     break;
1104     }
1105     }
1106     break;
1107    
1108     /*-----------------------------------------------------------------*/
1109 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1110     case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1111     case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1112     count = current_state->count; /* Already matched */
1113     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1114     if (clen > 0)
1115     {
1116 ph10 182 BOOL OK;
1117 ph10 178 switch (c)
1118     {
1119     case 0x000a:
1120     case 0x000b:
1121     case 0x000c:
1122     case 0x000d:
1123     case 0x0085:
1124     case 0x2028:
1125     case 0x2029:
1126     OK = TRUE;
1127 ph10 182 break;
1128 ph10 178
1129     default:
1130     OK = FALSE;
1131 ph10 182 break;
1132 ph10 178 }
1133    
1134     if (OK == (d == OP_VSPACE))
1135 ph10 182 {
1136 ph10 178 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1137     {
1138     active_count--; /* Remove non-match possibility */
1139     next_active_state--;
1140     }
1141     count++;
1142     ADD_NEW_DATA(-state_offset, count, 0);
1143     }
1144     }
1145     break;
1146    
1147     /*-----------------------------------------------------------------*/
1148     case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1149     case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1150     case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1151     count = current_state->count; /* Already matched */
1152     if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1153     if (clen > 0)
1154     {
1155 ph10 182 BOOL OK;
1156 ph10 178 switch (c)
1157     {
1158     case 0x09: /* HT */
1159     case 0x20: /* SPACE */
1160     case 0xa0: /* NBSP */
1161     case 0x1680: /* OGHAM SPACE MARK */
1162     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1163     case 0x2000: /* EN QUAD */
1164     case 0x2001: /* EM QUAD */
1165     case 0x2002: /* EN SPACE */
1166     case 0x2003: /* EM SPACE */
1167     case 0x2004: /* THREE-PER-EM SPACE */
1168     case 0x2005: /* FOUR-PER-EM SPACE */
1169     case 0x2006: /* SIX-PER-EM SPACE */
1170     case 0x2007: /* FIGURE SPACE */
1171     case 0x2008: /* PUNCTUATION SPACE */
1172     case 0x2009: /* THIN SPACE */
1173     case 0x200A: /* HAIR SPACE */
1174     case 0x202f: /* NARROW NO-BREAK SPACE */
1175     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1176     case 0x3000: /* IDEOGRAPHIC SPACE */
1177     OK = TRUE;
1178     break;
1179 ph10 182
1180 ph10 178 default:
1181     OK = FALSE;
1182     break;
1183     }
1184 ph10 182
1185 ph10 178 if (OK == (d == OP_HSPACE))
1186 ph10 182 {
1187 ph10 178 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1188     {
1189     active_count--; /* Remove non-match possibility */
1190     next_active_state--;
1191     }
1192     count++;
1193     ADD_NEW_DATA(-state_offset, count, 0);
1194     }
1195     }
1196     break;
1197    
1198     /*-----------------------------------------------------------------*/
1199 ph10 151 #ifdef SUPPORT_UCP
1200 nigel 77 case OP_PROP_EXTRA + OP_TYPEQUERY:
1201     case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1202 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1203 nigel 87 count = 4;
1204 nigel 77 goto QS1;
1205    
1206     case OP_PROP_EXTRA + OP_TYPESTAR:
1207     case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1208 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1209 nigel 77 count = 0;
1210    
1211     QS1:
1212    
1213 nigel 87 ADD_ACTIVE(state_offset + 4, 0);
1214 nigel 77 if (clen > 0)
1215     {
1216 nigel 87 BOOL OK;
1217 ph10 349 const ucd_record * prop = GET_UCD(c);
1218 nigel 87 switch(code[2])
1219     {
1220     case PT_ANY:
1221     OK = TRUE;
1222     break;
1223    
1224     case PT_LAMP:
1225 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1226 nigel 87 break;
1227    
1228     case PT_GC:
1229 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1230 nigel 87 break;
1231    
1232     case PT_PC:
1233 ph10 349 OK = prop->chartype == code[3];
1234 nigel 87 break;
1235    
1236     case PT_SC:
1237 ph10 349 OK = prop->script == code[3];
1238 nigel 87 break;
1239    
1240     /* Should never occur, but keep compilers from grumbling. */
1241    
1242     default:
1243     OK = codevalue != OP_PROP;
1244     break;
1245     }
1246    
1247 nigel 93 if (OK == (d == OP_PROP))
1248     {
1249     if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1250     codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1251     {
1252     active_count--; /* Remove non-match possibility */
1253     next_active_state--;
1254     }
1255     ADD_NEW(state_offset + count, 0);
1256     }
1257 nigel 77 }
1258     break;
1259    
1260     /*-----------------------------------------------------------------*/
1261     case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1262     case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1263 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1264 nigel 77 count = 2;
1265     goto QS2;
1266    
1267     case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1268     case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1269 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1270 nigel 77 count = 0;
1271    
1272     QS2:
1273    
1274     ADD_ACTIVE(state_offset + 2, 0);
1275 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1276 nigel 77 {
1277     const uschar *nptr = ptr + clen;
1278     int ncount = 0;
1279 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1280     codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1281     {
1282     active_count--; /* Remove non-match possibility */
1283     next_active_state--;
1284     }
1285 nigel 77 while (nptr < end_subject)
1286     {
1287     int nd;
1288     int ndlen = 1;
1289     GETCHARLEN(nd, nptr, ndlen);
1290 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1291 nigel 77 ncount++;
1292     nptr += ndlen;
1293     }
1294     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1295     }
1296     break;
1297 ph10 151 #endif
1298 nigel 77
1299     /*-----------------------------------------------------------------*/
1300 nigel 93 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1301     case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1302     case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1303     count = 2;
1304     goto QS3;
1305    
1306     case OP_ANYNL_EXTRA + OP_TYPESTAR:
1307     case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1308     case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1309     count = 0;
1310    
1311     QS3:
1312     ADD_ACTIVE(state_offset + 2, 0);
1313     if (clen > 0)
1314     {
1315     int ncount = 0;
1316     switch (c)
1317     {
1318     case 0x000b:
1319     case 0x000c:
1320     case 0x0085:
1321     case 0x2028:
1322     case 0x2029:
1323 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1324     goto ANYNL02;
1325    
1326     case 0x000d:
1327     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1328     /* Fall through */
1329    
1330     ANYNL02:
1331     case 0x000a:
1332 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1333     codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1334     {
1335     active_count--; /* Remove non-match possibility */
1336     next_active_state--;
1337     }
1338     ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1339     break;
1340 ph10 231
1341 nigel 93 default:
1342     break;
1343     }
1344     }
1345     break;
1346    
1347     /*-----------------------------------------------------------------*/
1348 ph10 178 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1349     case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1350     case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1351     count = 2;
1352     goto QS4;
1353    
1354     case OP_VSPACE_EXTRA + OP_TYPESTAR:
1355     case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1356     case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1357     count = 0;
1358    
1359     QS4:
1360     ADD_ACTIVE(state_offset + 2, 0);
1361     if (clen > 0)
1362     {
1363 ph10 182 BOOL OK;
1364 ph10 178 switch (c)
1365     {
1366     case 0x000a:
1367     case 0x000b:
1368     case 0x000c:
1369     case 0x000d:
1370     case 0x0085:
1371     case 0x2028:
1372     case 0x2029:
1373     OK = TRUE;
1374     break;
1375 ph10 182
1376 ph10 178 default:
1377     OK = FALSE;
1378     break;
1379     }
1380     if (OK == (d == OP_VSPACE))
1381 ph10 182 {
1382 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1383     codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1384     {
1385     active_count--; /* Remove non-match possibility */
1386     next_active_state--;
1387     }
1388     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1389     }
1390     }
1391     break;
1392    
1393     /*-----------------------------------------------------------------*/
1394     case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1395     case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1396     case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1397     count = 2;
1398     goto QS5;
1399    
1400     case OP_HSPACE_EXTRA + OP_TYPESTAR:
1401     case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1402     case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1403     count = 0;
1404    
1405     QS5:
1406     ADD_ACTIVE(state_offset + 2, 0);
1407     if (clen > 0)
1408     {
1409 ph10 182 BOOL OK;
1410 ph10 178 switch (c)
1411     {
1412     case 0x09: /* HT */
1413     case 0x20: /* SPACE */
1414     case 0xa0: /* NBSP */
1415     case 0x1680: /* OGHAM SPACE MARK */
1416     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1417     case 0x2000: /* EN QUAD */
1418     case 0x2001: /* EM QUAD */
1419     case 0x2002: /* EN SPACE */
1420     case 0x2003: /* EM SPACE */
1421     case 0x2004: /* THREE-PER-EM SPACE */
1422     case 0x2005: /* FOUR-PER-EM SPACE */
1423     case 0x2006: /* SIX-PER-EM SPACE */
1424     case 0x2007: /* FIGURE SPACE */
1425     case 0x2008: /* PUNCTUATION SPACE */
1426     case 0x2009: /* THIN SPACE */
1427     case 0x200A: /* HAIR SPACE */
1428     case 0x202f: /* NARROW NO-BREAK SPACE */
1429     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1430     case 0x3000: /* IDEOGRAPHIC SPACE */
1431     OK = TRUE;
1432     break;
1433 ph10 182
1434 ph10 178 default:
1435     OK = FALSE;
1436     break;
1437     }
1438 ph10 182
1439 ph10 178 if (OK == (d == OP_HSPACE))
1440 ph10 182 {
1441 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1442     codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1443     {
1444     active_count--; /* Remove non-match possibility */
1445     next_active_state--;
1446     }
1447     ADD_NEW_DATA(-(state_offset + count), 0, 0);
1448     }
1449     }
1450     break;
1451    
1452     /*-----------------------------------------------------------------*/
1453 ph10 151 #ifdef SUPPORT_UCP
1454 nigel 77 case OP_PROP_EXTRA + OP_TYPEEXACT:
1455     case OP_PROP_EXTRA + OP_TYPEUPTO:
1456     case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1457 nigel 93 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1458 nigel 77 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1459 nigel 87 { ADD_ACTIVE(state_offset + 6, 0); }
1460 nigel 77 count = current_state->count; /* Number already matched */
1461     if (clen > 0)
1462     {
1463 nigel 87 BOOL OK;
1464 ph10 349 const ucd_record * prop = GET_UCD(c);
1465 nigel 87 switch(code[4])
1466 nigel 77 {
1467 nigel 87 case PT_ANY:
1468     OK = TRUE;
1469     break;
1470    
1471     case PT_LAMP:
1472 ph10 349 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1473 nigel 87 break;
1474    
1475     case PT_GC:
1476 ph10 351 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1477 nigel 87 break;
1478    
1479     case PT_PC:
1480 ph10 349 OK = prop->chartype == code[5];
1481 nigel 87 break;
1482    
1483     case PT_SC:
1484 ph10 349 OK = prop->script == code[5];
1485 nigel 87 break;
1486    
1487     /* Should never occur, but keep compilers from grumbling. */
1488    
1489     default:
1490     OK = codevalue != OP_PROP;
1491     break;
1492     }
1493    
1494     if (OK == (d == OP_PROP))
1495     {
1496 nigel 93 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1497     {
1498     active_count--; /* Remove non-match possibility */
1499     next_active_state--;
1500     }
1501 nigel 77 if (++count >= GET2(code, 1))
1502 nigel 87 { ADD_NEW(state_offset + 6, 0); }
1503 nigel 77 else
1504     { ADD_NEW(state_offset, count); }
1505     }
1506     }
1507     break;
1508    
1509     /*-----------------------------------------------------------------*/
1510     case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1511     case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1512     case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1513 nigel 93 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1514 nigel 77 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1515     { ADD_ACTIVE(state_offset + 4, 0); }
1516     count = current_state->count; /* Number already matched */
1517 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1518 nigel 77 {
1519     const uschar *nptr = ptr + clen;
1520     int ncount = 0;
1521 nigel 93 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1522     {
1523     active_count--; /* Remove non-match possibility */
1524     next_active_state--;
1525     }
1526 nigel 77 while (nptr < end_subject)
1527     {
1528     int nd;
1529     int ndlen = 1;
1530     GETCHARLEN(nd, nptr, ndlen);
1531 ph10 349 if (UCD_CATEGORY(nd) != ucp_M) break;
1532 nigel 77 ncount++;
1533     nptr += ndlen;
1534     }
1535     if (++count >= GET2(code, 1))
1536     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1537     else
1538     { ADD_NEW_DATA(-state_offset, count, ncount); }
1539     }
1540     break;
1541 ph10 151 #endif
1542 nigel 77
1543 nigel 93 /*-----------------------------------------------------------------*/
1544     case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1545     case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1546     case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1547     case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1548     if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1549     { ADD_ACTIVE(state_offset + 4, 0); }
1550     count = current_state->count; /* Number already matched */
1551     if (clen > 0)
1552     {
1553     int ncount = 0;
1554     switch (c)
1555     {
1556     case 0x000b:
1557     case 0x000c:
1558     case 0x0085:
1559     case 0x2028:
1560     case 0x2029:
1561 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1562     goto ANYNL03;
1563    
1564     case 0x000d:
1565     if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1566     /* Fall through */
1567    
1568     ANYNL03:
1569     case 0x000a:
1570 nigel 93 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1571     {
1572     active_count--; /* Remove non-match possibility */
1573     next_active_state--;
1574     }
1575     if (++count >= GET2(code, 1))
1576     { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1577     else
1578     { ADD_NEW_DATA(-state_offset, count, ncount); }
1579     break;
1580 ph10 231
1581 nigel 93 default:
1582     break;
1583     }
1584     }
1585     break;
1586    
1587 ph10 178 /*-----------------------------------------------------------------*/
1588     case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1589     case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1590     case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1591     case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1592     if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1593     { ADD_ACTIVE(state_offset + 4, 0); }
1594     count = current_state->count; /* Number already matched */
1595     if (clen > 0)
1596     {
1597 ph10 182 BOOL OK;
1598 ph10 178 switch (c)
1599     {
1600     case 0x000a:
1601     case 0x000b:
1602     case 0x000c:
1603     case 0x000d:
1604     case 0x0085:
1605     case 0x2028:
1606     case 0x2029:
1607     OK = TRUE;
1608     break;
1609 ph10 182
1610 ph10 178 default:
1611     OK = FALSE;
1612     }
1613 ph10 182
1614 ph10 178 if (OK == (d == OP_VSPACE))
1615 ph10 182 {
1616 ph10 178 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1617     {
1618     active_count--; /* Remove non-match possibility */
1619     next_active_state--;
1620     }
1621     if (++count >= GET2(code, 1))
1622     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1623     else
1624     { ADD_NEW_DATA(-state_offset, count, 0); }
1625     }
1626     }
1627     break;
1628    
1629     /*-----------------------------------------------------------------*/
1630     case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1631     case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1632     case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1633     case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1634     if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1635     { ADD_ACTIVE(state_offset + 4, 0); }
1636     count = current_state->count; /* Number already matched */
1637     if (clen > 0)
1638     {
1639 ph10 182 BOOL OK;
1640 ph10 178 switch (c)
1641     {
1642     case 0x09: /* HT */
1643     case 0x20: /* SPACE */
1644     case 0xa0: /* NBSP */
1645     case 0x1680: /* OGHAM SPACE MARK */
1646     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1647     case 0x2000: /* EN QUAD */
1648     case 0x2001: /* EM QUAD */
1649     case 0x2002: /* EN SPACE */
1650     case 0x2003: /* EM SPACE */
1651     case 0x2004: /* THREE-PER-EM SPACE */
1652     case 0x2005: /* FOUR-PER-EM SPACE */
1653     case 0x2006: /* SIX-PER-EM SPACE */
1654     case 0x2007: /* FIGURE SPACE */
1655     case 0x2008: /* PUNCTUATION SPACE */
1656     case 0x2009: /* THIN SPACE */
1657     case 0x200A: /* HAIR SPACE */
1658     case 0x202f: /* NARROW NO-BREAK SPACE */
1659     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1660     case 0x3000: /* IDEOGRAPHIC SPACE */
1661     OK = TRUE;
1662     break;
1663 ph10 182
1664 ph10 178 default:
1665     OK = FALSE;
1666     break;
1667     }
1668 ph10 182
1669 ph10 178 if (OK == (d == OP_HSPACE))
1670 ph10 182 {
1671 ph10 178 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1672     {
1673     active_count--; /* Remove non-match possibility */
1674     next_active_state--;
1675     }
1676     if (++count >= GET2(code, 1))
1677     { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1678     else
1679     { ADD_NEW_DATA(-state_offset, count, 0); }
1680     }
1681     }
1682     break;
1683    
1684 nigel 77 /* ========================================================================== */
1685     /* These opcodes are followed by a character that is usually compared
1686     to the current subject character; it is loaded into d. We still get
1687     here even if there is no subject character, because in some cases zero
1688     repetitions are permitted. */
1689    
1690     /*-----------------------------------------------------------------*/
1691     case OP_CHAR:
1692     if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1693     break;
1694    
1695     /*-----------------------------------------------------------------*/
1696     case OP_CHARNC:
1697     if (clen == 0) break;
1698    
1699     #ifdef SUPPORT_UTF8
1700     if (utf8)
1701     {
1702     if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1703     {
1704 nigel 93 unsigned int othercase;
1705 nigel 77 if (c < 128) othercase = fcc[c]; else
1706    
1707     /* If we have Unicode property support, we can use it to test the
1708 nigel 87 other case of the character. */
1709 nigel 77
1710     #ifdef SUPPORT_UCP
1711 ph10 349 othercase = UCD_OTHERCASE(c);
1712 nigel 87 #else
1713 nigel 93 othercase = NOTACHAR;
1714 nigel 77 #endif
1715    
1716     if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1717     }
1718     }
1719     else
1720     #endif /* SUPPORT_UTF8 */
1721    
1722     /* Non-UTF-8 mode */
1723     {
1724     if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1725     }
1726     break;
1727    
1728    
1729     #ifdef SUPPORT_UCP
1730     /*-----------------------------------------------------------------*/
1731     /* This is a tricky one because it can match more than one character.
1732     Find out how many characters to skip, and then set up a negative state
1733     to wait for them to pass before continuing. */
1734    
1735     case OP_EXTUNI:
1736 ph10 349 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1737 nigel 77 {
1738     const uschar *nptr = ptr + clen;
1739     int ncount = 0;
1740     while (nptr < end_subject)
1741     {
1742     int nclen = 1;
1743     GETCHARLEN(c, nptr, nclen);
1744 ph10 349 if (UCD_CATEGORY(c) != ucp_M) break;
1745 nigel 77 ncount++;
1746     nptr += nclen;
1747     }
1748     ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1749     }
1750     break;
1751     #endif
1752    
1753     /*-----------------------------------------------------------------*/
1754 nigel 93 /* This is tricky, like EXTUNI, because it too can match more than one
1755     character (when CR is followed by LF). In this case, set up a negative
1756     state to wait for one character to pass before continuing. */
1757    
1758     case OP_ANYNL:
1759     if (clen > 0) switch(c)
1760     {
1761     case 0x000b:
1762     case 0x000c:
1763     case 0x0085:
1764     case 0x2028:
1765     case 0x2029:
1766 ph10 231 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1767    
1768     case 0x000a:
1769 nigel 93 ADD_NEW(state_offset + 1, 0);
1770     break;
1771 ph10 231
1772 nigel 93 case 0x000d:
1773     if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1774     {
1775     ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1776     }
1777     else
1778     {
1779     ADD_NEW(state_offset + 1, 0);
1780     }
1781     break;
1782     }
1783     break;
1784    
1785     /*-----------------------------------------------------------------*/
1786 ph10 178 case OP_NOT_VSPACE:
1787     if (clen > 0) switch(c)
1788     {
1789     case 0x000a:
1790     case 0x000b:
1791     case 0x000c:
1792     case 0x000d:
1793     case 0x0085:
1794     case 0x2028:
1795     case 0x2029:
1796     break;
1797 ph10 182
1798     default:
1799 ph10 178 ADD_NEW(state_offset + 1, 0);
1800     break;
1801     }
1802     break;
1803    
1804     /*-----------------------------------------------------------------*/
1805     case OP_VSPACE:
1806     if (clen > 0) switch(c)
1807     {
1808     case 0x000a:
1809     case 0x000b:
1810     case 0x000c:
1811     case 0x000d:
1812     case 0x0085:
1813     case 0x2028:
1814     case 0x2029:
1815     ADD_NEW(state_offset + 1, 0);
1816     break;
1817 ph10 182
1818 ph10 178 default: break;
1819     }
1820     break;
1821    
1822     /*-----------------------------------------------------------------*/
1823     case OP_NOT_HSPACE:
1824     if (clen > 0) switch(c)
1825     {
1826     case 0x09: /* HT */
1827     case 0x20: /* SPACE */
1828     case 0xa0: /* NBSP */
1829     case 0x1680: /* OGHAM SPACE MARK */
1830     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1831     case 0x2000: /* EN QUAD */
1832     case 0x2001: /* EM QUAD */
1833     case 0x2002: /* EN SPACE */
1834     case 0x2003: /* EM SPACE */
1835     case 0x2004: /* THREE-PER-EM SPACE */
1836     case 0x2005: /* FOUR-PER-EM SPACE */
1837     case 0x2006: /* SIX-PER-EM SPACE */
1838     case 0x2007: /* FIGURE SPACE */
1839     case 0x2008: /* PUNCTUATION SPACE */
1840     case 0x2009: /* THIN SPACE */
1841     case 0x200A: /* HAIR SPACE */
1842     case 0x202f: /* NARROW NO-BREAK SPACE */
1843     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1844     case 0x3000: /* IDEOGRAPHIC SPACE */
1845     break;
1846 ph10 182
1847     default:
1848 ph10 178 ADD_NEW(state_offset + 1, 0);
1849     break;
1850     }
1851     break;
1852    
1853     /*-----------------------------------------------------------------*/
1854     case OP_HSPACE:
1855     if (clen > 0) switch(c)
1856     {
1857     case 0x09: /* HT */
1858     case 0x20: /* SPACE */
1859     case 0xa0: /* NBSP */
1860     case 0x1680: /* OGHAM SPACE MARK */
1861     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1862     case 0x2000: /* EN QUAD */
1863     case 0x2001: /* EM QUAD */
1864     case 0x2002: /* EN SPACE */
1865     case 0x2003: /* EM SPACE */
1866     case 0x2004: /* THREE-PER-EM SPACE */
1867     case 0x2005: /* FOUR-PER-EM SPACE */
1868     case 0x2006: /* SIX-PER-EM SPACE */
1869     case 0x2007: /* FIGURE SPACE */
1870     case 0x2008: /* PUNCTUATION SPACE */
1871     case 0x2009: /* THIN SPACE */
1872     case 0x200A: /* HAIR SPACE */
1873     case 0x202f: /* NARROW NO-BREAK SPACE */
1874     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1875     case 0x3000: /* IDEOGRAPHIC SPACE */
1876     ADD_NEW(state_offset + 1, 0);
1877     break;
1878     }
1879     break;
1880    
1881     /*-----------------------------------------------------------------*/
1882 nigel 77 /* Match a negated single character. This is only used for one-byte
1883     characters, that is, we know that d < 256. The character we are
1884     checking (c) can be multibyte. */
1885    
1886     case OP_NOT:
1887     if (clen > 0)
1888     {
1889 nigel 93 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1890 nigel 77 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1891     }
1892     break;
1893    
1894     /*-----------------------------------------------------------------*/
1895     case OP_PLUS:
1896     case OP_MINPLUS:
1897 nigel 93 case OP_POSPLUS:
1898 nigel 77 case OP_NOTPLUS:
1899     case OP_NOTMINPLUS:
1900 nigel 93 case OP_NOTPOSPLUS:
1901 nigel 77 count = current_state->count; /* Already matched */
1902     if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1903     if (clen > 0)
1904     {
1905 nigel 93 unsigned int otherd = NOTACHAR;
1906 nigel 77 if ((ims & PCRE_CASELESS) != 0)
1907     {
1908     #ifdef SUPPORT_UTF8
1909 nigel 87 if (utf8 && d >= 128)
1910 nigel 77 {
1911     #ifdef SUPPORT_UCP
1912 ph10 349 otherd = UCD_OTHERCASE(d);
1913 nigel 77 #endif /* SUPPORT_UCP */
1914     }
1915     else
1916     #endif /* SUPPORT_UTF8 */
1917     otherd = fcc[d];
1918     }
1919     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1920 nigel 93 {
1921     if (count > 0 &&
1922     (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1923     {
1924     active_count--; /* Remove non-match possibility */
1925     next_active_state--;
1926     }
1927     count++;
1928     ADD_NEW(state_offset, count);
1929     }
1930 nigel 77 }
1931     break;
1932    
1933     /*-----------------------------------------------------------------*/
1934     case OP_QUERY:
1935     case OP_MINQUERY:
1936 nigel 93 case OP_POSQUERY:
1937 nigel 77 case OP_NOTQUERY:
1938     case OP_NOTMINQUERY:
1939 nigel 93 case OP_NOTPOSQUERY:
1940 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1941     if (clen > 0)
1942     {
1943 nigel 93 unsigned int otherd = NOTACHAR;
1944 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1945 nigel 77 {
1946     #ifdef SUPPORT_UTF8
1947 nigel 87 if (utf8 && d >= 128)
1948 nigel 77 {
1949     #ifdef SUPPORT_UCP
1950 ph10 349 otherd = UCD_OTHERCASE(d);
1951 nigel 77 #endif /* SUPPORT_UCP */
1952     }
1953     else
1954     #endif /* SUPPORT_UTF8 */
1955     otherd = fcc[d];
1956     }
1957     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1958 nigel 93 {
1959     if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1960     {
1961     active_count--; /* Remove non-match possibility */
1962     next_active_state--;
1963     }
1964     ADD_NEW(state_offset + dlen + 1, 0);
1965     }
1966 nigel 77 }
1967     break;
1968    
1969     /*-----------------------------------------------------------------*/
1970     case OP_STAR:
1971     case OP_MINSTAR:
1972 nigel 93 case OP_POSSTAR:
1973 nigel 77 case OP_NOTSTAR:
1974     case OP_NOTMINSTAR:
1975 nigel 93 case OP_NOTPOSSTAR:
1976 nigel 77 ADD_ACTIVE(state_offset + dlen + 1, 0);
1977     if (clen > 0)
1978     {
1979 nigel 93 unsigned int otherd = NOTACHAR;
1980 nigel 91 if ((ims & PCRE_CASELESS) != 0)
1981 nigel 77 {
1982     #ifdef SUPPORT_UTF8
1983 nigel 87 if (utf8 && d >= 128)
1984 nigel 77 {
1985     #ifdef SUPPORT_UCP
1986 ph10 349 otherd = UCD_OTHERCASE(d);
1987 nigel 77 #endif /* SUPPORT_UCP */
1988     }
1989     else
1990     #endif /* SUPPORT_UTF8 */
1991     otherd = fcc[d];
1992     }
1993     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1994 nigel 93 {
1995     if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1996     {
1997     active_count--; /* Remove non-match possibility */
1998     next_active_state--;
1999     }
2000     ADD_NEW(state_offset, 0);
2001     }
2002 nigel 77 }
2003     break;
2004    
2005     /*-----------------------------------------------------------------*/
2006     case OP_EXACT:
2007 nigel 93 case OP_NOTEXACT:
2008     count = current_state->count; /* Number already matched */
2009     if (clen > 0)
2010     {
2011     unsigned int otherd = NOTACHAR;
2012     if ((ims & PCRE_CASELESS) != 0)
2013     {
2014     #ifdef SUPPORT_UTF8
2015     if (utf8 && d >= 128)
2016     {
2017     #ifdef SUPPORT_UCP
2018 ph10 349 otherd = UCD_OTHERCASE(d);
2019 nigel 93 #endif /* SUPPORT_UCP */
2020     }
2021     else
2022     #endif /* SUPPORT_UTF8 */
2023     otherd = fcc[d];
2024     }
2025     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2026     {
2027     if (++count >= GET2(code, 1))
2028     { ADD_NEW(state_offset + dlen + 3, 0); }
2029     else
2030     { ADD_NEW(state_offset, count); }
2031     }
2032     }
2033     break;
2034    
2035     /*-----------------------------------------------------------------*/
2036 nigel 77 case OP_UPTO:
2037     case OP_MINUPTO:
2038 nigel 93 case OP_POSUPTO:
2039 nigel 77 case OP_NOTUPTO:
2040     case OP_NOTMINUPTO:
2041 nigel 93 case OP_NOTPOSUPTO:
2042     ADD_ACTIVE(state_offset + dlen + 3, 0);
2043 nigel 77 count = current_state->count; /* Number already matched */
2044     if (clen > 0)
2045     {
2046 nigel 93 unsigned int otherd = NOTACHAR;
2047 nigel 77 if ((ims & PCRE_CASELESS) != 0)
2048     {
2049     #ifdef SUPPORT_UTF8
2050 nigel 87 if (utf8 && d >= 128)
2051 nigel 77 {
2052     #ifdef SUPPORT_UCP
2053 ph10 349 otherd = UCD_OTHERCASE(d);
2054 nigel 77 #endif /* SUPPORT_UCP */
2055     }
2056     else
2057     #endif /* SUPPORT_UTF8 */
2058     otherd = fcc[d];
2059     }
2060     if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2061     {
2062 nigel 93 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2063     {
2064     active_count--; /* Remove non-match possibility */
2065     next_active_state--;
2066     }
2067 nigel 77 if (++count >= GET2(code, 1))
2068     { ADD_NEW(state_offset + dlen + 3, 0); }
2069     else
2070     { ADD_NEW(state_offset, count); }
2071     }
2072     }
2073     break;
2074    
2075    
2076     /* ========================================================================== */
2077     /* These are the class-handling opcodes */
2078    
2079     case OP_CLASS:
2080     case OP_NCLASS:
2081     case OP_XCLASS:
2082     {
2083     BOOL isinclass = FALSE;
2084     int next_state_offset;
2085     const uschar *ecode;
2086    
2087     /* For a simple class, there is always just a 32-byte table, and we
2088     can set isinclass from it. */
2089    
2090     if (codevalue != OP_XCLASS)
2091     {
2092     ecode = code + 33;
2093     if (clen > 0)
2094     {
2095     isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2096     ((code[1 + c/8] & (1 << (c&7))) != 0);
2097     }
2098     }
2099    
2100     /* An extended class may have a table or a list of single characters,
2101     ranges, or both, and it may be positive or negative. There's a
2102     function that sorts all this out. */
2103    
2104     else
2105     {
2106     ecode = code + GET(code, 1);
2107     if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2108     }
2109    
2110     /* At this point, isinclass is set for all kinds of class, and ecode
2111     points to the byte after the end of the class. If there is a
2112     quantifier, this is where it will be. */
2113    
2114     next_state_offset = ecode - start_code;
2115    
2116     switch (*ecode)
2117     {
2118     case OP_CRSTAR:
2119     case OP_CRMINSTAR:
2120     ADD_ACTIVE(next_state_offset + 1, 0);
2121     if (isinclass) { ADD_NEW(state_offset, 0); }
2122     break;
2123    
2124     case OP_CRPLUS:
2125     case OP_CRMINPLUS:
2126     count = current_state->count; /* Already matched */
2127     if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2128     if (isinclass) { count++; ADD_NEW(state_offset, count); }
2129     break;
2130    
2131     case OP_CRQUERY:
2132     case OP_CRMINQUERY:
2133     ADD_ACTIVE(next_state_offset + 1, 0);
2134     if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2135     break;
2136    
2137     case OP_CRRANGE:
2138     case OP_CRMINRANGE:
2139     count = current_state->count; /* Already matched */
2140     if (count >= GET2(ecode, 1))
2141     { ADD_ACTIVE(next_state_offset + 5, 0); }
2142     if (isinclass)
2143     {
2144 nigel 91 int max = GET2(ecode, 3);
2145     if (++count >= max && max != 0) /* Max 0 => no limit */
2146 nigel 77 { ADD_NEW(next_state_offset + 5, 0); }
2147     else
2148     { ADD_NEW(state_offset, count); }
2149     }
2150     break;
2151    
2152     default:
2153     if (isinclass) { ADD_NEW(next_state_offset, 0); }
2154     break;
2155     }
2156     }
2157     break;
2158    
2159     /* ========================================================================== */
2160     /* These are the opcodes for fancy brackets of various kinds. We have
2161 ph10 345 to use recursion in order to handle them. The "always failing" assertion
2162 ph10 341 (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2163     though the other "backtracking verbs" are not supported. */
2164 ph10 345
2165 ph10 341 case OP_FAIL:
2166 ph10 345 break;
2167 nigel 77
2168     case OP_ASSERT:
2169     case OP_ASSERT_NOT:
2170     case OP_ASSERTBACK:
2171     case OP_ASSERTBACK_NOT:
2172     {
2173     int rc;
2174     int local_offsets[2];
2175     int local_workspace[1000];
2176     const uschar *endasscode = code + GET(code, 1);
2177    
2178     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2179    
2180     rc = internal_dfa_exec(
2181     md, /* static match data */
2182     code, /* this subexpression's code */
2183     ptr, /* where we currently are */
2184     ptr - start_subject, /* start offset */
2185     local_offsets, /* offset vector */
2186     sizeof(local_offsets)/sizeof(int), /* size of same */
2187     local_workspace, /* workspace vector */
2188     sizeof(local_workspace)/sizeof(int), /* size of same */
2189     ims, /* the current ims flags */
2190     rlevel, /* function recursion level */
2191     recursing); /* pass on regex recursion */
2192    
2193     if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2194     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2195     }
2196     break;
2197    
2198     /*-----------------------------------------------------------------*/
2199     case OP_COND:
2200 nigel 93 case OP_SCOND:
2201 nigel 77 {
2202     int local_offsets[1000];
2203     int local_workspace[1000];
2204     int condcode = code[LINK_SIZE+1];
2205    
2206 nigel 93 /* Back reference conditions are not supported */
2207 nigel 77
2208 nigel 93 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2209    
2210     /* The DEFINE condition is always false */
2211    
2212     if (condcode == OP_DEF)
2213 nigel 77 {
2214 nigel 93 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2215     }
2216    
2217     /* The only supported version of OP_RREF is for the value RREF_ANY,
2218     which means "test if in any recursion". We can't test for specifically
2219     recursed groups. */
2220    
2221     else if (condcode == OP_RREF)
2222     {
2223 nigel 77 int value = GET2(code, LINK_SIZE+2);
2224 nigel 93 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2225 nigel 77 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2226     else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2227     }
2228    
2229     /* Otherwise, the condition is an assertion */
2230    
2231     else
2232     {
2233     int rc;
2234     const uschar *asscode = code + LINK_SIZE + 1;
2235     const uschar *endasscode = asscode + GET(asscode, 1);
2236    
2237     while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2238    
2239     rc = internal_dfa_exec(
2240     md, /* fixed match data */
2241     asscode, /* this subexpression's code */
2242     ptr, /* where we currently are */
2243     ptr - start_subject, /* start offset */
2244     local_offsets, /* offset vector */
2245     sizeof(local_offsets)/sizeof(int), /* size of same */
2246     local_workspace, /* workspace vector */
2247     sizeof(local_workspace)/sizeof(int), /* size of same */
2248     ims, /* the current ims flags */
2249     rlevel, /* function recursion level */
2250     recursing); /* pass on regex recursion */
2251    
2252     if ((rc >= 0) ==
2253     (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2254     { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2255     else
2256     { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2257     }
2258     }
2259     break;
2260    
2261     /*-----------------------------------------------------------------*/
2262     case OP_RECURSE:
2263     {
2264     int local_offsets[1000];
2265     int local_workspace[1000];
2266     int rc;
2267    
2268     DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2269     recursing + 1));
2270    
2271     rc = internal_dfa_exec(
2272     md, /* fixed match data */
2273     start_code + GET(code, 1), /* this subexpression's code */
2274     ptr, /* where we currently are */
2275     ptr - start_subject, /* start offset */
2276     local_offsets, /* offset vector */
2277     sizeof(local_offsets)/sizeof(int), /* size of same */
2278     local_workspace, /* workspace vector */
2279     sizeof(local_workspace)/sizeof(int), /* size of same */
2280     ims, /* the current ims flags */
2281     rlevel, /* function recursion level */
2282     recursing + 1); /* regex recurse level */
2283    
2284     DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2285     recursing + 1, rc));
2286    
2287     /* Ran out of internal offsets */
2288    
2289     if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2290    
2291     /* For each successful matched substring, set up the next state with a
2292     count of characters to skip before trying it. Note that the count is in
2293     characters, not bytes. */
2294    
2295     if (rc > 0)
2296     {
2297     for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2298     {
2299     const uschar *p = start_subject + local_offsets[rc];
2300     const uschar *pp = start_subject + local_offsets[rc+1];
2301     int charcount = local_offsets[rc+1] - local_offsets[rc];
2302     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2303     if (charcount > 0)
2304     {
2305     ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2306     }
2307     else
2308     {
2309     ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2310     }
2311     }
2312     }
2313     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2314     }
2315     break;
2316    
2317     /*-----------------------------------------------------------------*/
2318     case OP_ONCE:
2319     {
2320     int local_offsets[2];
2321     int local_workspace[1000];
2322    
2323     int rc = internal_dfa_exec(
2324     md, /* fixed match data */
2325     code, /* this subexpression's code */
2326     ptr, /* where we currently are */
2327     ptr - start_subject, /* start offset */
2328     local_offsets, /* offset vector */
2329     sizeof(local_offsets)/sizeof(int), /* size of same */
2330     local_workspace, /* workspace vector */
2331     sizeof(local_workspace)/sizeof(int), /* size of same */
2332     ims, /* the current ims flags */
2333     rlevel, /* function recursion level */
2334     recursing); /* pass on regex recursion */
2335    
2336     if (rc >= 0)
2337     {
2338     const uschar *end_subpattern = code;
2339     int charcount = local_offsets[1] - local_offsets[0];
2340     int next_state_offset, repeat_state_offset;
2341    
2342     do { end_subpattern += GET(end_subpattern, 1); }
2343     while (*end_subpattern == OP_ALT);
2344     next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2345    
2346     /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2347     arrange for the repeat state also to be added to the relevant list.
2348     Calculate the offset, or set -1 for no repeat. */
2349    
2350     repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2351     *end_subpattern == OP_KETRMIN)?
2352     end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2353    
2354     /* If we have matched an empty string, add the next state at the
2355     current character pointer. This is important so that the duplicate
2356     checking kicks in, which is what breaks infinite loops that match an
2357     empty string. */
2358    
2359     if (charcount == 0)
2360     {
2361     ADD_ACTIVE(next_state_offset, 0);
2362     }
2363    
2364     /* Optimization: if there are no more active states, and there
2365     are no new states yet set up, then skip over the subject string
2366     right here, to save looping. Otherwise, set up the new state to swing
2367     into action when the end of the substring is reached. */
2368    
2369     else if (i + 1 >= active_count && new_count == 0)
2370     {
2371     ptr += charcount;
2372     clen = 0;
2373     ADD_NEW(next_state_offset, 0);
2374    
2375     /* If we are adding a repeat state at the new character position,
2376     we must fudge things so that it is the only current state.
2377     Otherwise, it might be a duplicate of one we processed before, and
2378     that would cause it to be skipped. */
2379    
2380     if (repeat_state_offset >= 0)
2381     {
2382     next_active_state = active_states;
2383     active_count = 0;
2384     i = -1;
2385     ADD_ACTIVE(repeat_state_offset, 0);
2386     }
2387     }
2388     else
2389     {
2390     const uschar *p = start_subject + local_offsets[0];
2391     const uschar *pp = start_subject + local_offsets[1];
2392     while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2393     ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2394     if (repeat_state_offset >= 0)
2395     { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2396     }
2397    
2398     }
2399     else if (rc != PCRE_ERROR_NOMATCH) return rc;
2400     }
2401     break;
2402    
2403    
2404     /* ========================================================================== */
2405     /* Handle callouts */
2406    
2407     case OP_CALLOUT:
2408     if (pcre_callout != NULL)
2409     {
2410     int rrc;
2411     pcre_callout_block cb;
2412     cb.version = 1; /* Version 1 of the callout block */
2413     cb.callout_number = code[1];
2414     cb.offset_vector = offsets;
2415 nigel 87 cb.subject = (PCRE_SPTR)start_subject;
2416 nigel 77 cb.subject_length = end_subject - start_subject;
2417     cb.start_match = current_subject - start_subject;
2418     cb.current_position = ptr - start_subject;
2419     cb.pattern_position = GET(code, 2);
2420     cb.next_item_length = GET(code, 2 + LINK_SIZE);
2421     cb.capture_top = 1;
2422     cb.capture_last = -1;
2423     cb.callout_data = md->callout_data;
2424     if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2425     if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2426     }
2427     break;
2428    
2429    
2430     /* ========================================================================== */
2431     default: /* Unsupported opcode */
2432     return PCRE_ERROR_DFA_UITEM;
2433     }
2434    
2435     NEXT_ACTIVE_STATE: continue;
2436    
2437     } /* End of loop scanning active states */
2438    
2439     /* We have finished the processing at the current subject character. If no
2440     new states have been set for the next character, we have found all the
2441     matches that we are going to find. If we are at the top level and partial
2442     matching has been requested, check for appropriate conditions. */
2443    
2444     if (new_count <= 0)
2445     {
2446     if (match_count < 0 && /* No matches found */
2447     rlevel == 1 && /* Top level match function */
2448     (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2449     ptr >= end_subject && /* Reached end of subject */
2450     ptr > current_subject) /* Matched non-empty string */
2451     {
2452     if (offsetcount >= 2)
2453     {
2454     offsets[0] = current_subject - start_subject;
2455     offsets[1] = end_subject - start_subject;
2456     }
2457     match_count = PCRE_ERROR_PARTIAL;
2458     }
2459    
2460     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2461     "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2462     rlevel*2-2, SP));
2463 nigel 91 break; /* In effect, "return", but see the comment below */
2464 nigel 77 }
2465    
2466     /* One or more states are active for the next character. */
2467    
2468     ptr += clen; /* Advance to next subject character */
2469     } /* Loop to move along the subject string */
2470    
2471 nigel 91 /* Control gets here from "break" a few lines above. We do it this way because
2472     if we use "return" above, we have compiler trouble. Some compilers warn if
2473     there's nothing here because they think the function doesn't return a value. On
2474     the other hand, if we put a dummy statement here, some more clever compilers
2475     complain that it can't be reached. Sigh. */
2476 nigel 77
2477 nigel 91 return match_count;
2478 nigel 77 }
2479    
2480    
2481    
2482    
2483     /*************************************************
2484     * Execute a Regular Expression - DFA engine *
2485     *************************************************/
2486    
2487     /* This external function applies a compiled re to a subject string using a DFA
2488     engine. This function calls the internal function multiple times if the pattern
2489     is not anchored.
2490    
2491     Arguments:
2492     argument_re points to the compiled expression
2493 ph10 97 extra_data points to extra data or is NULL
2494 nigel 77 subject points to the subject string
2495     length length of subject string (may contain binary zeros)
2496     start_offset where to start in the subject string
2497     options option bits
2498     offsets vector of match offsets
2499     offsetcount size of same
2500     workspace workspace vector
2501     wscount size of same
2502    
2503     Returns: > 0 => number of match offset pairs placed in offsets
2504     = 0 => offsets overflowed; longest matches are present
2505     -1 => failed to match
2506     < -1 => some kind of unexpected problem
2507     */
2508    
2509 ph10 359 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2510 nigel 77 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2511     const char *subject, int length, int start_offset, int options, int *offsets,
2512     int offsetcount, int *workspace, int wscount)
2513     {
2514     real_pcre *re = (real_pcre *)argument_re;
2515     dfa_match_data match_block;
2516 nigel 91 dfa_match_data *md = &match_block;
2517 nigel 77 BOOL utf8, anchored, startline, firstline;
2518     const uschar *current_subject, *end_subject, *lcc;
2519    
2520     pcre_study_data internal_study;
2521     const pcre_study_data *study = NULL;
2522     real_pcre internal_re;
2523    
2524     const uschar *req_byte_ptr;
2525     const uschar *start_bits = NULL;
2526     BOOL first_byte_caseless = FALSE;
2527     BOOL req_byte_caseless = FALSE;
2528     int first_byte = -1;
2529     int req_byte = -1;
2530     int req_byte2 = -1;
2531 nigel 91 int newline;
2532 nigel 77
2533     /* Plausibility checks */
2534    
2535     if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2536     if (re == NULL || subject == NULL || workspace == NULL ||
2537     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2538     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2539     if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2540    
2541     /* We need to find the pointer to any study data before we test for byte
2542     flipping, so we scan the extra_data block first. This may set two fields in the
2543     match block, so we must initialize them beforehand. However, the other fields
2544     in the match block must not be set until after the byte flipping. */
2545    
2546 nigel 91 md->tables = re->tables;
2547     md->callout_data = NULL;
2548 nigel 77
2549     if (extra_data != NULL)
2550     {
2551     unsigned int flags = extra_data->flags;
2552     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2553     study = (const pcre_study_data *)extra_data->study_data;
2554     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2555 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2556     return PCRE_ERROR_DFA_UMLIMIT;
2557 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2558 nigel 91 md->callout_data = extra_data->callout_data;
2559 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0)
2560 nigel 91 md->tables = extra_data->tables;
2561 nigel 77 }
2562    
2563     /* Check that the first field in the block is the magic number. If it is not,
2564     test for a regex that was compiled on a host of opposite endianness. If this is
2565     the case, flipped values are put in internal_re and internal_study if there was
2566     study data too. */
2567    
2568     if (re->magic_number != MAGIC_NUMBER)
2569     {
2570     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2571     if (re == NULL) return PCRE_ERROR_BADMAGIC;
2572     if (study != NULL) study = &internal_study;
2573     }
2574    
2575     /* Set some local values */
2576    
2577     current_subject = (const unsigned char *)subject + start_offset;
2578     end_subject = (const unsigned char *)subject + length;
2579     req_byte_ptr = current_subject - 1;
2580    
2581 nigel 91 #ifdef SUPPORT_UTF8
2582 nigel 77 utf8 = (re->options & PCRE_UTF8) != 0;
2583 nigel 91 #else
2584     utf8 = FALSE;
2585     #endif
2586 nigel 77
2587 nigel 87 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2588     (re->options & PCRE_ANCHORED) != 0;
2589    
2590 nigel 77 /* The remaining fixed data for passing around. */
2591    
2592 nigel 91 md->start_code = (const uschar *)argument_re +
2593 nigel 77 re->name_table_offset + re->name_count * re->name_entry_size;
2594 nigel 91 md->start_subject = (const unsigned char *)subject;
2595     md->end_subject = end_subject;
2596     md->moptions = options;
2597     md->poptions = re->options;
2598 nigel 77
2599 ph10 231 /* If the BSR option is not set at match time, copy what was set
2600     at compile time. */
2601    
2602     if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2603     {
2604     if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2605     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2606     #ifdef BSR_ANYCRLF
2607     else md->moptions |= PCRE_BSR_ANYCRLF;
2608 ph10 243 #endif
2609     }
2610 ph10 231
2611 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
2612     nothing is set at run time, whatever was used at compile time applies. */
2613 nigel 91
2614 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2615 nigel 93 PCRE_NEWLINE_BITS)
2616 nigel 91 {
2617 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
2618 ph10 391 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2619     case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2620 nigel 91 case PCRE_NEWLINE_CR+
2621 ph10 391 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2622 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
2623 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2624 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
2625 nigel 91 }
2626    
2627 ph10 149 if (newline == -2)
2628 nigel 91 {
2629 ph10 149 md->nltype = NLTYPE_ANYCRLF;
2630     }
2631     else if (newline < 0)
2632     {
2633 nigel 93 md->nltype = NLTYPE_ANY;
2634 nigel 91 }
2635     else
2636     {
2637 nigel 93 md->nltype = NLTYPE_FIXED;
2638     if (newline > 255)
2639     {
2640     md->nllen = 2;
2641     md->nl[0] = (newline >> 8) & 255;
2642     md->nl[1] = newline & 255;
2643     }
2644     else
2645     {
2646     md->nllen = 1;
2647     md->nl[0] = newline;
2648     }
2649 nigel 91 }
2650    
2651 nigel 77 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2652     back the character offset. */
2653    
2654     #ifdef SUPPORT_UTF8
2655     if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2656     {
2657     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2658     return PCRE_ERROR_BADUTF8;
2659     if (start_offset > 0 && start_offset < length)
2660     {
2661     int tb = ((uschar *)subject)[start_offset];
2662     if (tb > 127)
2663     {
2664     tb &= 0xc0;
2665     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2666     }
2667     }
2668     }
2669     #endif
2670    
2671     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2672     is a feature that makes it possible to save compiled regex and re-use them
2673     in other programs later. */
2674    
2675 nigel 91 if (md->tables == NULL) md->tables = _pcre_default_tables;
2676 nigel 77
2677     /* The lower casing table and the "must be at the start of a line" flag are
2678     used in a loop when finding where to start. */
2679    
2680 nigel 91 lcc = md->tables + lcc_offset;
2681 ph10 230 startline = (re->flags & PCRE_STARTLINE) != 0;
2682 nigel 77 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2683    
2684     /* Set up the first character to match, if available. The first_byte value is
2685     never set for an anchored regular expression, but the anchoring may be forced
2686     at run time, so we have to test for anchoring. The first char may be unset for
2687     an unanchored pattern, of course. If there's no first char and the pattern was
2688     studied, there may be a bitmap of possible first characters. */
2689    
2690     if (!anchored)
2691     {
2692 ph10 230 if ((re->flags & PCRE_FIRSTSET) != 0)
2693 nigel 77 {
2694     first_byte = re->first_byte & 255;
2695     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2696     first_byte = lcc[first_byte];
2697     }
2698     else
2699     {
2700     if (startline && study != NULL &&
2701     (study->options & PCRE_STUDY_MAPPED) != 0)
2702     start_bits = study->start_bits;
2703     }
2704     }
2705    
2706     /* For anchored or unanchored matches, there may be a "last known required
2707     character" set. */
2708    
2709 ph10 230 if ((re->flags & PCRE_REQCHSET) != 0)
2710 nigel 77 {
2711     req_byte = re->req_byte & 255;
2712     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2713 nigel 91 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2714 nigel 77 }
2715    
2716     /* Call the main matching function, looping for a non-anchored regex after a
2717 ph10 389 failed match. If not restarting, perform certain optimizations at the start of
2718     a match. */
2719 nigel 77
2720     for (;;)
2721     {
2722     int rc;
2723    
2724     if ((options & PCRE_DFA_RESTART) == 0)
2725     {
2726     const uschar *save_end_subject = end_subject;
2727    
2728 ph10 389 /* If firstline is TRUE, the start of the match is constrained to the first
2729     line of a multiline string. Implement this by temporarily adjusting
2730     end_subject so that we stop scanning at a newline. If the match fails at
2731     the newline, later code breaks this loop. */
2732 nigel 77
2733     if (firstline)
2734     {
2735 ph10 365 USPTR t = current_subject;
2736     #ifdef SUPPORT_UTF8
2737     if (utf8)
2738 ph10 371 {
2739     while (t < md->end_subject && !IS_NEWLINE(t))
2740 ph10 365 {
2741     t++;
2742     while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2743 ph10 371 }
2744 ph10 365 }
2745     else
2746 ph10 371 #endif
2747 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2748 nigel 77 end_subject = t;
2749     }
2750 ph10 389
2751     /* There are some optimizations that avoid running the match if a known
2752     starting point is not found, or if a known later character is not present.
2753     However, there is an option that disables these, for testing and for
2754     ensuring that all callouts do actually occur. */
2755 nigel 77
2756 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2757     {
2758    
2759     /* Advance to a known first byte. */
2760    
2761     if (first_byte >= 0)
2762 nigel 77 {
2763 ph10 389 if (first_byte_caseless)
2764     while (current_subject < end_subject &&
2765     lcc[*current_subject] != first_byte)
2766     current_subject++;
2767     else
2768     while (current_subject < end_subject &&
2769     *current_subject != first_byte)
2770     current_subject++;
2771     }
2772    
2773     /* Or to just after a linebreak for a multiline match if possible */
2774    
2775     else if (startline)
2776     {
2777     if (current_subject > md->start_subject + start_offset)
2778     {
2779 ph10 365 #ifdef SUPPORT_UTF8
2780 ph10 389 if (utf8)
2781 ph10 365 {
2782 ph10 389 while (current_subject < end_subject &&
2783     !WAS_NEWLINE(current_subject))
2784     {
2785 ph10 365 current_subject++;
2786 ph10 389 while(current_subject < end_subject &&
2787     (*current_subject & 0xc0) == 0x80)
2788     current_subject++;
2789     }
2790 ph10 371 }
2791 ph10 389 else
2792     #endif
2793     while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2794     current_subject++;
2795    
2796     /* If we have just passed a CR and the newline option is ANY or
2797     ANYCRLF, and we are now at a LF, advance the match position by one
2798     more character. */
2799    
2800 ph10 391 if (current_subject[-1] == CHAR_CR &&
2801 ph10 389 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2802     current_subject < end_subject &&
2803 ph10 391 *current_subject == CHAR_NL)
2804 ph10 389 current_subject++;
2805 ph10 365 }
2806 nigel 77 }
2807 ph10 389
2808     /* Or to a non-unique first char after study */
2809    
2810     else if (start_bits != NULL)
2811 nigel 77 {
2812 ph10 389 while (current_subject < end_subject)
2813     {
2814     register unsigned int c = *current_subject;
2815     if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2816     else break;
2817     }
2818 nigel 77 }
2819 ph10 389 }
2820 nigel 77
2821     /* Restore fudged end_subject */
2822    
2823     end_subject = save_end_subject;
2824     }
2825    
2826     /* If req_byte is set, we know that that character must appear in the subject
2827     for the match to succeed. If the first character is set, req_byte must be
2828     later in the subject; otherwise the test starts at the match point. This
2829     optimization can save a huge amount of work in patterns with nested unlimited
2830     repeats that aren't going to match. Writing separate code for cased/caseless
2831     versions makes it go faster, as does using an autoincrement and backing off
2832     on a match.
2833    
2834     HOWEVER: when the subject string is very, very long, searching to its end can
2835     take a long time, and give bad performance on quite ordinary patterns. This
2836     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2837     don't do this when the string is sufficiently long.
2838    
2839 ph10 389 ALSO: this processing is disabled when partial matching is requested, and can
2840     also be explicitly deactivated. */
2841 nigel 77
2842 ph10 389 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2843     req_byte >= 0 &&
2844 nigel 77 end_subject - current_subject < REQ_BYTE_MAX &&
2845     (options & PCRE_PARTIAL) == 0)
2846     {
2847     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2848    
2849     /* We don't need to repeat the search if we haven't yet reached the
2850     place we found it at last time. */
2851    
2852     if (p > req_byte_ptr)
2853     {
2854     if (req_byte_caseless)
2855     {
2856     while (p < end_subject)
2857     {
2858     register int pp = *p++;
2859     if (pp == req_byte || pp == req_byte2) { p--; break; }
2860     }
2861     }
2862     else
2863     {
2864     while (p < end_subject)
2865     {
2866     if (*p++ == req_byte) { p--; break; }
2867     }
2868     }
2869    
2870     /* If we can't find the required character, break the matching loop,
2871     which will cause a return of PCRE_ERROR_NOMATCH. */
2872    
2873     if (p >= end_subject) break;
2874    
2875     /* If we have found the required character, save the point where we
2876     found it, so that we don't search again next time round the loop if
2877     the start hasn't passed this character yet. */
2878    
2879     req_byte_ptr = p;
2880     }
2881     }
2882    
2883     /* OK, now we can do the business */
2884    
2885     rc = internal_dfa_exec(
2886 nigel 91 md, /* fixed match data */
2887     md->start_code, /* this subexpression's code */
2888     current_subject, /* where we currently are */
2889     start_offset, /* start offset in subject */
2890     offsets, /* offset vector */
2891     offsetcount, /* size of same */
2892     workspace, /* workspace vector */
2893     wscount, /* size of same */
2894 nigel 77 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2895 nigel 91 0, /* function recurse level */
2896     0); /* regex recurse level */
2897 nigel 77
2898     /* Anything other than "no match" means we are done, always; otherwise, carry
2899     on only if not anchored. */
2900    
2901     if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2902    
2903     /* Advance to the next subject character unless we are at the end of a line
2904     and firstline is set. */
2905    
2906 nigel 93 if (firstline && IS_NEWLINE(current_subject)) break;
2907 nigel 77 current_subject++;
2908     if (utf8)
2909     {
2910     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2911     current_subject++;
2912     }
2913     if (current_subject > end_subject) break;
2914    
2915 ph10 227 /* If we have just passed a CR and we are now at a LF, and the pattern does
2916 ph10 226 not contain any explicit matches for \r or \n, and the newline option is CRLF
2917     or ANY or ANYCRLF, advance the match position by one more character. */
2918 nigel 93
2919 ph10 391 if (current_subject[-1] == CHAR_CR &&
2920 ph10 226 current_subject < end_subject &&
2921 ph10 391 *current_subject == CHAR_NL &&
2922 ph10 230 (re->flags & PCRE_HASCRORLF) == 0 &&
2923 ph10 226 (md->nltype == NLTYPE_ANY ||
2924     md->nltype == NLTYPE_ANYCRLF ||
2925     md->nllen == 2))
2926 nigel 93 current_subject++;
2927    
2928     } /* "Bumpalong" loop */
2929    
2930 nigel 77 return PCRE_ERROR_NOMATCH;
2931     }
2932    
2933     /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12