/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 435 - (show annotations) (download)
Sat Sep 5 10:20:44 2009 UTC (4 years, 10 months ago) by ph10
File MIME type: text/plain
File size: 99547 byte(s)
Further updates to partial matching.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2009 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 #ifdef HAVE_CONFIG_H
49 #include "config.h"
50 #endif
51
52 #define NLBLOCK md /* Block containing newline information */
53 #define PSSTART start_subject /* Field containing processed string start */
54 #define PSEND end_subject /* Field containing processed string end */
55
56 #include "pcre_internal.h"
57
58
59 /* For use to indent debugging output */
60
61 #define SP " "
62
63
64 /*************************************************
65 * Code parameters and static tables *
66 *************************************************/
67
/* Offsets that are added to OP_TYPESTAR and its friends to turn them into
other, "virtual" opcodes under special conditions. Each block starts 20 above
the previous one, which should be enough room. The resulting values do not
have to fit below 256 because they are never stored, so they are pushed well
clear of the normal opcodes. */

#define OP_PROP_EXTRA    300
#define OP_EXTUNI_EXTRA  (OP_PROP_EXTRA   + 20)
#define OP_ANYNL_EXTRA   (OP_EXTUNI_EXTRA + 20)
#define OP_HSPACE_EXTRA  (OP_ANYNL_EXTRA  + 20)
#define OP_VSPACE_EXTRA  (OP_HSPACE_EXTRA + 20)
78
79
80 /* This table identifies those opcodes that are followed immediately by a
81 character that is to be tested in some way. This makes is possible to
82 centralize the loading of these characters. In the case of Type * etc, the
83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 small value. ***NOTE*** If the start of this table is modified, the two tables
85 that follow must also be modified. */
86
87 static const uschar coptable[] = {
88 0, /* End */
89 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 0, 0, 0, /* Any, AllAny, Anybyte */
92 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95 1, /* Char */
96 1, /* Charnc */
97 1, /* not */
98 /* Positive single-char repeats */
99 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* upto, minupto, exact */
101 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 /* Negative single-char repeats - only for chars < 256 */
103 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104 3, 3, 3, /* NOT upto, minupto, exact */
105 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 /* Positive type repeats */
107 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108 3, 3, 3, /* Type upto, minupto, exact */
109 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 /* Character class & ref repeats */
111 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112 0, 0, /* CRRANGE, CRMINRANGE */
113 0, /* CLASS */
114 0, /* NCLASS */
115 0, /* XCLASS - variable length */
116 0, /* REF */
117 0, /* RECURSE */
118 0, /* CALLOUT */
119 0, /* Alt */
120 0, /* Ket */
121 0, /* KetRmax */
122 0, /* KetRmin */
123 0, /* Assert */
124 0, /* Assert not */
125 0, /* Assert behind */
126 0, /* Assert behind not */
127 0, /* Reverse */
128 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129 0, 0, 0, /* SBRA, SCBRA, SCOND */
130 0, /* CREF */
131 0, /* RREF */
132 0, /* DEF */
133 0, 0, /* BRAZERO, BRAMINZERO */
134 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136 };
137
138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139 and \w */
140
141 static const uschar toptable1[] = {
142 0, 0, 0, 0, 0, 0,
143 ctype_digit, ctype_digit,
144 ctype_space, ctype_space,
145 ctype_word, ctype_word,
146 0, 0 /* OP_ANY, OP_ALLANY */
147 };
148
149 static const uschar toptable2[] = {
150 0, 0, 0, 0, 0, 0,
151 ctype_digit, 0,
152 ctype_space, 0,
153 ctype_word, 0,
154 1, 1 /* OP_ANY, OP_ALLANY */
155 };
156
157
/* A stateblock holds the data about one particular state, which is in effect
the current data for one active path through the match tree. Every member must
be an int, because the structures are placed in the working vector that we are
passed, and that is a vector of ints. */

struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
};

typedef struct stateblock stateblock;

/* The number of ints in the working vector that one stateblock occupies */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock) / sizeof(int))
171
172
173 #ifdef DEBUG
174 /*************************************************
175 * Print character string *
176 *************************************************/
177
/* Character string printing function for debugging. Each byte for which
isprint() is true is written as itself; every other byte is written as a
backslash-x escape with two hexadecimal digits. The string is only read, so it
is taken as a pointer to const and callers holding a const pointer do not need
to cast.

Arguments:
  p           points to string (not modified)
  length      number of bytes
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;       /* Always 0..255, so it is a valid isprint() argument */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
200 #endif
201
202
203
204 /*************************************************
205 * Execute a Regular Expression - DFA engine *
206 *************************************************/
207
208 /* This internal function applies a compiled pattern to a subject string,
209 starting at a given point, using a DFA engine. This function is called from the
210 external one, possibly multiple times if the pattern is not anchored. The
211 function calls itself recursively for some kinds of subpattern.
212
213 Arguments:
214 md the match_data block with fixed information
215 this_start_code the opening bracket of this subexpression's code
216 current_subject where we currently are in the subject string
217 start_offset start offset in the subject string
218 offsets vector to contain the matching string offsets
219 offsetcount size of same
220 workspace vector of workspace
221 wscount size of same
222 ims the current ims flags
223 rlevel function call recursion level
224 recursing regex recursive call level
225
226 Returns: > 0 => number of match offset pairs placed in offsets
227 = 0 => offsets overflowed; longest matches are present
228 -1 => failed to match
229 < -1 => some kind of unexpected problem
230
231 The following macros are used for adding states to the two state vectors (one
232 for the current character, one for the following character). */
233
/* Add a state with opcode offset x and repeat count y to the end of the
vector of states for the current character. The ims field is copied from the
ims variable of the enclosing scope; the data field is not written. If the
vector already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. Must be used as a statement, followed by a semicolon. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
244
/* Add a state with opcode offset x, repeat count y and extra data z to the
end of the vector of states for the current character. The ims field is copied
from the ims variable of the enclosing scope. If the vector already holds
wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Must be
used as a statement, followed by a semicolon. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
256
/* Add a state with opcode offset x and repeat count y to the end of the
vector of states for the following character. The ims field is copied from the
ims variable of the enclosing scope; the data field is not written. If the
vector already holds wscount states, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE. Must be used as a statement, followed by a semicolon. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
267
/* Add a state with opcode offset x, repeat count y and extra data z to the
end of the vector of states for the following character. The ims field is
copied from the ims variable of the enclosing scope. If the vector already
holds wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE.
Must be used as a statement, followed by a semicolon. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
279
280 /* And now, here is the code */
281
282 static int
283 internal_dfa_exec(
284 dfa_match_data *md,
285 const uschar *this_start_code,
286 const uschar *current_subject,
287 int start_offset,
288 int *offsets,
289 int offsetcount,
290 int *workspace,
291 int wscount,
292 int ims,
293 int rlevel,
294 int recursing)
295 {
296 stateblock *active_states, *new_states, *temp_states;
297 stateblock *next_active_state, *next_new_state;
298
299 const uschar *ctypes, *lcc, *fcc;
300 const uschar *ptr;
301 const uschar *end_code, *first_op;
302
303 int active_count, new_count, match_count;
304
305 /* Some fields in the md block are frequently referenced, so we load them into
306 independent variables in the hope that this will perform better. */
307
308 const uschar *start_subject = md->start_subject;
309 const uschar *end_subject = md->end_subject;
310 const uschar *start_code = md->start_code;
311
312 #ifdef SUPPORT_UTF8
313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 #else
315 BOOL utf8 = FALSE;
316 #endif
317
318 rlevel++;
319 offsetcount &= (-2);
320
321 wscount -= 2;
322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323 (2 * INTS_PER_STATEBLOCK);
324
325 DPRINTF(("\n%.*s---------------------\n"
326 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328
329 ctypes = md->tables + ctypes_offset;
330 lcc = md->tables + lcc_offset;
331 fcc = md->tables + fcc_offset;
332
333 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334
335 active_states = (stateblock *)(workspace + 2);
336 next_new_state = new_states = active_states + wscount;
337 new_count = 0;
338
339 first_op = this_start_code + 1 + LINK_SIZE +
340 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341
342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343 the alternative states onto the list, and find out where the end is. This
344 makes it possible to use this function recursively, when we want to stop at a
345 matching internal ket rather than at the end.
346
347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348 a backward assertion. In that case, we have to find out the maximum amount to
349 move back, and set up each alternative appropriately. */
350
351 if (*first_op == OP_REVERSE)
352 {
353 int max_back = 0;
354 int gone_back;
355
356 end_code = this_start_code;
357 do
358 {
359 int back = GET(end_code, 2+LINK_SIZE);
360 if (back > max_back) max_back = back;
361 end_code += GET(end_code, 1);
362 }
363 while (*end_code == OP_ALT);
364
365 /* If we can't go back the amount required for the longest lookbehind
366 pattern, go back as far as we can; some alternatives may still be viable. */
367
368 #ifdef SUPPORT_UTF8
369 /* In character mode we have to step back character by character */
370
371 if (utf8)
372 {
373 for (gone_back = 0; gone_back < max_back; gone_back++)
374 {
375 if (current_subject <= start_subject) break;
376 current_subject--;
377 while (current_subject > start_subject &&
378 (*current_subject & 0xc0) == 0x80)
379 current_subject--;
380 }
381 }
382 else
383 #endif
384
385 /* In byte-mode we can do this quickly. */
386
387 {
388 gone_back = (current_subject - max_back < start_subject)?
389 current_subject - start_subject : max_back;
390 current_subject -= gone_back;
391 }
392
393 /* Save the earliest consulted character */
394
395 if (current_subject < md->start_used_ptr)
396 md->start_used_ptr = current_subject;
397
398 /* Now we can process the individual branches. */
399
400 end_code = this_start_code;
401 do
402 {
403 int back = GET(end_code, 2+LINK_SIZE);
404 if (back <= gone_back)
405 {
406 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
407 ADD_NEW_DATA(-bstate, 0, gone_back - back);
408 }
409 end_code += GET(end_code, 1);
410 }
411 while (*end_code == OP_ALT);
412 }
413
414 /* This is the code for a "normal" subpattern (not a backward assertion). The
415 start of a whole pattern is always one of these. If we are at the top level,
416 we may be asked to restart matching from the same point that we reached for a
417 previous partial match. We still have to scan through the top-level branches to
418 find the end state. */
419
420 else
421 {
422 end_code = this_start_code;
423
424 /* Restarting */
425
426 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
427 {
428 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
429 new_count = workspace[1];
430 if (!workspace[0])
431 memcpy(new_states, active_states, new_count * sizeof(stateblock));
432 }
433
434 /* Not restarting */
435
436 else
437 {
438 int length = 1 + LINK_SIZE +
439 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
440 do
441 {
442 ADD_NEW(end_code - start_code + length, 0);
443 end_code += GET(end_code, 1);
444 length = 1 + LINK_SIZE;
445 }
446 while (*end_code == OP_ALT);
447 }
448 }
449
450 workspace[0] = 0; /* Bit indicating which vector is current */
451
452 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
453
454 /* Loop for scanning the subject */
455
456 ptr = current_subject;
457 for (;;)
458 {
459 int i, j;
460 int clen, dlen;
461 unsigned int c, d;
462 int forced_fail = 0;
463 int reached_end = 0;
464
465 /* Make the new state list into the active state list and empty the
466 new state list. */
467
468 temp_states = active_states;
469 active_states = new_states;
470 new_states = temp_states;
471 active_count = new_count;
472 new_count = 0;
473
474 workspace[0] ^= 1; /* Remember for the restarting feature */
475 workspace[1] = active_count;
476
477 #ifdef DEBUG
478 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
479 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
480 printf("\"\n");
481
482 printf("%.*sActive states: ", rlevel*2-2, SP);
483 for (i = 0; i < active_count; i++)
484 printf("%d/%d ", active_states[i].offset, active_states[i].count);
485 printf("\n");
486 #endif
487
488 /* Set the pointers for adding new states */
489
490 next_active_state = active_states + active_count;
491 next_new_state = new_states;
492
493 /* Load the current character from the subject outside the loop, as many
494 different states may want to look at it, and we assume that at least one
495 will. */
496
497 if (ptr < end_subject)
498 {
499 clen = 1; /* Number of bytes in the character */
500 #ifdef SUPPORT_UTF8
501 if (utf8) { GETCHARLEN(c, ptr, clen); } else
502 #endif /* SUPPORT_UTF8 */
503 c = *ptr;
504 }
505 else
506 {
507 clen = 0; /* This indicates the end of the subject */
508 c = NOTACHAR; /* This value should never actually be used */
509 }
510
511 /* Scan up the active states and act on each one. The result of an action
512 may be to add more states to the currently active list (e.g. on hitting a
513 parenthesis) or it may be to put states on the new list, for considering
514 when we move the character pointer on. */
515
516 for (i = 0; i < active_count; i++)
517 {
518 stateblock *current_state = active_states + i;
519 const uschar *code;
520 int state_offset = current_state->offset;
521 int count, codevalue, rrc;
522
523 #ifdef DEBUG
524 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
525 if (clen == 0) printf("EOL\n");
526 else if (c > 32 && c < 127) printf("'%c'\n", c);
527 else printf("0x%02x\n", c);
528 #endif
529
530 /* This variable is referred to implicitly in the ADD_xxx macros. */
531
532 ims = current_state->ims;
533
534 /* A negative offset is a special case meaning "hold off going to this
535 (negated) state until the number of characters in the data field have
536 been skipped". */
537
538 if (state_offset < 0)
539 {
540 if (current_state->data > 0)
541 {
542 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
543 ADD_NEW_DATA(state_offset, current_state->count,
544 current_state->data - 1);
545 continue;
546 }
547 else
548 {
549 current_state->offset = state_offset = -state_offset;
550 }
551 }
552
553 /* Check for a duplicate state with the same count, and skip if found. */
554
555 for (j = 0; j < i; j++)
556 {
557 if (active_states[j].offset == state_offset &&
558 active_states[j].count == current_state->count)
559 {
560 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
561 goto NEXT_ACTIVE_STATE;
562 }
563 }
564
565 /* The state offset is the offset to the opcode */
566
567 code = start_code + state_offset;
568 codevalue = *code;
569
570 /* If this opcode is followed by an inline character, load it. It is
571 tempting to test for the presence of a subject character here, but that
572 is wrong, because sometimes zero repetitions of the subject are
573 permitted.
574
575 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
576 argument that is not a data character - but is always one byte long. We
577 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
578 this case. To keep the other cases fast, convert these ones to new opcodes.
579 */
580
581 if (coptable[codevalue] > 0)
582 {
583 dlen = 1;
584 #ifdef SUPPORT_UTF8
585 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
586 #endif /* SUPPORT_UTF8 */
587 d = code[coptable[codevalue]];
588 if (codevalue >= OP_TYPESTAR)
589 {
590 switch(d)
591 {
592 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
593 case OP_NOTPROP:
594 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
595 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
596 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
597 case OP_NOT_HSPACE:
598 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
599 case OP_NOT_VSPACE:
600 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
601 default: break;
602 }
603 }
604 }
605 else
606 {
607 dlen = 0; /* Not strictly necessary, but compilers moan */
608 d = NOTACHAR; /* if these variables are not set. */
609 }
610
611
612 /* Now process the individual opcodes */
613
614 switch (codevalue)
615 {
616
617 /* ========================================================================== */
618 /* Reached a closing bracket. If not at the end of the pattern, carry
619 on with the next opcode. Otherwise, unless we have an empty string and
620 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
621 matches so we always have the longest first. */
622
623 case OP_KET:
624 case OP_KETRMIN:
625 case OP_KETRMAX:
626 if (code != end_code)
627 {
628 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
629 if (codevalue != OP_KET)
630 {
631 ADD_ACTIVE(state_offset - GET(code, 1), 0);
632 }
633 }
634 else
635 {
636 reached_end++; /* Count branches that reach the end */
637 if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
638 {
639 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
640 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
641 match_count = 0;
642 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
643 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
644 if (offsetcount >= 2)
645 {
646 offsets[0] = current_subject - start_subject;
647 offsets[1] = ptr - start_subject;
648 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
649 offsets[1] - offsets[0], current_subject));
650 }
651 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
652 {
653 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
654 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
655 match_count, rlevel*2-2, SP));
656 return match_count;
657 }
658 }
659 }
660 break;
661
662 /* ========================================================================== */
663 /* These opcodes add to the current list of states without looking
664 at the current character. */
665
666 /*-----------------------------------------------------------------*/
667 case OP_ALT:
668 do { code += GET(code, 1); } while (*code == OP_ALT);
669 ADD_ACTIVE(code - start_code, 0);
670 break;
671
672 /*-----------------------------------------------------------------*/
673 case OP_BRA:
674 case OP_SBRA:
675 do
676 {
677 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
678 code += GET(code, 1);
679 }
680 while (*code == OP_ALT);
681 break;
682
683 /*-----------------------------------------------------------------*/
684 case OP_CBRA:
685 case OP_SCBRA:
686 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
687 code += GET(code, 1);
688 while (*code == OP_ALT)
689 {
690 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
691 code += GET(code, 1);
692 }
693 break;
694
695 /*-----------------------------------------------------------------*/
696 case OP_BRAZERO:
697 case OP_BRAMINZERO:
698 ADD_ACTIVE(state_offset + 1, 0);
699 code += 1 + GET(code, 2);
700 while (*code == OP_ALT) code += GET(code, 1);
701 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
702 break;
703
704 /*-----------------------------------------------------------------*/
705 case OP_SKIPZERO:
706 code += 1 + GET(code, 2);
707 while (*code == OP_ALT) code += GET(code, 1);
708 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
709 break;
710
711 /*-----------------------------------------------------------------*/
712 case OP_CIRC:
713 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
714 ((ims & PCRE_MULTILINE) != 0 &&
715 ptr != end_subject &&
716 WAS_NEWLINE(ptr)))
717 { ADD_ACTIVE(state_offset + 1, 0); }
718 break;
719
720 /*-----------------------------------------------------------------*/
721 case OP_EOD:
722 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
723 break;
724
725 /*-----------------------------------------------------------------*/
726 case OP_OPT:
727 ims = code[1];
728 ADD_ACTIVE(state_offset + 2, 0);
729 break;
730
731 /*-----------------------------------------------------------------*/
732 case OP_SOD:
733 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
734 break;
735
736 /*-----------------------------------------------------------------*/
737 case OP_SOM:
738 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
739 break;
740
741
742 /* ========================================================================== */
743 /* These opcodes inspect the next subject character, and sometimes
744 the previous one as well, but do not have an argument. The variable
745 clen contains the length of the current character and is zero if we are
746 at the end of the subject. */
747
748 /*-----------------------------------------------------------------*/
749 case OP_ANY:
750 if (clen > 0 && !IS_NEWLINE(ptr))
751 { ADD_NEW(state_offset + 1, 0); }
752 break;
753
754 /*-----------------------------------------------------------------*/
755 case OP_ALLANY:
756 if (clen > 0)
757 { ADD_NEW(state_offset + 1, 0); }
758 break;
759
760 /*-----------------------------------------------------------------*/
761 case OP_EODN:
762 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
763 { ADD_ACTIVE(state_offset + 1, 0); }
764 break;
765
766 /*-----------------------------------------------------------------*/
767 case OP_DOLL:
768 if ((md->moptions & PCRE_NOTEOL) == 0)
769 {
770 if (clen == 0 ||
771 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
772 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
773 ))
774 { ADD_ACTIVE(state_offset + 1, 0); }
775 }
776 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
777 { ADD_ACTIVE(state_offset + 1, 0); }
778 break;
779
780 /*-----------------------------------------------------------------*/
781
782 case OP_DIGIT:
783 case OP_WHITESPACE:
784 case OP_WORDCHAR:
785 if (clen > 0 && c < 256 &&
786 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
787 { ADD_NEW(state_offset + 1, 0); }
788 break;
789
790 /*-----------------------------------------------------------------*/
791 case OP_NOT_DIGIT:
792 case OP_NOT_WHITESPACE:
793 case OP_NOT_WORDCHAR:
794 if (clen > 0 && (c >= 256 ||
795 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
796 { ADD_NEW(state_offset + 1, 0); }
797 break;
798
799 /*-----------------------------------------------------------------*/
800 case OP_WORD_BOUNDARY:
801 case OP_NOT_WORD_BOUNDARY:
802 {
803 int left_word, right_word;
804
805 if (ptr > start_subject)
806 {
807 const uschar *temp = ptr - 1;
808 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
809 #ifdef SUPPORT_UTF8
810 if (utf8) BACKCHAR(temp);
811 #endif
812 GETCHARTEST(d, temp);
813 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
814 }
815 else left_word = 0;
816
817 if (clen > 0)
818 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
819 else /* This is a fudge to ensure that if this is the */
820 { /* last item in the pattern, we don't count it as */
821 reached_end--; /* reached, thus disabling a partial match. */
822 right_word = 0;
823 }
824
825 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
826 { ADD_ACTIVE(state_offset + 1, 0); }
827 }
828 break;
829
830
831 /*-----------------------------------------------------------------*/
832 /* Check the next character by Unicode property. We will get here only
833 if the support is in the binary; otherwise a compile-time error occurs.
834 */
835
836 #ifdef SUPPORT_UCP
837 case OP_PROP:
838 case OP_NOTPROP:
839 if (clen > 0)
840 {
841 BOOL OK;
842 const ucd_record * prop = GET_UCD(c);
843 switch(code[1])
844 {
845 case PT_ANY:
846 OK = TRUE;
847 break;
848
849 case PT_LAMP:
850 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
851 break;
852
853 case PT_GC:
854 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
855 break;
856
857 case PT_PC:
858 OK = prop->chartype == code[2];
859 break;
860
861 case PT_SC:
862 OK = prop->script == code[2];
863 break;
864
865 /* Should never occur, but keep compilers from grumbling. */
866
867 default:
868 OK = codevalue != OP_PROP;
869 break;
870 }
871
872 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
873 }
874 break;
875 #endif
876
877
878
879 /* ========================================================================== */
880 /* These opcodes likewise inspect the subject character, but have an
881 argument that is not a data character. It is one of these opcodes:
882 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
883 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
884
885 case OP_TYPEPLUS:
886 case OP_TYPEMINPLUS:
887 case OP_TYPEPOSPLUS:
888 count = current_state->count; /* Already matched */
889 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
890 if (clen > 0)
891 {
892 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
893 (c < 256 &&
894 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
895 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
896 {
897 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
898 {
899 active_count--; /* Remove non-match possibility */
900 next_active_state--;
901 }
902 count++;
903 ADD_NEW(state_offset, count);
904 }
905 }
906 break;
907
908 /*-----------------------------------------------------------------*/
909 case OP_TYPEQUERY:
910 case OP_TYPEMINQUERY:
911 case OP_TYPEPOSQUERY:
912 ADD_ACTIVE(state_offset + 2, 0);
913 if (clen > 0)
914 {
915 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
916 (c < 256 &&
917 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
918 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
919 {
920 if (codevalue == OP_TYPEPOSQUERY)
921 {
922 active_count--; /* Remove non-match possibility */
923 next_active_state--;
924 }
925 ADD_NEW(state_offset + 2, 0);
926 }
927 }
928 break;
929
930 /*-----------------------------------------------------------------*/
931 case OP_TYPESTAR:
932 case OP_TYPEMINSTAR:
933 case OP_TYPEPOSSTAR:
934 ADD_ACTIVE(state_offset + 2, 0);
935 if (clen > 0)
936 {
937 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
938 (c < 256 &&
939 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
940 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
941 {
942 if (codevalue == OP_TYPEPOSSTAR)
943 {
944 active_count--; /* Remove non-match possibility */
945 next_active_state--;
946 }
947 ADD_NEW(state_offset, 0);
948 }
949 }
950 break;
951
952 /*-----------------------------------------------------------------*/
953 case OP_TYPEEXACT:
954 count = current_state->count; /* Number already matched */
955 if (clen > 0)
956 {
957 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
958 (c < 256 &&
959 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
960 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
961 {
962 if (++count >= GET2(code, 1))
963 { ADD_NEW(state_offset + 4, 0); }
964 else
965 { ADD_NEW(state_offset, count); }
966 }
967 }
968 break;
969
970 /*-----------------------------------------------------------------*/
971 case OP_TYPEUPTO:
972 case OP_TYPEMINUPTO:
973 case OP_TYPEPOSUPTO:
974 ADD_ACTIVE(state_offset + 4, 0);
975 count = current_state->count; /* Number already matched */
976 if (clen > 0)
977 {
978 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
979 (c < 256 &&
980 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
981 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
982 {
983 if (codevalue == OP_TYPEPOSUPTO)
984 {
985 active_count--; /* Remove non-match possibility */
986 next_active_state--;
987 }
988 if (++count >= GET2(code, 1))
989 { ADD_NEW(state_offset + 4, 0); }
990 else
991 { ADD_NEW(state_offset, count); }
992 }
993 }
994 break;
995
996 /* ========================================================================== */
997 /* These are virtual opcodes that are used when something like
998 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
999 argument. It keeps the code above fast for the other cases. The argument
1000 is in the d variable. */
1001
1002 #ifdef SUPPORT_UCP
1003 case OP_PROP_EXTRA + OP_TYPEPLUS:
1004 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1005 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1006 count = current_state->count; /* Already matched */
1007 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1008 if (clen > 0)
1009 {
1010 BOOL OK;
1011 const ucd_record * prop = GET_UCD(c);
1012 switch(code[2])
1013 {
1014 case PT_ANY:
1015 OK = TRUE;
1016 break;
1017
1018 case PT_LAMP:
1019 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1020 break;
1021
1022 case PT_GC:
1023 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1024 break;
1025
1026 case PT_PC:
1027 OK = prop->chartype == code[3];
1028 break;
1029
1030 case PT_SC:
1031 OK = prop->script == code[3];
1032 break;
1033
1034 /* Should never occur, but keep compilers from grumbling. */
1035
1036 default:
1037 OK = codevalue != OP_PROP;
1038 break;
1039 }
1040
1041 if (OK == (d == OP_PROP))
1042 {
1043 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1044 {
1045 active_count--; /* Remove non-match possibility */
1046 next_active_state--;
1047 }
1048 count++;
1049 ADD_NEW(state_offset, count);
1050 }
1051 }
1052 break;
1053
1054 /*-----------------------------------------------------------------*/
1055 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1056 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1057 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1058 count = current_state->count; /* Already matched */
1059 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1060 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1061 {
1062 const uschar *nptr = ptr + clen;
1063 int ncount = 0;
1064 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1065 {
1066 active_count--; /* Remove non-match possibility */
1067 next_active_state--;
1068 }
1069 while (nptr < end_subject)
1070 {
1071 int nd;
1072 int ndlen = 1;
1073 GETCHARLEN(nd, nptr, ndlen);
1074 if (UCD_CATEGORY(nd) != ucp_M) break;
1075 ncount++;
1076 nptr += ndlen;
1077 }
1078 count++;
1079 ADD_NEW_DATA(-state_offset, count, ncount);
1080 }
1081 break;
1082 #endif
1083
1084 /*-----------------------------------------------------------------*/
1085 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1086 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1087 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1088 count = current_state->count; /* Already matched */
1089 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1090 if (clen > 0)
1091 {
1092 int ncount = 0;
1093 switch (c)
1094 {
1095 case 0x000b:
1096 case 0x000c:
1097 case 0x0085:
1098 case 0x2028:
1099 case 0x2029:
1100 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1101 goto ANYNL01;
1102
1103 case 0x000d:
1104 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1105 /* Fall through */
1106
1107 ANYNL01:
1108 case 0x000a:
1109 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1110 {
1111 active_count--; /* Remove non-match possibility */
1112 next_active_state--;
1113 }
1114 count++;
1115 ADD_NEW_DATA(-state_offset, count, ncount);
1116 break;
1117
1118 default:
1119 break;
1120 }
1121 }
1122 break;
1123
1124 /*-----------------------------------------------------------------*/
1125 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1126 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1127 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1128 count = current_state->count; /* Already matched */
1129 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1130 if (clen > 0)
1131 {
1132 BOOL OK;
1133 switch (c)
1134 {
1135 case 0x000a:
1136 case 0x000b:
1137 case 0x000c:
1138 case 0x000d:
1139 case 0x0085:
1140 case 0x2028:
1141 case 0x2029:
1142 OK = TRUE;
1143 break;
1144
1145 default:
1146 OK = FALSE;
1147 break;
1148 }
1149
1150 if (OK == (d == OP_VSPACE))
1151 {
1152 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1153 {
1154 active_count--; /* Remove non-match possibility */
1155 next_active_state--;
1156 }
1157 count++;
1158 ADD_NEW_DATA(-state_offset, count, 0);
1159 }
1160 }
1161 break;
1162
1163 /*-----------------------------------------------------------------*/
1164 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1165 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1166 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1167 count = current_state->count; /* Already matched */
1168 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1169 if (clen > 0)
1170 {
1171 BOOL OK;
1172 switch (c)
1173 {
1174 case 0x09: /* HT */
1175 case 0x20: /* SPACE */
1176 case 0xa0: /* NBSP */
1177 case 0x1680: /* OGHAM SPACE MARK */
1178 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1179 case 0x2000: /* EN QUAD */
1180 case 0x2001: /* EM QUAD */
1181 case 0x2002: /* EN SPACE */
1182 case 0x2003: /* EM SPACE */
1183 case 0x2004: /* THREE-PER-EM SPACE */
1184 case 0x2005: /* FOUR-PER-EM SPACE */
1185 case 0x2006: /* SIX-PER-EM SPACE */
1186 case 0x2007: /* FIGURE SPACE */
1187 case 0x2008: /* PUNCTUATION SPACE */
1188 case 0x2009: /* THIN SPACE */
1189 case 0x200A: /* HAIR SPACE */
1190 case 0x202f: /* NARROW NO-BREAK SPACE */
1191 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1192 case 0x3000: /* IDEOGRAPHIC SPACE */
1193 OK = TRUE;
1194 break;
1195
1196 default:
1197 OK = FALSE;
1198 break;
1199 }
1200
1201 if (OK == (d == OP_HSPACE))
1202 {
1203 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1204 {
1205 active_count--; /* Remove non-match possibility */
1206 next_active_state--;
1207 }
1208 count++;
1209 ADD_NEW_DATA(-state_offset, count, 0);
1210 }
1211 }
1212 break;
1213
1214 /*-----------------------------------------------------------------*/
1215 #ifdef SUPPORT_UCP
1216 case OP_PROP_EXTRA + OP_TYPEQUERY:
1217 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1218 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1219 count = 4;
1220 goto QS1;
1221
1222 case OP_PROP_EXTRA + OP_TYPESTAR:
1223 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1224 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1225 count = 0;
1226
1227 QS1:
1228
1229 ADD_ACTIVE(state_offset + 4, 0);
1230 if (clen > 0)
1231 {
1232 BOOL OK;
1233 const ucd_record * prop = GET_UCD(c);
1234 switch(code[2])
1235 {
1236 case PT_ANY:
1237 OK = TRUE;
1238 break;
1239
1240 case PT_LAMP:
1241 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1242 break;
1243
1244 case PT_GC:
1245 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1246 break;
1247
1248 case PT_PC:
1249 OK = prop->chartype == code[3];
1250 break;
1251
1252 case PT_SC:
1253 OK = prop->script == code[3];
1254 break;
1255
1256 /* Should never occur, but keep compilers from grumbling. */
1257
1258 default:
1259 OK = codevalue != OP_PROP;
1260 break;
1261 }
1262
1263 if (OK == (d == OP_PROP))
1264 {
1265 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1266 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1267 {
1268 active_count--; /* Remove non-match possibility */
1269 next_active_state--;
1270 }
1271 ADD_NEW(state_offset + count, 0);
1272 }
1273 }
1274 break;
1275
1276 /*-----------------------------------------------------------------*/
1277 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1278 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1279 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1280 count = 2;
1281 goto QS2;
1282
1283 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1284 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1285 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1286 count = 0;
1287
1288 QS2:
1289
1290 ADD_ACTIVE(state_offset + 2, 0);
1291 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1292 {
1293 const uschar *nptr = ptr + clen;
1294 int ncount = 0;
1295 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1296 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1297 {
1298 active_count--; /* Remove non-match possibility */
1299 next_active_state--;
1300 }
1301 while (nptr < end_subject)
1302 {
1303 int nd;
1304 int ndlen = 1;
1305 GETCHARLEN(nd, nptr, ndlen);
1306 if (UCD_CATEGORY(nd) != ucp_M) break;
1307 ncount++;
1308 nptr += ndlen;
1309 }
1310 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1311 }
1312 break;
1313 #endif
1314
1315 /*-----------------------------------------------------------------*/
1316 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1317 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1318 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1319 count = 2;
1320 goto QS3;
1321
1322 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1323 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1324 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1325 count = 0;
1326
1327 QS3:
1328 ADD_ACTIVE(state_offset + 2, 0);
1329 if (clen > 0)
1330 {
1331 int ncount = 0;
1332 switch (c)
1333 {
1334 case 0x000b:
1335 case 0x000c:
1336 case 0x0085:
1337 case 0x2028:
1338 case 0x2029:
1339 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1340 goto ANYNL02;
1341
1342 case 0x000d:
1343 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1344 /* Fall through */
1345
1346 ANYNL02:
1347 case 0x000a:
1348 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1349 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1350 {
1351 active_count--; /* Remove non-match possibility */
1352 next_active_state--;
1353 }
1354 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1355 break;
1356
1357 default:
1358 break;
1359 }
1360 }
1361 break;
1362
1363 /*-----------------------------------------------------------------*/
1364 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1365 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1366 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1367 count = 2;
1368 goto QS4;
1369
1370 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1371 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1372 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1373 count = 0;
1374
1375 QS4:
1376 ADD_ACTIVE(state_offset + 2, 0);
1377 if (clen > 0)
1378 {
1379 BOOL OK;
1380 switch (c)
1381 {
1382 case 0x000a:
1383 case 0x000b:
1384 case 0x000c:
1385 case 0x000d:
1386 case 0x0085:
1387 case 0x2028:
1388 case 0x2029:
1389 OK = TRUE;
1390 break;
1391
1392 default:
1393 OK = FALSE;
1394 break;
1395 }
1396 if (OK == (d == OP_VSPACE))
1397 {
1398 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1399 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1400 {
1401 active_count--; /* Remove non-match possibility */
1402 next_active_state--;
1403 }
1404 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1405 }
1406 }
1407 break;
1408
1409 /*-----------------------------------------------------------------*/
1410 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1411 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1412 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1413 count = 2;
1414 goto QS5;
1415
1416 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1417 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1418 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1419 count = 0;
1420
1421 QS5:
1422 ADD_ACTIVE(state_offset + 2, 0);
1423 if (clen > 0)
1424 {
1425 BOOL OK;
1426 switch (c)
1427 {
1428 case 0x09: /* HT */
1429 case 0x20: /* SPACE */
1430 case 0xa0: /* NBSP */
1431 case 0x1680: /* OGHAM SPACE MARK */
1432 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1433 case 0x2000: /* EN QUAD */
1434 case 0x2001: /* EM QUAD */
1435 case 0x2002: /* EN SPACE */
1436 case 0x2003: /* EM SPACE */
1437 case 0x2004: /* THREE-PER-EM SPACE */
1438 case 0x2005: /* FOUR-PER-EM SPACE */
1439 case 0x2006: /* SIX-PER-EM SPACE */
1440 case 0x2007: /* FIGURE SPACE */
1441 case 0x2008: /* PUNCTUATION SPACE */
1442 case 0x2009: /* THIN SPACE */
1443 case 0x200A: /* HAIR SPACE */
1444 case 0x202f: /* NARROW NO-BREAK SPACE */
1445 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1446 case 0x3000: /* IDEOGRAPHIC SPACE */
1447 OK = TRUE;
1448 break;
1449
1450 default:
1451 OK = FALSE;
1452 break;
1453 }
1454
1455 if (OK == (d == OP_HSPACE))
1456 {
1457 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1458 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1459 {
1460 active_count--; /* Remove non-match possibility */
1461 next_active_state--;
1462 }
1463 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1464 }
1465 }
1466 break;
1467
1468 /*-----------------------------------------------------------------*/
1469 #ifdef SUPPORT_UCP
1470 case OP_PROP_EXTRA + OP_TYPEEXACT:
1471 case OP_PROP_EXTRA + OP_TYPEUPTO:
1472 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1473 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1474 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1475 { ADD_ACTIVE(state_offset + 6, 0); }
1476 count = current_state->count; /* Number already matched */
1477 if (clen > 0)
1478 {
1479 BOOL OK;
1480 const ucd_record * prop = GET_UCD(c);
1481 switch(code[4])
1482 {
1483 case PT_ANY:
1484 OK = TRUE;
1485 break;
1486
1487 case PT_LAMP:
1488 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1489 break;
1490
1491 case PT_GC:
1492 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1493 break;
1494
1495 case PT_PC:
1496 OK = prop->chartype == code[5];
1497 break;
1498
1499 case PT_SC:
1500 OK = prop->script == code[5];
1501 break;
1502
1503 /* Should never occur, but keep compilers from grumbling. */
1504
1505 default:
1506 OK = codevalue != OP_PROP;
1507 break;
1508 }
1509
1510 if (OK == (d == OP_PROP))
1511 {
1512 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1513 {
1514 active_count--; /* Remove non-match possibility */
1515 next_active_state--;
1516 }
1517 if (++count >= GET2(code, 1))
1518 { ADD_NEW(state_offset + 6, 0); }
1519 else
1520 { ADD_NEW(state_offset, count); }
1521 }
1522 }
1523 break;
1524
1525 /*-----------------------------------------------------------------*/
1526 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1527 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1528 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1529 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1530 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1531 { ADD_ACTIVE(state_offset + 4, 0); }
1532 count = current_state->count; /* Number already matched */
1533 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1534 {
1535 const uschar *nptr = ptr + clen;
1536 int ncount = 0;
1537 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1538 {
1539 active_count--; /* Remove non-match possibility */
1540 next_active_state--;
1541 }
1542 while (nptr < end_subject)
1543 {
1544 int nd;
1545 int ndlen = 1;
1546 GETCHARLEN(nd, nptr, ndlen);
1547 if (UCD_CATEGORY(nd) != ucp_M) break;
1548 ncount++;
1549 nptr += ndlen;
1550 }
1551 if (++count >= GET2(code, 1))
1552 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1553 else
1554 { ADD_NEW_DATA(-state_offset, count, ncount); }
1555 }
1556 break;
1557 #endif
1558
1559 /*-----------------------------------------------------------------*/
1560 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1561 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1562 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1563 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1564 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1565 { ADD_ACTIVE(state_offset + 4, 0); }
1566 count = current_state->count; /* Number already matched */
1567 if (clen > 0)
1568 {
1569 int ncount = 0;
1570 switch (c)
1571 {
1572 case 0x000b:
1573 case 0x000c:
1574 case 0x0085:
1575 case 0x2028:
1576 case 0x2029:
1577 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1578 goto ANYNL03;
1579
1580 case 0x000d:
1581 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1582 /* Fall through */
1583
1584 ANYNL03:
1585 case 0x000a:
1586 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1587 {
1588 active_count--; /* Remove non-match possibility */
1589 next_active_state--;
1590 }
1591 if (++count >= GET2(code, 1))
1592 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1593 else
1594 { ADD_NEW_DATA(-state_offset, count, ncount); }
1595 break;
1596
1597 default:
1598 break;
1599 }
1600 }
1601 break;
1602
1603 /*-----------------------------------------------------------------*/
1604 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1605 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1606 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1607 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1608 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1609 { ADD_ACTIVE(state_offset + 4, 0); }
1610 count = current_state->count; /* Number already matched */
1611 if (clen > 0)
1612 {
1613 BOOL OK;
1614 switch (c)
1615 {
1616 case 0x000a:
1617 case 0x000b:
1618 case 0x000c:
1619 case 0x000d:
1620 case 0x0085:
1621 case 0x2028:
1622 case 0x2029:
1623 OK = TRUE;
1624 break;
1625
1626 default:
1627 OK = FALSE;
1628 }
1629
1630 if (OK == (d == OP_VSPACE))
1631 {
1632 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1633 {
1634 active_count--; /* Remove non-match possibility */
1635 next_active_state--;
1636 }
1637 if (++count >= GET2(code, 1))
1638 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1639 else
1640 { ADD_NEW_DATA(-state_offset, count, 0); }
1641 }
1642 }
1643 break;
1644
1645 /*-----------------------------------------------------------------*/
1646 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1647 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1648 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1649 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1650 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1651 { ADD_ACTIVE(state_offset + 4, 0); }
1652 count = current_state->count; /* Number already matched */
1653 if (clen > 0)
1654 {
1655 BOOL OK;
1656 switch (c)
1657 {
1658 case 0x09: /* HT */
1659 case 0x20: /* SPACE */
1660 case 0xa0: /* NBSP */
1661 case 0x1680: /* OGHAM SPACE MARK */
1662 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1663 case 0x2000: /* EN QUAD */
1664 case 0x2001: /* EM QUAD */
1665 case 0x2002: /* EN SPACE */
1666 case 0x2003: /* EM SPACE */
1667 case 0x2004: /* THREE-PER-EM SPACE */
1668 case 0x2005: /* FOUR-PER-EM SPACE */
1669 case 0x2006: /* SIX-PER-EM SPACE */
1670 case 0x2007: /* FIGURE SPACE */
1671 case 0x2008: /* PUNCTUATION SPACE */
1672 case 0x2009: /* THIN SPACE */
1673 case 0x200A: /* HAIR SPACE */
1674 case 0x202f: /* NARROW NO-BREAK SPACE */
1675 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1676 case 0x3000: /* IDEOGRAPHIC SPACE */
1677 OK = TRUE;
1678 break;
1679
1680 default:
1681 OK = FALSE;
1682 break;
1683 }
1684
1685 if (OK == (d == OP_HSPACE))
1686 {
1687 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1688 {
1689 active_count--; /* Remove non-match possibility */
1690 next_active_state--;
1691 }
1692 if (++count >= GET2(code, 1))
1693 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1694 else
1695 { ADD_NEW_DATA(-state_offset, count, 0); }
1696 }
1697 }
1698 break;
1699
1700 /* ========================================================================== */
1701 /* These opcodes are followed by a character that is usually compared
1702 to the current subject character; it is loaded into d. We still get
1703 here even if there is no subject character, because in some cases zero
1704 repetitions are permitted. */
1705
1706 /*-----------------------------------------------------------------*/
1707 case OP_CHAR:
1708 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1709 break;
1710
1711 /*-----------------------------------------------------------------*/
1712 case OP_CHARNC:
1713 if (clen == 0) break;
1714
1715 #ifdef SUPPORT_UTF8
1716 if (utf8)
1717 {
1718 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1719 {
1720 unsigned int othercase;
1721 if (c < 128) othercase = fcc[c]; else
1722
1723 /* If we have Unicode property support, we can use it to test the
1724 other case of the character. */
1725
1726 #ifdef SUPPORT_UCP
1727 othercase = UCD_OTHERCASE(c);
1728 #else
1729 othercase = NOTACHAR;
1730 #endif
1731
1732 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1733 }
1734 }
1735 else
1736 #endif /* SUPPORT_UTF8 */
1737
1738 /* Non-UTF-8 mode */
1739 {
1740 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1741 }
1742 break;
1743
1744
1745 #ifdef SUPPORT_UCP
1746 /*-----------------------------------------------------------------*/
1747 /* This is a tricky one because it can match more than one character.
1748 Find out how many characters to skip, and then set up a negative state
1749 to wait for them to pass before continuing. */
1750
1751 case OP_EXTUNI:
1752 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1753 {
1754 const uschar *nptr = ptr + clen;
1755 int ncount = 0;
1756 while (nptr < end_subject)
1757 {
1758 int nclen = 1;
1759 GETCHARLEN(c, nptr, nclen);
1760 if (UCD_CATEGORY(c) != ucp_M) break;
1761 ncount++;
1762 nptr += nclen;
1763 }
1764 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1765 }
1766 break;
1767 #endif
1768
1769 /*-----------------------------------------------------------------*/
1770 /* This is tricky, like EXTUNI, because it too can match more than one
1771 character (when CR is followed by LF). In this case, set up a negative
1772 state to wait for one character to pass before continuing. */
1773
1774 case OP_ANYNL:
1775 if (clen > 0) switch(c)
1776 {
1777 case 0x000b:
1778 case 0x000c:
1779 case 0x0085:
1780 case 0x2028:
1781 case 0x2029:
1782 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1783
1784 case 0x000a:
1785 ADD_NEW(state_offset + 1, 0);
1786 break;
1787
1788 case 0x000d:
1789 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1790 {
1791 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1792 }
1793 else
1794 {
1795 ADD_NEW(state_offset + 1, 0);
1796 }
1797 break;
1798 }
1799 break;
1800
1801 /*-----------------------------------------------------------------*/
1802 case OP_NOT_VSPACE:
1803 if (clen > 0) switch(c)
1804 {
1805 case 0x000a:
1806 case 0x000b:
1807 case 0x000c:
1808 case 0x000d:
1809 case 0x0085:
1810 case 0x2028:
1811 case 0x2029:
1812 break;
1813
1814 default:
1815 ADD_NEW(state_offset + 1, 0);
1816 break;
1817 }
1818 break;
1819
1820 /*-----------------------------------------------------------------*/
1821 case OP_VSPACE:
1822 if (clen > 0) switch(c)
1823 {
1824 case 0x000a:
1825 case 0x000b:
1826 case 0x000c:
1827 case 0x000d:
1828 case 0x0085:
1829 case 0x2028:
1830 case 0x2029:
1831 ADD_NEW(state_offset + 1, 0);
1832 break;
1833
1834 default: break;
1835 }
1836 break;
1837
1838 /*-----------------------------------------------------------------*/
1839 case OP_NOT_HSPACE:
1840 if (clen > 0) switch(c)
1841 {
1842 case 0x09: /* HT */
1843 case 0x20: /* SPACE */
1844 case 0xa0: /* NBSP */
1845 case 0x1680: /* OGHAM SPACE MARK */
1846 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1847 case 0x2000: /* EN QUAD */
1848 case 0x2001: /* EM QUAD */
1849 case 0x2002: /* EN SPACE */
1850 case 0x2003: /* EM SPACE */
1851 case 0x2004: /* THREE-PER-EM SPACE */
1852 case 0x2005: /* FOUR-PER-EM SPACE */
1853 case 0x2006: /* SIX-PER-EM SPACE */
1854 case 0x2007: /* FIGURE SPACE */
1855 case 0x2008: /* PUNCTUATION SPACE */
1856 case 0x2009: /* THIN SPACE */
1857 case 0x200A: /* HAIR SPACE */
1858 case 0x202f: /* NARROW NO-BREAK SPACE */
1859 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1860 case 0x3000: /* IDEOGRAPHIC SPACE */
1861 break;
1862
1863 default:
1864 ADD_NEW(state_offset + 1, 0);
1865 break;
1866 }
1867 break;
1868
1869 /*-----------------------------------------------------------------*/
1870 case OP_HSPACE:
1871 if (clen > 0) switch(c)
1872 {
1873 case 0x09: /* HT */
1874 case 0x20: /* SPACE */
1875 case 0xa0: /* NBSP */
1876 case 0x1680: /* OGHAM SPACE MARK */
1877 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1878 case 0x2000: /* EN QUAD */
1879 case 0x2001: /* EM QUAD */
1880 case 0x2002: /* EN SPACE */
1881 case 0x2003: /* EM SPACE */
1882 case 0x2004: /* THREE-PER-EM SPACE */
1883 case 0x2005: /* FOUR-PER-EM SPACE */
1884 case 0x2006: /* SIX-PER-EM SPACE */
1885 case 0x2007: /* FIGURE SPACE */
1886 case 0x2008: /* PUNCTUATION SPACE */
1887 case 0x2009: /* THIN SPACE */
1888 case 0x200A: /* HAIR SPACE */
1889 case 0x202f: /* NARROW NO-BREAK SPACE */
1890 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1891 case 0x3000: /* IDEOGRAPHIC SPACE */
1892 ADD_NEW(state_offset + 1, 0);
1893 break;
1894 }
1895 break;
1896
1897 /*-----------------------------------------------------------------*/
1898 /* Match a negated single character. This is only used for one-byte
1899 characters, that is, we know that d < 256. The character we are
1900 checking (c) can be multibyte. */
1901
1902 case OP_NOT:
1903 if (clen > 0)
1904 {
1905 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1906 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1907 }
1908 break;
1909
1910 /*-----------------------------------------------------------------*/
1911 case OP_PLUS:
1912 case OP_MINPLUS:
1913 case OP_POSPLUS:
1914 case OP_NOTPLUS:
1915 case OP_NOTMINPLUS:
1916 case OP_NOTPOSPLUS:
1917 count = current_state->count; /* Already matched */
1918 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1919 if (clen > 0)
1920 {
1921 unsigned int otherd = NOTACHAR;
1922 if ((ims & PCRE_CASELESS) != 0)
1923 {
1924 #ifdef SUPPORT_UTF8
1925 if (utf8 && d >= 128)
1926 {
1927 #ifdef SUPPORT_UCP
1928 otherd = UCD_OTHERCASE(d);
1929 #endif /* SUPPORT_UCP */
1930 }
1931 else
1932 #endif /* SUPPORT_UTF8 */
1933 otherd = fcc[d];
1934 }
1935 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1936 {
1937 if (count > 0 &&
1938 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1939 {
1940 active_count--; /* Remove non-match possibility */
1941 next_active_state--;
1942 }
1943 count++;
1944 ADD_NEW(state_offset, count);
1945 }
1946 }
1947 break;
1948
1949 /*-----------------------------------------------------------------*/
1950 case OP_QUERY:
1951 case OP_MINQUERY:
1952 case OP_POSQUERY:
1953 case OP_NOTQUERY:
1954 case OP_NOTMINQUERY:
1955 case OP_NOTPOSQUERY:
1956 ADD_ACTIVE(state_offset + dlen + 1, 0);
1957 if (clen > 0)
1958 {
1959 unsigned int otherd = NOTACHAR;
1960 if ((ims & PCRE_CASELESS) != 0)
1961 {
1962 #ifdef SUPPORT_UTF8
1963 if (utf8 && d >= 128)
1964 {
1965 #ifdef SUPPORT_UCP
1966 otherd = UCD_OTHERCASE(d);
1967 #endif /* SUPPORT_UCP */
1968 }
1969 else
1970 #endif /* SUPPORT_UTF8 */
1971 otherd = fcc[d];
1972 }
1973 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1974 {
1975 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1976 {
1977 active_count--; /* Remove non-match possibility */
1978 next_active_state--;
1979 }
1980 ADD_NEW(state_offset + dlen + 1, 0);
1981 }
1982 }
1983 break;
1984
1985 /*-----------------------------------------------------------------*/
1986 case OP_STAR:
1987 case OP_MINSTAR:
1988 case OP_POSSTAR:
1989 case OP_NOTSTAR:
1990 case OP_NOTMINSTAR:
1991 case OP_NOTPOSSTAR:
1992 ADD_ACTIVE(state_offset + dlen + 1, 0);
1993 if (clen > 0)
1994 {
1995 unsigned int otherd = NOTACHAR;
1996 if ((ims & PCRE_CASELESS) != 0)
1997 {
1998 #ifdef SUPPORT_UTF8
1999 if (utf8 && d >= 128)
2000 {
2001 #ifdef SUPPORT_UCP
2002 otherd = UCD_OTHERCASE(d);
2003 #endif /* SUPPORT_UCP */
2004 }
2005 else
2006 #endif /* SUPPORT_UTF8 */
2007 otherd = fcc[d];
2008 }
2009 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2010 {
2011 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2012 {
2013 active_count--; /* Remove non-match possibility */
2014 next_active_state--;
2015 }
2016 ADD_NEW(state_offset, 0);
2017 }
2018 }
2019 break;
2020
2021 /*-----------------------------------------------------------------*/
2022 case OP_EXACT:
2023 case OP_NOTEXACT:
2024 count = current_state->count; /* Number already matched */
2025 if (clen > 0)
2026 {
2027 unsigned int otherd = NOTACHAR;
2028 if ((ims & PCRE_CASELESS) != 0)
2029 {
2030 #ifdef SUPPORT_UTF8
2031 if (utf8 && d >= 128)
2032 {
2033 #ifdef SUPPORT_UCP
2034 otherd = UCD_OTHERCASE(d);
2035 #endif /* SUPPORT_UCP */
2036 }
2037 else
2038 #endif /* SUPPORT_UTF8 */
2039 otherd = fcc[d];
2040 }
2041 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2042 {
2043 if (++count >= GET2(code, 1))
2044 { ADD_NEW(state_offset + dlen + 3, 0); }
2045 else
2046 { ADD_NEW(state_offset, count); }
2047 }
2048 }
2049 break;
2050
2051 /*-----------------------------------------------------------------*/
2052 case OP_UPTO:
2053 case OP_MINUPTO:
2054 case OP_POSUPTO:
2055 case OP_NOTUPTO:
2056 case OP_NOTMINUPTO:
2057 case OP_NOTPOSUPTO:
2058 ADD_ACTIVE(state_offset + dlen + 3, 0);
2059 count = current_state->count; /* Number already matched */
2060 if (clen > 0)
2061 {
2062 unsigned int otherd = NOTACHAR;
2063 if ((ims & PCRE_CASELESS) != 0)
2064 {
2065 #ifdef SUPPORT_UTF8
2066 if (utf8 && d >= 128)
2067 {
2068 #ifdef SUPPORT_UCP
2069 otherd = UCD_OTHERCASE(d);
2070 #endif /* SUPPORT_UCP */
2071 }
2072 else
2073 #endif /* SUPPORT_UTF8 */
2074 otherd = fcc[d];
2075 }
2076 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2077 {
2078 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2079 {
2080 active_count--; /* Remove non-match possibility */
2081 next_active_state--;
2082 }
2083 if (++count >= GET2(code, 1))
2084 { ADD_NEW(state_offset + dlen + 3, 0); }
2085 else
2086 { ADD_NEW(state_offset, count); }
2087 }
2088 }
2089 break;
2090
2091
2092 /* ========================================================================== */
2093 /* These are the class-handling opcodes */
2094
2095 case OP_CLASS:
2096 case OP_NCLASS:
2097 case OP_XCLASS:
2098 {
2099 BOOL isinclass = FALSE;
2100 int next_state_offset;
2101 const uschar *ecode;
2102
2103 /* For a simple class, there is always just a 32-byte table, and we
2104 can set isinclass from it. */
2105
2106 if (codevalue != OP_XCLASS)
2107 {
2108 ecode = code + 33;
2109 if (clen > 0)
2110 {
2111 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2112 ((code[1 + c/8] & (1 << (c&7))) != 0);
2113 }
2114 }
2115
2116 /* An extended class may have a table or a list of single characters,
2117 ranges, or both, and it may be positive or negative. There's a
2118 function that sorts all this out. */
2119
2120 else
2121 {
2122 ecode = code + GET(code, 1);
2123 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2124 }
2125
2126 /* At this point, isinclass is set for all kinds of class, and ecode
2127 points to the byte after the end of the class. If there is a
2128 quantifier, this is where it will be. */
2129
2130 next_state_offset = ecode - start_code;
2131
2132 switch (*ecode)
2133 {
2134 case OP_CRSTAR:
2135 case OP_CRMINSTAR:
2136 ADD_ACTIVE(next_state_offset + 1, 0);
2137 if (isinclass) { ADD_NEW(state_offset, 0); }
2138 break;
2139
2140 case OP_CRPLUS:
2141 case OP_CRMINPLUS:
2142 count = current_state->count; /* Already matched */
2143 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2144 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2145 break;
2146
2147 case OP_CRQUERY:
2148 case OP_CRMINQUERY:
2149 ADD_ACTIVE(next_state_offset + 1, 0);
2150 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2151 break;
2152
2153 case OP_CRRANGE:
2154 case OP_CRMINRANGE:
2155 count = current_state->count; /* Already matched */
2156 if (count >= GET2(ecode, 1))
2157 { ADD_ACTIVE(next_state_offset + 5, 0); }
2158 if (isinclass)
2159 {
2160 int max = GET2(ecode, 3);
2161 if (++count >= max && max != 0) /* Max 0 => no limit */
2162 { ADD_NEW(next_state_offset + 5, 0); }
2163 else
2164 { ADD_NEW(state_offset, count); }
2165 }
2166 break;
2167
2168 default:
2169 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2170 break;
2171 }
2172 }
2173 break;
2174
2175 /* ========================================================================== */
2176 /* These are the opcodes for fancy brackets of various kinds. We have
2177 to use recursion in order to handle them. The "always failing" assertion
2178 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2179 though the other "backtracking verbs" are not supported. */
2180
2181 case OP_FAIL:
2182 forced_fail++; /* Count FAILs for multiple states */
2183 break;
2184
2185 case OP_ASSERT:
2186 case OP_ASSERT_NOT:
2187 case OP_ASSERTBACK:
2188 case OP_ASSERTBACK_NOT:
2189 {
2190 int rc;
2191 int local_offsets[2];
2192 int local_workspace[1000];
2193 const uschar *endasscode = code + GET(code, 1);
2194
2195 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2196
2197 rc = internal_dfa_exec(
2198 md, /* static match data */
2199 code, /* this subexpression's code */
2200 ptr, /* where we currently are */
2201 ptr - start_subject, /* start offset */
2202 local_offsets, /* offset vector */
2203 sizeof(local_offsets)/sizeof(int), /* size of same */
2204 local_workspace, /* workspace vector */
2205 sizeof(local_workspace)/sizeof(int), /* size of same */
2206 ims, /* the current ims flags */
2207 rlevel, /* function recursion level */
2208 recursing); /* pass on regex recursion */
2209
2210 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2211 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2212 }
2213 break;
2214
2215 /*-----------------------------------------------------------------*/
2216 case OP_COND:
2217 case OP_SCOND:
2218 {
2219 int local_offsets[1000];
2220 int local_workspace[1000];
2221 int codelink = GET(code, 1);
2222 int condcode;
2223
2224 /* Because of the way auto-callout works during compile, a callout item
2225 is inserted between OP_COND and an assertion condition. This does not
2226 happen for the other conditions. */
2227
2228 if (code[LINK_SIZE+1] == OP_CALLOUT)
2229 {
2230 rrc = 0;
2231 if (pcre_callout != NULL)
2232 {
2233 pcre_callout_block cb;
2234 cb.version = 1; /* Version 1 of the callout block */
2235 cb.callout_number = code[LINK_SIZE+2];
2236 cb.offset_vector = offsets;
2237 cb.subject = (PCRE_SPTR)start_subject;
2238 cb.subject_length = end_subject - start_subject;
2239 cb.start_match = current_subject - start_subject;
2240 cb.current_position = ptr - start_subject;
2241 cb.pattern_position = GET(code, LINK_SIZE + 3);
2242 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2243 cb.capture_top = 1;
2244 cb.capture_last = -1;
2245 cb.callout_data = md->callout_data;
2246 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2247 }
2248 if (rrc > 0) break; /* Fail this thread */
2249 code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2250 }
2251
2252 condcode = code[LINK_SIZE+1];
2253
2254 /* Back reference conditions are not supported */
2255
2256 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2257
2258 /* The DEFINE condition is always false */
2259
2260 if (condcode == OP_DEF)
2261 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2262
2263 /* The only supported version of OP_RREF is for the value RREF_ANY,
2264 which means "test if in any recursion". We can't test for specifically
2265 recursed groups. */
2266
2267 else if (condcode == OP_RREF)
2268 {
2269 int value = GET2(code, LINK_SIZE+2);
2270 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2271 if (recursing > 0)
2272 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2273 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2274 }
2275
2276 /* Otherwise, the condition is an assertion */
2277
2278 else
2279 {
2280 int rc;
2281 const uschar *asscode = code + LINK_SIZE + 1;
2282 const uschar *endasscode = asscode + GET(asscode, 1);
2283
2284 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2285
2286 rc = internal_dfa_exec(
2287 md, /* fixed match data */
2288 asscode, /* this subexpression's code */
2289 ptr, /* where we currently are */
2290 ptr - start_subject, /* start offset */
2291 local_offsets, /* offset vector */
2292 sizeof(local_offsets)/sizeof(int), /* size of same */
2293 local_workspace, /* workspace vector */
2294 sizeof(local_workspace)/sizeof(int), /* size of same */
2295 ims, /* the current ims flags */
2296 rlevel, /* function recursion level */
2297 recursing); /* pass on regex recursion */
2298
2299 if ((rc >= 0) ==
2300 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2301 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2302 else
2303 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2304 }
2305 }
2306 break;
2307
2308 /*-----------------------------------------------------------------*/
2309 case OP_RECURSE:
2310 {
2311 int local_offsets[1000];
2312 int local_workspace[1000];
2313 int rc;
2314
2315 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2316 recursing + 1));
2317
2318 rc = internal_dfa_exec(
2319 md, /* fixed match data */
2320 start_code + GET(code, 1), /* this subexpression's code */
2321 ptr, /* where we currently are */
2322 ptr - start_subject, /* start offset */
2323 local_offsets, /* offset vector */
2324 sizeof(local_offsets)/sizeof(int), /* size of same */
2325 local_workspace, /* workspace vector */
2326 sizeof(local_workspace)/sizeof(int), /* size of same */
2327 ims, /* the current ims flags */
2328 rlevel, /* function recursion level */
2329 recursing + 1); /* regex recurse level */
2330
2331 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2332 recursing + 1, rc));
2333
2334 /* Ran out of internal offsets */
2335
2336 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2337
2338 /* For each successful matched substring, set up the next state with a
2339 count of characters to skip before trying it. Note that the count is in
2340 characters, not bytes. */
2341
2342 if (rc > 0)
2343 {
2344 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2345 {
2346 const uschar *p = start_subject + local_offsets[rc];
2347 const uschar *pp = start_subject + local_offsets[rc+1];
2348 int charcount = local_offsets[rc+1] - local_offsets[rc];
2349 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2350 if (charcount > 0)
2351 {
2352 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2353 }
2354 else
2355 {
2356 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2357 }
2358 }
2359 }
2360 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2361 }
2362 break;
2363
2364 /*-----------------------------------------------------------------*/
2365 case OP_ONCE:
2366 {
2367 int local_offsets[2];
2368 int local_workspace[1000];
2369
2370 int rc = internal_dfa_exec(
2371 md, /* fixed match data */
2372 code, /* this subexpression's code */
2373 ptr, /* where we currently are */
2374 ptr - start_subject, /* start offset */
2375 local_offsets, /* offset vector */
2376 sizeof(local_offsets)/sizeof(int), /* size of same */
2377 local_workspace, /* workspace vector */
2378 sizeof(local_workspace)/sizeof(int), /* size of same */
2379 ims, /* the current ims flags */
2380 rlevel, /* function recursion level */
2381 recursing); /* pass on regex recursion */
2382
2383 if (rc >= 0)
2384 {
2385 const uschar *end_subpattern = code;
2386 int charcount = local_offsets[1] - local_offsets[0];
2387 int next_state_offset, repeat_state_offset;
2388
2389 do { end_subpattern += GET(end_subpattern, 1); }
2390 while (*end_subpattern == OP_ALT);
2391 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2392
2393 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2394 arrange for the repeat state also to be added to the relevant list.
2395 Calculate the offset, or set -1 for no repeat. */
2396
2397 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2398 *end_subpattern == OP_KETRMIN)?
2399 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2400
2401 /* If we have matched an empty string, add the next state at the
2402 current character pointer. This is important so that the duplicate
2403 checking kicks in, which is what breaks infinite loops that match an
2404 empty string. */
2405
2406 if (charcount == 0)
2407 {
2408 ADD_ACTIVE(next_state_offset, 0);
2409 }
2410
2411 /* Optimization: if there are no more active states, and there
2412 are no new states yet set up, then skip over the subject string
2413 right here, to save looping. Otherwise, set up the new state to swing
2414 into action when the end of the substring is reached. */
2415
2416 else if (i + 1 >= active_count && new_count == 0)
2417 {
2418 ptr += charcount;
2419 clen = 0;
2420 ADD_NEW(next_state_offset, 0);
2421
2422 /* If we are adding a repeat state at the new character position,
2423 we must fudge things so that it is the only current state.
2424 Otherwise, it might be a duplicate of one we processed before, and
2425 that would cause it to be skipped. */
2426
2427 if (repeat_state_offset >= 0)
2428 {
2429 next_active_state = active_states;
2430 active_count = 0;
2431 i = -1;
2432 ADD_ACTIVE(repeat_state_offset, 0);
2433 }
2434 }
2435 else
2436 {
2437 const uschar *p = start_subject + local_offsets[0];
2438 const uschar *pp = start_subject + local_offsets[1];
2439 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2440 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2441 if (repeat_state_offset >= 0)
2442 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2443 }
2444
2445 }
2446 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2447 }
2448 break;
2449
2450
2451 /* ========================================================================== */
2452 /* Handle callouts */
2453
2454 case OP_CALLOUT:
2455 rrc = 0;
2456 if (pcre_callout != NULL)
2457 {
2458 pcre_callout_block cb;
2459 cb.version = 1; /* Version 1 of the callout block */
2460 cb.callout_number = code[1];
2461 cb.offset_vector = offsets;
2462 cb.subject = (PCRE_SPTR)start_subject;
2463 cb.subject_length = end_subject - start_subject;
2464 cb.start_match = current_subject - start_subject;
2465 cb.current_position = ptr - start_subject;
2466 cb.pattern_position = GET(code, 2);
2467 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2468 cb.capture_top = 1;
2469 cb.capture_last = -1;
2470 cb.callout_data = md->callout_data;
2471 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2472 }
2473 if (rrc == 0)
2474 { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2475 break;
2476
2477
2478 /* ========================================================================== */
2479 default: /* Unsupported opcode */
2480 return PCRE_ERROR_DFA_UITEM;
2481 }
2482
2483 NEXT_ACTIVE_STATE: continue;
2484
2485 } /* End of loop scanning active states */
2486
2487 /* We have finished the processing at the current subject character. If no
2488 new states have been set for the next character, we have found all the
2489 matches that we are going to find. If we are at the top level and partial
2490 matching has been requested, check for appropriate conditions. The "forced_
2491 fail" variable counts the number of (*F) encountered for the character. If it
2492 is equal to the original active_count (saved in workspace[1]) it means that
2493 (*F) was found on every active state. In this case we don't want to give a
2494 partial match. */
2495
2496 if (new_count <= 0)
2497 {
2498 if (rlevel == 1 && /* Top level, and */
2499 reached_end != workspace[1] && /* Not all reached end */
2500 forced_fail != workspace[1] && /* Not all forced fail & */
2501 ( /* either... */
2502 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2503 || /* or... */
2504 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2505 match_count < 0) /* no matches */
2506 ) && /* And... */
2507 ptr >= end_subject && /* Reached end of subject */
2508 ptr > current_subject) /* Matched non-empty string */
2509 {
2510 if (offsetcount >= 2)
2511 {
2512 offsets[0] = md->start_used_ptr - start_subject;
2513 offsets[1] = end_subject - start_subject;
2514 }
2515 match_count = PCRE_ERROR_PARTIAL;
2516 }
2517
2518 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2519 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2520 rlevel*2-2, SP));
2521 break; /* In effect, "return", but see the comment below */
2522 }
2523
2524 /* One or more states are active for the next character. */
2525
2526 ptr += clen; /* Advance to next subject character */
2527 } /* Loop to move along the subject string */
2528
2529 /* Control gets here from "break" a few lines above. We do it this way because
2530 if we use "return" above, we have compiler trouble. Some compilers warn if
2531 there's nothing here because they think the function doesn't return a value. On
2532 the other hand, if we put a dummy statement here, some more clever compilers
2533 complain that it can't be reached. Sigh. */
2534
2535 return match_count;
2536 }
2537
2538
2539
2540
2541 /*************************************************
2542 * Execute a Regular Expression - DFA engine *
2543 *************************************************/
2544
2545 /* This external function applies a compiled re to a subject string using a DFA
2546 engine. This function calls the internal function multiple times if the pattern
2547 is not anchored.
2548
2549 Arguments:
2550 argument_re points to the compiled expression
2551 extra_data points to extra data or is NULL
2552 subject points to the subject string
2553 length length of subject string (may contain binary zeros)
2554 start_offset where to start in the subject string
2555 options option bits
2556 offsets vector of match offsets
2557 offsetcount size of same
2558 workspace workspace vector
2559 wscount size of same
2560
2561 Returns: > 0 => number of match offset pairs placed in offsets
2562 = 0 => offsets overflowed; longest matches are present
2563 -1 => failed to match
2564 < -1 => some kind of unexpected problem
2565 */
2566
2567 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2568 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2569 const char *subject, int length, int start_offset, int options, int *offsets,
2570 int offsetcount, int *workspace, int wscount)
2571 {
2572 real_pcre *re = (real_pcre *)argument_re;
2573 dfa_match_data match_block;
2574 dfa_match_data *md = &match_block;
2575 BOOL utf8, anchored, startline, firstline;
2576 const uschar *current_subject, *end_subject, *lcc;
2577
2578 pcre_study_data internal_study;
2579 const pcre_study_data *study = NULL;
2580 real_pcre internal_re;
2581
2582 const uschar *req_byte_ptr;
2583 const uschar *start_bits = NULL;
2584 BOOL first_byte_caseless = FALSE;
2585 BOOL req_byte_caseless = FALSE;
2586 int first_byte = -1;
2587 int req_byte = -1;
2588 int req_byte2 = -1;
2589 int newline;
2590
2591 /* Plausibility checks */
2592
2593 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2594 if (re == NULL || subject == NULL || workspace == NULL ||
2595 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2596 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2597 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2598
2599 /* We need to find the pointer to any study data before we test for byte
2600 flipping, so we scan the extra_data block first. This may set two fields in the
2601 match block, so we must initialize them beforehand. However, the other fields
2602 in the match block must not be set until after the byte flipping. */
2603
2604 md->tables = re->tables;
2605 md->callout_data = NULL;
2606
2607 if (extra_data != NULL)
2608 {
2609 unsigned int flags = extra_data->flags;
2610 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2611 study = (const pcre_study_data *)extra_data->study_data;
2612 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2613 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2614 return PCRE_ERROR_DFA_UMLIMIT;
2615 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2616 md->callout_data = extra_data->callout_data;
2617 if ((flags & PCRE_EXTRA_TABLES) != 0)
2618 md->tables = extra_data->tables;
2619 }
2620
2621 /* Check that the first field in the block is the magic number. If it is not,
2622 test for a regex that was compiled on a host of opposite endianness. If this is
2623 the case, flipped values are put in internal_re and internal_study if there was
2624 study data too. */
2625
2626 if (re->magic_number != MAGIC_NUMBER)
2627 {
2628 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2629 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2630 if (study != NULL) study = &internal_study;
2631 }
2632
2633 /* Set some local values */
2634
2635 current_subject = (const unsigned char *)subject + start_offset;
2636 end_subject = (const unsigned char *)subject + length;
2637 req_byte_ptr = current_subject - 1;
2638
2639 #ifdef SUPPORT_UTF8
2640 utf8 = (re->options & PCRE_UTF8) != 0;
2641 #else
2642 utf8 = FALSE;
2643 #endif
2644
2645 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2646 (re->options & PCRE_ANCHORED) != 0;
2647
2648 /* The remaining fixed data for passing around. */
2649
2650 md->start_code = (const uschar *)argument_re +
2651 re->name_table_offset + re->name_count * re->name_entry_size;
2652 md->start_subject = (const unsigned char *)subject;
2653 md->end_subject = end_subject;
2654 md->moptions = options;
2655 md->poptions = re->options;
2656
2657 /* If the BSR option is not set at match time, copy what was set
2658 at compile time. */
2659
2660 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2661 {
2662 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2663 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2664 #ifdef BSR_ANYCRLF
2665 else md->moptions |= PCRE_BSR_ANYCRLF;
2666 #endif
2667 }
2668
2669 /* Handle different types of newline. The three bits give eight cases. If
2670 nothing is set at run time, whatever was used at compile time applies. */
2671
2672 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2673 PCRE_NEWLINE_BITS)
2674 {
2675 case 0: newline = NEWLINE; break; /* Compile-time default */
2676 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2677 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2678 case PCRE_NEWLINE_CR+
2679 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2680 case PCRE_NEWLINE_ANY: newline = -1; break;
2681 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2682 default: return PCRE_ERROR_BADNEWLINE;
2683 }
2684
2685 if (newline == -2)
2686 {
2687 md->nltype = NLTYPE_ANYCRLF;
2688 }
2689 else if (newline < 0)
2690 {
2691 md->nltype = NLTYPE_ANY;
2692 }
2693 else
2694 {
2695 md->nltype = NLTYPE_FIXED;
2696 if (newline > 255)
2697 {
2698 md->nllen = 2;
2699 md->nl[0] = (newline >> 8) & 255;
2700 md->nl[1] = newline & 255;
2701 }
2702 else
2703 {
2704 md->nllen = 1;
2705 md->nl[0] = newline;
2706 }
2707 }
2708
2709 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2710 back the character offset. */
2711
2712 #ifdef SUPPORT_UTF8
2713 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2714 {
2715 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2716 return PCRE_ERROR_BADUTF8;
2717 if (start_offset > 0 && start_offset < length)
2718 {
2719 int tb = ((uschar *)subject)[start_offset];
2720 if (tb > 127)
2721 {
2722 tb &= 0xc0;
2723 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2724 }
2725 }
2726 }
2727 #endif
2728
2729 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2730 is a feature that makes it possible to save compiled regex and re-use them
2731 in other programs later. */
2732
2733 if (md->tables == NULL) md->tables = _pcre_default_tables;
2734
2735 /* The lower casing table and the "must be at the start of a line" flag are
2736 used in a loop when finding where to start. */
2737
2738 lcc = md->tables + lcc_offset;
2739 startline = (re->flags & PCRE_STARTLINE) != 0;
2740 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2741
2742 /* Set up the first character to match, if available. The first_byte value is
2743 never set for an anchored regular expression, but the anchoring may be forced
2744 at run time, so we have to test for anchoring. The first char may be unset for
2745 an unanchored pattern, of course. If there's no first char and the pattern was
2746 studied, there may be a bitmap of possible first characters. */
2747
2748 if (!anchored)
2749 {
2750 if ((re->flags & PCRE_FIRSTSET) != 0)
2751 {
2752 first_byte = re->first_byte & 255;
2753 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2754 first_byte = lcc[first_byte];
2755 }
2756 else
2757 {
2758 if (startline && study != NULL &&
2759 (study->options & PCRE_STUDY_MAPPED) != 0)
2760 start_bits = study->start_bits;
2761 }
2762 }
2763
2764 /* For anchored or unanchored matches, there may be a "last known required
2765 character" set. */
2766
2767 if ((re->flags & PCRE_REQCHSET) != 0)
2768 {
2769 req_byte = re->req_byte & 255;
2770 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2771 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2772 }
2773
2774 /* Call the main matching function, looping for a non-anchored regex after a
2775 failed match. If not restarting, perform certain optimizations at the start of
2776 a match. */
2777
2778 for (;;)
2779 {
2780 int rc;
2781
2782 if ((options & PCRE_DFA_RESTART) == 0)
2783 {
2784 const uschar *save_end_subject = end_subject;
2785
2786 /* If firstline is TRUE, the start of the match is constrained to the first
2787 line of a multiline string. Implement this by temporarily adjusting
2788 end_subject so that we stop scanning at a newline. If the match fails at
2789 the newline, later code breaks this loop. */
2790
2791 if (firstline)
2792 {
2793 USPTR t = current_subject;
2794 #ifdef SUPPORT_UTF8
2795 if (utf8)
2796 {
2797 while (t < md->end_subject && !IS_NEWLINE(t))
2798 {
2799 t++;
2800 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2801 }
2802 }
2803 else
2804 #endif
2805 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2806 end_subject = t;
2807 }
2808
2809 /* There are some optimizations that avoid running the match if a known
2810 starting point is not found, or if a known later character is not present.
2811 However, there is an option that disables these, for testing and for
2812 ensuring that all callouts do actually occur. */
2813
2814 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2815 {
2816
2817 /* Advance to a known first byte. */
2818
2819 if (first_byte >= 0)
2820 {
2821 if (first_byte_caseless)
2822 while (current_subject < end_subject &&
2823 lcc[*current_subject] != first_byte)
2824 current_subject++;
2825 else
2826 while (current_subject < end_subject &&
2827 *current_subject != first_byte)
2828 current_subject++;
2829 }
2830
2831 /* Or to just after a linebreak for a multiline match if possible */
2832
2833 else if (startline)
2834 {
2835 if (current_subject > md->start_subject + start_offset)
2836 {
2837 #ifdef SUPPORT_UTF8
2838 if (utf8)
2839 {
2840 while (current_subject < end_subject &&
2841 !WAS_NEWLINE(current_subject))
2842 {
2843 current_subject++;
2844 while(current_subject < end_subject &&
2845 (*current_subject & 0xc0) == 0x80)
2846 current_subject++;
2847 }
2848 }
2849 else
2850 #endif
2851 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2852 current_subject++;
2853
2854 /* If we have just passed a CR and the newline option is ANY or
2855 ANYCRLF, and we are now at a LF, advance the match position by one
2856 more character. */
2857
2858 if (current_subject[-1] == CHAR_CR &&
2859 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2860 current_subject < end_subject &&
2861 *current_subject == CHAR_NL)
2862 current_subject++;
2863 }
2864 }
2865
2866 /* Or to a non-unique first char after study */
2867
2868 else if (start_bits != NULL)
2869 {
2870 while (current_subject < end_subject)
2871 {
2872 register unsigned int c = *current_subject;
2873 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2874 else break;
2875 }
2876 }
2877 }
2878
2879 /* Restore fudged end_subject */
2880
2881 end_subject = save_end_subject;
2882 }
2883
2884 /* If req_byte is set, we know that that character must appear in the subject
2885 for the match to succeed. If the first character is set, req_byte must be
2886 later in the subject; otherwise the test starts at the match point. This
2887 optimization can save a huge amount of work in patterns with nested unlimited
2888 repeats that aren't going to match. Writing separate code for cased/caseless
2889 versions makes it go faster, as does using an autoincrement and backing off
2890 on a match.
2891
2892 HOWEVER: when the subject string is very, very long, searching to its end can
2893 take a long time, and give bad performance on quite ordinary patterns. This
2894 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2895 don't do this when the string is sufficiently long.
2896
2897 ALSO: this processing is disabled when partial matching is requested, and can
2898 also be explicitly deactivated. Furthermore, we have to disable when
2899 restarting after a partial match, because the required character may have
2900 already been matched. */
2901
2902 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2903 req_byte >= 0 &&
2904 end_subject - current_subject < REQ_BYTE_MAX &&
2905 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT|PCRE_DFA_RESTART)) == 0)
2906 {
2907 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2908
2909 /* We don't need to repeat the search if we haven't yet reached the
2910 place we found it at last time. */
2911
2912 if (p > req_byte_ptr)
2913 {
2914 if (req_byte_caseless)
2915 {
2916 while (p < end_subject)
2917 {
2918 register int pp = *p++;
2919 if (pp == req_byte || pp == req_byte2) { p--; break; }
2920 }
2921 }
2922 else
2923 {
2924 while (p < end_subject)
2925 {
2926 if (*p++ == req_byte) { p--; break; }
2927 }
2928 }
2929
2930 /* If we can't find the required character, break the matching loop,
2931 which will cause a return or PCRE_ERROR_NOMATCH. */
2932
2933 if (p >= end_subject) break;
2934
2935 /* If we have found the required character, save the point where we
2936 found it, so that we don't search again next time round the loop if
2937 the start hasn't passed this character yet. */
2938
2939 req_byte_ptr = p;
2940 }
2941 }
2942
2943 /* OK, now we can do the business */
2944
2945 md->start_used_ptr = current_subject;
2946
2947 rc = internal_dfa_exec(
2948 md, /* fixed match data */
2949 md->start_code, /* this subexpression's code */
2950 current_subject, /* where we currently are */
2951 start_offset, /* start offset in subject */
2952 offsets, /* offset vector */
2953 offsetcount, /* size of same */
2954 workspace, /* workspace vector */
2955 wscount, /* size of same */
2956 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2957 0, /* function recurse level */
2958 0); /* regex recurse level */
2959
2960 /* Anything other than "no match" means we are done, always; otherwise, carry
2961 on only if not anchored. */
2962
2963 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2964
2965 /* Advance to the next subject character unless we are at the end of a line
2966 and firstline is set. */
2967
2968 if (firstline && IS_NEWLINE(current_subject)) break;
2969 current_subject++;
2970 if (utf8)
2971 {
2972 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2973 current_subject++;
2974 }
2975 if (current_subject > end_subject) break;
2976
2977 /* If we have just passed a CR and we are now at a LF, and the pattern does
2978 not contain any explicit matches for \r or \n, and the newline option is CRLF
2979 or ANY or ANYCRLF, advance the match position by one more character. */
2980
2981 if (current_subject[-1] == CHAR_CR &&
2982 current_subject < end_subject &&
2983 *current_subject == CHAR_NL &&
2984 (re->flags & PCRE_HASCRORLF) == 0 &&
2985 (md->nltype == NLTYPE_ANY ||
2986 md->nltype == NLTYPE_ANYCRLF ||
2987 md->nllen == 2))
2988 current_subject++;
2989
2990 } /* "Bumpalong" loop */
2991
2992 return PCRE_ERROR_NOMATCH;
2993 }
2994
2995 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12