/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 345 - (show annotations) (download)
Mon Apr 28 15:10:02 2008 UTC (6 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 95544 byte(s)
Tidies for the 7.7-RC1 distribution.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2008 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 #ifdef HAVE_CONFIG_H
48 #include "config.h"
49 #endif
50
51 #define NLBLOCK md /* Block containing newline information */
52 #define PSSTART start_subject /* Field containing processed string start */
53 #define PSEND end_subject /* Field containing processed string end */
54
55 #include "pcre_internal.h"
56
57
58 /* For use to indent debugging output: printed with "%.*s" and a precision of rlevel*2-2. NOTE(review): only one space is visible in the literal below, which caps the indent at one column - confirm against the upstream file. */
59
60 #define SP " "
61
62
63
64 /*************************************************
65 * Code parameters and static tables *
66 *************************************************/
67
68 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 into others, under special conditions. A gap of 20 between the blocks should be
70 enough. The resulting opcodes don't have to be less than 256 because they are
71 never stored, so we push them well clear of the normal opcodes. */
72
73 #define OP_PROP_EXTRA 300 /* added to codevalue when the argument is OP_PROP or OP_NOTPROP */
74 #define OP_EXTUNI_EXTRA 320 /* added to codevalue when the argument is OP_EXTUNI */
75 #define OP_ANYNL_EXTRA 340 /* added to codevalue when the argument is OP_ANYNL */
76 #define OP_HSPACE_EXTRA 360 /* added to codevalue when the argument is OP_HSPACE or OP_NOT_HSPACE */
77 #define OP_VSPACE_EXTRA 380 /* added to codevalue when the argument is OP_VSPACE or OP_NOT_VSPACE */
78
79
80 /* This table identifies those opcodes that are followed immediately by a
81 character that is to be tested in some way. This makes it possible to
82 centralize the loading of these characters. In the case of Type * etc, the
83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 small value. ***NOTE*** If the start of this table is modified, the two tables
85 that follow must also be modified. */
86
87 static const uschar coptable[] = { /* indexed by opcode: 0 = nothing to load, otherwise the offset from the opcode to the character */
88 0, /* End */
89 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 0, 0, 0, /* Any, AllAny, Anybyte */
92 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95 1, /* Char */
96 1, /* Charnc */
97 1, /* not */
98 /* Positive single-char repeats */
99 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* upto, minupto, exact */
101 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 /* Negative single-char repeats - only for chars < 256 */
103 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104 3, 3, 3, /* NOT upto, minupto, exact */
105 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
106 /* Positive type repeats */
107 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108 3, 3, 3, /* Type upto, minupto, exact */
109 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 /* Character class & ref repeats */
111 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112 0, 0, /* CRRANGE, CRMINRANGE */
113 0, /* CLASS */
114 0, /* NCLASS */
115 0, /* XCLASS - variable length */
116 0, /* REF */
117 0, /* RECURSE */
118 0, /* CALLOUT */
119 0, /* Alt */
120 0, /* Ket */
121 0, /* KetRmax */
122 0, /* KetRmin */
123 0, /* Assert */
124 0, /* Assert not */
125 0, /* Assert behind */
126 0, /* Assert behind not */
127 0, /* Reverse */
128 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129 0, 0, 0, /* SBRA, SCBRA, SCOND */
130 0, /* CREF */
131 0, /* RREF */
132 0, /* DEF */
133 0, 0, /* BRAZERO, BRAMINZERO */
134 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136 };
137
138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139 and \w */
140
141 static const uschar toptable1[] = { /* indexed by opcode: ctypes bit that is ANDed with the character's ctypes entry */
142 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
143 ctype_digit, ctype_digit, /* \D, \d */
144 ctype_space, ctype_space, /* \S, \s */
145 ctype_word, ctype_word, /* \W, \w */
146 0, 0 /* OP_ANY, OP_ALLANY */
147 };
148
149 static const uschar toptable2[] = { /* indexed by opcode: XORed with the masked ctypes bits; a non-zero result means the type matches */
150 0, 0, 0, 0, 0, 0, /* End, \A, \G, \K, \B, \b */
151 ctype_digit, 0, /* \D matches when the digit bit is clear, \d when it is set */
152 ctype_space, 0, /* \S, \s - same scheme with the space bit */
153 ctype_word, 0, /* \W, \w - same scheme with the word bit */
154 1, 1 /* OP_ANY, OP_ALLANY - the mask is 0, so the 1 makes the test always succeed */
155 };
156
157
158 /* Structure for holding data about a particular state, which is in effect the
159 current data for an active path through the match tree. It must consist
160 entirely of ints because the working vector we are passed, and which we put
161 these structures in, is a vector of ints. */
162
163 typedef struct stateblock {
164 int offset; /* Offset to opcode from start_code; a negative value marks a held-off state (see data) */
165 int count; /* Count for repeats */
166 int ims; /* ims flag bits in force for this state */
167 int data; /* Some use extra data, e.g. characters still to skip while offset is negative */
168 } stateblock;
169
170 #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) /* number of workspace ints occupied by one state */
171
172
173 #ifdef DEBUG
174 /*************************************************
175 * Print character string *
176 *************************************************/
177
178 /* Character string printing function for debugging.
179
180 Arguments:
181 p points to string
182 length number of bytes
183 f where to print
184
185 Returns: nothing
186 */
187
188 static void
189 pchars(unsigned char *p, int length, FILE *f)
190 {
191 int c; /* current byte, held as an int for isprint() and fprintf() */
192 while (length-- > 0) /* exactly length bytes are written; a zero byte does not stop the loop */
193 {
194 if (isprint(c = *(p++))) /* printable bytes are output unchanged */
195 fprintf(f, "%c", c);
196 else
197 fprintf(f, "\\x%02x", c); /* everything else is shown as a \x escape with two hex digits */
198 }
199 }
200 #endif
201
202
203
204 /*************************************************
205 * Execute a Regular Expression - DFA engine *
206 *************************************************/
207
208 /* This internal function applies a compiled pattern to a subject string,
209 starting at a given point, using a DFA engine. This function is called from the
210 external one, possibly multiple times if the pattern is not anchored. The
211 function calls itself recursively for some kinds of subpattern.
212
213 Arguments:
214 md the match_data block with fixed information
215 this_start_code the opening bracket of this subexpression's code
216 current_subject where we currently are in the subject string
217 start_offset start offset in the subject string
218 offsets vector to contain the matching string offsets
219 offsetcount size of same
220 workspace vector of workspace
221 wscount size of same
222 ims the current ims flags
223 rlevel function call recursion level
224 recursing regex recursive call level
225
226 Returns: > 0 => number of match offset pairs placed in offsets
227 = 0 => offsets overflowed; longest matches are present
228 -1 => failed to match
229 < -1 => some kind of unexpected problem
230
231 The following macros are used for adding states to the two state vectors (one
232 for the current character, one for the following character). */
233
234 #define ADD_ACTIVE(x,y) /* append a state with offset x and count y to the list for the current character */ \
235 if (active_count++ < wscount) /* wscount is the capacity of one state list, in states */ \
236 { \
237 next_active_state->offset = (x); \
238 next_active_state->count = (y); \
239 next_active_state->ims = ims; /* ims is taken from the enclosing scope; the data field is not written */ \
240 next_active_state++; \
241 DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
242 } \
243 else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the function using the macro; the call site supplies the ';' and should brace the call, since this is a bare if/else */
244
245 #define ADD_ACTIVE_DATA(x,y,z) /* as ADD_ACTIVE, but also stores z in the data field */ \
246 if (active_count++ < wscount) /* wscount is the capacity of one state list, in states */ \
247 { \
248 next_active_state->offset = (x); \
249 next_active_state->count = (y); \
250 next_active_state->ims = ims; /* ims is taken from the enclosing scope */ \
251 next_active_state->data = (z); \
252 next_active_state++; \
253 DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
254 } \
255 else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the function using the macro; the call site supplies the ';' */
256
257 #define ADD_NEW(x,y) /* append a state with offset x and count y to the list for the following character */ \
258 if (new_count++ < wscount) /* wscount is the capacity of one state list, in states */ \
259 { \
260 next_new_state->offset = (x); \
261 next_new_state->count = (y); \
262 next_new_state->ims = ims; /* ims is taken from the enclosing scope; the data field is not written */ \
263 next_new_state++; \
264 DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
265 } \
266 else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the function using the macro; the call site supplies the ';' and should brace the call, since this is a bare if/else */
267
268 #define ADD_NEW_DATA(x,y,z) /* as ADD_NEW, but also stores z in the data field */ \
269 if (new_count++ < wscount) /* wscount is the capacity of one state list, in states */ \
270 { \
271 next_new_state->offset = (x); \
272 next_new_state->count = (y); \
273 next_new_state->ims = ims; /* ims is taken from the enclosing scope */ \
274 next_new_state->data = (z); \
275 next_new_state++; \
276 DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
277 } \
278 else return PCRE_ERROR_DFA_WSSIZE /* list full: returns from the function using the macro; the call site supplies the ';' */
279
280 /* And now, here is the code */
281
282 static int
283 internal_dfa_exec(
284 dfa_match_data *md,
285 const uschar *this_start_code,
286 const uschar *current_subject,
287 int start_offset,
288 int *offsets,
289 int offsetcount,
290 int *workspace,
291 int wscount,
292 int ims,
293 int rlevel,
294 int recursing)
295 {
296 stateblock *active_states, *new_states, *temp_states;
297 stateblock *next_active_state, *next_new_state;
298
299 const uschar *ctypes, *lcc, *fcc;
300 const uschar *ptr;
301 const uschar *end_code, *first_op;
302
303 int active_count, new_count, match_count;
304
305 /* Some fields in the md block are frequently referenced, so we load them into
306 independent variables in the hope that this will perform better. */
307
308 const uschar *start_subject = md->start_subject;
309 const uschar *end_subject = md->end_subject;
310 const uschar *start_code = md->start_code;
311
312 #ifdef SUPPORT_UTF8
313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 #else
315 BOOL utf8 = FALSE;
316 #endif
317
318 rlevel++;
319 offsetcount &= (-2);
320
321 wscount -= 2;
322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323 (2 * INTS_PER_STATEBLOCK);
324
325 DPRINTF(("\n%.*s---------------------\n"
326 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328
329 ctypes = md->tables + ctypes_offset;
330 lcc = md->tables + lcc_offset;
331 fcc = md->tables + fcc_offset;
332
333 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334
335 active_states = (stateblock *)(workspace + 2);
336 next_new_state = new_states = active_states + wscount;
337 new_count = 0;
338
339 first_op = this_start_code + 1 + LINK_SIZE +
340 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341
342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343 the alternative states onto the list, and find out where the end is. This
344 makes it possible to use this function recursively, when we want to stop at a
345 matching internal ket rather than at the end.
346
347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348 a backward assertion. In that case, we have to find out the maximum amount to
349 move back, and set up each alternative appropriately. */
350
351 if (*first_op == OP_REVERSE)
352 {
353 int max_back = 0;
354 int gone_back;
355
356 end_code = this_start_code;
357 do
358 {
359 int back = GET(end_code, 2+LINK_SIZE);
360 if (back > max_back) max_back = back;
361 end_code += GET(end_code, 1);
362 }
363 while (*end_code == OP_ALT);
364
365 /* If we can't go back the amount required for the longest lookbehind
366 pattern, go back as far as we can; some alternatives may still be viable. */
367
368 #ifdef SUPPORT_UTF8
369 /* In character mode we have to step back character by character */
370
371 if (utf8)
372 {
373 for (gone_back = 0; gone_back < max_back; gone_back++)
374 {
375 if (current_subject <= start_subject) break;
376 current_subject--;
377 while (current_subject > start_subject &&
378 (*current_subject & 0xc0) == 0x80)
379 current_subject--;
380 }
381 }
382 else
383 #endif
384
385 /* In byte-mode we can do this quickly. */
386
387 {
388 gone_back = (current_subject - max_back < start_subject)?
389 current_subject - start_subject : max_back;
390 current_subject -= gone_back;
391 }
392
393 /* Now we can process the individual branches. */
394
395 end_code = this_start_code;
396 do
397 {
398 int back = GET(end_code, 2+LINK_SIZE);
399 if (back <= gone_back)
400 {
401 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402 ADD_NEW_DATA(-bstate, 0, gone_back - back);
403 }
404 end_code += GET(end_code, 1);
405 }
406 while (*end_code == OP_ALT);
407 }
408
409 /* This is the code for a "normal" subpattern (not a backward assertion). The
410 start of a whole pattern is always one of these. If we are at the top level,
411 we may be asked to restart matching from the same point that we reached for a
412 previous partial match. We still have to scan through the top-level branches to
413 find the end state. */
414
415 else
416 {
417 end_code = this_start_code;
418
419 /* Restarting */
420
421 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422 {
423 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424 new_count = workspace[1];
425 if (!workspace[0])
426 memcpy(new_states, active_states, new_count * sizeof(stateblock));
427 }
428
429 /* Not restarting */
430
431 else
432 {
433 int length = 1 + LINK_SIZE +
434 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435 do
436 {
437 ADD_NEW(end_code - start_code + length, 0);
438 end_code += GET(end_code, 1);
439 length = 1 + LINK_SIZE;
440 }
441 while (*end_code == OP_ALT);
442 }
443 }
444
445 workspace[0] = 0; /* Bit indicating which vector is current */
446
447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448
449 /* Loop for scanning the subject */
450
451 ptr = current_subject;
452 for (;;)
453 {
454 int i, j;
455 int clen, dlen;
456 unsigned int c, d;
457
458 /* Make the new state list into the active state list and empty the
459 new state list. */
460
461 temp_states = active_states;
462 active_states = new_states;
463 new_states = temp_states;
464 active_count = new_count;
465 new_count = 0;
466
467 workspace[0] ^= 1; /* Remember for the restarting feature */
468 workspace[1] = active_count;
469
470 #ifdef DEBUG
471 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473 printf("\"\n");
474
475 printf("%.*sActive states: ", rlevel*2-2, SP);
476 for (i = 0; i < active_count; i++)
477 printf("%d/%d ", active_states[i].offset, active_states[i].count);
478 printf("\n");
479 #endif
480
481 /* Set the pointers for adding new states */
482
483 next_active_state = active_states + active_count;
484 next_new_state = new_states;
485
486 /* Load the current character from the subject outside the loop, as many
487 different states may want to look at it, and we assume that at least one
488 will. */
489
490 if (ptr < end_subject)
491 {
492 clen = 1; /* Number of bytes in the character */
493 #ifdef SUPPORT_UTF8
494 if (utf8) { GETCHARLEN(c, ptr, clen); } else
495 #endif /* SUPPORT_UTF8 */
496 c = *ptr;
497 }
498 else
499 {
500 clen = 0; /* This indicates the end of the subject */
501 c = NOTACHAR; /* This value should never actually be used */
502 }
503
504 /* Scan up the active states and act on each one. The result of an action
505 may be to add more states to the currently active list (e.g. on hitting a
506 parenthesis) or it may be to put states on the new list, for considering
507 when we move the character pointer on. */
508
509 for (i = 0; i < active_count; i++)
510 {
511 stateblock *current_state = active_states + i;
512 const uschar *code;
513 int state_offset = current_state->offset;
514 int count, codevalue;
515 #ifdef SUPPORT_UCP
516 int chartype, script;
517 #endif
518
519 #ifdef DEBUG
520 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
521 if (clen == 0) printf("EOL\n");
522 else if (c > 32 && c < 127) printf("'%c'\n", c);
523 else printf("0x%02x\n", c);
524 #endif
525
526 /* This variable is referred to implicitly in the ADD_xxx macros. */
527
528 ims = current_state->ims;
529
530 /* A negative offset is a special case meaning "hold off going to this
531 (negated) state until the number of characters in the data field have
532 been skipped". */
533
534 if (state_offset < 0)
535 {
536 if (current_state->data > 0)
537 {
538 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
539 ADD_NEW_DATA(state_offset, current_state->count,
540 current_state->data - 1);
541 continue;
542 }
543 else
544 {
545 current_state->offset = state_offset = -state_offset;
546 }
547 }
548
549 /* Check for a duplicate state with the same count, and skip if found. */
550
551 for (j = 0; j < i; j++)
552 {
553 if (active_states[j].offset == state_offset &&
554 active_states[j].count == current_state->count)
555 {
556 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
557 goto NEXT_ACTIVE_STATE;
558 }
559 }
560
561 /* The state offset is the offset to the opcode */
562
563 code = start_code + state_offset;
564 codevalue = *code;
565
566 /* If this opcode is followed by an inline character, load it. It is
567 tempting to test for the presence of a subject character here, but that
568 is wrong, because sometimes zero repetitions of the subject are
569 permitted.
570
571 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
572 argument that is not a data character - but is always one byte long. We
573 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
574 this case. To keep the other cases fast, convert these ones to new opcodes.
575 */
576
577 if (coptable[codevalue] > 0)
578 {
579 dlen = 1;
580 #ifdef SUPPORT_UTF8
581 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
582 #endif /* SUPPORT_UTF8 */
583 d = code[coptable[codevalue]];
584 if (codevalue >= OP_TYPESTAR)
585 {
586 switch(d)
587 {
588 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
589 case OP_NOTPROP:
590 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
591 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
592 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
593 case OP_NOT_HSPACE:
594 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
595 case OP_NOT_VSPACE:
596 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
597 default: break;
598 }
599 }
600 }
601 else
602 {
603 dlen = 0; /* Not strictly necessary, but compilers moan */
604 d = NOTACHAR; /* if these variables are not set. */
605 }
606
607
608 /* Now process the individual opcodes */
609
610 switch (codevalue)
611 {
612
613 /* ========================================================================== */
614 /* Reached a closing bracket. If not at the end of the pattern, carry
615 on with the next opcode. Otherwise, unless we have an empty string and
616 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
617 matches so we always have the longest first. */
618
619 case OP_KET:
620 case OP_KETRMIN:
621 case OP_KETRMAX:
622 if (code != end_code)
623 {
624 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
625 if (codevalue != OP_KET)
626 {
627 ADD_ACTIVE(state_offset - GET(code, 1), 0);
628 }
629 }
630 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
631 {
632 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
633 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
634 match_count = 0;
635 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
636 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
637 if (offsetcount >= 2)
638 {
639 offsets[0] = current_subject - start_subject;
640 offsets[1] = ptr - start_subject;
641 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
642 offsets[1] - offsets[0], current_subject));
643 }
644 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
645 {
646 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
647 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
648 match_count, rlevel*2-2, SP));
649 return match_count;
650 }
651 }
652 break;
653
654 /* ========================================================================== */
655 /* These opcodes add to the current list of states without looking
656 at the current character. */
657
658 /*-----------------------------------------------------------------*/
659 case OP_ALT:
660 do { code += GET(code, 1); } while (*code == OP_ALT);
661 ADD_ACTIVE(code - start_code, 0);
662 break;
663
664 /*-----------------------------------------------------------------*/
665 case OP_BRA:
666 case OP_SBRA:
667 do
668 {
669 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
670 code += GET(code, 1);
671 }
672 while (*code == OP_ALT);
673 break;
674
675 /*-----------------------------------------------------------------*/
676 case OP_CBRA:
677 case OP_SCBRA:
678 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
679 code += GET(code, 1);
680 while (*code == OP_ALT)
681 {
682 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
683 code += GET(code, 1);
684 }
685 break;
686
687 /*-----------------------------------------------------------------*/
688 case OP_BRAZERO:
689 case OP_BRAMINZERO:
690 ADD_ACTIVE(state_offset + 1, 0);
691 code += 1 + GET(code, 2);
692 while (*code == OP_ALT) code += GET(code, 1);
693 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
694 break;
695
696 /*-----------------------------------------------------------------*/
697 case OP_SKIPZERO:
698 code += 1 + GET(code, 2);
699 while (*code == OP_ALT) code += GET(code, 1);
700 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
701 break;
702
703 /*-----------------------------------------------------------------*/
704 case OP_CIRC:
705 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
706 ((ims & PCRE_MULTILINE) != 0 &&
707 ptr != end_subject &&
708 WAS_NEWLINE(ptr)))
709 { ADD_ACTIVE(state_offset + 1, 0); }
710 break;
711
712 /*-----------------------------------------------------------------*/
713 case OP_EOD:
714 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
715 break;
716
717 /*-----------------------------------------------------------------*/
718 case OP_OPT:
719 ims = code[1];
720 ADD_ACTIVE(state_offset + 2, 0);
721 break;
722
723 /*-----------------------------------------------------------------*/
724 case OP_SOD:
725 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
726 break;
727
728 /*-----------------------------------------------------------------*/
729 case OP_SOM:
730 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
731 break;
732
733
734 /* ========================================================================== */
735 /* These opcodes inspect the next subject character, and sometimes
736 the previous one as well, but do not have an argument. The variable
737 clen contains the length of the current character and is zero if we are
738 at the end of the subject. */
739
740 /*-----------------------------------------------------------------*/
741 case OP_ANY:
742 if (clen > 0 && !IS_NEWLINE(ptr))
743 { ADD_NEW(state_offset + 1, 0); }
744 break;
745
746 /*-----------------------------------------------------------------*/
747 case OP_ALLANY:
748 if (clen > 0)
749 { ADD_NEW(state_offset + 1, 0); }
750 break;
751
752 /*-----------------------------------------------------------------*/
753 case OP_EODN:
754 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
755 { ADD_ACTIVE(state_offset + 1, 0); }
756 break;
757
758 /*-----------------------------------------------------------------*/
759 case OP_DOLL:
760 if ((md->moptions & PCRE_NOTEOL) == 0)
761 {
762 if (clen == 0 ||
763 (IS_NEWLINE(ptr) &&
764 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
765 ))
766 { ADD_ACTIVE(state_offset + 1, 0); }
767 }
768 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
769 { ADD_ACTIVE(state_offset + 1, 0); }
770 break;
771
772 /*-----------------------------------------------------------------*/
773
774 case OP_DIGIT:
775 case OP_WHITESPACE:
776 case OP_WORDCHAR:
777 if (clen > 0 && c < 256 &&
778 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
779 { ADD_NEW(state_offset + 1, 0); }
780 break;
781
782 /*-----------------------------------------------------------------*/
783 case OP_NOT_DIGIT:
784 case OP_NOT_WHITESPACE:
785 case OP_NOT_WORDCHAR:
786 if (clen > 0 && (c >= 256 ||
787 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
788 { ADD_NEW(state_offset + 1, 0); }
789 break;
790
791 /*-----------------------------------------------------------------*/
792 case OP_WORD_BOUNDARY:
793 case OP_NOT_WORD_BOUNDARY:
794 {
795 int left_word, right_word;
796
797 if (ptr > start_subject)
798 {
799 const uschar *temp = ptr - 1;
800 #ifdef SUPPORT_UTF8
801 if (utf8) BACKCHAR(temp);
802 #endif
803 GETCHARTEST(d, temp);
804 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
805 }
806 else left_word = 0;
807
808 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
809 else right_word = 0;
810
811 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
812 { ADD_ACTIVE(state_offset + 1, 0); }
813 }
814 break;
815
816
817 /*-----------------------------------------------------------------*/
818 /* Check the next character by Unicode property. We will get here only
819 if the support is in the binary; otherwise a compile-time error occurs.
820 */
821
822 #ifdef SUPPORT_UCP
823 case OP_PROP:
824 case OP_NOTPROP:
825 if (clen > 0)
826 {
827 BOOL OK;
828 int category = _pcre_ucp_findprop(c, &chartype, &script);
829 switch(code[1])
830 {
831 case PT_ANY:
832 OK = TRUE;
833 break;
834
835 case PT_LAMP:
836 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
837 break;
838
839 case PT_GC:
840 OK = category == code[2];
841 break;
842
843 case PT_PC:
844 OK = chartype == code[2];
845 break;
846
847 case PT_SC:
848 OK = script == code[2];
849 break;
850
851 /* Should never occur, but keep compilers from grumbling. */
852
853 default:
854 OK = codevalue != OP_PROP;
855 break;
856 }
857
858 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
859 }
860 break;
861 #endif
862
863
864
865 /* ========================================================================== */
866 /* These opcodes likewise inspect the subject character, but have an
867 argument that is not a data character. It is one of these opcodes:
868 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
869 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
870
871 case OP_TYPEPLUS:
872 case OP_TYPEMINPLUS:
873 case OP_TYPEPOSPLUS:
874 count = current_state->count; /* Already matched */
875 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
876 if (clen > 0)
877 {
878 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
879 (c < 256 &&
880 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
881 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
882 {
883 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
884 {
885 active_count--; /* Remove non-match possibility */
886 next_active_state--;
887 }
888 count++;
889 ADD_NEW(state_offset, count);
890 }
891 }
892 break;
893
894 /*-----------------------------------------------------------------*/
895 case OP_TYPEQUERY:
896 case OP_TYPEMINQUERY:
897 case OP_TYPEPOSQUERY:
898 ADD_ACTIVE(state_offset + 2, 0);
899 if (clen > 0)
900 {
901 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
902 (c < 256 &&
903 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
904 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
905 {
906 if (codevalue == OP_TYPEPOSQUERY)
907 {
908 active_count--; /* Remove non-match possibility */
909 next_active_state--;
910 }
911 ADD_NEW(state_offset + 2, 0);
912 }
913 }
914 break;
915
916 /*-----------------------------------------------------------------*/
917 case OP_TYPESTAR:
918 case OP_TYPEMINSTAR:
919 case OP_TYPEPOSSTAR:
920 ADD_ACTIVE(state_offset + 2, 0);
921 if (clen > 0)
922 {
923 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
924 (c < 256 &&
925 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
926 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
927 {
928 if (codevalue == OP_TYPEPOSSTAR)
929 {
930 active_count--; /* Remove non-match possibility */
931 next_active_state--;
932 }
933 ADD_NEW(state_offset, 0);
934 }
935 }
936 break;
937
938 /*-----------------------------------------------------------------*/
939 case OP_TYPEEXACT:
940 count = current_state->count; /* Number already matched */
941 if (clen > 0)
942 {
943 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
944 (c < 256 &&
945 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
946 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
947 {
948 if (++count >= GET2(code, 1))
949 { ADD_NEW(state_offset + 4, 0); }
950 else
951 { ADD_NEW(state_offset, count); }
952 }
953 }
954 break;
955
956 /*-----------------------------------------------------------------*/
957 case OP_TYPEUPTO:
958 case OP_TYPEMINUPTO:
959 case OP_TYPEPOSUPTO:
960 ADD_ACTIVE(state_offset + 4, 0);
961 count = current_state->count; /* Number already matched */
962 if (clen > 0)
963 {
964 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
965 (c < 256 &&
966 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
967 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
968 {
969 if (codevalue == OP_TYPEPOSUPTO)
970 {
971 active_count--; /* Remove non-match possibility */
972 next_active_state--;
973 }
974 if (++count >= GET2(code, 1))
975 { ADD_NEW(state_offset + 4, 0); }
976 else
977 { ADD_NEW(state_offset, count); }
978 }
979 }
980 break;
981
982 /* ========================================================================== */
983 /* These are virtual opcodes that are used when something like
984 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
985 argument. It keeps the code above fast for the other cases. The argument
986 is in the d variable. */
987
988 #ifdef SUPPORT_UCP
989 case OP_PROP_EXTRA + OP_TYPEPLUS:
990 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
991 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
992 count = current_state->count; /* Already matched */
993 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
994 if (clen > 0)
995 {
996 BOOL OK;
997 int category = _pcre_ucp_findprop(c, &chartype, &script);
998 switch(code[2])
999 {
1000 case PT_ANY:
1001 OK = TRUE;
1002 break;
1003
1004 case PT_LAMP:
1005 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1006 break;
1007
1008 case PT_GC:
1009 OK = category == code[3];
1010 break;
1011
1012 case PT_PC:
1013 OK = chartype == code[3];
1014 break;
1015
1016 case PT_SC:
1017 OK = script == code[3];
1018 break;
1019
1020 /* Should never occur, but keep compilers from grumbling. */
1021
1022 default:
1023 OK = codevalue != OP_PROP;
1024 break;
1025 }
1026
1027 if (OK == (d == OP_PROP))
1028 {
1029 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1030 {
1031 active_count--; /* Remove non-match possibility */
1032 next_active_state--;
1033 }
1034 count++;
1035 ADD_NEW(state_offset, count);
1036 }
1037 }
1038 break;
1039
1040 /*-----------------------------------------------------------------*/
1041 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1042 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1043 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1044 count = current_state->count; /* Already matched */
1045 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1046 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1047 {
1048 const uschar *nptr = ptr + clen;
1049 int ncount = 0;
1050 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1051 {
1052 active_count--; /* Remove non-match possibility */
1053 next_active_state--;
1054 }
1055 while (nptr < end_subject)
1056 {
1057 int nd;
1058 int ndlen = 1;
1059 GETCHARLEN(nd, nptr, ndlen);
1060 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1061 ncount++;
1062 nptr += ndlen;
1063 }
1064 count++;
1065 ADD_NEW_DATA(-state_offset, count, ncount);
1066 }
1067 break;
1068 #endif
1069
1070 /*-----------------------------------------------------------------*/
1071 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1072 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1073 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1074 count = current_state->count; /* Already matched */
1075 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1076 if (clen > 0)
1077 {
1078 int ncount = 0;
1079 switch (c)
1080 {
1081 case 0x000b:
1082 case 0x000c:
1083 case 0x0085:
1084 case 0x2028:
1085 case 0x2029:
1086 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1087 goto ANYNL01;
1088
1089 case 0x000d:
1090 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1091 /* Fall through */
1092
1093 ANYNL01:
1094 case 0x000a:
1095 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1096 {
1097 active_count--; /* Remove non-match possibility */
1098 next_active_state--;
1099 }
1100 count++;
1101 ADD_NEW_DATA(-state_offset, count, ncount);
1102 break;
1103
1104 default:
1105 break;
1106 }
1107 }
1108 break;
1109
1110 /*-----------------------------------------------------------------*/
1111 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1112 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1113 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1114 count = current_state->count; /* Already matched */
1115 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1116 if (clen > 0)
1117 {
1118 BOOL OK;
1119 switch (c)
1120 {
1121 case 0x000a:
1122 case 0x000b:
1123 case 0x000c:
1124 case 0x000d:
1125 case 0x0085:
1126 case 0x2028:
1127 case 0x2029:
1128 OK = TRUE;
1129 break;
1130
1131 default:
1132 OK = FALSE;
1133 break;
1134 }
1135
1136 if (OK == (d == OP_VSPACE))
1137 {
1138 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1139 {
1140 active_count--; /* Remove non-match possibility */
1141 next_active_state--;
1142 }
1143 count++;
1144 ADD_NEW_DATA(-state_offset, count, 0);
1145 }
1146 }
1147 break;
1148
1149 /*-----------------------------------------------------------------*/
1150 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1151 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1152 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1153 count = current_state->count; /* Already matched */
1154 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1155 if (clen > 0)
1156 {
1157 BOOL OK;
1158 switch (c)
1159 {
1160 case 0x09: /* HT */
1161 case 0x20: /* SPACE */
1162 case 0xa0: /* NBSP */
1163 case 0x1680: /* OGHAM SPACE MARK */
1164 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1165 case 0x2000: /* EN QUAD */
1166 case 0x2001: /* EM QUAD */
1167 case 0x2002: /* EN SPACE */
1168 case 0x2003: /* EM SPACE */
1169 case 0x2004: /* THREE-PER-EM SPACE */
1170 case 0x2005: /* FOUR-PER-EM SPACE */
1171 case 0x2006: /* SIX-PER-EM SPACE */
1172 case 0x2007: /* FIGURE SPACE */
1173 case 0x2008: /* PUNCTUATION SPACE */
1174 case 0x2009: /* THIN SPACE */
1175 case 0x200A: /* HAIR SPACE */
1176 case 0x202f: /* NARROW NO-BREAK SPACE */
1177 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1178 case 0x3000: /* IDEOGRAPHIC SPACE */
1179 OK = TRUE;
1180 break;
1181
1182 default:
1183 OK = FALSE;
1184 break;
1185 }
1186
1187 if (OK == (d == OP_HSPACE))
1188 {
1189 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1190 {
1191 active_count--; /* Remove non-match possibility */
1192 next_active_state--;
1193 }
1194 count++;
1195 ADD_NEW_DATA(-state_offset, count, 0);
1196 }
1197 }
1198 break;
1199
1200 /*-----------------------------------------------------------------*/
1201 #ifdef SUPPORT_UCP
1202 case OP_PROP_EXTRA + OP_TYPEQUERY:
1203 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1204 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1205 count = 4;
1206 goto QS1;
1207
1208 case OP_PROP_EXTRA + OP_TYPESTAR:
1209 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1210 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1211 count = 0;
1212
1213 QS1:
1214
1215 ADD_ACTIVE(state_offset + 4, 0);
1216 if (clen > 0)
1217 {
1218 BOOL OK;
1219 int category = _pcre_ucp_findprop(c, &chartype, &script);
1220 switch(code[2])
1221 {
1222 case PT_ANY:
1223 OK = TRUE;
1224 break;
1225
1226 case PT_LAMP:
1227 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1228 break;
1229
1230 case PT_GC:
1231 OK = category == code[3];
1232 break;
1233
1234 case PT_PC:
1235 OK = chartype == code[3];
1236 break;
1237
1238 case PT_SC:
1239 OK = script == code[3];
1240 break;
1241
1242 /* Should never occur, but keep compilers from grumbling. */
1243
1244 default:
1245 OK = codevalue != OP_PROP;
1246 break;
1247 }
1248
1249 if (OK == (d == OP_PROP))
1250 {
1251 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1252 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1253 {
1254 active_count--; /* Remove non-match possibility */
1255 next_active_state--;
1256 }
1257 ADD_NEW(state_offset + count, 0);
1258 }
1259 }
1260 break;
1261
1262 /*-----------------------------------------------------------------*/
1263 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1264 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1265 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1266 count = 2;
1267 goto QS2;
1268
1269 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1270 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1271 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1272 count = 0;
1273
1274 QS2:
1275
1276 ADD_ACTIVE(state_offset + 2, 0);
1277 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1278 {
1279 const uschar *nptr = ptr + clen;
1280 int ncount = 0;
1281 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1282 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1283 {
1284 active_count--; /* Remove non-match possibility */
1285 next_active_state--;
1286 }
1287 while (nptr < end_subject)
1288 {
1289 int nd;
1290 int ndlen = 1;
1291 GETCHARLEN(nd, nptr, ndlen);
1292 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1293 ncount++;
1294 nptr += ndlen;
1295 }
1296 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1297 }
1298 break;
1299 #endif
1300
1301 /*-----------------------------------------------------------------*/
1302 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1303 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1304 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1305 count = 2;
1306 goto QS3;
1307
1308 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1309 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1310 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1311 count = 0;
1312
1313 QS3:
1314 ADD_ACTIVE(state_offset + 2, 0);
1315 if (clen > 0)
1316 {
1317 int ncount = 0;
1318 switch (c)
1319 {
1320 case 0x000b:
1321 case 0x000c:
1322 case 0x0085:
1323 case 0x2028:
1324 case 0x2029:
1325 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1326 goto ANYNL02;
1327
1328 case 0x000d:
1329 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1330 /* Fall through */
1331
1332 ANYNL02:
1333 case 0x000a:
1334 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1335 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1336 {
1337 active_count--; /* Remove non-match possibility */
1338 next_active_state--;
1339 }
1340 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1341 break;
1342
1343 default:
1344 break;
1345 }
1346 }
1347 break;
1348
1349 /*-----------------------------------------------------------------*/
1350 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1351 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1352 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1353 count = 2;
1354 goto QS4;
1355
1356 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1357 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1358 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1359 count = 0;
1360
1361 QS4:
1362 ADD_ACTIVE(state_offset + 2, 0);
1363 if (clen > 0)
1364 {
1365 BOOL OK;
1366 switch (c)
1367 {
1368 case 0x000a:
1369 case 0x000b:
1370 case 0x000c:
1371 case 0x000d:
1372 case 0x0085:
1373 case 0x2028:
1374 case 0x2029:
1375 OK = TRUE;
1376 break;
1377
1378 default:
1379 OK = FALSE;
1380 break;
1381 }
1382 if (OK == (d == OP_VSPACE))
1383 {
1384 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1385 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1386 {
1387 active_count--; /* Remove non-match possibility */
1388 next_active_state--;
1389 }
1390 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1391 }
1392 }
1393 break;
1394
1395 /*-----------------------------------------------------------------*/
1396 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1397 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1398 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1399 count = 2;
1400 goto QS5;
1401
1402 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1403 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1404 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1405 count = 0;
1406
1407 QS5:
1408 ADD_ACTIVE(state_offset + 2, 0);
1409 if (clen > 0)
1410 {
1411 BOOL OK;
1412 switch (c)
1413 {
1414 case 0x09: /* HT */
1415 case 0x20: /* SPACE */
1416 case 0xa0: /* NBSP */
1417 case 0x1680: /* OGHAM SPACE MARK */
1418 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1419 case 0x2000: /* EN QUAD */
1420 case 0x2001: /* EM QUAD */
1421 case 0x2002: /* EN SPACE */
1422 case 0x2003: /* EM SPACE */
1423 case 0x2004: /* THREE-PER-EM SPACE */
1424 case 0x2005: /* FOUR-PER-EM SPACE */
1425 case 0x2006: /* SIX-PER-EM SPACE */
1426 case 0x2007: /* FIGURE SPACE */
1427 case 0x2008: /* PUNCTUATION SPACE */
1428 case 0x2009: /* THIN SPACE */
1429 case 0x200A: /* HAIR SPACE */
1430 case 0x202f: /* NARROW NO-BREAK SPACE */
1431 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1432 case 0x3000: /* IDEOGRAPHIC SPACE */
1433 OK = TRUE;
1434 break;
1435
1436 default:
1437 OK = FALSE;
1438 break;
1439 }
1440
1441 if (OK == (d == OP_HSPACE))
1442 {
1443 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1444 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1445 {
1446 active_count--; /* Remove non-match possibility */
1447 next_active_state--;
1448 }
1449 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1450 }
1451 }
1452 break;
1453
1454 /*-----------------------------------------------------------------*/
1455 #ifdef SUPPORT_UCP
1456 case OP_PROP_EXTRA + OP_TYPEEXACT:
1457 case OP_PROP_EXTRA + OP_TYPEUPTO:
1458 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1459 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1460 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1461 { ADD_ACTIVE(state_offset + 6, 0); }
1462 count = current_state->count; /* Number already matched */
1463 if (clen > 0)
1464 {
1465 BOOL OK;
1466 int category = _pcre_ucp_findprop(c, &chartype, &script);
1467 switch(code[4])
1468 {
1469 case PT_ANY:
1470 OK = TRUE;
1471 break;
1472
1473 case PT_LAMP:
1474 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1475 break;
1476
1477 case PT_GC:
1478 OK = category == code[5];
1479 break;
1480
1481 case PT_PC:
1482 OK = chartype == code[5];
1483 break;
1484
1485 case PT_SC:
1486 OK = script == code[5];
1487 break;
1488
1489 /* Should never occur, but keep compilers from grumbling. */
1490
1491 default:
1492 OK = codevalue != OP_PROP;
1493 break;
1494 }
1495
1496 if (OK == (d == OP_PROP))
1497 {
1498 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1499 {
1500 active_count--; /* Remove non-match possibility */
1501 next_active_state--;
1502 }
1503 if (++count >= GET2(code, 1))
1504 { ADD_NEW(state_offset + 6, 0); }
1505 else
1506 { ADD_NEW(state_offset, count); }
1507 }
1508 }
1509 break;
1510
1511 /*-----------------------------------------------------------------*/
1512 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1513 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1514 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1515 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1516 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1517 { ADD_ACTIVE(state_offset + 4, 0); }
1518 count = current_state->count; /* Number already matched */
1519 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1520 {
1521 const uschar *nptr = ptr + clen;
1522 int ncount = 0;
1523 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1524 {
1525 active_count--; /* Remove non-match possibility */
1526 next_active_state--;
1527 }
1528 while (nptr < end_subject)
1529 {
1530 int nd;
1531 int ndlen = 1;
1532 GETCHARLEN(nd, nptr, ndlen);
1533 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1534 ncount++;
1535 nptr += ndlen;
1536 }
1537 if (++count >= GET2(code, 1))
1538 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1539 else
1540 { ADD_NEW_DATA(-state_offset, count, ncount); }
1541 }
1542 break;
1543 #endif
1544
1545 /*-----------------------------------------------------------------*/
1546 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1547 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1548 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1549 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1550 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1551 { ADD_ACTIVE(state_offset + 4, 0); }
1552 count = current_state->count; /* Number already matched */
1553 if (clen > 0)
1554 {
1555 int ncount = 0;
1556 switch (c)
1557 {
1558 case 0x000b:
1559 case 0x000c:
1560 case 0x0085:
1561 case 0x2028:
1562 case 0x2029:
1563 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1564 goto ANYNL03;
1565
1566 case 0x000d:
1567 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1568 /* Fall through */
1569
1570 ANYNL03:
1571 case 0x000a:
1572 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1573 {
1574 active_count--; /* Remove non-match possibility */
1575 next_active_state--;
1576 }
1577 if (++count >= GET2(code, 1))
1578 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1579 else
1580 { ADD_NEW_DATA(-state_offset, count, ncount); }
1581 break;
1582
1583 default:
1584 break;
1585 }
1586 }
1587 break;
1588
1589 /*-----------------------------------------------------------------*/
1590 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1591 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1592 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1593 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1594 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1595 { ADD_ACTIVE(state_offset + 4, 0); }
1596 count = current_state->count; /* Number already matched */
1597 if (clen > 0)
1598 {
1599 BOOL OK;
1600 switch (c)
1601 {
1602 case 0x000a:
1603 case 0x000b:
1604 case 0x000c:
1605 case 0x000d:
1606 case 0x0085:
1607 case 0x2028:
1608 case 0x2029:
1609 OK = TRUE;
1610 break;
1611
1612 default:
1613 OK = FALSE;
1614 }
1615
1616 if (OK == (d == OP_VSPACE))
1617 {
1618 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1619 {
1620 active_count--; /* Remove non-match possibility */
1621 next_active_state--;
1622 }
1623 if (++count >= GET2(code, 1))
1624 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1625 else
1626 { ADD_NEW_DATA(-state_offset, count, 0); }
1627 }
1628 }
1629 break;
1630
1631 /*-----------------------------------------------------------------*/
1632 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1633 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1634 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1635 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1636 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1637 { ADD_ACTIVE(state_offset + 4, 0); }
1638 count = current_state->count; /* Number already matched */
1639 if (clen > 0)
1640 {
1641 BOOL OK;
1642 switch (c)
1643 {
1644 case 0x09: /* HT */
1645 case 0x20: /* SPACE */
1646 case 0xa0: /* NBSP */
1647 case 0x1680: /* OGHAM SPACE MARK */
1648 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1649 case 0x2000: /* EN QUAD */
1650 case 0x2001: /* EM QUAD */
1651 case 0x2002: /* EN SPACE */
1652 case 0x2003: /* EM SPACE */
1653 case 0x2004: /* THREE-PER-EM SPACE */
1654 case 0x2005: /* FOUR-PER-EM SPACE */
1655 case 0x2006: /* SIX-PER-EM SPACE */
1656 case 0x2007: /* FIGURE SPACE */
1657 case 0x2008: /* PUNCTUATION SPACE */
1658 case 0x2009: /* THIN SPACE */
1659 case 0x200A: /* HAIR SPACE */
1660 case 0x202f: /* NARROW NO-BREAK SPACE */
1661 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1662 case 0x3000: /* IDEOGRAPHIC SPACE */
1663 OK = TRUE;
1664 break;
1665
1666 default:
1667 OK = FALSE;
1668 break;
1669 }
1670
1671 if (OK == (d == OP_HSPACE))
1672 {
1673 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1674 {
1675 active_count--; /* Remove non-match possibility */
1676 next_active_state--;
1677 }
1678 if (++count >= GET2(code, 1))
1679 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1680 else
1681 { ADD_NEW_DATA(-state_offset, count, 0); }
1682 }
1683 }
1684 break;
1685
1686 /* ========================================================================== */
1687 /* These opcodes are followed by a character that is usually compared
1688 to the current subject character; it is loaded into d. We still get
1689 here even if there is no subject character, because in some cases zero
1690 repetitions are permitted. */
1691
1692 /*-----------------------------------------------------------------*/
1693 case OP_CHAR:
1694 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1695 break;
1696
1697 /*-----------------------------------------------------------------*/
1698 case OP_CHARNC:
1699 if (clen == 0) break;
1700
1701 #ifdef SUPPORT_UTF8
1702 if (utf8)
1703 {
1704 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1705 {
1706 unsigned int othercase;
1707 if (c < 128) othercase = fcc[c]; else
1708
1709 /* If we have Unicode property support, we can use it to test the
1710 other case of the character. */
1711
1712 #ifdef SUPPORT_UCP
1713 othercase = _pcre_ucp_othercase(c);
1714 #else
1715 othercase = NOTACHAR;
1716 #endif
1717
1718 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1719 }
1720 }
1721 else
1722 #endif /* SUPPORT_UTF8 */
1723
1724 /* Non-UTF-8 mode */
1725 {
1726 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1727 }
1728 break;
1729
1730
1731 #ifdef SUPPORT_UCP
1732 /*-----------------------------------------------------------------*/
1733 /* This is a tricky one because it can match more than one character.
1734 Find out how many characters to skip, and then set up a negative state
1735 to wait for them to pass before continuing. */
1736
1737 case OP_EXTUNI:
1738 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1739 {
1740 const uschar *nptr = ptr + clen;
1741 int ncount = 0;
1742 while (nptr < end_subject)
1743 {
1744 int nclen = 1;
1745 GETCHARLEN(c, nptr, nclen);
1746 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1747 ncount++;
1748 nptr += nclen;
1749 }
1750 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1751 }
1752 break;
1753 #endif
1754
1755 /*-----------------------------------------------------------------*/
1756 /* This is tricky like EXTUNI because it too can match more than one
1757 character (when CR is followed by LF). In this case, set up a negative
1758 state to wait for one character to pass before continuing. */
1759
1760 case OP_ANYNL:
1761 if (clen > 0) switch(c)
1762 {
1763 case 0x000b:
1764 case 0x000c:
1765 case 0x0085:
1766 case 0x2028:
1767 case 0x2029:
1768 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1769
1770 case 0x000a:
1771 ADD_NEW(state_offset + 1, 0);
1772 break;
1773
1774 case 0x000d:
1775 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1776 {
1777 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1778 }
1779 else
1780 {
1781 ADD_NEW(state_offset + 1, 0);
1782 }
1783 break;
1784 }
1785 break;
1786
1787 /*-----------------------------------------------------------------*/
1788 case OP_NOT_VSPACE:
1789 if (clen > 0) switch(c)
1790 {
1791 case 0x000a:
1792 case 0x000b:
1793 case 0x000c:
1794 case 0x000d:
1795 case 0x0085:
1796 case 0x2028:
1797 case 0x2029:
1798 break;
1799
1800 default:
1801 ADD_NEW(state_offset + 1, 0);
1802 break;
1803 }
1804 break;
1805
1806 /*-----------------------------------------------------------------*/
1807 case OP_VSPACE:
1808 if (clen > 0) switch(c)
1809 {
1810 case 0x000a:
1811 case 0x000b:
1812 case 0x000c:
1813 case 0x000d:
1814 case 0x0085:
1815 case 0x2028:
1816 case 0x2029:
1817 ADD_NEW(state_offset + 1, 0);
1818 break;
1819
1820 default: break;
1821 }
1822 break;
1823
1824 /*-----------------------------------------------------------------*/
1825 case OP_NOT_HSPACE:
1826 if (clen > 0) switch(c)
1827 {
1828 case 0x09: /* HT */
1829 case 0x20: /* SPACE */
1830 case 0xa0: /* NBSP */
1831 case 0x1680: /* OGHAM SPACE MARK */
1832 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1833 case 0x2000: /* EN QUAD */
1834 case 0x2001: /* EM QUAD */
1835 case 0x2002: /* EN SPACE */
1836 case 0x2003: /* EM SPACE */
1837 case 0x2004: /* THREE-PER-EM SPACE */
1838 case 0x2005: /* FOUR-PER-EM SPACE */
1839 case 0x2006: /* SIX-PER-EM SPACE */
1840 case 0x2007: /* FIGURE SPACE */
1841 case 0x2008: /* PUNCTUATION SPACE */
1842 case 0x2009: /* THIN SPACE */
1843 case 0x200A: /* HAIR SPACE */
1844 case 0x202f: /* NARROW NO-BREAK SPACE */
1845 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1846 case 0x3000: /* IDEOGRAPHIC SPACE */
1847 break;
1848
1849 default:
1850 ADD_NEW(state_offset + 1, 0);
1851 break;
1852 }
1853 break;
1854
1855 /*-----------------------------------------------------------------*/
1856 case OP_HSPACE:
1857 if (clen > 0) switch(c)
1858 {
1859 case 0x09: /* HT */
1860 case 0x20: /* SPACE */
1861 case 0xa0: /* NBSP */
1862 case 0x1680: /* OGHAM SPACE MARK */
1863 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1864 case 0x2000: /* EN QUAD */
1865 case 0x2001: /* EM QUAD */
1866 case 0x2002: /* EN SPACE */
1867 case 0x2003: /* EM SPACE */
1868 case 0x2004: /* THREE-PER-EM SPACE */
1869 case 0x2005: /* FOUR-PER-EM SPACE */
1870 case 0x2006: /* SIX-PER-EM SPACE */
1871 case 0x2007: /* FIGURE SPACE */
1872 case 0x2008: /* PUNCTUATION SPACE */
1873 case 0x2009: /* THIN SPACE */
1874 case 0x200A: /* HAIR SPACE */
1875 case 0x202f: /* NARROW NO-BREAK SPACE */
1876 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1877 case 0x3000: /* IDEOGRAPHIC SPACE */
1878 ADD_NEW(state_offset + 1, 0);
1879 break;
1880 }
1881 break;
1882
1883 /*-----------------------------------------------------------------*/
1884 /* Match a negated single character. This is only used for one-byte
1885 characters, that is, we know that d < 256. The character we are
1886 checking (c) can be multibyte. */
1887
1888 case OP_NOT:
1889 if (clen > 0)
1890 {
1891 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1892 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1893 }
1894 break;
1895
1896 /*-----------------------------------------------------------------*/
1897 case OP_PLUS:
1898 case OP_MINPLUS:
1899 case OP_POSPLUS:
1900 case OP_NOTPLUS:
1901 case OP_NOTMINPLUS:
1902 case OP_NOTPOSPLUS:
1903 count = current_state->count; /* Already matched */
1904 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1905 if (clen > 0)
1906 {
1907 unsigned int otherd = NOTACHAR;
1908 if ((ims & PCRE_CASELESS) != 0)
1909 {
1910 #ifdef SUPPORT_UTF8
1911 if (utf8 && d >= 128)
1912 {
1913 #ifdef SUPPORT_UCP
1914 otherd = _pcre_ucp_othercase(d);
1915 #endif /* SUPPORT_UCP */
1916 }
1917 else
1918 #endif /* SUPPORT_UTF8 */
1919 otherd = fcc[d];
1920 }
1921 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1922 {
1923 if (count > 0 &&
1924 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1925 {
1926 active_count--; /* Remove non-match possibility */
1927 next_active_state--;
1928 }
1929 count++;
1930 ADD_NEW(state_offset, count);
1931 }
1932 }
1933 break;
1934
1935 /*-----------------------------------------------------------------*/
1936 case OP_QUERY:
1937 case OP_MINQUERY:
1938 case OP_POSQUERY:
1939 case OP_NOTQUERY:
1940 case OP_NOTMINQUERY:
1941 case OP_NOTPOSQUERY:
1942 ADD_ACTIVE(state_offset + dlen + 1, 0);
1943 if (clen > 0)
1944 {
1945 unsigned int otherd = NOTACHAR;
1946 if ((ims & PCRE_CASELESS) != 0)
1947 {
1948 #ifdef SUPPORT_UTF8
1949 if (utf8 && d >= 128)
1950 {
1951 #ifdef SUPPORT_UCP
1952 otherd = _pcre_ucp_othercase(d);
1953 #endif /* SUPPORT_UCP */
1954 }
1955 else
1956 #endif /* SUPPORT_UTF8 */
1957 otherd = fcc[d];
1958 }
1959 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1960 {
1961 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1962 {
1963 active_count--; /* Remove non-match possibility */
1964 next_active_state--;
1965 }
1966 ADD_NEW(state_offset + dlen + 1, 0);
1967 }
1968 }
1969 break;
1970
1971 /*-----------------------------------------------------------------*/
1972 case OP_STAR:
1973 case OP_MINSTAR:
1974 case OP_POSSTAR:
1975 case OP_NOTSTAR:
1976 case OP_NOTMINSTAR:
1977 case OP_NOTPOSSTAR:
1978 ADD_ACTIVE(state_offset + dlen + 1, 0);
1979 if (clen > 0)
1980 {
1981 unsigned int otherd = NOTACHAR;
1982 if ((ims & PCRE_CASELESS) != 0)
1983 {
1984 #ifdef SUPPORT_UTF8
1985 if (utf8 && d >= 128)
1986 {
1987 #ifdef SUPPORT_UCP
1988 otherd = _pcre_ucp_othercase(d);
1989 #endif /* SUPPORT_UCP */
1990 }
1991 else
1992 #endif /* SUPPORT_UTF8 */
1993 otherd = fcc[d];
1994 }
1995 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1996 {
1997 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1998 {
1999 active_count--; /* Remove non-match possibility */
2000 next_active_state--;
2001 }
2002 ADD_NEW(state_offset, 0);
2003 }
2004 }
2005 break;
2006
2007 /*-----------------------------------------------------------------*/
2008 case OP_EXACT:
2009 case OP_NOTEXACT:
2010 count = current_state->count; /* Number already matched */
2011 if (clen > 0)
2012 {
2013 unsigned int otherd = NOTACHAR;
2014 if ((ims & PCRE_CASELESS) != 0)
2015 {
2016 #ifdef SUPPORT_UTF8
2017 if (utf8 && d >= 128)
2018 {
2019 #ifdef SUPPORT_UCP
2020 otherd = _pcre_ucp_othercase(d);
2021 #endif /* SUPPORT_UCP */
2022 }
2023 else
2024 #endif /* SUPPORT_UTF8 */
2025 otherd = fcc[d];
2026 }
2027 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2028 {
2029 if (++count >= GET2(code, 1))
2030 { ADD_NEW(state_offset + dlen + 3, 0); }
2031 else
2032 { ADD_NEW(state_offset, count); }
2033 }
2034 }
2035 break;
2036
2037 /*-----------------------------------------------------------------*/
2038 case OP_UPTO:
2039 case OP_MINUPTO:
2040 case OP_POSUPTO:
2041 case OP_NOTUPTO:
2042 case OP_NOTMINUPTO:
2043 case OP_NOTPOSUPTO:
2044 ADD_ACTIVE(state_offset + dlen + 3, 0);
2045 count = current_state->count; /* Number already matched */
2046 if (clen > 0)
2047 {
2048 unsigned int otherd = NOTACHAR;
2049 if ((ims & PCRE_CASELESS) != 0)
2050 {
2051 #ifdef SUPPORT_UTF8
2052 if (utf8 && d >= 128)
2053 {
2054 #ifdef SUPPORT_UCP
2055 otherd = _pcre_ucp_othercase(d);
2056 #endif /* SUPPORT_UCP */
2057 }
2058 else
2059 #endif /* SUPPORT_UTF8 */
2060 otherd = fcc[d];
2061 }
2062 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2063 {
2064 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2065 {
2066 active_count--; /* Remove non-match possibility */
2067 next_active_state--;
2068 }
2069 if (++count >= GET2(code, 1))
2070 { ADD_NEW(state_offset + dlen + 3, 0); }
2071 else
2072 { ADD_NEW(state_offset, count); }
2073 }
2074 }
2075 break;
2076
2077
2078 /* ========================================================================== */
2079 /* These are the class-handling opcodes */
2080
2081 case OP_CLASS:
2082 case OP_NCLASS:
2083 case OP_XCLASS:
2084 {
2085 BOOL isinclass = FALSE;
2086 int next_state_offset;
2087 const uschar *ecode;
2088
2089 /* For a simple class, there is always just a 32-byte table, and we
2090 can set isinclass from it. */
2091
2092 if (codevalue != OP_XCLASS)
2093 {
2094 ecode = code + 33;
2095 if (clen > 0)
2096 {
2097 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2098 ((code[1 + c/8] & (1 << (c&7))) != 0);
2099 }
2100 }
2101
2102 /* An extended class may have a table or a list of single characters,
2103 ranges, or both, and it may be positive or negative. There's a
2104 function that sorts all this out. */
2105
2106 else
2107 {
2108 ecode = code + GET(code, 1);
2109 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2110 }
2111
2112 /* At this point, isinclass is set for all kinds of class, and ecode
2113 points to the byte after the end of the class. If there is a
2114 quantifier, this is where it will be. */
2115
2116 next_state_offset = ecode - start_code;
2117
2118 switch (*ecode)
2119 {
2120 case OP_CRSTAR:
2121 case OP_CRMINSTAR:
2122 ADD_ACTIVE(next_state_offset + 1, 0);
2123 if (isinclass) { ADD_NEW(state_offset, 0); }
2124 break;
2125
2126 case OP_CRPLUS:
2127 case OP_CRMINPLUS:
2128 count = current_state->count; /* Already matched */
2129 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2130 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2131 break;
2132
2133 case OP_CRQUERY:
2134 case OP_CRMINQUERY:
2135 ADD_ACTIVE(next_state_offset + 1, 0);
2136 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2137 break;
2138
2139 case OP_CRRANGE:
2140 case OP_CRMINRANGE:
2141 count = current_state->count; /* Already matched */
2142 if (count >= GET2(ecode, 1))
2143 { ADD_ACTIVE(next_state_offset + 5, 0); }
2144 if (isinclass)
2145 {
2146 int max = GET2(ecode, 3);
2147 if (++count >= max && max != 0) /* Max 0 => no limit */
2148 { ADD_NEW(next_state_offset + 5, 0); }
2149 else
2150 { ADD_NEW(state_offset, count); }
2151 }
2152 break;
2153
2154 default:
2155 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2156 break;
2157 }
2158 }
2159 break;
2160
2161 /* ========================================================================== */
2162 /* These are the opcodes for fancy brackets of various kinds. We have
2163 to use recursion in order to handle them. The "always failing" assertion
2164 (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2165 though the other "backtracking verbs" are not supported. */
2166
2167 case OP_FAIL:
2168 break;
2169
2170 case OP_ASSERT:
2171 case OP_ASSERT_NOT:
2172 case OP_ASSERTBACK:
2173 case OP_ASSERTBACK_NOT:
2174 {
2175 int rc;
2176 int local_offsets[2];
2177 int local_workspace[1000];
2178 const uschar *endasscode = code + GET(code, 1);
2179
2180 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2181
2182 rc = internal_dfa_exec(
2183 md, /* static match data */
2184 code, /* this subexpression's code */
2185 ptr, /* where we currently are */
2186 ptr - start_subject, /* start offset */
2187 local_offsets, /* offset vector */
2188 sizeof(local_offsets)/sizeof(int), /* size of same */
2189 local_workspace, /* workspace vector */
2190 sizeof(local_workspace)/sizeof(int), /* size of same */
2191 ims, /* the current ims flags */
2192 rlevel, /* function recursion level */
2193 recursing); /* pass on regex recursion */
2194
2195 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2196 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2197 }
2198 break;
2199
2200 /*-----------------------------------------------------------------*/
2201 case OP_COND:
2202 case OP_SCOND:
2203 {
2204 int local_offsets[1000];
2205 int local_workspace[1000];
2206 int condcode = code[LINK_SIZE+1];
2207
2208 /* Back reference conditions are not supported */
2209
2210 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2211
2212 /* The DEFINE condition is always false */
2213
2214 if (condcode == OP_DEF)
2215 {
2216 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2217 }
2218
2219 /* The only supported version of OP_RREF is for the value RREF_ANY,
2220 which means "test if in any recursion". We can't test for specifically
2221 recursed groups. */
2222
2223 else if (condcode == OP_RREF)
2224 {
2225 int value = GET2(code, LINK_SIZE+2);
2226 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2227 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2228 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2229 }
2230
2231 /* Otherwise, the condition is an assertion */
2232
2233 else
2234 {
2235 int rc;
2236 const uschar *asscode = code + LINK_SIZE + 1;
2237 const uschar *endasscode = asscode + GET(asscode, 1);
2238
2239 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2240
2241 rc = internal_dfa_exec(
2242 md, /* fixed match data */
2243 asscode, /* this subexpression's code */
2244 ptr, /* where we currently are */
2245 ptr - start_subject, /* start offset */
2246 local_offsets, /* offset vector */
2247 sizeof(local_offsets)/sizeof(int), /* size of same */
2248 local_workspace, /* workspace vector */
2249 sizeof(local_workspace)/sizeof(int), /* size of same */
2250 ims, /* the current ims flags */
2251 rlevel, /* function recursion level */
2252 recursing); /* pass on regex recursion */
2253
2254 if ((rc >= 0) ==
2255 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2256 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2257 else
2258 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2259 }
2260 }
2261 break;
2262
2263 /*-----------------------------------------------------------------*/
2264 case OP_RECURSE:
2265 {
2266 int local_offsets[1000];
2267 int local_workspace[1000];
2268 int rc;
2269
2270 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2271 recursing + 1));
2272
2273 rc = internal_dfa_exec(
2274 md, /* fixed match data */
2275 start_code + GET(code, 1), /* this subexpression's code */
2276 ptr, /* where we currently are */
2277 ptr - start_subject, /* start offset */
2278 local_offsets, /* offset vector */
2279 sizeof(local_offsets)/sizeof(int), /* size of same */
2280 local_workspace, /* workspace vector */
2281 sizeof(local_workspace)/sizeof(int), /* size of same */
2282 ims, /* the current ims flags */
2283 rlevel, /* function recursion level */
2284 recursing + 1); /* regex recurse level */
2285
2286 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2287 recursing + 1, rc));
2288
2289 /* Ran out of internal offsets */
2290
2291 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2292
2293 /* For each successful matched substring, set up the next state with a
2294 count of characters to skip before trying it. Note that the count is in
2295 characters, not bytes. */
2296
2297 if (rc > 0)
2298 {
2299 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2300 {
2301 const uschar *p = start_subject + local_offsets[rc];
2302 const uschar *pp = start_subject + local_offsets[rc+1];
2303 int charcount = local_offsets[rc+1] - local_offsets[rc];
2304 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2305 if (charcount > 0)
2306 {
2307 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2308 }
2309 else
2310 {
2311 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2312 }
2313 }
2314 }
2315 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2316 }
2317 break;
2318
2319 /*-----------------------------------------------------------------*/
2320 case OP_ONCE:
2321 {
2322 int local_offsets[2];
2323 int local_workspace[1000];
2324
2325 int rc = internal_dfa_exec(
2326 md, /* fixed match data */
2327 code, /* this subexpression's code */
2328 ptr, /* where we currently are */
2329 ptr - start_subject, /* start offset */
2330 local_offsets, /* offset vector */
2331 sizeof(local_offsets)/sizeof(int), /* size of same */
2332 local_workspace, /* workspace vector */
2333 sizeof(local_workspace)/sizeof(int), /* size of same */
2334 ims, /* the current ims flags */
2335 rlevel, /* function recursion level */
2336 recursing); /* pass on regex recursion */
2337
2338 if (rc >= 0)
2339 {
2340 const uschar *end_subpattern = code;
2341 int charcount = local_offsets[1] - local_offsets[0];
2342 int next_state_offset, repeat_state_offset;
2343
2344 do { end_subpattern += GET(end_subpattern, 1); }
2345 while (*end_subpattern == OP_ALT);
2346 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2347
2348 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2349 arrange for the repeat state also to be added to the relevant list.
2350 Calculate the offset, or set -1 for no repeat. */
2351
2352 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2353 *end_subpattern == OP_KETRMIN)?
2354 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2355
2356 /* If we have matched an empty string, add the next state at the
2357 current character pointer. This is important so that the duplicate
2358 checking kicks in, which is what breaks infinite loops that match an
2359 empty string. */
2360
2361 if (charcount == 0)
2362 {
2363 ADD_ACTIVE(next_state_offset, 0);
2364 }
2365
2366 /* Optimization: if there are no more active states, and there
2367 are no new states yet set up, then skip over the subject string
2368 right here, to save looping. Otherwise, set up the new state to swing
2369 into action when the end of the substring is reached. */
2370
2371 else if (i + 1 >= active_count && new_count == 0)
2372 {
2373 ptr += charcount;
2374 clen = 0;
2375 ADD_NEW(next_state_offset, 0);
2376
2377 /* If we are adding a repeat state at the new character position,
2378 we must fudge things so that it is the only current state.
2379 Otherwise, it might be a duplicate of one we processed before, and
2380 that would cause it to be skipped. */
2381
2382 if (repeat_state_offset >= 0)
2383 {
2384 next_active_state = active_states;
2385 active_count = 0;
2386 i = -1;
2387 ADD_ACTIVE(repeat_state_offset, 0);
2388 }
2389 }
2390 else
2391 {
2392 const uschar *p = start_subject + local_offsets[0];
2393 const uschar *pp = start_subject + local_offsets[1];
2394 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2395 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2396 if (repeat_state_offset >= 0)
2397 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2398 }
2399
2400 }
2401 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2402 }
2403 break;
2404
2405
2406 /* ========================================================================== */
2407 /* Handle callouts */
2408
2409 case OP_CALLOUT:
2410 if (pcre_callout != NULL)
2411 {
2412 int rrc;
2413 pcre_callout_block cb;
2414 cb.version = 1; /* Version 1 of the callout block */
2415 cb.callout_number = code[1];
2416 cb.offset_vector = offsets;
2417 cb.subject = (PCRE_SPTR)start_subject;
2418 cb.subject_length = end_subject - start_subject;
2419 cb.start_match = current_subject - start_subject;
2420 cb.current_position = ptr - start_subject;
2421 cb.pattern_position = GET(code, 2);
2422 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2423 cb.capture_top = 1;
2424 cb.capture_last = -1;
2425 cb.callout_data = md->callout_data;
2426 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2427 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2428 }
2429 break;
2430
2431
2432 /* ========================================================================== */
2433 default: /* Unsupported opcode */
2434 return PCRE_ERROR_DFA_UITEM;
2435 }
2436
2437 NEXT_ACTIVE_STATE: continue;
2438
2439 } /* End of loop scanning active states */
2440
2441 /* We have finished the processing at the current subject character. If no
2442 new states have been set for the next character, we have found all the
2443 matches that we are going to find. If we are at the top level and partial
2444 matching has been requested, check for appropriate conditions. */
2445
2446 if (new_count <= 0)
2447 {
2448 if (match_count < 0 && /* No matches found */
2449 rlevel == 1 && /* Top level match function */
2450 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2451 ptr >= end_subject && /* Reached end of subject */
2452 ptr > current_subject) /* Matched non-empty string */
2453 {
2454 if (offsetcount >= 2)
2455 {
2456 offsets[0] = current_subject - start_subject;
2457 offsets[1] = end_subject - start_subject;
2458 }
2459 match_count = PCRE_ERROR_PARTIAL;
2460 }
2461
2462 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2463 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2464 rlevel*2-2, SP));
2465 break; /* In effect, "return", but see the comment below */
2466 }
2467
2468 /* One or more states are active for the next character. */
2469
2470 ptr += clen; /* Advance to next subject character */
2471 } /* Loop to move along the subject string */
2472
2473 /* Control gets here from "break" a few lines above. We do it this way because
2474 if we use "return" above, we have compiler trouble. Some compilers warn if
2475 there's nothing here because they think the function doesn't return a value. On
2476 the other hand, if we put a dummy statement here, some more clever compilers
2477 complain that it can't be reached. Sigh. */
2478
2479 return match_count;
2480 }
2481
2482
2483
2484
2485 /*************************************************
2486 * Execute a Regular Expression - DFA engine *
2487 *************************************************/
2488
2489 /* This external function applies a compiled re to a subject string using a DFA
2490 engine. This function calls the internal function multiple times if the pattern
2491 is not anchored.
2492
2493 Arguments:
2494 argument_re points to the compiled expression
2495 extra_data points to extra data or is NULL
2496 subject points to the subject string
2497 length length of subject string (may contain binary zeros)
2498 start_offset where to start in the subject string
2499 options option bits
2500 offsets vector of match offsets
2501 offsetcount size of same
2502 workspace workspace vector
2503 wscount size of same
2504
2505 Returns: > 0 => number of match offset pairs placed in offsets
2506 = 0 => offsets overflowed; longest matches are present
2507 -1 => failed to match
2508 < -1 => some kind of unexpected problem
2509 */
2510
2511 PCRE_EXP_DEFN int
2512 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2513 const char *subject, int length, int start_offset, int options, int *offsets,
2514 int offsetcount, int *workspace, int wscount)
2515 {
2516 real_pcre *re = (real_pcre *)argument_re;
2517 dfa_match_data match_block;
2518 dfa_match_data *md = &match_block;
2519 BOOL utf8, anchored, startline, firstline;
2520 const uschar *current_subject, *end_subject, *lcc;
2521
2522 pcre_study_data internal_study;
2523 const pcre_study_data *study = NULL;
2524 real_pcre internal_re;
2525
2526 const uschar *req_byte_ptr;
2527 const uschar *start_bits = NULL;
2528 BOOL first_byte_caseless = FALSE;
2529 BOOL req_byte_caseless = FALSE;
2530 int first_byte = -1;
2531 int req_byte = -1;
2532 int req_byte2 = -1;
2533 int newline;
2534
2535 /* Plausibility checks */
2536
2537 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2538 if (re == NULL || subject == NULL || workspace == NULL ||
2539 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2540 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2541 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2542
2543 /* We need to find the pointer to any study data before we test for byte
2544 flipping, so we scan the extra_data block first. This may set two fields in the
2545 match block, so we must initialize them beforehand. However, the other fields
2546 in the match block must not be set until after the byte flipping. */
2547
2548 md->tables = re->tables;
2549 md->callout_data = NULL;
2550
2551 if (extra_data != NULL)
2552 {
2553 unsigned int flags = extra_data->flags;
2554 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2555 study = (const pcre_study_data *)extra_data->study_data;
2556 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2557 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2558 return PCRE_ERROR_DFA_UMLIMIT;
2559 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2560 md->callout_data = extra_data->callout_data;
2561 if ((flags & PCRE_EXTRA_TABLES) != 0)
2562 md->tables = extra_data->tables;
2563 }
2564
2565 /* Check that the first field in the block is the magic number. If it is not,
2566 test for a regex that was compiled on a host of opposite endianness. If this is
2567 the case, flipped values are put in internal_re and internal_study if there was
2568 study data too. */
2569
2570 if (re->magic_number != MAGIC_NUMBER)
2571 {
2572 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2573 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2574 if (study != NULL) study = &internal_study;
2575 }
2576
2577 /* Set some local values */
2578
2579 current_subject = (const unsigned char *)subject + start_offset;
2580 end_subject = (const unsigned char *)subject + length;
2581 req_byte_ptr = current_subject - 1;
2582
2583 #ifdef SUPPORT_UTF8
2584 utf8 = (re->options & PCRE_UTF8) != 0;
2585 #else
2586 utf8 = FALSE;
2587 #endif
2588
2589 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2590 (re->options & PCRE_ANCHORED) != 0;
2591
2592 /* The remaining fixed data for passing around. */
2593
2594 md->start_code = (const uschar *)argument_re +
2595 re->name_table_offset + re->name_count * re->name_entry_size;
2596 md->start_subject = (const unsigned char *)subject;
2597 md->end_subject = end_subject;
2598 md->moptions = options;
2599 md->poptions = re->options;
2600
2601 /* If the BSR option is not set at match time, copy what was set
2602 at compile time. */
2603
2604 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2605 {
2606 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2607 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2608 #ifdef BSR_ANYCRLF
2609 else md->moptions |= PCRE_BSR_ANYCRLF;
2610 #endif
2611 }
2612
2613 /* Handle different types of newline. The three bits give eight cases. If
2614 nothing is set at run time, whatever was used at compile time applies. */
2615
2616 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2617 PCRE_NEWLINE_BITS)
2618 {
2619 case 0: newline = NEWLINE; break; /* Compile-time default */
2620 case PCRE_NEWLINE_CR: newline = '\r'; break;
2621 case PCRE_NEWLINE_LF: newline = '\n'; break;
2622 case PCRE_NEWLINE_CR+
2623 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2624 case PCRE_NEWLINE_ANY: newline = -1; break;
2625 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2626 default: return PCRE_ERROR_BADNEWLINE;
2627 }
2628
2629 if (newline == -2)
2630 {
2631 md->nltype = NLTYPE_ANYCRLF;
2632 }
2633 else if (newline < 0)
2634 {
2635 md->nltype = NLTYPE_ANY;
2636 }
2637 else
2638 {
2639 md->nltype = NLTYPE_FIXED;
2640 if (newline > 255)
2641 {
2642 md->nllen = 2;
2643 md->nl[0] = (newline >> 8) & 255;
2644 md->nl[1] = newline & 255;
2645 }
2646 else
2647 {
2648 md->nllen = 1;
2649 md->nl[0] = newline;
2650 }
2651 }
2652
2653 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2654 back the character offset. */
2655
2656 #ifdef SUPPORT_UTF8
2657 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2658 {
2659 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2660 return PCRE_ERROR_BADUTF8;
2661 if (start_offset > 0 && start_offset < length)
2662 {
2663 int tb = ((uschar *)subject)[start_offset];
2664 if (tb > 127)
2665 {
2666 tb &= 0xc0;
2667 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2668 }
2669 }
2670 }
2671 #endif
2672
2673 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2674 is a feature that makes it possible to save compiled regex and re-use them
2675 in other programs later. */
2676
2677 if (md->tables == NULL) md->tables = _pcre_default_tables;
2678
2679 /* The lower casing table and the "must be at the start of a line" flag are
2680 used in a loop when finding where to start. */
2681
2682 lcc = md->tables + lcc_offset;
2683 startline = (re->flags & PCRE_STARTLINE) != 0;
2684 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2685
2686 /* Set up the first character to match, if available. The first_byte value is
2687 never set for an anchored regular expression, but the anchoring may be forced
2688 at run time, so we have to test for anchoring. The first char may be unset for
2689 an unanchored pattern, of course. If there's no first char and the pattern was
2690 studied, there may be a bitmap of possible first characters. */
2691
2692 if (!anchored)
2693 {
2694 if ((re->flags & PCRE_FIRSTSET) != 0)
2695 {
2696 first_byte = re->first_byte & 255;
2697 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2698 first_byte = lcc[first_byte];
2699 }
2700 else
2701 {
2702 if (startline && study != NULL &&
2703 (study->options & PCRE_STUDY_MAPPED) != 0)
2704 start_bits = study->start_bits;
2705 }
2706 }
2707
2708 /* For anchored or unanchored matches, there may be a "last known required
2709 character" set. */
2710
2711 if ((re->flags & PCRE_REQCHSET) != 0)
2712 {
2713 req_byte = re->req_byte & 255;
2714 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2715 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2716 }
2717
2718 /* Call the main matching function, looping for a non-anchored regex after a
2719 failed match. Unless restarting, optimize by moving to the first match
2720 character if possible, when not anchored. Then unless wanting a partial match,
2721 check for a required later character. */
2722
2723 for (;;)
2724 {
2725 int rc;
2726
2727 if ((options & PCRE_DFA_RESTART) == 0)
2728 {
2729 const uschar *save_end_subject = end_subject;
2730
2731 /* Advance to a unique first char if possible. If firstline is TRUE, the
2732 start of the match is constrained to the first line of a multiline string.
2733 Implement this by temporarily adjusting end_subject so that we stop
2734 scanning at a newline. If the match fails at the newline, later code breaks
2735 this loop. */
2736
2737 if (firstline)
2738 {
2739 const uschar *t = current_subject;
2740 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2741 end_subject = t;
2742 }
2743
2744 if (first_byte >= 0)
2745 {
2746 if (first_byte_caseless)
2747 while (current_subject < end_subject &&
2748 lcc[*current_subject] != first_byte)
2749 current_subject++;
2750 else
2751 while (current_subject < end_subject && *current_subject != first_byte)
2752 current_subject++;
2753 }
2754
2755 /* Or to just after a linebreak for a multiline match if possible */
2756
2757 else if (startline)
2758 {
2759 if (current_subject > md->start_subject + start_offset)
2760 {
2761 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2762 current_subject++;
2763
2764 /* If we have just passed a CR and the newline option is ANY or
2765 ANYCRLF, and we are now at a LF, advance the match position by one more
2766 character. */
2767
2768 if (current_subject[-1] == '\r' &&
2769 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2770 current_subject < end_subject &&
2771 *current_subject == '\n')
2772 current_subject++;
2773 }
2774 }
2775
2776 /* Or to a non-unique first char after study */
2777
2778 else if (start_bits != NULL)
2779 {
2780 while (current_subject < end_subject)
2781 {
2782 register unsigned int c = *current_subject;
2783 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2784 else break;
2785 }
2786 }
2787
2788 /* Restore fudged end_subject */
2789
2790 end_subject = save_end_subject;
2791 }
2792
2793 /* If req_byte is set, we know that that character must appear in the subject
2794 for the match to succeed. If the first character is set, req_byte must be
2795 later in the subject; otherwise the test starts at the match point. This
2796 optimization can save a huge amount of work in patterns with nested unlimited
2797 repeats that aren't going to match. Writing separate code for cased/caseless
2798 versions makes it go faster, as does using an autoincrement and backing off
2799 on a match.
2800
2801 HOWEVER: when the subject string is very, very long, searching to its end can
2802 take a long time, and give bad performance on quite ordinary patterns. This
2803 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2804 don't do this when the string is sufficiently long.
2805
2806 ALSO: this processing is disabled when partial matching is requested.
2807 */
2808
2809 if (req_byte >= 0 &&
2810 end_subject - current_subject < REQ_BYTE_MAX &&
2811 (options & PCRE_PARTIAL) == 0)
2812 {
2813 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2814
2815 /* We don't need to repeat the search if we haven't yet reached the
2816 place we found it at last time. */
2817
2818 if (p > req_byte_ptr)
2819 {
2820 if (req_byte_caseless)
2821 {
2822 while (p < end_subject)
2823 {
2824 register int pp = *p++;
2825 if (pp == req_byte || pp == req_byte2) { p--; break; }
2826 }
2827 }
2828 else
2829 {
2830 while (p < end_subject)
2831 {
2832 if (*p++ == req_byte) { p--; break; }
2833 }
2834 }
2835
2836 /* If we can't find the required character, break the matching loop,
2837 which will cause a return or PCRE_ERROR_NOMATCH. */
2838
2839 if (p >= end_subject) break;
2840
2841 /* If we have found the required character, save the point where we
2842 found it, so that we don't search again next time round the loop if
2843 the start hasn't passed this character yet. */
2844
2845 req_byte_ptr = p;
2846 }
2847 }
2848
2849 /* OK, now we can do the business */
2850
2851 rc = internal_dfa_exec(
2852 md, /* fixed match data */
2853 md->start_code, /* this subexpression's code */
2854 current_subject, /* where we currently are */
2855 start_offset, /* start offset in subject */
2856 offsets, /* offset vector */
2857 offsetcount, /* size of same */
2858 workspace, /* workspace vector */
2859 wscount, /* size of same */
2860 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2861 0, /* function recurse level */
2862 0); /* regex recurse level */
2863
2864 /* Anything other than "no match" means we are done, always; otherwise, carry
2865 on only if not anchored. */
2866
2867 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2868
2869 /* Advance to the next subject character unless we are at the end of a line
2870 and firstline is set. */
2871
2872 if (firstline && IS_NEWLINE(current_subject)) break;
2873 current_subject++;
2874 if (utf8)
2875 {
2876 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2877 current_subject++;
2878 }
2879 if (current_subject > end_subject) break;
2880
2881 /* If we have just passed a CR and we are now at a LF, and the pattern does
2882 not contain any explicit matches for \r or \n, and the newline option is CRLF
2883 or ANY or ANYCRLF, advance the match position by one more character. */
2884
2885 if (current_subject[-1] == '\r' &&
2886 current_subject < end_subject &&
2887 *current_subject == '\n' &&
2888 (re->flags & PCRE_HASCRORLF) == 0 &&
2889 (md->nltype == NLTYPE_ANY ||
2890 md->nltype == NLTYPE_ANYCRLF ||
2891 md->nllen == 2))
2892 current_subject++;
2893
2894 } /* "Bumpalong" loop */
2895
2896 return PCRE_ERROR_NOMATCH;
2897 }
2898
2899 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12