/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 178 - (show annotations) (download)
Wed Jun 13 08:44:34 2007 UTC (7 years, 5 months ago) by ph10
File MIME type: text/plain
File size: 94293 byte(s)
Add support for \h, \H, \v, \V.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
/* These three names are defined ahead of the #include of pcre_internal.h.
NOTE(review): presumably shared macros in that header (such as the IS_NEWLINE
and WAS_NEWLINE tests used in this file) expand to code that refers to them -
confirm against the header. In this module they resolve to the match data
pointer "md" and to the local start_subject and end_subject pointers. */

#define NLBLOCK md             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
50
51 #include "pcre_internal.h"
52
53
/* For use to indent debugging output. The debug printf calls print SP with a
"%.*s" precision of rlevel*2-2, so the string has to be a run of spaces long
enough for the indentation to grow with the recursion level; a single space
would cap every indent at one column. Deeper levels than the string length are
simply capped at the full string. */

#define SP "                    "
57
58
59
60 /*************************************************
61 * Code parameters and static tables *
62 *************************************************/
63
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions: the offset is added to the opcode when
its type argument is \p or \P (PROP), \X (EXTUNI), \R (ANYNL), \h or \H
(HSPACE), or \v or \V (VSPACE), giving a distinct "virtual" opcode that can be
used as a case label. A gap of 20 between the blocks should be enough. The
resulting opcodes don't have to be less than 256 because they are never stored,
so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
74
75
76 /* This table identifies those opcodes that are followed immediately by a
77 character that is to be tested in some way. This makes is possible to
78 centralize the loading of these characters. In the case of Type * etc, the
79 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
80 small value. ***NOTE*** If the start of this table is modified, the two tables
81 that follow must also be modified. */
82
83 static uschar coptable[] = {
84 0, /* End */
85 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
86 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
87 0, 0, /* Any, Anybyte */
88 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
89 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
90 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
91 1, /* Char */
92 1, /* Charnc */
93 1, /* not */
94 /* Positive single-char repeats */
95 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
96 3, 3, 3, /* upto, minupto, exact */
97 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
98 /* Negative single-char repeats - only for chars < 256 */
99 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* NOT upto, minupto, exact */
101 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
102 /* Positive type repeats */
103 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
104 3, 3, 3, /* Type upto, minupto, exact */
105 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
106 /* Character class & ref repeats */
107 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
108 0, 0, /* CRRANGE, CRMINRANGE */
109 0, /* CLASS */
110 0, /* NCLASS */
111 0, /* XCLASS - variable length */
112 0, /* REF */
113 0, /* RECURSE */
114 0, /* CALLOUT */
115 0, /* Alt */
116 0, /* Ket */
117 0, /* KetRmax */
118 0, /* KetRmin */
119 0, /* Assert */
120 0, /* Assert not */
121 0, /* Assert behind */
122 0, /* Assert behind not */
123 0, /* Reverse */
124 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
125 0, 0, 0, /* SBRA, SCBRA, SCOND */
126 0, /* CREF */
127 0, /* RREF */
128 0, /* DEF */
129 0, 0 /* BRAZERO, BRAMINZERO */
130 };
131
132 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
133 and \w */
134
135 static uschar toptable1[] = {
136 0, 0, 0, 0, 0, 0,
137 ctype_digit, ctype_digit,
138 ctype_space, ctype_space,
139 ctype_word, ctype_word,
140 0 /* OP_ANY */
141 };
142
143 static uschar toptable2[] = {
144 0, 0, 0, 0, 0, 0,
145 ctype_digit, 0,
146 ctype_space, 0,
147 ctype_word, 0,
148 1 /* OP_ANY */
149 };
150
151
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints.

The matcher also stores a negated offset to mean "hold off going to this state
until the number of characters given in the data field have been skipped". */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints of workspace taken by one state. The value is cast to int so
that arithmetic mixing it with the (signed) workspace size is not silently
converted to unsigned, which is what the size_t result of sizeof would cause. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
165
166
167 #ifdef DEBUG
/*************************************************
*        Print character string                  *
*************************************************/

/* Character string printing function for debugging. Bytes for which isprint()
is true are written as themselves; every other byte is written as a backslash,
an "x", and two lower case hexadecimal digits. The string is only read, so it
is taken through a const pointer and callers need not cast constness away.

Arguments:
  p           points to string
  length      number of bytes; nothing is printed unless this is positive
  f           where to print

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
194 #endif
195
196
197
198 /*************************************************
199 * Execute a Regular Expression - DFA engine *
200 *************************************************/
201
202 /* This internal function applies a compiled pattern to a subject string,
203 starting at a given point, using a DFA engine. This function is called from the
204 external one, possibly multiple times if the pattern is not anchored. The
205 function calls itself recursively for some kinds of subpattern.
206
207 Arguments:
208 md the match_data block with fixed information
209 this_start_code the opening bracket of this subexpression's code
210 current_subject where we currently are in the subject string
211 start_offset start offset in the subject string
212 offsets vector to contain the matching string offsets
213 offsetcount size of same
214 workspace vector of workspace
215 wscount size of same
216 ims the current ims flags
217 rlevel function call recursion level
218 recursing regex recursive call level
219
220 Returns: > 0 => number of match offset pairs placed in offsets
221 = 0 => offsets overflowed; longest matches are present
222 -1 => failed to match
223 < -1 => some kind of unexpected problem
224
225 The following macros are used for adding states to the two state vectors (one
226 for the current character, one for the following character). */
227
/* All four macros rely on variables of the function that uses them: the state
counters active_count and new_count, the capacity wscount, the fill pointers
next_active_state and next_new_state, the current ims flags, and (for debug
output only) rlevel. If the relevant list is already full, the macro makes the
enclosing function return PCRE_ERROR_DFA_WSSIZE. The "if ... else return" shape
needs the terminating semicolon to be supplied at the point of use.

The arguments printed by the debug statements are cast to int, because callers
pass pointer differences (such as code - start_code) that do not have type int
and so must not be handed to a %d conversion as they are. */

/* Add a state to the list for the current character. The data field of the
new state is not set. */

#define ADD_ACTIVE(x,y) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (int)(x), (int)(y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* As ADD_ACTIVE, but also set the data field of the new state to z. */

#define ADD_ACTIVE_DATA(x,y,z) \
  if (active_count++ < wscount) \
    { \
    next_active_state->offset = (x); \
    next_active_state->count  = (y); \
    next_active_state->ims    = ims; \
    next_active_state->data   = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (int)(x), \
      (int)(y), (int)(z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* Add a state to the list for the following character. The data field of the
new state is not set. */

#define ADD_NEW(x,y) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (int)(x), (int)(y))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE

/* As ADD_NEW, but also set the data field of the new state to z. */

#define ADD_NEW_DATA(x,y,z) \
  if (new_count++ < wscount) \
    { \
    next_new_state->offset = (x); \
    next_new_state->count  = (y); \
    next_new_state->ims    = ims; \
    next_new_state->data   = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (int)(x), \
      (int)(y), (int)(z))); \
    } \
  else return PCRE_ERROR_DFA_WSSIZE
273
274 /* And now, here is the code */
275
276 static int
277 internal_dfa_exec(
278 dfa_match_data *md,
279 const uschar *this_start_code,
280 const uschar *current_subject,
281 int start_offset,
282 int *offsets,
283 int offsetcount,
284 int *workspace,
285 int wscount,
286 int ims,
287 int rlevel,
288 int recursing)
289 {
290 stateblock *active_states, *new_states, *temp_states;
291 stateblock *next_active_state, *next_new_state;
292
293 const uschar *ctypes, *lcc, *fcc;
294 const uschar *ptr;
295 const uschar *end_code, *first_op;
296
297 int active_count, new_count, match_count;
298
299 /* Some fields in the md block are frequently referenced, so we load them into
300 independent variables in the hope that this will perform better. */
301
302 const uschar *start_subject = md->start_subject;
303 const uschar *end_subject = md->end_subject;
304 const uschar *start_code = md->start_code;
305
306 #ifdef SUPPORT_UTF8
307 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
308 #else
309 BOOL utf8 = FALSE;
310 #endif
311
312 rlevel++;
313 offsetcount &= (-2);
314
315 wscount -= 2;
316 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
317 (2 * INTS_PER_STATEBLOCK);
318
319 DPRINTF(("\n%.*s---------------------\n"
320 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
321 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
322
323 ctypes = md->tables + ctypes_offset;
324 lcc = md->tables + lcc_offset;
325 fcc = md->tables + fcc_offset;
326
327 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
328
329 active_states = (stateblock *)(workspace + 2);
330 next_new_state = new_states = active_states + wscount;
331 new_count = 0;
332
333 first_op = this_start_code + 1 + LINK_SIZE +
334 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
335
336 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
337 the alternative states onto the list, and find out where the end is. This
338 makes it possible to use this function recursively, when we want to stop at a
339 matching internal ket rather than at the end.
340
341 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
342 a backward assertion. In that case, we have to find out the maximum amount to
343 move back, and set up each alternative appropriately. */
344
345 if (*first_op == OP_REVERSE)
346 {
347 int max_back = 0;
348 int gone_back;
349
350 end_code = this_start_code;
351 do
352 {
353 int back = GET(end_code, 2+LINK_SIZE);
354 if (back > max_back) max_back = back;
355 end_code += GET(end_code, 1);
356 }
357 while (*end_code == OP_ALT);
358
359 /* If we can't go back the amount required for the longest lookbehind
360 pattern, go back as far as we can; some alternatives may still be viable. */
361
362 #ifdef SUPPORT_UTF8
363 /* In character mode we have to step back character by character */
364
365 if (utf8)
366 {
367 for (gone_back = 0; gone_back < max_back; gone_back++)
368 {
369 if (current_subject <= start_subject) break;
370 current_subject--;
371 while (current_subject > start_subject &&
372 (*current_subject & 0xc0) == 0x80)
373 current_subject--;
374 }
375 }
376 else
377 #endif
378
379 /* In byte-mode we can do this quickly. */
380
381 {
382 gone_back = (current_subject - max_back < start_subject)?
383 current_subject - start_subject : max_back;
384 current_subject -= gone_back;
385 }
386
387 /* Now we can process the individual branches. */
388
389 end_code = this_start_code;
390 do
391 {
392 int back = GET(end_code, 2+LINK_SIZE);
393 if (back <= gone_back)
394 {
395 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
396 ADD_NEW_DATA(-bstate, 0, gone_back - back);
397 }
398 end_code += GET(end_code, 1);
399 }
400 while (*end_code == OP_ALT);
401 }
402
403 /* This is the code for a "normal" subpattern (not a backward assertion). The
404 start of a whole pattern is always one of these. If we are at the top level,
405 we may be asked to restart matching from the same point that we reached for a
406 previous partial match. We still have to scan through the top-level branches to
407 find the end state. */
408
409 else
410 {
411 end_code = this_start_code;
412
413 /* Restarting */
414
415 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
416 {
417 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
418 new_count = workspace[1];
419 if (!workspace[0])
420 memcpy(new_states, active_states, new_count * sizeof(stateblock));
421 }
422
423 /* Not restarting */
424
425 else
426 {
427 int length = 1 + LINK_SIZE +
428 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
429 do
430 {
431 ADD_NEW(end_code - start_code + length, 0);
432 end_code += GET(end_code, 1);
433 length = 1 + LINK_SIZE;
434 }
435 while (*end_code == OP_ALT);
436 }
437 }
438
439 workspace[0] = 0; /* Bit indicating which vector is current */
440
441 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
442
443 /* Loop for scanning the subject */
444
445 ptr = current_subject;
446 for (;;)
447 {
448 int i, j;
449 int clen, dlen;
450 unsigned int c, d;
451
452 /* Make the new state list into the active state list and empty the
453 new state list. */
454
455 temp_states = active_states;
456 active_states = new_states;
457 new_states = temp_states;
458 active_count = new_count;
459 new_count = 0;
460
461 workspace[0] ^= 1; /* Remember for the restarting feature */
462 workspace[1] = active_count;
463
464 #ifdef DEBUG
465 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
466 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
467 printf("\"\n");
468
469 printf("%.*sActive states: ", rlevel*2-2, SP);
470 for (i = 0; i < active_count; i++)
471 printf("%d/%d ", active_states[i].offset, active_states[i].count);
472 printf("\n");
473 #endif
474
475 /* Set the pointers for adding new states */
476
477 next_active_state = active_states + active_count;
478 next_new_state = new_states;
479
480 /* Load the current character from the subject outside the loop, as many
481 different states may want to look at it, and we assume that at least one
482 will. */
483
484 if (ptr < end_subject)
485 {
486 clen = 1; /* Number of bytes in the character */
487 #ifdef SUPPORT_UTF8
488 if (utf8) { GETCHARLEN(c, ptr, clen); } else
489 #endif /* SUPPORT_UTF8 */
490 c = *ptr;
491 }
492 else
493 {
494 clen = 0; /* This indicates the end of the subject */
495 c = NOTACHAR; /* This value should never actually be used */
496 }
497
498 /* Scan up the active states and act on each one. The result of an action
499 may be to add more states to the currently active list (e.g. on hitting a
500 parenthesis) or it may be to put states on the new list, for considering
501 when we move the character pointer on. */
502
503 for (i = 0; i < active_count; i++)
504 {
505 stateblock *current_state = active_states + i;
506 const uschar *code;
507 int state_offset = current_state->offset;
508 int count, codevalue;
509 #ifdef SUPPORT_UCP
510 int chartype, script;
511 #endif
512
513 #ifdef DEBUG
514 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
515 if (clen == 0) printf("EOL\n");
516 else if (c > 32 && c < 127) printf("'%c'\n", c);
517 else printf("0x%02x\n", c);
518 #endif
519
520 /* This variable is referred to implicitly in the ADD_xxx macros. */
521
522 ims = current_state->ims;
523
524 /* A negative offset is a special case meaning "hold off going to this
525 (negated) state until the number of characters in the data field have
526 been skipped". */
527
528 if (state_offset < 0)
529 {
530 if (current_state->data > 0)
531 {
532 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
533 ADD_NEW_DATA(state_offset, current_state->count,
534 current_state->data - 1);
535 continue;
536 }
537 else
538 {
539 current_state->offset = state_offset = -state_offset;
540 }
541 }
542
543 /* Check for a duplicate state with the same count, and skip if found. */
544
545 for (j = 0; j < i; j++)
546 {
547 if (active_states[j].offset == state_offset &&
548 active_states[j].count == current_state->count)
549 {
550 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
551 goto NEXT_ACTIVE_STATE;
552 }
553 }
554
555 /* The state offset is the offset to the opcode */
556
557 code = start_code + state_offset;
558 codevalue = *code;
559
560 /* If this opcode is followed by an inline character, load it. It is
561 tempting to test for the presence of a subject character here, but that
562 is wrong, because sometimes zero repetitions of the subject are
563 permitted.
564
565 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
566 argument that is not a data character - but is always one byte long. We
567 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
568 this case. To keep the other cases fast, convert these ones to new opcodes.
569 */
570
571 if (coptable[codevalue] > 0)
572 {
573 dlen = 1;
574 #ifdef SUPPORT_UTF8
575 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
576 #endif /* SUPPORT_UTF8 */
577 d = code[coptable[codevalue]];
578 if (codevalue >= OP_TYPESTAR)
579 {
580 switch(d)
581 {
582 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
583 case OP_NOTPROP:
584 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
585 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
586 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
587 case OP_NOT_HSPACE:
588 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
589 case OP_NOT_VSPACE:
590 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
591 default: break;
592 }
593 }
594 }
595 else
596 {
597 dlen = 0; /* Not strictly necessary, but compilers moan */
598 d = NOTACHAR; /* if these variables are not set. */
599 }
600
601
602 /* Now process the individual opcodes */
603
604 switch (codevalue)
605 {
606
607 /* ========================================================================== */
608 /* Reached a closing bracket. If not at the end of the pattern, carry
609 on with the next opcode. Otherwise, unless we have an empty string and
610 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
611 matches so we always have the longest first. */
612
613 case OP_KET:
614 case OP_KETRMIN:
615 case OP_KETRMAX:
616 if (code != end_code)
617 {
618 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
619 if (codevalue != OP_KET)
620 {
621 ADD_ACTIVE(state_offset - GET(code, 1), 0);
622 }
623 }
624 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
625 {
626 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
627 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
628 match_count = 0;
629 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
630 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
631 if (offsetcount >= 2)
632 {
633 offsets[0] = current_subject - start_subject;
634 offsets[1] = ptr - start_subject;
635 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
636 offsets[1] - offsets[0], current_subject));
637 }
638 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
639 {
640 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
641 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
642 match_count, rlevel*2-2, SP));
643 return match_count;
644 }
645 }
646 break;
647
648 /* ========================================================================== */
649 /* These opcodes add to the current list of states without looking
650 at the current character. */
651
652 /*-----------------------------------------------------------------*/
653 case OP_ALT:
654 do { code += GET(code, 1); } while (*code == OP_ALT);
655 ADD_ACTIVE(code - start_code, 0);
656 break;
657
658 /*-----------------------------------------------------------------*/
659 case OP_BRA:
660 case OP_SBRA:
661 do
662 {
663 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
664 code += GET(code, 1);
665 }
666 while (*code == OP_ALT);
667 break;
668
669 /*-----------------------------------------------------------------*/
670 case OP_CBRA:
671 case OP_SCBRA:
672 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
673 code += GET(code, 1);
674 while (*code == OP_ALT)
675 {
676 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
677 code += GET(code, 1);
678 }
679 break;
680
681 /*-----------------------------------------------------------------*/
682 case OP_BRAZERO:
683 case OP_BRAMINZERO:
684 ADD_ACTIVE(state_offset + 1, 0);
685 code += 1 + GET(code, 2);
686 while (*code == OP_ALT) code += GET(code, 1);
687 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
688 break;
689
690 /*-----------------------------------------------------------------*/
691 case OP_CIRC:
692 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
693 ((ims & PCRE_MULTILINE) != 0 &&
694 ptr != end_subject &&
695 WAS_NEWLINE(ptr)))
696 { ADD_ACTIVE(state_offset + 1, 0); }
697 break;
698
699 /*-----------------------------------------------------------------*/
700 case OP_EOD:
701 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
702 break;
703
704 /*-----------------------------------------------------------------*/
705 case OP_OPT:
706 ims = code[1];
707 ADD_ACTIVE(state_offset + 2, 0);
708 break;
709
710 /*-----------------------------------------------------------------*/
711 case OP_SOD:
712 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
713 break;
714
715 /*-----------------------------------------------------------------*/
716 case OP_SOM:
717 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
718 break;
719
720
721 /* ========================================================================== */
722 /* These opcodes inspect the next subject character, and sometimes
723 the previous one as well, but do not have an argument. The variable
724 clen contains the length of the current character and is zero if we are
725 at the end of the subject. */
726
727 /*-----------------------------------------------------------------*/
728 case OP_ANY:
729 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
730 { ADD_NEW(state_offset + 1, 0); }
731 break;
732
733 /*-----------------------------------------------------------------*/
734 case OP_EODN:
735 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
736 { ADD_ACTIVE(state_offset + 1, 0); }
737 break;
738
739 /*-----------------------------------------------------------------*/
740 case OP_DOLL:
741 if ((md->moptions & PCRE_NOTEOL) == 0)
742 {
743 if (clen == 0 ||
744 (IS_NEWLINE(ptr) &&
745 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
746 ))
747 { ADD_ACTIVE(state_offset + 1, 0); }
748 }
749 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
750 { ADD_ACTIVE(state_offset + 1, 0); }
751 break;
752
753 /*-----------------------------------------------------------------*/
754
755 case OP_DIGIT:
756 case OP_WHITESPACE:
757 case OP_WORDCHAR:
758 if (clen > 0 && c < 256 &&
759 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
760 { ADD_NEW(state_offset + 1, 0); }
761 break;
762
763 /*-----------------------------------------------------------------*/
764 case OP_NOT_DIGIT:
765 case OP_NOT_WHITESPACE:
766 case OP_NOT_WORDCHAR:
767 if (clen > 0 && (c >= 256 ||
768 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
769 { ADD_NEW(state_offset + 1, 0); }
770 break;
771
772 /*-----------------------------------------------------------------*/
773 case OP_WORD_BOUNDARY:
774 case OP_NOT_WORD_BOUNDARY:
775 {
776 int left_word, right_word;
777
778 if (ptr > start_subject)
779 {
780 const uschar *temp = ptr - 1;
781 #ifdef SUPPORT_UTF8
782 if (utf8) BACKCHAR(temp);
783 #endif
784 GETCHARTEST(d, temp);
785 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
786 }
787 else left_word = 0;
788
789 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
790 else right_word = 0;
791
792 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
793 { ADD_ACTIVE(state_offset + 1, 0); }
794 }
795 break;
796
797
798 /*-----------------------------------------------------------------*/
799 /* Check the next character by Unicode property. We will get here only
800 if the support is in the binary; otherwise a compile-time error occurs.
801 */
802
803 #ifdef SUPPORT_UCP
804 case OP_PROP:
805 case OP_NOTPROP:
806 if (clen > 0)
807 {
808 BOOL OK;
809 int category = _pcre_ucp_findprop(c, &chartype, &script);
810 switch(code[1])
811 {
812 case PT_ANY:
813 OK = TRUE;
814 break;
815
816 case PT_LAMP:
817 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
818 break;
819
820 case PT_GC:
821 OK = category == code[2];
822 break;
823
824 case PT_PC:
825 OK = chartype == code[2];
826 break;
827
828 case PT_SC:
829 OK = script == code[2];
830 break;
831
832 /* Should never occur, but keep compilers from grumbling. */
833
834 default:
835 OK = codevalue != OP_PROP;
836 break;
837 }
838
839 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
840 }
841 break;
842 #endif
843
844
845
846 /* ========================================================================== */
847 /* These opcodes likewise inspect the subject character, but have an
848 argument that is not a data character. It is one of these opcodes:
849 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
850 OP_NOT_WORDCHAR. The value is loaded into d. */
851
852 case OP_TYPEPLUS:
853 case OP_TYPEMINPLUS:
854 case OP_TYPEPOSPLUS:
855 count = current_state->count; /* Already matched */
856 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
857 if (clen > 0)
858 {
859 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
860 (c < 256 &&
861 (d != OP_ANY ||
862 (ims & PCRE_DOTALL) != 0 ||
863 !IS_NEWLINE(ptr)
864 ) &&
865 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
866 {
867 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
868 {
869 active_count--; /* Remove non-match possibility */
870 next_active_state--;
871 }
872 count++;
873 ADD_NEW(state_offset, count);
874 }
875 }
876 break;
877
878 /*-----------------------------------------------------------------*/
879 case OP_TYPEQUERY:
880 case OP_TYPEMINQUERY:
881 case OP_TYPEPOSQUERY:
882 ADD_ACTIVE(state_offset + 2, 0);
883 if (clen > 0)
884 {
885 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
886 (c < 256 &&
887 (d != OP_ANY ||
888 (ims & PCRE_DOTALL) != 0 ||
889 !IS_NEWLINE(ptr)
890 ) &&
891 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
892 {
893 if (codevalue == OP_TYPEPOSQUERY)
894 {
895 active_count--; /* Remove non-match possibility */
896 next_active_state--;
897 }
898 ADD_NEW(state_offset + 2, 0);
899 }
900 }
901 break;
902
903 /*-----------------------------------------------------------------*/
904 case OP_TYPESTAR:
905 case OP_TYPEMINSTAR:
906 case OP_TYPEPOSSTAR:
907 ADD_ACTIVE(state_offset + 2, 0);
908 if (clen > 0)
909 {
910 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
911 (c < 256 &&
912 (d != OP_ANY ||
913 (ims & PCRE_DOTALL) != 0 ||
914 !IS_NEWLINE(ptr)
915 ) &&
916 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
917 {
918 if (codevalue == OP_TYPEPOSSTAR)
919 {
920 active_count--; /* Remove non-match possibility */
921 next_active_state--;
922 }
923 ADD_NEW(state_offset, 0);
924 }
925 }
926 break;
927
928 /*-----------------------------------------------------------------*/
929 case OP_TYPEEXACT:
930 count = current_state->count; /* Number already matched */
931 if (clen > 0)
932 {
933 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
934 (c < 256 &&
935 (d != OP_ANY ||
936 (ims & PCRE_DOTALL) != 0 ||
937 !IS_NEWLINE(ptr)
938 ) &&
939 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
940 {
941 if (++count >= GET2(code, 1))
942 { ADD_NEW(state_offset + 4, 0); }
943 else
944 { ADD_NEW(state_offset, count); }
945 }
946 }
947 break;
948
949 /*-----------------------------------------------------------------*/
950 case OP_TYPEUPTO:
951 case OP_TYPEMINUPTO:
952 case OP_TYPEPOSUPTO:
953 ADD_ACTIVE(state_offset + 4, 0);
954 count = current_state->count; /* Number already matched */
955 if (clen > 0)
956 {
957 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
958 (c < 256 &&
959 (d != OP_ANY ||
960 (ims & PCRE_DOTALL) != 0 ||
961 !IS_NEWLINE(ptr)
962 ) &&
963 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
964 {
965 if (codevalue == OP_TYPEPOSUPTO)
966 {
967 active_count--; /* Remove non-match possibility */
968 next_active_state--;
969 }
970 if (++count >= GET2(code, 1))
971 { ADD_NEW(state_offset + 4, 0); }
972 else
973 { ADD_NEW(state_offset, count); }
974 }
975 }
976 break;
977
978 /* ========================================================================== */
979 /* These are virtual opcodes that are used when something like
980 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the \h,
981 \H, \v, \V opcodes as its argument. It keeps the code above fast for the
982 other cases. The argument is in the d variable. */
983
984 #ifdef SUPPORT_UCP
985 case OP_PROP_EXTRA + OP_TYPEPLUS:
986 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
987 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
988 count = current_state->count; /* Already matched */
989 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
990 if (clen > 0)
991 {
992 BOOL OK;
993 int category = _pcre_ucp_findprop(c, &chartype, &script);
994 switch(code[2])
995 {
996 case PT_ANY:
997 OK = TRUE;
998 break;
999
1000 case PT_LAMP:
1001 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1002 break;
1003
1004 case PT_GC:
1005 OK = category == code[3];
1006 break;
1007
1008 case PT_PC:
1009 OK = chartype == code[3];
1010 break;
1011
1012 case PT_SC:
1013 OK = script == code[3];
1014 break;
1015
1016 /* Should never occur, but keep compilers from grumbling. */
1017
1018 default:
1019 OK = codevalue != OP_PROP;
1020 break;
1021 }
1022
1023 if (OK == (d == OP_PROP))
1024 {
1025 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1026 {
1027 active_count--; /* Remove non-match possibility */
1028 next_active_state--;
1029 }
1030 count++;
1031 ADD_NEW(state_offset, count);
1032 }
1033 }
1034 break;
1035
1036 /*-----------------------------------------------------------------*/
1037 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1038 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1039 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1040 count = current_state->count; /* Already matched */
1041 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1042 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1043 {
1044 const uschar *nptr = ptr + clen;
1045 int ncount = 0;
1046 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1047 {
1048 active_count--; /* Remove non-match possibility */
1049 next_active_state--;
1050 }
1051 while (nptr < end_subject)
1052 {
1053 int nd;
1054 int ndlen = 1;
1055 GETCHARLEN(nd, nptr, ndlen);
1056 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1057 ncount++;
1058 nptr += ndlen;
1059 }
1060 count++;
1061 ADD_NEW_DATA(-state_offset, count, ncount);
1062 }
1063 break;
1064 #endif
1065
1066 /*-----------------------------------------------------------------*/
1067 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1068 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1069 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1070 count = current_state->count; /* Already matched */
1071 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1072 if (clen > 0)
1073 {
1074 int ncount = 0;
1075 switch (c)
1076 {
1077 case 0x000d:
1078 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1079 /* Fall through */
1080 case 0x000a:
1081 case 0x000b:
1082 case 0x000c:
1083 case 0x0085:
1084 case 0x2028:
1085 case 0x2029:
1086 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1087 {
1088 active_count--; /* Remove non-match possibility */
1089 next_active_state--;
1090 }
1091 count++;
1092 ADD_NEW_DATA(-state_offset, count, ncount);
1093 break;
1094 default:
1095 break;
1096 }
1097 }
1098 break;
1099
1100 /*-----------------------------------------------------------------*/
1101 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1102 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1103 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1104 count = current_state->count; /* Already matched */
1105 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1106 if (clen > 0)
1107 {
1108 BOOL OK;
1109 switch (c)
1110 {
1111 case 0x000a:
1112 case 0x000b:
1113 case 0x000c:
1114 case 0x000d:
1115 case 0x0085:
1116 case 0x2028:
1117 case 0x2029:
1118 OK = TRUE;
1119 break;
1120
1121 default:
1122 OK = FALSE;
1123 break;
1124 }
1125
1126 if (OK == (d == OP_VSPACE))
1127 {
1128 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1129 {
1130 active_count--; /* Remove non-match possibility */
1131 next_active_state--;
1132 }
1133 count++;
1134 ADD_NEW_DATA(-state_offset, count, 0);
1135 }
1136 }
1137 break;
1138
1139 /*-----------------------------------------------------------------*/
1140 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1141 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1142 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1143 count = current_state->count; /* Already matched */
1144 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1145 if (clen > 0)
1146 {
1147 BOOL OK;
1148 switch (c)
1149 {
1150 case 0x09: /* HT */
1151 case 0x20: /* SPACE */
1152 case 0xa0: /* NBSP */
1153 case 0x1680: /* OGHAM SPACE MARK */
1154 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1155 case 0x2000: /* EN QUAD */
1156 case 0x2001: /* EM QUAD */
1157 case 0x2002: /* EN SPACE */
1158 case 0x2003: /* EM SPACE */
1159 case 0x2004: /* THREE-PER-EM SPACE */
1160 case 0x2005: /* FOUR-PER-EM SPACE */
1161 case 0x2006: /* SIX-PER-EM SPACE */
1162 case 0x2007: /* FIGURE SPACE */
1163 case 0x2008: /* PUNCTUATION SPACE */
1164 case 0x2009: /* THIN SPACE */
1165 case 0x200A: /* HAIR SPACE */
1166 case 0x202f: /* NARROW NO-BREAK SPACE */
1167 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1168 case 0x3000: /* IDEOGRAPHIC SPACE */
1169 OK = TRUE;
1170 break;
1171
1172 default:
1173 OK = FALSE;
1174 break;
1175 }
1176
1177 if (OK == (d == OP_HSPACE))
1178 {
1179 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1180 {
1181 active_count--; /* Remove non-match possibility */
1182 next_active_state--;
1183 }
1184 count++;
1185 ADD_NEW_DATA(-state_offset, count, 0);
1186 }
1187 }
1188 break;
1189
1190 /*-----------------------------------------------------------------*/
1191 #ifdef SUPPORT_UCP
1192 case OP_PROP_EXTRA + OP_TYPEQUERY:
1193 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1194 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1195 count = 4;
1196 goto QS1;
1197
1198 case OP_PROP_EXTRA + OP_TYPESTAR:
1199 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1200 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1201 count = 0;
1202
1203 QS1:
1204
1205 ADD_ACTIVE(state_offset + 4, 0);
1206 if (clen > 0)
1207 {
1208 BOOL OK;
1209 int category = _pcre_ucp_findprop(c, &chartype, &script);
1210 switch(code[2])
1211 {
1212 case PT_ANY:
1213 OK = TRUE;
1214 break;
1215
1216 case PT_LAMP:
1217 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1218 break;
1219
1220 case PT_GC:
1221 OK = category == code[3];
1222 break;
1223
1224 case PT_PC:
1225 OK = chartype == code[3];
1226 break;
1227
1228 case PT_SC:
1229 OK = script == code[3];
1230 break;
1231
1232 /* Should never occur, but keep compilers from grumbling. */
1233
1234 default:
1235 OK = codevalue != OP_PROP;
1236 break;
1237 }
1238
1239 if (OK == (d == OP_PROP))
1240 {
1241 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1242 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1243 {
1244 active_count--; /* Remove non-match possibility */
1245 next_active_state--;
1246 }
1247 ADD_NEW(state_offset + count, 0);
1248 }
1249 }
1250 break;
1251
1252 /*-----------------------------------------------------------------*/
1253 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1254 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1255 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1256 count = 2;
1257 goto QS2;
1258
1259 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1260 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1261 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1262 count = 0;
1263
1264 QS2:
1265
1266 ADD_ACTIVE(state_offset + 2, 0);
1267 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1268 {
1269 const uschar *nptr = ptr + clen;
1270 int ncount = 0;
1271 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1272 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1273 {
1274 active_count--; /* Remove non-match possibility */
1275 next_active_state--;
1276 }
1277 while (nptr < end_subject)
1278 {
1279 int nd;
1280 int ndlen = 1;
1281 GETCHARLEN(nd, nptr, ndlen);
1282 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1283 ncount++;
1284 nptr += ndlen;
1285 }
1286 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1287 }
1288 break;
1289 #endif
1290
1291 /*-----------------------------------------------------------------*/
1292 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1293 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1294 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1295 count = 2;
1296 goto QS3;
1297
1298 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1299 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1300 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1301 count = 0;
1302
1303 QS3:
1304 ADD_ACTIVE(state_offset + 2, 0);
1305 if (clen > 0)
1306 {
1307 int ncount = 0;
1308 switch (c)
1309 {
1310 case 0x000d:
1311 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1312 /* Fall through */
1313 case 0x000a:
1314 case 0x000b:
1315 case 0x000c:
1316 case 0x0085:
1317 case 0x2028:
1318 case 0x2029:
1319 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1320 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1321 {
1322 active_count--; /* Remove non-match possibility */
1323 next_active_state--;
1324 }
1325 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1326 break;
1327 default:
1328 break;
1329 }
1330 }
1331 break;
1332
1333 /*-----------------------------------------------------------------*/
1334 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1335 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1336 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1337 count = 2;
1338 goto QS4;
1339
1340 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1341 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1342 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1343 count = 0;
1344
1345 QS4:
1346 ADD_ACTIVE(state_offset + 2, 0);
1347 if (clen > 0)
1348 {
1349 BOOL OK;
1350 switch (c)
1351 {
1352 case 0x000a:
1353 case 0x000b:
1354 case 0x000c:
1355 case 0x000d:
1356 case 0x0085:
1357 case 0x2028:
1358 case 0x2029:
1359 OK = TRUE;
1360 break;
1361
1362 default:
1363 OK = FALSE;
1364 break;
1365 }
1366 if (OK == (d == OP_VSPACE))
1367 {
1368 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1369 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1370 {
1371 active_count--; /* Remove non-match possibility */
1372 next_active_state--;
1373 }
1374 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1375 }
1376 }
1377 break;
1378
1379 /*-----------------------------------------------------------------*/
1380 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1381 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1382 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1383 count = 2;
1384 goto QS5;
1385
1386 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1387 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1388 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1389 count = 0;
1390
1391 QS5:
1392 ADD_ACTIVE(state_offset + 2, 0);
1393 if (clen > 0)
1394 {
1395 BOOL OK;
1396 switch (c)
1397 {
1398 case 0x09: /* HT */
1399 case 0x20: /* SPACE */
1400 case 0xa0: /* NBSP */
1401 case 0x1680: /* OGHAM SPACE MARK */
1402 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1403 case 0x2000: /* EN QUAD */
1404 case 0x2001: /* EM QUAD */
1405 case 0x2002: /* EN SPACE */
1406 case 0x2003: /* EM SPACE */
1407 case 0x2004: /* THREE-PER-EM SPACE */
1408 case 0x2005: /* FOUR-PER-EM SPACE */
1409 case 0x2006: /* SIX-PER-EM SPACE */
1410 case 0x2007: /* FIGURE SPACE */
1411 case 0x2008: /* PUNCTUATION SPACE */
1412 case 0x2009: /* THIN SPACE */
1413 case 0x200A: /* HAIR SPACE */
1414 case 0x202f: /* NARROW NO-BREAK SPACE */
1415 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1416 case 0x3000: /* IDEOGRAPHIC SPACE */
1417 OK = TRUE;
1418 break;
1419
1420 default:
1421 OK = FALSE;
1422 break;
1423 }
1424
1425 if (OK == (d == OP_HSPACE))
1426 {
1427 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1428 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1429 {
1430 active_count--; /* Remove non-match possibility */
1431 next_active_state--;
1432 }
1433 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1434 }
1435 }
1436 break;
1437
1438 /*-----------------------------------------------------------------*/
1439 #ifdef SUPPORT_UCP
1440 case OP_PROP_EXTRA + OP_TYPEEXACT:
1441 case OP_PROP_EXTRA + OP_TYPEUPTO:
1442 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1443 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1444 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1445 { ADD_ACTIVE(state_offset + 6, 0); }
1446 count = current_state->count; /* Number already matched */
1447 if (clen > 0)
1448 {
1449 BOOL OK;
1450 int category = _pcre_ucp_findprop(c, &chartype, &script);
1451 switch(code[4])
1452 {
1453 case PT_ANY:
1454 OK = TRUE;
1455 break;
1456
1457 case PT_LAMP:
1458 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1459 break;
1460
1461 case PT_GC:
1462 OK = category == code[5];
1463 break;
1464
1465 case PT_PC:
1466 OK = chartype == code[5];
1467 break;
1468
1469 case PT_SC:
1470 OK = script == code[5];
1471 break;
1472
1473 /* Should never occur, but keep compilers from grumbling. */
1474
1475 default:
1476 OK = codevalue != OP_PROP;
1477 break;
1478 }
1479
1480 if (OK == (d == OP_PROP))
1481 {
1482 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1483 {
1484 active_count--; /* Remove non-match possibility */
1485 next_active_state--;
1486 }
1487 if (++count >= GET2(code, 1))
1488 { ADD_NEW(state_offset + 6, 0); }
1489 else
1490 { ADD_NEW(state_offset, count); }
1491 }
1492 }
1493 break;
1494
1495 /*-----------------------------------------------------------------*/
1496 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1497 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1498 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1499 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1500 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1501 { ADD_ACTIVE(state_offset + 4, 0); }
1502 count = current_state->count; /* Number already matched */
1503 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1504 {
1505 const uschar *nptr = ptr + clen;
1506 int ncount = 0;
1507 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1508 {
1509 active_count--; /* Remove non-match possibility */
1510 next_active_state--;
1511 }
1512 while (nptr < end_subject)
1513 {
1514 int nd;
1515 int ndlen = 1;
1516 GETCHARLEN(nd, nptr, ndlen);
1517 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1518 ncount++;
1519 nptr += ndlen;
1520 }
1521 if (++count >= GET2(code, 1))
1522 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1523 else
1524 { ADD_NEW_DATA(-state_offset, count, ncount); }
1525 }
1526 break;
1527 #endif
1528
1529 /*-----------------------------------------------------------------*/
1530 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1531 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1532 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1533 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1534 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1535 { ADD_ACTIVE(state_offset + 4, 0); }
1536 count = current_state->count; /* Number already matched */
1537 if (clen > 0)
1538 {
1539 int ncount = 0;
1540 switch (c)
1541 {
1542 case 0x000d:
1543 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1544 /* Fall through */
1545 case 0x000a:
1546 case 0x000b:
1547 case 0x000c:
1548 case 0x0085:
1549 case 0x2028:
1550 case 0x2029:
1551 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1552 {
1553 active_count--; /* Remove non-match possibility */
1554 next_active_state--;
1555 }
1556 if (++count >= GET2(code, 1))
1557 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1558 else
1559 { ADD_NEW_DATA(-state_offset, count, ncount); }
1560 break;
1561 default:
1562 break;
1563 }
1564 }
1565 break;
1566
1567 /*-----------------------------------------------------------------*/
1568 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1569 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1570 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1571 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1572 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1573 { ADD_ACTIVE(state_offset + 4, 0); }
1574 count = current_state->count; /* Number already matched */
1575 if (clen > 0)
1576 {
1577 BOOL OK;
1578 switch (c)
1579 {
1580 case 0x000a:
1581 case 0x000b:
1582 case 0x000c:
1583 case 0x000d:
1584 case 0x0085:
1585 case 0x2028:
1586 case 0x2029:
1587 OK = TRUE;
1588 break;
1589
1590 default:
1591 OK = FALSE;
1592 }
1593
1594 if (OK == (d == OP_VSPACE))
1595 {
1596 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1597 {
1598 active_count--; /* Remove non-match possibility */
1599 next_active_state--;
1600 }
1601 if (++count >= GET2(code, 1))
1602 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1603 else
1604 { ADD_NEW_DATA(-state_offset, count, 0); }
1605 }
1606 }
1607 break;
1608
1609 /*-----------------------------------------------------------------*/
1610 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1611 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1612 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1613 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1614 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1615 { ADD_ACTIVE(state_offset + 4, 0); }
1616 count = current_state->count; /* Number already matched */
1617 if (clen > 0)
1618 {
1619 BOOL OK;
1620 switch (c)
1621 {
1622 case 0x09: /* HT */
1623 case 0x20: /* SPACE */
1624 case 0xa0: /* NBSP */
1625 case 0x1680: /* OGHAM SPACE MARK */
1626 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1627 case 0x2000: /* EN QUAD */
1628 case 0x2001: /* EM QUAD */
1629 case 0x2002: /* EN SPACE */
1630 case 0x2003: /* EM SPACE */
1631 case 0x2004: /* THREE-PER-EM SPACE */
1632 case 0x2005: /* FOUR-PER-EM SPACE */
1633 case 0x2006: /* SIX-PER-EM SPACE */
1634 case 0x2007: /* FIGURE SPACE */
1635 case 0x2008: /* PUNCTUATION SPACE */
1636 case 0x2009: /* THIN SPACE */
1637 case 0x200A: /* HAIR SPACE */
1638 case 0x202f: /* NARROW NO-BREAK SPACE */
1639 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1640 case 0x3000: /* IDEOGRAPHIC SPACE */
1641 OK = TRUE;
1642 break;
1643
1644 default:
1645 OK = FALSE;
1646 break;
1647 }
1648
1649 if (OK == (d == OP_HSPACE))
1650 {
1651 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1652 {
1653 active_count--; /* Remove non-match possibility */
1654 next_active_state--;
1655 }
1656 if (++count >= GET2(code, 1))
1657 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1658 else
1659 { ADD_NEW_DATA(-state_offset, count, 0); }
1660 }
1661 }
1662 break;
1663
1664 /* ========================================================================== */
1665 /* These opcodes are followed by a character that is usually compared
1666 to the current subject character; it is loaded into d. We still get
1667 here even if there is no subject character, because in some cases zero
1668 repetitions are permitted. */
1669
1670 /*-----------------------------------------------------------------*/
1671 case OP_CHAR:
1672 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1673 break;
1674
1675 /*-----------------------------------------------------------------*/
1676 case OP_CHARNC:
1677 if (clen == 0) break;
1678
1679 #ifdef SUPPORT_UTF8
1680 if (utf8)
1681 {
1682 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1683 {
1684 unsigned int othercase;
1685 if (c < 128) othercase = fcc[c]; else
1686
1687 /* If we have Unicode property support, we can use it to test the
1688 other case of the character. */
1689
1690 #ifdef SUPPORT_UCP
1691 othercase = _pcre_ucp_othercase(c);
1692 #else
1693 othercase = NOTACHAR;
1694 #endif
1695
1696 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1697 }
1698 }
1699 else
1700 #endif /* SUPPORT_UTF8 */
1701
1702 /* Non-UTF-8 mode */
1703 {
1704 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1705 }
1706 break;
1707
1708
1709 #ifdef SUPPORT_UCP
1710 /*-----------------------------------------------------------------*/
1711 /* This is a tricky one because it can match more than one character.
1712 Find out how many characters to skip, and then set up a negative state
1713 to wait for them to pass before continuing. */
1714
1715 case OP_EXTUNI:
1716 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1717 {
1718 const uschar *nptr = ptr + clen;
1719 int ncount = 0;
1720 while (nptr < end_subject)
1721 {
1722 int nclen = 1;
1723 GETCHARLEN(c, nptr, nclen);
1724 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1725 ncount++;
1726 nptr += nclen;
1727 }
1728 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1729 }
1730 break;
1731 #endif
1732
1733 /*-----------------------------------------------------------------*/
1734 /* This is tricky like EXTUNI because it too can match more than one
1735 character (when CR is followed by LF). In this case, set up a negative
1736 state to wait for one character to pass before continuing. */
1737
1738 case OP_ANYNL:
1739 if (clen > 0) switch(c)
1740 {
1741 case 0x000a:
1742 case 0x000b:
1743 case 0x000c:
1744 case 0x0085:
1745 case 0x2028:
1746 case 0x2029:
1747 ADD_NEW(state_offset + 1, 0);
1748 break;
1749 case 0x000d:
1750 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1751 {
1752 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1753 }
1754 else
1755 {
1756 ADD_NEW(state_offset + 1, 0);
1757 }
1758 break;
1759 }
1760 break;
1761
1762 /*-----------------------------------------------------------------*/
1763 case OP_NOT_VSPACE:
1764 if (clen > 0) switch(c)
1765 {
1766 case 0x000a:
1767 case 0x000b:
1768 case 0x000c:
1769 case 0x000d:
1770 case 0x0085:
1771 case 0x2028:
1772 case 0x2029:
1773 break;
1774
1775 default:
1776 ADD_NEW(state_offset + 1, 0);
1777 break;
1778 }
1779 break;
1780
1781 /*-----------------------------------------------------------------*/
1782 case OP_VSPACE:
1783 if (clen > 0) switch(c)
1784 {
1785 case 0x000a:
1786 case 0x000b:
1787 case 0x000c:
1788 case 0x000d:
1789 case 0x0085:
1790 case 0x2028:
1791 case 0x2029:
1792 ADD_NEW(state_offset + 1, 0);
1793 break;
1794
1795 default: break;
1796 }
1797 break;
1798
1799 /*-----------------------------------------------------------------*/
1800 case OP_NOT_HSPACE:
1801 if (clen > 0) switch(c)
1802 {
1803 case 0x09: /* HT */
1804 case 0x20: /* SPACE */
1805 case 0xa0: /* NBSP */
1806 case 0x1680: /* OGHAM SPACE MARK */
1807 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1808 case 0x2000: /* EN QUAD */
1809 case 0x2001: /* EM QUAD */
1810 case 0x2002: /* EN SPACE */
1811 case 0x2003: /* EM SPACE */
1812 case 0x2004: /* THREE-PER-EM SPACE */
1813 case 0x2005: /* FOUR-PER-EM SPACE */
1814 case 0x2006: /* SIX-PER-EM SPACE */
1815 case 0x2007: /* FIGURE SPACE */
1816 case 0x2008: /* PUNCTUATION SPACE */
1817 case 0x2009: /* THIN SPACE */
1818 case 0x200A: /* HAIR SPACE */
1819 case 0x202f: /* NARROW NO-BREAK SPACE */
1820 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1821 case 0x3000: /* IDEOGRAPHIC SPACE */
1822 break;
1823
1824 default:
1825 ADD_NEW(state_offset + 1, 0);
1826 break;
1827 }
1828 break;
1829
1830 /*-----------------------------------------------------------------*/
1831 case OP_HSPACE:
1832 if (clen > 0) switch(c)
1833 {
1834 case 0x09: /* HT */
1835 case 0x20: /* SPACE */
1836 case 0xa0: /* NBSP */
1837 case 0x1680: /* OGHAM SPACE MARK */
1838 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1839 case 0x2000: /* EN QUAD */
1840 case 0x2001: /* EM QUAD */
1841 case 0x2002: /* EN SPACE */
1842 case 0x2003: /* EM SPACE */
1843 case 0x2004: /* THREE-PER-EM SPACE */
1844 case 0x2005: /* FOUR-PER-EM SPACE */
1845 case 0x2006: /* SIX-PER-EM SPACE */
1846 case 0x2007: /* FIGURE SPACE */
1847 case 0x2008: /* PUNCTUATION SPACE */
1848 case 0x2009: /* THIN SPACE */
1849 case 0x200A: /* HAIR SPACE */
1850 case 0x202f: /* NARROW NO-BREAK SPACE */
1851 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1852 case 0x3000: /* IDEOGRAPHIC SPACE */
1853 ADD_NEW(state_offset + 1, 0);
1854 break;
1855 }
1856 break;
1857
1858 /*-----------------------------------------------------------------*/
1859 /* Match a negated single character. This is only used for one-byte
1860 characters, that is, we know that d < 256. The character we are
1861 checking (c) can be multibyte. */
1862
1863 case OP_NOT:
1864 if (clen > 0)
1865 {
1866 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1867 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1868 }
1869 break;
1870
1871 /*-----------------------------------------------------------------*/
1872 case OP_PLUS:
1873 case OP_MINPLUS:
1874 case OP_POSPLUS:
1875 case OP_NOTPLUS:
1876 case OP_NOTMINPLUS:
1877 case OP_NOTPOSPLUS:
1878 count = current_state->count; /* Already matched */
1879 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1880 if (clen > 0)
1881 {
1882 unsigned int otherd = NOTACHAR;
1883 if ((ims & PCRE_CASELESS) != 0)
1884 {
1885 #ifdef SUPPORT_UTF8
1886 if (utf8 && d >= 128)
1887 {
1888 #ifdef SUPPORT_UCP
1889 otherd = _pcre_ucp_othercase(d);
1890 #endif /* SUPPORT_UCP */
1891 }
1892 else
1893 #endif /* SUPPORT_UTF8 */
1894 otherd = fcc[d];
1895 }
1896 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1897 {
1898 if (count > 0 &&
1899 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1900 {
1901 active_count--; /* Remove non-match possibility */
1902 next_active_state--;
1903 }
1904 count++;
1905 ADD_NEW(state_offset, count);
1906 }
1907 }
1908 break;
1909
1910 /*-----------------------------------------------------------------*/
1911 case OP_QUERY:
1912 case OP_MINQUERY:
1913 case OP_POSQUERY:
1914 case OP_NOTQUERY:
1915 case OP_NOTMINQUERY:
1916 case OP_NOTPOSQUERY:
1917 ADD_ACTIVE(state_offset + dlen + 1, 0);
1918 if (clen > 0)
1919 {
1920 unsigned int otherd = NOTACHAR;
1921 if ((ims & PCRE_CASELESS) != 0)
1922 {
1923 #ifdef SUPPORT_UTF8
1924 if (utf8 && d >= 128)
1925 {
1926 #ifdef SUPPORT_UCP
1927 otherd = _pcre_ucp_othercase(d);
1928 #endif /* SUPPORT_UCP */
1929 }
1930 else
1931 #endif /* SUPPORT_UTF8 */
1932 otherd = fcc[d];
1933 }
1934 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1935 {
1936 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1937 {
1938 active_count--; /* Remove non-match possibility */
1939 next_active_state--;
1940 }
1941 ADD_NEW(state_offset + dlen + 1, 0);
1942 }
1943 }
1944 break;
1945
1946 /*-----------------------------------------------------------------*/
1947 case OP_STAR:
1948 case OP_MINSTAR:
1949 case OP_POSSTAR:
1950 case OP_NOTSTAR:
1951 case OP_NOTMINSTAR:
1952 case OP_NOTPOSSTAR:
1953 ADD_ACTIVE(state_offset + dlen + 1, 0);
1954 if (clen > 0)
1955 {
1956 unsigned int otherd = NOTACHAR;
1957 if ((ims & PCRE_CASELESS) != 0)
1958 {
1959 #ifdef SUPPORT_UTF8
1960 if (utf8 && d >= 128)
1961 {
1962 #ifdef SUPPORT_UCP
1963 otherd = _pcre_ucp_othercase(d);
1964 #endif /* SUPPORT_UCP */
1965 }
1966 else
1967 #endif /* SUPPORT_UTF8 */
1968 otherd = fcc[d];
1969 }
1970 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1971 {
1972 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1973 {
1974 active_count--; /* Remove non-match possibility */
1975 next_active_state--;
1976 }
1977 ADD_NEW(state_offset, 0);
1978 }
1979 }
1980 break;
1981
1982 /*-----------------------------------------------------------------*/
1983 case OP_EXACT:
1984 case OP_NOTEXACT:
1985 count = current_state->count; /* Number already matched */
1986 if (clen > 0)
1987 {
1988 unsigned int otherd = NOTACHAR;
1989 if ((ims & PCRE_CASELESS) != 0)
1990 {
1991 #ifdef SUPPORT_UTF8
1992 if (utf8 && d >= 128)
1993 {
1994 #ifdef SUPPORT_UCP
1995 otherd = _pcre_ucp_othercase(d);
1996 #endif /* SUPPORT_UCP */
1997 }
1998 else
1999 #endif /* SUPPORT_UTF8 */
2000 otherd = fcc[d];
2001 }
2002 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2003 {
2004 if (++count >= GET2(code, 1))
2005 { ADD_NEW(state_offset + dlen + 3, 0); }
2006 else
2007 { ADD_NEW(state_offset, count); }
2008 }
2009 }
2010 break;
2011
2012 /*-----------------------------------------------------------------*/
2013 case OP_UPTO:
2014 case OP_MINUPTO:
2015 case OP_POSUPTO:
2016 case OP_NOTUPTO:
2017 case OP_NOTMINUPTO:
2018 case OP_NOTPOSUPTO:
2019 ADD_ACTIVE(state_offset + dlen + 3, 0);
2020 count = current_state->count; /* Number already matched */
2021 if (clen > 0)
2022 {
2023 unsigned int otherd = NOTACHAR;
2024 if ((ims & PCRE_CASELESS) != 0)
2025 {
2026 #ifdef SUPPORT_UTF8
2027 if (utf8 && d >= 128)
2028 {
2029 #ifdef SUPPORT_UCP
2030 otherd = _pcre_ucp_othercase(d);
2031 #endif /* SUPPORT_UCP */
2032 }
2033 else
2034 #endif /* SUPPORT_UTF8 */
2035 otherd = fcc[d];
2036 }
2037 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2038 {
2039 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2040 {
2041 active_count--; /* Remove non-match possibility */
2042 next_active_state--;
2043 }
2044 if (++count >= GET2(code, 1))
2045 { ADD_NEW(state_offset + dlen + 3, 0); }
2046 else
2047 { ADD_NEW(state_offset, count); }
2048 }
2049 }
2050 break;
2051
2052
2053 /* ========================================================================== */
2054 /* These are the class-handling opcodes */
2055
2056 case OP_CLASS:
2057 case OP_NCLASS:
2058 case OP_XCLASS:
2059 {
2060 BOOL isinclass = FALSE;
2061 int next_state_offset;
2062 const uschar *ecode;
2063
2064 /* For a simple class, there is always just a 32-byte table, and we
2065 can set isinclass from it. */
2066
2067 if (codevalue != OP_XCLASS)
2068 {
2069 ecode = code + 33;
2070 if (clen > 0)
2071 {
2072 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2073 ((code[1 + c/8] & (1 << (c&7))) != 0);
2074 }
2075 }
2076
2077 /* An extended class may have a table or a list of single characters,
2078 ranges, or both, and it may be positive or negative. There's a
2079 function that sorts all this out. */
2080
2081 else
2082 {
2083 ecode = code + GET(code, 1);
2084 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2085 }
2086
2087 /* At this point, isinclass is set for all kinds of class, and ecode
2088 points to the byte after the end of the class. If there is a
2089 quantifier, this is where it will be. */
2090
2091 next_state_offset = ecode - start_code;
2092
2093 switch (*ecode)
2094 {
2095 case OP_CRSTAR:
2096 case OP_CRMINSTAR:
2097 ADD_ACTIVE(next_state_offset + 1, 0);
2098 if (isinclass) { ADD_NEW(state_offset, 0); }
2099 break;
2100
2101 case OP_CRPLUS:
2102 case OP_CRMINPLUS:
2103 count = current_state->count; /* Already matched */
2104 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2105 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2106 break;
2107
2108 case OP_CRQUERY:
2109 case OP_CRMINQUERY:
2110 ADD_ACTIVE(next_state_offset + 1, 0);
2111 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2112 break;
2113
2114 case OP_CRRANGE:
2115 case OP_CRMINRANGE:
2116 count = current_state->count; /* Already matched */
2117 if (count >= GET2(ecode, 1))
2118 { ADD_ACTIVE(next_state_offset + 5, 0); }
2119 if (isinclass)
2120 {
2121 int max = GET2(ecode, 3);
2122 if (++count >= max && max != 0) /* Max 0 => no limit */
2123 { ADD_NEW(next_state_offset + 5, 0); }
2124 else
2125 { ADD_NEW(state_offset, count); }
2126 }
2127 break;
2128
2129 default:
2130 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2131 break;
2132 }
2133 }
2134 break;
2135
2136 /* ========================================================================== */
2137 /* These are the opcodes for fancy brackets of various kinds. We have
2138 to use recursion in order to handle them. */
2139
2140 case OP_ASSERT:
2141 case OP_ASSERT_NOT:
2142 case OP_ASSERTBACK:
2143 case OP_ASSERTBACK_NOT:
2144 {
2145 int rc;
2146 int local_offsets[2];
2147 int local_workspace[1000];
2148 const uschar *endasscode = code + GET(code, 1);
2149
2150 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2151
2152 rc = internal_dfa_exec(
2153 md, /* static match data */
2154 code, /* this subexpression's code */
2155 ptr, /* where we currently are */
2156 ptr - start_subject, /* start offset */
2157 local_offsets, /* offset vector */
2158 sizeof(local_offsets)/sizeof(int), /* size of same */
2159 local_workspace, /* workspace vector */
2160 sizeof(local_workspace)/sizeof(int), /* size of same */
2161 ims, /* the current ims flags */
2162 rlevel, /* function recursion level */
2163 recursing); /* pass on regex recursion */
2164
2165 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2166 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2167 }
2168 break;
2169
2170 /*-----------------------------------------------------------------*/
2171 case OP_COND:
2172 case OP_SCOND:
2173 {
2174 int local_offsets[1000];
2175 int local_workspace[1000];
2176 int condcode = code[LINK_SIZE+1];
2177
2178 /* Back reference conditions are not supported */
2179
2180 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2181
2182 /* The DEFINE condition is always false */
2183
2184 if (condcode == OP_DEF)
2185 {
2186 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2187 }
2188
2189 /* The only supported version of OP_RREF is for the value RREF_ANY,
2190 which means "test if in any recursion". We can't test for specifically
2191 recursed groups. */
2192
2193 else if (condcode == OP_RREF)
2194 {
2195 int value = GET2(code, LINK_SIZE+2);
2196 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2197 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2198 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2199 }
2200
2201 /* Otherwise, the condition is an assertion */
2202
2203 else
2204 {
2205 int rc;
2206 const uschar *asscode = code + LINK_SIZE + 1;
2207 const uschar *endasscode = asscode + GET(asscode, 1);
2208
2209 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2210
2211 rc = internal_dfa_exec(
2212 md, /* fixed match data */
2213 asscode, /* this subexpression's code */
2214 ptr, /* where we currently are */
2215 ptr - start_subject, /* start offset */
2216 local_offsets, /* offset vector */
2217 sizeof(local_offsets)/sizeof(int), /* size of same */
2218 local_workspace, /* workspace vector */
2219 sizeof(local_workspace)/sizeof(int), /* size of same */
2220 ims, /* the current ims flags */
2221 rlevel, /* function recursion level */
2222 recursing); /* pass on regex recursion */
2223
2224 if ((rc >= 0) ==
2225 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2226 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2227 else
2228 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2229 }
2230 }
2231 break;
2232
2233 /*-----------------------------------------------------------------*/
2234 case OP_RECURSE:
2235 {
2236 int local_offsets[1000];
2237 int local_workspace[1000];
2238 int rc;
2239
2240 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2241 recursing + 1));
2242
2243 rc = internal_dfa_exec(
2244 md, /* fixed match data */
2245 start_code + GET(code, 1), /* this subexpression's code */
2246 ptr, /* where we currently are */
2247 ptr - start_subject, /* start offset */
2248 local_offsets, /* offset vector */
2249 sizeof(local_offsets)/sizeof(int), /* size of same */
2250 local_workspace, /* workspace vector */
2251 sizeof(local_workspace)/sizeof(int), /* size of same */
2252 ims, /* the current ims flags */
2253 rlevel, /* function recursion level */
2254 recursing + 1); /* regex recurse level */
2255
2256 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2257 recursing + 1, rc));
2258
2259 /* Ran out of internal offsets */
2260
2261 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2262
2263 /* For each successful matched substring, set up the next state with a
2264 count of characters to skip before trying it. Note that the count is in
2265 characters, not bytes. */
2266
2267 if (rc > 0)
2268 {
2269 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2270 {
2271 const uschar *p = start_subject + local_offsets[rc];
2272 const uschar *pp = start_subject + local_offsets[rc+1];
2273 int charcount = local_offsets[rc+1] - local_offsets[rc];
2274 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2275 if (charcount > 0)
2276 {
2277 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2278 }
2279 else
2280 {
2281 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2282 }
2283 }
2284 }
2285 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2286 }
2287 break;
2288
2289 /*-----------------------------------------------------------------*/
2290 case OP_ONCE:
2291 {
2292 int local_offsets[2];
2293 int local_workspace[1000];
2294
2295 int rc = internal_dfa_exec(
2296 md, /* fixed match data */
2297 code, /* this subexpression's code */
2298 ptr, /* where we currently are */
2299 ptr - start_subject, /* start offset */
2300 local_offsets, /* offset vector */
2301 sizeof(local_offsets)/sizeof(int), /* size of same */
2302 local_workspace, /* workspace vector */
2303 sizeof(local_workspace)/sizeof(int), /* size of same */
2304 ims, /* the current ims flags */
2305 rlevel, /* function recursion level */
2306 recursing); /* pass on regex recursion */
2307
2308 if (rc >= 0)
2309 {
2310 const uschar *end_subpattern = code;
2311 int charcount = local_offsets[1] - local_offsets[0];
2312 int next_state_offset, repeat_state_offset;
2313
2314 do { end_subpattern += GET(end_subpattern, 1); }
2315 while (*end_subpattern == OP_ALT);
2316 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2317
2318 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2319 arrange for the repeat state also to be added to the relevant list.
2320 Calculate the offset, or set -1 for no repeat. */
2321
2322 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2323 *end_subpattern == OP_KETRMIN)?
2324 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2325
2326 /* If we have matched an empty string, add the next state at the
2327 current character pointer. This is important so that the duplicate
2328 checking kicks in, which is what breaks infinite loops that match an
2329 empty string. */
2330
2331 if (charcount == 0)
2332 {
2333 ADD_ACTIVE(next_state_offset, 0);
2334 }
2335
2336 /* Optimization: if there are no more active states, and there
2337 are no new states yet set up, then skip over the subject string
2338 right here, to save looping. Otherwise, set up the new state to swing
2339 into action when the end of the substring is reached. */
2340
2341 else if (i + 1 >= active_count && new_count == 0)
2342 {
2343 ptr += charcount;
2344 clen = 0;
2345 ADD_NEW(next_state_offset, 0);
2346
2347 /* If we are adding a repeat state at the new character position,
2348 we must fudge things so that it is the only current state.
2349 Otherwise, it might be a duplicate of one we processed before, and
2350 that would cause it to be skipped. */
2351
2352 if (repeat_state_offset >= 0)
2353 {
2354 next_active_state = active_states;
2355 active_count = 0;
2356 i = -1;
2357 ADD_ACTIVE(repeat_state_offset, 0);
2358 }
2359 }
2360 else
2361 {
2362 const uschar *p = start_subject + local_offsets[0];
2363 const uschar *pp = start_subject + local_offsets[1];
2364 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2365 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2366 if (repeat_state_offset >= 0)
2367 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2368 }
2369
2370 }
2371 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2372 }
2373 break;
2374
2375
2376 /* ========================================================================== */
2377 /* Handle callouts */
2378
2379 case OP_CALLOUT:
2380 if (pcre_callout != NULL)
2381 {
2382 int rrc;
2383 pcre_callout_block cb;
2384 cb.version = 1; /* Version 1 of the callout block */
2385 cb.callout_number = code[1];
2386 cb.offset_vector = offsets;
2387 cb.subject = (PCRE_SPTR)start_subject;
2388 cb.subject_length = end_subject - start_subject;
2389 cb.start_match = current_subject - start_subject;
2390 cb.current_position = ptr - start_subject;
2391 cb.pattern_position = GET(code, 2);
2392 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2393 cb.capture_top = 1;
2394 cb.capture_last = -1;
2395 cb.callout_data = md->callout_data;
2396 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2397 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2398 }
2399 break;
2400
2401
2402 /* ========================================================================== */
2403 default: /* Unsupported opcode */
2404 return PCRE_ERROR_DFA_UITEM;
2405 }
2406
2407 NEXT_ACTIVE_STATE: continue;
2408
2409 } /* End of loop scanning active states */
2410
2411 /* We have finished the processing at the current subject character. If no
2412 new states have been set for the next character, we have found all the
2413 matches that we are going to find. If we are at the top level and partial
2414 matching has been requested, check for appropriate conditions. */
2415
2416 if (new_count <= 0)
2417 {
2418 if (match_count < 0 && /* No matches found */
2419 rlevel == 1 && /* Top level match function */
2420 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2421 ptr >= end_subject && /* Reached end of subject */
2422 ptr > current_subject) /* Matched non-empty string */
2423 {
2424 if (offsetcount >= 2)
2425 {
2426 offsets[0] = current_subject - start_subject;
2427 offsets[1] = end_subject - start_subject;
2428 }
2429 match_count = PCRE_ERROR_PARTIAL;
2430 }
2431
2432 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2433 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2434 rlevel*2-2, SP));
2435 break; /* In effect, "return", but see the comment below */
2436 }
2437
2438 /* One or more states are active for the next character. */
2439
2440 ptr += clen; /* Advance to next subject character */
2441 } /* Loop to move along the subject string */
2442
2443 /* Control gets here from "break" a few lines above. We do it this way because
2444 if we use "return" above, we have compiler trouble. Some compilers warn if
2445 there's nothing here because they think the function doesn't return a value. On
2446 the other hand, if we put a dummy statement here, some more clever compilers
2447 complain that it can't be reached. Sigh. */
2448
2449 return match_count;
2450 }
2451
2452
2453
2454
2455 /*************************************************
2456 * Execute a Regular Expression - DFA engine *
2457 *************************************************/
2458
2459 /* This external function applies a compiled re to a subject string using a DFA
2460 engine. This function calls the internal function multiple times if the pattern
2461 is not anchored.
2462
2463 Arguments:
2464 argument_re points to the compiled expression
2465 extra_data points to extra data or is NULL
2466 subject points to the subject string
2467 length length of subject string (may contain binary zeros)
2468 start_offset where to start in the subject string
2469 options option bits
2470 offsets vector of match offsets
2471 offsetcount size of same
2472 workspace workspace vector
2473 wscount size of same
2474
2475 Returns: > 0 => number of match offset pairs placed in offsets
2476 = 0 => offsets overflowed; longest matches are present
2477 -1 => failed to match
2478 < -1 => some kind of unexpected problem
2479 */
2480
2481 PCRE_EXP_DEFN int
2482 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2483 const char *subject, int length, int start_offset, int options, int *offsets,
2484 int offsetcount, int *workspace, int wscount)
2485 {
2486 real_pcre *re = (real_pcre *)argument_re;
2487 dfa_match_data match_block;
2488 dfa_match_data *md = &match_block;
2489 BOOL utf8, anchored, startline, firstline;
2490 const uschar *current_subject, *end_subject, *lcc;
2491
2492 pcre_study_data internal_study;
2493 const pcre_study_data *study = NULL;
2494 real_pcre internal_re;
2495
2496 const uschar *req_byte_ptr;
2497 const uschar *start_bits = NULL;
2498 BOOL first_byte_caseless = FALSE;
2499 BOOL req_byte_caseless = FALSE;
2500 int first_byte = -1;
2501 int req_byte = -1;
2502 int req_byte2 = -1;
2503 int newline;
2504
2505 /* Plausibility checks */
2506
2507 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2508 if (re == NULL || subject == NULL || workspace == NULL ||
2509 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2510 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2511 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2512
2513 /* We need to find the pointer to any study data before we test for byte
2514 flipping, so we scan the extra_data block first. This may set two fields in the
2515 match block, so we must initialize them beforehand. However, the other fields
2516 in the match block must not be set until after the byte flipping. */
2517
2518 md->tables = re->tables;
2519 md->callout_data = NULL;
2520
2521 if (extra_data != NULL)
2522 {
2523 unsigned int flags = extra_data->flags;
2524 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2525 study = (const pcre_study_data *)extra_data->study_data;
2526 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2527 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2528 return PCRE_ERROR_DFA_UMLIMIT;
2529 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2530 md->callout_data = extra_data->callout_data;
2531 if ((flags & PCRE_EXTRA_TABLES) != 0)
2532 md->tables = extra_data->tables;
2533 }
2534
2535 /* Check that the first field in the block is the magic number. If it is not,
2536 test for a regex that was compiled on a host of opposite endianness. If this is
2537 the case, flipped values are put in internal_re and internal_study if there was
2538 study data too. */
2539
2540 if (re->magic_number != MAGIC_NUMBER)
2541 {
2542 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2543 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2544 if (study != NULL) study = &internal_study;
2545 }
2546
2547 /* Set some local values */
2548
2549 current_subject = (const unsigned char *)subject + start_offset;
2550 end_subject = (const unsigned char *)subject + length;
2551 req_byte_ptr = current_subject - 1;
2552
2553 #ifdef SUPPORT_UTF8
2554 utf8 = (re->options & PCRE_UTF8) != 0;
2555 #else
2556 utf8 = FALSE;
2557 #endif
2558
2559 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2560 (re->options & PCRE_ANCHORED) != 0;
2561
2562 /* The remaining fixed data for passing around. */
2563
2564 md->start_code = (const uschar *)argument_re +
2565 re->name_table_offset + re->name_count * re->name_entry_size;
2566 md->start_subject = (const unsigned char *)subject;
2567 md->end_subject = end_subject;
2568 md->moptions = options;
2569 md->poptions = re->options;
2570
2571 /* Handle different types of newline. The three bits give eight cases. If
2572 nothing is set at run time, whatever was used at compile time applies. */
2573
2574 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2575 PCRE_NEWLINE_BITS)
2576 {
2577 case 0: newline = NEWLINE; break; /* Compile-time default */
2578 case PCRE_NEWLINE_CR: newline = '\r'; break;
2579 case PCRE_NEWLINE_LF: newline = '\n'; break;
2580 case PCRE_NEWLINE_CR+
2581 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2582 case PCRE_NEWLINE_ANY: newline = -1; break;
2583 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2584 default: return PCRE_ERROR_BADNEWLINE;
2585 }
2586
2587 if (newline == -2)
2588 {
2589 md->nltype = NLTYPE_ANYCRLF;
2590 }
2591 else if (newline < 0)
2592 {
2593 md->nltype = NLTYPE_ANY;
2594 }
2595 else
2596 {
2597 md->nltype = NLTYPE_FIXED;
2598 if (newline > 255)
2599 {
2600 md->nllen = 2;
2601 md->nl[0] = (newline >> 8) & 255;
2602 md->nl[1] = newline & 255;
2603 }
2604 else
2605 {
2606 md->nllen = 1;
2607 md->nl[0] = newline;
2608 }
2609 }
2610
2611 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2612 back the character offset. */
2613
2614 #ifdef SUPPORT_UTF8
2615 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2616 {
2617 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2618 return PCRE_ERROR_BADUTF8;
2619 if (start_offset > 0 && start_offset < length)
2620 {
2621 int tb = ((uschar *)subject)[start_offset];
2622 if (tb > 127)
2623 {
2624 tb &= 0xc0;
2625 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2626 }
2627 }
2628 }
2629 #endif
2630
2631 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2632 is a feature that makes it possible to save compiled regex and re-use them
2633 in other programs later. */
2634
2635 if (md->tables == NULL) md->tables = _pcre_default_tables;
2636
2637 /* The lower casing table and the "must be at the start of a line" flag are
2638 used in a loop when finding where to start. */
2639
2640 lcc = md->tables + lcc_offset;
2641 startline = (re->options & PCRE_STARTLINE) != 0;
2642 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2643
2644 /* Set up the first character to match, if available. The first_byte value is
2645 never set for an anchored regular expression, but the anchoring may be forced
2646 at run time, so we have to test for anchoring. The first char may be unset for
2647 an unanchored pattern, of course. If there's no first char and the pattern was
2648 studied, there may be a bitmap of possible first characters. */
2649
2650 if (!anchored)
2651 {
2652 if ((re->options & PCRE_FIRSTSET) != 0)
2653 {
2654 first_byte = re->first_byte & 255;
2655 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2656 first_byte = lcc[first_byte];
2657 }
2658 else
2659 {
2660 if (startline && study != NULL &&
2661 (study->options & PCRE_STUDY_MAPPED) != 0)
2662 start_bits = study->start_bits;
2663 }
2664 }
2665
2666 /* For anchored or unanchored matches, there may be a "last known required
2667 character" set. */
2668
2669 if ((re->options & PCRE_REQCHSET) != 0)
2670 {
2671 req_byte = re->req_byte & 255;
2672 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2673 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2674 }
2675
2676 /* Call the main matching function, looping for a non-anchored regex after a
2677 failed match. Unless restarting, optimize by moving to the first match
2678 character if possible, when not anchored. Then unless wanting a partial match,
2679 check for a required later character. */
2680
2681 for (;;)
2682 {
2683 int rc;
2684
2685 if ((options & PCRE_DFA_RESTART) == 0)
2686 {
2687 const uschar *save_end_subject = end_subject;
2688
2689 /* Advance to a unique first char if possible. If firstline is TRUE, the
2690 start of the match is constrained to the first line of a multiline string.
2691 Implement this by temporarily adjusting end_subject so that we stop
2692 scanning at a newline. If the match fails at the newline, later code breaks
2693 this loop. */
2694
2695 if (firstline)
2696 {
2697 const uschar *t = current_subject;
2698 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2699 end_subject = t;
2700 }
2701
2702 if (first_byte >= 0)
2703 {
2704 if (first_byte_caseless)
2705 while (current_subject < end_subject &&
2706 lcc[*current_subject] != first_byte)
2707 current_subject++;
2708 else
2709 while (current_subject < end_subject && *current_subject != first_byte)
2710 current_subject++;
2711 }
2712
2713 /* Or to just after a linebreak for a multiline match if possible */
2714
2715 else if (startline)
2716 {
2717 if (current_subject > md->start_subject + start_offset)
2718 {
2719 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2720 current_subject++;
2721
2722 /* If we have just passed a CR and the newline option is ANY or
2723 ANYCRLF, and we are now at a LF, advance the match position by one more
2724 character. */
2725
2726 if (current_subject[-1] == '\r' &&
2727 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2728 current_subject < end_subject &&
2729 *current_subject == '\n')
2730 current_subject++;
2731 }
2732 }
2733
2734 /* Or to a non-unique first char after study */
2735
2736 else if (start_bits != NULL)
2737 {
2738 while (current_subject < end_subject)
2739 {
2740 register unsigned int c = *current_subject;
2741 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2742 else break;
2743 }
2744 }
2745
2746 /* Restore fudged end_subject */
2747
2748 end_subject = save_end_subject;
2749 }
2750
2751 /* If req_byte is set, we know that that character must appear in the subject
2752 for the match to succeed. If the first character is set, req_byte must be
2753 later in the subject; otherwise the test starts at the match point. This
2754 optimization can save a huge amount of work in patterns with nested unlimited
2755 repeats that aren't going to match. Writing separate code for cased/caseless
2756 versions makes it go faster, as does using an autoincrement and backing off
2757 on a match.
2758
2759 HOWEVER: when the subject string is very, very long, searching to its end can
2760 take a long time, and give bad performance on quite ordinary patterns. This
2761 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2762 don't do this when the string is sufficiently long.
2763
2764 ALSO: this processing is disabled when partial matching is requested.
2765 */
2766
2767 if (req_byte >= 0 &&
2768 end_subject - current_subject < REQ_BYTE_MAX &&
2769 (options & PCRE_PARTIAL) == 0)
2770 {
2771 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2772
2773 /* We don't need to repeat the search if we haven't yet reached the
2774 place we found it at last time. */
2775
2776 if (p > req_byte_ptr)
2777 {
2778 if (req_byte_caseless)
2779 {
2780 while (p < end_subject)
2781 {
2782 register int pp = *p++;
2783 if (pp == req_byte || pp == req_byte2) { p--; break; }
2784 }
2785 }
2786 else
2787 {
2788 while (p < end_subject)
2789 {
2790 if (*p++ == req_byte) { p--; break; }
2791 }
2792 }
2793
2794 /* If we can't find the required character, break the matching loop,
2795 which will cause a return or PCRE_ERROR_NOMATCH. */
2796
2797 if (p >= end_subject) break;
2798
2799 /* If we have found the required character, save the point where we
2800 found it, so that we don't search again next time round the loop if
2801 the start hasn't passed this character yet. */
2802
2803 req_byte_ptr = p;
2804 }
2805 }
2806
2807 /* OK, now we can do the business */
2808
2809 rc = internal_dfa_exec(
2810 md, /* fixed match data */
2811 md->start_code, /* this subexpression's code */
2812 current_subject, /* where we currently are */
2813 start_offset, /* start offset in subject */
2814 offsets, /* offset vector */
2815 offsetcount, /* size of same */
2816 workspace, /* workspace vector */
2817 wscount, /* size of same */
2818 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2819 0, /* function recurse level */
2820 0); /* regex recurse level */
2821
2822 /* Anything other than "no match" means we are done, always; otherwise, carry
2823 on only if not anchored. */
2824
2825 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2826
2827 /* Advance to the next subject character unless we are at the end of a line
2828 and firstline is set. */
2829
2830 if (firstline && IS_NEWLINE(current_subject)) break;
2831 current_subject++;
2832 if (utf8)
2833 {
2834 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2835 current_subject++;
2836 }
2837 if (current_subject > end_subject) break;
2838
2839 /* If we have just passed a CR and the newline option is CRLF or ANY or
2840 ANYCRLF, and we are now at a LF, advance the match position by one more
2841 character. */
2842
2843 if (current_subject[-1] == '\r' &&
2844 (md->nltype == NLTYPE_ANY ||
2845 md->nltype == NLTYPE_ANYCRLF ||
2846 md->nllen == 2) &&
2847 current_subject < end_subject &&
2848 *current_subject == '\n')
2849 current_subject++;
2850
2851 } /* "Bumpalong" loop */
2852
2853 return PCRE_ERROR_NOMATCH;
2854 }
2855
2856 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12