/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 392 - (show annotations) (download)
Tue Mar 17 21:30:30 2009 UTC (5 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 96571 byte(s)
Update after detrailing for a test release.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2009 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
42 /* This module contains the external function pcre_dfa_exec(), which is an
43 alternative matching function that uses a sort of DFA algorithm (not a true
44 FSM). This is NOT Perl-compatible, but it has advantages in certain
45 applications. */
46
47
48 #ifdef HAVE_CONFIG_H
49 #include "config.h"
50 #endif
51
52 #define NLBLOCK md /* Block containing newline information */
53 #define PSSTART start_subject /* Field containing processed string start */
54 #define PSEND end_subject /* Field containing processed string end */
55
56 #include "pcre_internal.h"
57
58
59 /* For use to indent debugging output */
60
61 #define SP " "
62
63
64
65 /*************************************************
66 * Code parameters and static tables *
67 *************************************************/
68
69 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
70 into others, under special conditions. A gap of 20 between the blocks should be
71 enough. The resulting opcodes don't have to be less than 256 because they are
72 never stored, so we push them well clear of the normal opcodes. */
73
/* Offsets added to an OP_TYPExxx opcode value when its argument is \p/\P, \X,
\R, \h/\H or \v/\V, so that those cases get their own switch labels. The
resulting values are never stored in compiled code, so they only need to be
clear of the real opcodes. Each block starts OP_EXTRA_STRIDE after the
previous one. */

#define OP_EXTRA_BASE    300
#define OP_EXTRA_STRIDE  20

#define OP_PROP_EXTRA    (OP_EXTRA_BASE + 0*OP_EXTRA_STRIDE)
#define OP_EXTUNI_EXTRA  (OP_EXTRA_BASE + 1*OP_EXTRA_STRIDE)
#define OP_ANYNL_EXTRA   (OP_EXTRA_BASE + 2*OP_EXTRA_STRIDE)
#define OP_HSPACE_EXTRA  (OP_EXTRA_BASE + 3*OP_EXTRA_STRIDE)
#define OP_VSPACE_EXTRA  (OP_EXTRA_BASE + 4*OP_EXTRA_STRIDE)
79
80
81 /* This table identifies those opcodes that are followed immediately by a
82 character that is to be tested in some way. This makes it possible to
83 centralize the loading of these characters. In the case of Type * etc, the
84 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
85 small value. ***NOTE*** If the start of this table is modified, the two tables
86 that follow must also be modified. */
87
88 static const uschar coptable[] = {
89 0, /* End */
90 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
91 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
92 0, 0, 0, /* Any, AllAny, Anybyte */
93 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
94 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
95 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
96 1, /* Char */
97 1, /* Charnc */
98 1, /* not */
99 /* Positive single-char repeats */
100 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
101 3, 3, 3, /* upto, minupto, exact */
102 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
103 /* Negative single-char repeats - only for chars < 256 */
104 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
105 3, 3, 3, /* NOT upto, minupto, exact */
106 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
107 /* Positive type repeats */
108 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
109 3, 3, 3, /* Type upto, minupto, exact */
110 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
111 /* Character class & ref repeats */
112 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
113 0, 0, /* CRRANGE, CRMINRANGE */
114 0, /* CLASS */
115 0, /* NCLASS */
116 0, /* XCLASS - variable length */
117 0, /* REF */
118 0, /* RECURSE */
119 0, /* CALLOUT */
120 0, /* Alt */
121 0, /* Ket */
122 0, /* KetRmax */
123 0, /* KetRmin */
124 0, /* Assert */
125 0, /* Assert not */
126 0, /* Assert behind */
127 0, /* Assert behind not */
128 0, /* Reverse */
129 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
130 0, 0, 0, /* SBRA, SCBRA, SCOND */
131 0, /* CREF */
132 0, /* RREF */
133 0, /* DEF */
134 0, 0, /* BRAZERO, BRAMINZERO */
135 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
136 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
137 };
138
139 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
140 and \w */
141
142 static const uschar toptable1[] = {
143 0, 0, 0, 0, 0, 0,
144 ctype_digit, ctype_digit,
145 ctype_space, ctype_space,
146 ctype_word, ctype_word,
147 0, 0 /* OP_ANY, OP_ALLANY */
148 };
149
150 static const uschar toptable2[] = {
151 0, 0, 0, 0, 0, 0,
152 ctype_digit, 0,
153 ctype_space, 0,
154 ctype_word, 0,
155 1, 1 /* OP_ANY, OP_ALLANY */
156 };
157
158
159 /* Structure for holding data about a particular state, which is in effect the
160 current data for an active path through the match tree. It must consist
161 entirely of ints because the working vector we are passed, and which we put
162 these structures in, is a vector of ints. */
163
/* One entry in a state list. Every member must be an int, because the lists
are laid out inside the caller-supplied workspace, which is a vector of ints. */

typedef struct stateblock {
  /* Offset of the state's opcode from the start of the compiled code. A
  negative value marks a state that is being held back; see data. */
  int offset;

  /* Number of repetitions already matched, for repeat opcodes. */
  int count;

  /* The ims option bits that were current when the state was added. */
  int ims;

  /* Extra data for some states; for a held-back state (negative offset) it is
  the number of characters still to be skipped. */
  int data;
} stateblock;

/* Number of ints occupied by one stateblock in the workspace vector. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock) / sizeof(int))
172
173
174 #ifdef DEBUG
175 /*************************************************
176 * Print character string *
177 *************************************************/
178
179 /* Character string printing function for debugging.
180
181 Arguments:
182 p points to string
183 length number of bytes
184 f where to print
185
186 Returns: nothing
187 */
188
/* Debugging aid: write a byte string to a stream, showing each byte that is
not printable (according to isprint() in the current locale) as a \xhh escape.

Arguments:
  p           points to the first byte; the bytes are only read
  length      number of bytes to write; nothing is written if it is <= 0
  f           stream to write to

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
const unsigned char *end;

if (length <= 0) return;

for (end = p + length; p < end; p++)
  {
  int c = *p;         /* Always 0..255, so it is a valid isprint() argument */
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
201 #endif
202
203
204
205 /*************************************************
206 * Execute a Regular Expression - DFA engine *
207 *************************************************/
208
209 /* This internal function applies a compiled pattern to a subject string,
210 starting at a given point, using a DFA engine. This function is called from the
211 external one, possibly multiple times if the pattern is not anchored. The
212 function calls itself recursively for some kinds of subpattern.
213
214 Arguments:
215 md the match_data block with fixed information
216 this_start_code the opening bracket of this subexpression's code
217 current_subject where we currently are in the subject string
218 start_offset start offset in the subject string
219 offsets vector to contain the matching string offsets
220 offsetcount size of same
221 workspace vector of workspace
222 wscount size of same
223 ims the current ims flags
224 rlevel function call recursion level
225 recursing regex recursive call level
226
227 Returns: > 0 => number of match offset pairs placed in offsets
228 = 0 => offsets overflowed; longest matches are present
229 -1 => failed to match
230 < -1 => some kind of unexpected problem
231
232 The following macros are used for adding states to the two state vectors (one
233 for the current character, one for the following character). */
234
/* Append a state with opcode offset x and repeat count y to the list of
states for the current character. The state records the current value of the
local variable ims; its data field is not written. If the list already holds
wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Relies
on the locals active_count, next_active_state, wscount, ims and rlevel. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
245
/* Append a state with opcode offset x, repeat count y and extra data z to the
list of states for the current character. The state records the current value
of the local variable ims. If the list already holds wscount states, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. Relies on the locals
active_count, next_active_state, wscount, ims and rlevel. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_active_state->offset = (x); \
    next_active_state->count = (y); \
    next_active_state->ims = ims; \
    next_active_state->data = (z); \
    next_active_state++; \
    DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
257
/* Append a state with opcode offset x and repeat count y to the list of
states for the following character. The state records the current value of the
local variable ims; its data field is not written. If the list already holds
wscount states, the enclosing function returns PCRE_ERROR_DFA_WSSIZE. Relies
on the locals new_count, next_new_state, wscount, ims and rlevel. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
    } \
  while (0)
268
/* Append a state with opcode offset x, repeat count y and extra data z to the
list of states for the following character. The state records the current value
of the local variable ims. If the list already holds wscount states, the
enclosing function returns PCRE_ERROR_DFA_WSSIZE. Relies on the locals
new_count, next_new_state, wscount, ims and rlevel. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ >= wscount) return PCRE_ERROR_DFA_WSSIZE; \
    next_new_state->offset = (x); \
    next_new_state->count = (y); \
    next_new_state->ims = ims; \
    next_new_state->data = (z); \
    next_new_state++; \
    DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
    } \
  while (0)
280
281 /* And now, here is the code */
282
283 static int
284 internal_dfa_exec(
285 dfa_match_data *md,
286 const uschar *this_start_code,
287 const uschar *current_subject,
288 int start_offset,
289 int *offsets,
290 int offsetcount,
291 int *workspace,
292 int wscount,
293 int ims,
294 int rlevel,
295 int recursing)
296 {
297 stateblock *active_states, *new_states, *temp_states;
298 stateblock *next_active_state, *next_new_state;
299
300 const uschar *ctypes, *lcc, *fcc;
301 const uschar *ptr;
302 const uschar *end_code, *first_op;
303
304 int active_count, new_count, match_count;
305
306 /* Some fields in the md block are frequently referenced, so we load them into
307 independent variables in the hope that this will perform better. */
308
309 const uschar *start_subject = md->start_subject;
310 const uschar *end_subject = md->end_subject;
311 const uschar *start_code = md->start_code;
312
313 #ifdef SUPPORT_UTF8
314 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
315 #else
316 BOOL utf8 = FALSE;
317 #endif
318
319 rlevel++;
320 offsetcount &= (-2);
321
322 wscount -= 2;
323 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
324 (2 * INTS_PER_STATEBLOCK);
325
326 DPRINTF(("\n%.*s---------------------\n"
327 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
328 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
329
330 ctypes = md->tables + ctypes_offset;
331 lcc = md->tables + lcc_offset;
332 fcc = md->tables + fcc_offset;
333
334 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
335
336 active_states = (stateblock *)(workspace + 2);
337 next_new_state = new_states = active_states + wscount;
338 new_count = 0;
339
340 first_op = this_start_code + 1 + LINK_SIZE +
341 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
342
343 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
344 the alternative states onto the list, and find out where the end is. This
345 makes is possible to use this function recursively, when we want to stop at a
346 matching internal ket rather than at the end.
347
348 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
349 a backward assertion. In that case, we have to find out the maximum amount to
350 move back, and set up each alternative appropriately. */
351
352 if (*first_op == OP_REVERSE)
353 {
354 int max_back = 0;
355 int gone_back;
356
357 end_code = this_start_code;
358 do
359 {
360 int back = GET(end_code, 2+LINK_SIZE);
361 if (back > max_back) max_back = back;
362 end_code += GET(end_code, 1);
363 }
364 while (*end_code == OP_ALT);
365
366 /* If we can't go back the amount required for the longest lookbehind
367 pattern, go back as far as we can; some alternatives may still be viable. */
368
369 #ifdef SUPPORT_UTF8
370 /* In character mode we have to step back character by character */
371
372 if (utf8)
373 {
374 for (gone_back = 0; gone_back < max_back; gone_back++)
375 {
376 if (current_subject <= start_subject) break;
377 current_subject--;
378 while (current_subject > start_subject &&
379 (*current_subject & 0xc0) == 0x80)
380 current_subject--;
381 }
382 }
383 else
384 #endif
385
386 /* In byte-mode we can do this quickly. */
387
388 {
389 gone_back = (current_subject - max_back < start_subject)?
390 current_subject - start_subject : max_back;
391 current_subject -= gone_back;
392 }
393
394 /* Now we can process the individual branches. */
395
396 end_code = this_start_code;
397 do
398 {
399 int back = GET(end_code, 2+LINK_SIZE);
400 if (back <= gone_back)
401 {
402 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
403 ADD_NEW_DATA(-bstate, 0, gone_back - back);
404 }
405 end_code += GET(end_code, 1);
406 }
407 while (*end_code == OP_ALT);
408 }
409
410 /* This is the code for a "normal" subpattern (not a backward assertion). The
411 start of a whole pattern is always one of these. If we are at the top level,
412 we may be asked to restart matching from the same point that we reached for a
413 previous partial match. We still have to scan through the top-level branches to
414 find the end state. */
415
416 else
417 {
418 end_code = this_start_code;
419
420 /* Restarting */
421
422 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
423 {
424 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
425 new_count = workspace[1];
426 if (!workspace[0])
427 memcpy(new_states, active_states, new_count * sizeof(stateblock));
428 }
429
430 /* Not restarting */
431
432 else
433 {
434 int length = 1 + LINK_SIZE +
435 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
436 do
437 {
438 ADD_NEW(end_code - start_code + length, 0);
439 end_code += GET(end_code, 1);
440 length = 1 + LINK_SIZE;
441 }
442 while (*end_code == OP_ALT);
443 }
444 }
445
446 workspace[0] = 0; /* Bit indicating which vector is current */
447
448 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
449
450 /* Loop for scanning the subject */
451
452 ptr = current_subject;
453 for (;;)
454 {
455 int i, j;
456 int clen, dlen;
457 unsigned int c, d;
458
459 /* Make the new state list into the active state list and empty the
460 new state list. */
461
462 temp_states = active_states;
463 active_states = new_states;
464 new_states = temp_states;
465 active_count = new_count;
466 new_count = 0;
467
468 workspace[0] ^= 1; /* Remember for the restarting feature */
469 workspace[1] = active_count;
470
471 #ifdef DEBUG
472 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
473 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
474 printf("\"\n");
475
476 printf("%.*sActive states: ", rlevel*2-2, SP);
477 for (i = 0; i < active_count; i++)
478 printf("%d/%d ", active_states[i].offset, active_states[i].count);
479 printf("\n");
480 #endif
481
482 /* Set the pointers for adding new states */
483
484 next_active_state = active_states + active_count;
485 next_new_state = new_states;
486
487 /* Load the current character from the subject outside the loop, as many
488 different states may want to look at it, and we assume that at least one
489 will. */
490
491 if (ptr < end_subject)
492 {
493 clen = 1; /* Number of bytes in the character */
494 #ifdef SUPPORT_UTF8
495 if (utf8) { GETCHARLEN(c, ptr, clen); } else
496 #endif /* SUPPORT_UTF8 */
497 c = *ptr;
498 }
499 else
500 {
501 clen = 0; /* This indicates the end of the subject */
502 c = NOTACHAR; /* This value should never actually be used */
503 }
504
505 /* Scan up the active states and act on each one. The result of an action
506 may be to add more states to the currently active list (e.g. on hitting a
507 parenthesis) or it may be to put states on the new list, for considering
508 when we move the character pointer on. */
509
510 for (i = 0; i < active_count; i++)
511 {
512 stateblock *current_state = active_states + i;
513 const uschar *code;
514 int state_offset = current_state->offset;
515 int count, codevalue;
516
517 #ifdef DEBUG
518 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
519 if (clen == 0) printf("EOL\n");
520 else if (c > 32 && c < 127) printf("'%c'\n", c);
521 else printf("0x%02x\n", c);
522 #endif
523
524 /* This variable is referred to implicity in the ADD_xxx macros. */
525
526 ims = current_state->ims;
527
528 /* A negative offset is a special case meaning "hold off going to this
529 (negated) state until the number of characters in the data field have
530 been skipped". */
531
532 if (state_offset < 0)
533 {
534 if (current_state->data > 0)
535 {
536 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
537 ADD_NEW_DATA(state_offset, current_state->count,
538 current_state->data - 1);
539 continue;
540 }
541 else
542 {
543 current_state->offset = state_offset = -state_offset;
544 }
545 }
546
547 /* Check for a duplicate state with the same count, and skip if found. */
548
549 for (j = 0; j < i; j++)
550 {
551 if (active_states[j].offset == state_offset &&
552 active_states[j].count == current_state->count)
553 {
554 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
555 goto NEXT_ACTIVE_STATE;
556 }
557 }
558
559 /* The state offset is the offset to the opcode */
560
561 code = start_code + state_offset;
562 codevalue = *code;
563
564 /* If this opcode is followed by an inline character, load it. It is
565 tempting to test for the presence of a subject character here, but that
566 is wrong, because sometimes zero repetitions of the subject are
567 permitted.
568
569 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570 argument that is not a data character - but is always one byte long. We
571 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
572 this case. To keep the other cases fast, convert these ones to new opcodes.
573 */
574
575 if (coptable[codevalue] > 0)
576 {
577 dlen = 1;
578 #ifdef SUPPORT_UTF8
579 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
580 #endif /* SUPPORT_UTF8 */
581 d = code[coptable[codevalue]];
582 if (codevalue >= OP_TYPESTAR)
583 {
584 switch(d)
585 {
586 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
587 case OP_NOTPROP:
588 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591 case OP_NOT_HSPACE:
592 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593 case OP_NOT_VSPACE:
594 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595 default: break;
596 }
597 }
598 }
599 else
600 {
601 dlen = 0; /* Not strictly necessary, but compilers moan */
602 d = NOTACHAR; /* if these variables are not set. */
603 }
604
605
606 /* Now process the individual opcodes */
607
608 switch (codevalue)
609 {
610
611 /* ========================================================================== */
612 /* Reached a closing bracket. If not at the end of the pattern, carry
613 on with the next opcode. Otherwise, unless we have an empty string and
614 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
615 matches so we always have the longest first. */
616
617 case OP_KET:
618 case OP_KETRMIN:
619 case OP_KETRMAX:
620 if (code != end_code)
621 {
622 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
623 if (codevalue != OP_KET)
624 {
625 ADD_ACTIVE(state_offset - GET(code, 1), 0);
626 }
627 }
628 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
629 {
630 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
631 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
632 match_count = 0;
633 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
634 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
635 if (offsetcount >= 2)
636 {
637 offsets[0] = current_subject - start_subject;
638 offsets[1] = ptr - start_subject;
639 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
640 offsets[1] - offsets[0], current_subject));
641 }
642 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
643 {
644 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
645 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
646 match_count, rlevel*2-2, SP));
647 return match_count;
648 }
649 }
650 break;
651
652 /* ========================================================================== */
653 /* These opcodes add to the current list of states without looking
654 at the current character. */
655
656 /*-----------------------------------------------------------------*/
657 case OP_ALT:
658 do { code += GET(code, 1); } while (*code == OP_ALT);
659 ADD_ACTIVE(code - start_code, 0);
660 break;
661
662 /*-----------------------------------------------------------------*/
663 case OP_BRA:
664 case OP_SBRA:
665 do
666 {
667 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
668 code += GET(code, 1);
669 }
670 while (*code == OP_ALT);
671 break;
672
673 /*-----------------------------------------------------------------*/
674 case OP_CBRA:
675 case OP_SCBRA:
676 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
677 code += GET(code, 1);
678 while (*code == OP_ALT)
679 {
680 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
681 code += GET(code, 1);
682 }
683 break;
684
685 /*-----------------------------------------------------------------*/
686 case OP_BRAZERO:
687 case OP_BRAMINZERO:
688 ADD_ACTIVE(state_offset + 1, 0);
689 code += 1 + GET(code, 2);
690 while (*code == OP_ALT) code += GET(code, 1);
691 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
692 break;
693
694 /*-----------------------------------------------------------------*/
695 case OP_SKIPZERO:
696 code += 1 + GET(code, 2);
697 while (*code == OP_ALT) code += GET(code, 1);
698 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
699 break;
700
701 /*-----------------------------------------------------------------*/
702 case OP_CIRC:
703 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
704 ((ims & PCRE_MULTILINE) != 0 &&
705 ptr != end_subject &&
706 WAS_NEWLINE(ptr)))
707 { ADD_ACTIVE(state_offset + 1, 0); }
708 break;
709
710 /*-----------------------------------------------------------------*/
711 case OP_EOD:
712 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
713 break;
714
715 /*-----------------------------------------------------------------*/
716 case OP_OPT:
717 ims = code[1];
718 ADD_ACTIVE(state_offset + 2, 0);
719 break;
720
721 /*-----------------------------------------------------------------*/
722 case OP_SOD:
723 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
724 break;
725
726 /*-----------------------------------------------------------------*/
727 case OP_SOM:
728 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
729 break;
730
731
732 /* ========================================================================== */
733 /* These opcodes inspect the next subject character, and sometimes
734 the previous one as well, but do not have an argument. The variable
735 clen contains the length of the current character and is zero if we are
736 at the end of the subject. */
737
738 /*-----------------------------------------------------------------*/
739 case OP_ANY:
740 if (clen > 0 && !IS_NEWLINE(ptr))
741 { ADD_NEW(state_offset + 1, 0); }
742 break;
743
744 /*-----------------------------------------------------------------*/
745 case OP_ALLANY:
746 if (clen > 0)
747 { ADD_NEW(state_offset + 1, 0); }
748 break;
749
750 /*-----------------------------------------------------------------*/
751 case OP_EODN:
752 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
753 { ADD_ACTIVE(state_offset + 1, 0); }
754 break;
755
756 /*-----------------------------------------------------------------*/
757 case OP_DOLL:
758 if ((md->moptions & PCRE_NOTEOL) == 0)
759 {
760 if (clen == 0 ||
761 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
762 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
763 ))
764 { ADD_ACTIVE(state_offset + 1, 0); }
765 }
766 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
767 { ADD_ACTIVE(state_offset + 1, 0); }
768 break;
769
770 /*-----------------------------------------------------------------*/
771
772 case OP_DIGIT:
773 case OP_WHITESPACE:
774 case OP_WORDCHAR:
775 if (clen > 0 && c < 256 &&
776 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
777 { ADD_NEW(state_offset + 1, 0); }
778 break;
779
780 /*-----------------------------------------------------------------*/
781 case OP_NOT_DIGIT:
782 case OP_NOT_WHITESPACE:
783 case OP_NOT_WORDCHAR:
784 if (clen > 0 && (c >= 256 ||
785 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
786 { ADD_NEW(state_offset + 1, 0); }
787 break;
788
789 /*-----------------------------------------------------------------*/
790 case OP_WORD_BOUNDARY:
791 case OP_NOT_WORD_BOUNDARY:
792 {
793 int left_word, right_word;
794
795 if (ptr > start_subject)
796 {
797 const uschar *temp = ptr - 1;
798 #ifdef SUPPORT_UTF8
799 if (utf8) BACKCHAR(temp);
800 #endif
801 GETCHARTEST(d, temp);
802 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
803 }
804 else left_word = 0;
805
806 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
807 else right_word = 0;
808
809 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
810 { ADD_ACTIVE(state_offset + 1, 0); }
811 }
812 break;
813
814
815 /*-----------------------------------------------------------------*/
816 /* Check the next character by Unicode property. We will get here only
817 if the support is in the binary; otherwise a compile-time error occurs.
818 */
819
820 #ifdef SUPPORT_UCP
821 case OP_PROP:
822 case OP_NOTPROP:
823 if (clen > 0)
824 {
825 BOOL OK;
826 const ucd_record * prop = GET_UCD(c);
827 switch(code[1])
828 {
829 case PT_ANY:
830 OK = TRUE;
831 break;
832
833 case PT_LAMP:
834 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
835 break;
836
837 case PT_GC:
838 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
839 break;
840
841 case PT_PC:
842 OK = prop->chartype == code[2];
843 break;
844
845 case PT_SC:
846 OK = prop->script == code[2];
847 break;
848
849 /* Should never occur, but keep compilers from grumbling. */
850
851 default:
852 OK = codevalue != OP_PROP;
853 break;
854 }
855
856 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
857 }
858 break;
859 #endif
860
861
862
863 /* ========================================================================== */
864 /* These opcodes likewise inspect the subject character, but have an
865 argument that is not a data character. It is one of these opcodes:
866 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE,
867 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
868
869 case OP_TYPEPLUS:
870 case OP_TYPEMINPLUS:
871 case OP_TYPEPOSPLUS:
872 count = current_state->count; /* Already matched */
873 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
874 if (clen > 0)
875 {
876 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
877 (c < 256 &&
878 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
879 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
880 {
881 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
882 {
883 active_count--; /* Remove non-match possibility */
884 next_active_state--;
885 }
886 count++;
887 ADD_NEW(state_offset, count);
888 }
889 }
890 break;
891
892 /*-----------------------------------------------------------------*/
893 case OP_TYPEQUERY:
894 case OP_TYPEMINQUERY:
895 case OP_TYPEPOSQUERY:
896 ADD_ACTIVE(state_offset + 2, 0);
897 if (clen > 0)
898 {
899 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
900 (c < 256 &&
901 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
902 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
903 {
904 if (codevalue == OP_TYPEPOSQUERY)
905 {
906 active_count--; /* Remove non-match possibility */
907 next_active_state--;
908 }
909 ADD_NEW(state_offset + 2, 0);
910 }
911 }
912 break;
913
914 /*-----------------------------------------------------------------*/
915 case OP_TYPESTAR:
916 case OP_TYPEMINSTAR:
917 case OP_TYPEPOSSTAR:
918 ADD_ACTIVE(state_offset + 2, 0);
919 if (clen > 0)
920 {
921 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
922 (c < 256 &&
923 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
924 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
925 {
926 if (codevalue == OP_TYPEPOSSTAR)
927 {
928 active_count--; /* Remove non-match possibility */
929 next_active_state--;
930 }
931 ADD_NEW(state_offset, 0);
932 }
933 }
934 break;
935
936 /*-----------------------------------------------------------------*/
937 case OP_TYPEEXACT:
938 count = current_state->count; /* Number already matched */
939 if (clen > 0)
940 {
941 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
942 (c < 256 &&
943 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
944 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
945 {
946 if (++count >= GET2(code, 1))
947 { ADD_NEW(state_offset + 4, 0); }
948 else
949 { ADD_NEW(state_offset, count); }
950 }
951 }
952 break;
953
954 /*-----------------------------------------------------------------*/
955 case OP_TYPEUPTO:
956 case OP_TYPEMINUPTO:
957 case OP_TYPEPOSUPTO:
958 ADD_ACTIVE(state_offset + 4, 0);
959 count = current_state->count; /* Number already matched */
960 if (clen > 0)
961 {
962 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
963 (c < 256 &&
964 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
965 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
966 {
967 if (codevalue == OP_TYPEPOSUPTO)
968 {
969 active_count--; /* Remove non-match possibility */
970 next_active_state--;
971 }
972 if (++count >= GET2(code, 1))
973 { ADD_NEW(state_offset + 4, 0); }
974 else
975 { ADD_NEW(state_offset, count); }
976 }
977 }
978 break;
979
980 /* ========================================================================== */
981 /* These are virtual opcodes that are used when something like
982 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
983 argument. It keeps the code above fast for the other cases. The argument
984 is in the d variable. */
985
986 #ifdef SUPPORT_UCP
987 case OP_PROP_EXTRA + OP_TYPEPLUS:
988 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
989 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
990 count = current_state->count; /* Already matched */
991 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
992 if (clen > 0)
993 {
994 BOOL OK;
995 const ucd_record * prop = GET_UCD(c);
996 switch(code[2])
997 {
998 case PT_ANY:
999 OK = TRUE;
1000 break;
1001
1002 case PT_LAMP:
1003 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1004 break;
1005
1006 case PT_GC:
1007 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1008 break;
1009
1010 case PT_PC:
1011 OK = prop->chartype == code[3];
1012 break;
1013
1014 case PT_SC:
1015 OK = prop->script == code[3];
1016 break;
1017
1018 /* Should never occur, but keep compilers from grumbling. */
1019
1020 default:
1021 OK = codevalue != OP_PROP;
1022 break;
1023 }
1024
1025 if (OK == (d == OP_PROP))
1026 {
1027 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1028 {
1029 active_count--; /* Remove non-match possibility */
1030 next_active_state--;
1031 }
1032 count++;
1033 ADD_NEW(state_offset, count);
1034 }
1035 }
1036 break;
1037
1038 /*-----------------------------------------------------------------*/
1039 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1040 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1041 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1042 count = current_state->count; /* Already matched */
1043 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1044 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1045 {
1046 const uschar *nptr = ptr + clen;
1047 int ncount = 0;
1048 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1049 {
1050 active_count--; /* Remove non-match possibility */
1051 next_active_state--;
1052 }
1053 while (nptr < end_subject)
1054 {
1055 int nd;
1056 int ndlen = 1;
1057 GETCHARLEN(nd, nptr, ndlen);
1058 if (UCD_CATEGORY(nd) != ucp_M) break;
1059 ncount++;
1060 nptr += ndlen;
1061 }
1062 count++;
1063 ADD_NEW_DATA(-state_offset, count, ncount);
1064 }
1065 break;
1066 #endif
1067
1068 /*-----------------------------------------------------------------*/
1069 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1070 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1071 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1072 count = current_state->count; /* Already matched */
1073 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1074 if (clen > 0)
1075 {
1076 int ncount = 0;
1077 switch (c)
1078 {
1079 case 0x000b:
1080 case 0x000c:
1081 case 0x0085:
1082 case 0x2028:
1083 case 0x2029:
1084 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1085 goto ANYNL01;
1086
1087 case 0x000d:
1088 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1089 /* Fall through */
1090
1091 ANYNL01:
1092 case 0x000a:
1093 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1094 {
1095 active_count--; /* Remove non-match possibility */
1096 next_active_state--;
1097 }
1098 count++;
1099 ADD_NEW_DATA(-state_offset, count, ncount);
1100 break;
1101
1102 default:
1103 break;
1104 }
1105 }
1106 break;
1107
1108 /*-----------------------------------------------------------------*/
1109 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1110 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1111 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1112 count = current_state->count; /* Already matched */
1113 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1114 if (clen > 0)
1115 {
1116 BOOL OK;
1117 switch (c)
1118 {
1119 case 0x000a:
1120 case 0x000b:
1121 case 0x000c:
1122 case 0x000d:
1123 case 0x0085:
1124 case 0x2028:
1125 case 0x2029:
1126 OK = TRUE;
1127 break;
1128
1129 default:
1130 OK = FALSE;
1131 break;
1132 }
1133
1134 if (OK == (d == OP_VSPACE))
1135 {
1136 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1137 {
1138 active_count--; /* Remove non-match possibility */
1139 next_active_state--;
1140 }
1141 count++;
1142 ADD_NEW_DATA(-state_offset, count, 0);
1143 }
1144 }
1145 break;
1146
1147 /*-----------------------------------------------------------------*/
1148 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1149 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1150 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1151 count = current_state->count; /* Already matched */
1152 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1153 if (clen > 0)
1154 {
1155 BOOL OK;
1156 switch (c)
1157 {
1158 case 0x09: /* HT */
1159 case 0x20: /* SPACE */
1160 case 0xa0: /* NBSP */
1161 case 0x1680: /* OGHAM SPACE MARK */
1162 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1163 case 0x2000: /* EN QUAD */
1164 case 0x2001: /* EM QUAD */
1165 case 0x2002: /* EN SPACE */
1166 case 0x2003: /* EM SPACE */
1167 case 0x2004: /* THREE-PER-EM SPACE */
1168 case 0x2005: /* FOUR-PER-EM SPACE */
1169 case 0x2006: /* SIX-PER-EM SPACE */
1170 case 0x2007: /* FIGURE SPACE */
1171 case 0x2008: /* PUNCTUATION SPACE */
1172 case 0x2009: /* THIN SPACE */
1173 case 0x200A: /* HAIR SPACE */
1174 case 0x202f: /* NARROW NO-BREAK SPACE */
1175 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1176 case 0x3000: /* IDEOGRAPHIC SPACE */
1177 OK = TRUE;
1178 break;
1179
1180 default:
1181 OK = FALSE;
1182 break;
1183 }
1184
1185 if (OK == (d == OP_HSPACE))
1186 {
1187 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1188 {
1189 active_count--; /* Remove non-match possibility */
1190 next_active_state--;
1191 }
1192 count++;
1193 ADD_NEW_DATA(-state_offset, count, 0);
1194 }
1195 }
1196 break;
1197
1198 /*-----------------------------------------------------------------*/
1199 #ifdef SUPPORT_UCP
1200 case OP_PROP_EXTRA + OP_TYPEQUERY:
1201 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1202 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1203 count = 4;
1204 goto QS1;
1205
1206 case OP_PROP_EXTRA + OP_TYPESTAR:
1207 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1208 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1209 count = 0;
1210
1211 QS1:
1212
1213 ADD_ACTIVE(state_offset + 4, 0);
1214 if (clen > 0)
1215 {
1216 BOOL OK;
1217 const ucd_record * prop = GET_UCD(c);
1218 switch(code[2])
1219 {
1220 case PT_ANY:
1221 OK = TRUE;
1222 break;
1223
1224 case PT_LAMP:
1225 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1226 break;
1227
1228 case PT_GC:
1229 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1230 break;
1231
1232 case PT_PC:
1233 OK = prop->chartype == code[3];
1234 break;
1235
1236 case PT_SC:
1237 OK = prop->script == code[3];
1238 break;
1239
1240 /* Should never occur, but keep compilers from grumbling. */
1241
1242 default:
1243 OK = codevalue != OP_PROP;
1244 break;
1245 }
1246
1247 if (OK == (d == OP_PROP))
1248 {
1249 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1250 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1251 {
1252 active_count--; /* Remove non-match possibility */
1253 next_active_state--;
1254 }
1255 ADD_NEW(state_offset + count, 0);
1256 }
1257 }
1258 break;
1259
1260 /*-----------------------------------------------------------------*/
1261 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1262 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1263 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1264 count = 2;
1265 goto QS2;
1266
1267 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1268 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1269 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1270 count = 0;
1271
1272 QS2:
1273
1274 ADD_ACTIVE(state_offset + 2, 0);
1275 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1276 {
1277 const uschar *nptr = ptr + clen;
1278 int ncount = 0;
1279 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1280 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1281 {
1282 active_count--; /* Remove non-match possibility */
1283 next_active_state--;
1284 }
1285 while (nptr < end_subject)
1286 {
1287 int nd;
1288 int ndlen = 1;
1289 GETCHARLEN(nd, nptr, ndlen);
1290 if (UCD_CATEGORY(nd) != ucp_M) break;
1291 ncount++;
1292 nptr += ndlen;
1293 }
1294 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1295 }
1296 break;
1297 #endif
1298
1299 /*-----------------------------------------------------------------*/
1300 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1301 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1302 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1303 count = 2;
1304 goto QS3;
1305
1306 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1307 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1308 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1309 count = 0;
1310
1311 QS3:
1312 ADD_ACTIVE(state_offset + 2, 0);
1313 if (clen > 0)
1314 {
1315 int ncount = 0;
1316 switch (c)
1317 {
1318 case 0x000b:
1319 case 0x000c:
1320 case 0x0085:
1321 case 0x2028:
1322 case 0x2029:
1323 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1324 goto ANYNL02;
1325
1326 case 0x000d:
1327 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1328 /* Fall through */
1329
1330 ANYNL02:
1331 case 0x000a:
1332 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1333 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1334 {
1335 active_count--; /* Remove non-match possibility */
1336 next_active_state--;
1337 }
1338 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1339 break;
1340
1341 default:
1342 break;
1343 }
1344 }
1345 break;
1346
1347 /*-----------------------------------------------------------------*/
1348 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1349 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1350 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1351 count = 2;
1352 goto QS4;
1353
1354 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1355 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1356 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1357 count = 0;
1358
1359 QS4:
1360 ADD_ACTIVE(state_offset + 2, 0);
1361 if (clen > 0)
1362 {
1363 BOOL OK;
1364 switch (c)
1365 {
1366 case 0x000a:
1367 case 0x000b:
1368 case 0x000c:
1369 case 0x000d:
1370 case 0x0085:
1371 case 0x2028:
1372 case 0x2029:
1373 OK = TRUE;
1374 break;
1375
1376 default:
1377 OK = FALSE;
1378 break;
1379 }
1380 if (OK == (d == OP_VSPACE))
1381 {
1382 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1383 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1384 {
1385 active_count--; /* Remove non-match possibility */
1386 next_active_state--;
1387 }
1388 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1389 }
1390 }
1391 break;
1392
1393 /*-----------------------------------------------------------------*/
1394 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1395 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1396 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1397 count = 2;
1398 goto QS5;
1399
1400 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1401 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1402 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1403 count = 0;
1404
1405 QS5:
1406 ADD_ACTIVE(state_offset + 2, 0);
1407 if (clen > 0)
1408 {
1409 BOOL OK;
1410 switch (c)
1411 {
1412 case 0x09: /* HT */
1413 case 0x20: /* SPACE */
1414 case 0xa0: /* NBSP */
1415 case 0x1680: /* OGHAM SPACE MARK */
1416 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1417 case 0x2000: /* EN QUAD */
1418 case 0x2001: /* EM QUAD */
1419 case 0x2002: /* EN SPACE */
1420 case 0x2003: /* EM SPACE */
1421 case 0x2004: /* THREE-PER-EM SPACE */
1422 case 0x2005: /* FOUR-PER-EM SPACE */
1423 case 0x2006: /* SIX-PER-EM SPACE */
1424 case 0x2007: /* FIGURE SPACE */
1425 case 0x2008: /* PUNCTUATION SPACE */
1426 case 0x2009: /* THIN SPACE */
1427 case 0x200A: /* HAIR SPACE */
1428 case 0x202f: /* NARROW NO-BREAK SPACE */
1429 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1430 case 0x3000: /* IDEOGRAPHIC SPACE */
1431 OK = TRUE;
1432 break;
1433
1434 default:
1435 OK = FALSE;
1436 break;
1437 }
1438
1439 if (OK == (d == OP_HSPACE))
1440 {
1441 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1442 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1443 {
1444 active_count--; /* Remove non-match possibility */
1445 next_active_state--;
1446 }
1447 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1448 }
1449 }
1450 break;
1451
1452 /*-----------------------------------------------------------------*/
1453 #ifdef SUPPORT_UCP
1454 case OP_PROP_EXTRA + OP_TYPEEXACT:
1455 case OP_PROP_EXTRA + OP_TYPEUPTO:
1456 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1457 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1458 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1459 { ADD_ACTIVE(state_offset + 6, 0); }
1460 count = current_state->count; /* Number already matched */
1461 if (clen > 0)
1462 {
1463 BOOL OK;
1464 const ucd_record * prop = GET_UCD(c);
1465 switch(code[4])
1466 {
1467 case PT_ANY:
1468 OK = TRUE;
1469 break;
1470
1471 case PT_LAMP:
1472 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1473 break;
1474
1475 case PT_GC:
1476 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1477 break;
1478
1479 case PT_PC:
1480 OK = prop->chartype == code[5];
1481 break;
1482
1483 case PT_SC:
1484 OK = prop->script == code[5];
1485 break;
1486
1487 /* Should never occur, but keep compilers from grumbling. */
1488
1489 default:
1490 OK = codevalue != OP_PROP;
1491 break;
1492 }
1493
1494 if (OK == (d == OP_PROP))
1495 {
1496 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1497 {
1498 active_count--; /* Remove non-match possibility */
1499 next_active_state--;
1500 }
1501 if (++count >= GET2(code, 1))
1502 { ADD_NEW(state_offset + 6, 0); }
1503 else
1504 { ADD_NEW(state_offset, count); }
1505 }
1506 }
1507 break;
1508
1509 /*-----------------------------------------------------------------*/
1510 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1511 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1512 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1513 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1514 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1515 { ADD_ACTIVE(state_offset + 4, 0); }
1516 count = current_state->count; /* Number already matched */
1517 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1518 {
1519 const uschar *nptr = ptr + clen;
1520 int ncount = 0;
1521 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1522 {
1523 active_count--; /* Remove non-match possibility */
1524 next_active_state--;
1525 }
1526 while (nptr < end_subject)
1527 {
1528 int nd;
1529 int ndlen = 1;
1530 GETCHARLEN(nd, nptr, ndlen);
1531 if (UCD_CATEGORY(nd) != ucp_M) break;
1532 ncount++;
1533 nptr += ndlen;
1534 }
1535 if (++count >= GET2(code, 1))
1536 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1537 else
1538 { ADD_NEW_DATA(-state_offset, count, ncount); }
1539 }
1540 break;
1541 #endif
1542
1543 /*-----------------------------------------------------------------*/
1544 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1545 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1546 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1547 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1548 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1549 { ADD_ACTIVE(state_offset + 4, 0); }
1550 count = current_state->count; /* Number already matched */
1551 if (clen > 0)
1552 {
1553 int ncount = 0;
1554 switch (c)
1555 {
1556 case 0x000b:
1557 case 0x000c:
1558 case 0x0085:
1559 case 0x2028:
1560 case 0x2029:
1561 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1562 goto ANYNL03;
1563
1564 case 0x000d:
1565 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1566 /* Fall through */
1567
1568 ANYNL03:
1569 case 0x000a:
1570 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1571 {
1572 active_count--; /* Remove non-match possibility */
1573 next_active_state--;
1574 }
1575 if (++count >= GET2(code, 1))
1576 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1577 else
1578 { ADD_NEW_DATA(-state_offset, count, ncount); }
1579 break;
1580
1581 default:
1582 break;
1583 }
1584 }
1585 break;
1586
1587 /*-----------------------------------------------------------------*/
1588 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1589 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1590 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1591 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1592 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1593 { ADD_ACTIVE(state_offset + 4, 0); }
1594 count = current_state->count; /* Number already matched */
1595 if (clen > 0)
1596 {
1597 BOOL OK;
1598 switch (c)
1599 {
1600 case 0x000a:
1601 case 0x000b:
1602 case 0x000c:
1603 case 0x000d:
1604 case 0x0085:
1605 case 0x2028:
1606 case 0x2029:
1607 OK = TRUE;
1608 break;
1609
1610 default:
1611 OK = FALSE;
1612 }
1613
1614 if (OK == (d == OP_VSPACE))
1615 {
1616 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1617 {
1618 active_count--; /* Remove non-match possibility */
1619 next_active_state--;
1620 }
1621 if (++count >= GET2(code, 1))
1622 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1623 else
1624 { ADD_NEW_DATA(-state_offset, count, 0); }
1625 }
1626 }
1627 break;
1628
1629 /*-----------------------------------------------------------------*/
1630 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1631 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1632 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1633 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1634 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1635 { ADD_ACTIVE(state_offset + 4, 0); }
1636 count = current_state->count; /* Number already matched */
1637 if (clen > 0)
1638 {
1639 BOOL OK;
1640 switch (c)
1641 {
1642 case 0x09: /* HT */
1643 case 0x20: /* SPACE */
1644 case 0xa0: /* NBSP */
1645 case 0x1680: /* OGHAM SPACE MARK */
1646 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1647 case 0x2000: /* EN QUAD */
1648 case 0x2001: /* EM QUAD */
1649 case 0x2002: /* EN SPACE */
1650 case 0x2003: /* EM SPACE */
1651 case 0x2004: /* THREE-PER-EM SPACE */
1652 case 0x2005: /* FOUR-PER-EM SPACE */
1653 case 0x2006: /* SIX-PER-EM SPACE */
1654 case 0x2007: /* FIGURE SPACE */
1655 case 0x2008: /* PUNCTUATION SPACE */
1656 case 0x2009: /* THIN SPACE */
1657 case 0x200A: /* HAIR SPACE */
1658 case 0x202f: /* NARROW NO-BREAK SPACE */
1659 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1660 case 0x3000: /* IDEOGRAPHIC SPACE */
1661 OK = TRUE;
1662 break;
1663
1664 default:
1665 OK = FALSE;
1666 break;
1667 }
1668
1669 if (OK == (d == OP_HSPACE))
1670 {
1671 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1672 {
1673 active_count--; /* Remove non-match possibility */
1674 next_active_state--;
1675 }
1676 if (++count >= GET2(code, 1))
1677 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1678 else
1679 { ADD_NEW_DATA(-state_offset, count, 0); }
1680 }
1681 }
1682 break;
1683
1684 /* ========================================================================== */
1685 /* These opcodes are followed by a character that is usually compared
1686 to the current subject character; it is loaded into d. We still get
1687 here even if there is no subject character, because in some cases zero
1688 repetitions are permitted. */
1689
1690 /*-----------------------------------------------------------------*/
1691 case OP_CHAR:
1692 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1693 break;
1694
1695 /*-----------------------------------------------------------------*/
1696 case OP_CHARNC:
1697 if (clen == 0) break;
1698
1699 #ifdef SUPPORT_UTF8
1700 if (utf8)
1701 {
1702 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1703 {
1704 unsigned int othercase;
1705 if (c < 128) othercase = fcc[c]; else
1706
1707 /* If we have Unicode property support, we can use it to test the
1708 other case of the character. */
1709
1710 #ifdef SUPPORT_UCP
1711 othercase = UCD_OTHERCASE(c);
1712 #else
1713 othercase = NOTACHAR;
1714 #endif
1715
1716 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1717 }
1718 }
1719 else
1720 #endif /* SUPPORT_UTF8 */
1721
1722 /* Non-UTF-8 mode */
1723 {
1724 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1725 }
1726 break;
1727
1728
1729 #ifdef SUPPORT_UCP
1730 /*-----------------------------------------------------------------*/
1731 /* This is a tricky one because it can match more than one character.
1732 Find out how many characters to skip, and then set up a negative state
1733 to wait for them to pass before continuing. */
1734
1735 case OP_EXTUNI:
1736 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1737 {
1738 const uschar *nptr = ptr + clen;
1739 int ncount = 0;
1740 while (nptr < end_subject)
1741 {
1742 int nclen = 1;
1743 GETCHARLEN(c, nptr, nclen);
1744 if (UCD_CATEGORY(c) != ucp_M) break;
1745 ncount++;
1746 nptr += nclen;
1747 }
1748 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1749 }
1750 break;
1751 #endif
1752
1753 /*-----------------------------------------------------------------*/
1754 /* This is a tricky one like EXTUNI because it too can match more than one
1755 character (when CR is followed by LF). In this case, set up a negative
1756 state to wait for one character to pass before continuing. */
1757
1758 case OP_ANYNL:
1759 if (clen > 0) switch(c)
1760 {
1761 case 0x000b:
1762 case 0x000c:
1763 case 0x0085:
1764 case 0x2028:
1765 case 0x2029:
1766 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1767
1768 case 0x000a:
1769 ADD_NEW(state_offset + 1, 0);
1770 break;
1771
1772 case 0x000d:
1773 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1774 {
1775 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1776 }
1777 else
1778 {
1779 ADD_NEW(state_offset + 1, 0);
1780 }
1781 break;
1782 }
1783 break;
1784
1785 /*-----------------------------------------------------------------*/
1786 case OP_NOT_VSPACE:
1787 if (clen > 0) switch(c)
1788 {
1789 case 0x000a:
1790 case 0x000b:
1791 case 0x000c:
1792 case 0x000d:
1793 case 0x0085:
1794 case 0x2028:
1795 case 0x2029:
1796 break;
1797
1798 default:
1799 ADD_NEW(state_offset + 1, 0);
1800 break;
1801 }
1802 break;
1803
1804 /*-----------------------------------------------------------------*/
1805 case OP_VSPACE:
1806 if (clen > 0) switch(c)
1807 {
1808 case 0x000a:
1809 case 0x000b:
1810 case 0x000c:
1811 case 0x000d:
1812 case 0x0085:
1813 case 0x2028:
1814 case 0x2029:
1815 ADD_NEW(state_offset + 1, 0);
1816 break;
1817
1818 default: break;
1819 }
1820 break;
1821
1822 /*-----------------------------------------------------------------*/
1823 case OP_NOT_HSPACE:
1824 if (clen > 0) switch(c)
1825 {
1826 case 0x09: /* HT */
1827 case 0x20: /* SPACE */
1828 case 0xa0: /* NBSP */
1829 case 0x1680: /* OGHAM SPACE MARK */
1830 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1831 case 0x2000: /* EN QUAD */
1832 case 0x2001: /* EM QUAD */
1833 case 0x2002: /* EN SPACE */
1834 case 0x2003: /* EM SPACE */
1835 case 0x2004: /* THREE-PER-EM SPACE */
1836 case 0x2005: /* FOUR-PER-EM SPACE */
1837 case 0x2006: /* SIX-PER-EM SPACE */
1838 case 0x2007: /* FIGURE SPACE */
1839 case 0x2008: /* PUNCTUATION SPACE */
1840 case 0x2009: /* THIN SPACE */
1841 case 0x200A: /* HAIR SPACE */
1842 case 0x202f: /* NARROW NO-BREAK SPACE */
1843 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1844 case 0x3000: /* IDEOGRAPHIC SPACE */
1845 break;
1846
1847 default:
1848 ADD_NEW(state_offset + 1, 0);
1849 break;
1850 }
1851 break;
1852
1853 /*-----------------------------------------------------------------*/
1854 case OP_HSPACE:
1855 if (clen > 0) switch(c)
1856 {
1857 case 0x09: /* HT */
1858 case 0x20: /* SPACE */
1859 case 0xa0: /* NBSP */
1860 case 0x1680: /* OGHAM SPACE MARK */
1861 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1862 case 0x2000: /* EN QUAD */
1863 case 0x2001: /* EM QUAD */
1864 case 0x2002: /* EN SPACE */
1865 case 0x2003: /* EM SPACE */
1866 case 0x2004: /* THREE-PER-EM SPACE */
1867 case 0x2005: /* FOUR-PER-EM SPACE */
1868 case 0x2006: /* SIX-PER-EM SPACE */
1869 case 0x2007: /* FIGURE SPACE */
1870 case 0x2008: /* PUNCTUATION SPACE */
1871 case 0x2009: /* THIN SPACE */
1872 case 0x200A: /* HAIR SPACE */
1873 case 0x202f: /* NARROW NO-BREAK SPACE */
1874 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1875 case 0x3000: /* IDEOGRAPHIC SPACE */
1876 ADD_NEW(state_offset + 1, 0);
1877 break;
1878 }
1879 break;
1880
1881 /*-----------------------------------------------------------------*/
1882 /* Match a negated single character. This is only used for one-byte
1883 characters, that is, we know that d < 256. The character we are
1884 checking (c) can be multibyte. */
1885
1886 case OP_NOT:
1887 if (clen > 0)
1888 {
1889 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1890 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1891 }
1892 break;
1893
1894 /*-----------------------------------------------------------------*/
1895 case OP_PLUS:
1896 case OP_MINPLUS:
1897 case OP_POSPLUS:
1898 case OP_NOTPLUS:
1899 case OP_NOTMINPLUS:
1900 case OP_NOTPOSPLUS:
1901 count = current_state->count; /* Already matched */
1902 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1903 if (clen > 0)
1904 {
1905 unsigned int otherd = NOTACHAR;
1906 if ((ims & PCRE_CASELESS) != 0)
1907 {
1908 #ifdef SUPPORT_UTF8
1909 if (utf8 && d >= 128)
1910 {
1911 #ifdef SUPPORT_UCP
1912 otherd = UCD_OTHERCASE(d);
1913 #endif /* SUPPORT_UCP */
1914 }
1915 else
1916 #endif /* SUPPORT_UTF8 */
1917 otherd = fcc[d];
1918 }
1919 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1920 {
1921 if (count > 0 &&
1922 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1923 {
1924 active_count--; /* Remove non-match possibility */
1925 next_active_state--;
1926 }
1927 count++;
1928 ADD_NEW(state_offset, count);
1929 }
1930 }
1931 break;
1932
1933 /*-----------------------------------------------------------------*/
1934 case OP_QUERY:
1935 case OP_MINQUERY:
1936 case OP_POSQUERY:
1937 case OP_NOTQUERY:
1938 case OP_NOTMINQUERY:
1939 case OP_NOTPOSQUERY:
1940 ADD_ACTIVE(state_offset + dlen + 1, 0);
1941 if (clen > 0)
1942 {
1943 unsigned int otherd = NOTACHAR;
1944 if ((ims & PCRE_CASELESS) != 0)
1945 {
1946 #ifdef SUPPORT_UTF8
1947 if (utf8 && d >= 128)
1948 {
1949 #ifdef SUPPORT_UCP
1950 otherd = UCD_OTHERCASE(d);
1951 #endif /* SUPPORT_UCP */
1952 }
1953 else
1954 #endif /* SUPPORT_UTF8 */
1955 otherd = fcc[d];
1956 }
1957 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1958 {
1959 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1960 {
1961 active_count--; /* Remove non-match possibility */
1962 next_active_state--;
1963 }
1964 ADD_NEW(state_offset + dlen + 1, 0);
1965 }
1966 }
1967 break;
1968
1969 /*-----------------------------------------------------------------*/
1970 case OP_STAR:
1971 case OP_MINSTAR:
1972 case OP_POSSTAR:
1973 case OP_NOTSTAR:
1974 case OP_NOTMINSTAR:
1975 case OP_NOTPOSSTAR:
1976 ADD_ACTIVE(state_offset + dlen + 1, 0);
1977 if (clen > 0)
1978 {
1979 unsigned int otherd = NOTACHAR;
1980 if ((ims & PCRE_CASELESS) != 0)
1981 {
1982 #ifdef SUPPORT_UTF8
1983 if (utf8 && d >= 128)
1984 {
1985 #ifdef SUPPORT_UCP
1986 otherd = UCD_OTHERCASE(d);
1987 #endif /* SUPPORT_UCP */
1988 }
1989 else
1990 #endif /* SUPPORT_UTF8 */
1991 otherd = fcc[d];
1992 }
1993 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1994 {
1995 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1996 {
1997 active_count--; /* Remove non-match possibility */
1998 next_active_state--;
1999 }
2000 ADD_NEW(state_offset, 0);
2001 }
2002 }
2003 break;
2004
2005 /*-----------------------------------------------------------------*/
2006 case OP_EXACT:
2007 case OP_NOTEXACT:
2008 count = current_state->count; /* Number already matched */
2009 if (clen > 0)
2010 {
2011 unsigned int otherd = NOTACHAR;
2012 if ((ims & PCRE_CASELESS) != 0)
2013 {
2014 #ifdef SUPPORT_UTF8
2015 if (utf8 && d >= 128)
2016 {
2017 #ifdef SUPPORT_UCP
2018 otherd = UCD_OTHERCASE(d);
2019 #endif /* SUPPORT_UCP */
2020 }
2021 else
2022 #endif /* SUPPORT_UTF8 */
2023 otherd = fcc[d];
2024 }
2025 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2026 {
2027 if (++count >= GET2(code, 1))
2028 { ADD_NEW(state_offset + dlen + 3, 0); }
2029 else
2030 { ADD_NEW(state_offset, count); }
2031 }
2032 }
2033 break;
2034
2035 /*-----------------------------------------------------------------*/
2036 case OP_UPTO:
2037 case OP_MINUPTO:
2038 case OP_POSUPTO:
2039 case OP_NOTUPTO:
2040 case OP_NOTMINUPTO:
2041 case OP_NOTPOSUPTO:
2042 ADD_ACTIVE(state_offset + dlen + 3, 0);
2043 count = current_state->count; /* Number already matched */
2044 if (clen > 0)
2045 {
2046 unsigned int otherd = NOTACHAR;
2047 if ((ims & PCRE_CASELESS) != 0)
2048 {
2049 #ifdef SUPPORT_UTF8
2050 if (utf8 && d >= 128)
2051 {
2052 #ifdef SUPPORT_UCP
2053 otherd = UCD_OTHERCASE(d);
2054 #endif /* SUPPORT_UCP */
2055 }
2056 else
2057 #endif /* SUPPORT_UTF8 */
2058 otherd = fcc[d];
2059 }
2060 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2061 {
2062 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2063 {
2064 active_count--; /* Remove non-match possibility */
2065 next_active_state--;
2066 }
2067 if (++count >= GET2(code, 1))
2068 { ADD_NEW(state_offset + dlen + 3, 0); }
2069 else
2070 { ADD_NEW(state_offset, count); }
2071 }
2072 }
2073 break;
2074
2075
2076 /* ========================================================================== */
2077 /* These are the class-handling opcodes */
2078
2079 case OP_CLASS:
2080 case OP_NCLASS:
2081 case OP_XCLASS:
2082 {
2083 BOOL isinclass = FALSE;
2084 int next_state_offset;
2085 const uschar *ecode;
2086
2087 /* For a simple class, there is always just a 32-byte table, and we
2088 can set isinclass from it. */
2089
2090 if (codevalue != OP_XCLASS)
2091 {
2092 ecode = code + 33;
2093 if (clen > 0)
2094 {
2095 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2096 ((code[1 + c/8] & (1 << (c&7))) != 0);
2097 }
2098 }
2099
2100 /* An extended class may have a table or a list of single characters,
2101 ranges, or both, and it may be positive or negative. There's a
2102 function that sorts all this out. */
2103
2104 else
2105 {
2106 ecode = code + GET(code, 1);
2107 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2108 }
2109
2110 /* At this point, isinclass is set for all kinds of class, and ecode
2111 points to the byte after the end of the class. If there is a
2112 quantifier, this is where it will be. */
2113
2114 next_state_offset = ecode - start_code;
2115
2116 switch (*ecode)
2117 {
2118 case OP_CRSTAR:
2119 case OP_CRMINSTAR:
2120 ADD_ACTIVE(next_state_offset + 1, 0);
2121 if (isinclass) { ADD_NEW(state_offset, 0); }
2122 break;
2123
2124 case OP_CRPLUS:
2125 case OP_CRMINPLUS:
2126 count = current_state->count; /* Already matched */
2127 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2128 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2129 break;
2130
2131 case OP_CRQUERY:
2132 case OP_CRMINQUERY:
2133 ADD_ACTIVE(next_state_offset + 1, 0);
2134 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2135 break;
2136
2137 case OP_CRRANGE:
2138 case OP_CRMINRANGE:
2139 count = current_state->count; /* Already matched */
2140 if (count >= GET2(ecode, 1))
2141 { ADD_ACTIVE(next_state_offset + 5, 0); }
2142 if (isinclass)
2143 {
2144 int max = GET2(ecode, 3);
2145 if (++count >= max && max != 0) /* Max 0 => no limit */
2146 { ADD_NEW(next_state_offset + 5, 0); }
2147 else
2148 { ADD_NEW(state_offset, count); }
2149 }
2150 break;
2151
2152 default:
2153 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2154 break;
2155 }
2156 }
2157 break;
2158
2159 /* ========================================================================== */
2160 /* These are the opcodes for fancy brackets of various kinds. We have
2161 to use recursion in order to handle them. The "always failing" assertion
2162 (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2163 though the other "backtracking verbs" are not supported. */
2164
2165 case OP_FAIL:
2166 break;
2167
2168 case OP_ASSERT:
2169 case OP_ASSERT_NOT:
2170 case OP_ASSERTBACK:
2171 case OP_ASSERTBACK_NOT:
2172 {
2173 int rc;
2174 int local_offsets[2];
2175 int local_workspace[1000];
2176 const uschar *endasscode = code + GET(code, 1);
2177
2178 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2179
2180 rc = internal_dfa_exec(
2181 md, /* static match data */
2182 code, /* this subexpression's code */
2183 ptr, /* where we currently are */
2184 ptr - start_subject, /* start offset */
2185 local_offsets, /* offset vector */
2186 sizeof(local_offsets)/sizeof(int), /* size of same */
2187 local_workspace, /* workspace vector */
2188 sizeof(local_workspace)/sizeof(int), /* size of same */
2189 ims, /* the current ims flags */
2190 rlevel, /* function recursion level */
2191 recursing); /* pass on regex recursion */
2192
2193 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2194 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2195 }
2196 break;
2197
2198 /*-----------------------------------------------------------------*/
2199 case OP_COND:
2200 case OP_SCOND:
2201 {
2202 int local_offsets[1000];
2203 int local_workspace[1000];
2204 int condcode = code[LINK_SIZE+1];
2205
2206 /* Back reference conditions are not supported */
2207
2208 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2209
2210 /* The DEFINE condition is always false */
2211
2212 if (condcode == OP_DEF)
2213 {
2214 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2215 }
2216
2217 /* The only supported version of OP_RREF is for the value RREF_ANY,
2218 which means "test if in any recursion". We can't test for specifically
2219 recursed groups. */
2220
2221 else if (condcode == OP_RREF)
2222 {
2223 int value = GET2(code, LINK_SIZE+2);
2224 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2225 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2226 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2227 }
2228
2229 /* Otherwise, the condition is an assertion */
2230
2231 else
2232 {
2233 int rc;
2234 const uschar *asscode = code + LINK_SIZE + 1;
2235 const uschar *endasscode = asscode + GET(asscode, 1);
2236
2237 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2238
2239 rc = internal_dfa_exec(
2240 md, /* fixed match data */
2241 asscode, /* this subexpression's code */
2242 ptr, /* where we currently are */
2243 ptr - start_subject, /* start offset */
2244 local_offsets, /* offset vector */
2245 sizeof(local_offsets)/sizeof(int), /* size of same */
2246 local_workspace, /* workspace vector */
2247 sizeof(local_workspace)/sizeof(int), /* size of same */
2248 ims, /* the current ims flags */
2249 rlevel, /* function recursion level */
2250 recursing); /* pass on regex recursion */
2251
2252 if ((rc >= 0) ==
2253 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2254 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2255 else
2256 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2257 }
2258 }
2259 break;
2260
2261 /*-----------------------------------------------------------------*/
2262 case OP_RECURSE:
2263 {
2264 int local_offsets[1000];
2265 int local_workspace[1000];
2266 int rc;
2267
2268 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2269 recursing + 1));
2270
2271 rc = internal_dfa_exec(
2272 md, /* fixed match data */
2273 start_code + GET(code, 1), /* this subexpression's code */
2274 ptr, /* where we currently are */
2275 ptr - start_subject, /* start offset */
2276 local_offsets, /* offset vector */
2277 sizeof(local_offsets)/sizeof(int), /* size of same */
2278 local_workspace, /* workspace vector */
2279 sizeof(local_workspace)/sizeof(int), /* size of same */
2280 ims, /* the current ims flags */
2281 rlevel, /* function recursion level */
2282 recursing + 1); /* regex recurse level */
2283
2284 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2285 recursing + 1, rc));
2286
2287 /* Ran out of internal offsets */
2288
2289 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2290
2291 /* For each successful matched substring, set up the next state with a
2292 count of characters to skip before trying it. Note that the count is in
2293 characters, not bytes. */
2294
2295 if (rc > 0)
2296 {
2297 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2298 {
2299 const uschar *p = start_subject + local_offsets[rc];
2300 const uschar *pp = start_subject + local_offsets[rc+1];
2301 int charcount = local_offsets[rc+1] - local_offsets[rc];
2302 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2303 if (charcount > 0)
2304 {
2305 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2306 }
2307 else
2308 {
2309 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2310 }
2311 }
2312 }
2313 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2314 }
2315 break;
2316
2317 /*-----------------------------------------------------------------*/
2318 case OP_ONCE:
2319 {
2320 int local_offsets[2];
2321 int local_workspace[1000];
2322
2323 int rc = internal_dfa_exec(
2324 md, /* fixed match data */
2325 code, /* this subexpression's code */
2326 ptr, /* where we currently are */
2327 ptr - start_subject, /* start offset */
2328 local_offsets, /* offset vector */
2329 sizeof(local_offsets)/sizeof(int), /* size of same */
2330 local_workspace, /* workspace vector */
2331 sizeof(local_workspace)/sizeof(int), /* size of same */
2332 ims, /* the current ims flags */
2333 rlevel, /* function recursion level */
2334 recursing); /* pass on regex recursion */
2335
2336 if (rc >= 0)
2337 {
2338 const uschar *end_subpattern = code;
2339 int charcount = local_offsets[1] - local_offsets[0];
2340 int next_state_offset, repeat_state_offset;
2341
2342 do { end_subpattern += GET(end_subpattern, 1); }
2343 while (*end_subpattern == OP_ALT);
2344 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2345
2346 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2347 arrange for the repeat state also to be added to the relevant list.
2348 Calculate the offset, or set -1 for no repeat. */
2349
2350 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2351 *end_subpattern == OP_KETRMIN)?
2352 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2353
2354 /* If we have matched an empty string, add the next state at the
2355 current character pointer. This is important so that the duplicate
2356 checking kicks in, which is what breaks infinite loops that match an
2357 empty string. */
2358
2359 if (charcount == 0)
2360 {
2361 ADD_ACTIVE(next_state_offset, 0);
2362 }
2363
2364 /* Optimization: if there are no more active states, and there
2365 are no new states yet set up, then skip over the subject string
2366 right here, to save looping. Otherwise, set up the new state to swing
2367 into action when the end of the substring is reached. */
2368
2369 else if (i + 1 >= active_count && new_count == 0)
2370 {
2371 ptr += charcount;
2372 clen = 0;
2373 ADD_NEW(next_state_offset, 0);
2374
2375 /* If we are adding a repeat state at the new character position,
2376 we must fudge things so that it is the only current state.
2377 Otherwise, it might be a duplicate of one we processed before, and
2378 that would cause it to be skipped. */
2379
2380 if (repeat_state_offset >= 0)
2381 {
2382 next_active_state = active_states;
2383 active_count = 0;
2384 i = -1;
2385 ADD_ACTIVE(repeat_state_offset, 0);
2386 }
2387 }
2388 else
2389 {
2390 const uschar *p = start_subject + local_offsets[0];
2391 const uschar *pp = start_subject + local_offsets[1];
2392 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2393 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2394 if (repeat_state_offset >= 0)
2395 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2396 }
2397
2398 }
2399 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2400 }
2401 break;
2402
2403
2404 /* ========================================================================== */
2405 /* Handle callouts */
2406
2407 case OP_CALLOUT:
2408 if (pcre_callout != NULL)
2409 {
2410 int rrc;
2411 pcre_callout_block cb;
2412 cb.version = 1; /* Version 1 of the callout block */
2413 cb.callout_number = code[1];
2414 cb.offset_vector = offsets;
2415 cb.subject = (PCRE_SPTR)start_subject;
2416 cb.subject_length = end_subject - start_subject;
2417 cb.start_match = current_subject - start_subject;
2418 cb.current_position = ptr - start_subject;
2419 cb.pattern_position = GET(code, 2);
2420 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2421 cb.capture_top = 1;
2422 cb.capture_last = -1;
2423 cb.callout_data = md->callout_data;
2424 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2425 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2426 }
2427 break;
2428
2429
2430 /* ========================================================================== */
2431 default: /* Unsupported opcode */
2432 return PCRE_ERROR_DFA_UITEM;
2433 }
2434
2435 NEXT_ACTIVE_STATE: continue;
2436
2437 } /* End of loop scanning active states */
2438
2439 /* We have finished the processing at the current subject character. If no
2440 new states have been set for the next character, we have found all the
2441 matches that we are going to find. If we are at the top level and partial
2442 matching has been requested, check for appropriate conditions. */
2443
2444 if (new_count <= 0)
2445 {
2446 if (match_count < 0 && /* No matches found */
2447 rlevel == 1 && /* Top level match function */
2448 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2449 ptr >= end_subject && /* Reached end of subject */
2450 ptr > current_subject) /* Matched non-empty string */
2451 {
2452 if (offsetcount >= 2)
2453 {
2454 offsets[0] = current_subject - start_subject;
2455 offsets[1] = end_subject - start_subject;
2456 }
2457 match_count = PCRE_ERROR_PARTIAL;
2458 }
2459
2460 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2461 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2462 rlevel*2-2, SP));
2463 break; /* In effect, "return", but see the comment below */
2464 }
2465
2466 /* One or more states are active for the next character. */
2467
2468 ptr += clen; /* Advance to next subject character */
2469 } /* Loop to move along the subject string */
2470
2471 /* Control gets here from "break" a few lines above. We do it this way because
2472 if we use "return" above, we have compiler trouble. Some compilers warn if
2473 there's nothing here because they think the function doesn't return a value. On
2474 the other hand, if we put a dummy statement here, some more clever compilers
2475 complain that it can't be reached. Sigh. */
2476
2477 return match_count;
2478 }
2479
2480
2481
2482
2483 /*************************************************
2484 * Execute a Regular Expression - DFA engine *
2485 *************************************************/
2486
2487 /* This external function applies a compiled re to a subject string using a DFA
2488 engine. This function calls the internal function multiple times if the pattern
2489 is not anchored.
2490
2491 Arguments:
2492 argument_re points to the compiled expression
2493 extra_data points to extra data or is NULL
2494 subject points to the subject string
2495 length length of subject string (may contain binary zeros)
2496 start_offset where to start in the subject string
2497 options option bits
2498 offsets vector of match offsets
2499 offsetcount size of same
2500 workspace workspace vector
2501 wscount size of same
2502
2503 Returns: > 0 => number of match offset pairs placed in offsets
2504 = 0 => offsets overflowed; longest matches are present
2505 -1 => failed to match
2506 < -1 => some kind of unexpected problem
2507 */
2508
2509 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2510 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2511 const char *subject, int length, int start_offset, int options, int *offsets,
2512 int offsetcount, int *workspace, int wscount)
2513 {
2514 real_pcre *re = (real_pcre *)argument_re;
2515 dfa_match_data match_block;
2516 dfa_match_data *md = &match_block;
2517 BOOL utf8, anchored, startline, firstline;
2518 const uschar *current_subject, *end_subject, *lcc;
2519
2520 pcre_study_data internal_study;
2521 const pcre_study_data *study = NULL;
2522 real_pcre internal_re;
2523
2524 const uschar *req_byte_ptr;
2525 const uschar *start_bits = NULL;
2526 BOOL first_byte_caseless = FALSE;
2527 BOOL req_byte_caseless = FALSE;
2528 int first_byte = -1;
2529 int req_byte = -1;
2530 int req_byte2 = -1;
2531 int newline;
2532
2533 /* Plausibility checks */
2534
2535 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2536 if (re == NULL || subject == NULL || workspace == NULL ||
2537 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2538 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2539 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2540
2541 /* We need to find the pointer to any study data before we test for byte
2542 flipping, so we scan the extra_data block first. This may set two fields in the
2543 match block, so we must initialize them beforehand. However, the other fields
2544 in the match block must not be set until after the byte flipping. */
2545
2546 md->tables = re->tables;
2547 md->callout_data = NULL;
2548
2549 if (extra_data != NULL)
2550 {
2551 unsigned int flags = extra_data->flags;
2552 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2553 study = (const pcre_study_data *)extra_data->study_data;
2554 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2555 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2556 return PCRE_ERROR_DFA_UMLIMIT;
2557 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2558 md->callout_data = extra_data->callout_data;
2559 if ((flags & PCRE_EXTRA_TABLES) != 0)
2560 md->tables = extra_data->tables;
2561 }
2562
2563 /* Check that the first field in the block is the magic number. If it is not,
2564 test for a regex that was compiled on a host of opposite endianness. If this is
2565 the case, flipped values are put in internal_re and internal_study if there was
2566 study data too. */
2567
2568 if (re->magic_number != MAGIC_NUMBER)
2569 {
2570 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2571 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2572 if (study != NULL) study = &internal_study;
2573 }
2574
2575 /* Set some local values */
2576
2577 current_subject = (const unsigned char *)subject + start_offset;
2578 end_subject = (const unsigned char *)subject + length;
2579 req_byte_ptr = current_subject - 1;
2580
2581 #ifdef SUPPORT_UTF8
2582 utf8 = (re->options & PCRE_UTF8) != 0;
2583 #else
2584 utf8 = FALSE;
2585 #endif
2586
2587 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2588 (re->options & PCRE_ANCHORED) != 0;
2589
2590 /* The remaining fixed data for passing around. */
2591
2592 md->start_code = (const uschar *)argument_re +
2593 re->name_table_offset + re->name_count * re->name_entry_size;
2594 md->start_subject = (const unsigned char *)subject;
2595 md->end_subject = end_subject;
2596 md->moptions = options;
2597 md->poptions = re->options;
2598
2599 /* If the BSR option is not set at match time, copy what was set
2600 at compile time. */
2601
2602 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2603 {
2604 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2605 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2606 #ifdef BSR_ANYCRLF
2607 else md->moptions |= PCRE_BSR_ANYCRLF;
2608 #endif
2609 }
2610
2611 /* Handle different types of newline. The three bits give eight cases. If
2612 nothing is set at run time, whatever was used at compile time applies. */
2613
2614 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2615 PCRE_NEWLINE_BITS)
2616 {
2617 case 0: newline = NEWLINE; break; /* Compile-time default */
2618 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2619 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2620 case PCRE_NEWLINE_CR+
2621 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2622 case PCRE_NEWLINE_ANY: newline = -1; break;
2623 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2624 default: return PCRE_ERROR_BADNEWLINE;
2625 }
2626
2627 if (newline == -2)
2628 {
2629 md->nltype = NLTYPE_ANYCRLF;
2630 }
2631 else if (newline < 0)
2632 {
2633 md->nltype = NLTYPE_ANY;
2634 }
2635 else
2636 {
2637 md->nltype = NLTYPE_FIXED;
2638 if (newline > 255)
2639 {
2640 md->nllen = 2;
2641 md->nl[0] = (newline >> 8) & 255;
2642 md->nl[1] = newline & 255;
2643 }
2644 else
2645 {
2646 md->nllen = 1;
2647 md->nl[0] = newline;
2648 }
2649 }
2650
2651 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2652 back the character offset. */
2653
2654 #ifdef SUPPORT_UTF8
2655 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2656 {
2657 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2658 return PCRE_ERROR_BADUTF8;
2659 if (start_offset > 0 && start_offset < length)
2660 {
2661 int tb = ((uschar *)subject)[start_offset];
2662 if (tb > 127)
2663 {
2664 tb &= 0xc0;
2665 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2666 }
2667 }
2668 }
2669 #endif
2670
2671 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2672 is a feature that makes it possible to save compiled regex and re-use them
2673 in other programs later. */
2674
2675 if (md->tables == NULL) md->tables = _pcre_default_tables;
2676
2677 /* The lower casing table and the "must be at the start of a line" flag are
2678 used in a loop when finding where to start. */
2679
2680 lcc = md->tables + lcc_offset;
2681 startline = (re->flags & PCRE_STARTLINE) != 0;
2682 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2683
2684 /* Set up the first character to match, if available. The first_byte value is
2685 never set for an anchored regular expression, but the anchoring may be forced
2686 at run time, so we have to test for anchoring. The first char may be unset for
2687 an unanchored pattern, of course. If there's no first char and the pattern was
2688 studied, there may be a bitmap of possible first characters. */
2689
2690 if (!anchored)
2691 {
2692 if ((re->flags & PCRE_FIRSTSET) != 0)
2693 {
2694 first_byte = re->first_byte & 255;
2695 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2696 first_byte = lcc[first_byte];
2697 }
2698 else
2699 {
2700 if (startline && study != NULL &&
2701 (study->options & PCRE_STUDY_MAPPED) != 0)
2702 start_bits = study->start_bits;
2703 }
2704 }
2705
2706 /* For anchored or unanchored matches, there may be a "last known required
2707 character" set. */
2708
2709 if ((re->flags & PCRE_REQCHSET) != 0)
2710 {
2711 req_byte = re->req_byte & 255;
2712 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2713 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2714 }
2715
2716 /* Call the main matching function, looping for a non-anchored regex after a
2717 failed match. If not restarting, perform certain optimizations at the start of
2718 a match. */
2719
2720 for (;;)
2721 {
2722 int rc;
2723
2724 if ((options & PCRE_DFA_RESTART) == 0)
2725 {
2726 const uschar *save_end_subject = end_subject;
2727
2728 /* If firstline is TRUE, the start of the match is constrained to the first
2729 line of a multiline string. Implement this by temporarily adjusting
2730 end_subject so that we stop scanning at a newline. If the match fails at
2731 the newline, later code breaks this loop. */
2732
2733 if (firstline)
2734 {
2735 USPTR t = current_subject;
2736 #ifdef SUPPORT_UTF8
2737 if (utf8)
2738 {
2739 while (t < md->end_subject && !IS_NEWLINE(t))
2740 {
2741 t++;
2742 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2743 }
2744 }
2745 else
2746 #endif
2747 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2748 end_subject = t;
2749 }
2750
2751 /* There are some optimizations that avoid running the match if a known
2752 starting point is not found, or if a known later character is not present.
2753 However, there is an option that disables these, for testing and for
2754 ensuring that all callouts do actually occur. */
2755
2756 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2757 {
2758
2759 /* Advance to a known first byte. */
2760
2761 if (first_byte >= 0)
2762 {
2763 if (first_byte_caseless)
2764 while (current_subject < end_subject &&
2765 lcc[*current_subject] != first_byte)
2766 current_subject++;
2767 else
2768 while (current_subject < end_subject &&
2769 *current_subject != first_byte)
2770 current_subject++;
2771 }
2772
2773 /* Or to just after a linebreak for a multiline match if possible */
2774
2775 else if (startline)
2776 {
2777 if (current_subject > md->start_subject + start_offset)
2778 {
2779 #ifdef SUPPORT_UTF8
2780 if (utf8)
2781 {
2782 while (current_subject < end_subject &&
2783 !WAS_NEWLINE(current_subject))
2784 {
2785 current_subject++;
2786 while(current_subject < end_subject &&
2787 (*current_subject & 0xc0) == 0x80)
2788 current_subject++;
2789 }
2790 }
2791 else
2792 #endif
2793 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2794 current_subject++;
2795
2796 /* If we have just passed a CR and the newline option is ANY or
2797 ANYCRLF, and we are now at a LF, advance the match position by one
2798 more character. */
2799
2800 if (current_subject[-1] == CHAR_CR &&
2801 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2802 current_subject < end_subject &&
2803 *current_subject == CHAR_NL)
2804 current_subject++;
2805 }
2806 }
2807
2808 /* Or to a non-unique first char after study */
2809
2810 else if (start_bits != NULL)
2811 {
2812 while (current_subject < end_subject)
2813 {
2814 register unsigned int c = *current_subject;
2815 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2816 else break;
2817 }
2818 }
2819 }
2820
2821 /* Restore fudged end_subject */
2822
2823 end_subject = save_end_subject;
2824 }
2825
2826 /* If req_byte is set, we know that that character must appear in the subject
2827 for the match to succeed. If the first character is set, req_byte must be
2828 later in the subject; otherwise the test starts at the match point. This
2829 optimization can save a huge amount of work in patterns with nested unlimited
2830 repeats that aren't going to match. Writing separate code for cased/caseless
2831 versions makes it go faster, as does using an autoincrement and backing off
2832 on a match.
2833
2834 HOWEVER: when the subject string is very, very long, searching to its end can
2835 take a long time, and give bad performance on quite ordinary patterns. This
2836 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2837 don't do this when the string is sufficiently long.
2838
2839 ALSO: this processing is disabled when partial matching is requested, and can
2840 also be explicitly deactivated. */
2841
2842 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2843 req_byte >= 0 &&
2844 end_subject - current_subject < REQ_BYTE_MAX &&
2845 (options & PCRE_PARTIAL) == 0)
2846 {
2847 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2848
2849 /* We don't need to repeat the search if we haven't yet reached the
2850 place we found it at last time. */
2851
2852 if (p > req_byte_ptr)
2853 {
2854 if (req_byte_caseless)
2855 {
2856 while (p < end_subject)
2857 {
2858 register int pp = *p++;
2859 if (pp == req_byte || pp == req_byte2) { p--; break; }
2860 }
2861 }
2862 else
2863 {
2864 while (p < end_subject)
2865 {
2866 if (*p++ == req_byte) { p--; break; }
2867 }
2868 }
2869
2870 /* If we can't find the required character, break the matching loop,
2871 which will cause a return or PCRE_ERROR_NOMATCH. */
2872
2873 if (p >= end_subject) break;
2874
2875 /* If we have found the required character, save the point where we
2876 found it, so that we don't search again next time round the loop if
2877 the start hasn't passed this character yet. */
2878
2879 req_byte_ptr = p;
2880 }
2881 }
2882
2883 /* OK, now we can do the business */
2884
2885 rc = internal_dfa_exec(
2886 md, /* fixed match data */
2887 md->start_code, /* this subexpression's code */
2888 current_subject, /* where we currently are */
2889 start_offset, /* start offset in subject */
2890 offsets, /* offset vector */
2891 offsetcount, /* size of same */
2892 workspace, /* workspace vector */
2893 wscount, /* size of same */
2894 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2895 0, /* function recurse level */
2896 0); /* regex recurse level */
2897
2898 /* Anything other than "no match" means we are done, always; otherwise, carry
2899 on only if not anchored. */
2900
2901 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2902
2903 /* Advance to the next subject character unless we are at the end of a line
2904 and firstline is set. */
2905
2906 if (firstline && IS_NEWLINE(current_subject)) break;
2907 current_subject++;
2908 if (utf8)
2909 {
2910 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2911 current_subject++;
2912 }
2913 if (current_subject > end_subject) break;
2914
2915 /* If we have just passed a CR and we are now at a LF, and the pattern does
2916 not contain any explicit matches for \r or \n, and the newline option is CRLF
2917 or ANY or ANYCRLF, advance the match position by one more character. */
2918
2919 if (current_subject[-1] == CHAR_CR &&
2920 current_subject < end_subject &&
2921 *current_subject == CHAR_NL &&
2922 (re->flags & PCRE_HASCRORLF) == 0 &&
2923 (md->nltype == NLTYPE_ANY ||
2924 md->nltype == NLTYPE_ANYCRLF ||
2925 md->nllen == 2))
2926 current_subject++;
2927
2928 } /* "Bumpalong" loop */
2929
2930 return PCRE_ERROR_NOMATCH;
2931 }
2932
2933 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12