/[pcre]/code/trunk/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 199 - (show annotations) (download)
Tue Jul 31 14:39:09 2007 UTC (7 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 94242 byte(s)
Daniel's patch for config.h and Windows DLL declarations (not fully working).

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains the external function pcre_dfa_exec(), which is an
42 alternative matching function that uses a sort of DFA algorithm (not a true
43 FSM). This is NOT Perl-compatible, but it has advantages in certain
44 applications. */
45
46
47 #ifdef HAVE_CONFIG_H
48 #include <config.h>
49 #endif
50
51 #define NLBLOCK md /* Block containing newline information */
52 #define PSSTART start_subject /* Field containing processed string start */
53 #define PSEND end_subject /* Field containing processed string end */
54
55 #include "pcre_internal.h"
56
57
58 /* For use to indent debugging output */
59
60 #define SP " "
61
62
63
64 /*************************************************
65 * Code parameters and static tables *
66 *************************************************/
67
68 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69 into others, under special conditions. A gap of 20 between the blocks should be
70 enough. The resulting opcodes don't have to be less than 256 because they are
71 never stored, so we push them well clear of the normal opcodes. */
72
73 #define OP_PROP_EXTRA 300
74 #define OP_EXTUNI_EXTRA 320
75 #define OP_ANYNL_EXTRA 340
76 #define OP_HSPACE_EXTRA 360
77 #define OP_VSPACE_EXTRA 380
78
79
80 /* This table identifies those opcodes that are followed immediately by a
81 character that is to be tested in some way. This makes it possible to
82 centralize the loading of these characters. In the case of Type * etc, the
83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84 small value. ***NOTE*** If the start of this table is modified, the two tables
85 that follow must also be modified. */
86
87 static uschar coptable[] = {
88 0, /* End */
89 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91 0, 0, /* Any, Anybyte */
92 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95 1, /* Char */
96 1, /* Charnc */
97 1, /* not */
98 /* Positive single-char repeats */
99 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100 3, 3, 3, /* upto, minupto, exact */
101 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102 /* Negative single-char repeats - only for chars < 256 */
103 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104 3, 3, 3, /* NOT upto, minupto, exact */
105 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
106 /* Positive type repeats */
107 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108 3, 3, 3, /* Type upto, minupto, exact */
109 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110 /* Character class & ref repeats */
111 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112 0, 0, /* CRRANGE, CRMINRANGE */
113 0, /* CLASS */
114 0, /* NCLASS */
115 0, /* XCLASS - variable length */
116 0, /* REF */
117 0, /* RECURSE */
118 0, /* CALLOUT */
119 0, /* Alt */
120 0, /* Ket */
121 0, /* KetRmax */
122 0, /* KetRmin */
123 0, /* Assert */
124 0, /* Assert not */
125 0, /* Assert behind */
126 0, /* Assert behind not */
127 0, /* Reverse */
128 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129 0, 0, 0, /* SBRA, SCBRA, SCOND */
130 0, /* CREF */
131 0, /* RREF */
132 0, /* DEF */
133 0, 0 /* BRAZERO, BRAMINZERO */
134 };
135
136 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
137 and \w */
138
139 static uschar toptable1[] = {
140 0, 0, 0, 0, 0, 0,
141 ctype_digit, ctype_digit,
142 ctype_space, ctype_space,
143 ctype_word, ctype_word,
144 0 /* OP_ANY */
145 };
146
147 static uschar toptable2[] = {
148 0, 0, 0, 0, 0, 0,
149 ctype_digit, 0,
150 ctype_space, 0,
151 ctype_word, 0,
152 1 /* OP_ANY */
153 };
154
155
156 /* Structure for holding data about a particular state, which is in effect the
157 current data for an active path through the match tree. It must consist
158 entirely of ints because the working vector we are passed, and which we put
159 these structures in, is a vector of ints. */
160
/* One entry in a state list. Every member must be an int, because these
structures are laid out inside the caller-supplied workspace, which is a
vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. The result is cast to int (and
fully parenthesised) so that arithmetic mixing it with the signed workspace
size is carried out in signed int rather than being converted to size_t. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
169
170
171 #ifdef DEBUG
172 /*************************************************
173 * Print character string *
174 *************************************************/
175
176 /* Character string printing function for debugging.
177
178 Arguments:
179 p points to string
180 length number of bytes
181 f where to print
182
183 Returns: nothing
184 */
185
/* Debug helper: write a byte string to a stream, showing non-printing bytes
as \xhh escapes.

Arguments:
  p           points to the first byte (only read, never modified)
  length      number of bytes to print; nothing is printed if <= 0
  f           stream to write to

Returns:      nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
for (; length > 0; length--, p++)
  {
  int c = *p;
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
198 #endif
199
200
201
202 /*************************************************
203 * Execute a Regular Expression - DFA engine *
204 *************************************************/
205
206 /* This internal function applies a compiled pattern to a subject string,
207 starting at a given point, using a DFA engine. This function is called from the
208 external one, possibly multiple times if the pattern is not anchored. The
209 function calls itself recursively for some kinds of subpattern.
210
211 Arguments:
212 md the match_data block with fixed information
213 this_start_code the opening bracket of this subexpression's code
214 current_subject where we currently are in the subject string
215 start_offset start offset in the subject string
216 offsets vector to contain the matching string offsets
217 offsetcount size of same
218 workspace vector of workspace
219 wscount size of same
220 ims the current ims flags
221 rlevel function call recursion level
222 recursing regex recursive call level
223
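224 Returns: > 0 => number of matches whose offset pairs were stored in offsets
225 = 0 => matched, but offsets was too small to hold every match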
226 -1 => failed to match
227 < -1 => some kind of unexpected problem
228
229 The following macros are used for adding states to the two state vectors (one
230 for the current character, one for the following character). */
231
/* State-list append macros. They are not self-contained: each one uses
variables of the function in which it is expanded (active_count or new_count,
wscount, next_active_state or next_new_state, ims, and rlevel for the debug
output). If the relevant list already holds wscount entries, the macro makes
the enclosing function return PCRE_ERROR_DFA_WSSIZE.

ADD_ACTIVE and ADD_NEW set offset, count and ims only; the data member of the
new entry is left as it was. The _DATA variants also set data.

Each macro is wrapped in do { ... } while (0) so that it always behaves as a
single statement and must be terminated by a semicolon at the point of use. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
277
278 /* And now, here is the code */
279
280 static int
281 internal_dfa_exec(
282 dfa_match_data *md,
283 const uschar *this_start_code,
284 const uschar *current_subject,
285 int start_offset,
286 int *offsets,
287 int offsetcount,
288 int *workspace,
289 int wscount,
290 int ims,
291 int rlevel,
292 int recursing)
293 {
294 stateblock *active_states, *new_states, *temp_states;
295 stateblock *next_active_state, *next_new_state;
296
297 const uschar *ctypes, *lcc, *fcc;
298 const uschar *ptr;
299 const uschar *end_code, *first_op;
300
301 int active_count, new_count, match_count;
302
303 /* Some fields in the md block are frequently referenced, so we load them into
304 independent variables in the hope that this will perform better. */
305
306 const uschar *start_subject = md->start_subject;
307 const uschar *end_subject = md->end_subject;
308 const uschar *start_code = md->start_code;
309
310 #ifdef SUPPORT_UTF8
311 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
312 #else
313 BOOL utf8 = FALSE;
314 #endif
315
316 rlevel++;
317 offsetcount &= (-2);
318
319 wscount -= 2;
320 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
321 (2 * INTS_PER_STATEBLOCK);
322
323 DPRINTF(("\n%.*s---------------------\n"
324 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
325 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
326
327 ctypes = md->tables + ctypes_offset;
328 lcc = md->tables + lcc_offset;
329 fcc = md->tables + fcc_offset;
330
331 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
332
333 active_states = (stateblock *)(workspace + 2);
334 next_new_state = new_states = active_states + wscount;
335 new_count = 0;
336
337 first_op = this_start_code + 1 + LINK_SIZE +
338 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
339
340 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
341 the alternative states onto the list, and find out where the end is. This
342 makes it possible to use this function recursively, when we want to stop at a
343 matching internal ket rather than at the end.
344
345 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
346 a backward assertion. In that case, we have to find out the maximum amount to
347 move back, and set up each alternative appropriately. */
348
349 if (*first_op == OP_REVERSE)
350 {
351 int max_back = 0;
352 int gone_back;
353
354 end_code = this_start_code;
355 do
356 {
357 int back = GET(end_code, 2+LINK_SIZE);
358 if (back > max_back) max_back = back;
359 end_code += GET(end_code, 1);
360 }
361 while (*end_code == OP_ALT);
362
363 /* If we can't go back the amount required for the longest lookbehind
364 pattern, go back as far as we can; some alternatives may still be viable. */
365
366 #ifdef SUPPORT_UTF8
367 /* In character mode we have to step back character by character */
368
369 if (utf8)
370 {
371 for (gone_back = 0; gone_back < max_back; gone_back++)
372 {
373 if (current_subject <= start_subject) break;
374 current_subject--;
375 while (current_subject > start_subject &&
376 (*current_subject & 0xc0) == 0x80)
377 current_subject--;
378 }
379 }
380 else
381 #endif
382
383 /* In byte-mode we can do this quickly. */
384
385 {
386 gone_back = (current_subject - max_back < start_subject)?
387 current_subject - start_subject : max_back;
388 current_subject -= gone_back;
389 }
390
391 /* Now we can process the individual branches. */
392
393 end_code = this_start_code;
394 do
395 {
396 int back = GET(end_code, 2+LINK_SIZE);
397 if (back <= gone_back)
398 {
399 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
400 ADD_NEW_DATA(-bstate, 0, gone_back - back);
401 }
402 end_code += GET(end_code, 1);
403 }
404 while (*end_code == OP_ALT);
405 }
406
407 /* This is the code for a "normal" subpattern (not a backward assertion). The
408 start of a whole pattern is always one of these. If we are at the top level,
409 we may be asked to restart matching from the same point that we reached for a
410 previous partial match. We still have to scan through the top-level branches to
411 find the end state. */
412
413 else
414 {
415 end_code = this_start_code;
416
417 /* Restarting */
418
419 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
420 {
421 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
422 new_count = workspace[1];
423 if (!workspace[0])
424 memcpy(new_states, active_states, new_count * sizeof(stateblock));
425 }
426
427 /* Not restarting */
428
429 else
430 {
431 int length = 1 + LINK_SIZE +
432 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
433 do
434 {
435 ADD_NEW(end_code - start_code + length, 0);
436 end_code += GET(end_code, 1);
437 length = 1 + LINK_SIZE;
438 }
439 while (*end_code == OP_ALT);
440 }
441 }
442
443 workspace[0] = 0; /* Bit indicating which vector is current */
444
445 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
446
447 /* Loop for scanning the subject */
448
449 ptr = current_subject;
450 for (;;)
451 {
452 int i, j;
453 int clen, dlen;
454 unsigned int c, d;
455
456 /* Make the new state list into the active state list and empty the
457 new state list. */
458
459 temp_states = active_states;
460 active_states = new_states;
461 new_states = temp_states;
462 active_count = new_count;
463 new_count = 0;
464
465 workspace[0] ^= 1; /* Remember for the restarting feature */
466 workspace[1] = active_count;
467
468 #ifdef DEBUG
469 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
470 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
471 printf("\"\n");
472
473 printf("%.*sActive states: ", rlevel*2-2, SP);
474 for (i = 0; i < active_count; i++)
475 printf("%d/%d ", active_states[i].offset, active_states[i].count);
476 printf("\n");
477 #endif
478
479 /* Set the pointers for adding new states */
480
481 next_active_state = active_states + active_count;
482 next_new_state = new_states;
483
484 /* Load the current character from the subject outside the loop, as many
485 different states may want to look at it, and we assume that at least one
486 will. */
487
488 if (ptr < end_subject)
489 {
490 clen = 1; /* Number of bytes in the character */
491 #ifdef SUPPORT_UTF8
492 if (utf8) { GETCHARLEN(c, ptr, clen); } else
493 #endif /* SUPPORT_UTF8 */
494 c = *ptr;
495 }
496 else
497 {
498 clen = 0; /* This indicates the end of the subject */
499 c = NOTACHAR; /* This value should never actually be used */
500 }
501
502 /* Scan up the active states and act on each one. The result of an action
503 may be to add more states to the currently active list (e.g. on hitting a
504 parenthesis) or it may be to put states on the new list, for considering
505 when we move the character pointer on. */
506
507 for (i = 0; i < active_count; i++)
508 {
509 stateblock *current_state = active_states + i;
510 const uschar *code;
511 int state_offset = current_state->offset;
512 int count, codevalue;
513 #ifdef SUPPORT_UCP
514 int chartype, script;
515 #endif
516
517 #ifdef DEBUG
518 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
519 if (clen == 0) printf("EOL\n");
520 else if (c > 32 && c < 127) printf("'%c'\n", c);
521 else printf("0x%02x\n", c);
522 #endif
523
524 /* This variable is referred to implicitly in the ADD_xxx macros. */
525
526 ims = current_state->ims;
527
528 /* A negative offset is a special case meaning "hold off going to this
529 (negated) state until the number of characters in the data field have
530 been skipped". */
531
532 if (state_offset < 0)
533 {
534 if (current_state->data > 0)
535 {
536 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
537 ADD_NEW_DATA(state_offset, current_state->count,
538 current_state->data - 1);
539 continue;
540 }
541 else
542 {
543 current_state->offset = state_offset = -state_offset;
544 }
545 }
546
547 /* Check for a duplicate state with the same count, and skip if found. */
548
549 for (j = 0; j < i; j++)
550 {
551 if (active_states[j].offset == state_offset &&
552 active_states[j].count == current_state->count)
553 {
554 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
555 goto NEXT_ACTIVE_STATE;
556 }
557 }
558
559 /* The state offset is the offset to the opcode */
560
561 code = start_code + state_offset;
562 codevalue = *code;
563
564 /* If this opcode is followed by an inline character, load it. It is
565 tempting to test for the presence of a subject character here, but that
566 is wrong, because sometimes zero repetitions of the subject are
567 permitted.
568
569 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
570 argument that is not a data character - but is always one byte long. We
571 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
572 this case. To keep the other cases fast, convert these ones to new opcodes.
573 */
574
575 if (coptable[codevalue] > 0)
576 {
577 dlen = 1;
578 #ifdef SUPPORT_UTF8
579 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
580 #endif /* SUPPORT_UTF8 */
581 d = code[coptable[codevalue]];
582 if (codevalue >= OP_TYPESTAR)
583 {
584 switch(d)
585 {
586 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
587 case OP_NOTPROP:
588 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
589 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
590 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
591 case OP_NOT_HSPACE:
592 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
593 case OP_NOT_VSPACE:
594 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
595 default: break;
596 }
597 }
598 }
599 else
600 {
601 dlen = 0; /* Not strictly necessary, but compilers moan */
602 d = NOTACHAR; /* if these variables are not set. */
603 }
604
605
606 /* Now process the individual opcodes */
607
608 switch (codevalue)
609 {
610
611 /* ========================================================================== */
612 /* Reached a closing bracket. If not at the end of the pattern, carry
613 on with the next opcode. Otherwise, unless we have an empty string and
614 PCRE_NOTEMPTY is set, save the match data, shifting up all previous
615 matches so we always have the longest first. */
616
617 case OP_KET:
618 case OP_KETRMIN:
619 case OP_KETRMAX:
620 if (code != end_code)
621 {
622 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
623 if (codevalue != OP_KET)
624 {
625 ADD_ACTIVE(state_offset - GET(code, 1), 0);
626 }
627 }
628 else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
629 {
630 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
631 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
632 match_count = 0;
633 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
634 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
635 if (offsetcount >= 2)
636 {
637 offsets[0] = current_subject - start_subject;
638 offsets[1] = ptr - start_subject;
639 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
640 offsets[1] - offsets[0], current_subject));
641 }
642 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
643 {
644 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
645 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
646 match_count, rlevel*2-2, SP));
647 return match_count;
648 }
649 }
650 break;
651
652 /* ========================================================================== */
653 /* These opcodes add to the current list of states without looking
654 at the current character. */
655
656 /*-----------------------------------------------------------------*/
657 case OP_ALT:
658 do { code += GET(code, 1); } while (*code == OP_ALT);
659 ADD_ACTIVE(code - start_code, 0);
660 break;
661
662 /*-----------------------------------------------------------------*/
663 case OP_BRA:
664 case OP_SBRA:
665 do
666 {
667 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
668 code += GET(code, 1);
669 }
670 while (*code == OP_ALT);
671 break;
672
673 /*-----------------------------------------------------------------*/
674 case OP_CBRA:
675 case OP_SCBRA:
676 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
677 code += GET(code, 1);
678 while (*code == OP_ALT)
679 {
680 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
681 code += GET(code, 1);
682 }
683 break;
684
685 /*-----------------------------------------------------------------*/
686 case OP_BRAZERO:
687 case OP_BRAMINZERO:
688 ADD_ACTIVE(state_offset + 1, 0);
689 code += 1 + GET(code, 2);
690 while (*code == OP_ALT) code += GET(code, 1);
691 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
692 break;
693
694 /*-----------------------------------------------------------------*/
695 case OP_CIRC:
696 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
697 ((ims & PCRE_MULTILINE) != 0 &&
698 ptr != end_subject &&
699 WAS_NEWLINE(ptr)))
700 { ADD_ACTIVE(state_offset + 1, 0); }
701 break;
702
703 /*-----------------------------------------------------------------*/
704 case OP_EOD:
705 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
706 break;
707
708 /*-----------------------------------------------------------------*/
709 case OP_OPT:
710 ims = code[1];
711 ADD_ACTIVE(state_offset + 2, 0);
712 break;
713
714 /*-----------------------------------------------------------------*/
715 case OP_SOD:
716 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
717 break;
718
719 /*-----------------------------------------------------------------*/
720 case OP_SOM:
721 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
722 break;
723
724
725 /* ========================================================================== */
726 /* These opcodes inspect the next subject character, and sometimes
727 the previous one as well, but do not have an argument. The variable
728 clen contains the length of the current character and is zero if we are
729 at the end of the subject. */
730
731 /*-----------------------------------------------------------------*/
732 case OP_ANY:
733 if (clen > 0 && ((ims & PCRE_DOTALL) != 0 || !IS_NEWLINE(ptr)))
734 { ADD_NEW(state_offset + 1, 0); }
735 break;
736
737 /*-----------------------------------------------------------------*/
738 case OP_EODN:
739 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
740 { ADD_ACTIVE(state_offset + 1, 0); }
741 break;
742
743 /*-----------------------------------------------------------------*/
744 case OP_DOLL:
745 if ((md->moptions & PCRE_NOTEOL) == 0)
746 {
747 if (clen == 0 ||
748 (IS_NEWLINE(ptr) &&
749 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
750 ))
751 { ADD_ACTIVE(state_offset + 1, 0); }
752 }
753 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
754 { ADD_ACTIVE(state_offset + 1, 0); }
755 break;
756
757 /*-----------------------------------------------------------------*/
758
759 case OP_DIGIT:
760 case OP_WHITESPACE:
761 case OP_WORDCHAR:
762 if (clen > 0 && c < 256 &&
763 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
764 { ADD_NEW(state_offset + 1, 0); }
765 break;
766
767 /*-----------------------------------------------------------------*/
768 case OP_NOT_DIGIT:
769 case OP_NOT_WHITESPACE:
770 case OP_NOT_WORDCHAR:
771 if (clen > 0 && (c >= 256 ||
772 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
773 { ADD_NEW(state_offset + 1, 0); }
774 break;
775
776 /*-----------------------------------------------------------------*/
777 case OP_WORD_BOUNDARY:
778 case OP_NOT_WORD_BOUNDARY:
779 {
780 int left_word, right_word;
781
782 if (ptr > start_subject)
783 {
784 const uschar *temp = ptr - 1;
785 #ifdef SUPPORT_UTF8
786 if (utf8) BACKCHAR(temp);
787 #endif
788 GETCHARTEST(d, temp);
789 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
790 }
791 else left_word = 0;
792
793 if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
794 else right_word = 0;
795
796 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
797 { ADD_ACTIVE(state_offset + 1, 0); }
798 }
799 break;
800
801
802 /*-----------------------------------------------------------------*/
803 /* Check the next character by Unicode property. We will get here only
804 if the support is in the binary; otherwise a compile-time error occurs.
805 */
806
807 #ifdef SUPPORT_UCP
808 case OP_PROP:
809 case OP_NOTPROP:
810 if (clen > 0)
811 {
812 BOOL OK;
813 int category = _pcre_ucp_findprop(c, &chartype, &script);
814 switch(code[1])
815 {
816 case PT_ANY:
817 OK = TRUE;
818 break;
819
820 case PT_LAMP:
821 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
822 break;
823
824 case PT_GC:
825 OK = category == code[2];
826 break;
827
828 case PT_PC:
829 OK = chartype == code[2];
830 break;
831
832 case PT_SC:
833 OK = script == code[2];
834 break;
835
836 /* Should never occur, but keep compilers from grumbling. */
837
838 default:
839 OK = codevalue != OP_PROP;
840 break;
841 }
842
843 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
844 }
845 break;
846 #endif
847
848
849
850 /* ========================================================================== */
851 /* These opcodes likewise inspect the subject character, but have an
852 argument that is not a data character. It is one of these opcodes:
853 OP_ANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_SPACE, OP_WORDCHAR,
854 OP_NOT_WORDCHAR. The value is loaded into d. */
855
856 case OP_TYPEPLUS:
857 case OP_TYPEMINPLUS:
858 case OP_TYPEPOSPLUS:
859 count = current_state->count; /* Already matched */
860 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
861 if (clen > 0)
862 {
863 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
864 (c < 256 &&
865 (d != OP_ANY ||
866 (ims & PCRE_DOTALL) != 0 ||
867 !IS_NEWLINE(ptr)
868 ) &&
869 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
870 {
871 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
872 {
873 active_count--; /* Remove non-match possibility */
874 next_active_state--;
875 }
876 count++;
877 ADD_NEW(state_offset, count);
878 }
879 }
880 break;
881
882 /*-----------------------------------------------------------------*/
883 case OP_TYPEQUERY:
884 case OP_TYPEMINQUERY:
885 case OP_TYPEPOSQUERY:
886 ADD_ACTIVE(state_offset + 2, 0);
887 if (clen > 0)
888 {
889 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
890 (c < 256 &&
891 (d != OP_ANY ||
892 (ims & PCRE_DOTALL) != 0 ||
893 !IS_NEWLINE(ptr)
894 ) &&
895 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
896 {
897 if (codevalue == OP_TYPEPOSQUERY)
898 {
899 active_count--; /* Remove non-match possibility */
900 next_active_state--;
901 }
902 ADD_NEW(state_offset + 2, 0);
903 }
904 }
905 break;
906
907 /*-----------------------------------------------------------------*/
908 case OP_TYPESTAR:
909 case OP_TYPEMINSTAR:
910 case OP_TYPEPOSSTAR:
911 ADD_ACTIVE(state_offset + 2, 0);
912 if (clen > 0)
913 {
914 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
915 (c < 256 &&
916 (d != OP_ANY ||
917 (ims & PCRE_DOTALL) != 0 ||
918 !IS_NEWLINE(ptr)
919 ) &&
920 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
921 {
922 if (codevalue == OP_TYPEPOSSTAR)
923 {
924 active_count--; /* Remove non-match possibility */
925 next_active_state--;
926 }
927 ADD_NEW(state_offset, 0);
928 }
929 }
930 break;
931
932 /*-----------------------------------------------------------------*/
933 case OP_TYPEEXACT:
934 count = current_state->count; /* Number already matched */
935 if (clen > 0)
936 {
937 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
938 (c < 256 &&
939 (d != OP_ANY ||
940 (ims & PCRE_DOTALL) != 0 ||
941 !IS_NEWLINE(ptr)
942 ) &&
943 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944 {
945 if (++count >= GET2(code, 1))
946 { ADD_NEW(state_offset + 4, 0); }
947 else
948 { ADD_NEW(state_offset, count); }
949 }
950 }
951 break;
952
953 /*-----------------------------------------------------------------*/
954 case OP_TYPEUPTO:
955 case OP_TYPEMINUPTO:
956 case OP_TYPEPOSUPTO:
957 ADD_ACTIVE(state_offset + 4, 0);
958 count = current_state->count; /* Number already matched */
959 if (clen > 0)
960 {
961 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962 (c < 256 &&
963 (d != OP_ANY ||
964 (ims & PCRE_DOTALL) != 0 ||
965 !IS_NEWLINE(ptr)
966 ) &&
967 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
968 {
969 if (codevalue == OP_TYPEPOSUPTO)
970 {
971 active_count--; /* Remove non-match possibility */
972 next_active_state--;
973 }
974 if (++count >= GET2(code, 1))
975 { ADD_NEW(state_offset + 4, 0); }
976 else
977 { ADD_NEW(state_offset, count); }
978 }
979 }
980 break;
981
982 /* ========================================================================== */
983 /* These are virtual opcodes that are used when something like
984 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
985 argument. It keeps the code above fast for the other cases. The argument
986 is in the d variable. */
987
988 #ifdef SUPPORT_UCP
989 case OP_PROP_EXTRA + OP_TYPEPLUS:
990 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
991 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
992 count = current_state->count; /* Already matched */
993 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
994 if (clen > 0)
995 {
996 BOOL OK;
997 int category = _pcre_ucp_findprop(c, &chartype, &script);
998 switch(code[2])
999 {
1000 case PT_ANY:
1001 OK = TRUE;
1002 break;
1003
1004 case PT_LAMP:
1005 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1006 break;
1007
1008 case PT_GC:
1009 OK = category == code[3];
1010 break;
1011
1012 case PT_PC:
1013 OK = chartype == code[3];
1014 break;
1015
1016 case PT_SC:
1017 OK = script == code[3];
1018 break;
1019
1020 /* Should never occur, but keep compilers from grumbling. */
1021
1022 default:
1023 OK = codevalue != OP_PROP;
1024 break;
1025 }
1026
1027 if (OK == (d == OP_PROP))
1028 {
1029 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1030 {
1031 active_count--; /* Remove non-match possibility */
1032 next_active_state--;
1033 }
1034 count++;
1035 ADD_NEW(state_offset, count);
1036 }
1037 }
1038 break;
1039
1040 /*-----------------------------------------------------------------*/
1041 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1042 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1043 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1044 count = current_state->count; /* Already matched */
1045 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1046 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1047 {
1048 const uschar *nptr = ptr + clen;
1049 int ncount = 0;
1050 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1051 {
1052 active_count--; /* Remove non-match possibility */
1053 next_active_state--;
1054 }
1055 while (nptr < end_subject)
1056 {
1057 int nd;
1058 int ndlen = 1;
1059 GETCHARLEN(nd, nptr, ndlen);
1060 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1061 ncount++;
1062 nptr += ndlen;
1063 }
1064 count++;
1065 ADD_NEW_DATA(-state_offset, count, ncount);
1066 }
1067 break;
1068 #endif
1069
1070 /*-----------------------------------------------------------------*/
1071 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1072 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1073 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1074 count = current_state->count; /* Already matched */
1075 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1076 if (clen > 0)
1077 {
1078 int ncount = 0;
1079 switch (c)
1080 {
1081 case 0x000d:
1082 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1083 /* Fall through */
1084 case 0x000a:
1085 case 0x000b:
1086 case 0x000c:
1087 case 0x0085:
1088 case 0x2028:
1089 case 0x2029:
1090 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1091 {
1092 active_count--; /* Remove non-match possibility */
1093 next_active_state--;
1094 }
1095 count++;
1096 ADD_NEW_DATA(-state_offset, count, ncount);
1097 break;
1098 default:
1099 break;
1100 }
1101 }
1102 break;
1103
1104 /*-----------------------------------------------------------------*/
1105 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1106 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1107 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1108 count = current_state->count; /* Already matched */
1109 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1110 if (clen > 0)
1111 {
1112 BOOL OK;
1113 switch (c)
1114 {
1115 case 0x000a:
1116 case 0x000b:
1117 case 0x000c:
1118 case 0x000d:
1119 case 0x0085:
1120 case 0x2028:
1121 case 0x2029:
1122 OK = TRUE;
1123 break;
1124
1125 default:
1126 OK = FALSE;
1127 break;
1128 }
1129
1130 if (OK == (d == OP_VSPACE))
1131 {
1132 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1133 {
1134 active_count--; /* Remove non-match possibility */
1135 next_active_state--;
1136 }
1137 count++;
1138 ADD_NEW_DATA(-state_offset, count, 0);
1139 }
1140 }
1141 break;
1142
1143 /*-----------------------------------------------------------------*/
1144 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1145 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1146 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1147 count = current_state->count; /* Already matched */
1148 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1149 if (clen > 0)
1150 {
1151 BOOL OK;
1152 switch (c)
1153 {
1154 case 0x09: /* HT */
1155 case 0x20: /* SPACE */
1156 case 0xa0: /* NBSP */
1157 case 0x1680: /* OGHAM SPACE MARK */
1158 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1159 case 0x2000: /* EN QUAD */
1160 case 0x2001: /* EM QUAD */
1161 case 0x2002: /* EN SPACE */
1162 case 0x2003: /* EM SPACE */
1163 case 0x2004: /* THREE-PER-EM SPACE */
1164 case 0x2005: /* FOUR-PER-EM SPACE */
1165 case 0x2006: /* SIX-PER-EM SPACE */
1166 case 0x2007: /* FIGURE SPACE */
1167 case 0x2008: /* PUNCTUATION SPACE */
1168 case 0x2009: /* THIN SPACE */
1169 case 0x200A: /* HAIR SPACE */
1170 case 0x202f: /* NARROW NO-BREAK SPACE */
1171 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1172 case 0x3000: /* IDEOGRAPHIC SPACE */
1173 OK = TRUE;
1174 break;
1175
1176 default:
1177 OK = FALSE;
1178 break;
1179 }
1180
1181 if (OK == (d == OP_HSPACE))
1182 {
1183 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1184 {
1185 active_count--; /* Remove non-match possibility */
1186 next_active_state--;
1187 }
1188 count++;
1189 ADD_NEW_DATA(-state_offset, count, 0);
1190 }
1191 }
1192 break;
1193
1194 /*-----------------------------------------------------------------*/
1195 #ifdef SUPPORT_UCP
1196 case OP_PROP_EXTRA + OP_TYPEQUERY:
1197 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1198 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1199 count = 4;
1200 goto QS1;
1201
1202 case OP_PROP_EXTRA + OP_TYPESTAR:
1203 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1204 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1205 count = 0;
1206
1207 QS1:
1208
1209 ADD_ACTIVE(state_offset + 4, 0);
1210 if (clen > 0)
1211 {
1212 BOOL OK;
1213 int category = _pcre_ucp_findprop(c, &chartype, &script);
1214 switch(code[2])
1215 {
1216 case PT_ANY:
1217 OK = TRUE;
1218 break;
1219
1220 case PT_LAMP:
1221 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1222 break;
1223
1224 case PT_GC:
1225 OK = category == code[3];
1226 break;
1227
1228 case PT_PC:
1229 OK = chartype == code[3];
1230 break;
1231
1232 case PT_SC:
1233 OK = script == code[3];
1234 break;
1235
1236 /* Should never occur, but keep compilers from grumbling. */
1237
1238 default:
1239 OK = codevalue != OP_PROP;
1240 break;
1241 }
1242
1243 if (OK == (d == OP_PROP))
1244 {
1245 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1246 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1247 {
1248 active_count--; /* Remove non-match possibility */
1249 next_active_state--;
1250 }
1251 ADD_NEW(state_offset + count, 0);
1252 }
1253 }
1254 break;
1255
1256 /*-----------------------------------------------------------------*/
1257 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1258 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1259 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1260 count = 2;
1261 goto QS2;
1262
1263 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1264 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1265 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1266 count = 0;
1267
1268 QS2:
1269
1270 ADD_ACTIVE(state_offset + 2, 0);
1271 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1272 {
1273 const uschar *nptr = ptr + clen;
1274 int ncount = 0;
1275 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1276 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1277 {
1278 active_count--; /* Remove non-match possibility */
1279 next_active_state--;
1280 }
1281 while (nptr < end_subject)
1282 {
1283 int nd;
1284 int ndlen = 1;
1285 GETCHARLEN(nd, nptr, ndlen);
1286 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1287 ncount++;
1288 nptr += ndlen;
1289 }
1290 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1291 }
1292 break;
1293 #endif
1294
1295 /*-----------------------------------------------------------------*/
1296 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1297 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1298 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1299 count = 2;
1300 goto QS3;
1301
1302 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1303 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1304 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1305 count = 0;
1306
1307 QS3:
1308 ADD_ACTIVE(state_offset + 2, 0);
1309 if (clen > 0)
1310 {
1311 int ncount = 0;
1312 switch (c)
1313 {
1314 case 0x000d:
1315 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1316 /* Fall through */
1317 case 0x000a:
1318 case 0x000b:
1319 case 0x000c:
1320 case 0x0085:
1321 case 0x2028:
1322 case 0x2029:
1323 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1324 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1325 {
1326 active_count--; /* Remove non-match possibility */
1327 next_active_state--;
1328 }
1329 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1330 break;
1331 default:
1332 break;
1333 }
1334 }
1335 break;
1336
1337 /*-----------------------------------------------------------------*/
1338 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1339 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1340 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1341 count = 2;
1342 goto QS4;
1343
1344 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1345 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1346 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1347 count = 0;
1348
1349 QS4:
1350 ADD_ACTIVE(state_offset + 2, 0);
1351 if (clen > 0)
1352 {
1353 BOOL OK;
1354 switch (c)
1355 {
1356 case 0x000a:
1357 case 0x000b:
1358 case 0x000c:
1359 case 0x000d:
1360 case 0x0085:
1361 case 0x2028:
1362 case 0x2029:
1363 OK = TRUE;
1364 break;
1365
1366 default:
1367 OK = FALSE;
1368 break;
1369 }
1370 if (OK == (d == OP_VSPACE))
1371 {
1372 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1373 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1374 {
1375 active_count--; /* Remove non-match possibility */
1376 next_active_state--;
1377 }
1378 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1379 }
1380 }
1381 break;
1382
1383 /*-----------------------------------------------------------------*/
1384 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1385 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1386 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1387 count = 2;
1388 goto QS5;
1389
1390 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1391 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1392 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1393 count = 0;
1394
1395 QS5:
1396 ADD_ACTIVE(state_offset + 2, 0);
1397 if (clen > 0)
1398 {
1399 BOOL OK;
1400 switch (c)
1401 {
1402 case 0x09: /* HT */
1403 case 0x20: /* SPACE */
1404 case 0xa0: /* NBSP */
1405 case 0x1680: /* OGHAM SPACE MARK */
1406 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1407 case 0x2000: /* EN QUAD */
1408 case 0x2001: /* EM QUAD */
1409 case 0x2002: /* EN SPACE */
1410 case 0x2003: /* EM SPACE */
1411 case 0x2004: /* THREE-PER-EM SPACE */
1412 case 0x2005: /* FOUR-PER-EM SPACE */
1413 case 0x2006: /* SIX-PER-EM SPACE */
1414 case 0x2007: /* FIGURE SPACE */
1415 case 0x2008: /* PUNCTUATION SPACE */
1416 case 0x2009: /* THIN SPACE */
1417 case 0x200A: /* HAIR SPACE */
1418 case 0x202f: /* NARROW NO-BREAK SPACE */
1419 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1420 case 0x3000: /* IDEOGRAPHIC SPACE */
1421 OK = TRUE;
1422 break;
1423
1424 default:
1425 OK = FALSE;
1426 break;
1427 }
1428
1429 if (OK == (d == OP_HSPACE))
1430 {
1431 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1432 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1433 {
1434 active_count--; /* Remove non-match possibility */
1435 next_active_state--;
1436 }
1437 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1438 }
1439 }
1440 break;
1441
1442 /*-----------------------------------------------------------------*/
1443 #ifdef SUPPORT_UCP
1444 case OP_PROP_EXTRA + OP_TYPEEXACT:
1445 case OP_PROP_EXTRA + OP_TYPEUPTO:
1446 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1447 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1448 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1449 { ADD_ACTIVE(state_offset + 6, 0); }
1450 count = current_state->count; /* Number already matched */
1451 if (clen > 0)
1452 {
1453 BOOL OK;
1454 int category = _pcre_ucp_findprop(c, &chartype, &script);
1455 switch(code[4])
1456 {
1457 case PT_ANY:
1458 OK = TRUE;
1459 break;
1460
1461 case PT_LAMP:
1462 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1463 break;
1464
1465 case PT_GC:
1466 OK = category == code[5];
1467 break;
1468
1469 case PT_PC:
1470 OK = chartype == code[5];
1471 break;
1472
1473 case PT_SC:
1474 OK = script == code[5];
1475 break;
1476
1477 /* Should never occur, but keep compilers from grumbling. */
1478
1479 default:
1480 OK = codevalue != OP_PROP;
1481 break;
1482 }
1483
1484 if (OK == (d == OP_PROP))
1485 {
1486 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1487 {
1488 active_count--; /* Remove non-match possibility */
1489 next_active_state--;
1490 }
1491 if (++count >= GET2(code, 1))
1492 { ADD_NEW(state_offset + 6, 0); }
1493 else
1494 { ADD_NEW(state_offset, count); }
1495 }
1496 }
1497 break;
1498
1499 /*-----------------------------------------------------------------*/
1500 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1501 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1502 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1503 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1504 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1505 { ADD_ACTIVE(state_offset + 4, 0); }
1506 count = current_state->count; /* Number already matched */
1507 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1508 {
1509 const uschar *nptr = ptr + clen;
1510 int ncount = 0;
1511 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1512 {
1513 active_count--; /* Remove non-match possibility */
1514 next_active_state--;
1515 }
1516 while (nptr < end_subject)
1517 {
1518 int nd;
1519 int ndlen = 1;
1520 GETCHARLEN(nd, nptr, ndlen);
1521 if (_pcre_ucp_findprop(nd, &chartype, &script) != ucp_M) break;
1522 ncount++;
1523 nptr += ndlen;
1524 }
1525 if (++count >= GET2(code, 1))
1526 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1527 else
1528 { ADD_NEW_DATA(-state_offset, count, ncount); }
1529 }
1530 break;
1531 #endif
1532
1533 /*-----------------------------------------------------------------*/
1534 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1535 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1536 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1537 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1538 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1539 { ADD_ACTIVE(state_offset + 4, 0); }
1540 count = current_state->count; /* Number already matched */
1541 if (clen > 0)
1542 {
1543 int ncount = 0;
1544 switch (c)
1545 {
1546 case 0x000d:
1547 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1548 /* Fall through */
1549 case 0x000a:
1550 case 0x000b:
1551 case 0x000c:
1552 case 0x0085:
1553 case 0x2028:
1554 case 0x2029:
1555 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1556 {
1557 active_count--; /* Remove non-match possibility */
1558 next_active_state--;
1559 }
1560 if (++count >= GET2(code, 1))
1561 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1562 else
1563 { ADD_NEW_DATA(-state_offset, count, ncount); }
1564 break;
1565 default:
1566 break;
1567 }
1568 }
1569 break;
1570
1571 /*-----------------------------------------------------------------*/
1572 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1573 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1574 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1575 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1576 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1577 { ADD_ACTIVE(state_offset + 4, 0); }
1578 count = current_state->count; /* Number already matched */
1579 if (clen > 0)
1580 {
1581 BOOL OK;
1582 switch (c)
1583 {
1584 case 0x000a:
1585 case 0x000b:
1586 case 0x000c:
1587 case 0x000d:
1588 case 0x0085:
1589 case 0x2028:
1590 case 0x2029:
1591 OK = TRUE;
1592 break;
1593
1594 default:
1595 OK = FALSE;
1596 }
1597
1598 if (OK == (d == OP_VSPACE))
1599 {
1600 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1601 {
1602 active_count--; /* Remove non-match possibility */
1603 next_active_state--;
1604 }
1605 if (++count >= GET2(code, 1))
1606 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1607 else
1608 { ADD_NEW_DATA(-state_offset, count, 0); }
1609 }
1610 }
1611 break;
1612
1613 /*-----------------------------------------------------------------*/
1614 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1615 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1616 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1617 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1618 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1619 { ADD_ACTIVE(state_offset + 4, 0); }
1620 count = current_state->count; /* Number already matched */
1621 if (clen > 0)
1622 {
1623 BOOL OK;
1624 switch (c)
1625 {
1626 case 0x09: /* HT */
1627 case 0x20: /* SPACE */
1628 case 0xa0: /* NBSP */
1629 case 0x1680: /* OGHAM SPACE MARK */
1630 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1631 case 0x2000: /* EN QUAD */
1632 case 0x2001: /* EM QUAD */
1633 case 0x2002: /* EN SPACE */
1634 case 0x2003: /* EM SPACE */
1635 case 0x2004: /* THREE-PER-EM SPACE */
1636 case 0x2005: /* FOUR-PER-EM SPACE */
1637 case 0x2006: /* SIX-PER-EM SPACE */
1638 case 0x2007: /* FIGURE SPACE */
1639 case 0x2008: /* PUNCTUATION SPACE */
1640 case 0x2009: /* THIN SPACE */
1641 case 0x200A: /* HAIR SPACE */
1642 case 0x202f: /* NARROW NO-BREAK SPACE */
1643 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1644 case 0x3000: /* IDEOGRAPHIC SPACE */
1645 OK = TRUE;
1646 break;
1647
1648 default:
1649 OK = FALSE;
1650 break;
1651 }
1652
1653 if (OK == (d == OP_HSPACE))
1654 {
1655 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1656 {
1657 active_count--; /* Remove non-match possibility */
1658 next_active_state--;
1659 }
1660 if (++count >= GET2(code, 1))
1661 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1662 else
1663 { ADD_NEW_DATA(-state_offset, count, 0); }
1664 }
1665 }
1666 break;
1667
1668 /* ========================================================================== */
1669 /* These opcodes are followed by a character that is usually compared
1670 to the current subject character; it is loaded into d. We still get
1671 here even if there is no subject character, because in some cases zero
1672 repetitions are permitted. */
1673
1674 /*-----------------------------------------------------------------*/
1675 case OP_CHAR:
1676 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1677 break;
1678
1679 /*-----------------------------------------------------------------*/
1680 case OP_CHARNC:
1681 if (clen == 0) break;
1682
1683 #ifdef SUPPORT_UTF8
1684 if (utf8)
1685 {
1686 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1687 {
1688 unsigned int othercase;
1689 if (c < 128) othercase = fcc[c]; else
1690
1691 /* If we have Unicode property support, we can use it to test the
1692 other case of the character. */
1693
1694 #ifdef SUPPORT_UCP
1695 othercase = _pcre_ucp_othercase(c);
1696 #else
1697 othercase = NOTACHAR;
1698 #endif
1699
1700 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1701 }
1702 }
1703 else
1704 #endif /* SUPPORT_UTF8 */
1705
1706 /* Non-UTF-8 mode */
1707 {
1708 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1709 }
1710 break;
1711
1712
1713 #ifdef SUPPORT_UCP
1714 /*-----------------------------------------------------------------*/
1715 /* This is a tricky one because it can match more than one character.
1716 Find out how many characters to skip, and then set up a negative state
1717 to wait for them to pass before continuing. */
1718
1719 case OP_EXTUNI:
1720 if (clen > 0 && _pcre_ucp_findprop(c, &chartype, &script) != ucp_M)
1721 {
1722 const uschar *nptr = ptr + clen;
1723 int ncount = 0;
1724 while (nptr < end_subject)
1725 {
1726 int nclen = 1;
1727 GETCHARLEN(c, nptr, nclen);
1728 if (_pcre_ucp_findprop(c, &chartype, &script) != ucp_M) break;
1729 ncount++;
1730 nptr += nclen;
1731 }
1732 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1733 }
1734 break;
1735 #endif
1736
1737 /*-----------------------------------------------------------------*/
1738 /* This is tricky, like EXTUNI, because it too can match more than one
1739 character (when CR is followed by LF). In this case, set up a negative
1740 state to wait for one character to pass before continuing. */
1741
1742 case OP_ANYNL:
1743 if (clen > 0) switch(c)
1744 {
1745 case 0x000a:
1746 case 0x000b:
1747 case 0x000c:
1748 case 0x0085:
1749 case 0x2028:
1750 case 0x2029:
1751 ADD_NEW(state_offset + 1, 0);
1752 break;
1753 case 0x000d:
1754 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1755 {
1756 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1757 }
1758 else
1759 {
1760 ADD_NEW(state_offset + 1, 0);
1761 }
1762 break;
1763 }
1764 break;
1765
1766 /*-----------------------------------------------------------------*/
1767 case OP_NOT_VSPACE:
1768 if (clen > 0) switch(c)
1769 {
1770 case 0x000a:
1771 case 0x000b:
1772 case 0x000c:
1773 case 0x000d:
1774 case 0x0085:
1775 case 0x2028:
1776 case 0x2029:
1777 break;
1778
1779 default:
1780 ADD_NEW(state_offset + 1, 0);
1781 break;
1782 }
1783 break;
1784
1785 /*-----------------------------------------------------------------*/
1786 case OP_VSPACE:
1787 if (clen > 0) switch(c)
1788 {
1789 case 0x000a:
1790 case 0x000b:
1791 case 0x000c:
1792 case 0x000d:
1793 case 0x0085:
1794 case 0x2028:
1795 case 0x2029:
1796 ADD_NEW(state_offset + 1, 0);
1797 break;
1798
1799 default: break;
1800 }
1801 break;
1802
1803 /*-----------------------------------------------------------------*/
1804 case OP_NOT_HSPACE:
1805 if (clen > 0) switch(c)
1806 {
1807 case 0x09: /* HT */
1808 case 0x20: /* SPACE */
1809 case 0xa0: /* NBSP */
1810 case 0x1680: /* OGHAM SPACE MARK */
1811 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1812 case 0x2000: /* EN QUAD */
1813 case 0x2001: /* EM QUAD */
1814 case 0x2002: /* EN SPACE */
1815 case 0x2003: /* EM SPACE */
1816 case 0x2004: /* THREE-PER-EM SPACE */
1817 case 0x2005: /* FOUR-PER-EM SPACE */
1818 case 0x2006: /* SIX-PER-EM SPACE */
1819 case 0x2007: /* FIGURE SPACE */
1820 case 0x2008: /* PUNCTUATION SPACE */
1821 case 0x2009: /* THIN SPACE */
1822 case 0x200A: /* HAIR SPACE */
1823 case 0x202f: /* NARROW NO-BREAK SPACE */
1824 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1825 case 0x3000: /* IDEOGRAPHIC SPACE */
1826 break;
1827
1828 default:
1829 ADD_NEW(state_offset + 1, 0);
1830 break;
1831 }
1832 break;
1833
1834 /*-----------------------------------------------------------------*/
1835 case OP_HSPACE:
1836 if (clen > 0) switch(c)
1837 {
1838 case 0x09: /* HT */
1839 case 0x20: /* SPACE */
1840 case 0xa0: /* NBSP */
1841 case 0x1680: /* OGHAM SPACE MARK */
1842 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1843 case 0x2000: /* EN QUAD */
1844 case 0x2001: /* EM QUAD */
1845 case 0x2002: /* EN SPACE */
1846 case 0x2003: /* EM SPACE */
1847 case 0x2004: /* THREE-PER-EM SPACE */
1848 case 0x2005: /* FOUR-PER-EM SPACE */
1849 case 0x2006: /* SIX-PER-EM SPACE */
1850 case 0x2007: /* FIGURE SPACE */
1851 case 0x2008: /* PUNCTUATION SPACE */
1852 case 0x2009: /* THIN SPACE */
1853 case 0x200A: /* HAIR SPACE */
1854 case 0x202f: /* NARROW NO-BREAK SPACE */
1855 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1856 case 0x3000: /* IDEOGRAPHIC SPACE */
1857 ADD_NEW(state_offset + 1, 0);
1858 break;
1859 }
1860 break;
1861
1862 /*-----------------------------------------------------------------*/
1863 /* Match a negated single character. This is only used for one-byte
1864 characters, that is, we know that d < 256. The character we are
1865 checking (c) can be multibyte. */
1866
1867 case OP_NOT:
1868 if (clen > 0)
1869 {
1870 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1871 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1872 }
1873 break;
1874
1875 /*-----------------------------------------------------------------*/
1876 case OP_PLUS:
1877 case OP_MINPLUS:
1878 case OP_POSPLUS:
1879 case OP_NOTPLUS:
1880 case OP_NOTMINPLUS:
1881 case OP_NOTPOSPLUS:
1882 count = current_state->count; /* Already matched */
1883 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1884 if (clen > 0)
1885 {
1886 unsigned int otherd = NOTACHAR;
1887 if ((ims & PCRE_CASELESS) != 0)
1888 {
1889 #ifdef SUPPORT_UTF8
1890 if (utf8 && d >= 128)
1891 {
1892 #ifdef SUPPORT_UCP
1893 otherd = _pcre_ucp_othercase(d);
1894 #endif /* SUPPORT_UCP */
1895 }
1896 else
1897 #endif /* SUPPORT_UTF8 */
1898 otherd = fcc[d];
1899 }
1900 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1901 {
1902 if (count > 0 &&
1903 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1904 {
1905 active_count--; /* Remove non-match possibility */
1906 next_active_state--;
1907 }
1908 count++;
1909 ADD_NEW(state_offset, count);
1910 }
1911 }
1912 break;
1913
1914 /*-----------------------------------------------------------------*/
1915 case OP_QUERY:
1916 case OP_MINQUERY:
1917 case OP_POSQUERY:
1918 case OP_NOTQUERY:
1919 case OP_NOTMINQUERY:
1920 case OP_NOTPOSQUERY:
1921 ADD_ACTIVE(state_offset + dlen + 1, 0);
1922 if (clen > 0)
1923 {
1924 unsigned int otherd = NOTACHAR;
1925 if ((ims & PCRE_CASELESS) != 0)
1926 {
1927 #ifdef SUPPORT_UTF8
1928 if (utf8 && d >= 128)
1929 {
1930 #ifdef SUPPORT_UCP
1931 otherd = _pcre_ucp_othercase(d);
1932 #endif /* SUPPORT_UCP */
1933 }
1934 else
1935 #endif /* SUPPORT_UTF8 */
1936 otherd = fcc[d];
1937 }
1938 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1939 {
1940 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1941 {
1942 active_count--; /* Remove non-match possibility */
1943 next_active_state--;
1944 }
1945 ADD_NEW(state_offset + dlen + 1, 0);
1946 }
1947 }
1948 break;
1949
1950 /*-----------------------------------------------------------------*/
1951 case OP_STAR:
1952 case OP_MINSTAR:
1953 case OP_POSSTAR:
1954 case OP_NOTSTAR:
1955 case OP_NOTMINSTAR:
1956 case OP_NOTPOSSTAR:
1957 ADD_ACTIVE(state_offset + dlen + 1, 0);
1958 if (clen > 0)
1959 {
1960 unsigned int otherd = NOTACHAR;
1961 if ((ims & PCRE_CASELESS) != 0)
1962 {
1963 #ifdef SUPPORT_UTF8
1964 if (utf8 && d >= 128)
1965 {
1966 #ifdef SUPPORT_UCP
1967 otherd = _pcre_ucp_othercase(d);
1968 #endif /* SUPPORT_UCP */
1969 }
1970 else
1971 #endif /* SUPPORT_UTF8 */
1972 otherd = fcc[d];
1973 }
1974 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1975 {
1976 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1977 {
1978 active_count--; /* Remove non-match possibility */
1979 next_active_state--;
1980 }
1981 ADD_NEW(state_offset, 0);
1982 }
1983 }
1984 break;
1985
1986 /*-----------------------------------------------------------------*/
1987 case OP_EXACT:
1988 case OP_NOTEXACT:
1989 count = current_state->count; /* Number already matched */
1990 if (clen > 0)
1991 {
1992 unsigned int otherd = NOTACHAR;
1993 if ((ims & PCRE_CASELESS) != 0)
1994 {
1995 #ifdef SUPPORT_UTF8
1996 if (utf8 && d >= 128)
1997 {
1998 #ifdef SUPPORT_UCP
1999 otherd = _pcre_ucp_othercase(d);
2000 #endif /* SUPPORT_UCP */
2001 }
2002 else
2003 #endif /* SUPPORT_UTF8 */
2004 otherd = fcc[d];
2005 }
2006 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2007 {
2008 if (++count >= GET2(code, 1))
2009 { ADD_NEW(state_offset + dlen + 3, 0); }
2010 else
2011 { ADD_NEW(state_offset, count); }
2012 }
2013 }
2014 break;
2015
2016 /*-----------------------------------------------------------------*/
2017 case OP_UPTO:
2018 case OP_MINUPTO:
2019 case OP_POSUPTO:
2020 case OP_NOTUPTO:
2021 case OP_NOTMINUPTO:
2022 case OP_NOTPOSUPTO:
2023 ADD_ACTIVE(state_offset + dlen + 3, 0);
2024 count = current_state->count; /* Number already matched */
2025 if (clen > 0)
2026 {
2027 unsigned int otherd = NOTACHAR;
2028 if ((ims & PCRE_CASELESS) != 0)
2029 {
2030 #ifdef SUPPORT_UTF8
2031 if (utf8 && d >= 128)
2032 {
2033 #ifdef SUPPORT_UCP
2034 otherd = _pcre_ucp_othercase(d);
2035 #endif /* SUPPORT_UCP */
2036 }
2037 else
2038 #endif /* SUPPORT_UTF8 */
2039 otherd = fcc[d];
2040 }
2041 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2042 {
2043 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2044 {
2045 active_count--; /* Remove non-match possibility */
2046 next_active_state--;
2047 }
2048 if (++count >= GET2(code, 1))
2049 { ADD_NEW(state_offset + dlen + 3, 0); }
2050 else
2051 { ADD_NEW(state_offset, count); }
2052 }
2053 }
2054 break;
2055
2056
2057 /* ========================================================================== */
2058 /* These are the class-handling opcodes */
2059
2060 case OP_CLASS:
2061 case OP_NCLASS:
2062 case OP_XCLASS:
2063 {
2064 BOOL isinclass = FALSE;
2065 int next_state_offset;
2066 const uschar *ecode;
2067
2068 /* For a simple class, there is always just a 32-byte table, and we
2069 can set isinclass from it. */
2070
2071 if (codevalue != OP_XCLASS)
2072 {
2073 ecode = code + 33;
2074 if (clen > 0)
2075 {
2076 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2077 ((code[1 + c/8] & (1 << (c&7))) != 0);
2078 }
2079 }
2080
2081 /* An extended class may have a table or a list of single characters,
2082 ranges, or both, and it may be positive or negative. There's a
2083 function that sorts all this out. */
2084
2085 else
2086 {
2087 ecode = code + GET(code, 1);
2088 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2089 }
2090
2091 /* At this point, isinclass is set for all kinds of class, and ecode
2092 points to the byte after the end of the class. If there is a
2093 quantifier, this is where it will be. */
2094
2095 next_state_offset = ecode - start_code;
2096
2097 switch (*ecode)
2098 {
2099 case OP_CRSTAR:
2100 case OP_CRMINSTAR:
2101 ADD_ACTIVE(next_state_offset + 1, 0);
2102 if (isinclass) { ADD_NEW(state_offset, 0); }
2103 break;
2104
2105 case OP_CRPLUS:
2106 case OP_CRMINPLUS:
2107 count = current_state->count; /* Already matched */
2108 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2109 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2110 break;
2111
2112 case OP_CRQUERY:
2113 case OP_CRMINQUERY:
2114 ADD_ACTIVE(next_state_offset + 1, 0);
2115 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2116 break;
2117
2118 case OP_CRRANGE:
2119 case OP_CRMINRANGE:
2120 count = current_state->count; /* Already matched */
2121 if (count >= GET2(ecode, 1))
2122 { ADD_ACTIVE(next_state_offset + 5, 0); }
2123 if (isinclass)
2124 {
2125 int max = GET2(ecode, 3);
2126 if (++count >= max && max != 0) /* Max 0 => no limit */
2127 { ADD_NEW(next_state_offset + 5, 0); }
2128 else
2129 { ADD_NEW(state_offset, count); }
2130 }
2131 break;
2132
2133 default:
2134 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2135 break;
2136 }
2137 }
2138 break;
2139
2140 /* ========================================================================== */
2141 /* These are the opcodes for fancy brackets of various kinds. We have
2142 to use recursion in order to handle them. */
2143
2144 case OP_ASSERT:
2145 case OP_ASSERT_NOT:
2146 case OP_ASSERTBACK:
2147 case OP_ASSERTBACK_NOT:
2148 {
2149 int rc;
2150 int local_offsets[2];
2151 int local_workspace[1000];
2152 const uschar *endasscode = code + GET(code, 1);
2153
2154 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2155
2156 rc = internal_dfa_exec(
2157 md, /* static match data */
2158 code, /* this subexpression's code */
2159 ptr, /* where we currently are */
2160 ptr - start_subject, /* start offset */
2161 local_offsets, /* offset vector */
2162 sizeof(local_offsets)/sizeof(int), /* size of same */
2163 local_workspace, /* workspace vector */
2164 sizeof(local_workspace)/sizeof(int), /* size of same */
2165 ims, /* the current ims flags */
2166 rlevel, /* function recursion level */
2167 recursing); /* pass on regex recursion */
2168
2169 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2170 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2171 }
2172 break;
2173
2174 /*-----------------------------------------------------------------*/
2175 case OP_COND:
2176 case OP_SCOND:
2177 {
2178 int local_offsets[1000];
2179 int local_workspace[1000];
2180 int condcode = code[LINK_SIZE+1];
2181
2182 /* Back reference conditions are not supported */
2183
2184 if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2185
2186 /* The DEFINE condition is always false */
2187
2188 if (condcode == OP_DEF)
2189 {
2190 ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2191 }
2192
2193 /* The only supported version of OP_RREF is for the value RREF_ANY,
2194 which means "test if in any recursion". We can't test for specifically
2195 recursed groups. */
2196
2197 else if (condcode == OP_RREF)
2198 {
2199 int value = GET2(code, LINK_SIZE+2);
2200 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2201 if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2202 else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2203 }
2204
2205 /* Otherwise, the condition is an assertion */
2206
2207 else
2208 {
2209 int rc;
2210 const uschar *asscode = code + LINK_SIZE + 1;
2211 const uschar *endasscode = asscode + GET(asscode, 1);
2212
2213 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2214
2215 rc = internal_dfa_exec(
2216 md, /* fixed match data */
2217 asscode, /* this subexpression's code */
2218 ptr, /* where we currently are */
2219 ptr - start_subject, /* start offset */
2220 local_offsets, /* offset vector */
2221 sizeof(local_offsets)/sizeof(int), /* size of same */
2222 local_workspace, /* workspace vector */
2223 sizeof(local_workspace)/sizeof(int), /* size of same */
2224 ims, /* the current ims flags */
2225 rlevel, /* function recursion level */
2226 recursing); /* pass on regex recursion */
2227
2228 if ((rc >= 0) ==
2229 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2230 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2231 else
2232 { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2233 }
2234 }
2235 break;
2236
2237 /*-----------------------------------------------------------------*/
2238 case OP_RECURSE:
2239 {
2240 int local_offsets[1000];
2241 int local_workspace[1000];
2242 int rc;
2243
2244 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2245 recursing + 1));
2246
2247 rc = internal_dfa_exec(
2248 md, /* fixed match data */
2249 start_code + GET(code, 1), /* this subexpression's code */
2250 ptr, /* where we currently are */
2251 ptr - start_subject, /* start offset */
2252 local_offsets, /* offset vector */
2253 sizeof(local_offsets)/sizeof(int), /* size of same */
2254 local_workspace, /* workspace vector */
2255 sizeof(local_workspace)/sizeof(int), /* size of same */
2256 ims, /* the current ims flags */
2257 rlevel, /* function recursion level */
2258 recursing + 1); /* regex recurse level */
2259
2260 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2261 recursing + 1, rc));
2262
2263 /* Ran out of internal offsets */
2264
2265 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2266
2267 /* For each successful matched substring, set up the next state with a
2268 count of characters to skip before trying it. Note that the count is in
2269 characters, not bytes. */
2270
2271 if (rc > 0)
2272 {
2273 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2274 {
2275 const uschar *p = start_subject + local_offsets[rc];
2276 const uschar *pp = start_subject + local_offsets[rc+1];
2277 int charcount = local_offsets[rc+1] - local_offsets[rc];
2278 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2279 if (charcount > 0)
2280 {
2281 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2282 }
2283 else
2284 {
2285 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2286 }
2287 }
2288 }
2289 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2290 }
2291 break;
2292
2293 /*-----------------------------------------------------------------*/
2294 case OP_ONCE:
2295 {
2296 int local_offsets[2];
2297 int local_workspace[1000];
2298
2299 int rc = internal_dfa_exec(
2300 md, /* fixed match data */
2301 code, /* this subexpression's code */
2302 ptr, /* where we currently are */
2303 ptr - start_subject, /* start offset */
2304 local_offsets, /* offset vector */
2305 sizeof(local_offsets)/sizeof(int), /* size of same */
2306 local_workspace, /* workspace vector */
2307 sizeof(local_workspace)/sizeof(int), /* size of same */
2308 ims, /* the current ims flags */
2309 rlevel, /* function recursion level */
2310 recursing); /* pass on regex recursion */
2311
2312 if (rc >= 0)
2313 {
2314 const uschar *end_subpattern = code;
2315 int charcount = local_offsets[1] - local_offsets[0];
2316 int next_state_offset, repeat_state_offset;
2317
2318 do { end_subpattern += GET(end_subpattern, 1); }
2319 while (*end_subpattern == OP_ALT);
2320 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2321
2322 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2323 arrange for the repeat state also to be added to the relevant list.
2324 Calculate the offset, or set -1 for no repeat. */
2325
2326 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2327 *end_subpattern == OP_KETRMIN)?
2328 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2329
2330 /* If we have matched an empty string, add the next state at the
2331 current character pointer. This is important so that the duplicate
2332 checking kicks in, which is what breaks infinite loops that match an
2333 empty string. */
2334
2335 if (charcount == 0)
2336 {
2337 ADD_ACTIVE(next_state_offset, 0);
2338 }
2339
2340 /* Optimization: if there are no more active states, and there
2341 are no new states yet set up, then skip over the subject string
2342 right here, to save looping. Otherwise, set up the new state to swing
2343 into action when the end of the substring is reached. */
2344
2345 else if (i + 1 >= active_count && new_count == 0)
2346 {
2347 ptr += charcount;
2348 clen = 0;
2349 ADD_NEW(next_state_offset, 0);
2350
2351 /* If we are adding a repeat state at the new character position,
2352 we must fudge things so that it is the only current state.
2353 Otherwise, it might be a duplicate of one we processed before, and
2354 that would cause it to be skipped. */
2355
2356 if (repeat_state_offset >= 0)
2357 {
2358 next_active_state = active_states;
2359 active_count = 0;
2360 i = -1;
2361 ADD_ACTIVE(repeat_state_offset, 0);
2362 }
2363 }
2364 else
2365 {
2366 const uschar *p = start_subject + local_offsets[0];
2367 const uschar *pp = start_subject + local_offsets[1];
2368 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2369 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2370 if (repeat_state_offset >= 0)
2371 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2372 }
2373
2374 }
2375 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2376 }
2377 break;
2378
2379
2380 /* ========================================================================== */
2381 /* Handle callouts */
2382
2383 case OP_CALLOUT:
2384 if (pcre_callout != NULL)
2385 {
2386 int rrc;
2387 pcre_callout_block cb;
2388 cb.version = 1; /* Version 1 of the callout block */
2389 cb.callout_number = code[1];
2390 cb.offset_vector = offsets;
2391 cb.subject = (PCRE_SPTR)start_subject;
2392 cb.subject_length = end_subject - start_subject;
2393 cb.start_match = current_subject - start_subject;
2394 cb.current_position = ptr - start_subject;
2395 cb.pattern_position = GET(code, 2);
2396 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2397 cb.capture_top = 1;
2398 cb.capture_last = -1;
2399 cb.callout_data = md->callout_data;
2400 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2401 if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2402 }
2403 break;
2404
2405
2406 /* ========================================================================== */
2407 default: /* Unsupported opcode */
2408 return PCRE_ERROR_DFA_UITEM;
2409 }
2410
2411 NEXT_ACTIVE_STATE: continue;
2412
2413 } /* End of loop scanning active states */
2414
2415 /* We have finished the processing at the current subject character. If no
2416 new states have been set for the next character, we have found all the
2417 matches that we are going to find. If we are at the top level and partial
2418 matching has been requested, check for appropriate conditions. */
2419
2420 if (new_count <= 0)
2421 {
2422 if (match_count < 0 && /* No matches found */
2423 rlevel == 1 && /* Top level match function */
2424 (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2425 ptr >= end_subject && /* Reached end of subject */
2426 ptr > current_subject) /* Matched non-empty string */
2427 {
2428 if (offsetcount >= 2)
2429 {
2430 offsets[0] = current_subject - start_subject;
2431 offsets[1] = end_subject - start_subject;
2432 }
2433 match_count = PCRE_ERROR_PARTIAL;
2434 }
2435
2436 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2437 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2438 rlevel*2-2, SP));
2439 break; /* In effect, "return", but see the comment below */
2440 }
2441
2442 /* One or more states are active for the next character. */
2443
2444 ptr += clen; /* Advance to next subject character */
2445 } /* Loop to move along the subject string */
2446
2447 /* Control gets here from "break" a few lines above. We do it this way because
2448 if we use "return" above, we have compiler trouble. Some compilers warn if
2449 there's nothing here because they think the function doesn't return a value. On
2450 the other hand, if we put a dummy statement here, some more clever compilers
2451 complain that it can't be reached. Sigh. */
2452
2453 return match_count;
2454 }
2455
2456
2457
2458
2459 /*************************************************
2460 * Execute a Regular Expression - DFA engine *
2461 *************************************************/
2462
2463 /* This external function applies a compiled re to a subject string using a DFA
2464 engine. This function calls the internal function multiple times if the pattern
2465 is not anchored.
2466
2467 Arguments:
2468 argument_re points to the compiled expression
2469 extra_data points to extra data or is NULL
2470 subject points to the subject string
2471 length length of subject string (may contain binary zeros)
2472 start_offset where to start in the subject string
2473 options option bits
2474 offsets vector of match offsets
2475 offsetcount size of same
2476 workspace workspace vector
2477 wscount size of same
2478
2479 Returns: > 0 => number of match offset pairs placed in offsets
2480 = 0 => offsets overflowed; longest matches are present
2481 -1 => failed to match
2482 < -1 => some kind of unexpected problem
2483 */
2484
2485 PCRE_EXP_DEFN int
2486 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2487 const char *subject, int length, int start_offset, int options, int *offsets,
2488 int offsetcount, int *workspace, int wscount)
2489 {
2490 real_pcre *re = (real_pcre *)argument_re;
2491 dfa_match_data match_block;
2492 dfa_match_data *md = &match_block;
2493 BOOL utf8, anchored, startline, firstline;
2494 const uschar *current_subject, *end_subject, *lcc;
2495
2496 pcre_study_data internal_study;
2497 const pcre_study_data *study = NULL;
2498 real_pcre internal_re;
2499
2500 const uschar *req_byte_ptr;
2501 const uschar *start_bits = NULL;
2502 BOOL first_byte_caseless = FALSE;
2503 BOOL req_byte_caseless = FALSE;
2504 int first_byte = -1;
2505 int req_byte = -1;
2506 int req_byte2 = -1;
2507 int newline;
2508
2509 /* Plausibility checks */
2510
2511 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2512 if (re == NULL || subject == NULL || workspace == NULL ||
2513 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2514 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2515 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2516
2517 /* We need to find the pointer to any study data before we test for byte
2518 flipping, so we scan the extra_data block first. This may set two fields in the
2519 match block, so we must initialize them beforehand. However, the other fields
2520 in the match block must not be set until after the byte flipping. */
2521
2522 md->tables = re->tables;
2523 md->callout_data = NULL;
2524
2525 if (extra_data != NULL)
2526 {
2527 unsigned int flags = extra_data->flags;
2528 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2529 study = (const pcre_study_data *)extra_data->study_data;
2530 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2531 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2532 return PCRE_ERROR_DFA_UMLIMIT;
2533 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2534 md->callout_data = extra_data->callout_data;
2535 if ((flags & PCRE_EXTRA_TABLES) != 0)
2536 md->tables = extra_data->tables;
2537 }
2538
2539 /* Check that the first field in the block is the magic number. If it is not,
2540 test for a regex that was compiled on a host of opposite endianness. If this is
2541 the case, flipped values are put in internal_re and internal_study if there was
2542 study data too. */
2543
2544 if (re->magic_number != MAGIC_NUMBER)
2545 {
2546 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2547 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2548 if (study != NULL) study = &internal_study;
2549 }
2550
2551 /* Set some local values */
2552
2553 current_subject = (const unsigned char *)subject + start_offset;
2554 end_subject = (const unsigned char *)subject + length;
2555 req_byte_ptr = current_subject - 1;
2556
2557 #ifdef SUPPORT_UTF8
2558 utf8 = (re->options & PCRE_UTF8) != 0;
2559 #else
2560 utf8 = FALSE;
2561 #endif
2562
2563 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2564 (re->options & PCRE_ANCHORED) != 0;
2565
2566 /* The remaining fixed data for passing around. */
2567
2568 md->start_code = (const uschar *)argument_re +
2569 re->name_table_offset + re->name_count * re->name_entry_size;
2570 md->start_subject = (const unsigned char *)subject;
2571 md->end_subject = end_subject;
2572 md->moptions = options;
2573 md->poptions = re->options;
2574
2575 /* Handle different types of newline. The three bits give eight cases. If
2576 nothing is set at run time, whatever was used at compile time applies. */
2577
2578 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2579 PCRE_NEWLINE_BITS)
2580 {
2581 case 0: newline = NEWLINE; break; /* Compile-time default */
2582 case PCRE_NEWLINE_CR: newline = '\r'; break;
2583 case PCRE_NEWLINE_LF: newline = '\n'; break;
2584 case PCRE_NEWLINE_CR+
2585 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2586 case PCRE_NEWLINE_ANY: newline = -1; break;
2587 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2588 default: return PCRE_ERROR_BADNEWLINE;
2589 }
2590
2591 if (newline == -2)
2592 {
2593 md->nltype = NLTYPE_ANYCRLF;
2594 }
2595 else if (newline < 0)
2596 {
2597 md->nltype = NLTYPE_ANY;
2598 }
2599 else
2600 {
2601 md->nltype = NLTYPE_FIXED;
2602 if (newline > 255)
2603 {
2604 md->nllen = 2;
2605 md->nl[0] = (newline >> 8) & 255;
2606 md->nl[1] = newline & 255;
2607 }
2608 else
2609 {
2610 md->nllen = 1;
2611 md->nl[0] = newline;
2612 }
2613 }
2614
2615 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2616 back the character offset. */
2617
2618 #ifdef SUPPORT_UTF8
2619 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2620 {
2621 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2622 return PCRE_ERROR_BADUTF8;
2623 if (start_offset > 0 && start_offset < length)
2624 {
2625 int tb = ((uschar *)subject)[start_offset];
2626 if (tb > 127)
2627 {
2628 tb &= 0xc0;
2629 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2630 }
2631 }
2632 }
2633 #endif
2634
2635 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2636 is a feature that makes it possible to save compiled regex and re-use them
2637 in other programs later. */
2638
2639 if (md->tables == NULL) md->tables = _pcre_default_tables;
2640
2641 /* The lower casing table and the "must be at the start of a line" flag are
2642 used in a loop when finding where to start. */
2643
2644 lcc = md->tables + lcc_offset;
2645 startline = (re->options & PCRE_STARTLINE) != 0;
2646 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2647
2648 /* Set up the first character to match, if available. The first_byte value is
2649 never set for an anchored regular expression, but the anchoring may be forced
2650 at run time, so we have to test for anchoring. The first char may be unset for
2651 an unanchored pattern, of course. If there's no first char and the pattern was
2652 studied, there may be a bitmap of possible first characters. */
2653
2654 if (!anchored)
2655 {
2656 if ((re->options & PCRE_FIRSTSET) != 0)
2657 {
2658 first_byte = re->first_byte & 255;
2659 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2660 first_byte = lcc[first_byte];
2661 }
2662 else
2663 {
2664 if (startline && study != NULL &&
2665 (study->options & PCRE_STUDY_MAPPED) != 0)
2666 start_bits = study->start_bits;
2667 }
2668 }
2669
2670 /* For anchored or unanchored matches, there may be a "last known required
2671 character" set. */
2672
2673 if ((re->options & PCRE_REQCHSET) != 0)
2674 {
2675 req_byte = re->req_byte & 255;
2676 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2677 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2678 }
2679
2680 /* Call the main matching function, looping for a non-anchored regex after a
2681 failed match. Unless restarting, optimize by moving to the first match
2682 character if possible, when not anchored. Then unless wanting a partial match,
2683 check for a required later character. */
2684
2685 for (;;)
2686 {
2687 int rc;
2688
2689 if ((options & PCRE_DFA_RESTART) == 0)
2690 {
2691 const uschar *save_end_subject = end_subject;
2692
2693 /* Advance to a unique first char if possible. If firstline is TRUE, the
2694 start of the match is constrained to the first line of a multiline string.
2695 Implement this by temporarily adjusting end_subject so that we stop
2696 scanning at a newline. If the match fails at the newline, later code breaks
2697 this loop. */
2698
2699 if (firstline)
2700 {
2701 const uschar *t = current_subject;
2702 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2703 end_subject = t;
2704 }
2705
2706 if (first_byte >= 0)
2707 {
2708 if (first_byte_caseless)
2709 while (current_subject < end_subject &&
2710 lcc[*current_subject] != first_byte)
2711 current_subject++;
2712 else
2713 while (current_subject < end_subject && *current_subject != first_byte)
2714 current_subject++;
2715 }
2716
2717 /* Or to just after a linebreak for a multiline match if possible */
2718
2719 else if (startline)
2720 {
2721 if (current_subject > md->start_subject + start_offset)
2722 {
2723 while (current_subject <= end_subject && !WAS_NEWLINE(current_subject))
2724 current_subject++;
2725
2726 /* If we have just passed a CR and the newline option is ANY or
2727 ANYCRLF, and we are now at a LF, advance the match position by one more
2728 character. */
2729
2730 if (current_subject[-1] == '\r' &&
2731 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2732 current_subject < end_subject &&
2733 *current_subject == '\n')
2734 current_subject++;
2735 }
2736 }
2737
2738 /* Or to a non-unique first char after study */
2739
2740 else if (start_bits != NULL)
2741 {
2742 while (current_subject < end_subject)
2743 {
2744 register unsigned int c = *current_subject;
2745 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2746 else break;
2747 }
2748 }
2749
2750 /* Restore fudged end_subject */
2751
2752 end_subject = save_end_subject;
2753 }
2754
2755 /* If req_byte is set, we know that that character must appear in the subject
2756 for the match to succeed. If the first character is set, req_byte must be
2757 later in the subject; otherwise the test starts at the match point. This
2758 optimization can save a huge amount of work in patterns with nested unlimited
2759 repeats that aren't going to match. Writing separate code for cased/caseless
2760 versions makes it go faster, as does using an autoincrement and backing off
2761 on a match.
2762
2763 HOWEVER: when the subject string is very, very long, searching to its end can
2764 take a long time, and give bad performance on quite ordinary patterns. This
2765 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2766 don't do this when the string is sufficiently long.
2767
2768 ALSO: this processing is disabled when partial matching is requested.
2769 */
2770
2771 if (req_byte >= 0 &&
2772 end_subject - current_subject < REQ_BYTE_MAX &&
2773 (options & PCRE_PARTIAL) == 0)
2774 {
2775 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2776
2777 /* We don't need to repeat the search if we haven't yet reached the
2778 place we found it at last time. */
2779
2780 if (p > req_byte_ptr)
2781 {
2782 if (req_byte_caseless)
2783 {
2784 while (p < end_subject)
2785 {
2786 register int pp = *p++;
2787 if (pp == req_byte || pp == req_byte2) { p--; break; }
2788 }
2789 }
2790 else
2791 {
2792 while (p < end_subject)
2793 {
2794 if (*p++ == req_byte) { p--; break; }
2795 }
2796 }
2797
2798 /* If we can't find the required character, break the matching loop,
2799 which will cause a return or PCRE_ERROR_NOMATCH. */
2800
2801 if (p >= end_subject) break;
2802
2803 /* If we have found the required character, save the point where we
2804 found it, so that we don't search again next time round the loop if
2805 the start hasn't passed this character yet. */
2806
2807 req_byte_ptr = p;
2808 }
2809 }
2810
2811 /* OK, now we can do the business */
2812
2813 rc = internal_dfa_exec(
2814 md, /* fixed match data */
2815 md->start_code, /* this subexpression's code */
2816 current_subject, /* where we currently are */
2817 start_offset, /* start offset in subject */
2818 offsets, /* offset vector */
2819 offsetcount, /* size of same */
2820 workspace, /* workspace vector */
2821 wscount, /* size of same */
2822 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2823 0, /* function recurse level */
2824 0); /* regex recurse level */
2825
2826 /* Anything other than "no match" means we are done, always; otherwise, carry
2827 on only if not anchored. */
2828
2829 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2830
2831 /* Advance to the next subject character unless we are at the end of a line
2832 and firstline is set. */
2833
2834 if (firstline && IS_NEWLINE(current_subject)) break;
2835 current_subject++;
2836 if (utf8)
2837 {
2838 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2839 current_subject++;
2840 }
2841 if (current_subject > end_subject) break;
2842
2843 /* If we have just passed a CR and the newline option is CRLF or ANY or
2844 ANYCRLF, and we are now at a LF, advance the match position by one more
2845 character. */
2846
2847 if (current_subject[-1] == '\r' &&
2848 (md->nltype == NLTYPE_ANY ||
2849 md->nltype == NLTYPE_ANYCRLF ||
2850 md->nllen == 2) &&
2851 current_subject < end_subject &&
2852 *current_subject == '\n')
2853 current_subject++;
2854
2855 } /* "Bumpalong" loop */
2856
2857 return PCRE_ERROR_NOMATCH;
2858 }
2859
2860 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12