/[pcre]/code/branches/pcre16/pcre_dfa_exec.c
ViewVC logotype

Contents of /code/branches/pcre16/pcre_dfa_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 462 - (show annotations) (download)
Sat Oct 17 19:55:02 2009 UTC (5 years, 2 months ago) by ph10
Original Path: code/trunk/pcre_dfa_exec.c
File MIME type: text/plain
File size: 107111 byte(s)
Fix PCRE_PARTIAL_HARD for patterns that end optionally, e.g. abc*

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language (but see
7 below for why this module is different).
8
9 Written by Philip Hazel
10 Copyright (c) 1997-2009 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41
/* This module contains the external function pcre_dfa_exec(), which is an
alternative matching function that uses a sort of DFA algorithm (not a true
FSM). This is NOT Perl-compatible, but it has advantages in certain
applications. */
46
47
48 /* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49 the performance of his patterns greatly. I could not use it as it stood, as it
50 was not thread safe, and made assumptions about pattern sizes. Also, it caused
51 test 7 to loop, and test 9 to crash with a segfault.
52
53 The issue is the check for duplicate states, which is done by a simple linear
54 search up the state list. (Grep for "duplicate" below to find the code.) For
55 many patterns, there will never be many states active at one time, so a simple
56 linear search is fine. In patterns that have many active states, it might be a
57 bottleneck. The suggested code used an indexing scheme to remember which states
58 had previously been used for each character, and avoided the linear search when
59 it knew there was no chance of a duplicate. This was implemented when adding
60 states to the state lists.
61
62 I wrote some thread-safe, not-limited code to try something similar at the time
63 of checking for duplicates (instead of when adding states), using index vectors
64 on the stack. It did give a 13% improvement with one specially constructed
65 pattern for certain subject strings, but on other strings and on many of the
66 simpler patterns in the test suite it did worse. The major problem, I think,
67 was the extra time to initialize the index. This had to be done for each call
68 of internal_dfa_exec(). (The supplied patch used a static vector, initialized
69 only once - I suspect this was the cause of the problems with the tests.)
70
71 Overall, I concluded that the gains in some cases did not outweigh the losses
72 in others, so I abandoned this code. */
73
74
75
76 #ifdef HAVE_CONFIG_H
77 #include "config.h"
78 #endif
79
80 #define NLBLOCK md /* Block containing newline information */
81 #define PSSTART start_subject /* Field containing processed string start */
82 #define PSEND end_subject /* Field containing processed string end */
83
84 #include "pcre_internal.h"
85
86
/* For use to indent debugging output. The debug calls in this file print it
with "%.*s" and a precision of rlevel*2-2, so at most that many characters of
this string are written for a given recursion level.

NOTE(review): with a one-character string the indent can never exceed one
column, whatever the precision. This literal looks as if a longer run of
spaces was collapsed; confirm the intended width against the upstream file. */

#define SP " "
90
91
92 /*************************************************
93 * Code parameters and static tables *
94 *************************************************/
95
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions: internal_dfa_exec() adds one of them to
the opcode value it is about to switch on when the repeated item is a property
test, \X, \R, or a horizontal/vertical space test. A gap of 20 between the
blocks should be enough. The resulting opcodes don't have to be less than 256
because they are never stored, so we push them well clear of the normal
opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106
107
108 /* This table identifies those opcodes that are followed immediately by a
109 character that is to be tested in some way. This makes is possible to
110 centralize the loading of these characters. In the case of Type * etc, the
111 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112 small value. Non-zero values in the table are the offsets from the opcode where
113 the character is to be found. ***NOTE*** If the start of this table is
114 modified, the three tables that follow must also be modified. */
115
116 static const uschar coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
124 1, /* Char */
125 1, /* Charnc */
126 1, /* not */
127 /* Positive single-char repeats */
128 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
129 3, 3, 3, /* upto, minupto, exact */
130 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
131 /* Negative single-char repeats - only for chars < 256 */
132 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
133 3, 3, 3, /* NOT upto, minupto, exact */
134 1, 1, 1, 3, /* NOT *+, ++, ?+, updo+ */
135 /* Positive type repeats */
136 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
137 3, 3, 3, /* Type upto, minupto, exact */
138 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
139 /* Character class & ref repeats */
140 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
141 0, 0, /* CRRANGE, CRMINRANGE */
142 0, /* CLASS */
143 0, /* NCLASS */
144 0, /* XCLASS - variable length */
145 0, /* REF */
146 0, /* RECURSE */
147 0, /* CALLOUT */
148 0, /* Alt */
149 0, /* Ket */
150 0, /* KetRmax */
151 0, /* KetRmin */
152 0, /* Assert */
153 0, /* Assert not */
154 0, /* Assert behind */
155 0, /* Assert behind not */
156 0, /* Reverse */
157 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
158 0, 0, 0, /* SBRA, SCBRA, SCOND */
159 0, /* CREF */
160 0, /* RREF */
161 0, /* DEF */
162 0, 0, /* BRAZERO, BRAMINZERO */
163 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
164 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
165 };
166
167 /* This table identifies those opcodes that inspect a character. It is used to
168 remember the fact that a character could have been inspected when the end of
169 the subject is reached, in order to support PCRE_PARTIAL_HARD behaviour.
170 ***NOTE*** If the start of this table is modified, the two tables that follow
171 must also be modified. */
172
173 static const uschar poptable[] = {
174 0, /* End */
175 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
176 1, 1, 1, 1, 1, 1, /* \D, \d, \S, \s, \W, \w */
177 1, 1, 1, /* Any, AllAny, Anybyte */
178 1, 1, 1, /* NOTPROP, PROP, EXTUNI */
179 1, 1, 1, 1, 1, /* \R, \H, \h, \V, \v */
180 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
181 1, /* Char */
182 1, /* Charnc */
183 1, /* not */
184 /* Positive single-char repeats */
185 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
186 1, 1, 1, /* upto, minupto, exact */
187 1, 1, 1, 1, /* *+, ++, ?+, upto+ */
188 /* Negative single-char repeats - only for chars < 256 */
189 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
190 1, 1, 1, /* NOT upto, minupto, exact */
191 1, 1, 1, 1, /* NOT *+, ++, ?+, upto+ */
192 /* Positive type repeats */
193 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
194 1, 1, 1, /* Type upto, minupto, exact */
195 1, 1, 1, 1, /* Type *+, ++, ?+, upto+ */
196 /* Character class & ref repeats */
197 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
198 1, 1, /* CRRANGE, CRMINRANGE */
199 1, /* CLASS */
200 1, /* NCLASS */
201 1, /* XCLASS - variable length */
202 0, /* REF */
203 0, /* RECURSE */
204 0, /* CALLOUT */
205 0, /* Alt */
206 0, /* Ket */
207 0, /* KetRmax */
208 0, /* KetRmin */
209 0, /* Assert */
210 0, /* Assert not */
211 0, /* Assert behind */
212 0, /* Assert behind not */
213 0, /* Reverse */
214 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
215 0, 0, 0, /* SBRA, SCBRA, SCOND */
216 0, /* CREF */
217 0, /* RREF */
218 0, /* DEF */
219 0, 0, /* BRAZERO, BRAMINZERO */
220 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
221 0, 0, 0, 0 /* FAIL, ACCEPT, CLOSE, SKIPZERO */
222 };
223
224 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
225 and \w */
226
227 static const uschar toptable1[] = {
228 0, 0, 0, 0, 0, 0,
229 ctype_digit, ctype_digit,
230 ctype_space, ctype_space,
231 ctype_word, ctype_word,
232 0, 0 /* OP_ANY, OP_ALLANY */
233 };
234
235 static const uschar toptable2[] = {
236 0, 0, 0, 0, 0, 0,
237 ctype_digit, 0,
238 ctype_space, 0,
239 ctype_word, 0,
240 1, 1 /* OP_ANY, OP_ALLANY */
241 };
242
243
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints occupied by one stateblock. It is used to work out how many
states fit in the caller's int workspace. */

#define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
257
258
259 #ifdef DEBUG
/*************************************************
*        Print character string                  *
*************************************************/

/* Character string printing function for debugging. Each byte for which
isprint() is true is written as itself; every other byte is written as a
backslash, the letter x, and two lower-case hexadecimal digits. The string
does not need to be zero-terminated and is not modified.

Arguments:
  p         points to string
  length    number of bytes; nothing is written if this is zero or negative
  f         where to print

Returns:    nothing
*/

static void
pchars(const unsigned char *p, int length, FILE *f)
{
const unsigned char *stop;

if (length <= 0) return;

for (stop = p + length; p < stop; p++)
  {
  int c = *p;           /* Always 0..UCHAR_MAX, so safe to hand to isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
286 #endif
287
288
289
290 /*************************************************
291 * Execute a Regular Expression - DFA engine *
292 *************************************************/
293
294 /* This internal function applies a compiled pattern to a subject string,
295 starting at a given point, using a DFA engine. This function is called from the
296 external one, possibly multiple times if the pattern is not anchored. The
297 function calls itself recursively for some kinds of subpattern.
298
299 Arguments:
300 md the match_data block with fixed information
301 this_start_code the opening bracket of this subexpression's code
302 current_subject where we currently are in the subject string
303 start_offset start offset in the subject string
304 offsets vector to contain the matching string offsets
305 offsetcount size of same
306 workspace vector of workspace
307 wscount size of same
308 ims the current ims flags
309 rlevel function call recursion level
310 recursing regex recursive call level
311
312 Returns: > 0 => number of match offset pairs placed in offsets
313 = 0 => offsets overflowed; longest matches are present
314 -1 => failed to match
315 < -1 => some kind of unexpected problem
316
317 The following macros are used for adding states to the two state vectors (one
318 for the current character, one for the following character). */
319
/* State-adding macros for use inside internal_dfa_exec() only. They refer
implicitly to that function's local variables: active_count / new_count,
next_active_state / next_new_state, wscount, ims and (for debug output) rlevel.

  ADD_ACTIVE(x,y)         append a state to the active (current character) list
  ADD_ACTIVE_DATA(x,y,z)  the same, also setting the data field
  ADD_NEW(x,y)            append a state to the new (following character) list
  ADD_NEW_DATA(x,y,z)     the same, also setting the data field

where x is the state's opcode offset, y its repeat count and z its extra data.
The two-argument forms do not touch the data field of the slot they fill. The
state's ims field is always taken from the current value of ims.

If the relevant list already holds wscount states, the macro makes the
enclosing function return PCRE_ERROR_DFA_WSSIZE. The list counter is
incremented before that return.

Each macro expands to a single do { ... } while (0) statement, so it must be
followed by a semicolon and can be used as the body of an if or else without
extra braces. The arguments also appear in the DPRINTF() call, so keep them
free of side effects. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
365
366 /* And now, here is the code */
367
368 static int
369 internal_dfa_exec(
370 dfa_match_data *md,
371 const uschar *this_start_code,
372 const uschar *current_subject,
373 int start_offset,
374 int *offsets,
375 int offsetcount,
376 int *workspace,
377 int wscount,
378 int ims,
379 int rlevel,
380 int recursing)
381 {
382 stateblock *active_states, *new_states, *temp_states;
383 stateblock *next_active_state, *next_new_state;
384
385 const uschar *ctypes, *lcc, *fcc;
386 const uschar *ptr;
387 const uschar *end_code, *first_op;
388
389 int active_count, new_count, match_count;
390
391 /* Some fields in the md block are frequently referenced, so we load them into
392 independent variables in the hope that this will perform better. */
393
394 const uschar *start_subject = md->start_subject;
395 const uschar *end_subject = md->end_subject;
396 const uschar *start_code = md->start_code;
397
398 #ifdef SUPPORT_UTF8
399 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
400 #else
401 BOOL utf8 = FALSE;
402 #endif
403
404 rlevel++;
405 offsetcount &= (-2);
406
407 wscount -= 2;
408 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
409 (2 * INTS_PER_STATEBLOCK);
410
411 DPRINTF(("\n%.*s---------------------\n"
412 "%.*sCall to internal_dfa_exec f=%d r=%d\n",
413 rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
414
415 ctypes = md->tables + ctypes_offset;
416 lcc = md->tables + lcc_offset;
417 fcc = md->tables + fcc_offset;
418
419 match_count = PCRE_ERROR_NOMATCH; /* A negative number */
420
421 active_states = (stateblock *)(workspace + 2);
422 next_new_state = new_states = active_states + wscount;
423 new_count = 0;
424
425 first_op = this_start_code + 1 + LINK_SIZE +
426 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
427
428 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
429 the alternative states onto the list, and find out where the end is. This
430 makes is possible to use this function recursively, when we want to stop at a
431 matching internal ket rather than at the end.
432
433 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
434 a backward assertion. In that case, we have to find out the maximum amount to
435 move back, and set up each alternative appropriately. */
436
437 if (*first_op == OP_REVERSE)
438 {
439 int max_back = 0;
440 int gone_back;
441
442 end_code = this_start_code;
443 do
444 {
445 int back = GET(end_code, 2+LINK_SIZE);
446 if (back > max_back) max_back = back;
447 end_code += GET(end_code, 1);
448 }
449 while (*end_code == OP_ALT);
450
451 /* If we can't go back the amount required for the longest lookbehind
452 pattern, go back as far as we can; some alternatives may still be viable. */
453
454 #ifdef SUPPORT_UTF8
455 /* In character mode we have to step back character by character */
456
457 if (utf8)
458 {
459 for (gone_back = 0; gone_back < max_back; gone_back++)
460 {
461 if (current_subject <= start_subject) break;
462 current_subject--;
463 while (current_subject > start_subject &&
464 (*current_subject & 0xc0) == 0x80)
465 current_subject--;
466 }
467 }
468 else
469 #endif
470
471 /* In byte-mode we can do this quickly. */
472
473 {
474 gone_back = (current_subject - max_back < start_subject)?
475 current_subject - start_subject : max_back;
476 current_subject -= gone_back;
477 }
478
479 /* Save the earliest consulted character */
480
481 if (current_subject < md->start_used_ptr)
482 md->start_used_ptr = current_subject;
483
484 /* Now we can process the individual branches. */
485
486 end_code = this_start_code;
487 do
488 {
489 int back = GET(end_code, 2+LINK_SIZE);
490 if (back <= gone_back)
491 {
492 int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
493 ADD_NEW_DATA(-bstate, 0, gone_back - back);
494 }
495 end_code += GET(end_code, 1);
496 }
497 while (*end_code == OP_ALT);
498 }
499
500 /* This is the code for a "normal" subpattern (not a backward assertion). The
501 start of a whole pattern is always one of these. If we are at the top level,
502 we may be asked to restart matching from the same point that we reached for a
503 previous partial match. We still have to scan through the top-level branches to
504 find the end state. */
505
506 else
507 {
508 end_code = this_start_code;
509
510 /* Restarting */
511
512 if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
513 {
514 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
515 new_count = workspace[1];
516 if (!workspace[0])
517 memcpy(new_states, active_states, new_count * sizeof(stateblock));
518 }
519
520 /* Not restarting */
521
522 else
523 {
524 int length = 1 + LINK_SIZE +
525 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
526 do
527 {
528 ADD_NEW(end_code - start_code + length, 0);
529 end_code += GET(end_code, 1);
530 length = 1 + LINK_SIZE;
531 }
532 while (*end_code == OP_ALT);
533 }
534 }
535
536 workspace[0] = 0; /* Bit indicating which vector is current */
537
538 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
539
540 /* Loop for scanning the subject */
541
542 ptr = current_subject;
543 for (;;)
544 {
545 int i, j;
546 int clen, dlen;
547 unsigned int c, d;
548 int forced_fail = 0;
549 int reached_end = 0;
550 BOOL could_continue = FALSE;
551
552 /* Make the new state list into the active state list and empty the
553 new state list. */
554
555 temp_states = active_states;
556 active_states = new_states;
557 new_states = temp_states;
558 active_count = new_count;
559 new_count = 0;
560
561 workspace[0] ^= 1; /* Remember for the restarting feature */
562 workspace[1] = active_count;
563
564 #ifdef DEBUG
565 printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
566 pchars((uschar *)ptr, strlen((char *)ptr), stdout);
567 printf("\"\n");
568
569 printf("%.*sActive states: ", rlevel*2-2, SP);
570 for (i = 0; i < active_count; i++)
571 printf("%d/%d ", active_states[i].offset, active_states[i].count);
572 printf("\n");
573 #endif
574
575 /* Set the pointers for adding new states */
576
577 next_active_state = active_states + active_count;
578 next_new_state = new_states;
579
580 /* Load the current character from the subject outside the loop, as many
581 different states may want to look at it, and we assume that at least one
582 will. */
583
584 if (ptr < end_subject)
585 {
586 clen = 1; /* Number of bytes in the character */
587 #ifdef SUPPORT_UTF8
588 if (utf8) { GETCHARLEN(c, ptr, clen); } else
589 #endif /* SUPPORT_UTF8 */
590 c = *ptr;
591 }
592 else
593 {
594 clen = 0; /* This indicates the end of the subject */
595 c = NOTACHAR; /* This value should never actually be used */
596 }
597
598 /* Scan up the active states and act on each one. The result of an action
599 may be to add more states to the currently active list (e.g. on hitting a
600 parenthesis) or it may be to put states on the new list, for considering
601 when we move the character pointer on. */
602
603 for (i = 0; i < active_count; i++)
604 {
605 stateblock *current_state = active_states + i;
606 const uschar *code;
607 int state_offset = current_state->offset;
608 int count, codevalue, rrc;
609
610 #ifdef DEBUG
611 printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
612 if (clen == 0) printf("EOL\n");
613 else if (c > 32 && c < 127) printf("'%c'\n", c);
614 else printf("0x%02x\n", c);
615 #endif
616
617 /* This variable is referred to implicity in the ADD_xxx macros. */
618
619 ims = current_state->ims;
620
621 /* A negative offset is a special case meaning "hold off going to this
622 (negated) state until the number of characters in the data field have
623 been skipped". */
624
625 if (state_offset < 0)
626 {
627 if (current_state->data > 0)
628 {
629 DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
630 ADD_NEW_DATA(state_offset, current_state->count,
631 current_state->data - 1);
632 continue;
633 }
634 else
635 {
636 current_state->offset = state_offset = -state_offset;
637 }
638 }
639
640 /* Check for a duplicate state with the same count, and skip if found.
641 See the note at the head of this module about the possibility of improving
642 performance here. */
643
644 for (j = 0; j < i; j++)
645 {
646 if (active_states[j].offset == state_offset &&
647 active_states[j].count == current_state->count)
648 {
649 DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
650 goto NEXT_ACTIVE_STATE;
651 }
652 }
653
654 /* The state offset is the offset to the opcode */
655
656 code = start_code + state_offset;
657 codevalue = *code;
658
659 /* If this opcode inspects a character, but we are at the end of the
660 subject, remember the fact so that we can support PCRE_PARTIAL_HARD. */
661
662 if (clen == 0 && poptable[codevalue] != 0)
663 could_continue = TRUE;
664
665 /* If this opcode is followed by an inline character, load it. It is
666 tempting to test for the presence of a subject character here, but that
667 is wrong, because sometimes zero repetitions of the subject are
668 permitted.
669
670 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
671 argument that is not a data character - but is always one byte long. We
672 have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
673 this case. To keep the other cases fast, convert these ones to new opcodes.
674 */
675
676 if (coptable[codevalue] > 0)
677 {
678 dlen = 1;
679 #ifdef SUPPORT_UTF8
680 if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
681 #endif /* SUPPORT_UTF8 */
682 d = code[coptable[codevalue]];
683 if (codevalue >= OP_TYPESTAR)
684 {
685 switch(d)
686 {
687 case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
688 case OP_NOTPROP:
689 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
690 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
691 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
692 case OP_NOT_HSPACE:
693 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
694 case OP_NOT_VSPACE:
695 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
696 default: break;
697 }
698 }
699 }
700 else
701 {
702 dlen = 0; /* Not strictly necessary, but compilers moan */
703 d = NOTACHAR; /* if these variables are not set. */
704 }
705
706
707 /* Now process the individual opcodes */
708
709 switch (codevalue)
710 {
711
712 /* ========================================================================== */
713 /* Reached a closing bracket. If not at the end of the pattern, carry
714 on with the next opcode. Otherwise, unless we have an empty string and
715 PCRE_NOTEMPTY is set, or PCRE_NOTEMPTY_ATSTART is set and we are at the
716 start of the subject, save the match data, shifting up all previous
717 matches so we always have the longest first. */
718
719 case OP_KET:
720 case OP_KETRMIN:
721 case OP_KETRMAX:
722 if (code != end_code)
723 {
724 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
725 if (codevalue != OP_KET)
726 {
727 ADD_ACTIVE(state_offset - GET(code, 1), 0);
728 }
729 }
730 else
731 {
732 reached_end++; /* Count branches that reach the end */
733 if (ptr > current_subject ||
734 ((md->moptions & PCRE_NOTEMPTY) == 0 &&
735 ((md->moptions & PCRE_NOTEMPTY_ATSTART) == 0 ||
736 current_subject > start_subject + md->start_offset)))
737 {
738 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
739 else if (match_count > 0 && ++match_count * 2 >= offsetcount)
740 match_count = 0;
741 count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
742 if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
743 if (offsetcount >= 2)
744 {
745 offsets[0] = current_subject - start_subject;
746 offsets[1] = ptr - start_subject;
747 DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
748 offsets[1] - offsets[0], current_subject));
749 }
750 if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
751 {
752 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
753 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
754 match_count, rlevel*2-2, SP));
755 return match_count;
756 }
757 }
758 }
759 break;
760
761 /* ========================================================================== */
762 /* These opcodes add to the current list of states without looking
763 at the current character. */
764
765 /*-----------------------------------------------------------------*/
766 case OP_ALT:
767 do { code += GET(code, 1); } while (*code == OP_ALT);
768 ADD_ACTIVE(code - start_code, 0);
769 break;
770
771 /*-----------------------------------------------------------------*/
772 case OP_BRA:
773 case OP_SBRA:
774 do
775 {
776 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
777 code += GET(code, 1);
778 }
779 while (*code == OP_ALT);
780 break;
781
782 /*-----------------------------------------------------------------*/
783 case OP_CBRA:
784 case OP_SCBRA:
785 ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
786 code += GET(code, 1);
787 while (*code == OP_ALT)
788 {
789 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
790 code += GET(code, 1);
791 }
792 break;
793
794 /*-----------------------------------------------------------------*/
795 case OP_BRAZERO:
796 case OP_BRAMINZERO:
797 ADD_ACTIVE(state_offset + 1, 0);
798 code += 1 + GET(code, 2);
799 while (*code == OP_ALT) code += GET(code, 1);
800 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
801 break;
802
803 /*-----------------------------------------------------------------*/
804 case OP_SKIPZERO:
805 code += 1 + GET(code, 2);
806 while (*code == OP_ALT) code += GET(code, 1);
807 ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
808 break;
809
810 /*-----------------------------------------------------------------*/
811 case OP_CIRC:
812 if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
813 ((ims & PCRE_MULTILINE) != 0 &&
814 ptr != end_subject &&
815 WAS_NEWLINE(ptr)))
816 { ADD_ACTIVE(state_offset + 1, 0); }
817 break;
818
819 /*-----------------------------------------------------------------*/
820 case OP_EOD:
821 if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
822 break;
823
824 /*-----------------------------------------------------------------*/
825 case OP_OPT:
826 ims = code[1];
827 ADD_ACTIVE(state_offset + 2, 0);
828 break;
829
830 /*-----------------------------------------------------------------*/
831 case OP_SOD:
832 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
833 break;
834
835 /*-----------------------------------------------------------------*/
836 case OP_SOM:
837 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
838 break;
839
840
841 /* ========================================================================== */
842 /* These opcodes inspect the next subject character, and sometimes
843 the previous one as well, but do not have an argument. The variable
844 clen contains the length of the current character and is zero if we are
845 at the end of the subject. */
846
847 /*-----------------------------------------------------------------*/
848 case OP_ANY:
849 if (clen > 0 && !IS_NEWLINE(ptr))
850 { ADD_NEW(state_offset + 1, 0); }
851 break;
852
853 /*-----------------------------------------------------------------*/
854 case OP_ALLANY:
855 if (clen > 0)
856 { ADD_NEW(state_offset + 1, 0); }
857 break;
858
859 /*-----------------------------------------------------------------*/
860 case OP_EODN:
861 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
862 { ADD_ACTIVE(state_offset + 1, 0); }
863 break;
864
865 /*-----------------------------------------------------------------*/
866 case OP_DOLL:
867 if ((md->moptions & PCRE_NOTEOL) == 0)
868 {
869 if (clen == 0 ||
870 ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
871 ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
872 ))
873 { ADD_ACTIVE(state_offset + 1, 0); }
874 }
875 else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
876 { ADD_ACTIVE(state_offset + 1, 0); }
877 break;
878
879 /*-----------------------------------------------------------------*/
880
881 case OP_DIGIT:
882 case OP_WHITESPACE:
883 case OP_WORDCHAR:
884 if (clen > 0 && c < 256 &&
885 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
886 { ADD_NEW(state_offset + 1, 0); }
887 break;
888
889 /*-----------------------------------------------------------------*/
890 case OP_NOT_DIGIT:
891 case OP_NOT_WHITESPACE:
892 case OP_NOT_WORDCHAR:
893 if (clen > 0 && (c >= 256 ||
894 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
895 { ADD_NEW(state_offset + 1, 0); }
896 break;
897
898 /*-----------------------------------------------------------------*/
899 case OP_WORD_BOUNDARY:
900 case OP_NOT_WORD_BOUNDARY:
901 {
902 int left_word, right_word;
903
904 if (ptr > start_subject)
905 {
906 const uschar *temp = ptr - 1;
907 if (temp < md->start_used_ptr) md->start_used_ptr = temp;
908 #ifdef SUPPORT_UTF8
909 if (utf8) BACKCHAR(temp);
910 #endif
911 GETCHARTEST(d, temp);
912 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
913 }
914 else left_word = 0;
915
916 if (clen > 0)
917 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
918 else /* This is a fudge to ensure that if this is the */
919 { /* last item in the pattern, we don't count it as */
920 reached_end--; /* reached, thus disabling a partial match. */
921 right_word = 0;
922 }
923
924 if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
925 { ADD_ACTIVE(state_offset + 1, 0); }
926 }
927 break;
928
929
930 /*-----------------------------------------------------------------*/
931 /* Check the next character by Unicode property. We will get here only
932 if the support is in the binary; otherwise a compile-time error occurs.
933 */
934
935 #ifdef SUPPORT_UCP
936 case OP_PROP:
937 case OP_NOTPROP:
938 if (clen > 0)
939 {
940 BOOL OK;
941 const ucd_record * prop = GET_UCD(c);
942 switch(code[1])
943 {
944 case PT_ANY:
945 OK = TRUE;
946 break;
947
948 case PT_LAMP:
949 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
950 break;
951
952 case PT_GC:
953 OK = _pcre_ucp_gentype[prop->chartype] == code[2];
954 break;
955
956 case PT_PC:
957 OK = prop->chartype == code[2];
958 break;
959
960 case PT_SC:
961 OK = prop->script == code[2];
962 break;
963
964 /* Should never occur, but keep compilers from grumbling. */
965
966 default:
967 OK = codevalue != OP_PROP;
968 break;
969 }
970
971 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
972 }
973 break;
974 #endif
975
976
977
978 /* ========================================================================== */
979 /* These opcodes likewise inspect the subject character, but have an
980 argument that is not a data character. It is one of these opcodes:
981 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
982 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
983
984 case OP_TYPEPLUS:
985 case OP_TYPEMINPLUS:
986 case OP_TYPEPOSPLUS:
987 count = current_state->count; /* Already matched */
988 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
989 if (clen > 0)
990 {
991 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
992 (c < 256 &&
993 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
994 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
995 {
996 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
997 {
998 active_count--; /* Remove non-match possibility */
999 next_active_state--;
1000 }
1001 count++;
1002 ADD_NEW(state_offset, count);
1003 }
1004 }
1005 break;
1006
1007 /*-----------------------------------------------------------------*/
1008 case OP_TYPEQUERY:
1009 case OP_TYPEMINQUERY:
1010 case OP_TYPEPOSQUERY:
1011 ADD_ACTIVE(state_offset + 2, 0);
1012 if (clen > 0)
1013 {
1014 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1015 (c < 256 &&
1016 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1017 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1018 {
1019 if (codevalue == OP_TYPEPOSQUERY)
1020 {
1021 active_count--; /* Remove non-match possibility */
1022 next_active_state--;
1023 }
1024 ADD_NEW(state_offset + 2, 0);
1025 }
1026 }
1027 break;
1028
1029 /*-----------------------------------------------------------------*/
1030 case OP_TYPESTAR:
1031 case OP_TYPEMINSTAR:
1032 case OP_TYPEPOSSTAR:
1033 ADD_ACTIVE(state_offset + 2, 0);
1034 if (clen > 0)
1035 {
1036 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1037 (c < 256 &&
1038 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1039 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1040 {
1041 if (codevalue == OP_TYPEPOSSTAR)
1042 {
1043 active_count--; /* Remove non-match possibility */
1044 next_active_state--;
1045 }
1046 ADD_NEW(state_offset, 0);
1047 }
1048 }
1049 break;
1050
1051 /*-----------------------------------------------------------------*/
1052 case OP_TYPEEXACT:
1053 count = current_state->count; /* Number already matched */
1054 if (clen > 0)
1055 {
1056 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1057 (c < 256 &&
1058 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1059 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1060 {
1061 if (++count >= GET2(code, 1))
1062 { ADD_NEW(state_offset + 4, 0); }
1063 else
1064 { ADD_NEW(state_offset, count); }
1065 }
1066 }
1067 break;
1068
1069 /*-----------------------------------------------------------------*/
1070 case OP_TYPEUPTO:
1071 case OP_TYPEMINUPTO:
1072 case OP_TYPEPOSUPTO:
1073 ADD_ACTIVE(state_offset + 4, 0);
1074 count = current_state->count; /* Number already matched */
1075 if (clen > 0)
1076 {
1077 if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1078 (c < 256 &&
1079 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1080 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1081 {
1082 if (codevalue == OP_TYPEPOSUPTO)
1083 {
1084 active_count--; /* Remove non-match possibility */
1085 next_active_state--;
1086 }
1087 if (++count >= GET2(code, 1))
1088 { ADD_NEW(state_offset + 4, 0); }
1089 else
1090 { ADD_NEW(state_offset, count); }
1091 }
1092 }
1093 break;
1094
1095 /* ========================================================================== */
1096 /* These are virtual opcodes that are used when something like
1097 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1098 OP_VSPACE/OP_HSPACE types as its argument. This keeps the code above fast
1099 for the other cases. The argument is in the d variable. */
1100
1101 #ifdef SUPPORT_UCP
1102 case OP_PROP_EXTRA + OP_TYPEPLUS:
1103 case OP_PROP_EXTRA + OP_TYPEMINPLUS:
1104 case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
1105 count = current_state->count; /* Already matched */
1106 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1107 if (clen > 0)
1108 {
1109 BOOL OK;
1110 const ucd_record * prop = GET_UCD(c);
1111 switch(code[2])
1112 {
1113 case PT_ANY:
1114 OK = TRUE;
1115 break;
1116
1117 case PT_LAMP:
1118 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1119 break;
1120
1121 case PT_GC:
1122 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1123 break;
1124
1125 case PT_PC:
1126 OK = prop->chartype == code[3];
1127 break;
1128
1129 case PT_SC:
1130 OK = prop->script == code[3];
1131 break;
1132
1133 /* Should never occur, but keep compilers from grumbling. */
1134
1135 default:
1136 OK = codevalue != OP_PROP;
1137 break;
1138 }
1139
1140 if (OK == (d == OP_PROP))
1141 {
1142 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1143 {
1144 active_count--; /* Remove non-match possibility */
1145 next_active_state--;
1146 }
1147 count++;
1148 ADD_NEW(state_offset, count);
1149 }
1150 }
1151 break;
1152
1153 /*-----------------------------------------------------------------*/
1154 case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1155 case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1156 case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1157 count = current_state->count; /* Already matched */
1158 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1159 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1160 {
1161 const uschar *nptr = ptr + clen;
1162 int ncount = 0;
1163 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1164 {
1165 active_count--; /* Remove non-match possibility */
1166 next_active_state--;
1167 }
1168 while (nptr < end_subject)
1169 {
1170 int nd;
1171 int ndlen = 1;
1172 GETCHARLEN(nd, nptr, ndlen);
1173 if (UCD_CATEGORY(nd) != ucp_M) break;
1174 ncount++;
1175 nptr += ndlen;
1176 }
1177 count++;
1178 ADD_NEW_DATA(-state_offset, count, ncount);
1179 }
1180 break;
1181 #endif
1182
1183 /*-----------------------------------------------------------------*/
1184 case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1185 case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1186 case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1187 count = current_state->count; /* Already matched */
1188 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1189 if (clen > 0)
1190 {
1191 int ncount = 0;
1192 switch (c)
1193 {
1194 case 0x000b:
1195 case 0x000c:
1196 case 0x0085:
1197 case 0x2028:
1198 case 0x2029:
1199 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1200 goto ANYNL01;
1201
1202 case 0x000d:
1203 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1204 /* Fall through */
1205
1206 ANYNL01:
1207 case 0x000a:
1208 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1209 {
1210 active_count--; /* Remove non-match possibility */
1211 next_active_state--;
1212 }
1213 count++;
1214 ADD_NEW_DATA(-state_offset, count, ncount);
1215 break;
1216
1217 default:
1218 break;
1219 }
1220 }
1221 break;
1222
1223 /*-----------------------------------------------------------------*/
1224 case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1225 case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1226 case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1227 count = current_state->count; /* Already matched */
1228 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1229 if (clen > 0)
1230 {
1231 BOOL OK;
1232 switch (c)
1233 {
1234 case 0x000a:
1235 case 0x000b:
1236 case 0x000c:
1237 case 0x000d:
1238 case 0x0085:
1239 case 0x2028:
1240 case 0x2029:
1241 OK = TRUE;
1242 break;
1243
1244 default:
1245 OK = FALSE;
1246 break;
1247 }
1248
1249 if (OK == (d == OP_VSPACE))
1250 {
1251 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1252 {
1253 active_count--; /* Remove non-match possibility */
1254 next_active_state--;
1255 }
1256 count++;
1257 ADD_NEW_DATA(-state_offset, count, 0);
1258 }
1259 }
1260 break;
1261
1262 /*-----------------------------------------------------------------*/
1263 case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1264 case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1265 case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1266 count = current_state->count; /* Already matched */
1267 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1268 if (clen > 0)
1269 {
1270 BOOL OK;
1271 switch (c)
1272 {
1273 case 0x09: /* HT */
1274 case 0x20: /* SPACE */
1275 case 0xa0: /* NBSP */
1276 case 0x1680: /* OGHAM SPACE MARK */
1277 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1278 case 0x2000: /* EN QUAD */
1279 case 0x2001: /* EM QUAD */
1280 case 0x2002: /* EN SPACE */
1281 case 0x2003: /* EM SPACE */
1282 case 0x2004: /* THREE-PER-EM SPACE */
1283 case 0x2005: /* FOUR-PER-EM SPACE */
1284 case 0x2006: /* SIX-PER-EM SPACE */
1285 case 0x2007: /* FIGURE SPACE */
1286 case 0x2008: /* PUNCTUATION SPACE */
1287 case 0x2009: /* THIN SPACE */
1288 case 0x200A: /* HAIR SPACE */
1289 case 0x202f: /* NARROW NO-BREAK SPACE */
1290 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1291 case 0x3000: /* IDEOGRAPHIC SPACE */
1292 OK = TRUE;
1293 break;
1294
1295 default:
1296 OK = FALSE;
1297 break;
1298 }
1299
1300 if (OK == (d == OP_HSPACE))
1301 {
1302 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1303 {
1304 active_count--; /* Remove non-match possibility */
1305 next_active_state--;
1306 }
1307 count++;
1308 ADD_NEW_DATA(-state_offset, count, 0);
1309 }
1310 }
1311 break;
1312
1313 /*-----------------------------------------------------------------*/
1314 #ifdef SUPPORT_UCP
1315 case OP_PROP_EXTRA + OP_TYPEQUERY:
1316 case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1317 case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1318 count = 4;
1319 goto QS1;
1320
1321 case OP_PROP_EXTRA + OP_TYPESTAR:
1322 case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1323 case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1324 count = 0;
1325
1326 QS1:
1327
1328 ADD_ACTIVE(state_offset + 4, 0);
1329 if (clen > 0)
1330 {
1331 BOOL OK;
1332 const ucd_record * prop = GET_UCD(c);
1333 switch(code[2])
1334 {
1335 case PT_ANY:
1336 OK = TRUE;
1337 break;
1338
1339 case PT_LAMP:
1340 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1341 break;
1342
1343 case PT_GC:
1344 OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1345 break;
1346
1347 case PT_PC:
1348 OK = prop->chartype == code[3];
1349 break;
1350
1351 case PT_SC:
1352 OK = prop->script == code[3];
1353 break;
1354
1355 /* Should never occur, but keep compilers from grumbling. */
1356
1357 default:
1358 OK = codevalue != OP_PROP;
1359 break;
1360 }
1361
1362 if (OK == (d == OP_PROP))
1363 {
1364 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1365 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1366 {
1367 active_count--; /* Remove non-match possibility */
1368 next_active_state--;
1369 }
1370 ADD_NEW(state_offset + count, 0);
1371 }
1372 }
1373 break;
1374
1375 /*-----------------------------------------------------------------*/
1376 case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1377 case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1378 case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1379 count = 2;
1380 goto QS2;
1381
1382 case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1383 case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1384 case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1385 count = 0;
1386
1387 QS2:
1388
1389 ADD_ACTIVE(state_offset + 2, 0);
1390 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1391 {
1392 const uschar *nptr = ptr + clen;
1393 int ncount = 0;
1394 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1395 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1396 {
1397 active_count--; /* Remove non-match possibility */
1398 next_active_state--;
1399 }
1400 while (nptr < end_subject)
1401 {
1402 int nd;
1403 int ndlen = 1;
1404 GETCHARLEN(nd, nptr, ndlen);
1405 if (UCD_CATEGORY(nd) != ucp_M) break;
1406 ncount++;
1407 nptr += ndlen;
1408 }
1409 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1410 }
1411 break;
1412 #endif
1413
1414 /*-----------------------------------------------------------------*/
1415 case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1416 case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1417 case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1418 count = 2;
1419 goto QS3;
1420
1421 case OP_ANYNL_EXTRA + OP_TYPESTAR:
1422 case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1423 case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1424 count = 0;
1425
1426 QS3:
1427 ADD_ACTIVE(state_offset + 2, 0);
1428 if (clen > 0)
1429 {
1430 int ncount = 0;
1431 switch (c)
1432 {
1433 case 0x000b:
1434 case 0x000c:
1435 case 0x0085:
1436 case 0x2028:
1437 case 0x2029:
1438 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1439 goto ANYNL02;
1440
1441 case 0x000d:
1442 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1443 /* Fall through */
1444
1445 ANYNL02:
1446 case 0x000a:
1447 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1448 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1449 {
1450 active_count--; /* Remove non-match possibility */
1451 next_active_state--;
1452 }
1453 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1454 break;
1455
1456 default:
1457 break;
1458 }
1459 }
1460 break;
1461
1462 /*-----------------------------------------------------------------*/
1463 case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1464 case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1465 case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1466 count = 2;
1467 goto QS4;
1468
1469 case OP_VSPACE_EXTRA + OP_TYPESTAR:
1470 case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1471 case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1472 count = 0;
1473
1474 QS4:
1475 ADD_ACTIVE(state_offset + 2, 0);
1476 if (clen > 0)
1477 {
1478 BOOL OK;
1479 switch (c)
1480 {
1481 case 0x000a:
1482 case 0x000b:
1483 case 0x000c:
1484 case 0x000d:
1485 case 0x0085:
1486 case 0x2028:
1487 case 0x2029:
1488 OK = TRUE;
1489 break;
1490
1491 default:
1492 OK = FALSE;
1493 break;
1494 }
1495 if (OK == (d == OP_VSPACE))
1496 {
1497 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1498 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1499 {
1500 active_count--; /* Remove non-match possibility */
1501 next_active_state--;
1502 }
1503 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1504 }
1505 }
1506 break;
1507
1508 /*-----------------------------------------------------------------*/
1509 case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1510 case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1511 case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1512 count = 2;
1513 goto QS5;
1514
1515 case OP_HSPACE_EXTRA + OP_TYPESTAR:
1516 case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1517 case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1518 count = 0;
1519
1520 QS5:
1521 ADD_ACTIVE(state_offset + 2, 0);
1522 if (clen > 0)
1523 {
1524 BOOL OK;
1525 switch (c)
1526 {
1527 case 0x09: /* HT */
1528 case 0x20: /* SPACE */
1529 case 0xa0: /* NBSP */
1530 case 0x1680: /* OGHAM SPACE MARK */
1531 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1532 case 0x2000: /* EN QUAD */
1533 case 0x2001: /* EM QUAD */
1534 case 0x2002: /* EN SPACE */
1535 case 0x2003: /* EM SPACE */
1536 case 0x2004: /* THREE-PER-EM SPACE */
1537 case 0x2005: /* FOUR-PER-EM SPACE */
1538 case 0x2006: /* SIX-PER-EM SPACE */
1539 case 0x2007: /* FIGURE SPACE */
1540 case 0x2008: /* PUNCTUATION SPACE */
1541 case 0x2009: /* THIN SPACE */
1542 case 0x200A: /* HAIR SPACE */
1543 case 0x202f: /* NARROW NO-BREAK SPACE */
1544 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1545 case 0x3000: /* IDEOGRAPHIC SPACE */
1546 OK = TRUE;
1547 break;
1548
1549 default:
1550 OK = FALSE;
1551 break;
1552 }
1553
1554 if (OK == (d == OP_HSPACE))
1555 {
1556 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1557 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1558 {
1559 active_count--; /* Remove non-match possibility */
1560 next_active_state--;
1561 }
1562 ADD_NEW_DATA(-(state_offset + count), 0, 0);
1563 }
1564 }
1565 break;
1566
1567 /*-----------------------------------------------------------------*/
1568 #ifdef SUPPORT_UCP
1569 case OP_PROP_EXTRA + OP_TYPEEXACT:
1570 case OP_PROP_EXTRA + OP_TYPEUPTO:
1571 case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1572 case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1573 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1574 { ADD_ACTIVE(state_offset + 6, 0); }
1575 count = current_state->count; /* Number already matched */
1576 if (clen > 0)
1577 {
1578 BOOL OK;
1579 const ucd_record * prop = GET_UCD(c);
1580 switch(code[4])
1581 {
1582 case PT_ANY:
1583 OK = TRUE;
1584 break;
1585
1586 case PT_LAMP:
1587 OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1588 break;
1589
1590 case PT_GC:
1591 OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1592 break;
1593
1594 case PT_PC:
1595 OK = prop->chartype == code[5];
1596 break;
1597
1598 case PT_SC:
1599 OK = prop->script == code[5];
1600 break;
1601
1602 /* Should never occur, but keep compilers from grumbling. */
1603
1604 default:
1605 OK = codevalue != OP_PROP;
1606 break;
1607 }
1608
1609 if (OK == (d == OP_PROP))
1610 {
1611 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1612 {
1613 active_count--; /* Remove non-match possibility */
1614 next_active_state--;
1615 }
1616 if (++count >= GET2(code, 1))
1617 { ADD_NEW(state_offset + 6, 0); }
1618 else
1619 { ADD_NEW(state_offset, count); }
1620 }
1621 }
1622 break;
1623
1624 /*-----------------------------------------------------------------*/
1625 case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1626 case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1627 case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1628 case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1629 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1630 { ADD_ACTIVE(state_offset + 4, 0); }
1631 count = current_state->count; /* Number already matched */
1632 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1633 {
1634 const uschar *nptr = ptr + clen;
1635 int ncount = 0;
1636 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1637 {
1638 active_count--; /* Remove non-match possibility */
1639 next_active_state--;
1640 }
1641 while (nptr < end_subject)
1642 {
1643 int nd;
1644 int ndlen = 1;
1645 GETCHARLEN(nd, nptr, ndlen);
1646 if (UCD_CATEGORY(nd) != ucp_M) break;
1647 ncount++;
1648 nptr += ndlen;
1649 }
1650 if (++count >= GET2(code, 1))
1651 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1652 else
1653 { ADD_NEW_DATA(-state_offset, count, ncount); }
1654 }
1655 break;
1656 #endif
1657
1658 /*-----------------------------------------------------------------*/
1659 case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1660 case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1661 case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1662 case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1663 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1664 { ADD_ACTIVE(state_offset + 4, 0); }
1665 count = current_state->count; /* Number already matched */
1666 if (clen > 0)
1667 {
1668 int ncount = 0;
1669 switch (c)
1670 {
1671 case 0x000b:
1672 case 0x000c:
1673 case 0x0085:
1674 case 0x2028:
1675 case 0x2029:
1676 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1677 goto ANYNL03;
1678
1679 case 0x000d:
1680 if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1681 /* Fall through */
1682
1683 ANYNL03:
1684 case 0x000a:
1685 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1686 {
1687 active_count--; /* Remove non-match possibility */
1688 next_active_state--;
1689 }
1690 if (++count >= GET2(code, 1))
1691 { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1692 else
1693 { ADD_NEW_DATA(-state_offset, count, ncount); }
1694 break;
1695
1696 default:
1697 break;
1698 }
1699 }
1700 break;
1701
1702 /*-----------------------------------------------------------------*/
1703 case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1704 case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1705 case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1706 case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1707 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1708 { ADD_ACTIVE(state_offset + 4, 0); }
1709 count = current_state->count; /* Number already matched */
1710 if (clen > 0)
1711 {
1712 BOOL OK;
1713 switch (c)
1714 {
1715 case 0x000a:
1716 case 0x000b:
1717 case 0x000c:
1718 case 0x000d:
1719 case 0x0085:
1720 case 0x2028:
1721 case 0x2029:
1722 OK = TRUE;
1723 break;
1724
1725 default:
1726 OK = FALSE;
1727 }
1728
1729 if (OK == (d == OP_VSPACE))
1730 {
1731 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1732 {
1733 active_count--; /* Remove non-match possibility */
1734 next_active_state--;
1735 }
1736 if (++count >= GET2(code, 1))
1737 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1738 else
1739 { ADD_NEW_DATA(-state_offset, count, 0); }
1740 }
1741 }
1742 break;
1743
1744 /*-----------------------------------------------------------------*/
1745 case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1746 case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1747 case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1748 case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1749 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1750 { ADD_ACTIVE(state_offset + 4, 0); }
1751 count = current_state->count; /* Number already matched */
1752 if (clen > 0)
1753 {
1754 BOOL OK;
1755 switch (c)
1756 {
1757 case 0x09: /* HT */
1758 case 0x20: /* SPACE */
1759 case 0xa0: /* NBSP */
1760 case 0x1680: /* OGHAM SPACE MARK */
1761 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1762 case 0x2000: /* EN QUAD */
1763 case 0x2001: /* EM QUAD */
1764 case 0x2002: /* EN SPACE */
1765 case 0x2003: /* EM SPACE */
1766 case 0x2004: /* THREE-PER-EM SPACE */
1767 case 0x2005: /* FOUR-PER-EM SPACE */
1768 case 0x2006: /* SIX-PER-EM SPACE */
1769 case 0x2007: /* FIGURE SPACE */
1770 case 0x2008: /* PUNCTUATION SPACE */
1771 case 0x2009: /* THIN SPACE */
1772 case 0x200A: /* HAIR SPACE */
1773 case 0x202f: /* NARROW NO-BREAK SPACE */
1774 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1775 case 0x3000: /* IDEOGRAPHIC SPACE */
1776 OK = TRUE;
1777 break;
1778
1779 default:
1780 OK = FALSE;
1781 break;
1782 }
1783
1784 if (OK == (d == OP_HSPACE))
1785 {
1786 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1787 {
1788 active_count--; /* Remove non-match possibility */
1789 next_active_state--;
1790 }
1791 if (++count >= GET2(code, 1))
1792 { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1793 else
1794 { ADD_NEW_DATA(-state_offset, count, 0); }
1795 }
1796 }
1797 break;
1798
1799 /* ========================================================================== */
1800 /* These opcodes are followed by a character that is usually compared
1801 to the current subject character; it is loaded into d. We still get
1802 here even if there is no subject character, because in some cases zero
1803 repetitions are permitted. */
1804
1805 /*-----------------------------------------------------------------*/
1806 case OP_CHAR:
1807 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1808 break;
1809
1810 /*-----------------------------------------------------------------*/
1811 case OP_CHARNC:
1812 if (clen == 0) break;
1813
1814 #ifdef SUPPORT_UTF8
1815 if (utf8)
1816 {
1817 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1818 {
1819 unsigned int othercase;
1820 if (c < 128) othercase = fcc[c]; else
1821
1822 /* If we have Unicode property support, we can use it to test the
1823 other case of the character. */
1824
1825 #ifdef SUPPORT_UCP
1826 othercase = UCD_OTHERCASE(c);
1827 #else
1828 othercase = NOTACHAR;
1829 #endif
1830
1831 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1832 }
1833 }
1834 else
1835 #endif /* SUPPORT_UTF8 */
1836
1837 /* Non-UTF-8 mode */
1838 {
1839 if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1840 }
1841 break;
1842
1843
1844 #ifdef SUPPORT_UCP
1845 /*-----------------------------------------------------------------*/
1846 /* This is a tricky one because it can match more than one character.
1847 Find out how many characters to skip, and then set up a negative state
1848 to wait for them to pass before continuing. */
1849
1850 case OP_EXTUNI:
1851 if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1852 {
1853 const uschar *nptr = ptr + clen;
1854 int ncount = 0;
1855 while (nptr < end_subject)
1856 {
1857 int nclen = 1;
1858 GETCHARLEN(c, nptr, nclen);
1859 if (UCD_CATEGORY(c) != ucp_M) break;
1860 ncount++;
1861 nptr += nclen;
1862 }
1863 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1864 }
1865 break;
1866 #endif
1867
1868 /*-----------------------------------------------------------------*/
1869 /* This is tricky like EXTUNI because it too can match more than one
1870 character (when CR is followed by LF). In this case, set up a negative
1871 state to wait for one character to pass before continuing. */
1872
1873 case OP_ANYNL:
1874 if (clen > 0) switch(c)
1875 {
1876 case 0x000b:
1877 case 0x000c:
1878 case 0x0085:
1879 case 0x2028:
1880 case 0x2029:
1881 if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1882
1883 case 0x000a:
1884 ADD_NEW(state_offset + 1, 0);
1885 break;
1886
1887 case 0x000d:
1888 if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1889 {
1890 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1891 }
1892 else
1893 {
1894 ADD_NEW(state_offset + 1, 0);
1895 }
1896 break;
1897 }
1898 break;
1899
1900 /*-----------------------------------------------------------------*/
1901 case OP_NOT_VSPACE:
1902 if (clen > 0) switch(c)
1903 {
1904 case 0x000a:
1905 case 0x000b:
1906 case 0x000c:
1907 case 0x000d:
1908 case 0x0085:
1909 case 0x2028:
1910 case 0x2029:
1911 break;
1912
1913 default:
1914 ADD_NEW(state_offset + 1, 0);
1915 break;
1916 }
1917 break;
1918
1919 /*-----------------------------------------------------------------*/
1920 case OP_VSPACE:
1921 if (clen > 0) switch(c)
1922 {
1923 case 0x000a:
1924 case 0x000b:
1925 case 0x000c:
1926 case 0x000d:
1927 case 0x0085:
1928 case 0x2028:
1929 case 0x2029:
1930 ADD_NEW(state_offset + 1, 0);
1931 break;
1932
1933 default: break;
1934 }
1935 break;
1936
1937 /*-----------------------------------------------------------------*/
1938 case OP_NOT_HSPACE:
1939 if (clen > 0) switch(c)
1940 {
1941 case 0x09: /* HT */
1942 case 0x20: /* SPACE */
1943 case 0xa0: /* NBSP */
1944 case 0x1680: /* OGHAM SPACE MARK */
1945 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1946 case 0x2000: /* EN QUAD */
1947 case 0x2001: /* EM QUAD */
1948 case 0x2002: /* EN SPACE */
1949 case 0x2003: /* EM SPACE */
1950 case 0x2004: /* THREE-PER-EM SPACE */
1951 case 0x2005: /* FOUR-PER-EM SPACE */
1952 case 0x2006: /* SIX-PER-EM SPACE */
1953 case 0x2007: /* FIGURE SPACE */
1954 case 0x2008: /* PUNCTUATION SPACE */
1955 case 0x2009: /* THIN SPACE */
1956 case 0x200A: /* HAIR SPACE */
1957 case 0x202f: /* NARROW NO-BREAK SPACE */
1958 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1959 case 0x3000: /* IDEOGRAPHIC SPACE */
1960 break;
1961
1962 default:
1963 ADD_NEW(state_offset + 1, 0);
1964 break;
1965 }
1966 break;
1967
1968 /*-----------------------------------------------------------------*/
1969 case OP_HSPACE:
1970 if (clen > 0) switch(c)
1971 {
1972 case 0x09: /* HT */
1973 case 0x20: /* SPACE */
1974 case 0xa0: /* NBSP */
1975 case 0x1680: /* OGHAM SPACE MARK */
1976 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1977 case 0x2000: /* EN QUAD */
1978 case 0x2001: /* EM QUAD */
1979 case 0x2002: /* EN SPACE */
1980 case 0x2003: /* EM SPACE */
1981 case 0x2004: /* THREE-PER-EM SPACE */
1982 case 0x2005: /* FOUR-PER-EM SPACE */
1983 case 0x2006: /* SIX-PER-EM SPACE */
1984 case 0x2007: /* FIGURE SPACE */
1985 case 0x2008: /* PUNCTUATION SPACE */
1986 case 0x2009: /* THIN SPACE */
1987 case 0x200A: /* HAIR SPACE */
1988 case 0x202f: /* NARROW NO-BREAK SPACE */
1989 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1990 case 0x3000: /* IDEOGRAPHIC SPACE */
1991 ADD_NEW(state_offset + 1, 0);
1992 break;
1993 }
1994 break;
1995
1996 /*-----------------------------------------------------------------*/
1997 /* Match a negated single character. This is only used for one-byte
1998 characters, that is, we know that d < 256. The character we are
1999 checking (c) can be multibyte. */
2000
2001 case OP_NOT:
2002 if (clen > 0)
2003 {
2004 unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
2005 if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
2006 }
2007 break;
2008
2009 /*-----------------------------------------------------------------*/
2010 case OP_PLUS:
2011 case OP_MINPLUS:
2012 case OP_POSPLUS:
2013 case OP_NOTPLUS:
2014 case OP_NOTMINPLUS:
2015 case OP_NOTPOSPLUS:
2016 count = current_state->count; /* Already matched */
2017 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2018 if (clen > 0)
2019 {
2020 unsigned int otherd = NOTACHAR;
2021 if ((ims & PCRE_CASELESS) != 0)
2022 {
2023 #ifdef SUPPORT_UTF8
2024 if (utf8 && d >= 128)
2025 {
2026 #ifdef SUPPORT_UCP
2027 otherd = UCD_OTHERCASE(d);
2028 #endif /* SUPPORT_UCP */
2029 }
2030 else
2031 #endif /* SUPPORT_UTF8 */
2032 otherd = fcc[d];
2033 }
2034 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2035 {
2036 if (count > 0 &&
2037 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2038 {
2039 active_count--; /* Remove non-match possibility */
2040 next_active_state--;
2041 }
2042 count++;
2043 ADD_NEW(state_offset, count);
2044 }
2045 }
2046 break;
2047
2048 /*-----------------------------------------------------------------*/
2049 case OP_QUERY:
2050 case OP_MINQUERY:
2051 case OP_POSQUERY:
2052 case OP_NOTQUERY:
2053 case OP_NOTMINQUERY:
2054 case OP_NOTPOSQUERY:
2055 ADD_ACTIVE(state_offset + dlen + 1, 0);
2056 if (clen > 0)
2057 {
2058 unsigned int otherd = NOTACHAR;
2059 if ((ims & PCRE_CASELESS) != 0)
2060 {
2061 #ifdef SUPPORT_UTF8
2062 if (utf8 && d >= 128)
2063 {
2064 #ifdef SUPPORT_UCP
2065 otherd = UCD_OTHERCASE(d);
2066 #endif /* SUPPORT_UCP */
2067 }
2068 else
2069 #endif /* SUPPORT_UTF8 */
2070 otherd = fcc[d];
2071 }
2072 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2073 {
2074 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2075 {
2076 active_count--; /* Remove non-match possibility */
2077 next_active_state--;
2078 }
2079 ADD_NEW(state_offset + dlen + 1, 0);
2080 }
2081 }
2082 break;
2083
2084 /*-----------------------------------------------------------------*/
2085 case OP_STAR:
2086 case OP_MINSTAR:
2087 case OP_POSSTAR:
2088 case OP_NOTSTAR:
2089 case OP_NOTMINSTAR:
2090 case OP_NOTPOSSTAR:
2091 ADD_ACTIVE(state_offset + dlen + 1, 0);
2092 if (clen > 0)
2093 {
2094 unsigned int otherd = NOTACHAR;
2095 if ((ims & PCRE_CASELESS) != 0)
2096 {
2097 #ifdef SUPPORT_UTF8
2098 if (utf8 && d >= 128)
2099 {
2100 #ifdef SUPPORT_UCP
2101 otherd = UCD_OTHERCASE(d);
2102 #endif /* SUPPORT_UCP */
2103 }
2104 else
2105 #endif /* SUPPORT_UTF8 */
2106 otherd = fcc[d];
2107 }
2108 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2109 {
2110 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2111 {
2112 active_count--; /* Remove non-match possibility */
2113 next_active_state--;
2114 }
2115 ADD_NEW(state_offset, 0);
2116 }
2117 }
2118 break;
2119
2120 /*-----------------------------------------------------------------*/
2121 case OP_EXACT:
2122 case OP_NOTEXACT:
2123 count = current_state->count; /* Number already matched */
2124 if (clen > 0)
2125 {
2126 unsigned int otherd = NOTACHAR;
2127 if ((ims & PCRE_CASELESS) != 0)
2128 {
2129 #ifdef SUPPORT_UTF8
2130 if (utf8 && d >= 128)
2131 {
2132 #ifdef SUPPORT_UCP
2133 otherd = UCD_OTHERCASE(d);
2134 #endif /* SUPPORT_UCP */
2135 }
2136 else
2137 #endif /* SUPPORT_UTF8 */
2138 otherd = fcc[d];
2139 }
2140 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2141 {
2142 if (++count >= GET2(code, 1))
2143 { ADD_NEW(state_offset + dlen + 3, 0); }
2144 else
2145 { ADD_NEW(state_offset, count); }
2146 }
2147 }
2148 break;
2149
2150 /*-----------------------------------------------------------------*/
2151 case OP_UPTO:
2152 case OP_MINUPTO:
2153 case OP_POSUPTO:
2154 case OP_NOTUPTO:
2155 case OP_NOTMINUPTO:
2156 case OP_NOTPOSUPTO:
2157 ADD_ACTIVE(state_offset + dlen + 3, 0);
2158 count = current_state->count; /* Number already matched */
2159 if (clen > 0)
2160 {
2161 unsigned int otherd = NOTACHAR;
2162 if ((ims & PCRE_CASELESS) != 0)
2163 {
2164 #ifdef SUPPORT_UTF8
2165 if (utf8 && d >= 128)
2166 {
2167 #ifdef SUPPORT_UCP
2168 otherd = UCD_OTHERCASE(d);
2169 #endif /* SUPPORT_UCP */
2170 }
2171 else
2172 #endif /* SUPPORT_UTF8 */
2173 otherd = fcc[d];
2174 }
2175 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2176 {
2177 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2178 {
2179 active_count--; /* Remove non-match possibility */
2180 next_active_state--;
2181 }
2182 if (++count >= GET2(code, 1))
2183 { ADD_NEW(state_offset + dlen + 3, 0); }
2184 else
2185 { ADD_NEW(state_offset, count); }
2186 }
2187 }
2188 break;
2189
2190
2191 /* ========================================================================== */
2192 /* These are the class-handling opcodes */
2193
2194 case OP_CLASS:
2195 case OP_NCLASS:
2196 case OP_XCLASS:
2197 {
2198 BOOL isinclass = FALSE;
2199 int next_state_offset;
2200 const uschar *ecode;
2201
2202 /* For a simple class, there is always just a 32-byte table, and we
2203 can set isinclass from it. */
2204
2205 if (codevalue != OP_XCLASS)
2206 {
2207 ecode = code + 33;
2208 if (clen > 0)
2209 {
2210 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2211 ((code[1 + c/8] & (1 << (c&7))) != 0);
2212 }
2213 }
2214
2215 /* An extended class may have a table or a list of single characters,
2216 ranges, or both, and it may be positive or negative. There's a
2217 function that sorts all this out. */
2218
2219 else
2220 {
2221 ecode = code + GET(code, 1);
2222 if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2223 }
2224
2225 /* At this point, isinclass is set for all kinds of class, and ecode
2226 points to the byte after the end of the class. If there is a
2227 quantifier, this is where it will be. */
2228
2229 next_state_offset = ecode - start_code;
2230
2231 switch (*ecode)
2232 {
2233 case OP_CRSTAR:
2234 case OP_CRMINSTAR:
2235 ADD_ACTIVE(next_state_offset + 1, 0);
2236 if (isinclass) { ADD_NEW(state_offset, 0); }
2237 break;
2238
2239 case OP_CRPLUS:
2240 case OP_CRMINPLUS:
2241 count = current_state->count; /* Already matched */
2242 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2243 if (isinclass) { count++; ADD_NEW(state_offset, count); }
2244 break;
2245
2246 case OP_CRQUERY:
2247 case OP_CRMINQUERY:
2248 ADD_ACTIVE(next_state_offset + 1, 0);
2249 if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2250 break;
2251
2252 case OP_CRRANGE:
2253 case OP_CRMINRANGE:
2254 count = current_state->count; /* Already matched */
2255 if (count >= GET2(ecode, 1))
2256 { ADD_ACTIVE(next_state_offset + 5, 0); }
2257 if (isinclass)
2258 {
2259 int max = GET2(ecode, 3);
2260 if (++count >= max && max != 0) /* Max 0 => no limit */
2261 { ADD_NEW(next_state_offset + 5, 0); }
2262 else
2263 { ADD_NEW(state_offset, count); }
2264 }
2265 break;
2266
2267 default:
2268 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2269 break;
2270 }
2271 }
2272 break;
2273
2274 /* ========================================================================== */
2275 /* These are the opcodes for fancy brackets of various kinds. We have
2276 to use recursion in order to handle them. The "always failing" assertion
2277 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2278 though the other "backtracking verbs" are not supported. */
2279
2280 case OP_FAIL:
2281 forced_fail++; /* Count FAILs for multiple states */
2282 break;
2283
2284 case OP_ASSERT:
2285 case OP_ASSERT_NOT:
2286 case OP_ASSERTBACK:
2287 case OP_ASSERTBACK_NOT:
2288 {
2289 int rc;
2290 int local_offsets[2];
2291 int local_workspace[1000];
2292 const uschar *endasscode = code + GET(code, 1);
2293
2294 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2295
2296 rc = internal_dfa_exec(
2297 md, /* static match data */
2298 code, /* this subexpression's code */
2299 ptr, /* where we currently are */
2300 ptr - start_subject, /* start offset */
2301 local_offsets, /* offset vector */
2302 sizeof(local_offsets)/sizeof(int), /* size of same */
2303 local_workspace, /* workspace vector */
2304 sizeof(local_workspace)/sizeof(int), /* size of same */
2305 ims, /* the current ims flags */
2306 rlevel, /* function recursion level */
2307 recursing); /* pass on regex recursion */
2308
2309 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2310 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2311 }
2312 break;
2313
2314 /*-----------------------------------------------------------------*/
2315 case OP_COND:
2316 case OP_SCOND:
2317 {
2318 int local_offsets[1000];
2319 int local_workspace[1000];
2320 int codelink = GET(code, 1);
2321 int condcode;
2322
2323 /* Because of the way auto-callout works during compile, a callout item
2324 is inserted between OP_COND and an assertion condition. This does not
2325 happen for the other conditions. */
2326
2327 if (code[LINK_SIZE+1] == OP_CALLOUT)
2328 {
2329 rrc = 0;
2330 if (pcre_callout != NULL)
2331 {
2332 pcre_callout_block cb;
2333 cb.version = 1; /* Version 1 of the callout block */
2334 cb.callout_number = code[LINK_SIZE+2];
2335 cb.offset_vector = offsets;
2336 cb.subject = (PCRE_SPTR)start_subject;
2337 cb.subject_length = end_subject - start_subject;
2338 cb.start_match = current_subject - start_subject;
2339 cb.current_position = ptr - start_subject;
2340 cb.pattern_position = GET(code, LINK_SIZE + 3);
2341 cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2342 cb.capture_top = 1;
2343 cb.capture_last = -1;
2344 cb.callout_data = md->callout_data;
2345 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2346 }
2347 if (rrc > 0) break; /* Fail this thread */
2348 code += _pcre_OP_lengths[OP_CALLOUT]; /* Skip callout data */
2349 }
2350
2351 condcode = code[LINK_SIZE+1];
2352
2353 /* Back reference conditions are not supported */
2354
2355 if (condcode == OP_CREF || condcode == OP_NCREF)
2356 return PCRE_ERROR_DFA_UCOND;
2357
2358 /* The DEFINE condition is always false */
2359
2360 if (condcode == OP_DEF)
2361 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2362
2363 /* The only supported version of OP_RREF is for the value RREF_ANY,
2364 which means "test if in any recursion". We can't test for specifically
2365 recursed groups. */
2366
2367 else if (condcode == OP_RREF || condcode == OP_NRREF)
2368 {
2369 int value = GET2(code, LINK_SIZE+2);
2370 if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2371 if (recursing > 0)
2372 { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2373 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2374 }
2375
2376 /* Otherwise, the condition is an assertion */
2377
2378 else
2379 {
2380 int rc;
2381 const uschar *asscode = code + LINK_SIZE + 1;
2382 const uschar *endasscode = asscode + GET(asscode, 1);
2383
2384 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2385
2386 rc = internal_dfa_exec(
2387 md, /* fixed match data */
2388 asscode, /* this subexpression's code */
2389 ptr, /* where we currently are */
2390 ptr - start_subject, /* start offset */
2391 local_offsets, /* offset vector */
2392 sizeof(local_offsets)/sizeof(int), /* size of same */
2393 local_workspace, /* workspace vector */
2394 sizeof(local_workspace)/sizeof(int), /* size of same */
2395 ims, /* the current ims flags */
2396 rlevel, /* function recursion level */
2397 recursing); /* pass on regex recursion */
2398
2399 if ((rc >= 0) ==
2400 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2401 { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2402 else
2403 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2404 }
2405 }
2406 break;
2407
2408 /*-----------------------------------------------------------------*/
2409 case OP_RECURSE:
2410 {
2411 int local_offsets[1000];
2412 int local_workspace[1000];
2413 int rc;
2414
2415 DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2416 recursing + 1));
2417
2418 rc = internal_dfa_exec(
2419 md, /* fixed match data */
2420 start_code + GET(code, 1), /* this subexpression's code */
2421 ptr, /* where we currently are */
2422 ptr - start_subject, /* start offset */
2423 local_offsets, /* offset vector */
2424 sizeof(local_offsets)/sizeof(int), /* size of same */
2425 local_workspace, /* workspace vector */
2426 sizeof(local_workspace)/sizeof(int), /* size of same */
2427 ims, /* the current ims flags */
2428 rlevel, /* function recursion level */
2429 recursing + 1); /* regex recurse level */
2430
2431 DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2432 recursing + 1, rc));
2433
2434 /* Ran out of internal offsets */
2435
2436 if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2437
2438 /* For each successful matched substring, set up the next state with a
2439 count of characters to skip before trying it. Note that the count is in
2440 characters, not bytes. */
2441
2442 if (rc > 0)
2443 {
2444 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2445 {
2446 const uschar *p = start_subject + local_offsets[rc];
2447 const uschar *pp = start_subject + local_offsets[rc+1];
2448 int charcount = local_offsets[rc+1] - local_offsets[rc];
2449 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2450 if (charcount > 0)
2451 {
2452 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2453 }
2454 else
2455 {
2456 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2457 }
2458 }
2459 }
2460 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2461 }
2462 break;
2463
2464 /*-----------------------------------------------------------------*/
2465 case OP_ONCE:
2466 {
2467 int local_offsets[2];
2468 int local_workspace[1000];
2469
2470 int rc = internal_dfa_exec(
2471 md, /* fixed match data */
2472 code, /* this subexpression's code */
2473 ptr, /* where we currently are */
2474 ptr - start_subject, /* start offset */
2475 local_offsets, /* offset vector */
2476 sizeof(local_offsets)/sizeof(int), /* size of same */
2477 local_workspace, /* workspace vector */
2478 sizeof(local_workspace)/sizeof(int), /* size of same */
2479 ims, /* the current ims flags */
2480 rlevel, /* function recursion level */
2481 recursing); /* pass on regex recursion */
2482
2483 if (rc >= 0)
2484 {
2485 const uschar *end_subpattern = code;
2486 int charcount = local_offsets[1] - local_offsets[0];
2487 int next_state_offset, repeat_state_offset;
2488
2489 do { end_subpattern += GET(end_subpattern, 1); }
2490 while (*end_subpattern == OP_ALT);
2491 next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2492
2493 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2494 arrange for the repeat state also to be added to the relevant list.
2495 Calculate the offset, or set -1 for no repeat. */
2496
2497 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2498 *end_subpattern == OP_KETRMIN)?
2499 end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2500
2501 /* If we have matched an empty string, add the next state at the
2502 current character pointer. This is important so that the duplicate
2503 checking kicks in, which is what breaks infinite loops that match an
2504 empty string. */
2505
2506 if (charcount == 0)
2507 {
2508 ADD_ACTIVE(next_state_offset, 0);
2509 }
2510
2511 /* Optimization: if there are no more active states, and there
2512 are no new states yet set up, then skip over the subject string
2513 right here, to save looping. Otherwise, set up the new state to swing
2514 into action when the end of the substring is reached. */
2515
2516 else if (i + 1 >= active_count && new_count == 0)
2517 {
2518 ptr += charcount;
2519 clen = 0;
2520 ADD_NEW(next_state_offset, 0);
2521
2522 /* If we are adding a repeat state at the new character position,
2523 we must fudge things so that it is the only current state.
2524 Otherwise, it might be a duplicate of one we processed before, and
2525 that would cause it to be skipped. */
2526
2527 if (repeat_state_offset >= 0)
2528 {
2529 next_active_state = active_states;
2530 active_count = 0;
2531 i = -1;
2532 ADD_ACTIVE(repeat_state_offset, 0);
2533 }
2534 }
2535 else
2536 {
2537 const uschar *p = start_subject + local_offsets[0];
2538 const uschar *pp = start_subject + local_offsets[1];
2539 while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2540 ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2541 if (repeat_state_offset >= 0)
2542 { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2543 }
2544
2545 }
2546 else if (rc != PCRE_ERROR_NOMATCH) return rc;
2547 }
2548 break;
2549
2550
2551 /* ========================================================================== */
2552 /* Handle callouts */
2553
2554 case OP_CALLOUT:
2555 rrc = 0;
2556 if (pcre_callout != NULL)
2557 {
2558 pcre_callout_block cb;
2559 cb.version = 1; /* Version 1 of the callout block */
2560 cb.callout_number = code[1];
2561 cb.offset_vector = offsets;
2562 cb.subject = (PCRE_SPTR)start_subject;
2563 cb.subject_length = end_subject - start_subject;
2564 cb.start_match = current_subject - start_subject;
2565 cb.current_position = ptr - start_subject;
2566 cb.pattern_position = GET(code, 2);
2567 cb.next_item_length = GET(code, 2 + LINK_SIZE);
2568 cb.capture_top = 1;
2569 cb.capture_last = -1;
2570 cb.callout_data = md->callout_data;
2571 if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2572 }
2573 if (rrc == 0)
2574 { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2575 break;
2576
2577
2578 /* ========================================================================== */
2579 default: /* Unsupported opcode */
2580 return PCRE_ERROR_DFA_UITEM;
2581 }
2582
2583 NEXT_ACTIVE_STATE: continue;
2584
2585 } /* End of loop scanning active states */
2586
2587 /* We have finished the processing at the current subject character. If no
2588 new states have been set for the next character, we have found all the
2589 matches that we are going to find. If we are at the top level and partial
2590 matching has been requested, check for appropriate conditions.
2591
2592 The "forced_fail" variable counts the number of (*F) encountered for the
2593 character. If it is equal to the original active_count (saved in
2594 workspace[1]) it means that (*F) was found on every active state. In this
2595 case we don't want to give a partial match.
2596
2597 The "reached_end" variable counts the number of threads that have reached the
2598 end of the pattern. The "could_continue" variable is true if a thread could
2599 have continued but for the fact that the end of the subject was reached. */
2600
2601 if (new_count <= 0)
2602 {
2603 if (rlevel == 1 && /* Top level, and */
2604 ( /* either... */
2605 reached_end != workspace[1] || /* Not all reached end */
2606 could_continue /* or some could go on */
2607 ) && /* and... */
2608 forced_fail != workspace[1] && /* Not all forced fail & */
2609 ( /* either... */
2610 (md->moptions & PCRE_PARTIAL_HARD) != 0 /* Hard partial */
2611 || /* or... */
2612 ((md->moptions & PCRE_PARTIAL_SOFT) != 0 && /* Soft partial and */
2613 match_count < 0) /* no matches */
2614 ) && /* And... */
2615 ptr >= end_subject && /* Reached end of subject */
2616 ptr > current_subject) /* Matched non-empty string */
2617 {
2618 if (offsetcount >= 2)
2619 {
2620 offsets[0] = md->start_used_ptr - start_subject;
2621 offsets[1] = end_subject - start_subject;
2622 }
2623 match_count = PCRE_ERROR_PARTIAL;
2624 }
2625
2626 DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2627 "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2628 rlevel*2-2, SP));
2629 break; /* In effect, "return", but see the comment below */
2630 }
2631
2632 /* One or more states are active for the next character. */
2633
2634 ptr += clen; /* Advance to next subject character */
2635 } /* Loop to move along the subject string */
2636
2637 /* Control gets here from "break" a few lines above. We do it this way because
2638 if we use "return" above, we have compiler trouble. Some compilers warn if
2639 there's nothing here because they think the function doesn't return a value. On
2640 the other hand, if we put a dummy statement here, some more clever compilers
2641 complain that it can't be reached. Sigh. */
2642
2643 return match_count;
2644 }
2645
2646
2647
2648
2649 /*************************************************
2650 * Execute a Regular Expression - DFA engine *
2651 *************************************************/
2652
2653 /* This external function applies a compiled re to a subject string using a DFA
2654 engine. After checking its arguments and setting up the match data, it calls
2655 internal_dfa_exec(), repeatedly at successive start points if not anchored.
2656
2657 Arguments:
2658 argument_re points to the compiled expression
2659 extra_data points to extra data (study data, callout data, tables) or is NULL
2660 subject points to the subject string
2661 length length of subject string in bytes (may contain binary zeros)
2662 start_offset byte offset in the subject string at which to start
2663 options option bits
2664 offsets vector of match offsets
2665 offsetcount size of same
2666 workspace workspace vector
2667 wscount size of same (fewer than 20 gives PCRE_ERROR_DFA_WSSIZE)
2668
2669 Returns: > 0 => number of match offset pairs placed in offsets
2670 = 0 => offsets overflowed; longest matches are present
2671 -1 => failed to match
2672 < -1 => some kind of unexpected problem (e.g. a PCRE_ERROR_xxx code)
2673 */
2674
2675 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2676 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2677 const char *subject, int length, int start_offset, int options, int *offsets,
2678 int offsetcount, int *workspace, int wscount)
2679 {
2680 real_pcre *re = (real_pcre *)argument_re;
2681 dfa_match_data match_block;
2682 dfa_match_data *md = &match_block;
2683 BOOL utf8, anchored, startline, firstline;
2684 const uschar *current_subject, *end_subject, *lcc;
2685
2686 pcre_study_data internal_study;
2687 const pcre_study_data *study = NULL;
2688 real_pcre internal_re;
2689
2690 const uschar *req_byte_ptr;
2691 const uschar *start_bits = NULL;
2692 BOOL first_byte_caseless = FALSE;
2693 BOOL req_byte_caseless = FALSE;
2694 int first_byte = -1;
2695 int req_byte = -1;
2696 int req_byte2 = -1;
2697 int newline;
2698
2699 /* Plausibility checks on the caller's arguments */
2700
2701 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2702 if (re == NULL || subject == NULL || workspace == NULL ||
2703 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2704 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2705 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2706
2707 /* We need to find the pointer to any study data before we test for byte
2708 flipping, so we scan the extra_data block first. This may set two fields in the
2709 match block, so we must initialize them beforehand. However, the other fields
2710 in the match block must not be set until after the byte flipping. */
2711
2712 md->tables = re->tables;
2713 md->callout_data = NULL;
2714
2715 if (extra_data != NULL)
2716 {
2717 unsigned int flags = extra_data->flags;
2718 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2719 study = (const pcre_study_data *)extra_data->study_data;
2720 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2721 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2722 return PCRE_ERROR_DFA_UMLIMIT;
2723 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2724 md->callout_data = extra_data->callout_data;
2725 if ((flags & PCRE_EXTRA_TABLES) != 0)
2726 md->tables = extra_data->tables;
2727 }
2728
2729 /* Check that the first field in the block is the magic number. If it is not,
2730 test for a regex that was compiled on a host of opposite endianness. If this is
2731 the case, flipped values are put in internal_re and internal_study if there was
2732 study data too. */
2733
2734 if (re->magic_number != MAGIC_NUMBER)
2735 {
2736 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2737 if (re == NULL) return PCRE_ERROR_BADMAGIC;
2738 if (study != NULL) study = &internal_study;
2739 }
2740
2741 /* Set some local values */
2742
2743 current_subject = (const unsigned char *)subject + start_offset;
2744 end_subject = (const unsigned char *)subject + length;
2745 req_byte_ptr = current_subject - 1;
2746
2747 #ifdef SUPPORT_UTF8
2748 utf8 = (re->options & PCRE_UTF8) != 0;
2749 #else
2750 utf8 = FALSE;
2751 #endif
2752
2753 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2754 (re->options & PCRE_ANCHORED) != 0;
2755
2756 /* The remaining fixed data for passing around. */
2757
2758 md->start_code = (const uschar *)argument_re +
2759 re->name_table_offset + re->name_count * re->name_entry_size;
2760 md->start_subject = (const unsigned char *)subject;
2761 md->end_subject = end_subject;
2762 md->start_offset = start_offset;
2763 md->moptions = options;
2764 md->poptions = re->options;
2765
2766 /* If the BSR option is not set at match time, copy what was set
2767 at compile time. */
2768
2769 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2770 {
2771 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2772 md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2773 #ifdef BSR_ANYCRLF
2774 else md->moptions |= PCRE_BSR_ANYCRLF;
2775 #endif
2776 }
2777
2778 /* Handle different types of newline. The three bits give eight cases. If
2779 nothing is set at run time, whatever was used at compile time applies. */
2780
2781 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2782 PCRE_NEWLINE_BITS)
2783 {
2784 case 0: newline = NEWLINE; break; /* Compile-time default */
2785 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2786 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2787 case PCRE_NEWLINE_CR+
2788 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2789 case PCRE_NEWLINE_ANY: newline = -1; break;
2790 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2791 default: return PCRE_ERROR_BADNEWLINE;
2792 }
2793
2794 if (newline == -2)
2795 {
2796 md->nltype = NLTYPE_ANYCRLF;
2797 }
2798 else if (newline < 0)
2799 {
2800 md->nltype = NLTYPE_ANY;
2801 }
2802 else
2803 {
2804 md->nltype = NLTYPE_FIXED;
2805 if (newline > 255)
2806 {
2807 md->nllen = 2;
2808 md->nl[0] = (newline >> 8) & 255;
2809 md->nl[1] = newline & 255;
2810 }
2811 else
2812 {
2813 md->nllen = 1;
2814 md->nl[0] = newline;
2815 }
2816 }
2817
2818 /* Check a UTF-8 string if required, and that the start offset is not in the
2819 middle of a character. There's no way of passing back the character offset. */
2820
2821 #ifdef SUPPORT_UTF8
2822 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2823 {
2824 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2825 return PCRE_ERROR_BADUTF8;
2826 if (start_offset > 0 && start_offset < length)
2827 {
2828 int tb = ((uschar *)subject)[start_offset];
2829 if (tb > 127)
2830 {
2831 tb &= 0xc0;
2832 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2833 }
2834 }
2835 }
2836 #endif
2837
2838 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2839 is a feature that makes it possible to save compiled regex and re-use them
2840 in other programs later. */
2841
2842 if (md->tables == NULL) md->tables = _pcre_default_tables;
2843
2844 /* The lower casing table and the "must be at the start of a line" flag are
2845 used in a loop when finding where to start. */
2846
2847 lcc = md->tables + lcc_offset;
2848 startline = (re->flags & PCRE_STARTLINE) != 0;
2849 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2850
2851 /* Set up the first character to match, if available. The first_byte value is
2852 never set for an anchored regular expression, but the anchoring may be forced
2853 at run time, so we have to test for anchoring. The first char may be unset for
2854 an unanchored pattern, of course. If there's no first char and the pattern was
2855 studied, there may be a bitmap of possible first characters. */
2856
2857 if (!anchored)
2858 {
2859 if ((re->flags & PCRE_FIRSTSET) != 0)
2860 {
2861 first_byte = re->first_byte & 255;
2862 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2863 first_byte = lcc[first_byte];
2864 }
2865 else
2866 {
2867 if (!startline && study != NULL &&
2868 (study->flags & PCRE_STUDY_MAPPED) != 0)
2869 start_bits = study->start_bits;
2870 }
2871 }
2872
2873 /* For anchored or unanchored matches, there may be a "last known required
2874 character" set. */
2875
2876 if ((re->flags & PCRE_REQCHSET) != 0)
2877 {
2878 req_byte = re->req_byte & 255;
2879 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2880 req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2881 }
2882
2883 /* Call the main matching function, looping for a non-anchored regex after a
2884 failed match. If not restarting, perform certain optimizations at the start of
2885 a match. */
2886
2887 for (;;)
2888 {
2889 int rc;
2890
2891 if ((options & PCRE_DFA_RESTART) == 0)
2892 {
2893 const uschar *save_end_subject = end_subject;
2894
2895 /* If firstline is TRUE, the start of the match is constrained to the first
2896 line of a multiline string. Implement this by temporarily adjusting
2897 end_subject so that we stop scanning at a newline. If the match fails at
2898 the newline, later code breaks this loop. */
2899
2900 if (firstline)
2901 {
2902 USPTR t = current_subject;
2903 #ifdef SUPPORT_UTF8
2904 if (utf8)
2905 {
2906 while (t < md->end_subject && !IS_NEWLINE(t))
2907 {
2908 t++;
2909 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2910 }
2911 }
2912 else
2913 #endif
2914 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2915 end_subject = t;
2916 }
2917
2918 /* There are some optimizations that avoid running the match if a known
2919 starting point is not found. However, there is an option that disables
2920 these, for testing and for ensuring that all callouts do actually occur. */
2921
2922 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2923 {
2924 /* Advance to a known first byte. */
2925
2926 if (first_byte >= 0)
2927 {
2928 if (first_byte_caseless)
2929 while (current_subject < end_subject &&
2930 lcc[*current_subject] != first_byte)
2931 current_subject++;
2932 else
2933 while (current_subject < end_subject &&
2934 *current_subject != first_byte)
2935 current_subject++;
2936 }
2937
2938 /* Or to just after a linebreak for a multiline match if possible */
2939
2940 else if (startline)
2941 {
2942 if (current_subject > md->start_subject + start_offset)
2943 {
2944 #ifdef SUPPORT_UTF8
2945 if (utf8)
2946 {
2947 while (current_subject < end_subject &&
2948 !WAS_NEWLINE(current_subject))
2949 {
2950 current_subject++;
2951 while(current_subject < end_subject &&
2952 (*current_subject & 0xc0) == 0x80)
2953 current_subject++;
2954 }
2955 }
2956 else
2957 #endif
2958 while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2959 current_subject++;
2960
2961 /* If we have just passed a CR and the newline option is ANY or
2962 ANYCRLF, and we are now at a LF, advance the match position by one
2963 more character. */
2964
2965 if (current_subject[-1] == CHAR_CR &&
2966 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2967 current_subject < end_subject &&
2968 *current_subject == CHAR_NL)
2969 current_subject++;
2970 }
2971 }
2972
2973 /* Or to a non-unique first char after study */
2974
2975 else if (start_bits != NULL)
2976 {
2977 while (current_subject < end_subject)
2978 {
2979 register unsigned int c = *current_subject;
2980 if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2981 else break;
2982 }
2983 }
2984 }
2985
2986 /* Restore fudged end_subject (it may have been shortened for firstline) */
2987
2988 end_subject = save_end_subject;
2989
2990 /* The following two optimizations are disabled for partial matching or if
2991 disabling is explicitly requested (and of course, by the test above, this
2992 code is not obeyed when restarting after a partial match). */
2993
2994 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2995 (options & (PCRE_PARTIAL_HARD|PCRE_PARTIAL_SOFT)) == 0)
2996 {
2997 /* If the pattern was studied, a minimum subject length may be set. This
2998 is a lower bound; no actual string of that length may actually match the
2999 pattern. Although the value is, strictly, in characters, we treat it as
3000 bytes to avoid spending too much time in this optimization. */
3001
3002 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
3003 end_subject - current_subject < study->minlength)
3004 return PCRE_ERROR_NOMATCH;
3005
3006 /* If req_byte is set, we know that that character must appear in the
3007 subject for the match to succeed. If the first character is set, req_byte
3008 must be later in the subject; otherwise the test starts at the match
3009 point. This optimization can save a huge amount of work in patterns with
3010 nested unlimited repeats that aren't going to match. Writing separate
3011 code for cased/caseless versions makes it go faster, as does using an
3012 autoincrement and backing off on a match.
3013
3014 HOWEVER: when the subject string is very, very long, searching to its end
3015 can take a long time, and give bad performance on quite ordinary
3016 patterns. This showed up when somebody was matching /^C/ on a 32-megabyte
3017 string... so we don't do this when the string is sufficiently long. */
3018
3019 if (req_byte >= 0 && end_subject - current_subject < REQ_BYTE_MAX)
3020 {
3021 register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
3022
3023 /* We don't need to repeat the search if we haven't yet reached the
3024 place we found it at last time. */
3025
3026 if (p > req_byte_ptr)
3027 {
3028 if (req_byte_caseless)
3029 {
3030 while (p < end_subject)
3031 {
3032 register int pp = *p++;
3033 if (pp == req_byte || pp == req_byte2) { p--; break; }
3034 }
3035 }
3036 else
3037 {
3038 while (p < end_subject)
3039 {
3040 if (*p++ == req_byte) { p--; break; }
3041 }
3042 }
3043
3044 /* If we can't find the required character, break the matching loop,
3045 which will cause a return of PCRE_ERROR_NOMATCH. */
3046
3047 if (p >= end_subject) break;
3048
3049 /* If we have found the required character, save the point where we
3050 found it, so that we don't search again next time round the loop if
3051 the start hasn't passed this character yet. */
3052
3053 req_byte_ptr = p;
3054 }
3055 }
3056 }
3057 } /* End of optimizations that are done when not restarting */
3058
3059 /* OK, now we can do the business: record the start point, run the matcher */
3060
3061 md->start_used_ptr = current_subject;
3062
3063 rc = internal_dfa_exec(
3064 md, /* fixed match data */
3065 md->start_code, /* this subexpression's code */
3066 current_subject, /* where we currently are */
3067 start_offset, /* start offset in subject */
3068 offsets, /* offset vector */
3069 offsetcount, /* size of same */
3070 workspace, /* workspace vector */
3071 wscount, /* size of same */
3072 re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
3073 0, /* function recurse level */
3074 0); /* regex recurse level */
3075
3076 /* Anything other than "no match" means we are done, always; otherwise, carry
3077 on only if not anchored. */
3078
3079 if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
3080
3081 /* Advance to the next subject character unless we are at the end of a line
3082 and firstline is set. */
3083
3084 if (firstline && IS_NEWLINE(current_subject)) break;
3085 current_subject++;
3086 if (utf8)
3087 {
3088 while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
3089 current_subject++;
3090 }
3091 if (current_subject > end_subject) break;
3092
3093 /* If we have just passed a CR and we are now at a LF, and the pattern does
3094 not contain any explicit matches for \r or \n, and the newline option is CRLF
3095 or ANY or ANYCRLF, advance the match position by one more character. */
3096
3097 if (current_subject[-1] == CHAR_CR &&
3098 current_subject < end_subject &&
3099 *current_subject == CHAR_NL &&
3100 (re->flags & PCRE_HASCRORLF) == 0 &&
3101 (md->nltype == NLTYPE_ANY ||
3102 md->nltype == NLTYPE_ANYCRLF ||
3103 md->nllen == 2))
3104 current_subject++;
3105
3106 } /* "Bumpalong" loop */
3107
3108 return PCRE_ERROR_NOMATCH;
3109 }
3110
3111 /* End of pcre_dfa_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12