/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 510 - (show annotations) (download)
Sat Mar 27 17:45:29 2010 UTC (4 years, 7 months ago) by ph10
File MIME type: text/plain
File size: 174552 byte(s)
Add support for *MARK and names for *PRUNE, *SKIP, *THEN.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_SKIP_ARG (-996)
78 #define MATCH_THEN (-995)
79
80 /* This is a convenience macro for code that occurs many times. */
81
82 #define MRRETURN(ra) \
83 { \
84 md->mark = markptr; \
85 RRETURN(ra); \
86 }
87
88 /* Maximum number of ints of offset to save on the stack for recursive calls.
89 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
90 because the offset vector is always a multiple of 3 long. */
91
92 #define REC_STACK_SAVE_MAX 30
93
94 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
95
96 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
97 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
98
99
100
101 #ifdef PCRE_DEBUG
102 /*************************************************
103 * Debugging function to print chars *
104 *************************************************/
105
106 /* Print a sequence of chars in printable format, stopping at the end of the
107 subject if the requested.
108
109 Arguments:
110 p points to characters
111 length number to print
112 is_subject TRUE if printing from within md->start_subject
113 md pointer to matching data block, if is_subject is TRUE
114
115 Returns: nothing
116 */
117
118 static void
119 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
120 {
121 unsigned int c;
122 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
123 while (length-- > 0)
124 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
125 }
126 #endif
127
128
129
130 /*************************************************
131 * Match a back-reference *
132 *************************************************/
133
134 /* If a back reference hasn't been set, the length that is passed is greater
135 than the number of characters left in the string, so the match fails.
136
137 Arguments:
138 offset index into the offset vector
139 eptr points into the subject
140 length length to be matched
141 md points to match data block
142 ims the ims flags
143
144 Returns: TRUE if matched
145 */
146
147 static BOOL
148 match_ref(int offset, register USPTR eptr, int length, match_data *md,
149 unsigned long int ims)
150 {
151 USPTR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if not enough characters left */
167
168 if (length > md->end_subject - eptr) return FALSE;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if ((ims & PCRE_CASELESS) != 0)
175 {
176 #ifdef SUPPORT_UTF8
177 #ifdef SUPPORT_UCP
178 if (md->utf8)
179 {
180 USPTR endptr = eptr + length;
181 while (eptr < endptr)
182 {
183 int c, d;
184 GETCHARINC(c, eptr);
185 GETCHARINC(d, p);
186 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
187 }
188 }
189 else
190 #endif
191 #endif
192
193 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
194 is no UCP support. */
195
196 while (length-- > 0)
197 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
198 }
199
200 /* In the caseful case, we can just compare the bytes, whether or not we
201 are in UTF-8 mode. */
202
203 else
204 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
205
206 return TRUE;
207 }
208
209
210
211 /***************************************************************************
212 ****************************************************************************
213 RECURSION IN THE match() FUNCTION
214
215 The match() function is highly recursive, though not every recursive call
216 increases the recursive depth. Nevertheless, some regular expressions can cause
217 it to recurse to a great depth. I was writing for Unix, so I just let it call
218 itself recursively. This uses the stack for saving everything that has to be
219 saved for a recursive call. On Unix, the stack can be large, and this works
220 fine.
221
222 It turns out that on some non-Unix-like systems there are problems with
223 programs that use a lot of stack. (This despite the fact that every last chip
224 has oodles of memory these days, and techniques for extending the stack have
225 been known for decades.) So....
226
227 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
228 calls by keeping local variables that need to be preserved in blocks of memory
229 obtained from malloc() instead of on the stack. Macros are used to
230 achieve this so that the actual code doesn't look very different to what it
231 always used to.
232
233 The original heap-recursive code used longjmp(). However, it seems that this
234 can be very slow on some operating systems. Following a suggestion from Stan
235 Switzer, the use of longjmp() has been abolished, at the cost of having to
236 provide a unique number for each call to RMATCH. There is no way of generating
237 a sequence of numbers at compile time in C. I have given them names, to make
238 them stand out more clearly.
239
240 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
241 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
242 tests. Furthermore, not using longjmp() means that local dynamic variables
243 don't have indeterminate values; this has meant that the frame size can be
244 reduced because the result can be "passed back" by straight setting of the
245 variable instead of being passed in the frame.
246 ****************************************************************************
247 ***************************************************************************/
248
/* Identifying numbers for the RMATCH calls, consecutive from 1. When this list
is changed, the code at HEAP_RETURN below must be updated in sync.

When NO_RECURSE is defined, RMATCH pastes its number argument into a label name
(L_RMn), so each number can be given to only one RMATCH call.

NOTE(review): in match() below, RM51 is passed by the RMATCH calls for OP_MARK,
OP_PRUNE and OP_PRUNE_ARG, RM53 by those for OP_SKIP and OP_SKIP_ARG, and RM54
by those for OP_THEN and OP_THEN_ARG. With NO_RECURSE that would define the
same label more than once - confirm, and allocate further numbers if so. */

enum {
  RM1  =  1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54
};
258
259 /* These versions of the macros use the stack, as normal. There are debugging
260 versions and production versions. Note that the "rw" argument of RMATCH isn't
261 actually used in this definition. */
262
263 #ifndef NO_RECURSE
264 #define REGISTER register
265
266 #ifdef PCRE_DEBUG
267 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
268 { \
269 printf("match() called in line %d\n", __LINE__); \
270 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
271 printf("to line %d\n", __LINE__); \
272 }
273 #define RRETURN(ra) \
274 { \
275 printf("match() returned %d from line %d ", ra, __LINE__); \
276 return ra; \
277 }
278 #else
279 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
280 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
281 #define RRETURN(ra) return ra
282 #endif
283
284 #else
285
286
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH simulates a recursive call of match(). It obtains a new frame from
pcre_stack_malloc, records the number of this call (rw) in the current frame,
copies the "arguments" into the new frame, chains the new frame to the current
one, makes it current, and jumps to HEAP_RECURSE. The label L_<rw> defined at
the end marks this call site; it is presumably the target that the HEAP_RETURN
code selects from the saved Xwhere value. Because rw is pasted into a label
name, any one RMn value can be given to only one RMATCH call.

If no memory can be obtained for the new frame, the current frame is given up
with PCRE_ERROR_NOMEMORY via RRETURN, instead of storing through a NULL
pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). The current frame is unchained and
freed. If there is a previous frame, the result is placed in rrc and control
goes to HEAP_RETURN; otherwise this was the top-level frame and the result is
really returned. Note that "ra" is evaluated only after the current frame has
been freed and the previous one made current, so it must not depend on values
held in the frame that is being given up. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
326
327
328 /* Structure for remembering the local variables in a private frame */
329
330 typedef struct heapframe {
331 struct heapframe *Xprevframe;
332
333 /* Function arguments that may change */
334
335 USPTR Xeptr;
336 const uschar *Xecode;
337 USPTR Xmstart;
338 USPTR Xmarkptr;
339 int Xoffset_top;
340 long int Xims;
341 eptrblock *Xeptrb;
342 int Xflags;
343 unsigned int Xrdepth;
344
345 /* Function local variables */
346
347 USPTR Xcallpat;
348 #ifdef SUPPORT_UTF8
349 USPTR Xcharptr;
350 #endif
351 USPTR Xdata;
352 USPTR Xnext;
353 USPTR Xpp;
354 USPTR Xprev;
355 USPTR Xsaved_eptr;
356
357 recursion_info Xnew_recursive;
358
359 BOOL Xcur_is_word;
360 BOOL Xcondition;
361 BOOL Xprev_is_word;
362
363 unsigned long int Xoriginal_ims;
364
365 #ifdef SUPPORT_UCP
366 int Xprop_type;
367 int Xprop_value;
368 int Xprop_fail_result;
369 int Xprop_category;
370 int Xprop_chartype;
371 int Xprop_script;
372 int Xoclength;
373 uschar Xocchars[8];
374 #endif
375
376 int Xcodelink;
377 int Xctype;
378 unsigned int Xfc;
379 int Xfi;
380 int Xlength;
381 int Xmax;
382 int Xmin;
383 int Xnumber;
384 int Xoffset;
385 int Xop;
386 int Xsave_capture_last;
387 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
388 int Xstacksave[REC_STACK_SAVE_MAX];
389
390 eptrblock Xnewptrb;
391
392 /* Where to jump back to */
393
394 int Xwhere;
395
396 } heapframe;
397
398 #endif
399
400
401 /***************************************************************************
402 ***************************************************************************/
403
404
405
406 /*************************************************
407 * Match from current position *
408 *************************************************/
409
410 /* This function is called recursively in many circumstances. Whenever it
411 returns a negative (error) response, the outer incarnation must also return the
412 same response. */
413
414 /* These macros pack up tests that are used for partial matching, and which
415 appear several times in the code. We set the "hit end" flag if the pointer is
416 at the end of the subject and also past the start of the subject (i.e.
417 something has been matched). For hard partial matching, we then return
418 immediately. The second one is used when we already know we are past the end of
419 the subject. */
420
421 #define CHECK_PARTIAL()\
422 if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
423 {\
424 md->hitend = TRUE;\
425 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
426 }
427
428 #define SCHECK_PARTIAL()\
429 if (md->partial != 0 && eptr > mstart)\
430 {\
431 md->hitend = TRUE;\
432 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
433 }
434
435
436 /* Performance note: It might be tempting to extract commonly used fields from
437 the md structure (e.g. utf8, end_subject) into individual variables to improve
438 performance. Tests using gcc on a SPARC disproved this; in the first case, it
439 made performance worse.
440
441 Arguments:
442 eptr pointer to current character in subject
443 ecode pointer to current position in compiled code
444 mstart pointer to the current match start position (can be modified
445 by encountering \K)
446 markptr pointer to the most recent MARK name, or NULL
447 offset_top current top pointer
448 md pointer to "static" info for the match
449 ims current /i, /m, and /s options
450 eptrb pointer to chain of blocks containing eptr at start of
451 brackets - for testing for empty matches
452 flags can contain
453 match_condassert - this is an assertion condition
454 match_cbegroup - this is the start of an unlimited repeat
455 group that can match an empty string
456 rdepth the recursion depth
457
458 Returns: MATCH_MATCH if matched ) these values are >= 0
459 MATCH_NOMATCH if failed to match )
460 a negative MATCH_xxx value for PRUNE, SKIP, etc
461 a negative PCRE_ERROR_xxx value if aborted by an error condition
462 (e.g. stopped by repeated call or recursion limit)
463 */
464
465 static int
466 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
467 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
468 eptrblock *eptrb, int flags, unsigned int rdepth)
469 {
470 /* These variables do not need to be preserved over recursion in this function,
471 so they can be ordinary variables in all cases. Mark some of them with
472 "register" because they are used a lot in loops. */
473
474 register int rrc; /* Returns from recursive calls */
475 register int i; /* Used for loops not involving calls to RMATCH() */
476 register unsigned int c; /* Character values not kept over RMATCH() calls */
477 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
478
479 BOOL minimize, possessive; /* Quantifier options */
480 int condcode;
481
482 /* When recursion is not being used, all "local" variables that have to be
483 preserved over calls to RMATCH() are part of a "frame" which is obtained from
484 heap storage. Set up the top-level frame here; others are obtained from the
485 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
486
487 #ifdef NO_RECURSE
488 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
489 frame->Xprevframe = NULL; /* Marks the top level */
490
491 /* Copy in the original argument variables */
492
493 frame->Xeptr = eptr;
494 frame->Xecode = ecode;
495 frame->Xmstart = mstart;
496 frame->Xmarkptr = markptr;
497 frame->Xoffset_top = offset_top;
498 frame->Xims = ims;
499 frame->Xeptrb = eptrb;
500 frame->Xflags = flags;
501 frame->Xrdepth = rdepth;
502
503 /* This is where control jumps back to to effect "recursion" */
504
505 HEAP_RECURSE:
506
507 /* Macros make the argument variables come from the current frame */
508
509 #define eptr frame->Xeptr
510 #define ecode frame->Xecode
511 #define mstart frame->Xmstart
512 #define markptr frame->Xmarkptr
513 #define offset_top frame->Xoffset_top
514 #define ims frame->Xims
515 #define eptrb frame->Xeptrb
516 #define flags frame->Xflags
517 #define rdepth frame->Xrdepth
518
519 /* Ditto for the local variables */
520
521 #ifdef SUPPORT_UTF8
522 #define charptr frame->Xcharptr
523 #endif
524 #define callpat frame->Xcallpat
525 #define codelink frame->Xcodelink
526 #define data frame->Xdata
527 #define next frame->Xnext
528 #define pp frame->Xpp
529 #define prev frame->Xprev
530 #define saved_eptr frame->Xsaved_eptr
531
532 #define new_recursive frame->Xnew_recursive
533
534 #define cur_is_word frame->Xcur_is_word
535 #define condition frame->Xcondition
536 #define prev_is_word frame->Xprev_is_word
537
538 #define original_ims frame->Xoriginal_ims
539
540 #ifdef SUPPORT_UCP
541 #define prop_type frame->Xprop_type
542 #define prop_value frame->Xprop_value
543 #define prop_fail_result frame->Xprop_fail_result
544 #define prop_category frame->Xprop_category
545 #define prop_chartype frame->Xprop_chartype
546 #define prop_script frame->Xprop_script
547 #define oclength frame->Xoclength
548 #define occhars frame->Xocchars
549 #endif
550
551 #define ctype frame->Xctype
552 #define fc frame->Xfc
553 #define fi frame->Xfi
554 #define length frame->Xlength
555 #define max frame->Xmax
556 #define min frame->Xmin
557 #define number frame->Xnumber
558 #define offset frame->Xoffset
559 #define op frame->Xop
560 #define save_capture_last frame->Xsave_capture_last
561 #define save_offset1 frame->Xsave_offset1
562 #define save_offset2 frame->Xsave_offset2
563 #define save_offset3 frame->Xsave_offset3
564 #define stacksave frame->Xstacksave
565
566 #define newptrb frame->Xnewptrb
567
568 /* When recursion is being used, local variables are allocated on the stack and
569 get preserved during recursion in the normal way. In this environment, fi and
570 i, and fc and c, can be the same variables. */
571
572 #else /* NO_RECURSE not defined */
573 #define fi i
574 #define fc c
575
576
577 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
578 const uschar *charptr; /* in small blocks of the code. My normal */
579 #endif /* style of coding would have declared */
580 const uschar *callpat; /* them within each of those blocks. */
581 const uschar *data; /* However, in order to accommodate the */
582 const uschar *next; /* version of this code that uses an */
583 USPTR pp; /* external "stack" implemented on the */
584 const uschar *prev; /* heap, it is easier to declare them all */
585 USPTR saved_eptr; /* here, so the declarations can be cut */
586 /* out in a block. The only declarations */
587 recursion_info new_recursive; /* within blocks below are for variables */
588 /* that do not have to be preserved over */
589 BOOL cur_is_word; /* a recursive call to RMATCH(). */
590 BOOL condition;
591 BOOL prev_is_word;
592
593 unsigned long int original_ims;
594
595 #ifdef SUPPORT_UCP
596 int prop_type;
597 int prop_value;
598 int prop_fail_result;
599 int prop_category;
600 int prop_chartype;
601 int prop_script;
602 int oclength;
603 uschar occhars[8];
604 #endif
605
606 int codelink;
607 int ctype;
608 int length;
609 int max;
610 int min;
611 int number;
612 int offset;
613 int op;
614 int save_capture_last;
615 int save_offset1, save_offset2, save_offset3;
616 int stacksave[REC_STACK_SAVE_MAX];
617
618 eptrblock newptrb;
619 #endif /* NO_RECURSE */
620
621 /* These statements are here to stop the compiler complaining about uninitialized
622 variables. */
623
624 #ifdef SUPPORT_UCP
625 prop_value = 0;
626 prop_fail_result = 0;
627 #endif
628
629
630 /* This label is used for tail recursion, which is used in a few cases even
631 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
632 used. Thanks to Ian Taylor for noticing this possibility and sending the
633 original patch. */
634
635 TAIL_RECURSE:
636
637 /* OK, now we can get on with the real code of the function. Recursive calls
638 are specified by the macro RMATCH and RRETURN is used to return. When
639 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
640 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
641 defined). However, RMATCH isn't like a function call because it's quite a
642 complicated macro. It has to be used in one particular way. This shouldn't,
643 however, impact performance when true recursion is being used. */
644
645 #ifdef SUPPORT_UTF8
646 utf8 = md->utf8; /* Local copy of the flag */
647 #else
648 utf8 = FALSE;
649 #endif
650
651 /* First check that we haven't called match() too many times, or that we
652 haven't exceeded the recursive call limit. */
653
654 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
655 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
656
657 original_ims = ims; /* Save for resetting on ')' */
658
659 /* At the start of a group with an unlimited repeat that may match an empty
660 string, the match_cbegroup flag is set. When this is the case, add the current
661 subject pointer to the chain of such remembered pointers, to be checked when we
662 hit the closing ket, in order to break infinite loops that match no characters.
663 When match() is called in other circumstances, don't add to the chain. The
664 match_cbegroup flag must NOT be used with tail recursion, because the memory
665 block that is used is on the stack, so a new one may be required for each
666 match(). */
667
668 if ((flags & match_cbegroup) != 0)
669 {
670 newptrb.epb_saved_eptr = eptr;
671 newptrb.epb_prev = eptrb;
672 eptrb = &newptrb;
673 }
674
675 /* Now start processing the opcodes. */
676
677 for (;;)
678 {
679 minimize = possessive = FALSE;
680 op = *ecode;
681
682 switch(op)
683 {
684 case OP_MARK:
685 markptr = ecode + 2;
686 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
687 ims, eptrb, flags, RM51);
688
689 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
690 argument, and we must check whether that argument matches this MARK's
691 argument. It is passed back in md->start_match_ptr (an overloading of that
692 variable). If it does match, we reset that variable to the current subject
693 position and return MATCH_SKIP. Otherwise, pass back the return code
694 unaltered. */
695
696 if (rrc == MATCH_SKIP_ARG &&
697 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
698 {
699 md->start_match_ptr = eptr;
700 RRETURN(MATCH_SKIP);
701 }
702
703 if (md->mark == NULL) md->mark = markptr;
704 RRETURN(rrc);
705
706 case OP_FAIL:
707 MRRETURN(MATCH_NOMATCH);
708
709 case OP_COMMIT:
710 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
711 ims, eptrb, flags, RM52);
712 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
713 MRRETURN(MATCH_COMMIT);
714
715 case OP_PRUNE:
716 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717 ims, eptrb, flags, RM51);
718 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
719 MRRETURN(MATCH_PRUNE);
720
721 case OP_PRUNE_ARG:
722 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
723 ims, eptrb, flags, RM51);
724 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
725 md->mark = ecode + 2;
726 RRETURN(MATCH_PRUNE);
727
728 case OP_SKIP:
729 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
730 ims, eptrb, flags, RM53);
731 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
732 md->start_match_ptr = eptr; /* Pass back current position */
733 MRRETURN(MATCH_SKIP);
734
735 case OP_SKIP_ARG:
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
737 ims, eptrb, flags, RM53);
738 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
739
740 /* Pass back the current skip name by overloading md->start_match_ptr and
741 returning the special MATCH_SKIP_ARG return code. This will either be
742 caught by a matching MARK, or get to the top, where it is treated the same
743 as PRUNE. */
744
745 md->start_match_ptr = ecode + 2;
746 RRETURN(MATCH_SKIP_ARG);
747
748 case OP_THEN:
749 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
750 ims, eptrb, flags, RM54);
751 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
752 MRRETURN(MATCH_THEN);
753
754 case OP_THEN_ARG:
755 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
756 ims, eptrb, flags, RM54);
757 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
758 md->mark = ecode + 2;
759 RRETURN(MATCH_THEN);
760
761 /* Handle a capturing bracket. If there is space in the offset vector, save
762 the current subject position in the working slot at the top of the vector.
763 We mustn't change the current values of the data slot, because they may be
764 set from a previous iteration of this group, and be referred to by a
765 reference inside the group.
766
767 If the bracket fails to match, we need to restore this value and also the
768 values of the final offsets, in case they were set by a previous iteration
769 of the same bracket.
770
771 If there isn't enough space in the offset vector, treat this as if it were
772 a non-capturing bracket. Don't worry about setting the flag for the error
773 case here; that is handled in the code for KET. */
774
775 case OP_CBRA:
776 case OP_SCBRA:
777 number = GET2(ecode, 1+LINK_SIZE);
778 offset = number << 1;
779
780 #ifdef PCRE_DEBUG
781 printf("start bracket %d\n", number);
782 printf("subject=");
783 pchars(eptr, 16, TRUE, md);
784 printf("\n");
785 #endif
786
787 if (offset < md->offset_max)
788 {
789 save_offset1 = md->offset_vector[offset];
790 save_offset2 = md->offset_vector[offset+1];
791 save_offset3 = md->offset_vector[md->offset_end - number];
792 save_capture_last = md->capture_last;
793
794 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
795 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
796
797 flags = (op == OP_SCBRA)? match_cbegroup : 0;
798 do
799 {
800 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
801 ims, eptrb, flags, RM1);
802 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
803 md->capture_last = save_capture_last;
804 ecode += GET(ecode, 1);
805 }
806 while (*ecode == OP_ALT);
807
808 DPRINTF(("bracket %d failed\n", number));
809
810 md->offset_vector[offset] = save_offset1;
811 md->offset_vector[offset+1] = save_offset2;
812 md->offset_vector[md->offset_end - number] = save_offset3;
813
814 if (rrc != MATCH_THEN) md->mark = markptr;
815 RRETURN(MATCH_NOMATCH);
816 }
817
818 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
819 as a non-capturing bracket. */
820
821 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
822 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
823
824 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
825
826 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
827 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
828
829 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
830 final alternative within the brackets, we would return the result of a
831 recursive call to match() whatever happened. We can reduce stack usage by
832 turning this into a tail recursion, except in the case when match_cbegroup
833 is set.*/
834
835 case OP_BRA:
836 case OP_SBRA:
837 DPRINTF(("start non-capturing bracket\n"));
838 flags = (op >= OP_SBRA)? match_cbegroup : 0;
839 for (;;)
840 {
841 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
842 {
843 if (flags == 0) /* Not a possibly empty group */
844 {
845 ecode += _pcre_OP_lengths[*ecode];
846 DPRINTF(("bracket 0 tail recursion\n"));
847 goto TAIL_RECURSE;
848 }
849
850 /* Possibly empty group; can't use tail recursion. */
851
852 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
853 eptrb, flags, RM48);
854 if (rrc == MATCH_NOMATCH) md->mark = markptr;
855 RRETURN(rrc);
856 }
857
858 /* For non-final alternatives, continue the loop for a NOMATCH result;
859 otherwise return. */
860
861 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
862 eptrb, flags, RM2);
863 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
864 ecode += GET(ecode, 1);
865 }
866 /* Control never reaches here. */
867
868 /* Conditional group: compilation checked that there are no more than
869 two branches. If the condition is false, skipping the first branch takes us
870 past the end if there is only one branch, but that's OK because that is
871 exactly what going to the ket would do. As there is only one branch to be
872 obeyed, we can use tail recursion to avoid using another stack frame. */
873
874 case OP_COND:
875 case OP_SCOND:
876 codelink= GET(ecode, 1);
877
878 /* Because of the way auto-callout works during compile, a callout item is
879 inserted between OP_COND and an assertion condition. */
880
881 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
882 {
883 if (pcre_callout != NULL)
884 {
885 pcre_callout_block cb;
886 cb.version = 1; /* Version 1 of the callout block */
887 cb.callout_number = ecode[LINK_SIZE+2];
888 cb.offset_vector = md->offset_vector;
889 cb.subject = (PCRE_SPTR)md->start_subject;
890 cb.subject_length = md->end_subject - md->start_subject;
891 cb.start_match = mstart - md->start_subject;
892 cb.current_position = eptr - md->start_subject;
893 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
894 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
895 cb.capture_top = offset_top/2;
896 cb.capture_last = md->capture_last;
897 cb.callout_data = md->callout_data;
898 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
899 if (rrc < 0) RRETURN(rrc);
900 }
901 ecode += _pcre_OP_lengths[OP_CALLOUT];
902 }
903
904 condcode = ecode[LINK_SIZE+1];
905
906 /* Now see what the actual condition is */
907
908 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
909 {
910 if (md->recursive == NULL) /* Not recursing => FALSE */
911 {
912 condition = FALSE;
913 ecode += GET(ecode, 1);
914 }
915 else
916 {
917 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
918 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
919
920 /* If the test is for recursion into a specific subpattern, and it is
921 false, but the test was set up by name, scan the table to see if the
922 name refers to any other numbers, and test them. The condition is true
923 if any one is set. */
924
925 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
926 {
927 uschar *slotA = md->name_table;
928 for (i = 0; i < md->name_count; i++)
929 {
930 if (GET2(slotA, 0) == recno) break;
931 slotA += md->name_entry_size;
932 }
933
934 /* Found a name for the number - there can be only one; duplicate
935 names for different numbers are allowed, but not vice versa. First
936 scan down for duplicates. */
937
938 if (i < md->name_count)
939 {
940 uschar *slotB = slotA;
941 while (slotB > md->name_table)
942 {
943 slotB -= md->name_entry_size;
944 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
945 {
946 condition = GET2(slotB, 0) == md->recursive->group_num;
947 if (condition) break;
948 }
949 else break;
950 }
951
952 /* Scan up for duplicates */
953
954 if (!condition)
955 {
956 slotB = slotA;
957 for (i++; i < md->name_count; i++)
958 {
959 slotB += md->name_entry_size;
960 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
961 {
962 condition = GET2(slotB, 0) == md->recursive->group_num;
963 if (condition) break;
964 }
965 else break;
966 }
967 }
968 }
969 }
970
971 /* Choose branch according to the condition */
972
973 ecode += condition? 3 : GET(ecode, 1);
974 }
975 }
976
977 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
978 {
979 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
980 condition = offset < offset_top && md->offset_vector[offset] >= 0;
981
982 /* If the numbered capture is unset, but the reference was by name,
983 scan the table to see if the name refers to any other numbers, and test
984 them. The condition is true if any one is set. This is tediously similar
985 to the code above, but not close enough to try to amalgamate. */
986
987 if (!condition && condcode == OP_NCREF)
988 {
989 int refno = offset >> 1;
990 uschar *slotA = md->name_table;
991
992 for (i = 0; i < md->name_count; i++)
993 {
994 if (GET2(slotA, 0) == refno) break;
995 slotA += md->name_entry_size;
996 }
997
998 /* Found a name for the number - there can be only one; duplicate names
999 for different numbers are allowed, but not vice versa. First scan down
1000 for duplicates. */
1001
1002 if (i < md->name_count)
1003 {
1004 uschar *slotB = slotA;
1005 while (slotB > md->name_table)
1006 {
1007 slotB -= md->name_entry_size;
1008 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1009 {
1010 offset = GET2(slotB, 0) << 1;
1011 condition = offset < offset_top &&
1012 md->offset_vector[offset] >= 0;
1013 if (condition) break;
1014 }
1015 else break;
1016 }
1017
1018 /* Scan up for duplicates */
1019
1020 if (!condition)
1021 {
1022 slotB = slotA;
1023 for (i++; i < md->name_count; i++)
1024 {
1025 slotB += md->name_entry_size;
1026 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1027 {
1028 offset = GET2(slotB, 0) << 1;
1029 condition = offset < offset_top &&
1030 md->offset_vector[offset] >= 0;
1031 if (condition) break;
1032 }
1033 else break;
1034 }
1035 }
1036 }
1037 }
1038
1039 /* Choose branch according to the condition */
1040
1041 ecode += condition? 3 : GET(ecode, 1);
1042 }
1043
1044 else if (condcode == OP_DEF) /* DEFINE - always false */
1045 {
1046 condition = FALSE;
1047 ecode += GET(ecode, 1);
1048 }
1049
1050 /* The condition is an assertion. Call match() to evaluate it - setting
1051 the final argument match_condassert causes it to stop at the end of an
1052 assertion. */
1053
1054 else
1055 {
1056 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1057 match_condassert, RM3);
1058 if (rrc == MATCH_MATCH)
1059 {
1060 condition = TRUE;
1061 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1062 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1063 }
1064 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1065 {
1066 RRETURN(rrc); /* Need braces because of following else */
1067 }
1068 else
1069 {
1070 condition = FALSE;
1071 ecode += codelink;
1072 }
1073 }
1074
1075 /* We are now at the branch that is to be obeyed. As there is only one,
1076 we can use tail recursion to avoid using another stack frame, except when
1077 match_cbegroup is required for an unlimited repeat of a possibly empty
1078 group. If the second alternative doesn't exist, we can just plough on. */
1079
1080 if (condition || *ecode == OP_ALT)
1081 {
1082 ecode += 1 + LINK_SIZE;
1083 if (op == OP_SCOND) /* Possibly empty group */
1084 {
1085 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1086 RRETURN(rrc);
1087 }
1088 else /* Group must match something */
1089 {
1090 flags = 0;
1091 goto TAIL_RECURSE;
1092 }
1093 }
1094 else /* Condition false & no alternative */
1095 {
1096 ecode += 1 + LINK_SIZE;
1097 }
1098 break;
1099
1100
1101 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1102 to close any currently open capturing brackets. */
1103
1104 case OP_CLOSE:
1105 number = GET2(ecode, 1);
1106 offset = number << 1;
1107
1108 #ifdef PCRE_DEBUG
1109 printf("end bracket %d at *ACCEPT", number);
1110 printf("\n");
1111 #endif
1112
1113 md->capture_last = number;
1114 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1115 {
1116 md->offset_vector[offset] =
1117 md->offset_vector[md->offset_end - number];
1118 md->offset_vector[offset+1] = eptr - md->start_subject;
1119 if (offset_top <= offset) offset_top = offset + 2;
1120 }
1121 ecode += 3;
1122 break;
1123
1124
1125 /* End of the pattern, either real or forced. If we are in a top-level
1126 recursion, we should restore the offsets appropriately and continue from
1127 after the call. */
1128
1129 case OP_ACCEPT:
1130 case OP_END:
1131 if (md->recursive != NULL && md->recursive->group_num == 0)
1132 {
1133 recursion_info *rec = md->recursive;
1134 DPRINTF(("End of pattern in a (?0) recursion\n"));
1135 md->recursive = rec->prevrec;
1136 memmove(md->offset_vector, rec->offset_save,
1137 rec->saved_max * sizeof(int));
1138 offset_top = rec->save_offset_top;
1139 ims = original_ims;
1140 ecode = rec->after_call;
1141 break;
1142 }
1143
1144 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1145 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1146 the subject. In both cases, backtracking will then try other alternatives,
1147 if any. */
1148
1149 if (eptr == mstart &&
1150 (md->notempty ||
1151 (md->notempty_atstart &&
1152 mstart == md->start_subject + md->start_offset)))
1153 MRRETURN(MATCH_NOMATCH);
1154
1155 /* Otherwise, we have a match. */
1156
1157 md->end_match_ptr = eptr; /* Record where we ended */
1158 md->end_offset_top = offset_top; /* and how many extracts were taken */
1159 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1160 MRRETURN(MATCH_MATCH);
1161
1162 /* Change option settings */
1163
1164 case OP_OPT:
1165 ims = ecode[1];
1166 ecode += 2;
1167 DPRINTF(("ims set to %02lx\n", ims));
1168 break;
1169
1170 /* Assertion brackets. Check the alternative branches in turn - the
1171 matching won't pass the KET for an assertion. If any one branch matches,
1172 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1173 start of each branch to move the current point backwards, so the code at
1174 this level is identical to the lookahead case. */
1175
1176 case OP_ASSERT:
1177 case OP_ASSERTBACK:
1178 do
1179 {
1180 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1181 RM4);
1182 if (rrc == MATCH_MATCH)
1183 {
1184 mstart = md->start_match_ptr; /* In case \K reset it */
1185 break;
1186 }
1187 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1188 ecode += GET(ecode, 1);
1189 }
1190 while (*ecode == OP_ALT);
1191 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1192
1193 /* If checking an assertion for a condition, return MATCH_MATCH. */
1194
1195 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1196
1197 /* Continue from after the assertion, updating the offsets high water
1198 mark, since extracts may have been taken during the assertion. */
1199
1200 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1201 ecode += 1 + LINK_SIZE;
1202 offset_top = md->end_offset_top;
1203 continue;
1204
1205 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1206 PRUNE, or COMMIT means we must assume failure without checking subsequent
1207 branches. */
1208
1209 case OP_ASSERT_NOT:
1210 case OP_ASSERTBACK_NOT:
1211 do
1212 {
1213 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1214 RM5);
1215 if (rrc == MATCH_MATCH) MRRETURN(MATCH_NOMATCH);
1216 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1217 {
1218 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1219 break;
1220 }
1221 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1222 ecode += GET(ecode,1);
1223 }
1224 while (*ecode == OP_ALT);
1225
1226 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1227
1228 ecode += 1 + LINK_SIZE;
1229 continue;
1230
1231 /* Move the subject pointer back. This occurs only at the start of
1232 each branch of a lookbehind assertion. If we are too close to the start to
1233 move back, this match function fails. When working with UTF-8 we move
1234 back a number of characters, not bytes. */
1235
1236 case OP_REVERSE:
1237 #ifdef SUPPORT_UTF8
1238 if (utf8)
1239 {
1240 i = GET(ecode, 1);
1241 while (i-- > 0)
1242 {
1243 eptr--;
1244 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1245 BACKCHAR(eptr);
1246 }
1247 }
1248 else
1249 #endif
1250
1251 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1252
1253 {
1254 eptr -= GET(ecode, 1);
1255 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1256 }
1257
1258 /* Save the earliest consulted character, then skip to next op code */
1259
1260 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1261 ecode += 1 + LINK_SIZE;
1262 break;
1263
1264 /* The callout item calls an external function, if one is provided, passing
1265 details of the match so far. This is mainly for debugging, though the
1266 function is able to force a failure. */
1267
1268 case OP_CALLOUT:
1269 if (pcre_callout != NULL)
1270 {
1271 pcre_callout_block cb;
1272 cb.version = 1; /* Version 1 of the callout block */
1273 cb.callout_number = ecode[1];
1274 cb.offset_vector = md->offset_vector;
1275 cb.subject = (PCRE_SPTR)md->start_subject;
1276 cb.subject_length = md->end_subject - md->start_subject;
1277 cb.start_match = mstart - md->start_subject;
1278 cb.current_position = eptr - md->start_subject;
1279 cb.pattern_position = GET(ecode, 2);
1280 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1281 cb.capture_top = offset_top/2;
1282 cb.capture_last = md->capture_last;
1283 cb.callout_data = md->callout_data;
1284 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1285 if (rrc < 0) RRETURN(rrc);
1286 }
1287 ecode += 2 + 2*LINK_SIZE;
1288 break;
1289
1290 /* Recursion either matches the current regex, or some subexpression. The
1291 offset data is the offset to the starting bracket from the start of the
1292 whole pattern. (This is so that it works from duplicated subpatterns.)
1293
1294 If there are any capturing brackets started but not finished, we have to
1295 save their starting points and reinstate them after the recursion. However,
1296 we don't know how many such there are (offset_top records the completed
1297 total) so we just have to save all the potential data. There may be up to
1298 65535 such values, which is too large to put on the stack, but using malloc
1299 for small numbers seems expensive. As a compromise, the stack is used when
1300 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1301 is used. A problem is what to do if the malloc fails ... there is no way of
1302 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1303 values on the stack, and accept that the rest may be wrong.
1304
1305 There are also other values that have to be saved. We use a chained
1306 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1307 for the original version of this logic. */
1308
1309 case OP_RECURSE:
1310 {
1311 callpat = md->start_code + GET(ecode, 1);
1312 new_recursive.group_num = (callpat == md->start_code)? 0 :
1313 GET2(callpat, 1 + LINK_SIZE);
1314
1315 /* Add to "recursing stack" */
1316
1317 new_recursive.prevrec = md->recursive;
1318 md->recursive = &new_recursive;
1319
1320 /* Find where to continue from afterwards */
1321
1322 ecode += 1 + LINK_SIZE;
1323 new_recursive.after_call = ecode;
1324
1325 /* Now save the offset data. */
1326
1327 new_recursive.saved_max = md->offset_end;
1328 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1329 new_recursive.offset_save = stacksave;
1330 else
1331 {
1332 new_recursive.offset_save =
1333 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1334 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1335 }
1336
1337 memcpy(new_recursive.offset_save, md->offset_vector,
1338 new_recursive.saved_max * sizeof(int));
1339 new_recursive.save_offset_top = offset_top;
1340
1341 /* OK, now we can do the recursion. For each top-level alternative we
1342 restore the offset and recursion data. */
1343
1344 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1345 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1346 do
1347 {
1348 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1349 md, ims, eptrb, flags, RM6);
1350 if (rrc == MATCH_MATCH)
1351 {
1352 DPRINTF(("Recursion matched\n"));
1353 md->recursive = new_recursive.prevrec;
1354 if (new_recursive.offset_save != stacksave)
1355 (pcre_free)(new_recursive.offset_save);
1356 MRRETURN(MATCH_MATCH);
1357 }
1358 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1359 {
1360 DPRINTF(("Recursion gave error %d\n", rrc));
1361 if (new_recursive.offset_save != stacksave)
1362 (pcre_free)(new_recursive.offset_save);
1363 RRETURN(rrc);
1364 }
1365
1366 md->recursive = &new_recursive;
1367 memcpy(md->offset_vector, new_recursive.offset_save,
1368 new_recursive.saved_max * sizeof(int));
1369 callpat += GET(callpat, 1);
1370 }
1371 while (*callpat == OP_ALT);
1372
1373 DPRINTF(("Recursion didn't match\n"));
1374 md->recursive = new_recursive.prevrec;
1375 if (new_recursive.offset_save != stacksave)
1376 (pcre_free)(new_recursive.offset_save);
1377 MRRETURN(MATCH_NOMATCH);
1378 }
1379 /* Control never reaches here */
1380
1381 /* "Once" brackets are like assertion brackets except that after a match,
1382 the point in the subject string is not moved back. Thus there can never be
1383 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1384 Check the alternative branches in turn - the matching won't pass the KET
1385 for this kind of subpattern. If any one branch matches, we carry on as at
1386 the end of a normal bracket, leaving the subject pointer, but resetting
1387 the start-of-match value in case it was changed by \K. */
1388
1389 case OP_ONCE:
1390 prev = ecode;
1391 saved_eptr = eptr;
1392
1393 do
1394 {
1395 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1396 if (rrc == MATCH_MATCH)
1397 {
1398 mstart = md->start_match_ptr;
1399 break;
1400 }
1401 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1402 ecode += GET(ecode,1);
1403 }
1404 while (*ecode == OP_ALT);
1405
1406 /* If we hit the end of the group (which could be repeated), fail */
1407
1408 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1409
1410 /* Continue as from after the assertion, updating the offsets high water
1411 mark, since extracts may have been taken. */
1412
1413 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1414
1415 offset_top = md->end_offset_top;
1416 eptr = md->end_match_ptr;
1417
1418 /* For a non-repeating ket, just continue at this level. This also
1419 happens for a repeating ket if no characters were matched in the group.
1420 This is the forcible breaking of infinite loops as implemented in Perl
1421 5.005. If there is an options reset, it will get obeyed in the normal
1422 course of events. */
1423
1424 if (*ecode == OP_KET || eptr == saved_eptr)
1425 {
1426 ecode += 1+LINK_SIZE;
1427 break;
1428 }
1429
1430 /* The repeating kets try the rest of the pattern or restart from the
1431 preceding bracket, in the appropriate order. The second "call" of match()
1432 uses tail recursion, to avoid using another stack frame. We need to reset
1433 any options that changed within the bracket before re-running it, so
1434 check the next opcode. */
1435
1436 if (ecode[1+LINK_SIZE] == OP_OPT)
1437 {
1438 ims = (ims & ~PCRE_IMS) | ecode[4];
1439 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1440 }
1441
1442 if (*ecode == OP_KETRMIN)
1443 {
1444 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1445 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1446 ecode = prev;
1447 flags = 0;
1448 goto TAIL_RECURSE;
1449 }
1450 else /* OP_KETRMAX */
1451 {
1452 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1453 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1454 ecode += 1 + LINK_SIZE;
1455 flags = 0;
1456 goto TAIL_RECURSE;
1457 }
1458 /* Control never gets here */
1459
1460 /* An alternation is the end of a branch; scan along to find the end of the
1461 bracketed group and go to there. */
1462
1463 case OP_ALT:
1464 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1465 break;
1466
1467 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1468 indicating that it may occur zero times. It may repeat infinitely, or not
1469 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1470 with fixed upper repeat limits are compiled as a number of copies, with the
1471 optional ones preceded by BRAZERO or BRAMINZERO. */
1472
1473 case OP_BRAZERO:
1474 {
1475 next = ecode+1;
1476 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1477 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1478 do next += GET(next,1); while (*next == OP_ALT);
1479 ecode = next + 1 + LINK_SIZE;
1480 }
1481 break;
1482
1483 case OP_BRAMINZERO:
1484 {
1485 next = ecode+1;
1486 do next += GET(next, 1); while (*next == OP_ALT);
1487 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1488 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1489 ecode++;
1490 }
1491 break;
1492
1493 case OP_SKIPZERO:
1494 {
1495 next = ecode+1;
1496 do next += GET(next,1); while (*next == OP_ALT);
1497 ecode = next + 1 + LINK_SIZE;
1498 }
1499 break;
1500
1501 /* End of a group, repeated or non-repeating. */
1502
1503 case OP_KET:
1504 case OP_KETRMIN:
1505 case OP_KETRMAX:
1506 prev = ecode - GET(ecode, 1);
1507
1508 /* If this was a group that remembered the subject start, in order to break
1509 infinite repeats of empty string matches, retrieve the subject start from
1510 the chain. Otherwise, set it NULL. */
1511
1512 if (*prev >= OP_SBRA)
1513 {
1514 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1515 eptrb = eptrb->epb_prev; /* Backup to previous group */
1516 }
1517 else saved_eptr = NULL;
1518
1519 /* If we are at the end of an assertion group or an atomic group, stop
1520 matching and return MATCH_MATCH, but record the current high water mark for
1521 use by positive assertions. We also need to record the match start in case
1522 it was changed by \K. */
1523
1524 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1525 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1526 *prev == OP_ONCE)
1527 {
1528 md->end_match_ptr = eptr; /* For ONCE */
1529 md->end_offset_top = offset_top;
1530 md->start_match_ptr = mstart;
1531 MRRETURN(MATCH_MATCH);
1532 }
1533
1534 /* For capturing groups we have to check the group number back at the start
1535 and if necessary complete handling an extraction by setting the offsets and
1536 bumping the high water mark. Note that whole-pattern recursion is coded as
1537 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1538 when the OP_END is reached. Other recursion is handled here. */
1539
1540 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1541 {
1542 number = GET2(prev, 1+LINK_SIZE);
1543 offset = number << 1;
1544
1545 #ifdef PCRE_DEBUG
1546 printf("end bracket %d", number);
1547 printf("\n");
1548 #endif
1549
1550 md->capture_last = number;
1551 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1552 {
1553 md->offset_vector[offset] =
1554 md->offset_vector[md->offset_end - number];
1555 md->offset_vector[offset+1] = eptr - md->start_subject;
1556 if (offset_top <= offset) offset_top = offset + 2;
1557 }
1558
1559 /* Handle a recursively called group. Restore the offsets
1560 appropriately and continue from after the call. */
1561
1562 if (md->recursive != NULL && md->recursive->group_num == number)
1563 {
1564 recursion_info *rec = md->recursive;
1565 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1566 md->recursive = rec->prevrec;
1567 memcpy(md->offset_vector, rec->offset_save,
1568 rec->saved_max * sizeof(int));
1569 offset_top = rec->save_offset_top;
1570 ecode = rec->after_call;
1571 ims = original_ims;
1572 break;
1573 }
1574 }
1575
1576 /* For both capturing and non-capturing groups, reset the value of the ims
1577 flags, in case they got changed during the group. */
1578
1579 ims = original_ims;
1580 DPRINTF(("ims reset to %02lx\n", ims));
1581
1582 /* For a non-repeating ket, just continue at this level. This also
1583 happens for a repeating ket if no characters were matched in the group.
1584 This is the forcible breaking of infinite loops as implemented in Perl
1585 5.005. If there is an options reset, it will get obeyed in the normal
1586 course of events. */
1587
1588 if (*ecode == OP_KET || eptr == saved_eptr)
1589 {
1590 ecode += 1 + LINK_SIZE;
1591 break;
1592 }
1593
1594 /* The repeating kets try the rest of the pattern or restart from the
1595 preceding bracket, in the appropriate order. In the second case, we can use
1596 tail recursion to avoid using another stack frame, unless we have an
1597 unlimited repeat of a group that can match an empty string. */
1598
1599 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1600
1601 if (*ecode == OP_KETRMIN)
1602 {
1603 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1604 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1605 if (flags != 0) /* Could match an empty string */
1606 {
1607 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1608 RRETURN(rrc);
1609 }
1610 ecode = prev;
1611 goto TAIL_RECURSE;
1612 }
1613 else /* OP_KETRMAX */
1614 {
1615 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1616 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1617 ecode += 1 + LINK_SIZE;
1618 flags = 0;
1619 goto TAIL_RECURSE;
1620 }
1621 /* Control never gets here */
1622
1623 /* Start of subject unless notbol, or after internal newline if multiline */
1624
1625 case OP_CIRC:
1626 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1627 if ((ims & PCRE_MULTILINE) != 0)
1628 {
1629 if (eptr != md->start_subject &&
1630 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1631 MRRETURN(MATCH_NOMATCH);
1632 ecode++;
1633 break;
1634 }
1635 /* ... else fall through */
1636
1637 /* Start of subject assertion */
1638
1639 case OP_SOD:
1640 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1641 ecode++;
1642 break;
1643
1644 /* Start of match assertion */
1645
1646 case OP_SOM:
1647 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1648 ecode++;
1649 break;
1650
1651 /* Reset the start of match point */
1652
1653 case OP_SET_SOM:
1654 mstart = eptr;
1655 ecode++;
1656 break;
1657
1658 /* Assert before internal newline if multiline, or before a terminating
1659 newline unless endonly is set, else end of subject unless noteol is set. */
1660
1661 case OP_DOLL:
1662 if ((ims & PCRE_MULTILINE) != 0)
1663 {
1664 if (eptr < md->end_subject)
1665 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1666 else
1667 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1668 ecode++;
1669 break;
1670 }
1671 else
1672 {
1673 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1674 if (!md->endonly)
1675 {
1676 if (eptr != md->end_subject &&
1677 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1678 MRRETURN(MATCH_NOMATCH);
1679 ecode++;
1680 break;
1681 }
1682 }
1683 /* ... else fall through for endonly */
1684
1685 /* End of subject assertion (\z) */
1686
1687 case OP_EOD:
1688 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1689 ecode++;
1690 break;
1691
1692 /* End of subject or ending \n assertion (\Z) */
1693
1694 case OP_EODN:
1695 if (eptr != md->end_subject &&
1696 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1697 MRRETURN(MATCH_NOMATCH);
1698 ecode++;
1699 break;
1700
1701 /* Word boundary assertions */
1702
1703 case OP_NOT_WORD_BOUNDARY:
1704 case OP_WORD_BOUNDARY:
1705 {
1706
1707 /* Find out if the previous and current characters are "word" characters.
1708 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1709 be "non-word" characters. Remember the earliest consulted character for
1710 partial matching. */
1711
1712 #ifdef SUPPORT_UTF8
1713 if (utf8)
1714 {
1715 if (eptr == md->start_subject) prev_is_word = FALSE; else
1716 {
1717 USPTR lastptr = eptr - 1;
1718 while((*lastptr & 0xc0) == 0x80) lastptr--;
1719 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1720 GETCHAR(c, lastptr);
1721 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1722 }
1723 if (eptr >= md->end_subject)
1724 {
1725 SCHECK_PARTIAL();
1726 cur_is_word = FALSE;
1727 }
1728 else
1729 {
1730 GETCHAR(c, eptr);
1731 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1732 }
1733 }
1734 else
1735 #endif
1736
1737 /* Not in UTF-8 mode */
1738
1739 {
1740 if (eptr == md->start_subject) prev_is_word = FALSE; else
1741 {
1742 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1743 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1744 }
1745 if (eptr >= md->end_subject)
1746 {
1747 SCHECK_PARTIAL();
1748 cur_is_word = FALSE;
1749 }
1750 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1751 }
1752
1753 /* Now see if the situation is what we want */
1754
1755 if ((*ecode++ == OP_WORD_BOUNDARY)?
1756 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1757 MRRETURN(MATCH_NOMATCH);
1758 }
1759 break;
1760
1761 /* Match a single character type; inline for speed */
1762
1763 case OP_ANY:
1764 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1765 /* Fall through */
1766
1767 case OP_ALLANY:
1768 if (eptr++ >= md->end_subject)
1769 {
1770 SCHECK_PARTIAL();
1771 MRRETURN(MATCH_NOMATCH);
1772 }
1773 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1774 ecode++;
1775 break;
1776
1777 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1778 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1779
1780 case OP_ANYBYTE:
1781 if (eptr++ >= md->end_subject)
1782 {
1783 SCHECK_PARTIAL();
1784 MRRETURN(MATCH_NOMATCH);
1785 }
1786 ecode++;
1787 break;
1788
1789 case OP_NOT_DIGIT:
1790 if (eptr >= md->end_subject)
1791 {
1792 SCHECK_PARTIAL();
1793 MRRETURN(MATCH_NOMATCH);
1794 }
1795 GETCHARINCTEST(c, eptr);
1796 if (
1797 #ifdef SUPPORT_UTF8
1798 c < 256 &&
1799 #endif
1800 (md->ctypes[c] & ctype_digit) != 0
1801 )
1802 MRRETURN(MATCH_NOMATCH);
1803 ecode++;
1804 break;
1805
1806 case OP_DIGIT:
1807 if (eptr >= md->end_subject)
1808 {
1809 SCHECK_PARTIAL();
1810 MRRETURN(MATCH_NOMATCH);
1811 }
1812 GETCHARINCTEST(c, eptr);
1813 if (
1814 #ifdef SUPPORT_UTF8
1815 c >= 256 ||
1816 #endif
1817 (md->ctypes[c] & ctype_digit) == 0
1818 )
1819 MRRETURN(MATCH_NOMATCH);
1820 ecode++;
1821 break;
1822
1823 case OP_NOT_WHITESPACE:
1824 if (eptr >= md->end_subject)
1825 {
1826 SCHECK_PARTIAL();
1827 MRRETURN(MATCH_NOMATCH);
1828 }
1829 GETCHARINCTEST(c, eptr);
1830 if (
1831 #ifdef SUPPORT_UTF8
1832 c < 256 &&
1833 #endif
1834 (md->ctypes[c] & ctype_space) != 0
1835 )
1836 MRRETURN(MATCH_NOMATCH);
1837 ecode++;
1838 break;
1839
1840 case OP_WHITESPACE:
1841 if (eptr >= md->end_subject)
1842 {
1843 SCHECK_PARTIAL();
1844 MRRETURN(MATCH_NOMATCH);
1845 }
1846 GETCHARINCTEST(c, eptr);
1847 if (
1848 #ifdef SUPPORT_UTF8
1849 c >= 256 ||
1850 #endif
1851 (md->ctypes[c] & ctype_space) == 0
1852 )
1853 MRRETURN(MATCH_NOMATCH);
1854 ecode++;
1855 break;
1856
1857 case OP_NOT_WORDCHAR:
1858 if (eptr >= md->end_subject)
1859 {
1860 SCHECK_PARTIAL();
1861 MRRETURN(MATCH_NOMATCH);
1862 }
1863 GETCHARINCTEST(c, eptr);
1864 if (
1865 #ifdef SUPPORT_UTF8
1866 c < 256 &&
1867 #endif
1868 (md->ctypes[c] & ctype_word) != 0
1869 )
1870 MRRETURN(MATCH_NOMATCH);
1871 ecode++;
1872 break;
1873
1874 case OP_WORDCHAR:
1875 if (eptr >= md->end_subject)
1876 {
1877 SCHECK_PARTIAL();
1878 MRRETURN(MATCH_NOMATCH);
1879 }
1880 GETCHARINCTEST(c, eptr);
1881 if (
1882 #ifdef SUPPORT_UTF8
1883 c >= 256 ||
1884 #endif
1885 (md->ctypes[c] & ctype_word) == 0
1886 )
1887 MRRETURN(MATCH_NOMATCH);
1888 ecode++;
1889 break;
1890
1891 case OP_ANYNL:
1892 if (eptr >= md->end_subject)
1893 {
1894 SCHECK_PARTIAL();
1895 MRRETURN(MATCH_NOMATCH);
1896 }
1897 GETCHARINCTEST(c, eptr);
1898 switch(c)
1899 {
1900 default: MRRETURN(MATCH_NOMATCH);
1901 case 0x000d:
1902 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1903 break;
1904
1905 case 0x000a:
1906 break;
1907
1908 case 0x000b:
1909 case 0x000c:
1910 case 0x0085:
1911 case 0x2028:
1912 case 0x2029:
1913 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1914 break;
1915 }
1916 ecode++;
1917 break;
1918
1919 case OP_NOT_HSPACE:
1920 if (eptr >= md->end_subject)
1921 {
1922 SCHECK_PARTIAL();
1923 MRRETURN(MATCH_NOMATCH);
1924 }
1925 GETCHARINCTEST(c, eptr);
1926 switch(c)
1927 {
1928 default: break;
1929 case 0x09: /* HT */
1930 case 0x20: /* SPACE */
1931 case 0xa0: /* NBSP */
1932 case 0x1680: /* OGHAM SPACE MARK */
1933 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1934 case 0x2000: /* EN QUAD */
1935 case 0x2001: /* EM QUAD */
1936 case 0x2002: /* EN SPACE */
1937 case 0x2003: /* EM SPACE */
1938 case 0x2004: /* THREE-PER-EM SPACE */
1939 case 0x2005: /* FOUR-PER-EM SPACE */
1940 case 0x2006: /* SIX-PER-EM SPACE */
1941 case 0x2007: /* FIGURE SPACE */
1942 case 0x2008: /* PUNCTUATION SPACE */
1943 case 0x2009: /* THIN SPACE */
1944 case 0x200A: /* HAIR SPACE */
1945 case 0x202f: /* NARROW NO-BREAK SPACE */
1946 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1947 case 0x3000: /* IDEOGRAPHIC SPACE */
1948 MRRETURN(MATCH_NOMATCH);
1949 }
1950 ecode++;
1951 break;
1952
1953 case OP_HSPACE:
1954 if (eptr >= md->end_subject)
1955 {
1956 SCHECK_PARTIAL();
1957 MRRETURN(MATCH_NOMATCH);
1958 }
1959 GETCHARINCTEST(c, eptr);
1960 switch(c)
1961 {
1962 default: MRRETURN(MATCH_NOMATCH);
1963 case 0x09: /* HT */
1964 case 0x20: /* SPACE */
1965 case 0xa0: /* NBSP */
1966 case 0x1680: /* OGHAM SPACE MARK */
1967 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1968 case 0x2000: /* EN QUAD */
1969 case 0x2001: /* EM QUAD */
1970 case 0x2002: /* EN SPACE */
1971 case 0x2003: /* EM SPACE */
1972 case 0x2004: /* THREE-PER-EM SPACE */
1973 case 0x2005: /* FOUR-PER-EM SPACE */
1974 case 0x2006: /* SIX-PER-EM SPACE */
1975 case 0x2007: /* FIGURE SPACE */
1976 case 0x2008: /* PUNCTUATION SPACE */
1977 case 0x2009: /* THIN SPACE */
1978 case 0x200A: /* HAIR SPACE */
1979 case 0x202f: /* NARROW NO-BREAK SPACE */
1980 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1981 case 0x3000: /* IDEOGRAPHIC SPACE */
1982 break;
1983 }
1984 ecode++;
1985 break;
1986
1987 case OP_NOT_VSPACE:
1988 if (eptr >= md->end_subject)
1989 {
1990 SCHECK_PARTIAL();
1991 MRRETURN(MATCH_NOMATCH);
1992 }
1993 GETCHARINCTEST(c, eptr);
1994 switch(c)
1995 {
1996 default: break;
1997 case 0x0a: /* LF */
1998 case 0x0b: /* VT */
1999 case 0x0c: /* FF */
2000 case 0x0d: /* CR */
2001 case 0x85: /* NEL */
2002 case 0x2028: /* LINE SEPARATOR */
2003 case 0x2029: /* PARAGRAPH SEPARATOR */
2004 MRRETURN(MATCH_NOMATCH);
2005 }
2006 ecode++;
2007 break;
2008
2009 case OP_VSPACE:
2010 if (eptr >= md->end_subject)
2011 {
2012 SCHECK_PARTIAL();
2013 MRRETURN(MATCH_NOMATCH);
2014 }
2015 GETCHARINCTEST(c, eptr);
2016 switch(c)
2017 {
2018 default: MRRETURN(MATCH_NOMATCH);
2019 case 0x0a: /* LF */
2020 case 0x0b: /* VT */
2021 case 0x0c: /* FF */
2022 case 0x0d: /* CR */
2023 case 0x85: /* NEL */
2024 case 0x2028: /* LINE SEPARATOR */
2025 case 0x2029: /* PARAGRAPH SEPARATOR */
2026 break;
2027 }
2028 ecode++;
2029 break;
2030
2031 #ifdef SUPPORT_UCP
2032 /* Check the next character by Unicode property. We will get here only
2033 if the support is in the binary; otherwise a compile-time error occurs. */
2034
2035 case OP_PROP:
2036 case OP_NOTPROP:
2037 if (eptr >= md->end_subject)
2038 {
2039 SCHECK_PARTIAL();
2040 MRRETURN(MATCH_NOMATCH);
2041 }
2042 GETCHARINCTEST(c, eptr);
2043 {
2044 const ucd_record *prop = GET_UCD(c);
2045
2046 switch(ecode[1])
2047 {
2048 case PT_ANY:
2049 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2050 break;
2051
2052 case PT_LAMP:
2053 if ((prop->chartype == ucp_Lu ||
2054 prop->chartype == ucp_Ll ||
2055 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2056 MRRETURN(MATCH_NOMATCH);
2057 break;
2058
2059 case PT_GC:
2060 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2061 MRRETURN(MATCH_NOMATCH);
2062 break;
2063
2064 case PT_PC:
2065 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2066 MRRETURN(MATCH_NOMATCH);
2067 break;
2068
2069 case PT_SC:
2070 if ((ecode[2] != prop->script) == (op == OP_PROP))
2071 MRRETURN(MATCH_NOMATCH);
2072 break;
2073
2074 default:
2075 RRETURN(PCRE_ERROR_INTERNAL);
2076 }
2077
2078 ecode += 3;
2079 }
2080 break;
2081
2082 /* Match an extended Unicode sequence. We will get here only if the support
2083 is in the binary; otherwise a compile-time error occurs. */
2084
2085 case OP_EXTUNI:
2086 if (eptr >= md->end_subject)
2087 {
2088 SCHECK_PARTIAL();
2089 MRRETURN(MATCH_NOMATCH);
2090 }
2091 GETCHARINCTEST(c, eptr);
2092 {
2093 int category = UCD_CATEGORY(c);
2094 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2095 while (eptr < md->end_subject)
2096 {
2097 int len = 1;
2098 if (!utf8) c = *eptr; else
2099 {
2100 GETCHARLEN(c, eptr, len);
2101 }
2102 category = UCD_CATEGORY(c);
2103 if (category != ucp_M) break;
2104 eptr += len;
2105 }
2106 }
2107 ecode++;
2108 break;
2109 #endif
2110
2111
2112 /* Match a back reference, possibly repeatedly. Look past the end of the
2113 item to see if there is repeat information following. The code is similar
2114 to that for character classes, but repeated for efficiency. Then obey
2115 similar code to character type repeats - written out again for speed.
2116 However, if the referenced string is the empty string, always treat
2117 it as matched, any number of times (otherwise there could be infinite
2118 loops). */
2119
2120 case OP_REF:
2121 {
2122 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2123 ecode += 3;
2124
2125 /* If the reference is unset, there are two possibilities:
2126
2127 (a) In the default, Perl-compatible state, set the length to be longer
2128 than the amount of subject left; this ensures that every attempt at a
2129 match fails. We can't just fail here, because of the possibility of
2130 quantifiers with zero minima.
2131
2132 (b) If the JavaScript compatibility flag is set, set the length to zero
2133 so that the back reference matches an empty string.
2134
2135 Otherwise, set the length to the length of what was matched by the
2136 referenced subpattern. */
2137
2138 if (offset >= offset_top || md->offset_vector[offset] < 0)
2139 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2140 else
2141 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2142
2143 /* Set up for repetition, or handle the non-repeated case */
2144
2145 switch (*ecode)
2146 {
2147 case OP_CRSTAR:
2148 case OP_CRMINSTAR:
2149 case OP_CRPLUS:
2150 case OP_CRMINPLUS:
2151 case OP_CRQUERY:
2152 case OP_CRMINQUERY:
2153 c = *ecode++ - OP_CRSTAR;
2154 minimize = (c & 1) != 0;
2155 min = rep_min[c]; /* Pick up values from tables; */
2156 max = rep_max[c]; /* zero for max => infinity */
2157 if (max == 0) max = INT_MAX;
2158 break;
2159
2160 case OP_CRRANGE:
2161 case OP_CRMINRANGE:
2162 minimize = (*ecode == OP_CRMINRANGE);
2163 min = GET2(ecode, 1);
2164 max = GET2(ecode, 3);
2165 if (max == 0) max = INT_MAX;
2166 ecode += 5;
2167 break;
2168
2169 default: /* No repeat follows */
2170 if (!match_ref(offset, eptr, length, md, ims))
2171 {
2172 CHECK_PARTIAL();
2173 MRRETURN(MATCH_NOMATCH);
2174 }
2175 eptr += length;
2176 continue; /* With the main loop */
2177 }
2178
2179 /* If the length of the reference is zero, just continue with the
2180 main loop. */
2181
2182 if (length == 0) continue;
2183
2184 /* First, ensure the minimum number of matches are present. We get back
2185 the length of the reference string explicitly rather than passing the
2186 address of eptr, so that eptr can be a register variable. */
2187
2188 for (i = 1; i <= min; i++)
2189 {
2190 if (!match_ref(offset, eptr, length, md, ims))
2191 {
2192 CHECK_PARTIAL();
2193 MRRETURN(MATCH_NOMATCH);
2194 }
2195 eptr += length;
2196 }
2197
2198 /* If min = max, continue at the same level without recursion.
2199 They are not both allowed to be zero. */
2200
2201 if (min == max) continue;
2202
2203 /* If minimizing, keep trying and advancing the pointer */
2204
2205 if (minimize)
2206 {
2207 for (fi = min;; fi++)
2208 {
2209 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2211 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2212 if (!match_ref(offset, eptr, length, md, ims))
2213 {
2214 CHECK_PARTIAL();
2215 MRRETURN(MATCH_NOMATCH);
2216 }
2217 eptr += length;
2218 }
2219 /* Control never gets here */
2220 }
2221
2222 /* If maximizing, find the longest string and work backwards */
2223
2224 else
2225 {
2226 pp = eptr;
2227 for (i = min; i < max; i++)
2228 {
2229 if (!match_ref(offset, eptr, length, md, ims))
2230 {
2231 CHECK_PARTIAL();
2232 break;
2233 }
2234 eptr += length;
2235 }
2236 while (eptr >= pp)
2237 {
2238 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2239 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2240 eptr -= length;
2241 }
2242 MRRETURN(MATCH_NOMATCH);
2243 }
2244 }
2245 /* Control never gets here */
2246
2247 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2248 used when all the characters in the class have values in the range 0-255,
2249 and either the matching is caseful, or the characters are in the range
2250 0-127 when UTF-8 processing is enabled. The only difference between
2251 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2252 encountered.
2253
2254 First, look past the end of the item to see if there is repeat information
2255 following. Then obey similar code to character type repeats - written out
2256 again for speed. */
2257
2258 case OP_NCLASS:
2259 case OP_CLASS:
2260 {
2261 data = ecode + 1; /* Save for matching */
2262 ecode += 33; /* Advance past the item */
2263
2264 switch (*ecode)
2265 {
2266 case OP_CRSTAR:
2267 case OP_CRMINSTAR:
2268 case OP_CRPLUS:
2269 case OP_CRMINPLUS:
2270 case OP_CRQUERY:
2271 case OP_CRMINQUERY:
2272 c = *ecode++ - OP_CRSTAR;
2273 minimize = (c & 1) != 0;
2274 min = rep_min[c]; /* Pick up values from tables; */
2275 max = rep_max[c]; /* zero for max => infinity */
2276 if (max == 0) max = INT_MAX;
2277 break;
2278
2279 case OP_CRRANGE:
2280 case OP_CRMINRANGE:
2281 minimize = (*ecode == OP_CRMINRANGE);
2282 min = GET2(ecode, 1);
2283 max = GET2(ecode, 3);
2284 if (max == 0) max = INT_MAX;
2285 ecode += 5;
2286 break;
2287
2288 default: /* No repeat follows */
2289 min = max = 1;
2290 break;
2291 }
2292
2293 /* First, ensure the minimum number of matches are present. */
2294
2295 #ifdef SUPPORT_UTF8
2296 /* UTF-8 mode */
2297 if (utf8)
2298 {
2299 for (i = 1; i <= min; i++)
2300 {
2301 if (eptr >= md->end_subject)
2302 {
2303 SCHECK_PARTIAL();
2304 MRRETURN(MATCH_NOMATCH);
2305 }
2306 GETCHARINC(c, eptr);
2307 if (c > 255)
2308 {
2309 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2310 }
2311 else
2312 {
2313 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2314 }
2315 }
2316 }
2317 else
2318 #endif
2319 /* Not UTF-8 mode */
2320 {
2321 for (i = 1; i <= min; i++)
2322 {
2323 if (eptr >= md->end_subject)
2324 {
2325 SCHECK_PARTIAL();
2326 MRRETURN(MATCH_NOMATCH);
2327 }
2328 c = *eptr++;
2329 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2330 }
2331 }
2332
2333 /* If max == min we can continue with the main loop without the
2334 need to recurse. */
2335
2336 if (min == max) continue;
2337
2338 /* If minimizing, keep testing the rest of the expression and advancing
2339 the pointer while it matches the class. */
2340
2341 if (minimize)
2342 {
2343 #ifdef SUPPORT_UTF8
2344 /* UTF-8 mode */
2345 if (utf8)
2346 {
2347 for (fi = min;; fi++)
2348 {
2349 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2350 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2351 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2352 if (eptr >= md->end_subject)
2353 {
2354 SCHECK_PARTIAL();
2355 MRRETURN(MATCH_NOMATCH);
2356 }
2357 GETCHARINC(c, eptr);
2358 if (c > 255)
2359 {
2360 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2361 }
2362 else
2363 {
2364 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2365 }
2366 }
2367 }
2368 else
2369 #endif
2370 /* Not UTF-8 mode */
2371 {
2372 for (fi = min;; fi++)
2373 {
2374 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2375 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2376 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2377 if (eptr >= md->end_subject)
2378 {
2379 SCHECK_PARTIAL();
2380 MRRETURN(MATCH_NOMATCH);
2381 }
2382 c = *eptr++;
2383 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2384 }
2385 }
2386 /* Control never gets here */
2387 }
2388
2389 /* If maximizing, find the longest possible run, then work backwards. */
2390
2391 else
2392 {
2393 pp = eptr;
2394
2395 #ifdef SUPPORT_UTF8
2396 /* UTF-8 mode */
2397 if (utf8)
2398 {
2399 for (i = min; i < max; i++)
2400 {
2401 int len = 1;
2402 if (eptr >= md->end_subject)
2403 {
2404 SCHECK_PARTIAL();
2405 break;
2406 }
2407 GETCHARLEN(c, eptr, len);
2408 if (c > 255)
2409 {
2410 if (op == OP_CLASS) break;
2411 }
2412 else
2413 {
2414 if ((data[c/8] & (1 << (c&7))) == 0) break;
2415 }
2416 eptr += len;
2417 }
2418 for (;;)
2419 {
2420 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2421 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2422 if (eptr-- == pp) break; /* Stop if tried at original pos */
2423 BACKCHAR(eptr);
2424 }
2425 }
2426 else
2427 #endif
2428 /* Not UTF-8 mode */
2429 {
2430 for (i = min; i < max; i++)
2431 {
2432 if (eptr >= md->end_subject)
2433 {
2434 SCHECK_PARTIAL();
2435 break;
2436 }
2437 c = *eptr;
2438 if ((data[c/8] & (1 << (c&7))) == 0) break;
2439 eptr++;
2440 }
2441 while (eptr >= pp)
2442 {
2443 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2445 eptr--;
2446 }
2447 }
2448
2449 MRRETURN(MATCH_NOMATCH);
2450 }
2451 }
2452 /* Control never gets here */
2453
2454
2455 /* Match an extended character class. This opcode is encountered only
2456 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2457 mode, because Unicode properties are supported in non-UTF-8 mode. */
2458
2459 #ifdef SUPPORT_UTF8
2460 case OP_XCLASS:
2461 {
2462 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2463 ecode += GET(ecode, 1); /* Advance past the item */
2464
2465 switch (*ecode)
2466 {
2467 case OP_CRSTAR:
2468 case OP_CRMINSTAR:
2469 case OP_CRPLUS:
2470 case OP_CRMINPLUS:
2471 case OP_CRQUERY:
2472 case OP_CRMINQUERY:
2473 c = *ecode++ - OP_CRSTAR;
2474 minimize = (c & 1) != 0;
2475 min = rep_min[c]; /* Pick up values from tables; */
2476 max = rep_max[c]; /* zero for max => infinity */
2477 if (max == 0) max = INT_MAX;
2478 break;
2479
2480 case OP_CRRANGE:
2481 case OP_CRMINRANGE:
2482 minimize = (*ecode == OP_CRMINRANGE);
2483 min = GET2(ecode, 1);
2484 max = GET2(ecode, 3);
2485 if (max == 0) max = INT_MAX;
2486 ecode += 5;
2487 break;
2488
2489 default: /* No repeat follows */
2490 min = max = 1;
2491 break;
2492 }
2493
2494 /* First, ensure the minimum number of matches are present. */
2495
2496 for (i = 1; i <= min; i++)
2497 {
2498 if (eptr >= md->end_subject)
2499 {
2500 SCHECK_PARTIAL();
2501 MRRETURN(MATCH_NOMATCH);
2502 }
2503 GETCHARINCTEST(c, eptr);
2504 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2505 }
2506
2507 /* If max == min we can continue with the main loop without the
2508 need to recurse. */
2509
2510 if (min == max) continue;
2511
2512 /* If minimizing, keep testing the rest of the expression and advancing
2513 the pointer while it matches the class. */
2514
2515 if (minimize)
2516 {
2517 for (fi = min;; fi++)
2518 {
2519 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2520 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2521 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2522 if (eptr >= md->end_subject)
2523 {
2524 SCHECK_PARTIAL();
2525 MRRETURN(MATCH_NOMATCH);
2526 }
2527 GETCHARINCTEST(c, eptr);
2528 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2529 }
2530 /* Control never gets here */
2531 }
2532
2533 /* If maximizing, find the longest possible run, then work backwards. */
2534
2535 else
2536 {
2537 pp = eptr;
2538 for (i = min; i < max; i++)
2539 {
2540 int len = 1;
2541 if (eptr >= md->end_subject)
2542 {
2543 SCHECK_PARTIAL();
2544 break;
2545 }
2546 GETCHARLENTEST(c, eptr, len);
2547 if (!_pcre_xclass(c, data)) break;
2548 eptr += len;
2549 }
2550 for(;;)
2551 {
2552 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2553 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2554 if (eptr-- == pp) break; /* Stop if tried at original pos */
2555 if (utf8) BACKCHAR(eptr);
2556 }
2557 MRRETURN(MATCH_NOMATCH);
2558 }
2559
2560 /* Control never gets here */
2561 }
2562 #endif /* End of XCLASS */
2563
2564 /* Match a single character, casefully */
2565
2566 case OP_CHAR:
2567 #ifdef SUPPORT_UTF8
2568 if (utf8)
2569 {
2570 length = 1;
2571 ecode++;
2572 GETCHARLEN(fc, ecode, length);
2573 if (length > md->end_subject - eptr)
2574 {
2575 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2576 MRRETURN(MATCH_NOMATCH);
2577 }
2578 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2579 }
2580 else
2581 #endif
2582
2583 /* Non-UTF-8 mode */
2584 {
2585 if (md->end_subject - eptr < 1)
2586 {
2587 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2588 MRRETURN(MATCH_NOMATCH);
2589 }
2590 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2591 ecode += 2;
2592 }
2593 break;
2594
2595 /* Match a single character, caselessly */
2596
2597 case OP_CHARNC:
2598 #ifdef SUPPORT_UTF8
2599 if (utf8)
2600 {
2601 length = 1;
2602 ecode++;
2603 GETCHARLEN(fc, ecode, length);
2604
2605 if (length > md->end_subject - eptr)
2606 {
2607 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2608 MRRETURN(MATCH_NOMATCH);
2609 }
2610
2611 /* If the pattern character's value is < 128, we have only one byte, and
2612 can use the fast lookup table. */
2613
2614 if (fc < 128)
2615 {
2616 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2617 }
2618
2619 /* Otherwise we must pick up the subject character */
2620
2621 else
2622 {
2623 unsigned int dc;
2624 GETCHARINC(dc, eptr);
2625 ecode += length;
2626
2627 /* If we have Unicode property support, we can use it to test the other
2628 case of the character, if there is one. */
2629
2630 if (fc != dc)
2631 {
2632 #ifdef SUPPORT_UCP
2633 if (dc != UCD_OTHERCASE(fc))
2634 #endif
2635 MRRETURN(MATCH_NOMATCH);
2636 }
2637 }
2638 }
2639 else
2640 #endif /* SUPPORT_UTF8 */
2641
2642 /* Non-UTF-8 mode */
2643 {
2644 if (md->end_subject - eptr < 1)
2645 {
2646 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2647 MRRETURN(MATCH_NOMATCH);
2648 }
2649 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2650 ecode += 2;
2651 }
2652 break;
2653
2654 /* Match a single character repeatedly. */
2655
2656 case OP_EXACT:
2657 min = max = GET2(ecode, 1);
2658 ecode += 3;
2659 goto REPEATCHAR;
2660
2661 case OP_POSUPTO:
2662 possessive = TRUE;
2663 /* Fall through */
2664
2665 case OP_UPTO:
2666 case OP_MINUPTO:
2667 min = 0;
2668 max = GET2(ecode, 1);
2669 minimize = *ecode == OP_MINUPTO;
2670 ecode += 3;
2671 goto REPEATCHAR;
2672
2673 case OP_POSSTAR:
2674 possessive = TRUE;
2675 min = 0;
2676 max = INT_MAX;
2677 ecode++;
2678 goto REPEATCHAR;
2679
2680 case OP_POSPLUS:
2681 possessive = TRUE;
2682 min = 1;
2683 max = INT_MAX;
2684 ecode++;
2685 goto REPEATCHAR;
2686
2687 case OP_POSQUERY:
2688 possessive = TRUE;
2689 min = 0;
2690 max = 1;
2691 ecode++;
2692 goto REPEATCHAR;
2693
2694 case OP_STAR:
2695 case OP_MINSTAR:
2696 case OP_PLUS:
2697 case OP_MINPLUS:
2698 case OP_QUERY:
2699 case OP_MINQUERY:
2700 c = *ecode++ - OP_STAR;
2701 minimize = (c & 1) != 0;
2702
2703 min = rep_min[c]; /* Pick up values from tables; */
2704 max = rep_max[c]; /* zero for max => infinity */
2705 if (max == 0) max = INT_MAX;
2706
2707 /* Common code for all repeated single-character matches. */
2708
2709 REPEATCHAR:
2710 #ifdef SUPPORT_UTF8
2711 if (utf8)
2712 {
2713 length = 1;
2714 charptr = ecode;
2715 GETCHARLEN(fc, ecode, length);
2716 ecode += length;
2717
2718 /* Handle multibyte character matching specially here. There is
2719 support for caseless matching if UCP support is present. */
2720
2721 if (length > 1)
2722 {
2723 #ifdef SUPPORT_UCP
2724 unsigned int othercase;
2725 if ((ims & PCRE_CASELESS) != 0 &&
2726 (othercase = UCD_OTHERCASE(fc)) != fc)
2727 oclength = _pcre_ord2utf8(othercase, occhars);
2728 else oclength = 0;
2729 #endif /* SUPPORT_UCP */
2730
2731 for (i = 1; i <= min; i++)
2732 {
2733 if (eptr <= md->end_subject - length &&
2734 memcmp(eptr, charptr, length) == 0) eptr += length;
2735 #ifdef SUPPORT_UCP
2736 else if (oclength > 0 &&
2737 eptr <= md->end_subject - oclength &&
2738 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2739 #endif /* SUPPORT_UCP */
2740 else
2741 {
2742 CHECK_PARTIAL();
2743 MRRETURN(MATCH_NOMATCH);
2744 }
2745 }
2746
2747 if (min == max) continue;
2748
2749 if (minimize)
2750 {
2751 for (fi = min;; fi++)
2752 {
2753 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2754 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2755 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2756 if (eptr <= md->end_subject - length &&
2757 memcmp(eptr, charptr, length) == 0) eptr += length;
2758 #ifdef SUPPORT_UCP
2759 else if (oclength > 0 &&
2760 eptr <= md->end_subject - oclength &&
2761 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2762 #endif /* SUPPORT_UCP */
2763 else
2764 {
2765 CHECK_PARTIAL();
2766 MRRETURN(MATCH_NOMATCH);
2767 }
2768 }
2769 /* Control never gets here */
2770 }
2771
2772 else /* Maximize */
2773 {
2774 pp = eptr;
2775 for (i = min; i < max; i++)
2776 {
2777 if (eptr <= md->end_subject - length &&
2778 memcmp(eptr, charptr, length) == 0) eptr += length;
2779 #ifdef SUPPORT_UCP
2780 else if (oclength > 0 &&
2781 eptr <= md->end_subject - oclength &&
2782 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2783 #endif /* SUPPORT_UCP */
2784 else
2785 {
2786 CHECK_PARTIAL();
2787 break;
2788 }
2789 }
2790
2791 if (possessive) continue;
2792
2793 for(;;)
2794 {
2795 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2796 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2797 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2798 #ifdef SUPPORT_UCP
2799 eptr--;
2800 BACKCHAR(eptr);
2801 #else /* without SUPPORT_UCP */
2802 eptr -= length;
2803 #endif /* SUPPORT_UCP */
2804 }
2805 }
2806 /* Control never gets here */
2807 }
2808
2809 /* If the length of a UTF-8 character is 1, we fall through here, and
2810 obey the code as for non-UTF-8 characters below, though in this case the
2811 value of fc will always be < 128. */
2812 }
2813 else
2814 #endif /* SUPPORT_UTF8 */
2815
2816 /* When not in UTF-8 mode, load a single-byte character. */
2817
2818 fc = *ecode++;
2819
2820 /* The value of fc at this point is always less than 256, though we may or
2821 may not be in UTF-8 mode. The code is duplicated for the caseless and
2822 caseful cases, for speed, since matching characters is likely to be quite
2823 common. First, ensure the minimum number of matches are present. If min =
2824 max, continue at the same level without recursing. Otherwise, if
2825 minimizing, keep trying the rest of the expression and advancing one
2826 matching character if failing, up to the maximum. Alternatively, if
2827 maximizing, find the maximum number of characters and work backwards. */
2828
2829 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2830 max, eptr));
2831
2832 if ((ims & PCRE_CASELESS) != 0)
2833 {
2834 fc = md->lcc[fc];
2835 for (i = 1; i <= min; i++)
2836 {
2837 if (eptr >= md->end_subject)
2838 {
2839 SCHECK_PARTIAL();
2840 MRRETURN(MATCH_NOMATCH);
2841 }
2842 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2843 }
2844 if (min == max) continue;
2845 if (minimize)
2846 {
2847 for (fi = min;; fi++)
2848 {
2849 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2850 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2851 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2852 if (eptr >= md->end_subject)
2853 {
2854 SCHECK_PARTIAL();
2855 MRRETURN(MATCH_NOMATCH);
2856 }
2857 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2858 }
2859 /* Control never gets here */
2860 }
2861 else /* Maximize */
2862 {
2863 pp = eptr;
2864 for (i = min; i < max; i++)
2865 {
2866 if (eptr >= md->end_subject)
2867 {
2868 SCHECK_PARTIAL();
2869 break;
2870 }
2871 if (fc != md->lcc[*eptr]) break;
2872 eptr++;
2873 }
2874
2875 if (possessive) continue;
2876
2877 while (eptr >= pp)
2878 {
2879 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2880 eptr--;
2881 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2882 }
2883 MRRETURN(MATCH_NOMATCH);
2884 }
2885 /* Control never gets here */
2886 }
2887
2888 /* Caseful comparisons (includes all multi-byte characters) */
2889
2890 else
2891 {
2892 for (i = 1; i <= min; i++)
2893 {
2894 if (eptr >= md->end_subject)
2895 {
2896 SCHECK_PARTIAL();
2897 MRRETURN(MATCH_NOMATCH);
2898 }
2899 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2900 }
2901
2902 if (min == max) continue;
2903
2904 if (minimize)
2905 {
2906 for (fi = min;; fi++)
2907 {
2908 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2911 if (eptr >= md->end_subject)
2912 {
2913 SCHECK_PARTIAL();
2914 MRRETURN(MATCH_NOMATCH);
2915 }
2916 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2917 }
2918 /* Control never gets here */
2919 }
2920 else /* Maximize */
2921 {
2922 pp = eptr;
2923 for (i = min; i < max; i++)
2924 {
2925 if (eptr >= md->end_subject)
2926 {
2927 SCHECK_PARTIAL();
2928 break;
2929 }
2930 if (fc != *eptr) break;
2931 eptr++;
2932 }
2933 if (possessive) continue;
2934
2935 while (eptr >= pp)
2936 {
2937 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2938 eptr--;
2939 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2940 }
2941 MRRETURN(MATCH_NOMATCH);
2942 }
2943 }
2944 /* Control never gets here */
2945
2946 /* Match a negated single one-byte character. The character we are
2947 checking can be multibyte. */
2948
2949 case OP_NOT:
2950 if (eptr >= md->end_subject)
2951 {
2952 SCHECK_PARTIAL();
2953 MRRETURN(MATCH_NOMATCH);
2954 }
2955 ecode++;
2956 GETCHARINCTEST(c, eptr);
2957 if ((ims & PCRE_CASELESS) != 0)
2958 {
2959 #ifdef SUPPORT_UTF8
2960 if (c < 256)
2961 #endif
2962 c = md->lcc[c];
2963 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
2964 }
2965 else
2966 {
2967 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
2968 }
2969 break;
2970
2971 /* Match a negated single one-byte character repeatedly. This is almost a
2972 repeat of the code for a repeated single character, but I haven't found a
2973 nice way of commoning these up that doesn't require a test of the
2974 positive/negative option for each character match. Maybe that wouldn't add
2975 very much to the time taken, but character matching *is* what this is all
2976 about... */
2977
2978 case OP_NOTEXACT:
2979 min = max = GET2(ecode, 1);
2980 ecode += 3;
2981 goto REPEATNOTCHAR;
2982
2983 case OP_NOTUPTO:
2984 case OP_NOTMINUPTO:
2985 min = 0;
2986 max = GET2(ecode, 1);
2987 minimize = *ecode == OP_NOTMINUPTO;
2988 ecode += 3;
2989 goto REPEATNOTCHAR;
2990
2991 case OP_NOTPOSSTAR:
2992 possessive = TRUE;
2993 min = 0;
2994 max = INT_MAX;
2995 ecode++;
2996 goto REPEATNOTCHAR;
2997
2998 case OP_NOTPOSPLUS:
2999 possessive = TRUE;
3000 min = 1;
3001 max = INT_MAX;
3002 ecode++;
3003 goto REPEATNOTCHAR;
3004
3005 case OP_NOTPOSQUERY:
3006 possessive = TRUE;
3007 min = 0;
3008 max = 1;
3009 ecode++;
3010 goto REPEATNOTCHAR;
3011
3012 case OP_NOTPOSUPTO:
3013 possessive = TRUE;
3014 min = 0;
3015 max = GET2(ecode, 1);
3016 ecode += 3;
3017 goto REPEATNOTCHAR;
3018
3019 case OP_NOTSTAR:
3020 case OP_NOTMINSTAR:
3021 case OP_NOTPLUS:
3022 case OP_NOTMINPLUS:
3023 case OP_NOTQUERY:
3024 case OP_NOTMINQUERY:
3025 c = *ecode++ - OP_NOTSTAR;
3026 minimize = (c & 1) != 0;
3027 min = rep_min[c]; /* Pick up values from tables; */
3028 max = rep_max[c]; /* zero for max => infinity */
3029 if (max == 0) max = INT_MAX;
3030
3031 /* Common code for all repeated single-byte matches. */
3032
3033 REPEATNOTCHAR:
3034 fc = *ecode++;
3035
3036 /* The code is duplicated for the caseless and caseful cases, for speed,
3037 since matching characters is likely to be quite common. First, ensure the
3038 minimum number of matches are present. If min = max, continue at the same
3039 level without recursing. Otherwise, if minimizing, keep trying the rest of
3040 the expression and advancing one matching character if failing, up to the
3041 maximum. Alternatively, if maximizing, find the maximum number of
3042 characters and work backwards. */
3043
3044 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3045 max, eptr));
3046
3047 if ((ims & PCRE_CASELESS) != 0)
3048 {
3049 fc = md->lcc[fc];
3050
3051 #ifdef SUPPORT_UTF8
3052 /* UTF-8 mode */
3053 if (utf8)
3054 {
3055 register unsigned int d;
3056 for (i = 1; i <= min; i++)
3057 {
3058 if (eptr >= md->end_subject)
3059 {
3060 SCHECK_PARTIAL();
3061 MRRETURN(MATCH_NOMATCH);
3062 }
3063 GETCHARINC(d, eptr);
3064 if (d < 256) d = md->lcc[d];
3065 if (fc == d) MRRETURN(MATCH_NOMATCH);
3066 }
3067 }
3068 else
3069 #endif
3070
3071 /* Not UTF-8 mode */
3072 {
3073 for (i = 1; i <= min; i++)
3074 {
3075 if (eptr >= md->end_subject)
3076 {
3077 SCHECK_PARTIAL();
3078 MRRETURN(MATCH_NOMATCH);
3079 }
3080 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3081 }
3082 }
3083
3084 if (min == max) continue;
3085
3086 if (minimize)
3087 {
3088 #ifdef SUPPORT_UTF8
3089 /* UTF-8 mode */
3090 if (utf8)
3091 {
3092 register unsigned int d;
3093 for (fi = min;; fi++)
3094 {
3095 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3097 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3098 if (eptr >= md->end_subject)
3099 {
3100 SCHECK_PARTIAL();
3101 MRRETURN(MATCH_NOMATCH);
3102 }
3103 GETCHARINC(d, eptr);
3104 if (d < 256) d = md->lcc[d];
3105 if (fc == d) MRRETURN(MATCH_NOMATCH);
3106 }
3107 }
3108 else
3109 #endif
3110 /* Not UTF-8 mode */
3111 {
3112 for (fi = min;; fi++)
3113 {
3114 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3115 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3116 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3117 if (eptr >= md->end_subject)
3118 {
3119 SCHECK_PARTIAL();
3120 MRRETURN(MATCH_NOMATCH);
3121 }
3122 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3123 }
3124 }
3125 /* Control never gets here */
3126 }
3127
3128 /* Maximize case */
3129
3130 else
3131 {
3132 pp = eptr;
3133
3134 #ifdef SUPPORT_UTF8
3135 /* UTF-8 mode */
3136 if (utf8)
3137 {
3138 register unsigned int d;
3139 for (i = min; i < max; i++)
3140 {
3141 int len = 1;
3142 if (eptr >= md->end_subject)
3143 {
3144 SCHECK_PARTIAL();
3145 break;
3146 }
3147 GETCHARLEN(d, eptr, len);
3148 if (d < 256) d = md->lcc[d];
3149 if (fc == d) break;
3150 eptr += len;
3151 }
3152 if (possessive) continue;
3153 for(;;)
3154 {
3155 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3156 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3157 if (eptr-- == pp) break; /* Stop if tried at original pos */
3158 BACKCHAR(eptr);
3159 }
3160 }
3161 else
3162 #endif
3163 /* Not UTF-8 mode */
3164 {
3165 for (i = min; i < max; i++)
3166 {
3167 if (eptr >= md->end_subject)
3168 {
3169 SCHECK_PARTIAL();
3170 break;
3171 }
3172 if (fc == md->lcc[*eptr]) break;
3173 eptr++;
3174 }
3175 if (possessive) continue;
3176 while (eptr >= pp)
3177 {
3178 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3179 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3180 eptr--;
3181 }
3182 }
3183
3184 MRRETURN(MATCH_NOMATCH);
3185 }
3186 /* Control never gets here */
3187 }
3188
3189 /* Caseful comparisons */
3190
3191 else
3192 {
3193 #ifdef SUPPORT_UTF8
3194 /* UTF-8 mode */
3195 if (utf8)
3196 {
3197 register unsigned int d;
3198 for (i = 1; i <= min; i++)
3199 {
3200 if (eptr >= md->end_subject)
3201 {
3202 SCHECK_PARTIAL();
3203 MRRETURN(MATCH_NOMATCH);
3204 }
3205 GETCHARINC(d, eptr);
3206 if (fc == d) MRRETURN(MATCH_NOMATCH);
3207 }
3208 }
3209 else
3210 #endif
3211 /* Not UTF-8 mode */
3212 {
3213 for (i = 1; i <= min; i++)
3214 {
3215 if (eptr >= md->end_subject)
3216 {
3217 SCHECK_PARTIAL();
3218 MRRETURN(MATCH_NOMATCH);
3219 }
3220 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3221 }
3222 }
3223
3224 if (min == max) continue;
3225
3226 if (minimize)
3227 {
3228 #ifdef SUPPORT_UTF8
3229 /* UTF-8 mode */
3230 if (utf8)
3231 {
3232 register unsigned int d;
3233 for (fi = min;; fi++)
3234 {
3235 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3236 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3237 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3238 if (eptr >= md->end_subject)
3239 {
3240 SCHECK_PARTIAL();
3241 MRRETURN(MATCH_NOMATCH);
3242 }
3243 GETCHARINC(d, eptr);
3244 if (fc == d) MRRETURN(MATCH_NOMATCH);
3245 }
3246 }
3247 else
3248 #endif
3249 /* Not UTF-8 mode */
3250 {
3251 for (fi = min;; fi++)
3252 {
3253 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3254 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3255 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3256 if (eptr >= md->end_subject)
3257 {
3258 SCHECK_PARTIAL();
3259 MRRETURN(MATCH_NOMATCH);
3260 }
3261 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3262 }
3263 }
3264 /* Control never gets here */
3265 }
3266
3267 /* Maximize case */
3268
3269 else
3270 {
3271 pp = eptr;
3272
3273 #ifdef SUPPORT_UTF8
3274 /* UTF-8 mode */
3275 if (utf8)
3276 {
3277 register unsigned int d;
3278 for (i = min; i < max; i++)
3279 {
3280 int len = 1;
3281 if (eptr >= md->end_subject)
3282 {
3283 SCHECK_PARTIAL();
3284 break;
3285 }
3286 GETCHARLEN(d, eptr, len);
3287 if (fc == d) break;
3288 eptr += len;
3289 }
3290 if (possessive) continue;
3291 for(;;)
3292 {
3293 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3294 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3295 if (eptr-- == pp) break; /* Stop if tried at original pos */
3296 BACKCHAR(eptr);
3297 }
3298 }
3299 else
3300 #endif
3301 /* Not UTF-8 mode */
3302 {
3303 for (i = min; i < max; i++)
3304 {
3305 if (eptr >= md->end_subject)
3306 {
3307 SCHECK_PARTIAL();
3308 break;
3309 }
3310 if (fc == *eptr) break;
3311 eptr++;
3312 }
3313 if (possessive) continue;
3314 while (eptr >= pp)
3315 {
3316 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3317 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3318 eptr--;
3319 }
3320 }
3321
3322 MRRETURN(MATCH_NOMATCH);
3323 }
3324 }
3325 /* Control never gets here */
3326
3327 /* Match a single character type repeatedly; several different opcodes
3328 share code. This is very similar to the code for single characters, but we
3329 repeat it in the interests of efficiency. */
3330
3331 case OP_TYPEEXACT: /* Fixed count: min and max are the same operand value */
3332 min = max = GET2(ecode, 1);
3333 minimize = TRUE; /* Not consulted for this opcode: the shared code continues when min == max, before it tests minimize */
3334 ecode += 3; /* Step over the opcode and the count read by GET2 (presumably 1 + 2 bytes) */
3335 goto REPEATTYPE;
3336 /* Bounded repeat, zero up to the operand value; greedy or minimizing */
3337 case OP_TYPEUPTO:
3338 case OP_TYPEMINUPTO:
3339 min = 0;
3340 max = GET2(ecode, 1);
3341 minimize = *ecode == OP_TYPEMINUPTO; /* The opcode itself selects the minimizing form */
3342 ecode += 3;
3343 goto REPEATTYPE;
3344 /* Possessive forms: the shared maximizing code tests 'possessive' after its scan and continues without entering the backtracking loop */
3345 case OP_TYPEPOSSTAR:
3346 possessive = TRUE;
3347 min = 0;
3348 max = INT_MAX; /* INT_MAX stands for "no upper limit" */
3349 ecode++;
3350 goto REPEATTYPE;
3351
3352 case OP_TYPEPOSPLUS:
3353 possessive = TRUE;
3354 min = 1;
3355 max = INT_MAX;
3356 ecode++;
3357 goto REPEATTYPE;
3358
3359 case OP_TYPEPOSQUERY:
3360 possessive = TRUE;
3361 min = 0;
3362 max = 1;
3363 ecode++;
3364 goto REPEATTYPE;
3365
3366 case OP_TYPEPOSUPTO:
3367 possessive = TRUE;
3368 min = 0;
3369 max = GET2(ecode, 1);
3370 ecode += 3;
3371 goto REPEATTYPE;
3372 /* Star, plus and query in greedy and minimizing forms; control falls through from here into REPEATTYPE */
3373 case OP_TYPESTAR:
3374 case OP_TYPEMINSTAR:
3375 case OP_TYPEPLUS:
3376 case OP_TYPEMINPLUS:
3377 case OP_TYPEQUERY:
3378 case OP_TYPEMINQUERY:
3379 c = *ecode++ - OP_TYPESTAR; /* Offset of this opcode from OP_TYPESTAR, used as the table index */
3380 minimize = (c & 1) != 0; /* Odd offset => minimizing; assumes the six opcodes are numbered in the order listed above - confirm against the opcode table */
3381 min = rep_min[c]; /* Pick up values from tables; */
3382 max = rep_max[c]; /* zero for max => infinity */
3383 if (max == 0) max = INT_MAX;
3384
3385 /* Common code for all repeated single character type matches. Note that
3386 in UTF-8 mode, '.' matches a character of any length, but for the other
3387 character types, the valid characters are all one-byte long. */
3388
3389 REPEATTYPE:
3390 ctype = *ecode++; /* Code for the character type */
3391
3392 #ifdef SUPPORT_UCP
3393 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3394 {
3395 prop_fail_result = ctype == OP_NOTPROP;
3396 prop_type = *ecode++;
3397 prop_value = *ecode++;
3398 }
3399 else prop_type = -1;
3400 #endif
3401
3402 /* First, ensure the minimum number of matches are present. Use inline
3403 code for maximizing the speed, and do the type test once at the start
3404 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3405 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3406 and single-bytes. */
3407
3408 if (min > 0)
3409 {
3410 #ifdef SUPPORT_UCP
3411 if (prop_type >= 0)
3412 {
3413 switch(prop_type)
3414 {
3415 case PT_ANY:
3416 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3417 for (i = 1; i <= min; i++)
3418 {
3419 if (eptr >= md->end_subject)
3420 {
3421 SCHECK_PARTIAL();
3422 MRRETURN(MATCH_NOMATCH);
3423 }
3424 GETCHARINCTEST(c, eptr);
3425 }
3426 break;
3427
3428 case PT_LAMP:
3429 for (i = 1; i <= min; i++)
3430 {
3431 if (eptr >= md->end_subject)
3432 {
3433 SCHECK_PARTIAL();
3434 MRRETURN(MATCH_NOMATCH);
3435 }
3436 GETCHARINCTEST(c, eptr);
3437 prop_chartype = UCD_CHARTYPE(c);
3438 if ((prop_chartype == ucp_Lu ||
3439 prop_chartype == ucp_Ll ||
3440 prop_chartype == ucp_Lt) == prop_fail_result)
3441 MRRETURN(MATCH_NOMATCH);
3442 }
3443 break;
3444
3445 case PT_GC:
3446 for (i = 1; i <= min; i++)
3447 {
3448 if (eptr >= md->end_subject)
3449 {
3450 SCHECK_PARTIAL();
3451 MRRETURN(MATCH_NOMATCH);
3452 }
3453 GETCHARINCTEST(c, eptr);
3454 prop_category = UCD_CATEGORY(c);
3455 if ((prop_category == prop_value) == prop_fail_result)
3456 MRRETURN(MATCH_NOMATCH);
3457 }
3458 break;
3459
3460 case PT_PC:
3461 for (i = 1; i <= min; i++)
3462 {
3463 if (eptr >= md->end_subject)
3464 {
3465 SCHECK_PARTIAL();
3466 MRRETURN(MATCH_NOMATCH);
3467 }
3468 GETCHARINCTEST(c, eptr);
3469 prop_chartype = UCD_CHARTYPE(c);
3470 if ((prop_chartype == prop_value) == prop_fail_result)
3471 MRRETURN(MATCH_NOMATCH);
3472 }
3473 break;
3474
3475 case PT_SC:
3476 for (i = 1; i <= min; i++)
3477 {
3478 if (eptr >= md->end_subject)
3479 {
3480 SCHECK_PARTIAL();
3481 MRRETURN(MATCH_NOMATCH);
3482 }
3483 GETCHARINCTEST(c, eptr);
3484 prop_script = UCD_SCRIPT(c);
3485 if ((prop_script == prop_value) == prop_fail_result)
3486 MRRETURN(MATCH_NOMATCH);
3487 }
3488 break;
3489
3490 default:
3491 RRETURN(PCRE_ERROR_INTERNAL);
3492 }
3493 }
3494
3495 /* Match extended Unicode sequences. We will get here only if the
3496 support is in the binary; otherwise a compile-time error occurs. */
3497
3498 else if (ctype == OP_EXTUNI)
3499 {
3500 for (i = 1; i <= min; i++)
3501 {
3502 if (eptr >= md->end_subject)
3503 {
3504 SCHECK_PARTIAL();
3505 MRRETURN(MATCH_NOMATCH);
3506 }
3507 GETCHARINCTEST(c, eptr);
3508 prop_category = UCD_CATEGORY(c);
3509 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3510 while (eptr < md->end_subject)
3511 {
3512 int len = 1;
3513 if (!utf8) c = *eptr;
3514 else { GETCHARLEN(c, eptr, len); }
3515 prop_category = UCD_CATEGORY(c);
3516 if (prop_category != ucp_M) break;
3517 eptr += len;
3518 }
3519 }
3520 }
3521
3522 else
3523 #endif /* SUPPORT_UCP */
3524
3525 /* Handle all other cases when the coding is UTF-8 */
3526
3527 #ifdef SUPPORT_UTF8
3528 if (utf8) switch(ctype)
3529 {
3530 case OP_ANY:
3531 for (i = 1; i <= min; i++)
3532 {
3533 if (eptr >= md->end_subject)
3534 {
3535 SCHECK_PARTIAL();
3536 MRRETURN(MATCH_NOMATCH);
3537 }
3538 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3539 eptr++;
3540 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3541 }
3542 break;
3543
3544 case OP_ALLANY:
3545 for (i = 1; i <= min; i++)
3546 {
3547 if (eptr >= md->end_subject)
3548 {
3549 SCHECK_PARTIAL();
3550 MRRETURN(MATCH_NOMATCH);
3551 }
3552 eptr++;
3553 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3554 }
3555 break;
3556
3557 case OP_ANYBYTE:
3558 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3559 eptr += min;
3560 break;
3561
3562 case OP_ANYNL:
3563 for (i = 1; i <= min; i++)
3564 {
3565 if (eptr >= md->end_subject)
3566 {
3567 SCHECK_PARTIAL();
3568 MRRETURN(MATCH_NOMATCH);
3569 }
3570 GETCHARINC(c, eptr);
3571 switch(c)
3572 {
3573 default: MRRETURN(MATCH_NOMATCH);
3574 case 0x000d:
3575 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3576 break;
3577
3578 case 0x000a:
3579 break;
3580
3581 case 0x000b:
3582 case 0x000c:
3583 case 0x0085:
3584 case 0x2028:
3585 case 0x2029:
3586 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3587 break;
3588 }
3589 }
3590 break;
3591
3592 case OP_NOT_HSPACE:
3593 for (i = 1; i <= min; i++)
3594 {
3595 if (eptr >= md->end_subject)
3596 {
3597 SCHECK_PARTIAL();
3598 MRRETURN(MATCH_NOMATCH);
3599 }
3600 GETCHARINC(c, eptr);
3601 switch(c)
3602 {
3603 default: break;
3604 case 0x09: /* HT */
3605 case 0x20: /* SPACE */
3606 case 0xa0: /* NBSP */
3607 case 0x1680: /* OGHAM SPACE MARK */
3608 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3609 case 0x2000: /* EN QUAD */
3610 case 0x2001: /* EM QUAD */
3611 case 0x2002: /* EN SPACE */
3612 case 0x2003: /* EM SPACE */
3613 case 0x2004: /* THREE-PER-EM SPACE */
3614 case 0x2005: /* FOUR-PER-EM SPACE */
3615 case 0x2006: /* SIX-PER-EM SPACE */
3616 case 0x2007: /* FIGURE SPACE */
3617 case 0x2008: /* PUNCTUATION SPACE */
3618 case 0x2009: /* THIN SPACE */
3619 case 0x200A: /* HAIR SPACE */
3620 case 0x202f: /* NARROW NO-BREAK SPACE */
3621 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3622 case 0x3000: /* IDEOGRAPHIC SPACE */
3623 MRRETURN(MATCH_NOMATCH);
3624 }
3625 }
3626 break;
3627
3628 case OP_HSPACE:
3629 for (i = 1; i <= min; i++)
3630 {
3631 if (eptr >= md->end_subject)
3632 {
3633 SCHECK_PARTIAL();
3634 MRRETURN(MATCH_NOMATCH);
3635 }
3636 GETCHARINC(c, eptr);
3637 switch(c)
3638 {
3639 default: MRRETURN(MATCH_NOMATCH);
3640 case 0x09: /* HT */
3641 case 0x20: /* SPACE */
3642 case 0xa0: /* NBSP */
3643 case 0x1680: /* OGHAM SPACE MARK */
3644 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3645 case 0x2000: /* EN QUAD */
3646 case 0x2001: /* EM QUAD */
3647 case 0x2002: /* EN SPACE */
3648 case 0x2003: /* EM SPACE */
3649 case 0x2004: /* THREE-PER-EM SPACE */
3650 case 0x2005: /* FOUR-PER-EM SPACE */
3651 case 0x2006: /* SIX-PER-EM SPACE */
3652 case 0x2007: /* FIGURE SPACE */
3653 case 0x2008: /* PUNCTUATION SPACE */
3654 case 0x2009: /* THIN SPACE */
3655 case 0x200A: /* HAIR SPACE */
3656 case 0x202f: /* NARROW NO-BREAK SPACE */
3657 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3658 case 0x3000: /* IDEOGRAPHIC SPACE */
3659 break;
3660 }
3661 }
3662 break;
3663
3664 case OP_NOT_VSPACE:
3665 for (i = 1; i <= min; i++)
3666 {
3667 if (eptr >= md->end_subject)
3668 {
3669 SCHECK_PARTIAL();
3670 MRRETURN(MATCH_NOMATCH);
3671 }
3672 GETCHARINC(c, eptr);
3673 switch(c)
3674 {
3675 default: break;
3676 case 0x0a: /* LF */
3677 case 0x0b: /* VT */
3678 case 0x0c: /* FF */
3679 case 0x0d: /* CR */
3680 case 0x85: /* NEL */
3681 case 0x2028: /* LINE SEPARATOR */
3682 case 0x2029: /* PARAGRAPH SEPARATOR */
3683 MRRETURN(MATCH_NOMATCH);
3684 }
3685 }
3686 break;
3687
3688 case OP_VSPACE:
3689 for (i = 1; i <= min; i++)
3690 {
3691 if (eptr >= md->end_subject)
3692 {
3693 SCHECK_PARTIAL();
3694 MRRETURN(MATCH_NOMATCH);
3695 }
3696 GETCHARINC(c, eptr);
3697 switch(c)
3698 {
3699 default: MRRETURN(MATCH_NOMATCH);
3700 case 0x0a: /* LF */
3701 case 0x0b: /* VT */
3702 case 0x0c: /* FF */
3703 case 0x0d: /* CR */
3704 case 0x85: /* NEL */
3705 case 0x2028: /* LINE SEPARATOR */
3706 case 0x2029: /* PARAGRAPH SEPARATOR */
3707 break;
3708 }
3709 }
3710 break;
3711
3712 case OP_NOT_DIGIT:
3713 for (i = 1; i <= min; i++)
3714 {
3715 if (eptr >= md->end_subject)
3716 {
3717 SCHECK_PARTIAL();
3718 MRRETURN(MATCH_NOMATCH);
3719 }
3720 GETCHARINC(c, eptr);
3721 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3722 MRRETURN(MATCH_NOMATCH);
3723 }
3724 break;
3725
3726 case OP_DIGIT:
3727 for (i = 1; i <= min; i++)
3728 {
3729 if (eptr >= md->end_subject)
3730 {
3731 SCHECK_PARTIAL();
3732 MRRETURN(MATCH_NOMATCH);
3733 }
3734 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3735 MRRETURN(MATCH_NOMATCH);
3736 /* No need to skip more bytes - we know it's a 1-byte character */
3737 }
3738 break;
3739
3740 case OP_NOT_WHITESPACE:
3741 for (i = 1; i <= min; i++)
3742 {
3743 if (eptr >= md->end_subject)
3744 {
3745 SCHECK_PARTIAL();
3746 MRRETURN(MATCH_NOMATCH);
3747 }
3748 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3749 MRRETURN(MATCH_NOMATCH);
3750 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3751 }
3752 break;
3753
3754 case OP_WHITESPACE:
3755 for (i = 1; i <= min; i++)
3756 {
3757 if (eptr >= md->end_subject)
3758 {
3759 SCHECK_PARTIAL();
3760 MRRETURN(MATCH_NOMATCH);
3761 }
3762 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3763 MRRETURN(MATCH_NOMATCH);
3764 /* No need to skip more bytes - we know it's a 1-byte character */
3765 }
3766 break;
3767
3768 case OP_NOT_WORDCHAR:
3769 for (i = 1; i <= min; i++)
3770 {
3771 if (eptr >= md->end_subject)
3772 {
3773 SCHECK_PARTIAL();
3774 MRRETURN(MATCH_NOMATCH);
3775 }
3776 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3777 MRRETURN(MATCH_NOMATCH);
3778 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3779 }
3780 break;
3781
3782 case OP_WORDCHAR:
3783 for (i = 1; i <= min; i++)
3784 {
3785 if (eptr >= md->end_subject)
3786 {
3787 SCHECK_PARTIAL();
3788 MRRETURN(MATCH_NOMATCH);
3789 }
3790 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3791 MRRETURN(MATCH_NOMATCH);
3792 /* No need to skip more bytes - we know it's a 1-byte character */
3793 }
3794 break;
3795
3796 default:
3797 RRETURN(PCRE_ERROR_INTERNAL);
3798 } /* End switch(ctype) */
3799
3800 else
3801 #endif /* SUPPORT_UTF8 */
3802
3803 /* Code for the non-UTF-8 case for minimum matching of operators other
3804 than OP_PROP and OP_NOTPROP. */
3805
3806 switch(ctype)
3807 {
3808 case OP_ANY:
3809 for (i = 1; i <= min; i++)
3810 {
3811 if (eptr >= md->end_subject)
3812 {
3813 SCHECK_PARTIAL();
3814 MRRETURN(MATCH_NOMATCH);
3815 }
3816 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3817 eptr++;
3818 }
3819 break;
3820
3821 case OP_ALLANY:
3822 if (eptr > md->end_subject - min)
3823 {
3824 SCHECK_PARTIAL();
3825 MRRETURN(MATCH_NOMATCH);
3826 }
3827 eptr += min;
3828 break;
3829
3830 case OP_ANYBYTE:
3831 if (eptr > md->end_subject - min)
3832 {
3833 SCHECK_PARTIAL();
3834 MRRETURN(MATCH_NOMATCH);
3835 }
3836 eptr += min;
3837 break;
3838
3839 case OP_ANYNL:
3840 for (i = 1; i <= min; i++)
3841 {
3842 if (eptr >= md->end_subject)
3843 {
3844 SCHECK_PARTIAL();
3845 MRRETURN(MATCH_NOMATCH);
3846 }
3847 switch(*eptr++)
3848 {
3849 default: MRRETURN(MATCH_NOMATCH);
3850 case 0x000d:
3851 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3852 break;
3853 case 0x000a:
3854 break;
3855
3856 case 0x000b:
3857 case 0x000c:
3858 case 0x0085:
3859 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3860 break;
3861 }
3862 }
3863 break;
3864
3865 case OP_NOT_HSPACE:
3866 for (i = 1; i <= min; i++)
3867 {
3868 if (eptr >= md->end_subject)
3869 {
3870 SCHECK_PARTIAL();
3871 MRRETURN(MATCH_NOMATCH);
3872 }
3873 switch(*eptr++)
3874 {
3875 default: break;
3876 case 0x09: /* HT */
3877 case 0x20: /* SPACE */
3878 case 0xa0: /* NBSP */
3879 MRRETURN(MATCH_NOMATCH);
3880 }
3881 }
3882 break;
3883
3884 case OP_HSPACE:
3885 for (i = 1; i <= min; i++)
3886 {
3887 if (eptr >= md->end_subject)
3888 {
3889 SCHECK_PARTIAL();
3890 MRRETURN(MATCH_NOMATCH);
3891 }
3892 switch(*eptr++)
3893 {
3894 default: MRRETURN(MATCH_NOMATCH);
3895 case 0x09: /* HT */
3896 case 0x20: /* SPACE */
3897 case 0xa0: /* NBSP */
3898 break;
3899 }
3900 }
3901 break;
3902
3903 case OP_NOT_VSPACE:
3904 for (i = 1; i <= min; i++)
3905 {
3906 if (eptr >= md->end_subject)
3907 {
3908 SCHECK_PARTIAL();
3909 MRRETURN(MATCH_NOMATCH);
3910 }
3911 switch(*eptr++)
3912 {
3913 default: break;
3914 case 0x0a: /* LF */
3915 case 0x0b: /* VT */
3916 case 0x0c: /* FF */
3917 case 0x0d: /* CR */
3918 case 0x85: /* NEL */
3919 MRRETURN(MATCH_NOMATCH);
3920 }
3921 }
3922 break;
3923
3924 case OP_VSPACE:
3925 for (i = 1; i <= min; i++)
3926 {
3927 if (eptr >= md->end_subject)
3928 {
3929 SCHECK_PARTIAL();
3930 MRRETURN(MATCH_NOMATCH);
3931 }
3932 switch(*eptr++)
3933 {
3934 default: MRRETURN(MATCH_NOMATCH);
3935 case 0x0a: /* LF */
3936 case 0x0b: /* VT */
3937 case 0x0c: /* FF */
3938 case 0x0d: /* CR */
3939 case 0x85: /* NEL */
3940 break;
3941 }
3942 }
3943 break;
3944
3945 case OP_NOT_DIGIT:
3946 for (i = 1; i <= min; i++)
3947 {
3948 if (eptr >= md->end_subject)
3949 {
3950 SCHECK_PARTIAL();
3951 MRRETURN(MATCH_NOMATCH);
3952 }
3953 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
3954 }
3955 break;
3956
3957 case OP_DIGIT:
3958 for (i = 1; i <= min; i++)
3959 {
3960 if (eptr >= md->end_subject)
3961 {
3962 SCHECK_PARTIAL();
3963 MRRETURN(MATCH_NOMATCH);
3964 }
3965 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
3966 }
3967 break;
3968
3969 case OP_NOT_WHITESPACE:
3970 for (i = 1; i <= min; i++)
3971 {
3972 if (eptr >= md->end_subject)
3973 {
3974 SCHECK_PARTIAL();
3975 MRRETURN(MATCH_NOMATCH);
3976 }
3977 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
3978 }
3979 break;
3980
3981 case OP_WHITESPACE:
3982 for (i = 1; i <= min; i++)
3983 {
3984 if (eptr >= md->end_subject)
3985 {
3986 SCHECK_PARTIAL();
3987 MRRETURN(MATCH_NOMATCH);
3988 }
3989 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
3990 }
3991 break;
3992
3993 case OP_NOT_WORDCHAR:
3994 for (i = 1; i <= min; i++)
3995 {
3996 if (eptr >= md->end_subject)
3997 {
3998 SCHECK_PARTIAL();
3999 MRRETURN(MATCH_NOMATCH);
4000 }
4001 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4002 MRRETURN(MATCH_NOMATCH);
4003 }
4004 break;
4005
4006 case OP_WORDCHAR:
4007 for (i = 1; i <= min; i++)
4008 {
4009 if (eptr >= md->end_subject)
4010 {
4011 SCHECK_PARTIAL();
4012 MRRETURN(MATCH_NOMATCH);
4013 }
4014 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4015 MRRETURN(MATCH_NOMATCH);
4016 }
4017 break;
4018
4019 default:
4020 RRETURN(PCRE_ERROR_INTERNAL);
4021 }
4022 }
4023
4024 /* If min = max, continue at the same level without recursing */
4025
4026 if (min == max) continue;
4027
4028 /* If minimizing, we have to test the rest of the pattern before each
4029 subsequent match. Again, separate the UTF-8 case for speed, and also
4030 separate the UCP cases. */
4031
4032 if (minimize)
4033 {
4034 #ifdef SUPPORT_UCP
4035 if (prop_type >= 0) /* Minimizing repeat of OP_PROP / OP_NOTPROP */
4036 { /* Every loop: try the rest of the pattern first, then consume one more character and test it */
4037 switch(prop_type)
4038 {
4039 case PT_ANY: /* Any character passes unless the test is negated */
4040 for (fi = min;; fi++)
4041 {
4042 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4044 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4045 if (eptr >= md->end_subject)
4046 {
4047 SCHECK_PARTIAL();
4048 MRRETURN(MATCH_NOMATCH);
4049 }
4050 GETCHARINCTEST(c, eptr); /* Same fetch as the minimum-count loop; this code is also reached when utf8 is false */
4051 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4052 }
4053 /* Control never gets here */
4054 /* PT_LAMP: the character type must be one of Lu, Ll, Lt */
4055 case PT_LAMP:
4056 for (fi = min;; fi++)
4057 {
4058 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4059 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4060 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4061 if (eptr >= md->end_subject)
4062 {
4063 SCHECK_PARTIAL();
4064 MRRETURN(MATCH_NOMATCH);
4065 }
4066 GETCHARINCTEST(c, eptr);
4067 prop_chartype = UCD_CHARTYPE(c);
4068 if ((prop_chartype == ucp_Lu ||
4069 prop_chartype == ucp_Ll ||
4070 prop_chartype == ucp_Lt) == prop_fail_result)
4071 MRRETURN(MATCH_NOMATCH);
4072 }
4073 /* Control never gets here */
4074 /* PT_GC: compare the character's category with prop_value */
4075 case PT_GC:
4076 for (fi = min;; fi++)
4077 {
4078 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4079 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4080 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4081 if (eptr >= md->end_subject)
4082 {
4083 SCHECK_PARTIAL();
4084 MRRETURN(MATCH_NOMATCH);
4085 }
4086 GETCHARINCTEST(c, eptr);
4087 prop_category = UCD_CATEGORY(c);
4088 if ((prop_category == prop_value) == prop_fail_result)
4089 MRRETURN(MATCH_NOMATCH);
4090 }
4091 /* Control never gets here */
4092 /* PT_PC: compare the character's type with prop_value */
4093 case PT_PC:
4094 for (fi = min;; fi++)
4095 {
4096 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4097 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4098 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4099 if (eptr >= md->end_subject)
4100 {
4101 SCHECK_PARTIAL();
4102 MRRETURN(MATCH_NOMATCH);
4103 }
4104 GETCHARINCTEST(c, eptr);
4105 prop_chartype = UCD_CHARTYPE(c);
4106 if ((prop_chartype == prop_value) == prop_fail_result)
4107 MRRETURN(MATCH_NOMATCH);
4108 }
4109 /* Control never gets here */
4110 /* PT_SC: compare the character's script with prop_value */
4111 case PT_SC:
4112 for (fi = min;; fi++)
4113 {
4114 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4115 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4116 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4117 if (eptr >= md->end_subject)
4118 {
4119 SCHECK_PARTIAL();
4120 MRRETURN(MATCH_NOMATCH);
4121 }
4122 GETCHARINCTEST(c, eptr);
4123 prop_script = UCD_SCRIPT(c);
4124 if ((prop_script == prop_value) == prop_fail_result)
4125 MRRETURN(MATCH_NOMATCH);
4126 }
4127 /* Control never gets here */
4128
4129 default:
4130 RRETURN(PCRE_ERROR_INTERNAL);
4131 }
4132 }
4133
4134 /* Match extended Unicode sequences. We will get here only if the
4135 support is in the binary; otherwise a compile-time error occurs. */
4136
4137 else if (ctype == OP_EXTUNI)
4138 {
4139 for (fi = min;; fi++)
4140 {
4141 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4142 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4143 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4144 if (eptr >= md->end_subject)
4145 {
4146 SCHECK_PARTIAL();
4147 MRRETURN(MATCH_NOMATCH);
4148 }
4149 GETCHARINCTEST(c, eptr);
4150 prop_category = UCD_CATEGORY(c);
4151 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4152 while (eptr < md->end_subject)
4153 {
4154 int len = 1;
4155 if (!utf8) c = *eptr;
4156 else { GETCHARLEN(c, eptr, len); }
4157 prop_category = UCD_CATEGORY(c);
4158 if (prop_category != ucp_M) break;
4159 eptr += len;
4160 }
4161 }
4162 }
4163
4164 else
4165 #endif /* SUPPORT_UCP */
4166
4167 #ifdef SUPPORT_UTF8
4168 /* UTF-8 mode */
4169 if (utf8)
4170 {
4171 for (fi = min;; fi++)
4172 {
4173 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4175 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4176 if (eptr >= md->end_subject)
4177 {
4178 SCHECK_PARTIAL();
4179 MRRETURN(MATCH_NOMATCH);
4180 }
4181 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4182 MRRETURN(MATCH_NOMATCH);
4183 GETCHARINC(c, eptr);
4184 switch(ctype)
4185 {
4186 case OP_ANY: /* This is the non-NL case */
4187 case OP_ALLANY:
4188 case OP_ANYBYTE:
4189 break;
4190
4191 case OP_ANYNL:
4192 switch(c)
4193 {
4194 default: MRRETURN(MATCH_NOMATCH);
4195 case 0x000d:
4196 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4197 break;
4198 case 0x000a:
4199 break;
4200
4201 case 0x000b:
4202 case 0x000c:
4203 case 0x0085:
4204 case 0x2028:
4205 case 0x2029:
4206 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4207 break;
4208 }
4209 break;
4210
4211 case OP_NOT_HSPACE:
4212 switch(c)
4213 {
4214 default: break;
4215 case 0x09: /* HT */
4216 case 0x20: /* SPACE */
4217 case 0xa0: /* NBSP */
4218 case 0x1680: /* OGHAM SPACE MARK */
4219 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4220 case 0x2000: /* EN QUAD */
4221 case 0x2001: /* EM QUAD */
4222 case 0x2002: /* EN SPACE */
4223 case 0x2003: /* EM SPACE */
4224 case 0x2004: /* THREE-PER-EM SPACE */
4225 case 0x2005: /* FOUR-PER-EM SPACE */
4226 case 0x2006: /* SIX-PER-EM SPACE */
4227 case 0x2007: /* FIGURE SPACE */
4228 case 0x2008: /* PUNCTUATION SPACE */
4229 case 0x2009: /* THIN SPACE */
4230 case 0x200A: /* HAIR SPACE */
4231 case 0x202f: /* NARROW NO-BREAK SPACE */
4232 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4233 case 0x3000: /* IDEOGRAPHIC SPACE */
4234 MRRETURN(MATCH_NOMATCH);
4235 }
4236 break;
4237
4238 case OP_HSPACE:
4239 switch(c)
4240 {
4241 default: MRRETURN(MATCH_NOMATCH);
4242 case 0x09: /* HT */
4243 case 0x20: /* SPACE */
4244 case 0xa0: /* NBSP */
4245 case 0x1680: /* OGHAM SPACE MARK */
4246 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4247 case 0x2000: /* EN QUAD */
4248 case 0x2001: /* EM QUAD */
4249 case 0x2002: /* EN SPACE */
4250 case 0x2003: /* EM SPACE */
4251 case 0x2004: /* THREE-PER-EM SPACE */
4252 case 0x2005: /* FOUR-PER-EM SPACE */
4253 case 0x2006: /* SIX-PER-EM SPACE */
4254 case 0x2007: /* FIGURE SPACE */
4255 case 0x2008: /* PUNCTUATION SPACE */
4256 case 0x2009: /* THIN SPACE */
4257 case 0x200A: /* HAIR SPACE */
4258 case 0x202f: /* NARROW NO-BREAK SPACE */
4259 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4260 case 0x3000: /* IDEOGRAPHIC SPACE */
4261 break;
4262 }
4263 break;
4264
4265 case OP_NOT_VSPACE:
4266 switch(c)
4267 {
4268 default: break;
4269 case 0x0a: /* LF */
4270 case 0x0b: /* VT */
4271 case 0x0c: /* FF */
4272 case 0x0d: /* CR */
4273 case 0x85: /* NEL */
4274 case 0x2028: /* LINE SEPARATOR */
4275 case 0x2029: /* PARAGRAPH SEPARATOR */
4276 MRRETURN(MATCH_NOMATCH);
4277 }
4278 break;
4279
4280 case OP_VSPACE:
4281 switch(c)
4282 {
4283 default: MRRETURN(MATCH_NOMATCH);
4284 case 0x0a: /* LF */
4285 case 0x0b: /* VT */
4286 case 0x0c: /* FF */
4287 case 0x0d: /* CR */
4288 case 0x85: /* NEL */
4289 case 0x2028: /* LINE SEPARATOR */
4290 case 0x2029: /* PARAGRAPH SEPARATOR */
4291 break;
4292 }
4293 break;
4294
4295 case OP_NOT_DIGIT:
4296 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4297 MRRETURN(MATCH_NOMATCH);
4298 break;
4299
4300 case OP_DIGIT:
4301 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4302 MRRETURN(MATCH_NOMATCH);
4303 break;
4304
4305 case OP_NOT_WHITESPACE:
4306 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4307 MRRETURN(MATCH_NOMATCH);
4308 break;
4309
4310 case OP_WHITESPACE:
4311 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4312 MRRETURN(MATCH_NOMATCH);
4313 break;
4314
4315 case OP_NOT_WORDCHAR:
4316 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4317 MRRETURN(MATCH_NOMATCH);
4318 break;
4319
4320 case OP_WORDCHAR:
4321 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4322 MRRETURN(MATCH_NOMATCH);
4323 break;
4324
4325 default:
4326 RRETURN(PCRE_ERROR_INTERNAL);
4327 }
4328 }
4329 }
4330 else
4331 #endif
4332 /* Not UTF-8 mode */
4333 {
4334 for (fi = min;; fi++)
4335 {
4336 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4337 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4338 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4339 if (eptr >= md->end_subject)
4340 {
4341 SCHECK_PARTIAL();
4342 MRRETURN(MATCH_NOMATCH);
4343 }
4344 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4345 MRRETURN(MATCH_NOMATCH);
4346 c = *eptr++;
4347 switch(ctype)
4348 {
4349 case OP_ANY: /* This is the non-NL case */
4350 case OP_ALLANY:
4351 case OP_ANYBYTE:
4352 break;
4353
4354 case OP_ANYNL:
4355 switch(c)
4356 {
4357 default: MRRETURN(MATCH_NOMATCH);
4358 case 0x000d:
4359 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4360 break;
4361
4362 case 0x000a:
4363 break;
4364
4365 case 0x000b:
4366 case 0x000c:
4367 case 0x0085:
4368 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4369 break;
4370 }
4371 break;
4372
4373 case OP_NOT_HSPACE:
4374 switch(c)
4375 {
4376 default: break;
4377 case 0x09: /* HT */
4378 case 0x20: /* SPACE */
4379 case 0xa0: /* NBSP */
4380 MRRETURN(MATCH_NOMATCH);
4381 }
4382 break;
4383
4384 case OP_HSPACE:
4385 switch(c)
4386 {
4387 default: MRRETURN(MATCH_NOMATCH);
4388 case 0x09: /* HT */
4389 case 0x20: /* SPACE */
4390 case 0xa0: /* NBSP */
4391 break;
4392 }
4393 break;
4394
4395 case OP_NOT_VSPACE:
4396 switch(c)
4397 {
4398 default: break;
4399 case 0x0a: /* LF */
4400 case 0x0b: /* VT */
4401 case 0x0c: /* FF */
4402 case 0x0d: /* CR */
4403 case 0x85: /* NEL */
4404 MRRETURN(MATCH_NOMATCH);
4405 }
4406 break;
4407
4408 case OP_VSPACE:
4409 switch(c)
4410 {
4411 default: MRRETURN(MATCH_NOMATCH);
4412 case 0x0a: /* LF */
4413 case 0x0b: /* VT */
4414 case 0x0c: /* FF */
4415 case 0x0d: /* CR */
4416 case 0x85: /* NEL */
4417 break;
4418 }
4419 break;
4420
4421 case OP_NOT_DIGIT:
4422 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4423 break;
4424
4425 case OP_DIGIT:
4426 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4427 break;
4428
4429 case OP_NOT_WHITESPACE:
4430 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4431 break;
4432
4433 case OP_WHITESPACE:
4434 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4435 break;
4436
4437 case OP_NOT_WORDCHAR:
4438 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4439 break;
4440
4441 case OP_WORDCHAR:
4442 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4443 break;
4444
4445 default:
4446 RRETURN(PCRE_ERROR_INTERNAL);
4447 }
4448 }
4449 }
4450 /* Control never gets here */
4451 }
4452
4453 /* If maximizing, it is worth using inline code for speed, doing the type
4454 test once at the start (i.e. keep it out of the loop). Again, keep the
4455 UTF-8 and UCP stuff separate. */
4456
4457 else
4458 {
4459 pp = eptr; /* Remember where we started */
4460
4461 #ifdef SUPPORT_UCP
4462 if (prop_type >= 0) /* Maximizing repeat of OP_PROP / OP_NOTPROP */
4463 { /* Scan forward over as many passing characters as allowed, then back off one character at a time */
4464 switch(prop_type)
4465 {
4466 case PT_ANY: /* Any character passes unless the test is negated */
4467 for (i = min; i < max; i++)
4468 {
4469 int len = 1;
4470 if (eptr >= md->end_subject)
4471 {
4472 SCHECK_PARTIAL();
4473 break;
4474 }
4475 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); } /* Outside UTF-8 mode take a single byte; len stays 1 */
4476 if (prop_fail_result) break;
4477 eptr+= len;
4478 }
4479 break;
4480 /* PT_LAMP: the character type must be one of Lu, Ll, Lt */
4481 case PT_LAMP:
4482 for (i = min; i < max; i++)
4483 {
4484 int len = 1;
4485 if (eptr >= md->end_subject)
4486 {
4487 SCHECK_PARTIAL();
4488 break;
4489 }
4490 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4491 prop_chartype = UCD_CHARTYPE(c);
4492 if ((prop_chartype == ucp_Lu ||
4493 prop_chartype == ucp_Ll ||
4494 prop_chartype == ucp_Lt) == prop_fail_result)
4495 break;
4496 eptr+= len;
4497 }
4498 break;
4499 /* PT_GC: compare the character's category with prop_value */
4500 case PT_GC:
4501 for (i = min; i < max; i++)
4502 {
4503 int len = 1;
4504 if (eptr >= md->end_subject)
4505 {
4506 SCHECK_PARTIAL();
4507 break;
4508 }
4509 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4510 prop_category = UCD_CATEGORY(c);
4511 if ((prop_category == prop_value) == prop_fail_result)
4512 break;
4513 eptr+= len;
4514 }
4515 break;
4516 /* PT_PC: compare the character's type with prop_value */
4517 case PT_PC:
4518 for (i = min; i < max; i++)
4519 {
4520 int len = 1;
4521 if (eptr >= md->end_subject)
4522 {
4523 SCHECK_PARTIAL();
4524 break;
4525 }
4526 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4527 prop_chartype = UCD_CHARTYPE(c);
4528 if ((prop_chartype == prop_value) == prop_fail_result)
4529 break;
4530 eptr+= len;
4531 }
4532 break;
4533 /* PT_SC: compare the character's script with prop_value */
4534 case PT_SC:
4535 for (i = min; i < max; i++)
4536 {
4537 int len = 1;
4538 if (eptr >= md->end_subject)
4539 {
4540 SCHECK_PARTIAL();
4541 break;
4542 }
4543 if (!utf8) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4544 prop_script = UCD_SCRIPT(c);
4545 if ((prop_script == prop_value) == prop_fail_result)
4546 break;
4547 eptr+= len;
4548 }
4549 break;
4550 }
4551
4552 /* eptr is now past the end of the maximum run */
4553
4554 if (possessive) continue; /* Possessive repeat: keep the whole run, no backtracking */
4555 for(;;)
4556 {
4557 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4559 if (eptr-- == pp) break; /* Stop if tried at original pos */
4560 if (utf8) BACKCHAR(eptr); /* In UTF-8 mode finish stepping back to the start of the character */
4561 }
4562 }
4563
4564 /* Match extended Unicode sequences. We will get here only if the
4565 support is in the binary; otherwise a compile-time error occurs. */
4566
4567 else if (ctype == OP_EXTUNI)
4568 {
4569 for (i = min; i < max; i++)
4570 {
4571 if (eptr >= md->end_subject)
4572 {
4573 SCHECK_PARTIAL();
4574 break;
4575 }
4576 GETCHARINCTEST(c, eptr);
4577 prop_category = UCD_CATEGORY(c);
4578 if (prop_category == ucp_M) break;
4579 while (eptr < md->end_subject)
4580 {
4581 int len = 1;
4582 if (!utf8) c = *eptr; else
4583 {
4584 GETCHARLEN(c, eptr, len);
4585 }
4586 prop_category = UCD_CATEGORY(c);
4587 if (prop_category != ucp_M) break;
4588 eptr += len;
4589 }
4590 }
4591
4592 /* eptr is now past the end of the maximum run */
4593
4594 if (possessive) continue;
4595
4596 for(;;)
4597 {
4598 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4599 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4600 if (eptr-- == pp) break; /* Stop if tried at original pos */
4601 for (;;) /* Move back over one extended */
4602 {
4603 int len = 1;
4604 if (!utf8) c = *eptr; else
4605 {
4606 BACKCHAR(eptr);
4607 GETCHARLEN(c, eptr, len);
4608 }
4609 prop_category = UCD_CATEGORY(c);
4610 if (prop_category != ucp_M) break;
4611 eptr--;
4612 }
4613 }
4614 }
4615
4616 else
4617 #endif /* SUPPORT_UCP */
4618
4619 #ifdef SUPPORT_UTF8
4620 /* UTF-8 mode */
4621
4622 if (utf8)
4623 {
4624 switch(ctype)
4625 {
4626 case OP_ANY:
4627 if (max < INT_MAX)
4628 {
4629 for (i = min; i < max; i++)
4630 {
4631 if (eptr >= md->end_subject)
4632 {
4633 SCHECK_PARTIAL();
4634 break;
4635 }
4636 if (IS_NEWLINE(eptr)) break;
4637 eptr++;
4638 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4639 }
4640 }
4641
4642 /* Handle unlimited UTF-8 repeat */
4643
4644 else
4645 {
4646 for (i = min; i < max; i++)
4647 {
4648 if (eptr >= md->end_subject)
4649 {
4650 SCHECK_PARTIAL();
4651 break;
4652 }
4653 if (IS_NEWLINE(eptr)) break;
4654 eptr++;
4655 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4656 }
4657 }
4658 break;
4659
4660 case OP_ALLANY:
4661 if (max < INT_MAX)
4662 {
4663 for (i = min; i < max; i++)
4664 {
4665 if (eptr >= md->end_subject)
4666 {
4667 SCHECK_PARTIAL();
4668 break;
4669 }
4670 eptr++;
4671 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4672 }
4673 }
4674 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4675 break;
4676
4677 /* The byte case is the same as non-UTF8 */
4678
4679 case OP_ANYBYTE:
4680 c = max - min;
4681 if (c > (unsigned int)(md->end_subject - eptr))
4682 {
4683 eptr = md->end_subject;
4684 SCHECK_PARTIAL();
4685 }
4686 else eptr += c;
4687 break;
4688
4689 case OP_ANYNL:
4690 for (i = min; i < max; i++)
4691 {
4692 int len = 1;
4693 if (eptr >= md->end_subject)
4694 {
4695 SCHECK_PARTIAL();
4696 break;
4697 }
4698 GETCHARLEN(c, eptr, len);
4699 if (c == 0x000d)
4700 {
4701 if (++eptr >= md->end_subject) break;
4702 if (*eptr == 0x000a) eptr++;
4703 }
4704 else
4705 {
4706 if (c != 0x000a &&
4707 (md->bsr_anycrlf ||
4708 (c != 0x000b && c != 0x000c &&
4709 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4710 break;
4711 eptr += len;
4712 }
4713 }
4714 break;
4715
4716 case OP_NOT_HSPACE:
4717 case OP_HSPACE:
4718 for (i = min; i < max; i++)
4719 {
4720 BOOL gotspace;
4721 int len = 1;
4722 if (eptr >= md->end_subject)
4723 {
4724 SCHECK_PARTIAL();
4725 break;
4726 }
4727 GETCHARLEN(c, eptr, len);
4728 switch(c)
4729 {
4730 default: gotspace = FALSE; break;
4731 case 0x09: /* HT */
4732 case 0x20: /* SPACE */
4733 case 0xa0: /* NBSP */
4734 case 0x1680: /* OGHAM SPACE MARK */
4735 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4736 case 0x2000: /* EN QUAD */
4737 case 0x2001: /* EM QUAD */
4738 case 0x2002: /* EN SPACE */
4739 case 0x2003: /* EM SPACE */
4740 case 0x2004: /* THREE-PER-EM SPACE */
4741 case 0x2005: /* FOUR-PER-EM SPACE */
4742 case 0x2006: /* SIX-PER-EM SPACE */
4743 case 0x2007: /* FIGURE SPACE */
4744 case 0x2008: /* PUNCTUATION SPACE */
4745 case 0x2009: /* THIN SPACE */
4746 case 0x200A: /* HAIR SPACE */
4747 case 0x202f: /* NARROW NO-BREAK SPACE */
4748 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4749 case 0x3000: /* IDEOGRAPHIC SPACE */
4750 gotspace = TRUE;
4751 break;
4752 }
4753 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4754 eptr += len;
4755 }
4756 break;
4757
4758 case OP_NOT_VSPACE:
4759 case OP_VSPACE:
4760 for (i = min; i < max; i++)
4761 {
4762 BOOL gotspace;
4763 int len = 1;
4764 if (eptr >= md->end_subject)
4765 {
4766 SCHECK_PARTIAL();
4767 break;
4768 }
4769 GETCHARLEN(c, eptr, len);
4770 switch(c)
4771 {
4772 default: gotspace = FALSE; break;
4773 case 0x0a: /* LF */
4774 case 0x0b: /* VT */
4775 case 0x0c: /* FF */
4776 case 0x0d: /* CR */
4777 case 0x85: /* NEL */
4778 case 0x2028: /* LINE SEPARATOR */
4779 case 0x2029: /* PARAGRAPH SEPARATOR */
4780 gotspace = TRUE;
4781 break;
4782 }
4783 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4784 eptr += len;
4785 }
4786 break;
4787
4788 case OP_NOT_DIGIT:
4789 for (i = min; i < max; i++)
4790 {
4791 int len = 1;
4792 if (eptr >= md->end_subject)
4793 {
4794 SCHECK_PARTIAL();
4795 break;
4796 }
4797 GETCHARLEN(c, eptr, len);
4798 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4799 eptr+= len;
4800 }
4801 break;
4802
4803 case OP_DIGIT:
4804 for (i = min; i < max; i++)
4805 {
4806 int len = 1;
4807 if (eptr >= md->end_subject)
4808 {
4809 SCHECK_PARTIAL();
4810 break;
4811 }
4812 GETCHARLEN(c, eptr, len);
4813 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4814 eptr+= len;
4815 }
4816 break;
4817
4818 case OP_NOT_WHITESPACE:
4819 for (i = min; i < max; i++)
4820 {
4821 int len = 1;
4822 if (eptr >= md->end_subject)
4823 {
4824 SCHECK_PARTIAL();
4825 break;
4826 }
4827 GETCHARLEN(c, eptr, len);
4828 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4829 eptr+= len;
4830 }
4831 break;
4832
4833 case OP_WHITESPACE:
4834 for (i = min; i < max; i++)
4835 {
4836 int len = 1;
4837 if (eptr >= md->end_subject)
4838 {
4839 SCHECK_PARTIAL();
4840 break;
4841 }
4842 GETCHARLEN(c, eptr, len);
4843 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4844 eptr+= len;
4845 }
4846 break;
4847
4848 case OP_NOT_WORDCHAR:
4849 for (i = min; i < max; i++)
4850 {
4851 int len = 1;
4852 if (eptr >= md->end_subject)
4853 {
4854 SCHECK_PARTIAL();
4855 break;
4856 }
4857 GETCHARLEN(c, eptr, len);
4858 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4859 eptr+= len;
4860 }
4861 break;
4862
4863 case OP_WORDCHAR:
4864 for (i = min; i < max; i++)
4865 {
4866 int len = 1;
4867 if (eptr >= md->end_subject)
4868 {
4869 SCHECK_PARTIAL();
4870 break;
4871 }
4872 GETCHARLEN(c, eptr, len);
4873 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4874 eptr+= len;
4875 }
4876 break;
4877
4878 default:
4879 RRETURN(PCRE_ERROR_INTERNAL);
4880 }
4881
4882 /* eptr is now past the end of the maximum run */
4883
4884 if (possessive) continue;
4885 for(;;)
4886 {
4887 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4888 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4889 if (eptr-- == pp) break; /* Stop if tried at original pos */
4890 BACKCHAR(eptr);
4891 }
4892 }
4893 else
4894 #endif /* SUPPORT_UTF8 */
4895
4896 /* Not UTF-8 mode */
4897 {
4898 switch(ctype)
4899 {
4900 case OP_ANY:
4901 for (i = min; i < max; i++)
4902 {
4903 if (eptr >= md->end_subject)
4904 {
4905 SCHECK_PARTIAL();
4906 break;
4907 }
4908 if (IS_NEWLINE(eptr)) break;
4909 eptr++;
4910 }
4911 break;
4912
4913 case OP_ALLANY:
4914 case OP_ANYBYTE:
4915 c = max - min;
4916 if (c > (unsigned int)(md->end_subject - eptr))
4917 {
4918 eptr = md->end_subject;
4919 SCHECK_PARTIAL();
4920 }
4921 else eptr += c;
4922 break;
4923
4924 case OP_ANYNL:
4925 for (i = min; i < max; i++)
4926 {
4927 if (eptr >= md->end_subject)
4928 {
4929 SCHECK_PARTIAL();
4930 break;
4931 }
4932 c = *eptr;
4933 if (c == 0x000d)
4934 {
4935 if (++eptr >= md->end_subject) break;
4936 if (*eptr == 0x000a) eptr++;
4937 }
4938 else
4939 {
4940 if (c != 0x000a &&
4941 (md->bsr_anycrlf ||
4942 (c != 0x000b && c != 0x000c && c != 0x0085)))
4943 break;
4944 eptr++;
4945 }
4946 }
4947 break;
4948
4949 case OP_NOT_HSPACE:
4950 for (i = min; i < max; i++)
4951 {
4952 if (eptr >= md->end_subject)
4953 {
4954 SCHECK_PARTIAL();
4955 break;
4956 }
4957 c = *eptr;
4958 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4959 eptr++;
4960 }
4961 break;
4962
4963 case OP_HSPACE:
4964 for (i = min; i < max; i++)
4965 {
4966 if (eptr >= md->end_subject)
4967 {
4968 SCHECK_PARTIAL();
4969 break;
4970 }
4971 c = *eptr;
4972 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4973 eptr++;
4974 }
4975 break;
4976
4977 case OP_NOT_VSPACE:
4978 for (i = min; i < max; i++)
4979 {
4980 if (eptr >= md->end_subject)
4981 {
4982 SCHECK_PARTIAL();
4983 break;
4984 }
4985 c = *eptr;
4986 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4987 break;
4988 eptr++;
4989 }
4990 break;
4991
4992 case OP_VSPACE:
4993 for (i = min; i < max; i++)
4994 {
4995 if (eptr >= md->end_subject)
4996 {
4997 SCHECK_PARTIAL();
4998 break;
4999 }
5000 c = *eptr;
5001 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5002 break;
5003 eptr++;
5004 }
5005 break;
5006
5007 case OP_NOT_DIGIT:
5008 for (i = min; i < max; i++)
5009 {
5010 if (eptr >= md->end_subject)
5011 {
5012 SCHECK_PARTIAL();
5013 break;
5014 }
5015 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5016 eptr++;
5017 }
5018 break;
5019
5020 case OP_DIGIT:
5021 for (i = min; i < max; i++)
5022 {
5023 if (eptr >= md->end_subject)
5024 {
5025 SCHECK_PARTIAL();
5026 break;
5027 }
5028 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5029 eptr++;
5030 }
5031 break;
5032
5033 case OP_NOT_WHITESPACE:
5034 for (i = min; i < max; i++)
5035 {
5036 if (eptr >= md->end_subject)
5037 {
5038 SCHECK_PARTIAL();
5039 break;
5040 }
5041 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5042 eptr++;
5043 }
5044 break;
5045
5046 case OP_WHITESPACE:
5047 for (i = min; i < max; i++)
5048 {
5049 if (eptr >= md->end_subject)
5050 {
5051 SCHECK_PARTIAL();
5052 break;
5053 }
5054 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5055 eptr++;
5056 }
5057 break;
5058
5059 case OP_NOT_WORDCHAR:
5060 for (i = min; i < max; i++)
5061 {
5062 if (eptr >= md->end_subject)
5063 {
5064 SCHECK_PARTIAL();
5065 break;
5066 }
5067 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5068 eptr++;
5069 }
5070 break;
5071
5072 case OP_WORDCHAR:
5073 for (i = min; i < max; i++)
5074 {
5075 if (eptr >= md->end_subject)
5076 {
5077 SCHECK_PARTIAL();
5078 break;
5079 }
5080 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5081 eptr++;
5082 }
5083 break;
5084
5085 default:
5086 RRETURN(PCRE_ERROR_INTERNAL);
5087 }
5088
5089 /* eptr is now past the end of the maximum run */
5090
5091 if (possessive) continue;
5092 while (eptr >= pp)
5093 {
5094 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5095 eptr--;
5096 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5097 }
5098 }
5099
5100 /* Get here if we can't make it match with any permitted repetitions */
5101
5102 MRRETURN(MATCH_NOMATCH);
5103 }
5104 /* Control never gets here */
5105
5106 /* There's been some horrible disaster. Arrival here can only mean there is
5107 something seriously wrong in the code above or the OP_xxx definitions. */
5108
5109 default:
5110 DPRINTF(("Unknown opcode %d\n", *ecode));
5111 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5112 }
5113
5114 /* Do not stick any code in here without much thought; it is assumed
5115 that "continue" in the code above comes out to here to repeat the main
5116 loop. */
5117
5118 } /* End of main loop */
5119 /* Control never reaches here */
5120
5121
5122 /* When compiling to use the heap rather than the stack for recursive calls to
5123 match(), the RRETURN() macro jumps here. The number that is saved in
5124 frame->Xwhere indicates which label we actually want to return to. */
5125
5126 #ifdef NO_RECURSE
5127 #define LBL(val) case val: goto L_RM##val;
5128 HEAP_RETURN:
5129 switch (frame->Xwhere)
5130 {
5131 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5132 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5133 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5134 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5135 LBL(53) LBL(54)
5136 #ifdef SUPPORT_UTF8
5137 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5138 LBL(32) LBL(34) LBL(42) LBL(46)
5139 #ifdef SUPPORT_UCP
5140 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5141 #endif /* SUPPORT_UCP */
5142 #endif /* SUPPORT_UTF8 */
5143 default:
5144 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5145 return PCRE_ERROR_INTERNAL;
5146 }
5147 #undef LBL
5148 #endif /* NO_RECURSE */
5149 }
5150
5151
5152 /***************************************************************************
5153 ****************************************************************************
5154 RECURSION IN THE match() FUNCTION
5155
5156 Undefine all the macros that were defined above to handle this. */
5157 /* The guarded names are #undef'd only in a NO_RECURSE build; fc and fi always. */
5158 #ifdef NO_RECURSE
5159 #undef eptr
5160 #undef ecode
5161 #undef mstart
5162 #undef offset_top
5163 #undef ims
5164 #undef eptrb
5165 #undef flags
5166
5167 #undef callpat
5168 #undef charptr
5169 #undef data
5170 #undef next
5171 #undef pp
5172 #undef prev
5173 #undef saved_eptr
5174
5175 #undef new_recursive
5176
5177 #undef cur_is_word
5178 #undef condition
5179 #undef prev_is_word
5180
5181 #undef original_ims
5182
5183 #undef ctype
5184 #undef length
5185 #undef max
5186 #undef min
5187 #undef number
5188 #undef offset
5189 #undef op
5190 #undef save_capture_last
5191 #undef save_offset1
5192 #undef save_offset2
5193 #undef save_offset3
5194 #undef stacksave
5195
5196 #undef newptrb
5197
5198 #endif /* NO_RECURSE */
5199
5200 /* These two are defined as macros in both cases (with and without NO_RECURSE) */
5201
5202 #undef fc
5203 #undef fi
5204
5205 /***************************************************************************
5206 ***************************************************************************/
5207
5208
5209
5210 /*************************************************
5211 * Execute a Regular Expression *
5212 *************************************************/
5213
5214 /* This function applies a compiled re to a subject string and picks out
5215 portions of the string if it matches. Two elements in the vector are set for
5216 each substring: the offsets to the start and end of the substring.
5217
5218 Arguments:
5219 argument_re points to the compiled expression
5220 extra_data points to extra data or is NULL
5221 subject points to the subject string
5222 length length of subject string (may contain binary zeros)
5223 start_offset where to start in the subject string
5224 options option bits
5225 offsets points to a vector of ints to be filled in with offsets
5226 offsetcount the number of elements in the vector
5227
5228 Returns: > 0 => success; value is the number of captured strings (offset pairs) set
5229 = 0 => success, but offsets is not big enough
5230 -1 => failed to match
5231 < -1 => some kind of unexpected problem
5232 */
5233
5234 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5235 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5236 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5237 int offsetcount)
5238 {
5239 int rc, resetcount, ocount;
5240 int first_byte = -1;
5241 int req_byte = -1;
5242 int req_byte2 = -1;
5243 int newline;
5244 unsigned long int ims;
5245 BOOL using_temporary_offsets = FALSE;
5246 BOOL anchored;
5247 BOOL startline;
5248 BOOL firstline;
5249 BOOL first_byte_caseless = FALSE;
5250 BOOL req_byte_caseless = FALSE;
5251 BOOL utf8;
5252 match_data match_block;
5253 match_data *md = &match_block;
5254 const uschar *tables;
5255 const uschar *start_bits = NULL;
5256 USPTR start_match = (USPTR)subject + start_offset;
5257 USPTR end_subject;
5258 USPTR start_partial = NULL;
5259 USPTR req_byte_ptr = start_match - 1;
5260
5261 pcre_study_data internal_study;
5262 const pcre_study_data *study;
5263
5264 real_pcre internal_re;
5265 const real_pcre *external_re = (const real_pcre *)argument_re;
5266 const real_pcre *re = external_re;
5267
5268 /* Plausibility checks */
5269
5270 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5271 if (re == NULL || subject == NULL ||
5272 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5273 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5274
5275 /* This information is for finding all the numbers associated with a given
5276 name, for condition testing. */
5277
5278 md->name_table = (uschar *)re + re->name_table_offset;
5279 md->name_count = re->name_count;
5280 md->name_entry_size = re->name_entry_size;
5281
5282 /* Fish out the optional data from the extra_data structure, first setting
5283 the default values. */
5284
5285 study = NULL;
5286 md->match_limit = MATCH_LIMIT;
5287 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5288 md->callout_data = NULL;
5289
5290 /* The table pointer is always in native byte order. */
5291
5292 tables = external_re->tables;
5293
5294 if (extra_data != NULL)
5295 {
5296 register unsigned int flags = extra_data->flags;
5297 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5298 study = (const pcre_study_data *)extra_data->study_data;
5299 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5300 md->match_limit = extra_data->match_limit;
5301 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5302 md->match_limit_recursion = extra_data->match_limit_recursion;
5303 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5304 md->callout_data = extra_data->callout_data;
5305 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5306 }
5307
5308 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5309 is a feature that makes it possible to save compiled regex and re-use them
5310 in other programs later. */
5311
5312 if (tables == NULL) tables = _pcre_default_tables;
5313
5314 /* Check that the first field in the block is the magic number. If it is not,
5315 test for a regex that was compiled on a host of opposite endianness. If this is
5316 the case, flipped values are put in internal_re and internal_study if there was
5317 study data too. */
5318
5319 if (re->magic_number != MAGIC_NUMBER)
5320 {
5321 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5322 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5323 if (study != NULL) study = &internal_study;
5324 }
5325
5326 /* Set up other data */
5327
5328 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5329 startline = (re->flags & PCRE_STARTLINE) != 0;
5330 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5331
5332 /* The code starts after the real_pcre block and the capture name table. */
5333
5334 md->start_code = (const uschar *)external_re + re->name_table_offset +
5335 re->name_count * re->name_entry_size;
5336
5337 md->start_subject = (USPTR)subject;
5338 md->start_offset = start_offset;
5339 md->end_subject = md->start_subject + length;
5340 end_subject = md->end_subject;
5341
5342 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5343 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5344 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5345
5346 md->notbol = (options & PCRE_NOTBOL) != 0;
5347 md->noteol = (options & PCRE_NOTEOL) != 0;
5348 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5349 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5350 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5351 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5352 md->hitend = FALSE;
5353 md->mark = NULL; /* In case never set */
5354
5355 md->recursive = NULL; /* No recursion at top level */
5356
5357 md->lcc = tables + lcc_offset;
5358 md->ctypes = tables + ctypes_offset;
5359
5360 /* Handle different \R options. */
5361
5362 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5363 {
5364 case 0:
5365 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5366 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5367 else
5368 #ifdef BSR_ANYCRLF
5369 md->bsr_anycrlf = TRUE;
5370 #else
5371 md->bsr_anycrlf = FALSE;
5372 #endif
5373 break;
5374
5375 case PCRE_BSR_ANYCRLF:
5376 md->bsr_anycrlf = TRUE;
5377 break;
5378
5379 case PCRE_BSR_UNICODE:
5380 md->bsr_anycrlf = FALSE;
5381 break;
5382
5383 default: return PCRE_ERROR_BADNEWLINE;
5384 }
5385
5386 /* Handle different types of newline. The three bits give eight cases. If
5387 nothing is set at run time, whatever was used at compile time applies. */
5388
5389 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5390 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5391 {
5392 case 0: newline = NEWLINE; break; /* Compile-time default */
5393 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5394 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5395 case PCRE_NEWLINE_CR+
5396 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5397 case PCRE_NEWLINE_ANY: newline = -1; break;
5398 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5399 default: return PCRE_ERROR_BADNEWLINE;
5400 }
5401
5402 if (newline == -2)
5403 {
5404 md->nltype = NLTYPE_ANYCRLF;
5405 }
5406 else if (newline < 0)
5407 {
5408 md->nltype = NLTYPE_ANY;
5409 }
5410 else
5411 {
5412 md->nltype = NLTYPE_FIXED;
5413 if (newline > 255)
5414 {
5415 md->nllen = 2;
5416 md->nl[0] = (newline >> 8) & 255;
5417 md->nl[1] = newline & 255;
5418 }
5419 else
5420 {
5421 md->nllen = 1;
5422 md->nl[0] = newline;
5423 }
5424 }
5425
5426 /* Partial matching was originally supported only for a restricted set of
5427 regexes; from release 8.00 there are no restrictions, but the bits are still
5428 defined (though never set). So there's no harm in leaving this code. */
5429
5430 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5431 return PCRE_ERROR_BADPARTIAL;
5432
5433 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5434 back the character offset. */
5435
5436 #ifdef SUPPORT_UTF8
5437 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5438 {
5439 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5440 return PCRE_ERROR_BADUTF8;
5441 if (start_offset > 0 && start_offset < length)
5442 {
5443 int tb = ((USPTR)subject)[start_offset];
5444 if (tb > 127)
5445 {
5446 tb &= 0xc0;
5447 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5448 }
5449 }
5450 }
5451 #endif
5452
5453 /* The ims options can vary during the matching as a result of the presence
5454 of (?ims) items in the pattern. They are kept in a local variable so that
5455 restoring at the exit of a group is easy. */
5456
5457 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5458
5459 /* If the expression has got more back references than the offsets supplied can
5460 hold, we get a temporary chunk of working store to use during the matching.
5461 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5462 of 3. */
5463
5464 ocount = offsetcount - (offsetcount % 3);
5465
5466 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5467 {
5468 ocount = re->top_backref * 3 + 3;
5469 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5470 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5471 using_temporary_offsets = TRUE;
5472 DPRINTF(("Got memory to hold back references\n"));
5473 }
5474 else md->offset_vector = offsets;
5475
5476 md->offset_end = ocount;
5477 md->offset_max = (2*ocount)/3;
5478 md->offset_overflow = FALSE;
5479 md->capture_last = -1;
5480
5481 /* Compute the minimum number of offsets that we need to reset each time. Doing
5482 this makes a huge difference to execution time when there aren't many brackets
5483 in the pattern. */
5484
5485 resetcount = 2 + re->top_bracket * 2;
5486 if (resetcount > offsetcount) resetcount = ocount;
5487
5488 /* Reset the working variables associated with each extraction. These should
5489 never be used unless previously set, but they get saved and restored, and so we
5490 initialize them to avoid reading uninitialized locations. */
5491
5492 if (md->offset_vector != NULL)
5493 {
5494 register int *iptr = md->offset_vector + ocount;
5495 register int *iend = iptr - resetcount/2 + 1;
5496 while (--iptr >= iend) *iptr = -1;
5497 }
5498
5499 /* Set up the first character to match, if available. The first_byte value is
5500 never set for an anchored regular expression, but the anchoring may be forced
5501 at run time, so we have to test for anchoring. The first char may be unset for
5502 an unanchored pattern, of course. If there's no first char and the pattern was
5503 studied, there may be a bitmap of possible first characters. */
5504
5505 if (!anchored)
5506 {
5507 if ((re->flags & PCRE_FIRSTSET) != 0)
5508 {
5509 first_byte = re->first_byte & 255;
5510 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5511 first_byte = md->lcc[first_byte];
5512 }
5513 else
5514 if (!startline && study != NULL &&
5515 (study->flags & PCRE_STUDY_MAPPED) != 0)
5516 start_bits = study->start_bits;
5517 }
5518
5519 /* For anchored or unanchored matches, there may be a "last known required
5520 character" set. */
5521
5522 if ((re->flags & PCRE_REQCHSET) != 0)
5523 {
5524 req_byte = re->req_byte & 255;
5525 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5526 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5527 }
5528
5529
5530 /* ==========================================================================*/
5531
5532 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5533 the loop runs just once. */
5534
5535 for(;;)
5536 {
5537 USPTR save_end_subject = end_subject;
5538 USPTR new_start_match;
5539
5540 /* Reset the maximum number of extractions we might see. */
5541
5542 if (md->offset_vector != NULL)
5543 {
5544 register int *iptr = md->offset_vector;
5545 register int *iend = iptr + resetcount;
5546 while (iptr < iend) *iptr++ = -1;
5547 }
5548
5549 /* If firstline is TRUE, the start of the match is constrained to the first
5550 line of a multiline string. That is, the match must be before or at the first
5551 newline. Implement this by temporarily adjusting end_subject so that we stop
5552 scanning at a newline. If the match fails at the newline, later code breaks
5553 this loop. */
5554
5555 if (firstline)
5556 {
5557 USPTR t = start_match;
5558 #ifdef SUPPORT_UTF8
5559 if (utf8)
5560 {
5561 while (t < md->end_subject && !IS_NEWLINE(t))
5562 {
5563 t++;
5564 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5565 }
5566 }
5567 else
5568 #endif
5569 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5570 end_subject = t;
5571 }
5572
5573 /* There are some optimizations that avoid running the match if a known
5574 starting point is not found, or if a known later character is not present.
5575 However, there is an option that disables these, for testing and for ensuring
5576 that all callouts do actually occur. */
5577
5578 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5579 {
5580 /* Advance to a unique first byte if there is one. */
5581
5582 if (first_byte >= 0)
5583 {
5584 if (first_byte_caseless)
5585 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5586 start_match++;
5587 else
5588 while (start_match < end_subject && *start_match != first_byte)
5589 start_match++;
5590 }
5591
5592 /* Or to just after a linebreak for a multiline match */
5593
5594 else if (startline)
5595 {
5596 if (start_match > md->start_subject + start_offset)
5597 {
5598 #ifdef SUPPORT_UTF8
5599 if (utf8)
5600 {
5601 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5602 {
5603 start_match++;
5604 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5605 start_match++;
5606 }
5607 }
5608 else
5609 #endif
5610 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5611 start_match++;
5612
5613 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5614 and we are now at a LF, advance the match position by one more character.
5615 */
5616
5617 if (start_match[-1] == CHAR_CR &&
5618 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5619 start_match < end_subject &&
5620 *start_match == CHAR_NL)
5621 start_match++;
5622 }
5623 }
5624
5625 /* Or to a non-unique first byte after study */
5626
5627 else if (start_bits != NULL)
5628 {
5629 while (start_match < end_subject)
5630 {
5631 register unsigned int c = *start_match;
5632 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5633 else break;
5634 }
5635 }
5636 } /* Starting optimizations */
5637
5638 /* Restore fudged end_subject */
5639
5640 end_subject = save_end_subject;
5641
5642 /* The following two optimizations are disabled for partial matching or if
5643 disabling is explicitly requested. */
5644
5645 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5646 {
5647 /* If the pattern was studied, a minimum subject length may be set. This is
5648 only a lower bound; there may be no string of exactly that length that
5649 matches the pattern. Although the value is, strictly, in characters, we treat
5650 it as bytes to avoid spending too much time in this optimization. */
5651
5652 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5653 (pcre_uint32)(end_subject - start_match) < study->minlength)
5654 {
5655 rc = MATCH_NOMATCH;
5656 break;
5657 }
5658
5659 /* If req_byte is set, we know that that character must appear in the
5660 subject for the match to succeed. If the first character is set, req_byte
5661 must be later in the subject; otherwise the test starts at the match point.
5662 This optimization can save a huge amount of backtracking in patterns with
5663 nested unlimited repeats that aren't going to match. Writing separate code
5664 for cased/caseless versions makes it go faster, as does using an
5665 autoincrement and backing off on a match.
5666
5667 HOWEVER: when the subject string is very, very long, searching to its end
5668 can take a long time, and give bad performance on quite ordinary patterns.
5669 This showed up when somebody was matching something like /^\d+C/ on a
5670 32-megabyte string... so we don't do this when the string is sufficiently
5671 long. */
5672
5673 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5674 {
5675 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5676
5677 /* We don't need to repeat the search if we haven't yet reached the
5678 place we found it at last time. */
5679
5680 if (p > req_byte_ptr)
5681 {
5682 if (req_byte_caseless)
5683 {
5684 while (p < end_subject)
5685 {
5686 register int pp = *p++;
5687 if (pp == req_byte || pp == req_byte2) { p--; break; }
5688 }
5689 }
5690 else
5691 {
5692 while (p < end_subject)
5693 {
5694 if (*p++ == req_byte) { p--; break; }
5695 }
5696 }
5697
5698 /* If we can't find the required character, break the matching loop,
5699 forcing a match failure. */
5700
5701 if (p >= end_subject)
5702 {
5703 rc = MATCH_NOMATCH;
5704 break;
5705 }
5706
5707 /* If we have found the required character, save the point where we
5708 found it, so that we don't search again next time round the loop if
5709 the start hasn't passed this character yet. */
5710
5711 req_byte_ptr = p;
5712 }
5713 }
5714 }
5715
5716 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
5717 printf(">>>> Match against: ");
5718 pchars(start_match, end_subject - start_match, TRUE, md);
5719 printf("\n");
5720 #endif
5721
5722 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5723 first starting point for which a partial match was found. */
5724
5725 md->start_match_ptr = start_match;
5726 md->start_used_ptr = start_match;
5727 md->match_call_count = 0;
5728 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
5729 0, 0);
5730 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5731
5732 switch(rc)
5733 {
5734 /* NOMATCH and PRUNE advance by one character. If MATCH_SKIP_ARG reaches
5735 this level it means that a MARK that matched the SKIP's arg was not found.
5736 We treat this as NOMATCH. THEN at this level acts exactly like PRUNE. */
5737
5738 case MATCH_NOMATCH:
5739 case MATCH_PRUNE:
5740 case MATCH_SKIP_ARG:
5741 case MATCH_THEN:
5742 new_start_match = start_match + 1;
5743 #ifdef SUPPORT_UTF8
5744 if (utf8)
5745 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5746 new_start_match++;
5747 #endif
5748 break;
5749
5750 /* SKIP passes back the next starting point explicitly. */
5751
5752 case MATCH_SKIP:
5753 new_start_match = md->start_match_ptr;
5754 break;
5755
5756 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5757
5758 case MATCH_COMMIT:
5759 rc = MATCH_NOMATCH;
5760 goto ENDLOOP;
5761
5762 /* Any other return is either a match, or some kind of error. */
5763
5764 default:
5765 goto ENDLOOP;
5766 }
5767
5768 /* Control reaches here for the various types of "no match at this point"
5769 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5770
5771 rc = MATCH_NOMATCH;
5772
5773 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5774 newline in the subject (though it may continue over the newline). Therefore,
5775 if we have just failed to match, starting at a newline, do not continue. */
5776
5777 if (firstline && IS_NEWLINE(start_match)) break;
5778
5779 /* Advance to new matching position */
5780
5781 start_match = new_start_match;
5782
5783 /* Break the loop if the pattern is anchored or if we have passed the end of
5784 the subject. */
5785
5786 if (anchored || start_match > end_subject) break;
5787
5788 /* If we have just passed a CR and we are now at a LF, and the pattern does
5789 not contain any explicit matches for \r or \n, and the newline option is CRLF
5790 or ANY or ANYCRLF, advance the match position by one more character. */
5791
5792 if (start_match[-1] == CHAR_CR &&
5793 start_match < end_subject &&
5794 *start_match == CHAR_NL &&
5795 (re->flags & PCRE_HASCRORLF) == 0 &&
5796 (md->nltype == NLTYPE_ANY ||
5797 md->nltype == NLTYPE_ANYCRLF ||
5798 md->nllen == 2))
5799 start_match++;
5800
5801 md->mark = NULL; /* Reset for start of next match attempt */
5802 } /* End of for(;;) "bumpalong" loop */
5803
5804 /* ==========================================================================*/
5805
5806 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5807 conditions is true:
5808
5809 (1) The pattern is anchored or the match was failed by (*COMMIT);
5810
5811 (2) We are past the end of the subject;
5812
5813 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5814 this option requests that a match occur at or before the first newline in
5815 the subject.
5816
5817 When we have a match and the offset vector is big enough to deal with any
5818 backreferences, captured substring offsets will already be set up. In the case
5819 where we had to get some local store to hold offsets for backreference
5820 processing, copy those that we can. In this case there need not be overflow if
5821 certain parts of the pattern were not used, even though there are more
5822 capturing parentheses than vector slots. */
5823
5824 ENDLOOP:
5825
5826 if (rc == MATCH_MATCH)
5827 {
5828 if (using_temporary_offsets)
5829 {
5830 if (offsetcount >= 4)
5831 {
5832 memcpy(offsets + 2, md->offset_vector + 2,
5833 (offsetcount - 2) * sizeof(int));
5834 DPRINTF(("Copied offsets from temporary memory\n"));
5835 }
5836 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5837 DPRINTF(("Freeing temporary memory\n"));
5838 (pcre_free)(md->offset_vector);
5839 }
5840
5841 /* Set the return code to the number of captured strings, or 0 if there are
5842 too many to fit into the vector. */
5843
5844 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5845
5846 /* If there is space, set up the whole thing as substring 0. The value of
5847 md->start_match_ptr might be modified if \K was encountered on the
5848 successful matching path. */
5849
5850 if (offsetcount < 2) rc = 0; else
5851 {
5852 offsets[0] = md->start_match_ptr - md->start_subject;
5853 offsets[1] = md->end_match_ptr - md->start_subject;
5854 }
5855
5856 DPRINTF((">>>> returning %d\n", rc));
5857 goto RETURN_MARK;
5858 }
5859
5860 /* Control gets here if there has been an error, or if the overall match
5861 attempt has failed at all permitted starting positions. */
5862
5863 if (using_temporary_offsets)
5864 {
5865 DPRINTF(("Freeing temporary memory\n"));
5866 (pcre_free)(md->offset_vector);
5867 }
5868
5869 /* For anything other than nomatch or partial match, just return the code. */
5870
5871 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5872 {
5873 DPRINTF((">>>> error: returning %d\n", rc));
5874 return rc;
5875 }
5876
5877 /* Handle partial matches - disable any mark data */
5878
5879 if (start_partial != NULL)
5880 {
5881 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5882 md->mark = NULL;
5883 if (offsetcount > 1)
5884 {
5885 offsets[0] = start_partial - (USPTR)subject;
5886 offsets[1] = end_subject - (USPTR)subject;
5887 }
5888 rc = PCRE_ERROR_PARTIAL;
5889 }
5890
5891 /* This is the classic nomatch case */
5892
5893 else
5894 {
5895 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5896 rc = PCRE_ERROR_NOMATCH;
5897 }
5898
5899 /* Return the MARK data if it has been requested. */
5900
5901 RETURN_MARK:
5902
5903 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
5904 *(extra_data->mark) = (unsigned char *)(md->mark);
5905 return rc;
5906 }
5907
5908 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12