/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 517 - (show annotations) (download)
Wed May 5 10:44:20 2010 UTC (4 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 183493 byte(s)
Add new special properties Xan, Xps, Xsp, Xwd to help with \w etc.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2010 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
/* Flag bits for the match() function */

#define match_condassert     0x01  /* Called to check a condition assertion */
#define match_cbegroup       0x02  /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Special internal returns from the match() function. Make them sufficiently
negative to avoid the external error codes. Each value is parenthesized so
that it can be used safely inside larger expressions. */

#define MATCH_ACCEPT       (-999)
#define MATCH_COMMIT       (-998)
#define MATCH_PRUNE        (-997)
#define MATCH_SKIP         (-996)
#define MATCH_SKIP_ARG     (-995)
#define MATCH_THEN         (-994)
80
/* This is a convenience macro for code that occurs many times. It copies the
current markptr into md->mark and then leaves via RRETURN with the given
return code. It expands to a braced block and relies on "md" and "markptr"
being in scope at the point of use. */

#define MRRETURN(ra) \
  { \
  md->mark = markptr; \
  RRETURN(ra); \
  }
88
89 /* Maximum number of ints of offset to save on the stack for recursive calls.
90 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91 because the offset vector is always a multiple of 3 long. */
92
93 #define REC_STACK_SAVE_MAX 30
94
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Both tables have six entries and are indexed in parallel: entry k of rep_min
and entry k of rep_max describe the same repeat. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
99
100
101
#ifdef PCRE_DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Bytes for which isprint() is false are written as \xhh
escapes.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE
                (it is not dereferenced otherwise)

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int c;

/* When printing subject text, clamp the count so that we never read past the
end of the subject. */

if (is_subject && length > md->end_subject - p) length = md->end_subject - p;

/* Fetch the byte before testing it, so that the argument passed to isprint()
has no side effects. */

while (length-- > 0)
  {
  c = *(p++);
  if (isprint(c)) printf("%c", c); else printf("\\x%02x", c);
  }
}
#endif
128
129
130
131 /*************************************************
132 * Match a back-reference *
133 *************************************************/
134
135 /* If a back reference hasn't been set, the length that is passed is greater
136 than the number of characters left in the string, so the match fails.
137
138 Arguments:
139 offset index into the offset vector
140 eptr points into the subject
141 length length to be matched
142 md points to match data block
143 ims the ims flags
144
145 Returns: TRUE if matched
146 */
147
148 static BOOL
149 match_ref(int offset, register USPTR eptr, int length, match_data *md,
150 unsigned long int ims)
151 {
152 USPTR p = md->start_subject + md->offset_vector[offset];
153
154 #ifdef PCRE_DEBUG
155 if (eptr >= md->end_subject)
156 printf("matching subject <null>");
157 else
158 {
159 printf("matching subject ");
160 pchars(eptr, length, TRUE, md);
161 }
162 printf(" against backref ");
163 pchars(p, length, FALSE, md);
164 printf("\n");
165 #endif
166
167 /* Always fail if not enough characters left */
168
169 if (length > md->end_subject - eptr) return FALSE;
170
171 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172 properly if Unicode properties are supported. Otherwise, we can check only
173 ASCII characters. */
174
175 if ((ims & PCRE_CASELESS) != 0)
176 {
177 #ifdef SUPPORT_UTF8
178 #ifdef SUPPORT_UCP
179 if (md->utf8)
180 {
181 USPTR endptr = eptr + length;
182 while (eptr < endptr)
183 {
184 int c, d;
185 GETCHARINC(c, eptr);
186 GETCHARINC(d, p);
187 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
188 }
189 }
190 else
191 #endif
192 #endif
193
194 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195 is no UCP support. */
196
197 while (length-- > 0)
198 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
199 }
200
201 /* In the caseful case, we can just compare the bytes, whether or not we
202 are in UTF-8 mode. */
203
204 else
205 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206
207 return TRUE;
208 }
209
210
211
212 /***************************************************************************
213 ****************************************************************************
214 RECURSION IN THE match() FUNCTION
215
216 The match() function is highly recursive, though not every recursive call
217 increases the recursive depth. Nevertheless, some regular expressions can cause
218 it to recurse to a great depth. I was writing for Unix, so I just let it call
219 itself recursively. This uses the stack for saving everything that has to be
220 saved for a recursive call. On Unix, the stack can be large, and this works
221 fine.
222
223 It turns out that on some non-Unix-like systems there are problems with
224 programs that use a lot of stack. (This despite the fact that every last chip
225 has oodles of memory these days, and techniques for extending the stack have
226 been known for decades.) So....
227
There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to achieve
this so that the actual code doesn't look very different to what it always
used to.
233
234 The original heap-recursive code used longjmp(). However, it seems that this
235 can be very slow on some operating systems. Following a suggestion from Stan
236 Switzer, the use of longjmp() has been abolished, at the cost of having to
237 provide a unique number for each call to RMATCH. There is no way of generating
238 a sequence of numbers at compile time in C. I have given them names, to make
239 them stand out more clearly.
240
241 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243 tests. Furthermore, not using longjmp() means that local dynamic variables
244 don't have indeterminate values; this has meant that the frame size can be
245 reduced because the result can be "passed back" by straight setting of the
246 variable instead of being passed in the frame.
247 ****************************************************************************
248 ***************************************************************************/
249
/* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
below must be updated in sync. The enumerators start at 1 and are consecutive,
so RMn has the value n. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
       RM51,  RM52, RM53, RM54, RM55, RM56, RM57, RM58 };
259
260 /* These versions of the macros use the stack, as normal. There are debugging
261 versions and production versions. Note that the "rw" argument of RMATCH isn't
262 actually used in this definition. */
263
264 #ifndef NO_RECURSE
#define REGISTER register

/* RMATCH(ra,rb,rc,rd,re,rf,rg,rw) calls match() recursively, passing the
current mstart and markptr and rdepth+1 in addition to its own arguments, and
leaves the result in rrc. The "rw" argument is not referenced in these
definitions. RRETURN(ra) is a plain "return". The PCRE_DEBUG variants
additionally trace each call and return on stdout; note that the debugging
RRETURN mentions its argument twice. */

#ifdef PCRE_DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif
284
285 #else
286
287
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH obtains a new frame from pcre_stack_malloc, records in the current
frame where to resume (rw), fills in the new frame's argument fields, links it
to the current frame, and jumps to HEAP_RECURSE. The label L_<rw> is where
control comes back to when the "call" returns. If the frame cannot be
allocated, the current "call" gives up with PCRE_ERROR_NOMEMORY instead of
writing through a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN frees the current frame and makes its predecessor current. If there
is a predecessor, the result is passed back in rrc and control jumps to
HEAP_RETURN; otherwise this was the top-level frame and the function really
returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
327
328
329 /* Structure for remembering the local variables in a private frame */
330
331 typedef struct heapframe {
332 struct heapframe *Xprevframe;
333
334 /* Function arguments that may change */
335
336 USPTR Xeptr;
337 const uschar *Xecode;
338 USPTR Xmstart;
339 USPTR Xmarkptr;
340 int Xoffset_top;
341 long int Xims;
342 eptrblock *Xeptrb;
343 int Xflags;
344 unsigned int Xrdepth;
345
346 /* Function local variables */
347
348 USPTR Xcallpat;
349 #ifdef SUPPORT_UTF8
350 USPTR Xcharptr;
351 #endif
352 USPTR Xdata;
353 USPTR Xnext;
354 USPTR Xpp;
355 USPTR Xprev;
356 USPTR Xsaved_eptr;
357
358 recursion_info Xnew_recursive;
359
360 BOOL Xcur_is_word;
361 BOOL Xcondition;
362 BOOL Xprev_is_word;
363
364 unsigned long int Xoriginal_ims;
365
366 #ifdef SUPPORT_UCP
367 int Xprop_type;
368 int Xprop_value;
369 int Xprop_fail_result;
370 int Xprop_category;
371 int Xprop_chartype;
372 int Xprop_script;
373 int Xoclength;
374 uschar Xocchars[8];
375 #endif
376
377 int Xcodelink;
378 int Xctype;
379 unsigned int Xfc;
380 int Xfi;
381 int Xlength;
382 int Xmax;
383 int Xmin;
384 int Xnumber;
385 int Xoffset;
386 int Xop;
387 int Xsave_capture_last;
388 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
389 int Xstacksave[REC_STACK_SAVE_MAX];
390
391 eptrblock Xnewptrb;
392
393 /* Where to jump back to */
394
395 int Xwhere;
396
397 } heapframe;
398
399 #endif
400
401
402 /***************************************************************************
403 ***************************************************************************/
404
405
406
407 /*************************************************
408 * Match from current position *
409 *************************************************/
410
411 /* This function is called recursively in many circumstances. Whenever it
412 returns a negative (error) response, the outer incarnation must also return the
413 same response. */
414
/* These macros pack up tests that are used for partial matching, and which
appear several times in the code. We set the "hit end" flag if partial
matching is enabled, the pointer is at the end of the subject, and it is also
past the current match start (i.e. something has been matched). For hard
partial matching (md->partial > 1), we then return PCRE_ERROR_PARTIAL
immediately. The second one is used when we already know we are past the end
of the subject, so it omits the end-of-subject test. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
    }

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    {\
    md->hitend = TRUE;\
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
    }
435
436
437 /* Performance note: It might be tempting to extract commonly used fields from
438 the md structure (e.g. utf8, end_subject) into individual variables to improve
439 performance. Tests using gcc on a SPARC disproved this; in the first case, it
440 made performance worse.
441
442 Arguments:
443 eptr pointer to current character in subject
444 ecode pointer to current position in compiled code
445 mstart pointer to the current match start position (can be modified
446 by encountering \K)
447 markptr pointer to the most recent MARK name, or NULL
448 offset_top current top pointer
449 md pointer to "static" info for the match
450 ims current /i, /m, and /s options
451 eptrb pointer to chain of blocks containing eptr at start of
452 brackets - for testing for empty matches
453 flags can contain
454 match_condassert - this is an assertion condition
455 match_cbegroup - this is the start of an unlimited repeat
456 group that can match an empty string
457 rdepth the recursion depth
458
459 Returns: MATCH_MATCH if matched ) these values are >= 0
460 MATCH_NOMATCH if failed to match )
461 a negative MATCH_xxx value for PRUNE, SKIP, etc
462 a negative PCRE_ERROR_xxx value if aborted by an error condition
463 (e.g. stopped by repeated call or recursion limit)
464 */
465
466 static int
467 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
468 const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
469 eptrblock *eptrb, int flags, unsigned int rdepth)
470 {
471 /* These variables do not need to be preserved over recursion in this function,
472 so they can be ordinary variables in all cases. Mark some of them with
473 "register" because they are used a lot in loops. */
474
475 register int rrc; /* Returns from recursive calls */
476 register int i; /* Used for loops not involving calls to RMATCH() */
477 register unsigned int c; /* Character values not kept over RMATCH() calls */
478 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
479
480 BOOL minimize, possessive; /* Quantifier options */
481 int condcode;
482
483 /* When recursion is not being used, all "local" variables that have to be
484 preserved over calls to RMATCH() are part of a "frame" which is obtained from
485 heap storage. Set up the top-level frame here; others are obtained from the
486 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
487
488 #ifdef NO_RECURSE
489 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
490 frame->Xprevframe = NULL; /* Marks the top level */
491
492 /* Copy in the original argument variables */
493
494 frame->Xeptr = eptr;
495 frame->Xecode = ecode;
496 frame->Xmstart = mstart;
497 frame->Xmarkptr = markptr;
498 frame->Xoffset_top = offset_top;
499 frame->Xims = ims;
500 frame->Xeptrb = eptrb;
501 frame->Xflags = flags;
502 frame->Xrdepth = rdepth;
503
504 /* This is where control jumps back to to effect "recursion" */
505
506 HEAP_RECURSE:
507
508 /* Macros make the argument variables come from the current frame */
509
510 #define eptr frame->Xeptr
511 #define ecode frame->Xecode
512 #define mstart frame->Xmstart
513 #define markptr frame->Xmarkptr
514 #define offset_top frame->Xoffset_top
515 #define ims frame->Xims
516 #define eptrb frame->Xeptrb
517 #define flags frame->Xflags
518 #define rdepth frame->Xrdepth
519
520 /* Ditto for the local variables */
521
522 #ifdef SUPPORT_UTF8
523 #define charptr frame->Xcharptr
524 #endif
525 #define callpat frame->Xcallpat
526 #define codelink frame->Xcodelink
527 #define data frame->Xdata
528 #define next frame->Xnext
529 #define pp frame->Xpp
530 #define prev frame->Xprev
531 #define saved_eptr frame->Xsaved_eptr
532
533 #define new_recursive frame->Xnew_recursive
534
535 #define cur_is_word frame->Xcur_is_word
536 #define condition frame->Xcondition
537 #define prev_is_word frame->Xprev_is_word
538
539 #define original_ims frame->Xoriginal_ims
540
541 #ifdef SUPPORT_UCP
542 #define prop_type frame->Xprop_type
543 #define prop_value frame->Xprop_value
544 #define prop_fail_result frame->Xprop_fail_result
545 #define prop_category frame->Xprop_category
546 #define prop_chartype frame->Xprop_chartype
547 #define prop_script frame->Xprop_script
548 #define oclength frame->Xoclength
549 #define occhars frame->Xocchars
550 #endif
551
552 #define ctype frame->Xctype
553 #define fc frame->Xfc
554 #define fi frame->Xfi
555 #define length frame->Xlength
556 #define max frame->Xmax
557 #define min frame->Xmin
558 #define number frame->Xnumber
559 #define offset frame->Xoffset
560 #define op frame->Xop
561 #define save_capture_last frame->Xsave_capture_last
562 #define save_offset1 frame->Xsave_offset1
563 #define save_offset2 frame->Xsave_offset2
564 #define save_offset3 frame->Xsave_offset3
565 #define stacksave frame->Xstacksave
566
567 #define newptrb frame->Xnewptrb
568
569 /* When recursion is being used, local variables are allocated on the stack and
570 get preserved during recursion in the normal way. In this environment, fi and
571 i, and fc and c, can be the same variables. */
572
573 #else /* NO_RECURSE not defined */
574 #define fi i
575 #define fc c
576
577
578 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
579 const uschar *charptr; /* in small blocks of the code. My normal */
580 #endif /* style of coding would have declared */
581 const uschar *callpat; /* them within each of those blocks. */
582 const uschar *data; /* However, in order to accommodate the */
583 const uschar *next; /* version of this code that uses an */
584 USPTR pp; /* external "stack" implemented on the */
585 const uschar *prev; /* heap, it is easier to declare them all */
586 USPTR saved_eptr; /* here, so the declarations can be cut */
587 /* out in a block. The only declarations */
588 recursion_info new_recursive; /* within blocks below are for variables */
589 /* that do not have to be preserved over */
590 BOOL cur_is_word; /* a recursive call to RMATCH(). */
591 BOOL condition;
592 BOOL prev_is_word;
593
594 unsigned long int original_ims;
595
596 #ifdef SUPPORT_UCP
597 int prop_type;
598 int prop_value;
599 int prop_fail_result;
600 int prop_category;
601 int prop_chartype;
602 int prop_script;
603 int oclength;
604 uschar occhars[8];
605 #endif
606
607 int codelink;
608 int ctype;
609 int length;
610 int max;
611 int min;
612 int number;
613 int offset;
614 int op;
615 int save_capture_last;
616 int save_offset1, save_offset2, save_offset3;
617 int stacksave[REC_STACK_SAVE_MAX];
618
619 eptrblock newptrb;
620 #endif /* NO_RECURSE */
621
622 /* These statements are here to stop the compiler complaining about unitialized
623 variables. */
624
625 #ifdef SUPPORT_UCP
626 prop_value = 0;
627 prop_fail_result = 0;
628 #endif
629
630
631 /* This label is used for tail recursion, which is used in a few cases even
632 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
633 used. Thanks to Ian Taylor for noticing this possibility and sending the
634 original patch. */
635
636 TAIL_RECURSE:
637
638 /* OK, now we can get on with the real code of the function. Recursive calls
639 are specified by the macro RMATCH and RRETURN is used to return. When
640 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
641 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
642 defined). However, RMATCH isn't like a function call because it's quite a
643 complicated macro. It has to be used in one particular way. This shouldn't,
644 however, impact performance when true recursion is being used. */
645
646 #ifdef SUPPORT_UTF8
647 utf8 = md->utf8; /* Local copy of the flag */
648 #else
649 utf8 = FALSE;
650 #endif
651
652 /* First check that we haven't called match() too many times, or that we
653 haven't exceeded the recursive call limit. */
654
655 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
656 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
657
658 original_ims = ims; /* Save for resetting on ')' */
659
660 /* At the start of a group with an unlimited repeat that may match an empty
661 string, the match_cbegroup flag is set. When this is the case, add the current
662 subject pointer to the chain of such remembered pointers, to be checked when we
663 hit the closing ket, in order to break infinite loops that match no characters.
664 When match() is called in other circumstances, don't add to the chain. The
665 match_cbegroup flag must NOT be used with tail recursion, because the memory
666 block that is used is on the stack, so a new one may be required for each
667 match(). */
668
669 if ((flags & match_cbegroup) != 0)
670 {
671 newptrb.epb_saved_eptr = eptr;
672 newptrb.epb_prev = eptrb;
673 eptrb = &newptrb;
674 }
675
676 /* Now start processing the opcodes. */
677
678 for (;;)
679 {
680 minimize = possessive = FALSE;
681 op = *ecode;
682
683 switch(op)
684 {
685 case OP_MARK:
686 markptr = ecode + 2;
687 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
688 ims, eptrb, flags, RM55);
689
690 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
691 argument, and we must check whether that argument matches this MARK's
692 argument. It is passed back in md->start_match_ptr (an overloading of that
693 variable). If it does match, we reset that variable to the current subject
694 position and return MATCH_SKIP. Otherwise, pass back the return code
695 unaltered. */
696
697 if (rrc == MATCH_SKIP_ARG &&
698 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
699 {
700 md->start_match_ptr = eptr;
701 RRETURN(MATCH_SKIP);
702 }
703
704 if (md->mark == NULL) md->mark = markptr;
705 RRETURN(rrc);
706
707 case OP_FAIL:
708 MRRETURN(MATCH_NOMATCH);
709
710 case OP_COMMIT:
711 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
712 ims, eptrb, flags, RM52);
713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
714 MRRETURN(MATCH_COMMIT);
715
716 case OP_PRUNE:
717 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
718 ims, eptrb, flags, RM51);
719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
720 MRRETURN(MATCH_PRUNE);
721
722 case OP_PRUNE_ARG:
723 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
724 ims, eptrb, flags, RM56);
725 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
726 md->mark = ecode + 2;
727 RRETURN(MATCH_PRUNE);
728
729 case OP_SKIP:
730 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
731 ims, eptrb, flags, RM53);
732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
733 md->start_match_ptr = eptr; /* Pass back current position */
734 MRRETURN(MATCH_SKIP);
735
736 case OP_SKIP_ARG:
737 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
738 ims, eptrb, flags, RM57);
739 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
740
741 /* Pass back the current skip name by overloading md->start_match_ptr and
742 returning the special MATCH_SKIP_ARG return code. This will either be
743 caught by a matching MARK, or get to the top, where it is treated the same
744 as PRUNE. */
745
746 md->start_match_ptr = ecode + 2;
747 RRETURN(MATCH_SKIP_ARG);
748
749 case OP_THEN:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 ims, eptrb, flags, RM54);
752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
753 MRRETURN(MATCH_THEN);
754
755 case OP_THEN_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 ims, eptrb, flags, RM58);
758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_THEN);
761
762 /* Handle a capturing bracket. If there is space in the offset vector, save
763 the current subject position in the working slot at the top of the vector.
764 We mustn't change the current values of the data slot, because they may be
765 set from a previous iteration of this group, and be referred to by a
766 reference inside the group.
767
768 If the bracket fails to match, we need to restore this value and also the
769 values of the final offsets, in case they were set by a previous iteration
770 of the same bracket.
771
772 If there isn't enough space in the offset vector, treat this as if it were
773 a non-capturing bracket. Don't worry about setting the flag for the error
774 case here; that is handled in the code for KET. */
775
776 case OP_CBRA:
777 case OP_SCBRA:
778 number = GET2(ecode, 1+LINK_SIZE);
779 offset = number << 1;
780
781 #ifdef PCRE_DEBUG
782 printf("start bracket %d\n", number);
783 printf("subject=");
784 pchars(eptr, 16, TRUE, md);
785 printf("\n");
786 #endif
787
788 if (offset < md->offset_max)
789 {
790 save_offset1 = md->offset_vector[offset];
791 save_offset2 = md->offset_vector[offset+1];
792 save_offset3 = md->offset_vector[md->offset_end - number];
793 save_capture_last = md->capture_last;
794
795 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
796 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
797
798 flags = (op == OP_SCBRA)? match_cbegroup : 0;
799 do
800 {
801 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
802 ims, eptrb, flags, RM1);
803 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
804 md->capture_last = save_capture_last;
805 ecode += GET(ecode, 1);
806 }
807 while (*ecode == OP_ALT);
808
809 DPRINTF(("bracket %d failed\n", number));
810
811 md->offset_vector[offset] = save_offset1;
812 md->offset_vector[offset+1] = save_offset2;
813 md->offset_vector[md->offset_end - number] = save_offset3;
814
815 if (rrc != MATCH_THEN) md->mark = markptr;
816 RRETURN(MATCH_NOMATCH);
817 }
818
819 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
820 as a non-capturing bracket. */
821
822 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
823 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
824
825 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
826
827 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
828 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
829
830 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
831 final alternative within the brackets, we would return the result of a
832 recursive call to match() whatever happened. We can reduce stack usage by
833 turning this into a tail recursion, except in the case when match_cbegroup
834 is set.*/
835
836 case OP_BRA:
837 case OP_SBRA:
838 DPRINTF(("start non-capturing bracket\n"));
839 flags = (op >= OP_SBRA)? match_cbegroup : 0;
840 for (;;)
841 {
842 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
843 {
844 if (flags == 0) /* Not a possibly empty group */
845 {
846 ecode += _pcre_OP_lengths[*ecode];
847 DPRINTF(("bracket 0 tail recursion\n"));
848 goto TAIL_RECURSE;
849 }
850
851 /* Possibly empty group; can't use tail recursion. */
852
853 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
854 eptrb, flags, RM48);
855 if (rrc == MATCH_NOMATCH) md->mark = markptr;
856 RRETURN(rrc);
857 }
858
859 /* For non-final alternatives, continue the loop for a NOMATCH result;
860 otherwise return. */
861
862 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
863 eptrb, flags, RM2);
864 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
865 ecode += GET(ecode, 1);
866 }
867 /* Control never reaches here. */
868
869 /* Conditional group: compilation checked that there are no more than
870 two branches. If the condition is false, skipping the first branch takes us
871 past the end if there is only one branch, but that's OK because that is
872 exactly what going to the ket would do. As there is only one branch to be
873 obeyed, we can use tail recursion to avoid using another stack frame. */
874
875 case OP_COND:
876 case OP_SCOND:
877 codelink= GET(ecode, 1);
878
879 /* Because of the way auto-callout works during compile, a callout item is
880 inserted between OP_COND and an assertion condition. */
881
882 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
883 {
884 if (pcre_callout != NULL)
885 {
886 pcre_callout_block cb;
887 cb.version = 1; /* Version 1 of the callout block */
888 cb.callout_number = ecode[LINK_SIZE+2];
889 cb.offset_vector = md->offset_vector;
890 cb.subject = (PCRE_SPTR)md->start_subject;
891 cb.subject_length = md->end_subject - md->start_subject;
892 cb.start_match = mstart - md->start_subject;
893 cb.current_position = eptr - md->start_subject;
894 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
895 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
896 cb.capture_top = offset_top/2;
897 cb.capture_last = md->capture_last;
898 cb.callout_data = md->callout_data;
899 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
900 if (rrc < 0) RRETURN(rrc);
901 }
902 ecode += _pcre_OP_lengths[OP_CALLOUT];
903 }
904
905 condcode = ecode[LINK_SIZE+1];
906
907 /* Now see what the actual condition is */
908
909 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
910 {
911 if (md->recursive == NULL) /* Not recursing => FALSE */
912 {
913 condition = FALSE;
914 ecode += GET(ecode, 1);
915 }
916 else
917 {
918 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
919 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
920
921 /* If the test is for recursion into a specific subpattern, and it is
922 false, but the test was set up by name, scan the table to see if the
923 name refers to any other numbers, and test them. The condition is true
924 if any one is set. */
925
926 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
927 {
928 uschar *slotA = md->name_table;
929 for (i = 0; i < md->name_count; i++)
930 {
931 if (GET2(slotA, 0) == recno) break;
932 slotA += md->name_entry_size;
933 }
934
935 /* Found a name for the number - there can be only one; duplicate
936 names for different numbers are allowed, but not vice versa. First
937 scan down for duplicates. */
938
939 if (i < md->name_count)
940 {
941 uschar *slotB = slotA;
942 while (slotB > md->name_table)
943 {
944 slotB -= md->name_entry_size;
945 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
946 {
947 condition = GET2(slotB, 0) == md->recursive->group_num;
948 if (condition) break;
949 }
950 else break;
951 }
952
953 /* Scan up for duplicates */
954
955 if (!condition)
956 {
957 slotB = slotA;
958 for (i++; i < md->name_count; i++)
959 {
960 slotB += md->name_entry_size;
961 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
962 {
963 condition = GET2(slotB, 0) == md->recursive->group_num;
964 if (condition) break;
965 }
966 else break;
967 }
968 }
969 }
970 }
971
972 /* Choose branch according to the condition */
973
974 ecode += condition? 3 : GET(ecode, 1);
975 }
976 }
977
978 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
979 {
980 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
981 condition = offset < offset_top && md->offset_vector[offset] >= 0;
982
983 /* If the numbered capture is unset, but the reference was by name,
984 scan the table to see if the name refers to any other numbers, and test
985 them. The condition is true if any one is set. This is tediously similar
986 to the code above, but not close enough to try to amalgamate. */
987
988 if (!condition && condcode == OP_NCREF)
989 {
990 int refno = offset >> 1;
991 uschar *slotA = md->name_table;
992
993 for (i = 0; i < md->name_count; i++)
994 {
995 if (GET2(slotA, 0) == refno) break;
996 slotA += md->name_entry_size;
997 }
998
999 /* Found a name for the number - there can be only one; duplicate names
1000 for different numbers are allowed, but not vice versa. First scan down
1001 for duplicates. */
1002
1003 if (i < md->name_count)
1004 {
1005 uschar *slotB = slotA;
1006 while (slotB > md->name_table)
1007 {
1008 slotB -= md->name_entry_size;
1009 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1010 {
1011 offset = GET2(slotB, 0) << 1;
1012 condition = offset < offset_top &&
1013 md->offset_vector[offset] >= 0;
1014 if (condition) break;
1015 }
1016 else break;
1017 }
1018
1019 /* Scan up for duplicates */
1020
1021 if (!condition)
1022 {
1023 slotB = slotA;
1024 for (i++; i < md->name_count; i++)
1025 {
1026 slotB += md->name_entry_size;
1027 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1028 {
1029 offset = GET2(slotB, 0) << 1;
1030 condition = offset < offset_top &&
1031 md->offset_vector[offset] >= 0;
1032 if (condition) break;
1033 }
1034 else break;
1035 }
1036 }
1037 }
1038 }
1039
1040 /* Choose branch according to the condition */
1041
1042 ecode += condition? 3 : GET(ecode, 1);
1043 }
1044
1045 else if (condcode == OP_DEF) /* DEFINE - always false */
1046 {
1047 condition = FALSE;
1048 ecode += GET(ecode, 1);
1049 }
1050
1051 /* The condition is an assertion. Call match() to evaluate it - setting
1052 the final argument match_condassert causes it to stop at the end of an
1053 assertion. */
1054
1055 else
1056 {
1057 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1058 match_condassert, RM3);
1059 if (rrc == MATCH_MATCH)
1060 {
1061 condition = TRUE;
1062 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1063 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1064 }
1065 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1066 {
1067 RRETURN(rrc); /* Need braces because of following else */
1068 }
1069 else
1070 {
1071 condition = FALSE;
1072 ecode += codelink;
1073 }
1074 }
1075
1076 /* We are now at the branch that is to be obeyed. As there is only one,
1077 we can use tail recursion to avoid using another stack frame, except when
1078 match_cbegroup is required for an unlimited repeat of a possibly empty
1079 group. If the second alternative doesn't exist, we can just plough on. */
1080
1081 if (condition || *ecode == OP_ALT)
1082 {
1083 ecode += 1 + LINK_SIZE;
1084 if (op == OP_SCOND) /* Possibly empty group */
1085 {
1086 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1087 RRETURN(rrc);
1088 }
1089 else /* Group must match something */
1090 {
1091 flags = 0;
1092 goto TAIL_RECURSE;
1093 }
1094 }
1095 else /* Condition false & no alternative */
1096 {
1097 ecode += 1 + LINK_SIZE;
1098 }
1099 break;
1100
1101
1102 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1103 to close any currently open capturing brackets. */
1104
1105 case OP_CLOSE:
1106 number = GET2(ecode, 1);
1107 offset = number << 1;
1108
1109 #ifdef PCRE_DEBUG
1110 printf("end bracket %d at *ACCEPT", number);
1111 printf("\n");
1112 #endif
1113
1114 md->capture_last = number;
1115 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1116 {
1117 md->offset_vector[offset] =
1118 md->offset_vector[md->offset_end - number];
1119 md->offset_vector[offset+1] = eptr - md->start_subject;
1120 if (offset_top <= offset) offset_top = offset + 2;
1121 }
1122 ecode += 3;
1123 break;
1124
1125
1126 /* End of the pattern, either real or forced. If we are in a top-level
1127 recursion, we should restore the offsets appropriately and continue from
1128 after the call. */
1129
1130 case OP_ACCEPT:
1131 case OP_END:
1132 if (md->recursive != NULL && md->recursive->group_num == 0)
1133 {
1134 recursion_info *rec = md->recursive;
1135 DPRINTF(("End of pattern in a (?0) recursion\n"));
1136 md->recursive = rec->prevrec;
1137 memmove(md->offset_vector, rec->offset_save,
1138 rec->saved_max * sizeof(int));
1139 offset_top = rec->save_offset_top;
1140 ims = original_ims;
1141 ecode = rec->after_call;
1142 break;
1143 }
1144
1145 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1146 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1147 the subject. In both cases, backtracking will then try other alternatives,
1148 if any. */
1149
1150 if (eptr == mstart &&
1151 (md->notempty ||
1152 (md->notempty_atstart &&
1153 mstart == md->start_subject + md->start_offset)))
1154 MRRETURN(MATCH_NOMATCH);
1155
1156 /* Otherwise, we have a match. */
1157
1158 md->end_match_ptr = eptr; /* Record where we ended */
1159 md->end_offset_top = offset_top; /* and how many extracts were taken */
1160 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1161
1162 /* For some reason, the macros don't work properly if an expression is
1163 given as the argument to MRRETURN when the heap is in use. */
1164
1165 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1166 MRRETURN(rrc);
1167
1168 /* Change option settings */
1169
1170 case OP_OPT:
1171 ims = ecode[1];
1172 ecode += 2;
1173 DPRINTF(("ims set to %02lx\n", ims));
1174 break;
1175
1176 /* Assertion brackets. Check the alternative branches in turn - the
1177 matching won't pass the KET for an assertion. If any one branch matches,
1178 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1179 start of each branch to move the current point backwards, so the code at
1180 this level is identical to the lookahead case. */
1181
1182 case OP_ASSERT:
1183 case OP_ASSERTBACK:
1184 do
1185 {
1186 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1187 RM4);
1188 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1189 {
1190 mstart = md->start_match_ptr; /* In case \K reset it */
1191 break;
1192 }
1193 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1194 ecode += GET(ecode, 1);
1195 }
1196 while (*ecode == OP_ALT);
1197 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1198
1199 /* If checking an assertion for a condition, return MATCH_MATCH. */
1200
1201 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1202
1203 /* Continue from after the assertion, updating the offsets high water
1204 mark, since extracts may have been taken during the assertion. */
1205
1206 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1207 ecode += 1 + LINK_SIZE;
1208 offset_top = md->end_offset_top;
1209 continue;
1210
1211 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1212 PRUNE, or COMMIT means we must assume failure without checking subsequent
1213 branches. */
1214
1215 case OP_ASSERT_NOT:
1216 case OP_ASSERTBACK_NOT:
1217 do
1218 {
1219 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1220 RM5);
1221 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1222 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1223 {
1224 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1225 break;
1226 }
1227 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1228 ecode += GET(ecode,1);
1229 }
1230 while (*ecode == OP_ALT);
1231
1232 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1233
1234 ecode += 1 + LINK_SIZE;
1235 continue;
1236
1237 /* Move the subject pointer back. This occurs only at the start of
1238 each branch of a lookbehind assertion. If we are too close to the start to
1239 move back, this match function fails. When working with UTF-8 we move
1240 back a number of characters, not bytes. */
1241
1242 case OP_REVERSE:
1243 #ifdef SUPPORT_UTF8
1244 if (utf8)
1245 {
1246 i = GET(ecode, 1);
1247 while (i-- > 0)
1248 {
1249 eptr--;
1250 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1251 BACKCHAR(eptr);
1252 }
1253 }
1254 else
1255 #endif
1256
1257 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1258
1259 {
1260 eptr -= GET(ecode, 1);
1261 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1262 }
1263
1264 /* Save the earliest consulted character, then skip to next op code */
1265
1266 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1267 ecode += 1 + LINK_SIZE;
1268 break;
1269
1270 /* The callout item calls an external function, if one is provided, passing
1271 details of the match so far. This is mainly for debugging, though the
1272 function is able to force a failure. */
1273
1274 case OP_CALLOUT:
1275 if (pcre_callout != NULL)
1276 {
1277 pcre_callout_block cb;
1278 cb.version = 1; /* Version 1 of the callout block */
1279 cb.callout_number = ecode[1];
1280 cb.offset_vector = md->offset_vector;
1281 cb.subject = (PCRE_SPTR)md->start_subject;
1282 cb.subject_length = md->end_subject - md->start_subject;
1283 cb.start_match = mstart - md->start_subject;
1284 cb.current_position = eptr - md->start_subject;
1285 cb.pattern_position = GET(ecode, 2);
1286 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1287 cb.capture_top = offset_top/2;
1288 cb.capture_last = md->capture_last;
1289 cb.callout_data = md->callout_data;
1290 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1291 if (rrc < 0) RRETURN(rrc);
1292 }
1293 ecode += 2 + 2*LINK_SIZE;
1294 break;
1295
1296 /* Recursion either matches the current regex, or some subexpression. The
1297 offset data is the offset to the starting bracket from the start of the
1298 whole pattern. (This is so that it works from duplicated subpatterns.)
1299
1300 If there are any capturing brackets started but not finished, we have to
1301 save their starting points and reinstate them after the recursion. However,
1302 we don't know how many such there are (offset_top records the completed
1303 total) so we just have to save all the potential data. There may be up to
1304 65535 such values, which is too large to put on the stack, but using malloc
1305 for small numbers seems expensive. As a compromise, the stack is used when
1306 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1307 is used. A problem is what to do if the malloc fails ... there is no way of
1308 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1309 values on the stack, and accept that the rest may be wrong.
1310
1311 There are also other values that have to be saved. We use a chained
1312 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1313 for the original version of this logic. */
1314
1315 case OP_RECURSE:
1316 {
1317 callpat = md->start_code + GET(ecode, 1);
1318 new_recursive.group_num = (callpat == md->start_code)? 0 :
1319 GET2(callpat, 1 + LINK_SIZE);
1320
1321 /* Add to "recursing stack" */
1322
1323 new_recursive.prevrec = md->recursive;
1324 md->recursive = &new_recursive;
1325
1326 /* Find where to continue from afterwards */
1327
1328 ecode += 1 + LINK_SIZE;
1329 new_recursive.after_call = ecode;
1330
1331 /* Now save the offset data. */
1332
1333 new_recursive.saved_max = md->offset_end;
1334 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1335 new_recursive.offset_save = stacksave;
1336 else
1337 {
1338 new_recursive.offset_save =
1339 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1340 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1341 }
1342
1343 memcpy(new_recursive.offset_save, md->offset_vector,
1344 new_recursive.saved_max * sizeof(int));
1345 new_recursive.save_offset_top = offset_top;
1346
1347 /* OK, now we can do the recursion. For each top-level alternative we
1348 restore the offset and recursion data. */
1349
1350 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1351 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1352 do
1353 {
1354 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1355 md, ims, eptrb, flags, RM6);
1356 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1357 {
1358 DPRINTF(("Recursion matched\n"));
1359 md->recursive = new_recursive.prevrec;
1360 if (new_recursive.offset_save != stacksave)
1361 (pcre_free)(new_recursive.offset_save);
1362 MRRETURN(MATCH_MATCH);
1363 }
1364 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1365 {
1366 DPRINTF(("Recursion gave error %d\n", rrc));
1367 if (new_recursive.offset_save != stacksave)
1368 (pcre_free)(new_recursive.offset_save);
1369 RRETURN(rrc);
1370 }
1371
1372 md->recursive = &new_recursive;
1373 memcpy(md->offset_vector, new_recursive.offset_save,
1374 new_recursive.saved_max * sizeof(int));
1375 callpat += GET(callpat, 1);
1376 }
1377 while (*callpat == OP_ALT);
1378
1379 DPRINTF(("Recursion didn't match\n"));
1380 md->recursive = new_recursive.prevrec;
1381 if (new_recursive.offset_save != stacksave)
1382 (pcre_free)(new_recursive.offset_save);
1383 MRRETURN(MATCH_NOMATCH);
1384 }
1385 /* Control never reaches here */
1386
1387 /* "Once" brackets are like assertion brackets except that after a match,
1388 the point in the subject string is not moved back. Thus there can never be
1389 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1390 Check the alternative branches in turn - the matching won't pass the KET
1391 for this kind of subpattern. If any one branch matches, we carry on as at
1392 the end of a normal bracket, leaving the subject pointer, but resetting
1393 the start-of-match value in case it was changed by \K. */
1394
1395 case OP_ONCE:
1396 prev = ecode;
1397 saved_eptr = eptr;
1398
1399 do
1400 {
1401 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1402 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1403 {
1404 mstart = md->start_match_ptr;
1405 break;
1406 }
1407 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1408 ecode += GET(ecode,1);
1409 }
1410 while (*ecode == OP_ALT);
1411
1412 /* If we hit the end of the group (which could be repeated), fail */
1413
1414 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1415
1416 /* Continue as from after the assertion, updating the offsets high water
1417 mark, since extracts may have been taken. */
1418
1419 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1420
1421 offset_top = md->end_offset_top;
1422 eptr = md->end_match_ptr;
1423
1424 /* For a non-repeating ket, just continue at this level. This also
1425 happens for a repeating ket if no characters were matched in the group.
1426 This is the forcible breaking of infinite loops as implemented in Perl
1427 5.005. If there is an options reset, it will get obeyed in the normal
1428 course of events. */
1429
1430 if (*ecode == OP_KET || eptr == saved_eptr)
1431 {
1432 ecode += 1+LINK_SIZE;
1433 break;
1434 }
1435
1436 /* The repeating kets try the rest of the pattern or restart from the
1437 preceding bracket, in the appropriate order. The second "call" of match()
1438 uses tail recursion, to avoid using another stack frame. We need to reset
1439 any options that changed within the bracket before re-running it, so
1440 check the next opcode. */
1441
1442 if (ecode[1+LINK_SIZE] == OP_OPT)
1443 {
1444 ims = (ims & ~PCRE_IMS) | ecode[4];
1445 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1446 }
1447
1448 if (*ecode == OP_KETRMIN)
1449 {
1450 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1451 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1452 ecode = prev;
1453 flags = 0;
1454 goto TAIL_RECURSE;
1455 }
1456 else /* OP_KETRMAX */
1457 {
1458 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1460 ecode += 1 + LINK_SIZE;
1461 flags = 0;
1462 goto TAIL_RECURSE;
1463 }
1464 /* Control never gets here */
1465
1466 /* An alternation is the end of a branch; scan along to find the end of the
1467 bracketed group and go to there. */
1468
1469 case OP_ALT:
1470 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1471 break;
1472
1473 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1474 indicating that it may occur zero times. It may repeat infinitely, or not
1475 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1476 with fixed upper repeat limits are compiled as a number of copies, with the
1477 optional ones preceded by BRAZERO or BRAMINZERO. */
1478
1479 case OP_BRAZERO:
1480 {
1481 next = ecode+1;
1482 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1483 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1484 do next += GET(next,1); while (*next == OP_ALT);
1485 ecode = next + 1 + LINK_SIZE;
1486 }
1487 break;
1488
1489 case OP_BRAMINZERO:
1490 {
1491 next = ecode+1;
1492 do next += GET(next, 1); while (*next == OP_ALT);
1493 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1495 ecode++;
1496 }
1497 break;
1498
1499 case OP_SKIPZERO:
1500 {
1501 next = ecode+1;
1502 do next += GET(next,1); while (*next == OP_ALT);
1503 ecode = next + 1 + LINK_SIZE;
1504 }
1505 break;
1506
1507 /* End of a group, repeated or non-repeating. */
1508
1509 case OP_KET:
1510 case OP_KETRMIN:
1511 case OP_KETRMAX:
1512 prev = ecode - GET(ecode, 1);
1513
1514 /* If this was a group that remembered the subject start, in order to break
1515 infinite repeats of empty string matches, retrieve the subject start from
1516 the chain. Otherwise, set it NULL. */
1517
1518 if (*prev >= OP_SBRA)
1519 {
1520 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1521 eptrb = eptrb->epb_prev; /* Backup to previous group */
1522 }
1523 else saved_eptr = NULL;
1524
1525 /* If we are at the end of an assertion group or an atomic group, stop
1526 matching and return MATCH_MATCH, but record the current high water mark for
1527 use by positive assertions. We also need to record the match start in case
1528 it was changed by \K. */
1529
1530 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1531 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1532 *prev == OP_ONCE)
1533 {
1534 md->end_match_ptr = eptr; /* For ONCE */
1535 md->end_offset_top = offset_top;
1536 md->start_match_ptr = mstart;
1537 MRRETURN(MATCH_MATCH);
1538 }
1539
1540 /* For capturing groups we have to check the group number back at the start
1541 and if necessary complete handling an extraction by setting the offsets and
1542 bumping the high water mark. Note that whole-pattern recursion is coded as
1543 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1544 when the OP_END is reached. Other recursion is handled here. */
1545
1546 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1547 {
1548 number = GET2(prev, 1+LINK_SIZE);
1549 offset = number << 1;
1550
1551 #ifdef PCRE_DEBUG
1552 printf("end bracket %d", number);
1553 printf("\n");
1554 #endif
1555
1556 md->capture_last = number;
1557 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1558 {
1559 md->offset_vector[offset] =
1560 md->offset_vector[md->offset_end - number];
1561 md->offset_vector[offset+1] = eptr - md->start_subject;
1562 if (offset_top <= offset) offset_top = offset + 2;
1563 }
1564
1565 /* Handle a recursively called group. Restore the offsets
1566 appropriately and continue from after the call. */
1567
1568 if (md->recursive != NULL && md->recursive->group_num == number)
1569 {
1570 recursion_info *rec = md->recursive;
1571 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1572 md->recursive = rec->prevrec;
1573 memcpy(md->offset_vector, rec->offset_save,
1574 rec->saved_max * sizeof(int));
1575 offset_top = rec->save_offset_top;
1576 ecode = rec->after_call;
1577 ims = original_ims;
1578 break;
1579 }
1580 }
1581
1582 /* For both capturing and non-capturing groups, reset the value of the ims
1583 flags, in case they got changed during the group. */
1584
1585 ims = original_ims;
1586 DPRINTF(("ims reset to %02lx\n", ims));
1587
1588 /* For a non-repeating ket, just continue at this level. This also
1589 happens for a repeating ket if no characters were matched in the group.
1590 This is the forcible breaking of infinite loops as implemented in Perl
1591 5.005. If there is an options reset, it will get obeyed in the normal
1592 course of events. */
1593
1594 if (*ecode == OP_KET || eptr == saved_eptr)
1595 {
1596 ecode += 1 + LINK_SIZE;
1597 break;
1598 }
1599
1600 /* The repeating kets try the rest of the pattern or restart from the
1601 preceding bracket, in the appropriate order. In the second case, we can use
1602 tail recursion to avoid using another stack frame, unless we have an
1603 unlimited repeat of a group that can match an empty string. */
1604
1605 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1606
1607 if (*ecode == OP_KETRMIN)
1608 {
1609 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1610 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1611 if (flags != 0) /* Could match an empty string */
1612 {
1613 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1614 RRETURN(rrc);
1615 }
1616 ecode = prev;
1617 goto TAIL_RECURSE;
1618 }
1619 else /* OP_KETRMAX */
1620 {
1621 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1622 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1623 ecode += 1 + LINK_SIZE;
1624 flags = 0;
1625 goto TAIL_RECURSE;
1626 }
1627 /* Control never gets here */
1628
1629 /* Start of subject unless notbol, or after internal newline if multiline */
1630
1631 case OP_CIRC:
1632 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1633 if ((ims & PCRE_MULTILINE) != 0)
1634 {
1635 if (eptr != md->start_subject &&
1636 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1637 MRRETURN(MATCH_NOMATCH);
1638 ecode++;
1639 break;
1640 }
1641 /* ... else fall through */
1642
1643 /* Start of subject assertion */
1644
1645 case OP_SOD:
1646 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1647 ecode++;
1648 break;
1649
1650 /* Start of match assertion */
1651
1652 case OP_SOM:
1653 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1654 ecode++;
1655 break;
1656
1657 /* Reset the start of match point */
1658
1659 case OP_SET_SOM:
1660 mstart = eptr;
1661 ecode++;
1662 break;
1663
1664 /* Assert before internal newline if multiline, or before a terminating
1665 newline unless endonly is set, else end of subject unless noteol is set. */
1666
1667 case OP_DOLL:
1668 if ((ims & PCRE_MULTILINE) != 0)
1669 {
1670 if (eptr < md->end_subject)
1671 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1672 else
1673 { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1674 ecode++;
1675 break;
1676 }
1677 else
1678 {
1679 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1680 if (!md->endonly)
1681 {
1682 if (eptr != md->end_subject &&
1683 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1684 MRRETURN(MATCH_NOMATCH);
1685 ecode++;
1686 break;
1687 }
1688 }
1689 /* ... else fall through for endonly */
1690
1691 /* End of subject assertion (\z) */
1692
1693 case OP_EOD:
1694 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1695 ecode++;
1696 break;
1697
1698 /* End of subject or ending \n assertion (\Z) */
1699
1700 case OP_EODN:
1701 if (eptr != md->end_subject &&
1702 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1703 MRRETURN(MATCH_NOMATCH);
1704 ecode++;
1705 break;
1706
1707 /* Word boundary assertions */
1708
1709 case OP_NOT_WORD_BOUNDARY:
1710 case OP_WORD_BOUNDARY:
1711 {
1712
1713 /* Find out if the previous and current characters are "word" characters.
1714 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1715 be "non-word" characters. Remember the earliest consulted character for
1716 partial matching. */
1717
1718 #ifdef SUPPORT_UTF8
1719 if (utf8)
1720 {
1721 if (eptr == md->start_subject) prev_is_word = FALSE; else
1722 {
1723 USPTR lastptr = eptr - 1;
1724 while((*lastptr & 0xc0) == 0x80) lastptr--;
1725 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1726 GETCHAR(c, lastptr);
1727 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1728 }
1729 if (eptr >= md->end_subject)
1730 {
1731 SCHECK_PARTIAL();
1732 cur_is_word = FALSE;
1733 }
1734 else
1735 {
1736 GETCHAR(c, eptr);
1737 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1738 }
1739 }
1740 else
1741 #endif
1742
1743 /* Not in UTF-8 mode */
1744
1745 {
1746 if (eptr == md->start_subject) prev_is_word = FALSE; else
1747 {
1748 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1749 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1750 }
1751 if (eptr >= md->end_subject)
1752 {
1753 SCHECK_PARTIAL();
1754 cur_is_word = FALSE;
1755 }
1756 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1757 }
1758
1759 /* Now see if the situation is what we want */
1760
1761 if ((*ecode++ == OP_WORD_BOUNDARY)?
1762 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1763 MRRETURN(MATCH_NOMATCH);
1764 }
1765 break;
1766
1767 /* Match a single character type; inline for speed */
1768
1769 case OP_ANY:
1770 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1771 /* Fall through */
1772
1773 case OP_ALLANY:
1774 if (eptr++ >= md->end_subject)
1775 {
1776 SCHECK_PARTIAL();
1777 MRRETURN(MATCH_NOMATCH);
1778 }
1779 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1780 ecode++;
1781 break;
1782
1783 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1784 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1785
1786 case OP_ANYBYTE:
1787 if (eptr++ >= md->end_subject)
1788 {
1789 SCHECK_PARTIAL();
1790 MRRETURN(MATCH_NOMATCH);
1791 }
1792 ecode++;
1793 break;
1794
1795 case OP_NOT_DIGIT:
1796 if (eptr >= md->end_subject)
1797 {
1798 SCHECK_PARTIAL();
1799 MRRETURN(MATCH_NOMATCH);
1800 }
1801 GETCHARINCTEST(c, eptr);
1802 if (
1803 #ifdef SUPPORT_UTF8
1804 c < 256 &&
1805 #endif
1806 (md->ctypes[c] & ctype_digit) != 0
1807 )
1808 MRRETURN(MATCH_NOMATCH);
1809 ecode++;
1810 break;
1811
1812 case OP_DIGIT:
1813 if (eptr >= md->end_subject)
1814 {
1815 SCHECK_PARTIAL();
1816 MRRETURN(MATCH_NOMATCH);
1817 }
1818 GETCHARINCTEST(c, eptr);
1819 if (
1820 #ifdef SUPPORT_UTF8
1821 c >= 256 ||
1822 #endif
1823 (md->ctypes[c] & ctype_digit) == 0
1824 )
1825 MRRETURN(MATCH_NOMATCH);
1826 ecode++;
1827 break;
1828
1829 case OP_NOT_WHITESPACE:
1830 if (eptr >= md->end_subject)
1831 {
1832 SCHECK_PARTIAL();
1833 MRRETURN(MATCH_NOMATCH);
1834 }
1835 GETCHARINCTEST(c, eptr);
1836 if (
1837 #ifdef SUPPORT_UTF8
1838 c < 256 &&
1839 #endif
1840 (md->ctypes[c] & ctype_space) != 0
1841 )
1842 MRRETURN(MATCH_NOMATCH);
1843 ecode++;
1844 break;
1845
1846 case OP_WHITESPACE:
1847 if (eptr >= md->end_subject)
1848 {
1849 SCHECK_PARTIAL();
1850 MRRETURN(MATCH_NOMATCH);
1851 }
1852 GETCHARINCTEST(c, eptr);
1853 if (
1854 #ifdef SUPPORT_UTF8
1855 c >= 256 ||
1856 #endif
1857 (md->ctypes[c] & ctype_space) == 0
1858 )
1859 MRRETURN(MATCH_NOMATCH);
1860 ecode++;
1861 break;
1862
1863 case OP_NOT_WORDCHAR:
1864 if (eptr >= md->end_subject)
1865 {
1866 SCHECK_PARTIAL();
1867 MRRETURN(MATCH_NOMATCH);
1868 }
1869 GETCHARINCTEST(c, eptr);
1870 if (
1871 #ifdef SUPPORT_UTF8
1872 c < 256 &&
1873 #endif
1874 (md->ctypes[c] & ctype_word) != 0
1875 )
1876 MRRETURN(MATCH_NOMATCH);
1877 ecode++;
1878 break;
1879
1880 case OP_WORDCHAR:
1881 if (eptr >= md->end_subject)
1882 {
1883 SCHECK_PARTIAL();
1884 MRRETURN(MATCH_NOMATCH);
1885 }
1886 GETCHARINCTEST(c, eptr);
1887 if (
1888 #ifdef SUPPORT_UTF8
1889 c >= 256 ||
1890 #endif
1891 (md->ctypes[c] & ctype_word) == 0
1892 )
1893 MRRETURN(MATCH_NOMATCH);
1894 ecode++;
1895 break;
1896
1897 case OP_ANYNL:
1898 if (eptr >= md->end_subject)
1899 {
1900 SCHECK_PARTIAL();
1901 MRRETURN(MATCH_NOMATCH);
1902 }
1903 GETCHARINCTEST(c, eptr);
1904 switch(c)
1905 {
1906 default: MRRETURN(MATCH_NOMATCH);
1907 case 0x000d:
1908 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1909 break;
1910
1911 case 0x000a:
1912 break;
1913
1914 case 0x000b:
1915 case 0x000c:
1916 case 0x0085:
1917 case 0x2028:
1918 case 0x2029:
1919 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1920 break;
1921 }
1922 ecode++;
1923 break;
1924
1925 case OP_NOT_HSPACE:
1926 if (eptr >= md->end_subject)
1927 {
1928 SCHECK_PARTIAL();
1929 MRRETURN(MATCH_NOMATCH);
1930 }
1931 GETCHARINCTEST(c, eptr);
1932 switch(c)
1933 {
1934 default: break;
1935 case 0x09: /* HT */
1936 case 0x20: /* SPACE */
1937 case 0xa0: /* NBSP */
1938 case 0x1680: /* OGHAM SPACE MARK */
1939 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1940 case 0x2000: /* EN QUAD */
1941 case 0x2001: /* EM QUAD */
1942 case 0x2002: /* EN SPACE */
1943 case 0x2003: /* EM SPACE */
1944 case 0x2004: /* THREE-PER-EM SPACE */
1945 case 0x2005: /* FOUR-PER-EM SPACE */
1946 case 0x2006: /* SIX-PER-EM SPACE */
1947 case 0x2007: /* FIGURE SPACE */
1948 case 0x2008: /* PUNCTUATION SPACE */
1949 case 0x2009: /* THIN SPACE */
1950 case 0x200A: /* HAIR SPACE */
1951 case 0x202f: /* NARROW NO-BREAK SPACE */
1952 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1953 case 0x3000: /* IDEOGRAPHIC SPACE */
1954 MRRETURN(MATCH_NOMATCH);
1955 }
1956 ecode++;
1957 break;
1958
1959 case OP_HSPACE:
1960 if (eptr >= md->end_subject)
1961 {
1962 SCHECK_PARTIAL();
1963 MRRETURN(MATCH_NOMATCH);
1964 }
1965 GETCHARINCTEST(c, eptr);
1966 switch(c)
1967 {
1968 default: MRRETURN(MATCH_NOMATCH);
1969 case 0x09: /* HT */
1970 case 0x20: /* SPACE */
1971 case 0xa0: /* NBSP */
1972 case 0x1680: /* OGHAM SPACE MARK */
1973 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1974 case 0x2000: /* EN QUAD */
1975 case 0x2001: /* EM QUAD */
1976 case 0x2002: /* EN SPACE */
1977 case 0x2003: /* EM SPACE */
1978 case 0x2004: /* THREE-PER-EM SPACE */
1979 case 0x2005: /* FOUR-PER-EM SPACE */
1980 case 0x2006: /* SIX-PER-EM SPACE */
1981 case 0x2007: /* FIGURE SPACE */
1982 case 0x2008: /* PUNCTUATION SPACE */
1983 case 0x2009: /* THIN SPACE */
1984 case 0x200A: /* HAIR SPACE */
1985 case 0x202f: /* NARROW NO-BREAK SPACE */
1986 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1987 case 0x3000: /* IDEOGRAPHIC SPACE */
1988 break;
1989 }
1990 ecode++;
1991 break;
1992
1993 case OP_NOT_VSPACE:
1994 if (eptr >= md->end_subject)
1995 {
1996 SCHECK_PARTIAL();
1997 MRRETURN(MATCH_NOMATCH);
1998 }
1999 GETCHARINCTEST(c, eptr);
2000 switch(c)
2001 {
2002 default: break;
2003 case 0x0a: /* LF */
2004 case 0x0b: /* VT */
2005 case 0x0c: /* FF */
2006 case 0x0d: /* CR */
2007 case 0x85: /* NEL */
2008 case 0x2028: /* LINE SEPARATOR */
2009 case 0x2029: /* PARAGRAPH SEPARATOR */
2010 MRRETURN(MATCH_NOMATCH);
2011 }
2012 ecode++;
2013 break;
2014
2015 case OP_VSPACE:
2016 if (eptr >= md->end_subject)
2017 {
2018 SCHECK_PARTIAL();
2019 MRRETURN(MATCH_NOMATCH);
2020 }
2021 GETCHARINCTEST(c, eptr);
2022 switch(c)
2023 {
2024 default: MRRETURN(MATCH_NOMATCH);
2025 case 0x0a: /* LF */
2026 case 0x0b: /* VT */
2027 case 0x0c: /* FF */
2028 case 0x0d: /* CR */
2029 case 0x85: /* NEL */
2030 case 0x2028: /* LINE SEPARATOR */
2031 case 0x2029: /* PARAGRAPH SEPARATOR */
2032 break;
2033 }
2034 ecode++;
2035 break;
2036
2037 #ifdef SUPPORT_UCP
2038 /* Check the next character by Unicode property. We will get here only
2039 if the support is in the binary; otherwise a compile-time error occurs. */
2040
2041 case OP_PROP:
2042 case OP_NOTPROP:
2043 if (eptr >= md->end_subject)
2044 {
2045 SCHECK_PARTIAL();
2046 MRRETURN(MATCH_NOMATCH);
2047 }
2048 GETCHARINCTEST(c, eptr);
2049 {
2050 const ucd_record *prop = GET_UCD(c);
2051
2052 switch(ecode[1])
2053 {
2054 case PT_ANY:
2055 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2056 break;
2057
2058 case PT_LAMP:
2059 if ((prop->chartype == ucp_Lu ||
2060 prop->chartype == ucp_Ll ||
2061 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2062 MRRETURN(MATCH_NOMATCH);
2063 break;
2064
2065 case PT_GC:
2066 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2067 MRRETURN(MATCH_NOMATCH);
2068 break;
2069
2070 case PT_PC:
2071 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2072 MRRETURN(MATCH_NOMATCH);
2073 break;
2074
2075 case PT_SC:
2076 if ((ecode[2] != prop->script) == (op == OP_PROP))
2077 MRRETURN(MATCH_NOMATCH);
2078 break;
2079
2080 /* These are specials */
2081
2082 case PT_ALNUM:
2083 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2084 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2085 MRRETURN(MATCH_NOMATCH);
2086 break;
2087
2088 case PT_SPACE: /* Perl space */
2089 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2090 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2091 == (op == OP_NOTPROP))
2092 MRRETURN(MATCH_NOMATCH);
2093 break;
2094
2095 case PT_PXSPACE: /* POSIX space */
2096 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2097 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2098 c == CHAR_FF || c == CHAR_CR)
2099 == (op == OP_NOTPROP))
2100 MRRETURN(MATCH_NOMATCH);
2101 break;
2102
2103 case PT_WORD:
2104 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2105 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2106 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2107 MRRETURN(MATCH_NOMATCH);
2108 break;
2109
2110 /* This should never occur */
2111
2112 default:
2113 RRETURN(PCRE_ERROR_INTERNAL);
2114 }
2115
2116 ecode += 3;
2117 }
2118 break;
2119
2120 /* Match an extended Unicode sequence. We will get here only if the support
2121 is in the binary; otherwise a compile-time error occurs. */
2122
2123 case OP_EXTUNI:
2124 if (eptr >= md->end_subject)
2125 {
2126 SCHECK_PARTIAL();
2127 MRRETURN(MATCH_NOMATCH);
2128 }
2129 GETCHARINCTEST(c, eptr);
2130 {
2131 int category = UCD_CATEGORY(c);
2132 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2133 while (eptr < md->end_subject)
2134 {
2135 int len = 1;
2136 if (!utf8) c = *eptr; else
2137 {
2138 GETCHARLEN(c, eptr, len);
2139 }
2140 category = UCD_CATEGORY(c);
2141 if (category != ucp_M) break;
2142 eptr += len;
2143 }
2144 }
2145 ecode++;
2146 break;
2147 #endif
2148
2149
2150 /* Match a back reference, possibly repeatedly. Look past the end of the
2151 item to see if there is repeat information following. The code is similar
2152 to that for character classes, but repeated for efficiency. Then obey
2153 similar code to character type repeats - written out again for speed.
2154 However, if the referenced string is the empty string, always treat
2155 it as matched, any number of times (otherwise there could be infinite
2156 loops). */
2157
2158 case OP_REF:
2159 {
2160 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2161 ecode += 3;
2162
2163 /* If the reference is unset, there are two possibilities:
2164
2165 (a) In the default, Perl-compatible state, set the length to be longer
2166 than the amount of subject left; this ensures that every attempt at a
2167 match fails. We can't just fail here, because of the possibility of
2168 quantifiers with zero minima.
2169
2170 (b) If the JavaScript compatibility flag is set, set the length to zero
2171 so that the back reference matches an empty string.
2172
2173 Otherwise, set the length to the length of what was matched by the
2174 referenced subpattern. */
2175
2176 if (offset >= offset_top || md->offset_vector[offset] < 0)
2177 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2178 else
2179 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2180
2181 /* Set up for repetition, or handle the non-repeated case */
2182
2183 switch (*ecode)
2184 {
2185 case OP_CRSTAR:
2186 case OP_CRMINSTAR:
2187 case OP_CRPLUS:
2188 case OP_CRMINPLUS:
2189 case OP_CRQUERY:
2190 case OP_CRMINQUERY:
2191 c = *ecode++ - OP_CRSTAR;
2192 minimize = (c & 1) != 0;
2193 min = rep_min[c]; /* Pick up values from tables; */
2194 max = rep_max[c]; /* zero for max => infinity */
2195 if (max == 0) max = INT_MAX;
2196 break;
2197
2198 case OP_CRRANGE:
2199 case OP_CRMINRANGE:
2200 minimize = (*ecode == OP_CRMINRANGE);
2201 min = GET2(ecode, 1);
2202 max = GET2(ecode, 3);
2203 if (max == 0) max = INT_MAX;
2204 ecode += 5;
2205 break;
2206
2207 default: /* No repeat follows */
2208 if (!match_ref(offset, eptr, length, md, ims))
2209 {
2210 CHECK_PARTIAL();
2211 MRRETURN(MATCH_NOMATCH);
2212 }
2213 eptr += length;
2214 continue; /* With the main loop */
2215 }
2216
2217 /* If the length of the reference is zero, just continue with the
2218 main loop. */
2219
2220 if (length == 0) continue;
2221
2222 /* First, ensure the minimum number of matches are present. We get back
2223 the length of the reference string explicitly rather than passing the
2224 address of eptr, so that eptr can be a register variable. */
2225
2226 for (i = 1; i <= min; i++)
2227 {
2228 if (!match_ref(offset, eptr, length, md, ims))
2229 {
2230 CHECK_PARTIAL();
2231 MRRETURN(MATCH_NOMATCH);
2232 }
2233 eptr += length;
2234 }
2235
2236 /* If min = max, continue at the same level without recursion.
2237 They are not both allowed to be zero. */
2238
2239 if (min == max) continue;
2240
2241 /* If minimizing, keep trying and advancing the pointer */
2242
2243 if (minimize)
2244 {
2245 for (fi = min;; fi++)
2246 {
2247 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2248 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2249 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2250 if (!match_ref(offset, eptr, length, md, ims))
2251 {
2252 CHECK_PARTIAL();
2253 MRRETURN(MATCH_NOMATCH);
2254 }
2255 eptr += length;
2256 }
2257 /* Control never gets here */
2258 }
2259
2260 /* If maximizing, find the longest string and work backwards */
2261
2262 else
2263 {
2264 pp = eptr;
2265 for (i = min; i < max; i++)
2266 {
2267 if (!match_ref(offset, eptr, length, md, ims))
2268 {
2269 CHECK_PARTIAL();
2270 break;
2271 }
2272 eptr += length;
2273 }
2274 while (eptr >= pp)
2275 {
2276 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2278 eptr -= length;
2279 }
2280 MRRETURN(MATCH_NOMATCH);
2281 }
2282 }
2283 /* Control never gets here */
2284
2285 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2286 used when all the characters in the class have values in the range 0-255,
2287 and either the matching is caseful, or the characters are in the range
2288 0-127 when UTF-8 processing is enabled. The only difference between
2289 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2290 encountered.
2291
2292 First, look past the end of the item to see if there is repeat information
2293 following. Then obey similar code to character type repeats - written out
2294 again for speed. */
2295
2296 case OP_NCLASS:
2297 case OP_CLASS:
2298 {
2299 data = ecode + 1; /* Save for matching */
2300 ecode += 33; /* Advance past the item */
2301
2302 switch (*ecode)
2303 {
2304 case OP_CRSTAR:
2305 case OP_CRMINSTAR:
2306 case OP_CRPLUS:
2307 case OP_CRMINPLUS:
2308 case OP_CRQUERY:
2309 case OP_CRMINQUERY:
2310 c = *ecode++ - OP_CRSTAR;
2311 minimize = (c & 1) != 0;
2312 min = rep_min[c]; /* Pick up values from tables; */
2313 max = rep_max[c]; /* zero for max => infinity */
2314 if (max == 0) max = INT_MAX;
2315 break;
2316
2317 case OP_CRRANGE:
2318 case OP_CRMINRANGE:
2319 minimize = (*ecode == OP_CRMINRANGE);
2320 min = GET2(ecode, 1);
2321 max = GET2(ecode, 3);
2322 if (max == 0) max = INT_MAX;
2323 ecode += 5;
2324 break;
2325
2326 default: /* No repeat follows */
2327 min = max = 1;
2328 break;
2329 }
2330
2331 /* First, ensure the minimum number of matches are present. */
2332
2333 #ifdef SUPPORT_UTF8
2334 /* UTF-8 mode */
2335 if (utf8)
2336 {
2337 for (i = 1; i <= min; i++)
2338 {
2339 if (eptr >= md->end_subject)
2340 {
2341 SCHECK_PARTIAL();
2342 MRRETURN(MATCH_NOMATCH);
2343 }
2344 GETCHARINC(c, eptr);
2345 if (c > 255)
2346 {
2347 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2348 }
2349 else
2350 {
2351 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2352 }
2353 }
2354 }
2355 else
2356 #endif
2357 /* Not UTF-8 mode */
2358 {
2359 for (i = 1; i <= min; i++)
2360 {
2361 if (eptr >= md->end_subject)
2362 {
2363 SCHECK_PARTIAL();
2364 MRRETURN(MATCH_NOMATCH);
2365 }
2366 c = *eptr++;
2367 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2368 }
2369 }
2370
2371 /* If max == min we can continue with the main loop without the
2372 need to recurse. */
2373
2374 if (min == max) continue;
2375
2376 /* If minimizing, keep testing the rest of the expression and advancing
2377 the pointer while it matches the class. */
2378
2379 if (minimize)
2380 {
2381 #ifdef SUPPORT_UTF8
2382 /* UTF-8 mode */
2383 if (utf8)
2384 {
2385 for (fi = min;; fi++)
2386 {
2387 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2388 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2389 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2390 if (eptr >= md->end_subject)
2391 {
2392 SCHECK_PARTIAL();
2393 MRRETURN(MATCH_NOMATCH);
2394 }
2395 GETCHARINC(c, eptr);
2396 if (c > 255)
2397 {
2398 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2399 }
2400 else
2401 {
2402 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2403 }
2404 }
2405 }
2406 else
2407 #endif
2408 /* Not UTF-8 mode */
2409 {
2410 for (fi = min;; fi++)
2411 {
2412 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2413 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2414 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2415 if (eptr >= md->end_subject)
2416 {
2417 SCHECK_PARTIAL();
2418 MRRETURN(MATCH_NOMATCH);
2419 }
2420 c = *eptr++;
2421 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2422 }
2423 }
2424 /* Control never gets here */
2425 }
2426
2427 /* If maximizing, find the longest possible run, then work backwards. */
2428
2429 else
2430 {
2431 pp = eptr;
2432
2433 #ifdef SUPPORT_UTF8
2434 /* UTF-8 mode */
2435 if (utf8)
2436 {
2437 for (i = min; i < max; i++)
2438 {
2439 int len = 1;
2440 if (eptr >= md->end_subject)
2441 {
2442 SCHECK_PARTIAL();
2443 break;
2444 }
2445 GETCHARLEN(c, eptr, len);
2446 if (c > 255)
2447 {
2448 if (op == OP_CLASS) break;
2449 }
2450 else
2451 {
2452 if ((data[c/8] & (1 << (c&7))) == 0) break;
2453 }
2454 eptr += len;
2455 }
2456 for (;;)
2457 {
2458 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2459 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2460 if (eptr-- == pp) break; /* Stop if tried at original pos */
2461 BACKCHAR(eptr);
2462 }
2463 }
2464 else
2465 #endif
2466 /* Not UTF-8 mode */
2467 {
2468 for (i = min; i < max; i++)
2469 {
2470 if (eptr >= md->end_subject)
2471 {
2472 SCHECK_PARTIAL();
2473 break;
2474 }
2475 c = *eptr;
2476 if ((data[c/8] & (1 << (c&7))) == 0) break;
2477 eptr++;
2478 }
2479 while (eptr >= pp)
2480 {
2481 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2482 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2483 eptr--;
2484 }
2485 }
2486
2487 MRRETURN(MATCH_NOMATCH);
2488 }
2489 }
2490 /* Control never gets here */
2491
2492
2493 /* Match an extended character class. This opcode is encountered only
2494 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2495 mode, because Unicode properties are supported in non-UTF-8 mode. */
2496
2497 #ifdef SUPPORT_UTF8
2498 case OP_XCLASS:
2499 {
2500 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2501 ecode += GET(ecode, 1); /* Advance past the item */
2502
2503 switch (*ecode)
2504 {
2505 case OP_CRSTAR:
2506 case OP_CRMINSTAR:
2507 case OP_CRPLUS:
2508 case OP_CRMINPLUS:
2509 case OP_CRQUERY:
2510 case OP_CRMINQUERY:
2511 c = *ecode++ - OP_CRSTAR;
2512 minimize = (c & 1) != 0;
2513 min = rep_min[c]; /* Pick up values from tables; */
2514 max = rep_max[c]; /* zero for max => infinity */
2515 if (max == 0) max = INT_MAX;
2516 break;
2517
2518 case OP_CRRANGE:
2519 case OP_CRMINRANGE:
2520 minimize = (*ecode == OP_CRMINRANGE);
2521 min = GET2(ecode, 1);
2522 max = GET2(ecode, 3);
2523 if (max == 0) max = INT_MAX;
2524 ecode += 5;
2525 break;
2526
2527 default: /* No repeat follows */
2528 min = max = 1;
2529 break;
2530 }
2531
2532 /* First, ensure the minimum number of matches are present. */
2533
2534 for (i = 1; i <= min; i++)
2535 {
2536 if (eptr >= md->end_subject)
2537 {
2538 SCHECK_PARTIAL();
2539 MRRETURN(MATCH_NOMATCH);
2540 }
2541 GETCHARINCTEST(c, eptr);
2542 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2543 }
2544
2545 /* If max == min we can continue with the main loop without the
2546 need to recurse. */
2547
2548 if (min == max) continue;
2549
2550 /* If minimizing, keep testing the rest of the expression and advancing
2551 the pointer while it matches the class. */
2552
2553 if (minimize)
2554 {
2555 for (fi = min;; fi++)
2556 {
2557 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2559 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2560 if (eptr >= md->end_subject)
2561 {
2562 SCHECK_PARTIAL();
2563 MRRETURN(MATCH_NOMATCH);
2564 }
2565 GETCHARINCTEST(c, eptr);
2566 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2567 }
2568 /* Control never gets here */
2569 }
2570
2571 /* If maximizing, find the longest possible run, then work backwards. */
2572
2573 else
2574 {
2575 pp = eptr;
2576 for (i = min; i < max; i++)
2577 {
2578 int len = 1;
2579 if (eptr >= md->end_subject)
2580 {
2581 SCHECK_PARTIAL();
2582 break;
2583 }
2584 GETCHARLENTEST(c, eptr, len);
2585 if (!_pcre_xclass(c, data)) break;
2586 eptr += len;
2587 }
2588 for(;;)
2589 {
2590 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2592 if (eptr-- == pp) break; /* Stop if tried at original pos */
2593 if (utf8) BACKCHAR(eptr);
2594 }
2595 MRRETURN(MATCH_NOMATCH);
2596 }
2597
2598 /* Control never gets here */
2599 }
2600 #endif /* End of XCLASS */
2601
2602 /* Match a single character, casefully */
2603
2604 case OP_CHAR:
2605 #ifdef SUPPORT_UTF8
2606 if (utf8)
2607 {
2608 length = 1;
2609 ecode++;
2610 GETCHARLEN(fc, ecode, length);
2611 if (length > md->end_subject - eptr)
2612 {
2613 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2614 MRRETURN(MATCH_NOMATCH);
2615 }
2616 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2617 }
2618 else
2619 #endif
2620
2621 /* Non-UTF-8 mode */
2622 {
2623 if (md->end_subject - eptr < 1)
2624 {
2625 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2626 MRRETURN(MATCH_NOMATCH);
2627 }
2628 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2629 ecode += 2;
2630 }
2631 break;
2632
2633 /* Match a single character, caselessly */
2634
2635 case OP_CHARNC:
2636 #ifdef SUPPORT_UTF8
2637 if (utf8)
2638 {
2639 length = 1;
2640 ecode++;
2641 GETCHARLEN(fc, ecode, length);
2642
2643 if (length > md->end_subject - eptr)
2644 {
2645 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2646 MRRETURN(MATCH_NOMATCH);
2647 }
2648
2649 /* If the pattern character's value is < 128, we have only one byte, and
2650 can use the fast lookup table. */
2651
2652 if (fc < 128)
2653 {
2654 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2655 }
2656
2657 /* Otherwise we must pick up the subject character */
2658
2659 else
2660 {
2661 unsigned int dc;
2662 GETCHARINC(dc, eptr);
2663 ecode += length;
2664
2665 /* If we have Unicode property support, we can use it to test the other
2666 case of the character, if there is one. */
2667
2668 if (fc != dc)
2669 {
2670 #ifdef SUPPORT_UCP
2671 if (dc != UCD_OTHERCASE(fc))
2672 #endif
2673 MRRETURN(MATCH_NOMATCH);
2674 }
2675 }
2676 }
2677 else
2678 #endif /* SUPPORT_UTF8 */
2679
2680 /* Non-UTF-8 mode */
2681 {
2682 if (md->end_subject - eptr < 1)
2683 {
2684 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2685 MRRETURN(MATCH_NOMATCH);
2686 }
2687 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2688 ecode += 2;
2689 }
2690 break;
2691
2692 /* Match a single character repeatedly. */
2693
2694 case OP_EXACT:
2695 min = max = GET2(ecode, 1);
2696 ecode += 3;
2697 goto REPEATCHAR;
2698
2699 case OP_POSUPTO:
2700 possessive = TRUE;
2701 /* Fall through */
2702
2703 case OP_UPTO:
2704 case OP_MINUPTO:
2705 min = 0;
2706 max = GET2(ecode, 1);
2707 minimize = *ecode == OP_MINUPTO;
2708 ecode += 3;
2709 goto REPEATCHAR;
2710
2711 case OP_POSSTAR:
2712 possessive = TRUE;
2713 min = 0;
2714 max = INT_MAX;
2715 ecode++;
2716 goto REPEATCHAR;
2717
2718 case OP_POSPLUS:
2719 possessive = TRUE;
2720 min = 1;
2721 max = INT_MAX;
2722 ecode++;
2723 goto REPEATCHAR;
2724
2725 case OP_POSQUERY:
2726 possessive = TRUE;
2727 min = 0;
2728 max = 1;
2729 ecode++;
2730 goto REPEATCHAR;
2731
2732 case OP_STAR:
2733 case OP_MINSTAR:
2734 case OP_PLUS:
2735 case OP_MINPLUS:
2736 case OP_QUERY:
2737 case OP_MINQUERY:
2738 c = *ecode++ - OP_STAR;
2739 minimize = (c & 1) != 0;
2740
2741 min = rep_min[c]; /* Pick up values from tables; */
2742 max = rep_max[c]; /* zero for max => infinity */
2743 if (max == 0) max = INT_MAX;
2744
2745 /* Common code for all repeated single-character matches. */
2746
2747 REPEATCHAR:
2748 #ifdef SUPPORT_UTF8
2749 if (utf8)
2750 {
2751 length = 1;
2752 charptr = ecode;
2753 GETCHARLEN(fc, ecode, length);
2754 ecode += length;
2755
2756 /* Handle multibyte character matching specially here. There is
2757 support for caseless matching if UCP support is present. */
2758
2759 if (length > 1)
2760 {
2761 #ifdef SUPPORT_UCP
2762 unsigned int othercase;
2763 if ((ims & PCRE_CASELESS) != 0 &&
2764 (othercase = UCD_OTHERCASE(fc)) != fc)
2765 oclength = _pcre_ord2utf8(othercase, occhars);
2766 else oclength = 0;
2767 #endif /* SUPPORT_UCP */
2768
2769 for (i = 1; i <= min; i++)
2770 {
2771 if (eptr <= md->end_subject - length &&
2772 memcmp(eptr, charptr, length) == 0) eptr += length;
2773 #ifdef SUPPORT_UCP
2774 else if (oclength > 0 &&
2775 eptr <= md->end_subject - oclength &&
2776 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2777 #endif /* SUPPORT_UCP */
2778 else
2779 {
2780 CHECK_PARTIAL();
2781 MRRETURN(MATCH_NOMATCH);
2782 }
2783 }
2784
2785 if (min == max) continue;
2786
2787 if (minimize)
2788 {
2789 for (fi = min;; fi++)
2790 {
2791 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2792 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2793 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2794 if (eptr <= md->end_subject - length &&
2795 memcmp(eptr, charptr, length) == 0) eptr += length;
2796 #ifdef SUPPORT_UCP
2797 else if (oclength > 0 &&
2798 eptr <= md->end_subject - oclength &&
2799 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2800 #endif /* SUPPORT_UCP */
2801 else
2802 {
2803 CHECK_PARTIAL();
2804 MRRETURN(MATCH_NOMATCH);
2805 }
2806 }
2807 /* Control never gets here */
2808 }
2809
2810 else /* Maximize */
2811 {
2812 pp = eptr;
2813 for (i = min; i < max; i++)
2814 {
2815 if (eptr <= md->end_subject - length &&
2816 memcmp(eptr, charptr, length) == 0) eptr += length;
2817 #ifdef SUPPORT_UCP
2818 else if (oclength > 0 &&
2819 eptr <= md->end_subject - oclength &&
2820 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2821 #endif /* SUPPORT_UCP */
2822 else
2823 {
2824 CHECK_PARTIAL();
2825 break;
2826 }
2827 }
2828
2829 if (possessive) continue;
2830
2831 for(;;)
2832 {
2833 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2834 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2835 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
2836 #ifdef SUPPORT_UCP
2837 eptr--;
2838 BACKCHAR(eptr);
2839 #else /* without SUPPORT_UCP */
2840 eptr -= length;
2841 #endif /* SUPPORT_UCP */
2842 }
2843 }
2844 /* Control never gets here */
2845 }
2846
2847 /* If the length of a UTF-8 character is 1, we fall through here, and
2848 obey the code as for non-UTF-8 characters below, though in this case the
2849 value of fc will always be < 128. */
2850 }
2851 else
2852 #endif /* SUPPORT_UTF8 */
2853
2854 /* When not in UTF-8 mode, load a single-byte character. */
2855
2856 fc = *ecode++;
2857
2858 /* The value of fc at this point is always less than 256, though we may or
2859 may not be in UTF-8 mode. The code is duplicated for the caseless and
2860 caseful cases, for speed, since matching characters is likely to be quite
2861 common. First, ensure the minimum number of matches are present. If min =
2862 max, continue at the same level without recursing. Otherwise, if
2863 minimizing, keep trying the rest of the expression and advancing one
2864 matching character if failing, up to the maximum. Alternatively, if
2865 maximizing, find the maximum number of characters and work backwards. */
2866
2867 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2868 max, eptr));
2869
2870 if ((ims & PCRE_CASELESS) != 0)
2871 {
2872 fc = md->lcc[fc];
2873 for (i = 1; i <= min; i++)
2874 {
2875 if (eptr >= md->end_subject)
2876 {
2877 SCHECK_PARTIAL();
2878 MRRETURN(MATCH_NOMATCH);
2879 }
2880 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2881 }
2882 if (min == max) continue;
2883 if (minimize)
2884 {
2885 for (fi = min;; fi++)
2886 {
2887 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2888 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2889 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2890 if (eptr >= md->end_subject)
2891 {
2892 SCHECK_PARTIAL();
2893 MRRETURN(MATCH_NOMATCH);
2894 }
2895 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2896 }
2897 /* Control never gets here */
2898 }
2899 else /* Maximize */
2900 {
2901 pp = eptr;
2902 for (i = min; i < max; i++)
2903 {
2904 if (eptr >= md->end_subject)
2905 {
2906 SCHECK_PARTIAL();
2907 break;
2908 }
2909 if (fc != md->lcc[*eptr]) break;
2910 eptr++;
2911 }
2912
2913 if (possessive) continue;
2914
2915 while (eptr >= pp)
2916 {
2917 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2918 eptr--;
2919 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2920 }
2921 MRRETURN(MATCH_NOMATCH);
2922 }
2923 /* Control never gets here */
2924 }
2925
2926 /* Caseful comparisons (includes all multi-byte characters) */
2927
2928 else
2929 {
2930 for (i = 1; i <= min; i++)
2931 {
2932 if (eptr >= md->end_subject)
2933 {
2934 SCHECK_PARTIAL();
2935 MRRETURN(MATCH_NOMATCH);
2936 }
2937 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2938 }
2939
2940 if (min == max) continue;
2941
2942 if (minimize)
2943 {
2944 for (fi = min;; fi++)
2945 {
2946 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2947 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2948 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2949 if (eptr >= md->end_subject)
2950 {
2951 SCHECK_PARTIAL();
2952 MRRETURN(MATCH_NOMATCH);
2953 }
2954 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
2955 }
2956 /* Control never gets here */
2957 }
2958 else /* Maximize */
2959 {
2960 pp = eptr;
2961 for (i = min; i < max; i++)
2962 {
2963 if (eptr >= md->end_subject)
2964 {
2965 SCHECK_PARTIAL();
2966 break;
2967 }
2968 if (fc != *eptr) break;
2969 eptr++;
2970 }
2971 if (possessive) continue;
2972
2973 while (eptr >= pp)
2974 {
2975 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2976 eptr--;
2977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2978 }
2979 MRRETURN(MATCH_NOMATCH);
2980 }
2981 }
2982 /* Control never gets here */
2983
2984 /* Match a negated single one-byte character. The subject character we are
2985 checking can be multibyte. */
2986
2987 case OP_NOT:
2988 if (eptr >= md->end_subject)
2989 {
2990 SCHECK_PARTIAL();
2991 MRRETURN(MATCH_NOMATCH);
2992 }
2993 ecode++;
2994 GETCHARINCTEST(c, eptr);
2995 if ((ims & PCRE_CASELESS) != 0)
2996 {
2997 #ifdef SUPPORT_UTF8
2998 if (c < 256)
2999 #endif
3000 c = md->lcc[c];
3001 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3002 }
3003 else
3004 {
3005 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3006 }
3007 break;
3008
3009 /* Match a negated single one-byte character repeatedly. This is almost a
3010 repeat of the code for a repeated single character, but I haven't found a
3011 nice way of commoning these up that doesn't require a test of the
3012 positive/negative option for each character match. Maybe that wouldn't add
3013 very much to the time taken, but character matching *is* what this is all
3014 about... */
3015
3016 case OP_NOTEXACT:
3017 min = max = GET2(ecode, 1);
3018 ecode += 3;
3019 goto REPEATNOTCHAR;
3020
3021 case OP_NOTUPTO:
3022 case OP_NOTMINUPTO:
3023 min = 0;
3024 max = GET2(ecode, 1);
3025 minimize = *ecode == OP_NOTMINUPTO;
3026 ecode += 3;
3027 goto REPEATNOTCHAR;
3028
3029 case OP_NOTPOSSTAR:
3030 possessive = TRUE;
3031 min = 0;
3032 max = INT_MAX;
3033 ecode++;
3034 goto REPEATNOTCHAR;
3035
3036 case OP_NOTPOSPLUS:
3037 possessive = TRUE;
3038 min = 1;
3039 max = INT_MAX;
3040 ecode++;
3041 goto REPEATNOTCHAR;
3042
3043 case OP_NOTPOSQUERY:
3044 possessive = TRUE;
3045 min = 0;
3046 max = 1;
3047 ecode++;
3048 goto REPEATNOTCHAR;
3049
3050 case OP_NOTPOSUPTO:
3051 possessive = TRUE;
3052 min = 0;
3053 max = GET2(ecode, 1);
3054 ecode += 3;
3055 goto REPEATNOTCHAR;
3056
3057 case OP_NOTSTAR:
3058 case OP_NOTMINSTAR:
3059 case OP_NOTPLUS:
3060 case OP_NOTMINPLUS:
3061 case OP_NOTQUERY:
3062 case OP_NOTMINQUERY:
3063 c = *ecode++ - OP_NOTSTAR;
3064 minimize = (c & 1) != 0;
3065 min = rep_min[c]; /* Pick up values from tables; */
3066 max = rep_max[c]; /* zero for max => infinity */
3067 if (max == 0) max = INT_MAX;
3068
3069 /* Common code for all repeated single-byte matches. */
3070
3071 REPEATNOTCHAR:
3072 fc = *ecode++;
3073
3074 /* The code is duplicated for the caseless and caseful cases, for speed,
3075 since matching characters is likely to be quite common. First, ensure the
3076 minimum number of matches are present. If min = max, continue at the same
3077 level without recursing. Otherwise, if minimizing, keep trying the rest of
3078 the expression and advancing one matching character if failing, up to the
3079 maximum. Alternatively, if maximizing, find the maximum number of
3080 characters and work backwards. */
3081
3082 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3083 max, eptr));
3084
3085 if ((ims & PCRE_CASELESS) != 0)
3086 {
3087 fc = md->lcc[fc];
3088
3089 #ifdef SUPPORT_UTF8
3090 /* UTF-8 mode */
3091 if (utf8)
3092 {
3093 register unsigned int d;
3094 for (i = 1; i <= min; i++)
3095 {
3096 if (eptr >= md->end_subject)
3097 {
3098 SCHECK_PARTIAL();
3099 MRRETURN(MATCH_NOMATCH);
3100 }
3101 GETCHARINC(d, eptr);
3102 if (d < 256) d = md->lcc[d];
3103 if (fc == d) MRRETURN(MATCH_NOMATCH);
3104 }
3105 }
3106 else
3107 #endif
3108
3109 /* Not UTF-8 mode */
3110 {
3111 for (i = 1; i <= min; i++)
3112 {
3113 if (eptr >= md->end_subject)
3114 {
3115 SCHECK_PARTIAL();
3116 MRRETURN(MATCH_NOMATCH);
3117 }
3118 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3119 }
3120 }
3121
3122 if (min == max) continue;
3123
3124 if (minimize)
3125 {
3126 #ifdef SUPPORT_UTF8
3127 /* UTF-8 mode */
3128 if (utf8)
3129 {
3130 register unsigned int d;
3131 for (fi = min;; fi++)
3132 {
3133 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3134 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3135 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3136 if (eptr >= md->end_subject)
3137 {
3138 SCHECK_PARTIAL();
3139 MRRETURN(MATCH_NOMATCH);
3140 }
3141 GETCHARINC(d, eptr);
3142 if (d < 256) d = md->lcc[d];
3143 if (fc == d) MRRETURN(MATCH_NOMATCH);
3144 }
3145 }
3146 else
3147 #endif
3148 /* Not UTF-8 mode */
3149 {
3150 for (fi = min;; fi++)
3151 {
3152 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3153 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3154 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3155 if (eptr >= md->end_subject)
3156 {
3157 SCHECK_PARTIAL();
3158 MRRETURN(MATCH_NOMATCH);
3159 }
3160 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3161 }
3162 }
3163 /* Control never gets here */
3164 }
3165
3166 /* Maximize case */
3167
3168 else
3169 {
3170 pp = eptr;
3171
3172 #ifdef SUPPORT_UTF8
3173 /* UTF-8 mode */
3174 if (utf8)
3175 {
3176 register unsigned int d;
3177 for (i = min; i < max; i++)
3178 {
3179 int len = 1;
3180 if (eptr >= md->end_subject)
3181 {
3182 SCHECK_PARTIAL();
3183 break;
3184 }
3185 GETCHARLEN(d, eptr, len);
3186 if (d < 256) d = md->lcc[d];
3187 if (fc == d) break;
3188 eptr += len;
3189 }
3190 if (possessive) continue;
3191 for(;;)
3192 {
3193 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3195 if (eptr-- == pp) break; /* Stop if tried at original pos */
3196 BACKCHAR(eptr);
3197 }
3198 }
3199 else
3200 #endif
3201 /* Not UTF-8 mode */
3202 {
3203 for (i = min; i < max; i++)
3204 {
3205 if (eptr >= md->end_subject)
3206 {
3207 SCHECK_PARTIAL();
3208 break;
3209 }
3210 if (fc == md->lcc[*eptr]) break;
3211 eptr++;
3212 }
3213 if (possessive) continue;
3214 while (eptr >= pp)
3215 {
3216 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3217 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3218 eptr--;
3219 }
3220 }
3221
3222 MRRETURN(MATCH_NOMATCH);
3223 }
3224 /* Control never gets here */
3225 }
3226
3227 /* Caseful comparisons */
3228
3229 else
3230 {
3231 #ifdef SUPPORT_UTF8
3232 /* UTF-8 mode */
3233 if (utf8)
3234 {
3235 register unsigned int d;
3236 for (i = 1; i <= min; i++)
3237 {
3238 if (eptr >= md->end_subject)
3239 {
3240 SCHECK_PARTIAL();
3241 MRRETURN(MATCH_NOMATCH);
3242 }
3243 GETCHARINC(d, eptr);
3244 if (fc == d) MRRETURN(MATCH_NOMATCH);
3245 }
3246 }
3247 else
3248 #endif
3249 /* Not UTF-8 mode */
3250 {
3251 for (i = 1; i <= min; i++)
3252 {
3253 if (eptr >= md->end_subject)
3254 {
3255 SCHECK_PARTIAL();
3256 MRRETURN(MATCH_NOMATCH);
3257 }
3258 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3259 }
3260 }
3261
3262 if (min == max) continue;
3263
3264 if (minimize)
3265 {
3266 #ifdef SUPPORT_UTF8
3267 /* UTF-8 mode */
3268 if (utf8)
3269 {
3270 register unsigned int d;
3271 for (fi = min;; fi++)
3272 {
3273 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3274 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3275 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3276 if (eptr >= md->end_subject)
3277 {
3278 SCHECK_PARTIAL();
3279 MRRETURN(MATCH_NOMATCH);
3280 }
3281 GETCHARINC(d, eptr);
3282 if (fc == d) MRRETURN(MATCH_NOMATCH);
3283 }
3284 }
3285 else
3286 #endif
3287 /* Not UTF-8 mode */
3288 {
3289 for (fi = min;; fi++)
3290 {
3291 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3292 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3293 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3294 if (eptr >= md->end_subject)
3295 {
3296 SCHECK_PARTIAL();
3297 MRRETURN(MATCH_NOMATCH);
3298 }
3299 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3300 }
3301 }
3302 /* Control never gets here */
3303 }
3304
3305 /* Maximize case */
3306
3307 else
3308 {
3309 pp = eptr;
3310
3311 #ifdef SUPPORT_UTF8
3312 /* UTF-8 mode */
3313 if (utf8)
3314 {
3315 register unsigned int d;
3316 for (i = min; i < max; i++)
3317 {
3318 int len = 1;
3319 if (eptr >= md->end_subject)
3320 {
3321 SCHECK_PARTIAL();
3322 break;
3323 }
3324 GETCHARLEN(d, eptr, len);
3325 if (fc == d) break;
3326 eptr += len;
3327 }
3328 if (possessive) continue;
3329 for(;;)
3330 {
3331 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3332 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3333 if (eptr-- == pp) break; /* Stop if tried at original pos */
3334 BACKCHAR(eptr);
3335 }
3336 }
3337 else
3338 #endif
3339 /* Not UTF-8 mode */
3340 {
3341 for (i = min; i < max; i++)
3342 {
3343 if (eptr >= md->end_subject)
3344 {
3345 SCHECK_PARTIAL();
3346 break;
3347 }
3348 if (fc == *eptr) break;
3349 eptr++;
3350 }
3351 if (possessive) continue;
3352 while (eptr >= pp)
3353 {
3354 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3355 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356 eptr--;
3357 }
3358 }
3359
3360 MRRETURN(MATCH_NOMATCH);
3361 }
3362 }
3363 /* Control never gets here */
3364
3365 /* Match a single character type repeatedly; several different opcodes
3366 share code. This is very similar to the code for single characters, but we
3367 repeat it in the interests of efficiency. */
3368
3369 case OP_TYPEEXACT: /* Character type repeated a fixed number of times:
   min and max are both set to the count that GET2 fetches from ecode + 1. */
3370 min = max = GET2(ecode, 1);
3371 minimize = TRUE; /* Not consulted for this opcode: after REPEATTYPE the
   "if (min == max) continue;" test is reached before "if (minimize)". */
3372 ecode += 3; /* Leave ecode at the type code that REPEATTYPE reads next */
3373 goto REPEATTYPE;
3374
3375 case OP_TYPEUPTO:
3376 case OP_TYPEMINUPTO:
3377 min = 0;
3378 max = GET2(ecode, 1);
3379 minimize = *ecode == OP_TYPEMINUPTO;
3380 ecode += 3;
3381 goto REPEATTYPE;
3382
3383 case OP_TYPEPOSSTAR:
3384 possessive = TRUE;
3385 min = 0;
3386 max = INT_MAX;
3387 ecode++;
3388 goto REPEATTYPE;
3389
3390 case OP_TYPEPOSPLUS:
3391 possessive = TRUE;
3392 min = 1;
3393 max = INT_MAX;
3394 ecode++;
3395 goto REPEATTYPE;
3396
3397 case OP_TYPEPOSQUERY:
3398 possessive = TRUE;
3399 min = 0;
3400 max = 1;
3401 ecode++;
3402 goto REPEATTYPE;
3403
3404 case OP_TYPEPOSUPTO:
3405 possessive = TRUE;
3406 min = 0;
3407 max = GET2(ecode, 1);
3408 ecode += 3;
3409 goto REPEATTYPE;
3410
3411 case OP_TYPESTAR:
3412 case OP_TYPEMINSTAR:
3413 case OP_TYPEPLUS:
3414 case OP_TYPEMINPLUS:
3415 case OP_TYPEQUERY:
3416 case OP_TYPEMINQUERY:
3417 c = *ecode++ - OP_TYPESTAR;
3418 minimize = (c & 1) != 0;
3419 min = rep_min[c]; /* Pick up values from tables; */
3420 max = rep_max[c]; /* zero for max => infinity */
3421 if (max == 0) max = INT_MAX;
3422
3423 /* Common code for all repeated single character type matches. Note that
3424 in UTF-8 mode, '.' matches a character of any length, but for the other
3425 character types, the valid characters are all one-byte long. */
3426
3427 REPEATTYPE:
3428 ctype = *ecode++; /* Code for the character type */
3429
3430 #ifdef SUPPORT_UCP
3431 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3432 {
3433 prop_fail_result = ctype == OP_NOTPROP;
3434 prop_type = *ecode++;
3435 prop_value = *ecode++;
3436 }
3437 else prop_type = -1;
3438 #endif
3439
3440 /* First, ensure the minimum number of matches are present. Use inline
3441 code for maximizing the speed, and do the type test once at the start
3442 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3443 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3444 and single-bytes. */
3445
3446 if (min > 0)
3447 {
3448 #ifdef SUPPORT_UCP
3449 if (prop_type >= 0)
3450 {
3451 switch(prop_type)
3452 {
3453 case PT_ANY:
3454 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3455 for (i = 1; i <= min; i++)
3456 {
3457 if (eptr >= md->end_subject)
3458 {
3459 SCHECK_PARTIAL();
3460 MRRETURN(MATCH_NOMATCH);
3461 }
3462 GETCHARINCTEST(c, eptr);
3463 }
3464 break;
3465
3466 case PT_LAMP:
3467 for (i = 1; i <= min; i++)
3468 {
3469 if (eptr >= md->end_subject)
3470 {
3471 SCHECK_PARTIAL();
3472 MRRETURN(MATCH_NOMATCH);
3473 }
3474 GETCHARINCTEST(c, eptr);
3475 prop_chartype = UCD_CHARTYPE(c);
3476 if ((prop_chartype == ucp_Lu ||
3477 prop_chartype == ucp_Ll ||
3478 prop_chartype == ucp_Lt) == prop_fail_result)
3479 MRRETURN(MATCH_NOMATCH);
3480 }
3481 break;
3482
3483 case PT_GC:
3484 for (i = 1; i <= min; i++)
3485 {
3486 if (eptr >= md->end_subject)
3487 {
3488 SCHECK_PARTIAL();
3489 MRRETURN(MATCH_NOMATCH);
3490 }
3491 GETCHARINCTEST(c, eptr);
3492 prop_category = UCD_CATEGORY(c);
3493 if ((prop_category == prop_value) == prop_fail_result)
3494 MRRETURN(MATCH_NOMATCH);
3495 }
3496 break;
3497
3498 case PT_PC:
3499 for (i = 1; i <= min; i++)
3500 {
3501 if (eptr >= md->end_subject)
3502 {
3503 SCHECK_PARTIAL();
3504 MRRETURN(MATCH_NOMATCH);
3505 }
3506 GETCHARINCTEST(c, eptr);
3507 prop_chartype = UCD_CHARTYPE(c);
3508 if ((prop_chartype == prop_value) == prop_fail_result)
3509 MRRETURN(MATCH_NOMATCH);
3510 }
3511 break;
3512
3513 case PT_SC:
3514 for (i = 1; i <= min; i++)
3515 {
3516 if (eptr >= md->end_subject)
3517 {
3518 SCHECK_PARTIAL();
3519 MRRETURN(MATCH_NOMATCH);
3520 }
3521 GETCHARINCTEST(c, eptr);
3522 prop_script = UCD_SCRIPT(c);
3523 if ((prop_script == prop_value) == prop_fail_result)
3524 MRRETURN(MATCH_NOMATCH);
3525 }
3526 break;
3527
3528 case PT_ALNUM:
3529 for (i = 1; i <= min; i++)
3530 {
3531 if (eptr >= md->end_subject)
3532 {
3533 SCHECK_PARTIAL();
3534 MRRETURN(MATCH_NOMATCH);
3535 }
3536 GETCHARINCTEST(c, eptr);
3537 prop_category = UCD_CATEGORY(c);
3538 if ((prop_category == ucp_L || prop_category == ucp_N)
3539 == prop_fail_result)
3540 MRRETURN(MATCH_NOMATCH);
3541 }
3542 break;
3543
3544 case PT_SPACE: /* Perl space */
3545 for (i = 1; i <= min; i++)
3546 {
3547 if (eptr >= md->end_subject)
3548 {
3549 SCHECK_PARTIAL();
3550 MRRETURN(MATCH_NOMATCH);
3551 }
3552 GETCHARINCTEST(c, eptr);
3553 prop_category = UCD_CATEGORY(c);
3554 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3555 c == CHAR_FF || c == CHAR_CR)
3556 == prop_fail_result)
3557 MRRETURN(MATCH_NOMATCH);
3558 }
3559 break;
3560
3561 case PT_PXSPACE: /* POSIX space */
3562 for (i = 1; i <= min; i++)
3563 {
3564 if (eptr >= md->end_subject)
3565 {
3566 SCHECK_PARTIAL();
3567 MRRETURN(MATCH_NOMATCH);
3568 }
3569 GETCHARINCTEST(c, eptr);
3570 prop_category = UCD_CATEGORY(c);
3571 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3572 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3573 == prop_fail_result)
3574 MRRETURN(MATCH_NOMATCH);
3575 }
3576 break;
3577
3578 case PT_WORD:
3579 for (i = 1; i <= min; i++)
3580 {
3581 if (eptr >= md->end_subject)
3582 {
3583 SCHECK_PARTIAL();
3584 MRRETURN(MATCH_NOMATCH);
3585 }
3586 GETCHARINCTEST(c, eptr);
3587 prop_category = UCD_CATEGORY(c);
3588 if ((prop_category == ucp_L || prop_category == ucp_N ||
3589 c == CHAR_UNDERSCORE)
3590 == prop_fail_result)
3591 MRRETURN(MATCH_NOMATCH);
3592 }
3593 break;
3594
3595 /* This should not occur */
3596
3597 default:
3598 RRETURN(PCRE_ERROR_INTERNAL);
3599 }
3600 }
3601
3602 /* Match extended Unicode sequences. We will get here only if the
3603 support is in the binary; otherwise a compile-time error occurs. */
3604
3605 else if (ctype == OP_EXTUNI)
3606 {
3607 for (i = 1; i <= min; i++)
3608 {
3609 if (eptr >= md->end_subject)
3610 {
3611 SCHECK_PARTIAL();
3612 MRRETURN(MATCH_NOMATCH);
3613 }
3614 GETCHARINCTEST(c, eptr);
3615 prop_category = UCD_CATEGORY(c);
3616 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3617 while (eptr < md->end_subject)
3618 {
3619 int len = 1;
3620 if (!utf8) c = *eptr;
3621 else { GETCHARLEN(c, eptr, len); }
3622 prop_category = UCD_CATEGORY(c);
3623 if (prop_category != ucp_M) break;
3624 eptr += len;
3625 }
3626 }
3627 }
3628
3629 else
3630 #endif /* SUPPORT_UCP */
3631
3632 /* Handle all other cases when the coding is UTF-8 */
3633
3634 #ifdef SUPPORT_UTF8
3635 if (utf8) switch(ctype)
3636 {
3637 case OP_ANY:
3638 for (i = 1; i <= min; i++)
3639 {
3640 if (eptr >= md->end_subject)
3641 {
3642 SCHECK_PARTIAL();
3643 MRRETURN(MATCH_NOMATCH);
3644 }
3645 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3646 eptr++;
3647 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3648 }
3649 break;
3650
3651 case OP_ALLANY:
3652 for (i = 1; i <= min; i++)
3653 {
3654 if (eptr >= md->end_subject)
3655 {
3656 SCHECK_PARTIAL();
3657 MRRETURN(MATCH_NOMATCH);
3658 }
3659 eptr++;
3660 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3661 }
3662 break;
3663
3664 case OP_ANYBYTE: /* UTF-8 branch: satisfy the minimum count by advancing
   eptr by min in one step instead of looping over characters. */
3665 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH); /* Fewer than
   min positions remain before end_subject. NOTE(review): the non-UTF-8
   OP_ANYBYTE case calls SCHECK_PARTIAL() before failing on this same
   condition and this one does not - confirm the difference is intended. */
3666 eptr += min;
3667 break;
3668
3669 case OP_ANYNL:
3670 for (i = 1; i <= min; i++)
3671 {
3672 if (eptr >= md->end_subject)
3673 {
3674 SCHECK_PARTIAL();
3675 MRRETURN(MATCH_NOMATCH);
3676 }
3677 GETCHARINC(c, eptr);
3678 switch(c)
3679 {
3680 default: MRRETURN(MATCH_NOMATCH);
3681 case 0x000d:
3682 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3683 break;
3684
3685 case 0x000a:
3686 break;
3687
3688 case 0x000b:
3689 case 0x000c:
3690 case 0x0085:
3691 case 0x2028:
3692 case 0x2029:
3693 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3694 break;
3695 }
3696 }
3697 break;
3698
3699 case OP_NOT_HSPACE:
3700 for (i = 1; i <= min; i++)
3701 {
3702 if (eptr >= md->end_subject)
3703 {
3704 SCHECK_PARTIAL();
3705 MRRETURN(MATCH_NOMATCH);
3706 }
3707 GETCHARINC(c, eptr);
3708 switch(c)
3709 {
3710 default: break;
3711 case 0x09: /* HT */
3712 case 0x20: /* SPACE */
3713 case 0xa0: /* NBSP */
3714 case 0x1680: /* OGHAM SPACE MARK */
3715 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3716 case 0x2000: /* EN QUAD */
3717 case 0x2001: /* EM QUAD */
3718 case 0x2002: /* EN SPACE */
3719 case 0x2003: /* EM SPACE */
3720 case 0x2004: /* THREE-PER-EM SPACE */
3721 case 0x2005: /* FOUR-PER-EM SPACE */
3722 case 0x2006: /* SIX-PER-EM SPACE */
3723 case 0x2007: /* FIGURE SPACE */
3724 case 0x2008: /* PUNCTUATION SPACE */
3725 case 0x2009: /* THIN SPACE */
3726 case 0x200A: /* HAIR SPACE */
3727 case 0x202f: /* NARROW NO-BREAK SPACE */
3728 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3729 case 0x3000: /* IDEOGRAPHIC SPACE */
3730 MRRETURN(MATCH_NOMATCH);
3731 }
3732 }
3733 break;
3734
3735 case OP_HSPACE:
3736 for (i = 1; i <= min; i++)
3737 {
3738 if (eptr >= md->end_subject)
3739 {
3740 SCHECK_PARTIAL();
3741 MRRETURN(MATCH_NOMATCH);
3742 }
3743 GETCHARINC(c, eptr);
3744 switch(c)
3745 {
3746 default: MRRETURN(MATCH_NOMATCH);
3747 case 0x09: /* HT */
3748 case 0x20: /* SPACE */
3749 case 0xa0: /* NBSP */
3750 case 0x1680: /* OGHAM SPACE MARK */
3751 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3752 case 0x2000: /* EN QUAD */
3753 case 0x2001: /* EM QUAD */
3754 case 0x2002: /* EN SPACE */
3755 case 0x2003: /* EM SPACE */
3756 case 0x2004: /* THREE-PER-EM SPACE */
3757 case 0x2005: /* FOUR-PER-EM SPACE */
3758 case 0x2006: /* SIX-PER-EM SPACE */
3759 case 0x2007: /* FIGURE SPACE */
3760 case 0x2008: /* PUNCTUATION SPACE */
3761 case 0x2009: /* THIN SPACE */
3762 case 0x200A: /* HAIR SPACE */
3763 case 0x202f: /* NARROW NO-BREAK SPACE */
3764 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3765 case 0x3000: /* IDEOGRAPHIC SPACE */
3766 break;
3767 }
3768 }
3769 break;
3770
3771 case OP_NOT_VSPACE:
3772 for (i = 1; i <= min; i++)
3773 {
3774 if (eptr >= md->end_subject)
3775 {
3776 SCHECK_PARTIAL();
3777 MRRETURN(MATCH_NOMATCH);
3778 }
3779 GETCHARINC(c, eptr);
3780 switch(c)
3781 {
3782 default: break;
3783 case 0x0a: /* LF */
3784 case 0x0b: /* VT */
3785 case 0x0c: /* FF */
3786 case 0x0d: /* CR */
3787 case 0x85: /* NEL */
3788 case 0x2028: /* LINE SEPARATOR */
3789 case 0x2029: /* PARAGRAPH SEPARATOR */
3790 MRRETURN(MATCH_NOMATCH);
3791 }
3792 }
3793 break;
3794
3795 case OP_VSPACE:
3796 for (i = 1; i <= min; i++)
3797 {
3798 if (eptr >= md->end_subject)
3799 {
3800 SCHECK_PARTIAL();
3801 MRRETURN(MATCH_NOMATCH);
3802 }
3803 GETCHARINC(c, eptr);
3804 switch(c)
3805 {
3806 default: MRRETURN(MATCH_NOMATCH);
3807 case 0x0a: /* LF */
3808 case 0x0b: /* VT */
3809 case 0x0c: /* FF */
3810 case 0x0d: /* CR */
3811 case 0x85: /* NEL */
3812 case 0x2028: /* LINE SEPARATOR */
3813 case 0x2029: /* PARAGRAPH SEPARATOR */
3814 break;
3815 }
3816 }
3817 break;
3818
3819 case OP_NOT_DIGIT:
3820 for (i = 1; i <= min; i++)
3821 {
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 MRRETURN(MATCH_NOMATCH);
3826 }
3827 GETCHARINC(c, eptr);
3828 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3829 MRRETURN(MATCH_NOMATCH);
3830 }
3831 break;
3832
3833 case OP_DIGIT:
3834 for (i = 1; i <= min; i++)
3835 {
3836 if (eptr >= md->end_subject)
3837 {
3838 SCHECK_PARTIAL();
3839 MRRETURN(MATCH_NOMATCH);
3840 }
3841 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3842 MRRETURN(MATCH_NOMATCH);
3843 /* No need to skip more bytes - we know it's a 1-byte character */
3844 }
3845 break;
3846
3847 case OP_NOT_WHITESPACE:
3848 for (i = 1; i <= min; i++)
3849 {
3850 if (eptr >= md->end_subject)
3851 {
3852 SCHECK_PARTIAL();
3853 MRRETURN(MATCH_NOMATCH);
3854 }
3855 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3856 MRRETURN(MATCH_NOMATCH);
3857 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3858 }
3859 break;
3860
3861 case OP_WHITESPACE:
3862 for (i = 1; i <= min; i++)
3863 {
3864 if (eptr >= md->end_subject)
3865 {
3866 SCHECK_PARTIAL();
3867 MRRETURN(MATCH_NOMATCH);
3868 }
3869 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3870 MRRETURN(MATCH_NOMATCH);
3871 /* No need to skip more bytes - we know it's a 1-byte character */
3872 }
3873 break;
3874
3875 case OP_NOT_WORDCHAR:
3876 for (i = 1; i <= min; i++)
3877 {
3878 if (eptr >= md->end_subject)
3879 {
3880 SCHECK_PARTIAL();
3881 MRRETURN(MATCH_NOMATCH);
3882 }
3883 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3884 MRRETURN(MATCH_NOMATCH);
3885 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3886 }
3887 break;
3888
3889 case OP_WORDCHAR:
3890 for (i = 1; i <= min; i++)
3891 {
3892 if (eptr >= md->end_subject)
3893 {
3894 SCHECK_PARTIAL();
3895 MRRETURN(MATCH_NOMATCH);
3896 }
3897 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3898 MRRETURN(MATCH_NOMATCH);
3899 /* No need to skip more bytes - we know it's a 1-byte character */
3900 }
3901 break;
3902
3903 default:
3904 RRETURN(PCRE_ERROR_INTERNAL);
3905 } /* End switch(ctype) */
3906
3907 else
3908 #endif /* SUPPORT_UTF8 */
3909
3910 /* Code for the non-UTF-8 case for minimum matching of operators other
3911 than OP_PROP and OP_NOTPROP. */
3912
3913 switch(ctype)
3914 {
3915 case OP_ANY:
3916 for (i = 1; i <= min; i++)
3917 {
3918 if (eptr >= md->end_subject)
3919 {
3920 SCHECK_PARTIAL();
3921 MRRETURN(MATCH_NOMATCH);
3922 }
3923 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3924 eptr++;
3925 }
3926 break;
3927
3928 case OP_ALLANY:
3929 if (eptr > md->end_subject - min)
3930 {
3931 SCHECK_PARTIAL();
3932 MRRETURN(MATCH_NOMATCH);
3933 }
3934 eptr += min;
3935 break;
3936
3937 case OP_ANYBYTE:
3938 if (eptr > md->end_subject - min)
3939 {
3940 SCHECK_PARTIAL();
3941 MRRETURN(MATCH_NOMATCH);
3942 }
3943 eptr += min;
3944 break;
3945
3946 case OP_ANYNL:
3947 for (i = 1; i <= min; i++)
3948 {
3949 if (eptr >= md->end_subject)
3950 {
3951 SCHECK_PARTIAL();
3952 MRRETURN(MATCH_NOMATCH);
3953 }
3954 switch(*eptr++)
3955 {
3956 default: MRRETURN(MATCH_NOMATCH);
3957 case 0x000d:
3958 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3959 break;
3960 case 0x000a:
3961 break;
3962
3963 case 0x000b:
3964 case 0x000c:
3965 case 0x0085:
3966 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3967 break;
3968 }
3969 }
3970 break;
3971
3972 case OP_NOT_HSPACE:
3973 for (i = 1; i <= min; i++)
3974 {
3975 if (eptr >= md->end_subject)
3976 {
3977 SCHECK_PARTIAL();
3978 MRRETURN(MATCH_NOMATCH);
3979 }
3980 switch(*eptr++)
3981 {
3982 default: break;
3983 case 0x09: /* HT */
3984 case 0x20: /* SPACE */
3985 case 0xa0: /* NBSP */
3986 MRRETURN(MATCH_NOMATCH);
3987 }
3988 }
3989 break;
3990
3991 case OP_HSPACE:
3992 for (i = 1; i <= min; i++)
3993 {
3994 if (eptr >= md->end_subject)
3995 {
3996 SCHECK_PARTIAL();
3997 MRRETURN(MATCH_NOMATCH);
3998 }
3999 switch(*eptr++)
4000 {
4001 default: MRRETURN(MATCH_NOMATCH);
4002 case 0x09: /* HT */
4003 case 0x20: /* SPACE */
4004 case 0xa0: /* NBSP */
4005 break;
4006 }
4007 }
4008 break;
4009
4010 case OP_NOT_VSPACE:
4011 for (i = 1; i <= min; i++)
4012 {
4013 if (eptr >= md->end_subject)
4014 {
4015 SCHECK_PARTIAL();
4016 MRRETURN(MATCH_NOMATCH);
4017 }
4018 switch(*eptr++)
4019 {
4020 default: break;
4021 case 0x0a: /* LF */
4022 case 0x0b: /* VT */
4023 case 0x0c: /* FF */
4024 case 0x0d: /* CR */
4025 case 0x85: /* NEL */
4026 MRRETURN(MATCH_NOMATCH);
4027 }
4028 }
4029 break;
4030
4031 case OP_VSPACE:
4032 for (i = 1; i <= min; i++)
4033 {
4034 if (eptr >= md->end_subject)
4035 {
4036 SCHECK_PARTIAL();
4037 MRRETURN(MATCH_NOMATCH);
4038 }
4039 switch(*eptr++)
4040 {
4041 default: MRRETURN(MATCH_NOMATCH);
4042 case 0x0a: /* LF */
4043 case 0x0b: /* VT */
4044 case 0x0c: /* FF */
4045 case 0x0d: /* CR */
4046 case 0x85: /* NEL */
4047 break;
4048 }
4049 }
4050 break;
4051
4052 case OP_NOT_DIGIT:
4053 for (i = 1; i <= min; i++)
4054 {
4055 if (eptr >= md->end_subject)
4056 {
4057 SCHECK_PARTIAL();
4058 MRRETURN(MATCH_NOMATCH);
4059 }
4060 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4061 }
4062 break;
4063
4064 case OP_DIGIT:
4065 for (i = 1; i <= min; i++)
4066 {
4067 if (eptr >= md->end_subject)
4068 {
4069 SCHECK_PARTIAL();
4070 MRRETURN(MATCH_NOMATCH);
4071 }
4072 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4073 }
4074 break;
4075
4076 case OP_NOT_WHITESPACE:
4077 for (i = 1; i <= min; i++)
4078 {
4079 if (eptr >= md->end_subject)
4080 {
4081 SCHECK_PARTIAL();
4082 MRRETURN(MATCH_NOMATCH);
4083 }
4084 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4085 }
4086 break;
4087
4088 case OP_WHITESPACE:
4089 for (i = 1; i <= min; i++)
4090 {
4091 if (eptr >= md->end_subject)
4092 {
4093 SCHECK_PARTIAL();
4094 MRRETURN(MATCH_NOMATCH);
4095 }
4096 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4097 }
4098 break;
4099
4100 case OP_NOT_WORDCHAR:
4101 for (i = 1; i <= min; i++)
4102 {
4103 if (eptr >= md->end_subject)
4104 {
4105 SCHECK_PARTIAL();
4106 MRRETURN(MATCH_NOMATCH);
4107 }
4108 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4109 MRRETURN(MATCH_NOMATCH);
4110 }
4111 break;
4112
4113 case OP_WORDCHAR:
4114 for (i = 1; i <= min; i++)
4115 {
4116 if (eptr >= md->end_subject)
4117 {
4118 SCHECK_PARTIAL();
4119 MRRETURN(MATCH_NOMATCH);
4120 }
4121 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4122 MRRETURN(MATCH_NOMATCH);
4123 }
4124 break;
4125
4126 default:
4127 RRETURN(PCRE_ERROR_INTERNAL);
4128 }
4129 }
4130
4131 /* If min = max, continue at the same level without recursing */
4132
4133 if (min == max) continue;
4134
4135 /* If minimizing, we have to test the rest of the pattern before each
4136 subsequent match. Again, separate the UTF-8 case for speed, and also
4137 separate the UCP cases. */
4138
4139 if (minimize)
4140 {
4141 #ifdef SUPPORT_UCP
4142 if (prop_type >= 0)
4143 {
4144 switch(prop_type)
4145 {
4146 case PT_ANY: /* Minimizing repeat of the "any character" property. Each
   pass first tries the rest of the pattern at the current position and only
   consumes another character if that attempt returns MATCH_NOMATCH. */
4147 for (fi = min;; fi++)
4148 {
4149 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4150 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Pass any other result up */
4151 if (fi >= max) MRRETURN(MATCH_NOMATCH); /* Repeat limit reached */
4152 if (eptr >= md->end_subject)
4153 {
4154 SCHECK_PARTIAL();
4155 MRRETURN(MATCH_NOMATCH);
4156 }
4157 GETCHARINC(c, eptr); /* NOTE(review): the minimum-count loop for PT_ANY
   reads its character with GETCHARINCTEST, and the OP_EXTUNI code in this
   same UCP section tests utf8 explicitly, so this path may be reachable
   outside UTF-8 mode - confirm GETCHARINC is correct in that case. */
4158 if (prop_fail_result) MRRETURN(MATCH_NOMATCH); /* Negated form never matches */
4159 }
4160 /* Control never gets here */
4161
4162 case PT_LAMP:
4163 for (fi = min;; fi++)
4164 {
4165 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4166 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4167 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4168 if (eptr >= md->end_subject)
4169 {
4170 SCHECK_PARTIAL();
4171 MRRETURN(MATCH_NOMATCH);
4172 }
4173 GETCHARINC(c, eptr);
4174 prop_chartype = UCD_CHARTYPE(c);
4175 if ((prop_chartype == ucp_Lu ||
4176 prop_chartype == ucp_Ll ||
4177 prop_chartype == ucp_Lt) == prop_fail_result)
4178 MRRETURN(MATCH_NOMATCH);
4179 }
4180 /* Control never gets here */
4181
4182 case PT_GC:
4183 for (fi = min;; fi++)
4184 {
4185 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4186 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4187 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4188 if (eptr >= md->end_subject)
4189 {
4190 SCHECK_PARTIAL();
4191 MRRETURN(MATCH_NOMATCH);
4192 }
4193 GETCHARINC(c, eptr);
4194 prop_category = UCD_CATEGORY(c);
4195 if ((prop_category == prop_value) == prop_fail_result)
4196 MRRETURN(MATCH_NOMATCH);
4197 }
4198 /* Control never gets here */
4199
4200 case PT_PC:
4201 for (fi = min;; fi++)
4202 {
4203 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4204 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4205 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4206 if (eptr >= md->end_subject)
4207 {
4208 SCHECK_PARTIAL();
4209 MRRETURN(MATCH_NOMATCH);
4210 }
4211 GETCHARINC(c, eptr);
4212 prop_chartype = UCD_CHARTYPE(c);
4213 if ((prop_chartype == prop_value) == prop_fail_result)
4214 MRRETURN(MATCH_NOMATCH);
4215 }
4216 /* Control never gets here */
4217
4218 case PT_SC:
4219 for (fi = min;; fi++)
4220 {
4221 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4222 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4223 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4224 if (eptr >= md->end_subject)
4225 {
4226 SCHECK_PARTIAL();
4227 MRRETURN(MATCH_NOMATCH);
4228 }
4229 GETCHARINC(c, eptr);
4230 prop_script = UCD_SCRIPT(c);
4231 if ((prop_script == prop_value) == prop_fail_result)
4232 MRRETURN(MATCH_NOMATCH);
4233 }
4234 /* Control never gets here */
4235
4236 case PT_ALNUM: /* Minimizing repeat of the "letter or number" property.
   Each pass first tries the rest of the pattern at the current position and
   only consumes another character if that attempt returns MATCH_NOMATCH. */
4237 for (fi = min;; fi++)
4238 {
4239 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); /* NOTE(review):
   RM39 is also the value passed by the PT_PC case earlier in this switch,
   whereas the cases before it each pass a distinct RMnn. If RMATCH requires
   a unique value per call site this is a copy/paste slip - confirm against
   the RMATCH definition and the RMnn list. */
4240 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Pass any other result up */
4241 if (fi >= max) MRRETURN(MATCH_NOMATCH); /* Repeat limit reached */
4242 if (eptr >= md->end_subject)
4243 {
4244 SCHECK_PARTIAL();
4245 MRRETURN(MATCH_NOMATCH);
4246 }
4247 GETCHARINC(c, eptr);
4248 prop_category = UCD_CATEGORY(c);
4249 if ((prop_category == ucp_L || prop_category == ucp_N)
4250 == prop_fail_result) /* prop_fail_result is set for OP_NOTPROP, so the
   same comparison serves both the plain and the negated property */
4251 MRRETURN(MATCH_NOMATCH);
4252 }
4253 /* Control never gets here */
4254
4255 case PT_SPACE: /* Perl space: category Z, or one of HT, NL, FF, CR.
   CHAR_VT is not in this list; the PT_PXSPACE case adds it. This is the
   minimizing repeat, so each pass tries the rest of the pattern first. */
4256 for (fi = min;; fi++)
4257 {
4258 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); /* NOTE(review):
   RM39 is also the value passed by the PT_PC case earlier in this switch,
   whereas the cases before it each pass a distinct RMnn. If RMATCH requires
   a unique value per call site this is a copy/paste slip - confirm against
   the RMATCH definition and the RMnn list. */
4259 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Pass any other result up */
4260 if (fi >= max) MRRETURN(MATCH_NOMATCH); /* Repeat limit reached */
4261 if (eptr >= md->end_subject)
4262 {
4263 SCHECK_PARTIAL();
4264 MRRETURN(MATCH_NOMATCH);
4265 }
4266 GETCHARINC(c, eptr);
4267 prop_category = UCD_CATEGORY(c);
4268 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4269 c == CHAR_FF || c == CHAR_CR)
4270 == prop_fail_result) /* prop_fail_result is set for OP_NOTPROP, so the
   same comparison serves both the plain and the negated property */
4271 MRRETURN(MATCH_NOMATCH);
4272 }
4273 /* Control never gets here */
4274
4275 case PT_PXSPACE: /* POSIX space: category Z, or one of HT, NL, VT, FF, CR.
   This is the PT_SPACE list plus CHAR_VT. This is the minimizing repeat, so
   each pass tries the rest of the pattern first. */
4276 for (fi = min;; fi++)
4277 {
4278 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); /* NOTE(review):
   RM39 is also the value passed by the PT_PC case earlier in this switch,
   whereas the cases before it each pass a distinct RMnn. If RMATCH requires
   a unique value per call site this is a copy/paste slip - confirm against
   the RMATCH definition and the RMnn list. */
4279 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Pass any other result up */
4280 if (fi >= max) MRRETURN(MATCH_NOMATCH); /* Repeat limit reached */
4281 if (eptr >= md->end_subject)
4282 {
4283 SCHECK_PARTIAL();
4284 MRRETURN(MATCH_NOMATCH);
4285 }
4286 GETCHARINC(c, eptr);
4287 prop_category = UCD_CATEGORY(c);
4288 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4289 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4290 == prop_fail_result) /* prop_fail_result is set for OP_NOTPROP, so the
   same comparison serves both the plain and the negated property */
4291 MRRETURN(MATCH_NOMATCH);
4292 }
4293 /* Control never gets here */
4294
4295 case PT_WORD: /* Minimizing repeat of the "word character" property:
   category L or N, or an underscore. Each pass first tries the rest of the
   pattern at the current position and only consumes another character if
   that attempt returns MATCH_NOMATCH. */
4296 for (fi = min;; fi++)
4297 {
4298 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39); /* NOTE(review):
   RM39 is also the value passed by the PT_PC case earlier in this switch,
   whereas the cases before it each pass a distinct RMnn. If RMATCH requires
   a unique value per call site this is a copy/paste slip - confirm against
   the RMATCH definition and the RMnn list. */
4299 if (rrc != MATCH_NOMATCH) RRETURN(rrc); /* Pass any other result up */
4300 if (fi >= max) MRRETURN(MATCH_NOMATCH); /* Repeat limit reached */
4301 if (eptr >= md->end_subject)
4302 {
4303 SCHECK_PARTIAL();
4304 MRRETURN(MATCH_NOMATCH);
4305 }
4306 GETCHARINC(c, eptr);
4307 prop_category = UCD_CATEGORY(c);
4308 if ((prop_category == ucp_L ||
4309 prop_category == ucp_N ||
4310 c == CHAR_UNDERSCORE)
4311 == prop_fail_result) /* prop_fail_result is set for OP_NOTPROP, so the
   same comparison serves both the plain and the negated property */
4312 MRRETURN(MATCH_NOMATCH);
4313 }
4314 /* Control never gets here */
4315
4316 /* This should never occur */
4317
4318 default:
4319 RRETURN(PCRE_ERROR_INTERNAL);
4320 }
4321 }
4322
4323 /* Match extended Unicode sequences. We will get here only if the
4324 support is in the binary; otherwise a compile-time error occurs. */
4325
4326 else if (ctype == OP_EXTUNI)
4327 {
4328 for (fi = min;; fi++)
4329 {
4330 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4331 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4332 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4333 if (eptr >= md->end_subject)
4334 {
4335 SCHECK_PARTIAL();
4336 MRRETURN(MATCH_NOMATCH);
4337 }
4338 GETCHARINCTEST(c, eptr);
4339 prop_category = UCD_CATEGORY(c);
4340 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4341 while (eptr < md->end_subject)
4342 {
4343 int len = 1;
4344 if (!utf8) c = *eptr;
4345 else { GETCHARLEN(c, eptr, len); }
4346 prop_category = UCD_CATEGORY(c);
4347 if (prop_category != ucp_M) break;
4348 eptr += len;
4349 }
4350 }
4351 }
4352
4353 else
4354 #endif /* SUPPORT_UCP */
4355
4356 #ifdef SUPPORT_UTF8
4357 /* UTF-8 mode */
4358 if (utf8)
4359 {
4360 for (fi = min;; fi++)
4361 {
4362 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4363 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4364 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4365 if (eptr >= md->end_subject)
4366 {
4367 SCHECK_PARTIAL();
4368 MRRETURN(MATCH_NOMATCH);
4369 }
4370 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4371 MRRETURN(MATCH_NOMATCH);
4372 GETCHARINC(c, eptr);
4373 switch(ctype)
4374 {
4375 case OP_ANY: /* This is the non-NL case */
4376 case OP_ALLANY:
4377 case OP_ANYBYTE:
4378 break;
4379
4380 case OP_ANYNL:
4381 switch(c)
4382 {
4383 default: MRRETURN(MATCH_NOMATCH);
4384 case 0x000d:
4385 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4386 break;
4387 case 0x000a:
4388 break;
4389
4390 case 0x000b:
4391 case 0x000c:
4392 case 0x0085:
4393 case 0x2028:
4394 case 0x2029:
4395 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4396 break;
4397 }
4398 break;
4399
4400 case OP_NOT_HSPACE:
4401 switch(c)
4402 {
4403 default: break;
4404 case 0x09: /* HT */
4405 case 0x20: /* SPACE */
4406 case 0xa0: /* NBSP */
4407 case 0x1680: /* OGHAM SPACE MARK */
4408 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4409 case 0x2000: /* EN QUAD */
4410 case 0x2001: /* EM QUAD */
4411 case 0x2002: /* EN SPACE */
4412 case 0x2003: /* EM SPACE */
4413 case 0x2004: /* THREE-PER-EM SPACE */
4414 case 0x2005: /* FOUR-PER-EM SPACE */
4415 case 0x2006: /* SIX-PER-EM SPACE */
4416 case 0x2007: /* FIGURE SPACE */
4417 case 0x2008: /* PUNCTUATION SPACE */
4418 case 0x2009: /* THIN SPACE */
4419 case 0x200A: /* HAIR SPACE */
4420 case 0x202f: /* NARROW NO-BREAK SPACE */
4421 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4422 case 0x3000: /* IDEOGRAPHIC SPACE */
4423 MRRETURN(MATCH_NOMATCH);
4424 }
4425 break;
4426
4427 case OP_HSPACE:
4428 switch(c)
4429 {
4430 default: MRRETURN(MATCH_NOMATCH);
4431 case 0x09: /* HT */
4432 case 0x20: /* SPACE */
4433 case 0xa0: /* NBSP */
4434 case 0x1680: /* OGHAM SPACE MARK */
4435 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4436 case 0x2000: /* EN QUAD */
4437 case 0x2001: /* EM QUAD */
4438 case 0x2002: /* EN SPACE */
4439 case 0x2003: /* EM SPACE */
4440 case 0x2004: /* THREE-PER-EM SPACE */
4441 case 0x2005: /* FOUR-PER-EM SPACE */
4442 case 0x2006: /* SIX-PER-EM SPACE */
4443 case 0x2007: /* FIGURE SPACE */
4444 case 0x2008: /* PUNCTUATION SPACE */
4445 case 0x2009: /* THIN SPACE */
4446 case 0x200A: /* HAIR SPACE */
4447 case 0x202f: /* NARROW NO-BREAK SPACE */
4448 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4449 case 0x3000: /* IDEOGRAPHIC SPACE */
4450 break;
4451 }
4452 break;
4453
4454 case OP_NOT_VSPACE:
4455 switch(c)
4456 {
4457 default: break;
4458 case 0x0a: /* LF */
4459 case 0x0b: /* VT */
4460 case 0x0c: /* FF */
4461 case 0x0d: /* CR */
4462 case 0x85: /* NEL */
4463 case 0x2028: /* LINE SEPARATOR */
4464 case 0x2029: /* PARAGRAPH SEPARATOR */
4465 MRRETURN(MATCH_NOMATCH);
4466 }
4467 break;
4468
4469 case OP_VSPACE:
4470 switch(c)
4471 {
4472 default: MRRETURN(MATCH_NOMATCH);
4473 case 0x0a: /* LF */
4474 case 0x0b: /* VT */
4475 case 0x0c: /* FF */
4476 case 0x0d: /* CR */
4477 case 0x85: /* NEL */
4478 case 0x2028: /* LINE SEPARATOR */
4479 case 0x2029: /* PARAGRAPH SEPARATOR */
4480 break;
4481 }
4482 break;
4483
4484 case OP_NOT_DIGIT:
4485 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4486 MRRETURN(MATCH_NOMATCH);
4487 break;
4488
4489 case OP_DIGIT:
4490 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4491 MRRETURN(MATCH_NOMATCH);
4492 break;
4493
4494 case OP_NOT_WHITESPACE:
4495 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4496 MRRETURN(MATCH_NOMATCH);
4497 break;
4498
4499 case OP_WHITESPACE:
4500 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4501 MRRETURN(MATCH_NOMATCH);
4502 break;
4503
4504 case OP_NOT_WORDCHAR:
4505 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4506 MRRETURN(MATCH_NOMATCH);
4507 break;
4508
4509 case OP_WORDCHAR:
4510 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4511 MRRETURN(MATCH_NOMATCH);
4512 break;
4513
4514 default:
4515 RRETURN(PCRE_ERROR_INTERNAL);
4516 }
4517 }
4518 }
4519 else
4520 #endif
4521 /* Not UTF-8 mode */
4522 {
4523 for (fi = min;; fi++)
4524 {
4525 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4527 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4528 if (eptr >= md->end_subject)
4529 {
4530 SCHECK_PARTIAL();
4531 MRRETURN(MATCH_NOMATCH);
4532 }
4533 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4534 MRRETURN(MATCH_NOMATCH);
4535 c = *eptr++;
4536 switch(ctype)
4537 {
4538 case OP_ANY: /* This is the non-NL case */
4539 case OP_ALLANY:
4540 case OP_ANYBYTE:
4541 break;
4542
4543 case OP_ANYNL:
4544 switch(c)
4545 {
4546 default: MRRETURN(MATCH_NOMATCH);
4547 case 0x000d:
4548 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4549 break;
4550
4551 case 0x000a:
4552 break;
4553
4554 case 0x000b:
4555 case 0x000c:
4556 case 0x0085:
4557 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4558 break;
4559 }
4560 break;
4561
4562 case OP_NOT_HSPACE:
4563 switch(c)
4564 {
4565 default: break;
4566 case 0x09: /* HT */
4567 case 0x20: /* SPACE */
4568 case 0xa0: /* NBSP */
4569 MRRETURN(MATCH_NOMATCH);
4570 }
4571 break;
4572
4573 case OP_HSPACE:
4574 switch(c)
4575 {
4576 default: MRRETURN(MATCH_NOMATCH);
4577 case 0x09: /* HT */
4578 case 0x20: /* SPACE */
4579 case 0xa0: /* NBSP */
4580 break;
4581 }
4582 break;
4583
4584 case OP_NOT_VSPACE:
4585 switch(c)
4586 {
4587 default: break;
4588 case 0x0a: /* LF */
4589 case 0x0b: /* VT */
4590 case 0x0c: /* FF */
4591 case 0x0d: /* CR */
4592 case 0x85: /* NEL */
4593 MRRETURN(MATCH_NOMATCH);
4594 }
4595 break;
4596
4597 case OP_VSPACE:
4598 switch(c)
4599 {
4600 default: MRRETURN(MATCH_NOMATCH);
4601 case 0x0a: /* LF */
4602 case 0x0b: /* VT */
4603 case 0x0c: /* FF */
4604 case 0x0d: /* CR */
4605 case 0x85: /* NEL */
4606 break;
4607 }
4608 break;
4609
4610 case OP_NOT_DIGIT:
4611 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4612 break;
4613
4614 case OP_DIGIT:
4615 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4616 break;
4617
4618 case OP_NOT_WHITESPACE:
4619 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4620 break;
4621
4622 case OP_WHITESPACE:
4623 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4624 break;
4625
4626 case OP_NOT_WORDCHAR:
4627 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4628 break;
4629
4630 case OP_WORDCHAR:
4631 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4632 break;
4633
4634 default:
4635 RRETURN(PCRE_ERROR_INTERNAL);
4636 }
4637 }
4638 }
4639 /* Control never gets here */
4640 }
4641
4642 /* If maximizing, it is worth using inline code for speed, doing the type
4643 test once at the start (i.e. keep it out of the loop). Again, keep the
4644 UTF-8 and UCP stuff separate. */
4645
4646 else
4647 {
4648 pp = eptr; /* Remember where we started */
4649
4650 #ifdef SUPPORT_UCP
4651 if (prop_type >= 0)
4652 {
4653 switch(prop_type)
4654 {
4655 case PT_ANY:
4656 for (i = min; i < max; i++)
4657 {
4658 int len = 1;
4659 if (eptr >= md->end_subject)
4660 {
4661 SCHECK_PARTIAL();
4662 break;
4663 }
4664 GETCHARLEN(c, eptr, len);
4665 if (prop_fail_result) break;
4666 eptr+= len;
4667 }
4668 break;
4669
4670 case PT_LAMP:
4671 for (i = min; i < max; i++)
4672 {
4673 int len = 1;
4674 if (eptr >= md->end_subject)
4675 {
4676 SCHECK_PARTIAL();
4677 break;
4678 }
4679 GETCHARLEN(c, eptr, len);
4680 prop_chartype = UCD_CHARTYPE(c);
4681 if ((prop_chartype == ucp_Lu ||
4682 prop_chartype == ucp_Ll ||
4683 prop_chartype == ucp_Lt) == prop_fail_result)
4684 break;
4685 eptr+= len;
4686 }
4687 break;
4688
4689 case PT_GC:
4690 for (i = min; i < max; i++)
4691 {
4692 int len = 1;
4693 if (eptr >= md->end_subject)
4694 {
4695 SCHECK_PARTIAL();
4696 break;
4697 }
4698 GETCHARLEN(c, eptr, len);
4699 prop_category = UCD_CATEGORY(c);
4700 if ((prop_category == prop_value) == prop_fail_result)
4701 break;
4702 eptr+= len;
4703 }
4704 break;
4705
4706 case PT_PC:
4707 for (i = min; i < max; i++)
4708 {
4709 int len = 1;
4710 if (eptr >= md->end_subject)
4711 {
4712 SCHECK_PARTIAL();
4713 break;
4714 }
4715 GETCHARLEN(c, eptr, len);
4716 prop_chartype = UCD_CHARTYPE(c);
4717 if ((prop_chartype == prop_value) == prop_fail_result)
4718 break;
4719 eptr+= len;
4720 }
4721 break;
4722
4723 case PT_SC:
4724 for (i = min; i < max; i++)
4725 {
4726 int len = 1;
4727 if (eptr >= md->end_subject)
4728 {
4729 SCHECK_PARTIAL();
4730 break;
4731 }
4732 GETCHARLEN(c, eptr, len);
4733 prop_script = UCD_SCRIPT(c);
4734 if ((prop_script == prop_value) == prop_fail_result)
4735 break;
4736 eptr+= len;
4737 }
4738 break;
4739
4740 case PT_ALNUM:
4741 for (i = min; i < max; i++)
4742 {
4743 int len = 1;
4744 if (eptr >= md->end_subject)
4745 {
4746 SCHECK_PARTIAL();
4747 break;
4748 }
4749 GETCHARLEN(c, eptr, len);
4750 prop_category = UCD_CATEGORY(c);
4751 if ((prop_category == ucp_L || prop_category == ucp_N)
4752 == prop_fail_result)
4753 break;
4754 eptr+= len;
4755 }
4756 break;
4757
4758 case PT_SPACE: /* Perl space */
4759 for (i = min; i < max; i++)
4760 {
4761 int len = 1;
4762 if (eptr >= md->end_subject)
4763 {
4764 SCHECK_PARTIAL();
4765 break;
4766 }
4767 GETCHARLEN(c, eptr, len);
4768 prop_category = UCD_CATEGORY(c);
4769 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4770 c == CHAR_FF || c == CHAR_CR)
4771 == prop_fail_result)
4772 break;
4773 eptr+= len;
4774 }
4775 break;
4776
4777 case PT_PXSPACE: /* POSIX space */
4778 for (i = min; i < max; i++)
4779 {
4780 int len = 1;
4781 if (eptr >= md->end_subject)
4782 {
4783 SCHECK_PARTIAL();
4784 break;
4785 }
4786 GETCHARLEN(c, eptr, len);
4787 prop_category = UCD_CATEGORY(c);
4788 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4789 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4790 == prop_fail_result)
4791 break;
4792 eptr+= len;
4793 }
4794 break;
4795
4796 case PT_WORD:
4797 for (i = min; i < max; i++)
4798 {
4799 int len = 1;
4800 if (eptr >= md->end_subject)
4801 {
4802 SCHECK_PARTIAL();
4803 break;
4804 }
4805 GETCHARLEN(c, eptr, len);
4806 prop_category = UCD_CATEGORY(c);
4807 if ((prop_category == ucp_L || prop_category == ucp_N ||
4808 c == CHAR_UNDERSCORE) == prop_fail_result)
4809 break;
4810 eptr+= len;
4811 }
4812 break;
4813
4814 default:
4815 RRETURN(PCRE_ERROR_INTERNAL);
4816 }
4817
4818 /* eptr is now past the end of the maximum run */
4819
4820 if (possessive) continue;
4821 for(;;)
4822 {
4823 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4825 if (eptr-- == pp) break; /* Stop if tried at original pos */
4826 if (utf8) BACKCHAR(eptr);
4827 }
4828 }
4829
4830 /* Match extended Unicode sequences. We will get here only if the
4831 support is in the binary; otherwise a compile-time error occurs. */
4832
4833 else if (ctype == OP_EXTUNI)
4834 {
4835 for (i = min; i < max; i++)
4836 {
4837 if (eptr >= md->end_subject)
4838 {
4839 SCHECK_PARTIAL();
4840 break;
4841 }
4842 GETCHARINCTEST(c, eptr);
4843 prop_category = UCD_CATEGORY(c);
4844 if (prop_category == ucp_M) break;
4845 while (eptr < md->end_subject)
4846 {
4847 int len = 1;
4848 if (!utf8) c = *eptr; else
4849 {
4850 GETCHARLEN(c, eptr, len);
4851 }
4852 prop_category = UCD_CATEGORY(c);
4853 if (prop_category != ucp_M) break;
4854 eptr += len;
4855 }
4856 }
4857
4858 /* eptr is now past the end of the maximum run */
4859
4860 if (possessive) continue;
4861
4862 for(;;)
4863 {
4864 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4866 if (eptr-- == pp) break; /* Stop if tried at original pos */
4867 for (;;) /* Move back over one extended */
4868 {
4869 int len = 1;
4870 if (!utf8) c = *eptr; else
4871 {
4872 BACKCHAR(eptr);
4873 GETCHARLEN(c, eptr, len);
4874 }
4875 prop_category = UCD_CATEGORY(c);
4876 if (prop_category != ucp_M) break;
4877 eptr--;
4878 }
4879 }
4880 }
4881
4882 else
4883 #endif /* SUPPORT_UCP */
4884
4885 #ifdef SUPPORT_UTF8
4886 /* UTF-8 mode */
4887
4888 if (utf8)
4889 {
4890 switch(ctype)
4891 {
4892 case OP_ANY:
4893 if (max < INT_MAX)
4894 {
4895 for (i = min; i < max; i++)
4896 {
4897 if (eptr >= md->end_subject)
4898 {
4899 SCHECK_PARTIAL();
4900 break;
4901 }
4902 if (IS_NEWLINE(eptr)) break;
4903 eptr++;
4904 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4905 }
4906 }
4907
4908 /* Handle unlimited UTF-8 repeat */
4909
4910 else
4911 {
4912 for (i = min; i < max; i++)
4913 {
4914 if (eptr >= md->end_subject)
4915 {
4916 SCHECK_PARTIAL();
4917 break;
4918 }
4919 if (IS_NEWLINE(eptr)) break;
4920 eptr++;
4921 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4922 }
4923 }
4924 break;
4925
4926 case OP_ALLANY:
4927 if (max < INT_MAX)
4928 {
4929 for (i = min; i < max; i++)
4930 {
4931 if (eptr >= md->end_subject)
4932 {
4933 SCHECK_PARTIAL();
4934 break;
4935 }
4936 eptr++;
4937 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4938 }
4939 }
4940 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4941 break;
4942
4943 /* The byte case is the same as non-UTF8 */
4944
4945 case OP_ANYBYTE:
4946 c = max - min;
4947 if (c > (unsigned int)(md->end_subject - eptr))
4948 {
4949 eptr = md->end_subject;
4950 SCHECK_PARTIAL();
4951 }
4952 else eptr += c;
4953 break;
4954
4955 case OP_ANYNL:
4956 for (i = min; i < max; i++)
4957 {
4958 int len = 1;
4959 if (eptr >= md->end_subject)
4960 {
4961 SCHECK_PARTIAL();
4962 break;
4963 }
4964 GETCHARLEN(c, eptr, len);
4965 if (c == 0x000d)
4966 {
4967 if (++eptr >= md->end_subject) break;
4968 if (*eptr == 0x000a) eptr++;
4969 }
4970 else
4971 {
4972 if (c != 0x000a &&
4973 (md->bsr_anycrlf ||
4974 (c != 0x000b && c != 0x000c &&
4975 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4976 break;
4977 eptr += len;
4978 }
4979 }
4980 break;
4981
4982 case OP_NOT_HSPACE:
4983 case OP_HSPACE:
4984 for (i = min; i < max; i++)
4985 {
4986 BOOL gotspace;
4987 int len = 1;
4988 if (eptr >= md->end_subject)
4989 {
4990 SCHECK_PARTIAL();
4991 break;
4992 }
4993 GETCHARLEN(c, eptr, len);
4994 switch(c)
4995 {
4996 default: gotspace = FALSE; break;
4997 case 0x09: /* HT */
4998 case 0x20: /* SPACE */
4999 case 0xa0: /* NBSP */
5000 case 0x1680: /* OGHAM SPACE MARK */
5001 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5002 case 0x2000: /* EN QUAD */
5003 case 0x2001: /* EM QUAD */
5004 case 0x2002: /* EN SPACE */
5005 case 0x2003: /* EM SPACE */
5006 case 0x2004: /* THREE-PER-EM SPACE */
5007 case 0x2005: /* FOUR-PER-EM SPACE */
5008 case 0x2006: /* SIX-PER-EM SPACE */
5009 case 0x2007: /* FIGURE SPACE */
5010 case 0x2008: /* PUNCTUATION SPACE */
5011 case 0x2009: /* THIN SPACE */
5012 case 0x200A: /* HAIR SPACE */
5013 case 0x202f: /* NARROW NO-BREAK SPACE */
5014 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5015 case 0x3000: /* IDEOGRAPHIC SPACE */
5016 gotspace = TRUE;
5017 break;
5018 }
5019 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5020 eptr += len;
5021 }
5022 break;
5023
5024 case OP_NOT_VSPACE:
5025 case OP_VSPACE:
5026 for (i = min; i < max; i++)
5027 {
5028 BOOL gotspace;
5029 int len = 1;
5030 if (eptr >= md->end_subject)
5031 {
5032 SCHECK_PARTIAL();
5033 break;
5034 }
5035 GETCHARLEN(c, eptr, len);
5036 switch(c)
5037 {
5038 default: gotspace = FALSE; break;
5039 case 0x0a: /* LF */
5040 case 0x0b: /* VT */
5041 case 0x0c: /* FF */
5042 case 0x0d: /* CR */
5043 case 0x85: /* NEL */
5044 case 0x2028: /* LINE SEPARATOR */
5045 case 0x2029: /* PARAGRAPH SEPARATOR */
5046 gotspace = TRUE;
5047 break;
5048 }
5049 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5050 eptr += len;
5051 }
5052 break;
5053
5054 case OP_NOT_DIGIT:
5055 for (i = min; i < max; i++)
5056 {
5057 int len = 1;
5058 if (eptr >= md->end_subject)
5059 {
5060 SCHECK_PARTIAL();
5061 break;
5062 }
5063 GETCHARLEN(c, eptr, len);
5064 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5065 eptr+= len;
5066 }
5067 break;
5068
5069 case OP_DIGIT:
5070 for (i = min; i < max; i++)
5071 {
5072 int len = 1;
5073 if (eptr >= md->end_subject)
5074 {
5075 SCHECK_PARTIAL();
5076 break;
5077 }
5078 GETCHARLEN(c, eptr, len);
5079 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5080 eptr+= len;
5081 }
5082 break;
5083
5084 case OP_NOT_WHITESPACE:
5085 for (i = min; i < max; i++)
5086 {
5087 int len = 1;
5088 if (eptr >= md->end_subject)
5089 {
5090 SCHECK_PARTIAL();
5091 break;
5092 }
5093 GETCHARLEN(c, eptr, len);
5094 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5095 eptr+= len;
5096 }
5097 break;
5098
5099 case OP_WHITESPACE:
5100 for (i = min; i < max; i++)
5101 {
5102 int len = 1;
5103 if (eptr >= md->end_subject)
5104 {
5105 SCHECK_PARTIAL();
5106 break;
5107 }
5108 GETCHARLEN(c, eptr, len);
5109 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5110 eptr+= len;
5111 }
5112 break;
5113
5114 case OP_NOT_WORDCHAR:
5115 for (i = min; i < max; i++)
5116 {
5117 int len = 1;
5118 if (eptr >= md->end_subject)
5119 {
5120 SCHECK_PARTIAL();
5121 break;
5122 }
5123 GETCHARLEN(c, eptr, len);
5124 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5125 eptr+= len;
5126 }
5127 break;
5128
5129 case OP_WORDCHAR:
5130 for (i = min; i < max; i++)
5131 {
5132 int len = 1;
5133 if (eptr >= md->end_subject)
5134 {
5135 SCHECK_PARTIAL();
5136 break;
5137 }
5138 GETCHARLEN(c, eptr, len);
5139 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5140 eptr+= len;
5141 }
5142 break;
5143
5144 default:
5145 RRETURN(PCRE_ERROR_INTERNAL);
5146 }
5147
5148 /* eptr is now past the end of the maximum run */
5149
5150 if (possessive) continue;
5151 for(;;)
5152 {
5153 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5154 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5155 if (eptr-- == pp) break; /* Stop if tried at original pos */
5156 BACKCHAR(eptr);
5157 }
5158 }
5159 else
5160 #endif /* SUPPORT_UTF8 */
5161
5162 /* Not UTF-8 mode */
5163 {
5164 switch(ctype)
5165 {
5166 case OP_ANY:
5167 for (i = min; i < max; i++)
5168 {
5169 if (eptr >= md->end_subject)
5170 {
5171 SCHECK_PARTIAL();
5172 break;
5173 }
5174 if (IS_NEWLINE(eptr)) break;
5175 eptr++;
5176 }
5177 break;
5178
5179 case OP_ALLANY:
5180 case OP_ANYBYTE:
5181 c = max - min;
5182 if (c > (unsigned int)(md->end_subject - eptr))
5183 {
5184 eptr = md->end_subject;
5185 SCHECK_PARTIAL();
5186 }
5187 else eptr += c;
5188 break;
5189
5190 case OP_ANYNL:
5191 for (i = min; i < max; i++)
5192 {
5193 if (eptr >= md->end_subject)
5194 {
5195 SCHECK_PARTIAL();
5196 break;
5197 }
5198 c = *eptr;
5199 if (c == 0x000d)
5200 {
5201 if (++eptr >= md->end_subject) break;
5202 if (*eptr == 0x000a) eptr++;
5203 }
5204 else
5205 {
5206 if (c != 0x000a &&
5207 (md->bsr_anycrlf ||
5208 (c != 0x000b && c != 0x000c && c != 0x0085)))
5209 break;
5210 eptr++;
5211 }
5212 }
5213 break;
5214
5215 case OP_NOT_HSPACE:
5216 for (i = min; i < max; i++)
5217 {
5218 if (eptr >= md->end_subject)
5219 {
5220 SCHECK_PARTIAL();
5221 break;
5222 }
5223 c = *eptr;
5224 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5225 eptr++;
5226 }
5227 break;
5228
5229 case OP_HSPACE:
5230 for (i = min; i < max; i++)
5231 {
5232 if (eptr >= md->end_subject)
5233 {
5234 SCHECK_PARTIAL();
5235 break;
5236 }
5237 c = *eptr;
5238 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5239 eptr++;
5240 }
5241 break;
5242
5243 case OP_NOT_VSPACE:
5244 for (i = min; i < max; i++)
5245 {
5246 if (eptr >= md->end_subject)
5247 {
5248 SCHECK_PARTIAL();
5249 break;
5250 }
5251 c = *eptr;
5252 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5253 break;
5254 eptr++;
5255 }
5256 break;
5257
5258 case OP_VSPACE:
5259 for (i = min; i < max; i++)
5260 {
5261 if (eptr >= md->end_subject)
5262 {
5263 SCHECK_PARTIAL();
5264 break;
5265 }
5266 c = *eptr;
5267 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5268 break;
5269 eptr++;
5270 }
5271 break;
5272
5273 case OP_NOT_DIGIT:
5274 for (i = min; i < max; i++)
5275 {
5276 if (eptr >= md->end_subject)
5277 {
5278 SCHECK_PARTIAL();
5279 break;
5280 }
5281 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5282 eptr++;
5283 }
5284 break;
5285
5286 case OP_DIGIT:
5287 for (i = min; i < max; i++)
5288 {
5289 if (eptr >= md->end_subject)
5290 {
5291 SCHECK_PARTIAL();
5292 break;
5293 }
5294 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5295 eptr++;
5296 }
5297 break;
5298
5299 case OP_NOT_WHITESPACE:
5300 for (i = min; i < max; i++)
5301 {
5302 if (eptr >= md->end_subject)
5303 {
5304 SCHECK_PARTIAL();
5305 break;
5306 }
5307 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5308 eptr++;
5309 }
5310 break;
5311
5312 case OP_WHITESPACE:
5313 for (i = min; i < max; i++)
5314 {
5315 if (eptr >= md->end_subject)
5316 {
5317 SCHECK_PARTIAL();
5318 break;
5319 }
5320 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5321 eptr++;
5322 }
5323 break;
5324
5325 case OP_NOT_WORDCHAR:
5326 for (i = min; i < max; i++)
5327 {
5328 if (eptr >= md->end_subject)
5329 {
5330 SCHECK_PARTIAL();
5331 break;
5332 }
5333 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5334 eptr++;
5335 }
5336 break;
5337
5338 case OP_WORDCHAR:
5339 for (i = min; i < max; i++)
5340 {
5341 if (eptr >= md->end_subject)
5342 {
5343 SCHECK_PARTIAL();
5344 break;
5345 }
5346 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5347 eptr++;
5348 }
5349 break;
5350
5351 default:
5352 RRETURN(PCRE_ERROR_INTERNAL);
5353 }
5354
5355 /* eptr is now past the end of the maximum run */
5356
5357 if (possessive) continue;
5358 while (eptr >= pp)
5359 {
5360 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5361 eptr--;
5362 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5363 }
5364 }
5365
5366 /* Get here if we can't make it match with any permitted repetitions */
5367
5368 MRRETURN(MATCH_NOMATCH);
5369 }
5370 /* Control never gets here */
5371
5372 /* There's been some horrible disaster. Arrival here can only mean there is
5373 something seriously wrong in the code above or the OP_xxx definitions. */
5374
5375 default:
5376 DPRINTF(("Unknown opcode %d\n", *ecode));
5377 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5378 }
5379
5380 /* Do not stick any code in here without much thought; it is assumed
5381 that "continue" in the code above comes out to here to repeat the main
5382 loop. */
5383
5384 } /* End of main loop */
5385 /* Control never reaches here */
5386
5387
5388 /* When compiling to use the heap rather than the stack for recursive calls to
5389 match(), the RRETURN() macro jumps here. The number that is saved in
5390 frame->Xwhere indicates which label we actually want to return to. */
5391
5392 #ifdef NO_RECURSE
5393 #define LBL(val) case val: goto L_RM##val;
5394 HEAP_RETURN:
5395 switch (frame->Xwhere)
5396 {
5397 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5398 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5399 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5400 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5401 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
5402 #ifdef SUPPORT_UTF8
5403 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5404 LBL(32) LBL(34) LBL(42) LBL(46)
5405 #ifdef SUPPORT_UCP
5406 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5407 #endif /* SUPPORT_UCP */
5408 #endif /* SUPPORT_UTF8 */
5409 default:
5410 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5411 return PCRE_ERROR_INTERNAL;
5412 }
5413 #undef LBL
5414 #endif /* NO_RECURSE */
5415 }
5416
5417
5418 /***************************************************************************
5419 ****************************************************************************
5420 RECURSION IN THE match() FUNCTION
5421
5422 Undefine all the macros that were defined above to handle this. */
5423
5424 #ifdef NO_RECURSE /* the names below are macros only in the NO_RECURSE build, so undefine them only there */
5425 #undef eptr /* NOTE(review): this first group looks like match()'s arguments (cf. the RMATCH calls) - confirm against the #defines */
5426 #undef ecode
5427 #undef mstart
5428 #undef offset_top
5429 #undef ims
5430 #undef eptrb
5431 #undef flags
5432
5433 #undef callpat /* presumably match()'s local working variables from here on - verify against the #defines */
5434 #undef charptr
5435 #undef data
5436 #undef next
5437 #undef pp
5438 #undef prev
5439 #undef saved_eptr
5440
5441 #undef new_recursive
5442
5443 #undef cur_is_word
5444 #undef condition
5445 #undef prev_is_word
5446
5447 #undef original_ims
5448
5449 #undef ctype
5450 #undef length
5451 #undef max
5452 #undef min
5453 #undef number
5454 #undef offset
5455 #undef op
5456 #undef save_capture_last
5457 #undef save_offset1
5458 #undef save_offset2
5459 #undef save_offset3
5460 #undef stacksave
5461
5462 #undef newptrb
5463
5464 #endif /* NO_RECURSE */
5465
5466 /* These two are defined as macros in both cases (with and without NO_RECURSE), so they are undefined unconditionally, outside the #ifdef above */
5467
5468 #undef fc
5469 #undef fi
5470
5471 /***************************************************************************
5472 ***************************************************************************/
5473
5474
5475
5476 /*************************************************
5477 * Execute a Regular Expression *
5478 *************************************************/
5479
5480 /* This function applies a compiled re to a subject string and picks out
5481 portions of the string if it matches. Two elements in the vector are set for
5482 each substring: the offsets to the start and end of the substring.
5483
5484 Arguments:
5485 argument_re points to the compiled expression
5486 extra_data points to extra data or is NULL
5487 subject points to the subject string
5488 length length of subject string (may contain binary zeros)
5489 start_offset where to start in the subject string
5490 options option bits
5491 offsets points to a vector of ints to be filled in with offsets
5492 offsetcount the number of elements in the vector
5493
5494 Returns: > 0 => success; value is the number of elements filled in
5495 = 0 => success, but offsets is not big enough
5496 -1 => failed to match
5497 < -1 => some kind of unexpected problem
5498 */
5499
5500 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5501 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5502 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5503 int offsetcount)
5504 {
5505 int rc, resetcount, ocount;
5506 int first_byte = -1;
5507 int req_byte = -1;
5508 int req_byte2 = -1;
5509 int newline;
5510 unsigned long int ims;
5511 BOOL using_temporary_offsets = FALSE;
5512 BOOL anchored;
5513 BOOL startline;
5514 BOOL firstline;
5515 BOOL first_byte_caseless = FALSE;
5516 BOOL req_byte_caseless = FALSE;
5517 BOOL utf8;
5518 match_data match_block;
5519 match_data *md = &match_block;
5520 const uschar *tables;
5521 const uschar *start_bits = NULL;
5522 USPTR start_match = (USPTR)subject + start_offset;
5523 USPTR end_subject;
5524 USPTR start_partial = NULL;
5525 USPTR req_byte_ptr = start_match - 1;
5526
5527 pcre_study_data internal_study;
5528 const pcre_study_data *study;
5529
5530 real_pcre internal_re;
5531 const real_pcre *external_re = (const real_pcre *)argument_re;
5532 const real_pcre *re = external_re;
5533
5534 /* Plausibility checks */
5535
5536 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5537 if (re == NULL || subject == NULL ||
5538 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5539 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5540
5541 /* This information is for finding all the numbers associated with a given
5542 name, for condition testing. */
5543
5544 md->name_table = (uschar *)re + re->name_table_offset;
5545 md->name_count = re->name_count;
5546 md->name_entry_size = re->name_entry_size;
5547
5548 /* Fish out the optional data from the extra_data structure, first setting
5549 the default values. */
5550
5551 study = NULL;
5552 md->match_limit = MATCH_LIMIT;
5553 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5554 md->callout_data = NULL;
5555
5556 /* The table pointer is always in native byte order. */
5557
5558 tables = external_re->tables;
5559
5560 if (extra_data != NULL)
5561 {
5562 register unsigned int flags = extra_data->flags;
5563 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5564 study = (const pcre_study_data *)extra_data->study_data;
5565 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5566 md->match_limit = extra_data->match_limit;
5567 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5568 md->match_limit_recursion = extra_data->match_limit_recursion;
5569 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5570 md->callout_data = extra_data->callout_data;
5571 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5572 }
5573
5574 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5575 is a feature that makes it possible to save compiled regexes and re-use them
5576 in other programs later. */
5577
5578 if (tables == NULL) tables = _pcre_default_tables;
5579
5580 /* Check that the first field in the block is the magic number. If it is not,
5581 test for a regex that was compiled on a host of opposite endianness. If this is
5582 the case, flipped values are put in internal_re and internal_study if there was
5583 study data too. */
5584
5585 if (re->magic_number != MAGIC_NUMBER)
5586 {
5587 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5588 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5589 if (study != NULL) study = &internal_study;
5590 }
5591
5592 /* Set up other data */
5593
5594 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5595 startline = (re->flags & PCRE_STARTLINE) != 0;
5596 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5597
5598 /* The code starts after the real_pcre block and the capture name table. */
5599
5600 md->start_code = (const uschar *)external_re + re->name_table_offset +
5601 re->name_count * re->name_entry_size;
5602
5603 md->start_subject = (USPTR)subject;
5604 md->start_offset = start_offset;
5605 md->end_subject = md->start_subject + length;
5606 end_subject = md->end_subject;
5607
5608 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5609 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5610 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5611
5612 md->notbol = (options & PCRE_NOTBOL) != 0;
5613 md->noteol = (options & PCRE_NOTEOL) != 0;
5614 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5615 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5616 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5617 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5618 md->hitend = FALSE;
5619 md->mark = NULL; /* In case never set */
5620
5621 md->recursive = NULL; /* No recursion at top level */
5622
5623 md->lcc = tables + lcc_offset;
5624 md->ctypes = tables + ctypes_offset;
5625
5626 /* Handle different \R options. */
5627
5628 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5629 {
5630 case 0:
5631 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5632 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5633 else
5634 #ifdef BSR_ANYCRLF
5635 md->bsr_anycrlf = TRUE;
5636 #else
5637 md->bsr_anycrlf = FALSE;
5638 #endif
5639 break;
5640
5641 case PCRE_BSR_ANYCRLF:
5642 md->bsr_anycrlf = TRUE;
5643 break;
5644
5645 case PCRE_BSR_UNICODE:
5646 md->bsr_anycrlf = FALSE;
5647 break;
5648
5649 default: return PCRE_ERROR_BADNEWLINE;
5650 }
5651
5652 /* Handle different types of newline. The three bits give eight cases. If
5653 nothing is set at run time, whatever was used at compile time applies. */
5654
5655 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5656 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5657 {
5658 case 0: newline = NEWLINE; break; /* Compile-time default */
5659 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5660 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5661 case PCRE_NEWLINE_CR+
5662 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5663 case PCRE_NEWLINE_ANY: newline = -1; break;
5664 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5665 default: return PCRE_ERROR_BADNEWLINE;
5666 }
5667
5668 if (newline == -2)
5669 {
5670 md->nltype = NLTYPE_ANYCRLF;
5671 }
5672 else if (newline < 0)
5673 {
5674 md->nltype = NLTYPE_ANY;
5675 }
5676 else
5677 {
5678 md->nltype = NLTYPE_FIXED;
5679 if (newline > 255)
5680 {
5681 md->nllen = 2;
5682 md->nl[0] = (newline >> 8) & 255;
5683 md->nl[1] = newline & 255;
5684 }
5685 else
5686 {
5687 md->nllen = 1;
5688 md->nl[0] = newline;
5689 }
5690 }
5691
5692 /* Partial matching was originally supported only for a restricted set of
5693 regexes; from release 8.00 there are no restrictions, but the bits are still
5694 defined (though never set). So there's no harm in leaving this code. */
5695
5696 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5697 return PCRE_ERROR_BADPARTIAL;
5698
5699 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5700 back the character offset. */
5701
5702 #ifdef SUPPORT_UTF8
5703 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5704 {
5705 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5706 return PCRE_ERROR_BADUTF8;
5707 if (start_offset > 0 && start_offset < length)
5708 {
5709 int tb = ((USPTR)subject)[start_offset];
5710 if (tb > 127)
5711 {
5712 tb &= 0xc0;
5713 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5714 }
5715 }
5716 }
5717 #endif
5718
5719 /* The ims options can vary during the matching as a result of the presence
5720 of (?ims) items in the pattern. They are kept in a local variable so that
5721 restoring at the exit of a group is easy. */
5722
5723 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5724
5725 /* If the expression has got more back references than the offsets supplied can
5726 hold, we get a temporary chunk of working store to use during the matching.
5727 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5728 of 3. */
5729
5730 ocount = offsetcount - (offsetcount % 3);
5731
5732 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5733 {
5734 ocount = re->top_backref * 3 + 3;
5735 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5736 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5737 using_temporary_offsets = TRUE;
5738 DPRINTF(("Got memory to hold back references\n"));
5739 }
5740 else md->offset_vector = offsets;
5741
5742 md->offset_end = ocount;
5743 md->offset_max = (2*ocount)/3;
5744 md->offset_overflow = FALSE;
5745 md->capture_last = -1;
5746
5747 /* Compute the minimum number of offsets that we need to reset each time. Doing
5748 this makes a huge difference to execution time when there aren't many brackets
5749 in the pattern. */
5750
5751 resetcount = 2 + re->top_bracket * 2;
5752 if (resetcount > offsetcount) resetcount = ocount;
5753
5754 /* Reset the working variables associated with each extraction. These should
5755 never be used unless previously set, but they get saved and restored, and so we
5756 initialize them to avoid reading uninitialized locations. */
5757
5758 if (md->offset_vector != NULL)
5759 {
5760 register int *iptr = md->offset_vector + ocount;
5761 register int *iend = iptr - resetcount/2 + 1;
5762 while (--iptr >= iend) *iptr = -1;
5763 }
5764
5765 /* Set up the first character to match, if available. The first_byte value is
5766 never set for an anchored regular expression, but the anchoring may be forced
5767 at run time, so we have to test for anchoring. The first char may be unset for
5768 an unanchored pattern, of course. If there's no first char and the pattern was
5769 studied, there may be a bitmap of possible first characters. */
5770
5771 if (!anchored)
5772 {
5773 if ((re->flags & PCRE_FIRSTSET) != 0)
5774 {
5775 first_byte = re->first_byte & 255;
5776 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5777 first_byte = md->lcc[first_byte];
5778 }
5779 else
5780 if (!startline && study != NULL &&
5781 (study->flags & PCRE_STUDY_MAPPED) != 0)
5782 start_bits = study->start_bits;
5783 }
5784
5785 /* For anchored or unanchored matches, there may be a "last known required
5786 character" set. */
5787
5788 if ((re->flags & PCRE_REQCHSET) != 0)
5789 {
5790 req_byte = re->req_byte & 255;
5791 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5792 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5793 }
5794
5795
5796 /* ==========================================================================*/
5797
5798 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
5799 the loop runs just once. */
5800
5801 for(;;)
5802 {
5803 USPTR save_end_subject = end_subject;
5804 USPTR new_start_match;
5805
5806 /* Reset the maximum number of extractions we might see. */
5807
5808 if (md->offset_vector != NULL)
5809 {
5810 register int *iptr = md->offset_vector;
5811 register int *iend = iptr + resetcount;
5812 while (iptr < iend) *iptr++ = -1;
5813 }
5814
5815 /* If firstline is TRUE, the start of the match is constrained to the first
5816 line of a multiline string. That is, the match must be before or at the first
5817 newline. Implement this by temporarily adjusting end_subject so that we stop
5818 scanning at a newline. If the match fails at the newline, later code breaks
5819 this loop. */
5820
5821 if (firstline)
5822 {
5823 USPTR t = start_match;
5824 #ifdef SUPPORT_UTF8
5825 if (utf8)
5826 {
5827 while (t < md->end_subject && !IS_NEWLINE(t))
5828 {
5829 t++;
5830 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5831 }
5832 }
5833 else
5834 #endif
5835 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5836 end_subject = t;
5837 }
5838
5839 /* There are some optimizations that avoid running the match if a known
5840 starting point is not found, or if a known later character is not present.
5841 However, there is an option that disables these, for testing and for ensuring
5842 that all callouts do actually occur. */
5843
5844 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5845 {
5846 /* Advance to a unique first byte if there is one. */
5847
5848 if (first_byte >= 0)
5849 {
5850 if (first_byte_caseless)
5851 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5852 start_match++;
5853 else
5854 while (start_match < end_subject && *start_match != first_byte)
5855 start_match++;
5856 }
5857
5858 /* Or to just after a linebreak for a multiline match */
5859
5860 else if (startline)
5861 {
5862 if (start_match > md->start_subject + start_offset)
5863 {
5864 #ifdef SUPPORT_UTF8
5865 if (utf8)
5866 {
5867 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5868 {
5869 start_match++;
5870 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5871 start_match++;
5872 }
5873 }
5874 else
5875 #endif
5876 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5877 start_match++;
5878
5879 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5880 and we are now at a LF, advance the match position by one more character.
5881 */
5882
5883 if (start_match[-1] == CHAR_CR &&
5884 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5885 start_match < end_subject &&
5886 *start_match == CHAR_NL)
5887 start_match++;
5888 }
5889 }
5890
5891 /* Or to a non-unique first byte after study */
5892
5893 else if (start_bits != NULL)
5894 {
5895 while (start_match < end_subject)
5896 {
5897 register unsigned int c = *start_match;
5898 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5899 else break;
5900 }
5901 }
5902 } /* Starting optimizations */
5903
5904 /* Restore fudged end_subject */
5905
5906 end_subject = save_end_subject;
5907
5908 /* The following two optimizations are disabled for partial matching or if
5909 disabling is explicitly requested. */
5910
5911 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5912 {
5913 /* If the pattern was studied, a minimum subject length may be set. This is
5914 only a lower bound; it is possible that no string of exactly that length
5915 matches the pattern. Although the value is, strictly, in characters, we treat
5916 it as bytes to avoid spending too much time in this optimization. */
5917
5918 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5919 (pcre_uint32)(end_subject - start_match) < study->minlength)
5920 {
5921 rc = MATCH_NOMATCH;
5922 break;
5923 }
5924
5925 /* If req_byte is set, we know that that character must appear in the
5926 subject for the match to succeed. If the first character is set, req_byte
5927 must be later in the subject; otherwise the test starts at the match point.
5928 This optimization can save a huge amount of backtracking in patterns with
5929 nested unlimited repeats that aren't going to match. Writing separate code
5930 for cased/caseless versions makes it go faster, as does using an
5931 autoincrement and backing off on a match.
5932
5933 HOWEVER: when the subject string is very, very long, searching to its end
5934 can take a long time, and give bad performance on quite ordinary patterns.
5935 This showed up when somebody was matching something like /^\d+C/ on a
5936 32-megabyte string... so we don't do this when the string is sufficiently
5937 long. */
5938
5939 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5940 {
5941 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5942
5943 /* We don't need to repeat the search if we haven't yet reached the
5944 place we found it at last time. */
5945
5946 if (p > req_byte_ptr)
5947 {
5948 if (req_byte_caseless)
5949 {
5950 while (p < end_subject)
5951 {
5952 register int pp = *p++;
5953 if (pp == req_byte || pp == req_byte2) { p--; break; }
5954 }
5955 }
5956 else
5957 {
5958 while (p < end_subject)
5959 {
5960 if (*p++ == req_byte) { p--; break; }
5961 }
5962 }
5963
5964 /* If we can't find the required character, break the matching loop,
5965 forcing a match failure. */
5966
5967 if (p >= end_subject)
5968 {
5969 rc = MATCH_NOMATCH;
5970 break;
5971 }
5972
5973 /* If we have found the required character, save the point where we
5974 found it, so that we don't search again next time round the loop if
5975 the start hasn't passed this character yet. */
5976
5977 req_byte_ptr = p;
5978 }
5979 }
5980 }
5981
5982 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
5983 printf(">>>> Match against: ");
5984 pchars(start_match, end_subject - start_match, TRUE, md);
5985 printf("\n");
5986 #endif
5987
5988 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5989 first starting point for which a partial match was found. */
5990
5991 md->start_match_ptr = start_match;
5992 md->start_used_ptr = start_match;
5993 md->match_call_count = 0;
5994 rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
5995 0, 0);
5996 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5997
5998 switch(rc)
5999 {
6000 /* NOMATCH and PRUNE advance by one character. If MATCH_SKIP_ARG reaches
6001 this level it means that a MARK that matched the SKIP's arg was not found.
6002 We treat this as NOMATCH. THEN at this level acts exactly like PRUNE. */
6003
6004 case MATCH_NOMATCH:
6005 case MATCH_PRUNE:
6006 case MATCH_SKIP_ARG:
6007 case MATCH_THEN:
6008 new_start_match = start_match + 1;
6009 #ifdef SUPPORT_UTF8
6010 if (utf8)
6011 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6012 new_start_match++;
6013 #endif
6014 break;
6015
6016 /* SKIP passes back the next starting point explicitly. */
6017
6018 case MATCH_SKIP:
6019 new_start_match = md->start_match_ptr;
6020 break;
6021
6022 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6023
6024 case MATCH_COMMIT:
6025 rc = MATCH_NOMATCH;
6026 goto ENDLOOP;
6027
6028 /* Any other return is either a match, or some kind of error. */
6029
6030 default:
6031 goto ENDLOOP;
6032 }
6033
6034 /* Control reaches here for the various types of "no match at this point"
6035 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6036
6037 rc = MATCH_NOMATCH;
6038
6039 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6040 newline in the subject (though it may continue over the newline). Therefore,
6041 if we have just failed to match, starting at a newline, do not continue. */
6042
6043 if (firstline && IS_NEWLINE(start_match)) break;
6044
6045 /* Advance to new matching position */
6046
6047 start_match = new_start_match;
6048
6049 /* Break the loop if the pattern is anchored or if we have passed the end of
6050 the subject. */
6051
6052 if (anchored || start_match > end_subject) break;
6053
6054 /* If we have just passed a CR and we are now at a LF, and the pattern does
6055 not contain any explicit matches for \r or \n, and the newline option is CRLF
6056 or ANY or ANYCRLF, advance the match position by one more character. */
6057
6058 if (start_match[-1] == CHAR_CR &&
6059 start_match < end_subject &&
6060 *start_match == CHAR_NL &&
6061 (re->flags & PCRE_HASCRORLF) == 0 &&
6062 (md->nltype == NLTYPE_ANY ||
6063 md->nltype == NLTYPE_ANYCRLF ||
6064 md->nllen == 2))
6065 start_match++;
6066
6067 md->mark = NULL; /* Reset for start of next match attempt */
6068 } /* End of for(;;) "bumpalong" loop */
6069
6070 /* ==========================================================================*/
6071
6072 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6073 conditions is true:
6074
6075 (1) The pattern is anchored or the match was failed by (*COMMIT);
6076
6077 (2) We are past the end of the subject;
6078
6079 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6080 this option requests that a match occur at or before the first newline in
6081 the subject.
6082
6083 When we have a match and the offset vector is big enough to deal with any
6084 backreferences, captured substring offsets will already be set up. In the case
6085 where we had to get some local store to hold offsets for backreference
6086 processing, copy those that we can. In this case there need not be overflow if
6087 certain parts of the pattern were not used, even though there are more
6088 capturing parentheses than vector slots. */
6089
6090 ENDLOOP:
6091
6092 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6093 {
6094 if (using_temporary_offsets)
6095 {
6096 if (offsetcount >= 4)
6097 {
6098 memcpy(offsets + 2, md->offset_vector + 2,
6099 (offsetcount - 2) * sizeof(int));
6100 DPRINTF(("Copied offsets from temporary memory\n"));
6101 }
6102 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6103 DPRINTF(("Freeing temporary memory\n"));
6104 (pcre_free)(md->offset_vector);
6105 }
6106
6107 /* Set the return code to the number of captured strings, or 0 if there are
6108 too many to fit into the vector. */
6109
6110 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6111
6112 /* If there is space, set up the whole thing as substring 0. The value of
6113 md->start_match_ptr might be modified if \K was encountered on the successful
6114 matching path. */
6115
6116 if (offsetcount < 2) rc = 0; else
6117 {
6118 offsets[0] = md->start_match_ptr - md->start_subject;
6119 offsets[1] = md->end_match_ptr - md->start_subject;
6120 }
6121
6122 DPRINTF((">>>> returning %d\n", rc));
6123 goto RETURN_MARK;
6124 }
6125
6126 /* Control gets here if there has been an error, or if the overall match
6127 attempt has failed at all permitted starting positions. */
6128
6129 if (using_temporary_offsets)
6130 {
6131 DPRINTF(("Freeing temporary memory\n"));
6132 (pcre_free)(md->offset_vector);
6133 }
6134
6135 /* For anything other than nomatch or partial match, just return the code. */
6136
6137 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6138 {
6139 DPRINTF((">>>> error: returning %d\n", rc));
6140 return rc;
6141 }
6142
6143 /* Handle partial matches - disable any mark data */
6144
6145 if (start_partial != NULL)
6146 {
6147 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6148 md->mark = NULL;
6149 if (offsetcount > 1)
6150 {
6151 offsets[0] = start_partial - (USPTR)subject;
6152 offsets[1] = end_subject - (USPTR)subject;
6153 }
6154 rc = PCRE_ERROR_PARTIAL;
6155 }
6156
6157 /* This is the classic nomatch case */
6158
6159 else
6160 {
6161 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6162 rc = PCRE_ERROR_NOMATCH;
6163 }
6164
6165 /* Return the MARK data if it has been requested. */
6166
6167 RETURN_MARK:
6168
6169 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6170 *(extra_data->mark) = (unsigned char *)(md->mark);
6171 return rc;
6172 }
6173
6174 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12