/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1513 - (show annotations) (download)
Wed Nov 19 20:57:13 2014 UTC (4 weeks, 1 day ago) by ph10
File MIME type: text/plain
File size: 218189 byte(s)
Fix zero-repeat assertion condition bug.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
/* CAPLMASK and OVFLMASK are complementary 16-bit halves of a 32-bit value;
OVFLBIT (0x00010000) is the lowest bit inside the OVFLMASK half. */
69 #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70 #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71 #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72
73 /* Values for setting in md->match_function_type to indicate two special types
74 of call to match(). We do it this way to save on using another stack variable,
75 as stack usage is to be discouraged. */
76
77 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
79
80 /* Non-error returns from the match() function. Error returns are externally
81 defined PCRE_ERROR_xxx codes, which are all negative. */
82
83 #define MATCH_MATCH 1
84 #define MATCH_NOMATCH 0
85
86 /* Special internal returns from the match() function. Make them sufficiently
87 negative to avoid the external error codes. The eight values below are
consecutive, running from -999 up to -992. */
88
89 #define MATCH_ACCEPT (-999)
90 #define MATCH_KETRPOS (-998)
91 #define MATCH_ONCE (-997)
92 /* The next 5 must be kept together and in sequence so that a test that checks
93 for any one of them can use a range. That range is MATCH_BACKTRACK_MIN (-996)
to MATCH_BACKTRACK_MAX (-992) inclusive, as defined at the end of this list. */
94 #define MATCH_COMMIT (-996)
95 #define MATCH_PRUNE (-995)
96 #define MATCH_SKIP (-994)
97 #define MATCH_SKIP_ARG (-993)
98 #define MATCH_THEN (-992)
99 #define MATCH_BACKTRACK_MAX MATCH_THEN
100 #define MATCH_BACKTRACK_MIN MATCH_COMMIT
101
102 /* Maximum number of ints of offset to save on the stack for recursive calls.
103 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104 because the offset vector is always a multiple of 3 long. */
105
106 #define REC_STACK_SAVE_MAX 30
107
108 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
109
/* Two parallel 11-entry tables. NOTE(review): presumably both are indexed by
the same repeat-opcode offset; the lookup is not in this part of the file, so
confirm against the use sites before changing the entry order or count. */
110 static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
111 static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf;
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
199 if (utf)
200 {
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions code have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
208 wrong). */
209
210 PCRE_PUCHAR endptr = p + length;
211 while (p < endptr)
212 {
213 pcre_uint32 c, d;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
216 GETCHARINC(c, eptr);
217 GETCHARINC(d, p);
218 ur = GET_UCD(d);
219 if (c != d && c != d + ur->other_case)
220 {
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
222 for (;;)
223 {
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
226 }
227 }
228 }
229 }
230 else
231 #endif
232
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
235 {
236 while (length-- > 0)
237 {
238 pcre_uint32 cc, cp;
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = UCHAR21TEST(eptr);
241 cp = UCHAR21TEST(p);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
243 p++;
244 eptr++;
245 }
246 }
247 }
248
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
251
252 else
253 {
254 while (length-- > 0)
255 {
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
258 }
259 }
260
261 return (int)(eptr - eptr_start);
262 }
263
264
265
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
269
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
275 fine.
276
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
281
282 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283 calls by keeping local variables that need to be preserved in blocks of memory
284 obtained from malloc() instead of on the stack. Macros are used to
285 achieve this so that the actual code doesn't look very different to what it
286 always used to.
287
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
294
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
303
304 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
305 below must be updated in sync. */
306
/* RM1 is explicitly 1, so the identifiers take the values 1 to 67 in order.
In the heap-frame (NO_RECURSE) build, RMATCH stores its "rw" argument in
frame->Xwhere and also pastes it into a label name (L_##rw), so each
identifier must be used by exactly one RMATCH call. */
307 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
308 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
309 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
310 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
311 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
312 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
313 RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
314
315 /* These versions of the macros use the stack, as normal. There are debugging
316 versions and production versions. Note that the "rw" argument of RMATCH isn't
317 actually used in this definition. */
318
319 #ifndef NO_RECURSE
320 #define REGISTER register
321
322 #ifdef PCRE_DEBUG
/* Debugging versions: both macros expand to a braced block that traces the
call or return with printf. RMATCH leaves the result of the recursive match()
call in "rrc"; "mstart" and "rdepth" are taken from the scope in which the
macro is expanded. */
323 #define RMATCH(ra,rb,rc,rd,re,rw) \
324 { \
325 printf("match() called in line %d\n", __LINE__); \
326 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
327 printf("to line %d\n", __LINE__); \
328 }
329 #define RRETURN(ra) \
330 { \
331 printf("match() returned %d from line %d\n", ra, __LINE__); \
332 return ra; \
333 }
334 #else
/* Production versions: RMATCH is a bare assignment to "rrc" and RRETURN is a
bare return statement (no enclosing braces and no trailing semicolon). */
335 #define RMATCH(ra,rb,rc,rd,re,rw) \
336 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
337 #define RRETURN(ra) return ra
338 #endif
339
340 #else
341
342
343 /* These versions of the macros manage a private stack on the heap. Note that
344 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345 argument of match(), which never changes. */
346
347 #define REGISTER
348
/* RMATCH (heap version): simulate a recursive call of match().
  - The frame already linked at frame->Xnextframe is reused if there is one;
    otherwise a new one is obtained through PUBL(stack_malloc) and linked in.
    If that allocation fails, PCRE_ERROR_NOMEMORY is returned via RRETURN.
  - "rw" is recorded in the current frame's Xwhere, and is also pasted into
    the label L_<rw> that ends the macro.
  - The new frame receives ra -> Xeptr, rb -> Xecode, rc -> Xoffset_top,
    re -> Xeptrb, the current mstart, and Xrdepth + 1. "rd" is unused.
  - Control then jumps to HEAP_RECURSE with "frame" pointing at the new
    frame. */
349 #define RMATCH(ra,rb,rc,rd,re,rw)\
350 {\
351 heapframe *newframe = frame->Xnextframe;\
352 if (newframe == NULL)\
353 {\
354 newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
355 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
356 newframe->Xnextframe = NULL;\
357 frame->Xnextframe = newframe;\
358 }\
359 frame->Xwhere = rw;\
360 newframe->Xeptr = ra;\
361 newframe->Xecode = rb;\
362 newframe->Xmstart = mstart;\
363 newframe->Xoffset_top = rc;\
364 newframe->Xeptrb = re;\
365 newframe->Xrdepth = frame->Xrdepth + 1;\
366 newframe->Xprevframe = frame;\
367 frame = newframe;\
368 DPRINTF(("restarting from line %d\n", __LINE__));\
369 goto HEAP_RECURSE;\
370 L_##rw:\
371 DPRINTF(("jumped back to line %d\n", __LINE__));\
372 }
373
/* RRETURN (heap version): step back to the previous frame. If there is one,
the result is left in "rrc" and control jumps to HEAP_RETURN; if Xprevframe is
NULL this is the outermost frame and the value is returned from match()
itself. The frame being left is not freed here; it stays linked through the
previous frame's Xnextframe, which is where RMATCH looks for a frame to
reuse. */
374 #define RRETURN(ra)\
375 {\
376 heapframe *oldframe = frame;\
377 frame = oldframe->Xprevframe;\
378 if (frame != NULL)\
379 {\
380 rrc = ra;\
381 goto HEAP_RETURN;\
382 }\
383 return ra;\
384 }
385
386
387 /* Structure for remembering the local variables in a private frame */
388
/* One heapframe holds the state of one simulated invocation of match() in the
NO_RECURSE build. Inside match(), macros map the un-prefixed names (eptr,
ecode, min, max, ...) onto the X-prefixed fields of the current frame. */
389 typedef struct heapframe {
/* Doubly linked chain of frames: RMATCH sets Xprevframe on the frame it
enters, and Xnextframe caches the next frame so that it can be reused. */
390 struct heapframe *Xprevframe;
391 struct heapframe *Xnextframe;
392
393 /* Function arguments that may change */
394
395 PCRE_PUCHAR Xeptr;
396 const pcre_uchar *Xecode;
397 PCRE_PUCHAR Xmstart;
398 int Xoffset_top;
399 eptrblock *Xeptrb;
400 unsigned int Xrdepth;
401
402 /* Function local variables */
403
404 PCRE_PUCHAR Xcallpat;
405 #ifdef SUPPORT_UTF
406 PCRE_PUCHAR Xcharptr;
407 #endif
408 PCRE_PUCHAR Xdata;
409 PCRE_PUCHAR Xnext;
410 PCRE_PUCHAR Xpp;
411 PCRE_PUCHAR Xprev;
412 PCRE_PUCHAR Xsaved_eptr;
413
414 recursion_info Xnew_recursive;
415
416 BOOL Xcur_is_word;
417 BOOL Xcondition;
418 BOOL Xprev_is_word;
419
420 #ifdef SUPPORT_UCP
421 int Xprop_type;
422 unsigned int Xprop_value;
423 int Xprop_fail_result;
424 int Xoclength;
425 pcre_uchar Xocchars[6];
426 #endif
427
428 int Xcodelink;
429 int Xctype;
430 unsigned int Xfc;
431 int Xfi;
432 int Xlength;
433 int Xmax;
434 int Xmin;
435 unsigned int Xnumber;
436 int Xoffset;
437 unsigned int Xop;
438 pcre_int32 Xsave_capture_last;
439 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440 int Xstacksave[REC_STACK_SAVE_MAX];
441
442 eptrblock Xnewptrb;
443
444 /* Where to jump back to: RMATCH stores its "rw" argument (an RMn value)
here before switching to the new frame. */
445
446 int Xwhere;
447
448 } heapframe;
449
450 #endif
451
452
453 /***************************************************************************
454 ***************************************************************************/
455
456
457
458 /*************************************************
459 * Match from current position *
460 *************************************************/
461
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
464 same response. */
465
466 /* These macros pack up tests that are used for partial matching, and which
467 appear several times in the code. We set the "hit end" flag if the pointer is
468 at the end of the subject and also past the start of the subject (i.e.
469 something has been matched). For hard partial matching, we then return
470 immediately. The second one is used when we already know we are past the end of
471 the subject. */
472
/* Both macros read "md" and "eptr" from the scope in which they are expanded
and may leave the function through RRETURN. Each expands to a bare "if"
statement (not a do { } while (0) wrapper), so neither may be used as the
unbraced body of an "if" that has an "else". */
473 #define CHECK_PARTIAL()\
474 if (md->partial != 0 && eptr >= md->end_subject && \
475 eptr > md->start_used_ptr) \
476 { \
477 md->hitend = TRUE; \
478 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
479 }
480
/* SCHECK_PARTIAL is the same test without the eptr >= md->end_subject
condition. */
481 #define SCHECK_PARTIAL()\
482 if (md->partial != 0 && eptr > md->start_used_ptr) \
483 { \
484 md->hitend = TRUE; \
485 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
486 }
487
488
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
493
494 Arguments:
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
498 by encountering \K)
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
504
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
510 */
511
512 static int
513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
515 unsigned int rdepth)
516 {
517 /* These variables do not need to be preserved over recursion in this function,
518 so they can be ordinary variables in all cases. Mark some of them with
519 "register" because they are used a lot in loops. */
520
521 register int rrc; /* Returns from recursive calls */
522 register int i; /* Used for loops not involving calls to RMATCH() */
523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
524 register BOOL utf; /* Local copy of UTF flag for speed */
525
526 BOOL minimize, possessive; /* Quantifier options */
527 BOOL caseless;
528 int condcode;
529
530 /* When recursion is not being used, all "local" variables that have to be
531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532 frame on the stack here; subsequent instantiations are obtained from the heap
533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534 the top-level on the stack rather than malloc-ing them all gives a performance
535 boost in many cases where there is not much "recursion". */
536
537 #ifdef NO_RECURSE
538 heapframe *frame = (heapframe *)md->match_frames_base;
539
540 /* Copy in the original argument variables */
541
542 frame->Xeptr = eptr;
543 frame->Xecode = ecode;
544 frame->Xmstart = mstart;
545 frame->Xoffset_top = offset_top;
546 frame->Xeptrb = eptrb;
547 frame->Xrdepth = rdepth;
548
549 /* This is where control jumps back to to effect "recursion" */
550
551 HEAP_RECURSE:
552
553 /* Macros make the argument variables come from the current frame */
554
555 #define eptr frame->Xeptr
556 #define ecode frame->Xecode
557 #define mstart frame->Xmstart
558 #define offset_top frame->Xoffset_top
559 #define eptrb frame->Xeptrb
560 #define rdepth frame->Xrdepth
561
562 /* Ditto for the local variables */
563
564 #ifdef SUPPORT_UTF
565 #define charptr frame->Xcharptr
566 #endif
567 #define callpat frame->Xcallpat
568 #define codelink frame->Xcodelink
569 #define data frame->Xdata
570 #define next frame->Xnext
571 #define pp frame->Xpp
572 #define prev frame->Xprev
573 #define saved_eptr frame->Xsaved_eptr
574
575 #define new_recursive frame->Xnew_recursive
576
577 #define cur_is_word frame->Xcur_is_word
578 #define condition frame->Xcondition
579 #define prev_is_word frame->Xprev_is_word
580
581 #ifdef SUPPORT_UCP
582 #define prop_type frame->Xprop_type
583 #define prop_value frame->Xprop_value
584 #define prop_fail_result frame->Xprop_fail_result
585 #define oclength frame->Xoclength
586 #define occhars frame->Xocchars
587 #endif
588
589 #define ctype frame->Xctype
590 #define fc frame->Xfc
591 #define fi frame->Xfi
592 #define length frame->Xlength
593 #define max frame->Xmax
594 #define min frame->Xmin
595 #define number frame->Xnumber
596 #define offset frame->Xoffset
597 #define op frame->Xop
598 #define save_capture_last frame->Xsave_capture_last
599 #define save_offset1 frame->Xsave_offset1
600 #define save_offset2 frame->Xsave_offset2
601 #define save_offset3 frame->Xsave_offset3
602 #define stacksave frame->Xstacksave
603
604 #define newptrb frame->Xnewptrb
605
606 /* When recursion is being used, local variables are allocated on the stack and
607 get preserved during recursion in the normal way. In this environment, fi and
608 i, and fc and c, can be the same variables. */
609
610 #else /* NO_RECURSE not defined */
611 #define fi i
612 #define fc c
613
614 /* Many of the following variables are used only in small blocks of the code.
615 My normal style of coding would have declared them within each of those blocks.
616 However, in order to accommodate the version of this code that uses an external
617 "stack" implemented on the heap, it is easier to declare them all here, so the
618 declarations can be cut out in a block. The only declarations within blocks
619 below are for variables that do not have to be preserved over a recursive call
620 to RMATCH(). */
621
622 #ifdef SUPPORT_UTF
623 const pcre_uchar *charptr;
624 #endif
625 const pcre_uchar *callpat;
626 const pcre_uchar *data;
627 const pcre_uchar *next;
628 PCRE_PUCHAR pp;
629 const pcre_uchar *prev;
630 PCRE_PUCHAR saved_eptr;
631
632 recursion_info new_recursive;
633
634 BOOL cur_is_word;
635 BOOL condition;
636 BOOL prev_is_word;
637
638 #ifdef SUPPORT_UCP
639 int prop_type;
640 unsigned int prop_value;
641 int prop_fail_result;
642 int oclength;
643 pcre_uchar occhars[6];
644 #endif
645
646 int codelink;
647 int ctype;
648 int length;
649 int max;
650 int min;
651 unsigned int number;
652 int offset;
653 unsigned int op;
654 pcre_int32 save_capture_last;
655 int save_offset1, save_offset2, save_offset3;
656 int stacksave[REC_STACK_SAVE_MAX];
657
658 eptrblock newptrb;
659
660 /* There is a special fudge for calling match() in a way that causes it to
661 measure the size of its basic stack frame when the stack is being used for
662 recursion. The second argument (ecode) being NULL triggers this behaviour. It
663 cannot normally ever be NULL. The return is the negated value of the frame
664 size. */
665
666 if (ecode == NULL)
667 {
668 if (rdepth == 0)
669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
670 else
671 {
672 int len = (char *)&rdepth - (char *)eptr;
673 return (len > 0)? -len : len;
674 }
675 }
676 #endif /* NO_RECURSE */
677
678 /* To save space on the stack and in the heap frame, I have doubled up on some
679 of the local variables that are used only in localised parts of the code, but
680 still need to be preserved over recursive calls of match(). These macros define
681 the alternative names that are used. */
682
683 #define allow_zero cur_is_word
684 #define cbegroup condition
685 #define code_offset codelink
686 #define condassert condition
687 #define matched_once prev_is_word
688 #define foc number
689 #define save_mark data
690
691 /* These statements are here to stop the compiler complaining about uninitialized
692 variables. */
693
694 #ifdef SUPPORT_UCP
695 prop_value = 0;
696 prop_fail_result = 0;
697 #endif
698
699
700 /* This label is used for tail recursion, which is used in a few cases even
701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702 used. Thanks to Ian Taylor for noticing this possibility and sending the
703 original patch. */
704
705 TAIL_RECURSE:
706
707 /* OK, now we can get on with the real code of the function. Recursive calls
708 are specified by the macro RMATCH and RRETURN is used to return. When
709 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711 defined). However, RMATCH isn't like a function call because it's quite a
712 complicated macro. It has to be used in one particular way. This shouldn't,
713 however, impact performance when true recursion is being used. */
714
715 #ifdef SUPPORT_UTF
716 utf = md->utf; /* Local copy of the flag */
717 #else
718 utf = FALSE;
719 #endif
720
721 /* First check that we haven't called match() too many times, or that we
722 haven't exceeded the recursive call limit. */
723
724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
726
727 /* At the start of a group with an unlimited repeat that may match an empty
728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729 done this way to save having to use another function argument, which would take
730 up space on the stack. See also MATCH_CONDASSERT below.
731
732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733 such remembered pointers, to be checked when we hit the closing ket, in order
734 to break infinite loops that match no characters. When match() is called in
735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736 NOT be used with tail recursion, because the memory block that is used is on
737 the stack, so a new one may be required for each match(). */
738
739 if (md->match_function_type == MATCH_CBEGROUP)
740 {
741 newptrb.epb_saved_eptr = eptr;
742 newptrb.epb_prev = eptrb;
743 eptrb = &newptrb;
744 md->match_function_type = 0;
745 }
746
747 /* Now start processing the opcodes. */
748
749 for (;;)
750 {
751 minimize = possessive = FALSE;
752 op = *ecode;
753
754 switch(op)
755 {
756 case OP_MARK:
757 md->nomatch_mark = ecode + 2;
758 md->mark = NULL; /* In case previously set by assertion */
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
760 eptrb, RM55);
761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762 md->mark == NULL) md->mark = ecode + 2;
763
764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765 argument, and we must check whether that argument matches this MARK's
766 argument. It is passed back in md->start_match_ptr (an overloading of that
767 variable). If it does match, we reset that variable to the current subject
768 position and return MATCH_SKIP. Otherwise, pass back the return code
769 unaltered. */
770
771 else if (rrc == MATCH_SKIP_ARG &&
772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
773 {
774 md->start_match_ptr = eptr;
775 RRETURN(MATCH_SKIP);
776 }
777 RRETURN(rrc);
778
779 case OP_FAIL:
780 RRETURN(MATCH_NOMATCH);
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 RRETURN(MATCH_COMMIT);
787
788 case OP_PRUNE:
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790 eptrb, RM51);
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_PRUNE);
793
794 case OP_PRUNE_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
798 eptrb, RM56);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 RRETURN(MATCH_PRUNE);
803
804 case OP_SKIP:
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
806 eptrb, RM53);
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 md->start_match_ptr = eptr; /* Pass back current position */
809 RRETURN(MATCH_SKIP);
810
811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814 that failed and any that precede it (either they also failed, or were not
815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817 set to the count of the one that failed. */
818
819 case OP_SKIP_ARG:
820 md->skip_arg_count++;
821 if (md->skip_arg_count <= md->ignore_skip_arg)
822 {
823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
824 break;
825 }
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
827 eptrb, RM57);
828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
829
830 /* Pass back the current skip name by overloading md->start_match_ptr and
831 returning the special MATCH_SKIP_ARG return code. This will either be
832 caught by a matching MARK, or get to the top, where it causes a rematch
833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
834
835 md->start_match_ptr = ecode + 2;
836 RRETURN(MATCH_SKIP_ARG);
837
838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839 the branch in which it occurs can be determined. Overload the start of
840 match pointer to do this. */
841
842 case OP_THEN:
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
844 eptrb, RM54);
845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846 md->start_match_ptr = ecode;
847 RRETURN(MATCH_THEN);
848
849 case OP_THEN_ARG:
850 md->nomatch_mark = ecode + 2;
851 md->mark = NULL; /* In case previously set by assertion */
852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
853 md, eptrb, RM58);
854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855 md->mark == NULL) md->mark = ecode + 2;
856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857 md->start_match_ptr = ecode;
858 RRETURN(MATCH_THEN);
859
860 /* Handle an atomic group that does not contain any capturing parentheses.
861 This can be handled like an assertion. Prior to 8.13, all atomic groups
862 were handled this way. In 8.13, the code was changed as below for ONCE, so
863 that backups pass through the group and thereby reset captured values.
864 However, this uses a lot more stack, so in 8.20, atomic groups that do not
865 contain any captures generate OP_ONCE_NC, which can be handled in the old,
866 less stack intensive way.
867
868 Check the alternative branches in turn - the matching won't pass the KET
869 for this kind of subpattern. If any one branch matches, we carry on as at
870 the end of a normal bracket, leaving the subject pointer, but resetting
871 the start-of-match value in case it was changed by \K. */
872
873 case OP_ONCE_NC:
874 prev = ecode;
875 saved_eptr = eptr;
876 save_mark = md->mark;
877 do
878 {
879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
881 {
882 mstart = md->start_match_ptr;
883 break;
884 }
885 if (rrc == MATCH_THEN)
886 {
887 next = ecode + GET(ecode,1);
888 if (md->start_match_ptr < next &&
889 (*ecode == OP_ALT || *next == OP_ALT))
890 rrc = MATCH_NOMATCH;
891 }
892
893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 ecode += GET(ecode,1);
895 md->mark = save_mark;
896 }
897 while (*ecode == OP_ALT);
898
899 /* If we hit the end of the group (which could be repeated), fail */
900
901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
902
903 /* Continue as from after the group, updating the offsets high water
904 mark, since extracts may have been taken. */
905
906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
907
908 offset_top = md->end_offset_top;
909 eptr = md->end_match_ptr;
910
911 /* For a non-repeating ket, just continue at this level. This also
912 happens for a repeating ket if no characters were matched in the group.
913 This is the forcible breaking of infinite loops as implemented in Perl
914 5.005. */
915
916 if (*ecode == OP_KET || eptr == saved_eptr)
917 {
918 ecode += 1+LINK_SIZE;
919 break;
920 }
921
922 /* The repeating kets try the rest of the pattern or restart from the
923 preceding bracket, in the appropriate order. The second "call" of match()
924 uses tail recursion, to avoid using another stack frame. */
925
926 if (*ecode == OP_KETRMIN)
927 {
928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
930 ecode = prev;
931 goto TAIL_RECURSE;
932 }
933 else /* OP_KETRMAX */
934 {
935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937 ecode += 1 + LINK_SIZE;
938 goto TAIL_RECURSE;
939 }
940 /* Control never gets here */
941
942 /* Handle a capturing bracket, other than those that are possessive with an
943 unlimited repeat. If there is space in the offset vector, save the current
944 subject position in the working slot at the top of the vector. We mustn't
945 change the current values of the data slot, because they may be set from a
946 previous iteration of this group, and be referred to by a reference inside
947 the group. A failure to match might occur after the group has succeeded,
948 if something later on doesn't match. For this reason, we need to restore
949 the working value and also the values of the final offsets, in case they
950 were set by a previous iteration of the same bracket.
951
952 If there isn't enough space in the offset vector, treat this as if it were
953 a non-capturing bracket. Don't worry about setting the flag for the error
954 case here; that is handled in the code for KET. */
955
956 case OP_CBRA:
957 case OP_SCBRA:
958 number = GET2(ecode, 1+LINK_SIZE);
959 offset = number << 1;
960
961 #ifdef PCRE_DEBUG
962 printf("start bracket %d\n", number);
963 printf("subject=");
964 pchars(eptr, 16, TRUE, md);
965 printf("\n");
966 #endif
967
968 if (offset < md->offset_max)
969 {
970 save_offset1 = md->offset_vector[offset];
971 save_offset2 = md->offset_vector[offset+1];
972 save_offset3 = md->offset_vector[md->offset_end - number];
973 save_capture_last = md->capture_last;
974 save_mark = md->mark;
975
976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977 md->offset_vector[md->offset_end - number] =
978 (int)(eptr - md->start_subject);
979
980 for (;;)
981 {
982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
984 eptrb, RM1);
985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
986
987 /* If we backed up to a THEN, check whether it is within the current
988 branch by comparing the address of the THEN that is passed back with
989 the end of the branch. If it is within the current branch, and the
990 branch is one of two or more alternatives (it either starts or ends
991 with OP_ALT), we have reached the limit of THEN's action, so convert
992 the return code to NOMATCH, which will cause normal backtracking to
993 happen from now on. Otherwise, THEN is passed back to an outer
994 alternative. This implements Perl's treatment of parenthesized groups,
995 where a group not containing | does not affect the current alternative,
996 that is, (X) is NOT the same as (X|(*F)). */
997
998 if (rrc == MATCH_THEN)
999 {
1000 next = ecode + GET(ecode,1);
1001 if (md->start_match_ptr < next &&
1002 (*ecode == OP_ALT || *next == OP_ALT))
1003 rrc = MATCH_NOMATCH;
1004 }
1005
1006 /* Anything other than NOMATCH is passed back. */
1007
1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009 md->capture_last = save_capture_last;
1010 ecode += GET(ecode, 1);
1011 md->mark = save_mark;
1012 if (*ecode != OP_ALT) break;
1013 }
1014
1015 DPRINTF(("bracket %d failed\n", number));
1016 md->offset_vector[offset] = save_offset1;
1017 md->offset_vector[offset+1] = save_offset2;
1018 md->offset_vector[md->offset_end - number] = save_offset3;
1019
1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1021
1022 RRETURN(rrc);
1023 }
1024
1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026 as a non-capturing bracket. */
1027
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030
1031 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1032
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035
1036 /* Non-capturing or atomic group, except for possessive with unlimited
1037 repeat and ONCE group with no captures. Loop for all the alternatives.
1038
1039 When we get to the final alternative within the brackets, we used to return
1040 the result of a recursive call to match() whatever happened so it was
1041 possible to reduce stack usage by turning this into a tail recursion,
1042 except in the case of a possibly empty group. However, now that there is
1043 the possibility of (*THEN) occurring in the final alternative, this
1044 optimization is no longer always possible.
1045
1046 We can optimize if we know there are no (*THEN)s in the pattern; at present
1047 this is the best that can be done.
1048
1049 MATCH_ONCE is returned when the end of an atomic group is successfully
1050 reached, but subsequent matching fails. It passes back up the tree (causing
1051 captured values to be reset) until the original atomic group level is
1052 reached. This is tested by comparing md->once_target with the start of the
1053 group. At this point, the return is converted into MATCH_NOMATCH so that
1054 previous backup points can be taken. */
1055
1056 case OP_ONCE:
1057 case OP_BRA:
1058 case OP_SBRA:
1059 DPRINTF(("start non-capturing bracket\n"));
1060
1061 for (;;)
1062 {
1063 if (op >= OP_SBRA || op == OP_ONCE)
1064 md->match_function_type = MATCH_CBEGROUP;
1065
1066 /* If this is not a possibly empty group, and there are no (*THEN)s in
1067 the pattern, and this is the final alternative, optimize as described
1068 above. */
1069
1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1071 {
1072 ecode += PRIV(OP_lengths)[*ecode];
1073 goto TAIL_RECURSE;
1074 }
1075
1076 /* In all other cases, we have to make another call to match(). */
1077
1078 save_mark = md->mark;
1079 save_capture_last = md->capture_last;
1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1081 RM2);
1082
1083 /* See comment in the code for capturing groups above about handling
1084 THEN. */
1085
1086 if (rrc == MATCH_THEN)
1087 {
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1092 }
1093
1094 if (rrc != MATCH_NOMATCH)
1095 {
1096 if (rrc == MATCH_ONCE)
1097 {
1098 const pcre_uchar *scode = ecode;
1099 if (*scode != OP_ONCE) /* If not at start, find it */
1100 {
1101 while (*scode == OP_ALT) scode += GET(scode, 1);
1102 scode -= GET(scode, 1);
1103 }
1104 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1105 }
1106 RRETURN(rrc);
1107 }
1108 ecode += GET(ecode, 1);
1109 md->mark = save_mark;
1110 if (*ecode != OP_ALT) break;
1111 md->capture_last = save_capture_last;
1112 }
1113
1114 RRETURN(MATCH_NOMATCH);
1115
1116 /* Handle possessive capturing brackets with an unlimited repeat. We come
1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118 handled similarly to the normal case above. However, the matching is
1119 different. The end of these brackets will always be OP_KETRPOS, which
1120 returns MATCH_KETRPOS without going further in the pattern. By this means
1121 we can handle the group by iteration rather than recursion, thereby
1122 reducing the amount of stack needed. */
1123
1124 case OP_CBRAPOS:
1125 case OP_SCBRAPOS:
1126 allow_zero = FALSE;
1127
1128 POSSESSIVE_CAPTURE:
1129 number = GET2(ecode, 1+LINK_SIZE);
1130 offset = number << 1;
1131
1132 #ifdef PCRE_DEBUG
1133 printf("start possessive bracket %d\n", number);
1134 printf("subject=");
1135 pchars(eptr, 16, TRUE, md);
1136 printf("\n");
1137 #endif
1138
1139 if (offset < md->offset_max)
1140 {
1141 matched_once = FALSE;
1142 code_offset = (int)(ecode - md->start_code);
1143
1144 save_offset1 = md->offset_vector[offset];
1145 save_offset2 = md->offset_vector[offset+1];
1146 save_offset3 = md->offset_vector[md->offset_end - number];
1147 save_capture_last = md->capture_last;
1148
1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1150
1151 /* Each time round the loop, save the current subject position for use
1152 when the group matches. For MATCH_MATCH, the group has matched, so we
1153 restart it with a new subject starting position, remembering that we had
1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155 usual. If we haven't matched any alternatives in any iteration, check to
1156 see if a previous iteration matched. If so, the group has matched;
1157 continue from afterwards. Otherwise it has failed; restore the previous
1158 capture values before returning NOMATCH. */
1159
1160 for (;;)
1161 {
1162 md->offset_vector[md->offset_end - number] =
1163 (int)(eptr - md->start_subject);
1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1166 eptrb, RM63);
1167 if (rrc == MATCH_KETRPOS)
1168 {
1169 offset_top = md->end_offset_top;
1170 ecode = md->start_code + code_offset;
1171 save_capture_last = md->capture_last;
1172 matched_once = TRUE;
1173 mstart = md->start_match_ptr; /* In case \K changed it */
1174 if (eptr == md->end_match_ptr) /* Matched an empty string */
1175 {
1176 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1177 break;
1178 }
1179 eptr = md->end_match_ptr;
1180 continue;
1181 }
1182
1183 /* See comment in the code for capturing groups above about handling
1184 THEN. */
1185
1186 if (rrc == MATCH_THEN)
1187 {
1188 next = ecode + GET(ecode,1);
1189 if (md->start_match_ptr < next &&
1190 (*ecode == OP_ALT || *next == OP_ALT))
1191 rrc = MATCH_NOMATCH;
1192 }
1193
1194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1195 md->capture_last = save_capture_last;
1196 ecode += GET(ecode, 1);
1197 if (*ecode != OP_ALT) break;
1198 }
1199
1200 if (!matched_once)
1201 {
1202 md->offset_vector[offset] = save_offset1;
1203 md->offset_vector[offset+1] = save_offset2;
1204 md->offset_vector[md->offset_end - number] = save_offset3;
1205 }
1206
1207 if (allow_zero || matched_once)
1208 {
1209 ecode += 1 + LINK_SIZE;
1210 break;
1211 }
1212
1213 RRETURN(MATCH_NOMATCH);
1214 }
1215
1216 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1217 as a non-capturing bracket. */
1218
1219 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1220 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1221
1222 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1223
1224 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1225 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1226
1227 /* Non-capturing possessive bracket with unlimited repeat. We come here
1228 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1229 without the capturing complication. It is written out separately for speed
1230 and cleanliness. */
1231
1232 case OP_BRAPOS:
1233 case OP_SBRAPOS:
1234 allow_zero = FALSE;
1235
1236 POSSESSIVE_NON_CAPTURE:
1237 matched_once = FALSE;
1238 code_offset = (int)(ecode - md->start_code);
1239 save_capture_last = md->capture_last;
1240
1241 for (;;)
1242 {
1243 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1244 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1245 eptrb, RM48);
1246 if (rrc == MATCH_KETRPOS)
1247 {
1248 offset_top = md->end_offset_top;
1249 ecode = md->start_code + code_offset;
1250 matched_once = TRUE;
1251 mstart = md->start_match_ptr; /* In case \K reset it */
1252 if (eptr == md->end_match_ptr) /* Matched an empty string */
1253 {
1254 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1255 break;
1256 }
1257 eptr = md->end_match_ptr;
1258 continue;
1259 }
1260
1261 /* See comment in the code for capturing groups above about handling
1262 THEN. */
1263
1264 if (rrc == MATCH_THEN)
1265 {
1266 next = ecode + GET(ecode,1);
1267 if (md->start_match_ptr < next &&
1268 (*ecode == OP_ALT || *next == OP_ALT))
1269 rrc = MATCH_NOMATCH;
1270 }
1271
1272 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1273 ecode += GET(ecode, 1);
1274 if (*ecode != OP_ALT) break;
1275 md->capture_last = save_capture_last;
1276 }
1277
1278 if (matched_once || allow_zero)
1279 {
1280 ecode += 1 + LINK_SIZE;
1281 break;
1282 }
1283 RRETURN(MATCH_NOMATCH);
1284
1285 /* Control never reaches here. */
1286
1287 /* Conditional group: compilation checked that there are no more than two
1288 branches. If the condition is false, skipping the first branch takes us
1289 past the end of the item if there is only one branch, but that's exactly
1290 what we want. */
1291
1292 case OP_COND:
1293 case OP_SCOND:
1294
1295 /* The variable codelink will be added to ecode when the condition is
1296 false, to get to the second branch. Setting it to the offset to the ALT
1297 or KET, then incrementing ecode achieves this effect. We now have ecode
1298 pointing to the condition or callout. */
1299
1300 codelink = GET(ecode, 1); /* Offset to the second branch */
1301 ecode += 1 + LINK_SIZE; /* From this opcode */
1302
1303 /* Because of the way auto-callout works during compile, a callout item is
1304 inserted between OP_COND and an assertion condition. */
1305
1306 if (*ecode == OP_CALLOUT)
1307 {
1308 if (PUBL(callout) != NULL)
1309 {
1310 PUBL(callout_block) cb;
1311 cb.version = 2; /* Version 2 of the callout block */
1312 cb.callout_number = ecode[1];
1313 cb.offset_vector = md->offset_vector;
1314 #if defined COMPILE_PCRE8
1315 cb.subject = (PCRE_SPTR)md->start_subject;
1316 #elif defined COMPILE_PCRE16
1317 cb.subject = (PCRE_SPTR16)md->start_subject;
1318 #elif defined COMPILE_PCRE32
1319 cb.subject = (PCRE_SPTR32)md->start_subject;
1320 #endif
1321 cb.subject_length = (int)(md->end_subject - md->start_subject);
1322 cb.start_match = (int)(mstart - md->start_subject);
1323 cb.current_position = (int)(eptr - md->start_subject);
1324 cb.pattern_position = GET(ecode, 2);
1325 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1326 cb.capture_top = offset_top/2;
1327 cb.capture_last = md->capture_last & CAPLMASK;
1328 /* Internal change requires this for API compatibility. */
1329 if (cb.capture_last == 0) cb.capture_last = -1;
1330 cb.callout_data = md->callout_data;
1331 cb.mark = md->nomatch_mark;
1332 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1333 if (rrc < 0) RRETURN(rrc);
1334 }
1335
1336 /* Advance ecode past the callout, so it now points to the condition. We
1337 must adjust codelink so that the value of ecode+codelink is unchanged. */
1338
1339 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1340 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1341 }
1342
1343 /* Test the various possible conditions */
1344
1345 condition = FALSE;
1346 switch(condcode = *ecode)
1347 {
1348 case OP_RREF: /* Numbered group recursion test */
1349 if (md->recursive != NULL) /* Not recursing => FALSE */
1350 {
1351 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1352 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1353 }
1354 break;
1355
1356 case OP_DNRREF: /* Duplicate named group recursion test */
1357 if (md->recursive != NULL)
1358 {
1359 int count = GET2(ecode, 1 + IMM2_SIZE);
1360 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1361 while (count-- > 0)
1362 {
1363 unsigned int recno = GET2(slot, 0);
1364 condition = recno == md->recursive->group_num;
1365 if (condition) break;
1366 slot += md->name_entry_size;
1367 }
1368 }
1369 break;
1370
1371 case OP_CREF: /* Numbered group used test */
1372 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1373 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1374 break;
1375
1376 case OP_DNCREF: /* Duplicate named group used test */
1377 {
1378 int count = GET2(ecode, 1 + IMM2_SIZE);
1379 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1380 while (count-- > 0)
1381 {
1382 offset = GET2(slot, 0) << 1;
1383 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1384 if (condition) break;
1385 slot += md->name_entry_size;
1386 }
1387 }
1388 break;
1389
1390 case OP_DEF: /* DEFINE - always false */
1391 break;
1392
1393 /* The condition is an assertion. Call match() to evaluate it - setting
1394 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1395 of an assertion. */
1396
1397 default:
1398 md->match_function_type = MATCH_CONDASSERT;
1399 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1400 if (rrc == MATCH_MATCH)
1401 {
1402 if (md->end_offset_top > offset_top)
1403 offset_top = md->end_offset_top; /* Captures may have happened */
1404 condition = TRUE;
1405
1406 /* Advance ecode past the assertion to the start of the first branch,
1407 but adjust it so that the general choosing code below works. If the
1408 assertion has a quantifier that allows zero repeats we must skip over
1409 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1410
1411 if (*ecode == OP_BRAZERO) ecode++;
1412 ecode += GET(ecode, 1);
1413 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1414 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1415 }
1416
1417 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1418 assertion; it is therefore treated as NOMATCH. Any other return is an
1419 error. */
1420
1421 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1422 {
1423 RRETURN(rrc); /* Need braces because of following else */
1424 }
1425 break;
1426 }
1427
1428 /* Choose branch according to the condition */
1429
1430 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1431
1432 /* We are now at the branch that is to be obeyed. As there is only one, we
1433 can use tail recursion to avoid using another stack frame, except when
1434 there is unlimited repeat of a possibly empty group. In the latter case, a
1435 recursive call to match() is always required, unless the second alternative
1436 doesn't exist, in which case we can just plough on. Note that, for
1437 compatibility with Perl, the | in a conditional group is NOT treated as
1438 creating two alternatives. If a THEN is encountered in the branch, it
1439 propagates out to the enclosing alternative (unless nested in a deeper set
1440 of alternatives, of course). */
1441
1442 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1443 {
1444 if (op != OP_SCOND)
1445 {
1446 goto TAIL_RECURSE;
1447 }
1448
1449 md->match_function_type = MATCH_CBEGROUP;
1450 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1451 RRETURN(rrc);
1452 }
1453
1454 /* Condition false & no alternative; continue after the group. */
1455
1456 else
1457 {
1458 }
1459 break;
1460
1461
1462 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1463 to close any currently open capturing brackets. */
1464
1465 case OP_CLOSE:
1466 number = GET2(ecode, 1); /* Must be less than 65536 */
1467 offset = number << 1;
1468
1469 #ifdef PCRE_DEBUG
1470 printf("end bracket %d at *ACCEPT", number);
1471 printf("\n");
1472 #endif
1473
1474 md->capture_last = (md->capture_last & OVFLMASK) | number;
1475 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1476 {
1477 md->offset_vector[offset] =
1478 md->offset_vector[md->offset_end - number];
1479 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1480
1481 /* If this group is at or above the current highwater mark, ensure that
1482 any groups between the current high water mark and this group are marked
1483 unset and then update the high water mark. */
1484
1485 if (offset >= offset_top)
1486 {
1487 register int *iptr = md->offset_vector + offset_top;
1488 register int *iend = md->offset_vector + offset;
1489 while (iptr < iend) *iptr++ = -1;
1490 offset_top = offset + 2;
1491 }
1492 }
1493 ecode += 1 + IMM2_SIZE;
1494 break;
1495
1496
1497 /* End of the pattern, either real or forced. */
1498
1499 case OP_END:
1500 case OP_ACCEPT:
1501 case OP_ASSERT_ACCEPT:
1502
1503 /* If we have matched an empty string, fail if not in an assertion and not
1504 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1505 is set and we have matched at the start of the subject. In both cases,
1506 backtracking will then try other alternatives, if any. */
1507
1508 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1509 md->recursive == NULL &&
1510 (md->notempty ||
1511 (md->notempty_atstart &&
1512 mstart == md->start_subject + md->start_offset)))
1513 RRETURN(MATCH_NOMATCH);
1514
1515 /* Otherwise, we have a match. */
1516
1517 md->end_match_ptr = eptr; /* Record where we ended */
1518 md->end_offset_top = offset_top; /* and how many extracts were taken */
1519 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1520
1521 /* For some reason, the macros don't work properly if an expression is
1522 given as the argument to RRETURN when the heap is in use. */
1523
1524 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1525 RRETURN(rrc);
1526
1527 /* Assertion brackets. Check the alternative branches in turn - the
1528 matching won't pass the KET for an assertion. If any one branch matches,
1529 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1530 start of each branch to move the current point backwards, so the code at
1531 this level is identical to the lookahead case. When the assertion is part
1532 of a condition, we want to return immediately afterwards. The caller of
1533 this incarnation of the match() function will have set MATCH_CONDASSERT in
1534 md->match_function_type, and one of these opcodes will be the first opcode
1535 that is processed. We use a local variable that is preserved over calls to
1536 match() to remember this case. */
1537
1538 case OP_ASSERT:
1539 case OP_ASSERTBACK:
1540 save_mark = md->mark;
1541 if (md->match_function_type == MATCH_CONDASSERT)
1542 {
1543 condassert = TRUE;
1544 md->match_function_type = 0;
1545 }
1546 else condassert = FALSE;
1547
1548 /* Loop for each branch */
1549
1550 do
1551 {
1552 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1553
1554 /* A match means that the assertion is true; break out of the loop
1555 that matches its alternatives. */
1556
1557 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1558 {
1559 mstart = md->start_match_ptr; /* In case \K reset it */
1560 break;
1561 }
1562
1563 /* If not matched, restore the previous mark setting. */
1564
1565 md->mark = save_mark;
1566
1567 /* See comment in the code for capturing groups above about handling
1568 THEN. */
1569
1570 if (rrc == MATCH_THEN)
1571 {
1572 next = ecode + GET(ecode,1);
1573 if (md->start_match_ptr < next &&
1574 (*ecode == OP_ALT || *next == OP_ALT))
1575 rrc = MATCH_NOMATCH;
1576 }
1577
1578 /* Anything other than NOMATCH causes the entire assertion to fail,
1579 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1580 uncaptured THEN, which means they take their normal effect. This
1581 consistent approach does not always have exactly the same effect as in
1582 Perl. */
1583
1584 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1585 ecode += GET(ecode, 1);
1586 }
1587 while (*ecode == OP_ALT); /* Continue for next alternative */
1588
1589 /* If we have tried all the alternative branches, the assertion has
1590 failed. If not, we broke out after a match. */
1591
1592 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1593
1594 /* If checking an assertion for a condition, return MATCH_MATCH. */
1595
1596 if (condassert) RRETURN(MATCH_MATCH);
1597
1598 /* Continue from after a successful assertion, updating the offsets high
1599 water mark, since extracts may have been taken during the assertion. */
1600
1601 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1602 ecode += 1 + LINK_SIZE;
1603 offset_top = md->end_offset_top;
1604 continue;
1605
1606 /* Negative assertion: all branches must fail to match for the assertion to
1607 succeed. */
1608
1609 case OP_ASSERT_NOT:
1610 case OP_ASSERTBACK_NOT:
1611 save_mark = md->mark;
1612 if (md->match_function_type == MATCH_CONDASSERT)
1613 {
1614 condassert = TRUE;
1615 md->match_function_type = 0;
1616 }
1617 else condassert = FALSE;
1618
1619 /* Loop for each alternative branch. */
1620
1621 do
1622 {
1623 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1624 md->mark = save_mark; /* Always restore the mark setting */
1625
1626 switch(rrc)
1627 {
1628 case MATCH_MATCH: /* A successful match means */
1629 case MATCH_ACCEPT: /* the assertion has failed. */
1630 RRETURN(MATCH_NOMATCH);
1631
1632 case MATCH_NOMATCH: /* Carry on with next branch */
1633 break;
1634
1635 /* See comment in the code for capturing groups above about handling
1636 THEN. */
1637
1638 case MATCH_THEN:
1639 next = ecode + GET(ecode,1);
1640 if (md->start_match_ptr < next &&
1641 (*ecode == OP_ALT || *next == OP_ALT))
1642 {
1643 rrc = MATCH_NOMATCH;
1644 break;
1645 }
1646 /* Otherwise fall through. */
1647
1648 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1649 assertion to fail to match, without considering any more alternatives.
1650 Failing to match means the assertion is true. This is a consistent
1651 approach, but does not always have the same effect as in Perl. */
1652
1653 case MATCH_COMMIT:
1654 case MATCH_SKIP:
1655 case MATCH_SKIP_ARG:
1656 case MATCH_PRUNE:
1657 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1658 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1659
1660 /* Anything else is an error */
1661
1662 default:
1663 RRETURN(rrc);
1664 }
1665
1666 /* Continue with next branch */
1667
1668 ecode += GET(ecode,1);
1669 }
1670 while (*ecode == OP_ALT);
1671
1672 /* All branches in the assertion failed to match. */
1673
1674 NEG_ASSERT_TRUE:
1675 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1676 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1677 continue;
1678
1679 /* Move the subject pointer back. This occurs only at the start of
1680 each branch of a lookbehind assertion. If we are too close to the start to
1681 move back, this match function fails. When working with UTF-8 we move
1682 back a number of characters, not bytes. */
1683
1684 case OP_REVERSE:
1685 #ifdef SUPPORT_UTF
1686 if (utf)
1687 {
1688 i = GET(ecode, 1);
1689 while (i-- > 0)
1690 {
1691 eptr--;
1692 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1693 BACKCHAR(eptr);
1694 }
1695 }
1696 else
1697 #endif
1698
1699 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1700
1701 {
1702 eptr -= GET(ecode, 1);
1703 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1704 }
1705
1706 /* Save the earliest consulted character, then skip to next op code */
1707
1708 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1709 ecode += 1 + LINK_SIZE;
1710 break;
1711
1712 /* The callout item calls an external function, if one is provided, passing
1713 details of the match so far. This is mainly for debugging, though the
1714 function is able to force a failure. */
1715
1716 case OP_CALLOUT:
1717 if (PUBL(callout) != NULL)
1718 {
1719 PUBL(callout_block) cb;
1720 cb.version = 2; /* Version 2 of the callout block */
1721 cb.callout_number = ecode[1];
1722 cb.offset_vector = md->offset_vector;
1723 #if defined COMPILE_PCRE8
1724 cb.subject = (PCRE_SPTR)md->start_subject;
1725 #elif defined COMPILE_PCRE16
1726 cb.subject = (PCRE_SPTR16)md->start_subject;
1727 #elif defined COMPILE_PCRE32
1728 cb.subject = (PCRE_SPTR32)md->start_subject;
1729 #endif
1730 cb.subject_length = (int)(md->end_subject - md->start_subject);
1731 cb.start_match = (int)(mstart - md->start_subject);
1732 cb.current_position = (int)(eptr - md->start_subject);
1733 cb.pattern_position = GET(ecode, 2);
1734 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1735 cb.capture_top = offset_top/2;
1736 cb.capture_last = md->capture_last & CAPLMASK;
1737 /* Internal change requires this for API compatibility. */
1738 if (cb.capture_last == 0) cb.capture_last = -1;
1739 cb.callout_data = md->callout_data;
1740 cb.mark = md->nomatch_mark;
1741 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1742 if (rrc < 0) RRETURN(rrc);
1743 }
1744 ecode += 2 + 2*LINK_SIZE;
1745 break;
1746
1747 /* Recursion either matches the current regex, or some subexpression. The
1748 offset data is the offset to the starting bracket from the start of the
1749 whole pattern. (This is so that it works from duplicated subpatterns.)
1750
1751 The state of the capturing groups is preserved over recursion, and
1752 re-instated afterwards. We don't know how many are started and not yet
1753 finished (offset_top records the completed total) so we just have to save
1754 all the potential data. There may be up to 65535 such values, which is too
1755 large to put on the stack, but using malloc for small numbers seems
1756 expensive. As a compromise, the stack is used when there are no more than
1757 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1758
1759 There are also other values that have to be saved. We use a chained
1760 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1761 for the original version of this logic. It has, however, been hacked around
1762 a lot, so he is not to blame for the current way it works. */
1763
1764 case OP_RECURSE:
1765 {
1766 recursion_info *ri;
1767 unsigned int recno;
1768
1769 callpat = md->start_code + GET(ecode, 1);
1770 recno = (callpat == md->start_code)? 0 :
1771 GET2(callpat, 1 + LINK_SIZE);
1772
1773 /* Check for repeating a recursion without advancing the subject pointer.
1774 This should catch convoluted mutual recursions. (Some simple cases are
1775 caught at compile time.) */
1776
1777 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1778 if (recno == ri->group_num && eptr == ri->subject_position)
1779 RRETURN(PCRE_ERROR_RECURSELOOP);
1780
1781 /* Add to "recursing stack" */
1782
1783 new_recursive.group_num = recno;
1784 new_recursive.saved_capture_last = md->capture_last;
1785 new_recursive.subject_position = eptr;
1786 new_recursive.prevrec = md->recursive;
1787 md->recursive = &new_recursive;
1788
1789 /* Where to continue from afterwards */
1790
1791 ecode += 1 + LINK_SIZE;
1792
1793 /* Now save the offset data */
1794
1795 new_recursive.saved_max = md->offset_end;
1796 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1797 new_recursive.offset_save = stacksave;
1798 else
1799 {
1800 new_recursive.offset_save =
1801 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1802 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1803 }
1804 memcpy(new_recursive.offset_save, md->offset_vector,
1805 new_recursive.saved_max * sizeof(int));
1806
1807 /* OK, now we can do the recursion. After processing each alternative,
1808 restore the offset data and the last captured value. If there were nested
1809 recursions, md->recursive might be changed, so reset it before looping.
1810 */
1811
1812 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1813 cbegroup = (*callpat >= OP_SBRA);
1814 do
1815 {
1816 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1817 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1818 md, eptrb, RM6);
1819 memcpy(md->offset_vector, new_recursive.offset_save,
1820 new_recursive.saved_max * sizeof(int));
1821 md->capture_last = new_recursive.saved_capture_last;
1822 md->recursive = new_recursive.prevrec;
1823 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1824 {
1825 DPRINTF(("Recursion matched\n"));
1826 if (new_recursive.offset_save != stacksave)
1827 (PUBL(free))(new_recursive.offset_save);
1828
1829 /* Set where we got to in the subject, and reset the start in case
1830 it was changed by \K. This *is* propagated back out of a recursion,
1831 for Perl compatibility. */
1832
1833 eptr = md->end_match_ptr;
1834 mstart = md->start_match_ptr;
1835 goto RECURSION_MATCHED; /* Exit loop; end processing */
1836 }
1837
1838 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1839 recursion; they cause a NOMATCH for the entire recursion. These codes
1840 are defined in a range that can be tested for. */
1841
1842 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1843 RRETURN(MATCH_NOMATCH);
1844
1845 /* Any return code other than NOMATCH is an error. */
1846
1847 if (rrc != MATCH_NOMATCH)
1848 {
1849 DPRINTF(("Recursion gave error %d\n", rrc));
1850 if (new_recursive.offset_save != stacksave)
1851 (PUBL(free))(new_recursive.offset_save);
1852 RRETURN(rrc);
1853 }
1854
1855 md->recursive = &new_recursive;
1856 callpat += GET(callpat, 1);
1857 }
1858 while (*callpat == OP_ALT);
1859
1860 DPRINTF(("Recursion didn't match\n"));
1861 md->recursive = new_recursive.prevrec;
1862 if (new_recursive.offset_save != stacksave)
1863 (PUBL(free))(new_recursive.offset_save);
1864 RRETURN(MATCH_NOMATCH);
1865 }
1866
1867 RECURSION_MATCHED:
1868 break;
1869
1870 /* An alternation is the end of a branch; scan along to find the end of the
1871 bracketed group and go to there. */
1872
1873 case OP_ALT:
1874 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1875 break;
1876
1877 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1878 indicating that it may occur zero times. It may repeat infinitely, or not
1879 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1880 with fixed upper repeat limits are compiled as a number of copies, with the
1881 optional ones preceded by BRAZERO or BRAMINZERO. */
1882
1883 case OP_BRAZERO:
1884 next = ecode + 1;
1885 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1887 do next += GET(next, 1); while (*next == OP_ALT);
1888 ecode = next + 1 + LINK_SIZE;
1889 break;
1890
1891 case OP_BRAMINZERO:
1892 next = ecode + 1;
1893 do next += GET(next, 1); while (*next == OP_ALT);
1894 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1896 ecode++;
1897 break;
1898
1899 case OP_SKIPZERO:
1900 next = ecode+1;
1901 do next += GET(next,1); while (*next == OP_ALT);
1902 ecode = next + 1 + LINK_SIZE;
1903 break;
1904
1905 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1906 here; just jump to the group, with allow_zero set TRUE. */
1907
1908 case OP_BRAPOSZERO:
1909 op = *(++ecode);
1910 allow_zero = TRUE;
1911 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1912 goto POSSESSIVE_NON_CAPTURE;
1913
1914 /* End of a group, repeated or non-repeating. */
1915
1916 case OP_KET:
1917 case OP_KETRMIN:
1918 case OP_KETRMAX:
1919 case OP_KETRPOS:
1920 prev = ecode - GET(ecode, 1);
1921
1922 /* If this was a group that remembered the subject start, in order to break
1923 infinite repeats of empty string matches, retrieve the subject start from
1924 the chain. Otherwise, set it NULL. */
1925
1926 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1927 {
1928 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1929 eptrb = eptrb->epb_prev; /* Backup to previous group */
1930 }
1931 else saved_eptr = NULL;
1932
1933 /* If we are at the end of an assertion group or a non-capturing atomic
1934 group, stop matching and return MATCH_MATCH, but record the current high
1935 water mark for use by positive assertions. We also need to record the match
1936 start in case it was changed by \K. */
1937
1938 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1939 *prev == OP_ONCE_NC)
1940 {
1941 md->end_match_ptr = eptr; /* For ONCE_NC */
1942 md->end_offset_top = offset_top;
1943 md->start_match_ptr = mstart;
1944 RRETURN(MATCH_MATCH); /* Sets md->mark */
1945 }
1946
1947 /* For capturing groups we have to check the group number back at the start
1948 and if necessary complete handling an extraction by setting the offsets and
1949 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1950 into group 0, so it won't be picked up here. Instead, we catch it when the
1951 OP_END is reached. Other recursion is handled here. We just have to record
1952 the current subject position and start match pointer and give a MATCH
1953 return. */
1954
1955 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1956 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1957 {
1958 number = GET2(prev, 1+LINK_SIZE);
1959 offset = number << 1;
1960
1961 #ifdef PCRE_DEBUG
1962 printf("end bracket %d", number);
1963 printf("\n");
1964 #endif
1965
1966 /* Handle a recursively called group. */
1967
1968 if (md->recursive != NULL && md->recursive->group_num == number)
1969 {
1970 md->end_match_ptr = eptr;
1971 md->start_match_ptr = mstart;
1972 RRETURN(MATCH_MATCH);
1973 }
1974
1975 /* Deal with capturing */
1976
1977 md->capture_last = (md->capture_last & OVFLMASK) | number;
1978 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1979 {
1980 /* If offset is greater than offset_top, it means that we are
1981 "skipping" a capturing group, and that group's offsets must be marked
1982 unset. In earlier versions of PCRE, all the offsets were unset at the
1983 start of matching, but this doesn't work because atomic groups and
1984 assertions can cause a value to be set that should later be unset.
1985 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1986 part of the atomic group, but this is not on the final matching path,
1987 so must be unset when 2 is set. (If there is no group 2, there is no
1988 problem, because offset_top will then be 2, indicating no capture.) */
1989
1990 if (offset > offset_top)
1991 {
1992 register int *iptr = md->offset_vector + offset_top;
1993 register int *iend = md->offset_vector + offset;
1994 while (iptr < iend) *iptr++ = -1;
1995 }
1996
1997 /* Now make the extraction */
1998
1999 md->offset_vector[offset] =
2000 md->offset_vector[md->offset_end - number];
2001 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
2002 if (offset_top <= offset) offset_top = offset + 2;
2003 }
2004 }
2005
2006 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2007 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2008 at a time from the outer level, thus saving stack. This must precede the
2009 empty string test - in this case that test is done at the outer level. */
2010
2011 if (*ecode == OP_KETRPOS)
2012 {
2013 md->start_match_ptr = mstart; /* In case \K reset it */
2014 md->end_match_ptr = eptr;
2015 md->end_offset_top = offset_top;
2016 RRETURN(MATCH_KETRPOS);
2017 }
2018
2019 /* For an ordinary non-repeating ket, just continue at this level. This
2020 also happens for a repeating ket if no characters were matched in the
2021 group. This is the forcible breaking of infinite loops as implemented in
2022 Perl 5.005. For a non-repeating atomic group that includes captures,
2023 establish a backup point by processing the rest of the pattern at a lower
2024 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2025 original OP_ONCE level, thereby bypassing intermediate backup points, but
2026 resetting any captures that happened along the way. */
2027
2028 if (*ecode == OP_KET || eptr == saved_eptr)
2029 {
2030 if (*prev == OP_ONCE)
2031 {
2032 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2033 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2034 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2035 RRETURN(MATCH_ONCE);
2036 }
2037 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2038 break;
2039 }
2040
2041 /* The normal repeating kets try the rest of the pattern or restart from
2042 the preceding bracket, in the appropriate order. In the second case, we can
2043 use tail recursion to avoid using another stack frame, unless we have an
2044 atomic group or an unlimited repeat of a group that can match an empty
2045 string. */
2046
2047 if (*ecode == OP_KETRMIN)
2048 {
2049 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2050 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2051 if (*prev == OP_ONCE)
2052 {
2053 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2054 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2055 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2056 RRETURN(MATCH_ONCE);
2057 }
2058 if (*prev >= OP_SBRA) /* Could match an empty string */
2059 {
2060 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2061 RRETURN(rrc);
2062 }
2063 ecode = prev;
2064 goto TAIL_RECURSE;
2065 }
2066 else /* OP_KETRMAX */
2067 {
2068 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2069 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2070 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2071 if (*prev == OP_ONCE)
2072 {
2073 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2075 md->once_target = prev;
2076 RRETURN(MATCH_ONCE);
2077 }
2078 ecode += 1 + LINK_SIZE;
2079 goto TAIL_RECURSE;
2080 }
2081 /* Control never gets here */
2082
2083 /* Not multiline mode: start of subject assertion, unless notbol. */
2084
2085 case OP_CIRC:
2086 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2087
2088 /* Start of subject assertion */
2089
2090 case OP_SOD:
2091 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2092 ecode++;
2093 break;
2094
2095 /* Multiline mode: start of subject unless notbol, or after any newline. */
2096
2097 case OP_CIRCM:
2098 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2099 if (eptr != md->start_subject &&
2100 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2101 RRETURN(MATCH_NOMATCH);
2102 ecode++;
2103 break;
2104
2105 /* Start of match assertion */
2106
2107 case OP_SOM:
2108 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2109 ecode++;
2110 break;
2111
2112 /* Reset the start of match point */
2113
2114 case OP_SET_SOM:
2115 mstart = eptr;
2116 ecode++;
2117 break;
2118
2119 /* Multiline mode: assert before any newline, or before end of subject
2120 unless noteol is set. */
2121
2122 case OP_DOLLM:
2123 if (eptr < md->end_subject)
2124 {
2125 if (!IS_NEWLINE(eptr))
2126 {
2127 if (md->partial != 0 &&
2128 eptr + 1 >= md->end_subject &&
2129 NLBLOCK->nltype == NLTYPE_FIXED &&
2130 NLBLOCK->nllen == 2 &&
2131 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2132 {
2133 md->hitend = TRUE;
2134 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2135 }
2136 RRETURN(MATCH_NOMATCH);
2137 }
2138 }
2139 else
2140 {
2141 if (md->noteol) RRETURN(MATCH_NOMATCH);
2142 SCHECK_PARTIAL();
2143 }
2144 ecode++;
2145 break;
2146
2147 /* Not multiline mode: assert before a terminating newline or before end of
2148 subject unless noteol is set. */
2149
2150 case OP_DOLL:
2151 if (md->noteol) RRETURN(MATCH_NOMATCH);
2152 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2153
2154 /* ... else fall through for endonly */
2155
2156 /* End of subject assertion (\z) */
2157
2158 case OP_EOD:
2159 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2160 SCHECK_PARTIAL();
2161 ecode++;
2162 break;
2163
2164 /* End of subject or ending \n assertion (\Z) */
2165
2166 case OP_EODN:
2167 ASSERT_NL_OR_EOS:
2168 if (eptr < md->end_subject &&
2169 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2170 {
2171 if (md->partial != 0 &&
2172 eptr + 1 >= md->end_subject &&
2173 NLBLOCK->nltype == NLTYPE_FIXED &&
2174 NLBLOCK->nllen == 2 &&
2175 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2176 {
2177 md->hitend = TRUE;
2178 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2179 }
2180 RRETURN(MATCH_NOMATCH);
2181 }
2182
2183 /* Either at end of string or \n before end. */
2184
2185 SCHECK_PARTIAL();
2186 ecode++;
2187 break;
2188
2189 /* Word boundary assertions */
2190
2191 case OP_NOT_WORD_BOUNDARY:
2192 case OP_WORD_BOUNDARY:
2193 {
2194
2195 /* Find out if the previous and current characters are "word" characters.
2196 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2197 be "non-word" characters. Remember the earliest consulted character for
2198 partial matching. */
2199
2200 #ifdef SUPPORT_UTF
2201 if (utf)
2202 {
2203 /* Get status of previous character */
2204
2205 if (eptr == md->start_subject) prev_is_word = FALSE; else
2206 {
2207 PCRE_PUCHAR lastptr = eptr - 1;
2208 BACKCHAR(lastptr);
2209 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2210 GETCHAR(c, lastptr);
2211 #ifdef SUPPORT_UCP
2212 if (md->use_ucp)
2213 {
2214 if (c == '_') prev_is_word = TRUE; else
2215 {
2216 int cat = UCD_CATEGORY(c);
2217 prev_is_word = (cat == ucp_L || cat == ucp_N);
2218 }
2219 }
2220 else
2221 #endif
2222 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2223 }
2224
2225 /* Get status of next character */
2226
2227 if (eptr >= md->end_subject)
2228 {
2229 SCHECK_PARTIAL();
2230 cur_is_word = FALSE;
2231 }
2232 else
2233 {
2234 GETCHAR(c, eptr);
2235 #ifdef SUPPORT_UCP
2236 if (md->use_ucp)
2237 {
2238 if (c == '_') cur_is_word = TRUE; else
2239 {
2240 int cat = UCD_CATEGORY(c);
2241 cur_is_word = (cat == ucp_L || cat == ucp_N);
2242 }
2243 }
2244 else
2245 #endif
2246 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2247 }
2248 }
2249 else
2250 #endif
2251
2252 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2253 consistency with the behaviour of \w we do use it in this case. */
2254
2255 {
2256 /* Get status of previous character */
2257
2258 if (eptr == md->start_subject) prev_is_word = FALSE; else
2259 {
2260 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2261 #ifdef SUPPORT_UCP
2262 if (md->use_ucp)
2263 {
2264 c = eptr[-1];
2265 if (c == '_') prev_is_word = TRUE; else
2266 {
2267 int cat = UCD_CATEGORY(c);
2268 prev_is_word = (cat == ucp_L || cat == ucp_N);
2269 }
2270 }
2271 else
2272 #endif
2273 prev_is_word = MAX_255(eptr[-1])
2274 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2275 }
2276
2277 /* Get status of next character */
2278
2279 if (eptr >= md->end_subject)
2280 {
2281 SCHECK_PARTIAL();
2282 cur_is_word = FALSE;
2283 }
2284 else
2285 #ifdef SUPPORT_UCP
2286 if (md->use_ucp)
2287 {
2288 c = *eptr;
2289 if (c == '_') cur_is_word = TRUE; else
2290 {
2291 int cat = UCD_CATEGORY(c);
2292 cur_is_word = (cat == ucp_L || cat == ucp_N);
2293 }
2294 }
2295 else
2296 #endif
2297 cur_is_word = MAX_255(*eptr)
2298 && ((md->ctypes[*eptr] & ctype_word) != 0);
2299 }
2300
2301 /* Now see if the situation is what we want */
2302
2303 if ((*ecode++ == OP_WORD_BOUNDARY)?
2304 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2305 RRETURN(MATCH_NOMATCH);
2306 }
2307 break;
2308
2309 /* Match any single character type except newline; have to take care with
2310 CRLF newlines and partial matching. */
2311
2312 case OP_ANY:
2313 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2314 if (md->partial != 0 &&
2315 eptr + 1 >= md->end_subject &&
2316 NLBLOCK->nltype == NLTYPE_FIXED &&
2317 NLBLOCK->nllen == 2 &&
2318 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2319 {
2320 md->hitend = TRUE;
2321 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2322 }
2323
2324 /* Fall through */
2325
2326 /* Match any single character whatsoever. */
2327
2328 case OP_ALLANY:
2329 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2330 { /* not be updated before SCHECK_PARTIAL. */
2331 SCHECK_PARTIAL();
2332 RRETURN(MATCH_NOMATCH);
2333 }
2334 eptr++;
2335 #ifdef SUPPORT_UTF
2336 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2337 #endif
2338 ecode++;
2339 break;
2340
2341 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2342 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2343
2344 case OP_ANYBYTE:
2345 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2346 { /* not be updated before SCHECK_PARTIAL. */
2347 SCHECK_PARTIAL();
2348 RRETURN(MATCH_NOMATCH);
2349 }
2350 eptr++;
2351 ecode++;
2352 break;
2353
2354 case OP_NOT_DIGIT:
2355 if (eptr >= md->end_subject)
2356 {
2357 SCHECK_PARTIAL();
2358 RRETURN(MATCH_NOMATCH);
2359 }
2360 GETCHARINCTEST(c, eptr);
2361 if (
2362 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2363 c < 256 &&
2364 #endif
2365 (md->ctypes[c] & ctype_digit) != 0
2366 )
2367 RRETURN(MATCH_NOMATCH);
2368 ecode++;
2369 break;
2370
2371 case OP_DIGIT:
2372 if (eptr >= md->end_subject)
2373 {
2374 SCHECK_PARTIAL();
2375 RRETURN(MATCH_NOMATCH);
2376 }
2377 GETCHARINCTEST(c, eptr);
2378 if (
2379 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2380 c > 255 ||
2381 #endif
2382 (md->ctypes[c] & ctype_digit) == 0
2383 )
2384 RRETURN(MATCH_NOMATCH);
2385 ecode++;
2386 break;
2387
2388 case OP_NOT_WHITESPACE:
2389 if (eptr >= md->end_subject)
2390 {
2391 SCHECK_PARTIAL();
2392 RRETURN(MATCH_NOMATCH);
2393 }
2394 GETCHARINCTEST(c, eptr);
2395 if (
2396 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2397 c < 256 &&
2398 #endif
2399 (md->ctypes[c] & ctype_space) != 0
2400 )
2401 RRETURN(MATCH_NOMATCH);
2402 ecode++;
2403 break;
2404
2405 case OP_WHITESPACE:
2406 if (eptr >= md->end_subject)
2407 {
2408 SCHECK_PARTIAL();
2409 RRETURN(MATCH_NOMATCH);
2410 }
2411 GETCHARINCTEST(c, eptr);
2412 if (
2413 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2414 c > 255 ||
2415 #endif
2416 (md->ctypes[c] & ctype_space) == 0
2417 )
2418 RRETURN(MATCH_NOMATCH);
2419 ecode++;
2420 break;
2421
2422 case OP_NOT_WORDCHAR:
2423 if (eptr >= md->end_subject)
2424 {
2425 SCHECK_PARTIAL();
2426 RRETURN(MATCH_NOMATCH);
2427 }
2428 GETCHARINCTEST(c, eptr);
2429 if (
2430 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2431 c < 256 &&
2432 #endif
2433 (md->ctypes[c] & ctype_word) != 0
2434 )
2435 RRETURN(MATCH_NOMATCH);
2436 ecode++;
2437 break;
2438
2439 case OP_WORDCHAR:
2440 if (eptr >= md->end_subject)
2441 {
2442 SCHECK_PARTIAL();
2443 RRETURN(MATCH_NOMATCH);
2444 }
2445 GETCHARINCTEST(c, eptr);
2446 if (
2447 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2448 c > 255 ||
2449 #endif
2450 (md->ctypes[c] & ctype_word) == 0
2451 )
2452 RRETURN(MATCH_NOMATCH);
2453 ecode++;
2454 break;
2455
2456 case OP_ANYNL:
2457 if (eptr >= md->end_subject)
2458 {
2459 SCHECK_PARTIAL();
2460 RRETURN(MATCH_NOMATCH);
2461 }
2462 GETCHARINCTEST(c, eptr);
2463 switch(c)
2464 {
2465 default: RRETURN(MATCH_NOMATCH);
2466
2467 case CHAR_CR:
2468 if (eptr >= md->end_subject)
2469 {
2470 SCHECK_PARTIAL();
2471 }
2472 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2473 break;
2474
2475 case CHAR_LF:
2476 break;
2477
2478 case CHAR_VT:
2479 case CHAR_FF:
2480 case CHAR_NEL:
2481 #ifndef EBCDIC
2482 case 0x2028:
2483 case 0x2029:
2484 #endif /* Not EBCDIC */
2485 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2486 break;
2487 }
2488 ecode++;
2489 break;
2490
2491 case OP_NOT_HSPACE:
2492 if (eptr >= md->end_subject)
2493 {
2494 SCHECK_PARTIAL();
2495 RRETURN(MATCH_NOMATCH);
2496 }
2497 GETCHARINCTEST(c, eptr);
2498 switch(c)
2499 {
2500 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2501 default: break;
2502 }
2503 ecode++;
2504 break;
2505
2506 case OP_HSPACE:
2507 if (eptr >= md->end_subject)
2508 {
2509 SCHECK_PARTIAL();
2510 RRETURN(MATCH_NOMATCH);
2511 }
2512 GETCHARINCTEST(c, eptr);
2513 switch(c)
2514 {
2515 HSPACE_CASES: break; /* Byte and multibyte cases */
2516 default: RRETURN(MATCH_NOMATCH);
2517 }
2518 ecode++;
2519 break;
2520
2521 case OP_NOT_VSPACE:
2522 if (eptr >= md->end_subject)
2523 {
2524 SCHECK_PARTIAL();
2525 RRETURN(MATCH_NOMATCH);
2526 }
2527 GETCHARINCTEST(c, eptr);
2528 switch(c)
2529 {
2530 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2531 default: break;
2532 }
2533 ecode++;
2534 break;
2535
2536 case OP_VSPACE:
2537 if (eptr >= md->end_subject)
2538 {
2539 SCHECK_PARTIAL();
2540 RRETURN(MATCH_NOMATCH);
2541 }
2542 GETCHARINCTEST(c, eptr);
2543 switch(c)
2544 {
2545 VSPACE_CASES: break;
2546 default: RRETURN(MATCH_NOMATCH);
2547 }
2548 ecode++;
2549 break;
2550
2551 #ifdef SUPPORT_UCP
2552 /* Check the next character by Unicode property. We will get here only
2553 if the support is in the binary; otherwise a compile-time error occurs. */
2554
2555 case OP_PROP:
2556 case OP_NOTPROP:
2557 if (eptr >= md->end_subject)
2558 {
2559 SCHECK_PARTIAL();
2560 RRETURN(MATCH_NOMATCH);
2561 }
2562 GETCHARINCTEST(c, eptr);
2563 {
2564 const pcre_uint32 *cp;
2565 const ucd_record *prop = GET_UCD(c);
2566
2567 switch(ecode[1])
2568 {
2569 case PT_ANY:
2570 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2571 break;
2572
2573 case PT_LAMP:
2574 if ((prop->chartype == ucp_Lu ||
2575 prop->chartype == ucp_Ll ||
2576 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2577 RRETURN(MATCH_NOMATCH);
2578 break;
2579
2580 case PT_GC:
2581 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2582 RRETURN(MATCH_NOMATCH);
2583 break;
2584
2585 case PT_PC:
2586 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2587 RRETURN(MATCH_NOMATCH);
2588 break;
2589
2590 case PT_SC:
2591 if ((ecode[2] != prop->script) == (op == OP_PROP))
2592 RRETURN(MATCH_NOMATCH);
2593 break;
2594
2595 /* These are specials */
2596
2597 case PT_ALNUM:
2598 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2599 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2600 RRETURN(MATCH_NOMATCH);
2601 break;
2602
2603 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2604 which means that Perl space and POSIX space are now identical. PCRE
2605 was changed at release 8.34. */
2606
2607 case PT_SPACE: /* Perl space */
2608 case PT_PXSPACE: /* POSIX space */
2609 switch(c)
2610 {
2611 HSPACE_CASES:
2612 VSPACE_CASES:
2613 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2614 break;
2615
2616 default:
2617 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2618 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2619 break;
2620 }
2621 break;
2622
2623 case PT_WORD:
2624 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2625 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2626 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2627 RRETURN(MATCH_NOMATCH);
2628 break;
2629
2630 case PT_CLIST:
2631 cp = PRIV(ucd_caseless_sets) + ecode[2];
2632 for (;;)
2633 {
2634 if (c < *cp)
2635 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2636 if (c == *cp++)
2637 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2638 }
2639 break;
2640
2641 case PT_UCNC:
2642 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2643 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2644 c >= 0xe000) == (op == OP_NOTPROP))
2645 RRETURN(MATCH_NOMATCH);
2646 break;
2647
2648 /* This should never occur */
2649
2650 default:
2651 RRETURN(PCRE_ERROR_INTERNAL);
2652 }
2653
2654 ecode += 3;
2655 }
2656 break;
2657
2658 /* Match an extended Unicode sequence. We will get here only if the support
2659 is in the binary; otherwise a compile-time error occurs. */
2660
2661 case OP_EXTUNI:
2662 if (eptr >= md->end_subject)
2663 {
2664 SCHECK_PARTIAL();
2665 RRETURN(MATCH_NOMATCH);
2666 }
2667 else
2668 {
2669 int lgb, rgb;
2670 GETCHARINCTEST(c, eptr);
2671 lgb = UCD_GRAPHBREAK(c);
2672 while (eptr < md->end_subject)
2673 {
2674 int len = 1;
2675 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2676 rgb = UCD_GRAPHBREAK(c);
2677 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2678 lgb = rgb;
2679 eptr += len;
2680 }
2681 }
2682 CHECK_PARTIAL();
2683 ecode++;
2684 break;
2685 #endif /* SUPPORT_UCP */
2686
2687
2688 /* Match a back reference, possibly repeatedly. Look past the end of the
2689 item to see if there is repeat information following. The code is similar
2690 to that for character classes, but repeated for efficiency. Then obey
2691 similar code to character type repeats - written out again for speed.
2692 However, if the referenced string is the empty string, always treat
2693 it as matched, any number of times (otherwise there could be infinite
2694 loops). If the reference is unset, there are two possibilities:
2695
2696 (a) In the default, Perl-compatible state, set the length negative;
2697 this ensures that every attempt at a match fails. We can't just fail
2698 here, because of the possibility of quantifiers with zero minima.
2699
2700 (b) If the JavaScript compatibility flag is set, set the length to zero
2701 so that the back reference matches an empty string.
2702
2703 Otherwise, set the length to the length of what was matched by the
2704 referenced subpattern.
2705
2706 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2707 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2708 and OP_DNREFI are used. In this case we must scan the list of groups to
2709 which the name refers, and use the first one that is set. */
2710
2711 case OP_DNREF:
2712 case OP_DNREFI:
2713 caseless = op == OP_DNREFI;
2714 {
2715 int count = GET2(ecode, 1+IMM2_SIZE);
2716 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2717 ecode += 1 + 2*IMM2_SIZE;
2718
2719 /* Setting the default length first and initializing 'offset' avoids
2720 compiler warnings in the REF_REPEAT code. */
2721
2722 length = (md->jscript_compat)? 0 : -1;
2723 offset = 0;
2724
2725 while (count-- > 0)
2726 {
2727 offset = GET2(slot, 0) << 1;
2728 if (offset < offset_top && md->offset_vector[offset] >= 0)
2729 {
2730 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2731 break;
2732 }
2733 slot += md->name_entry_size;
2734 }
2735 }
2736 goto REF_REPEAT;
2737
2738 case OP_REF:
2739 case OP_REFI:
2740 caseless = op == OP_REFI;
2741 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2742 ecode += 1 + IMM2_SIZE;
2743 if (offset >= offset_top || md->offset_vector[offset] < 0)
2744 length = (md->jscript_compat)? 0 : -1;
2745 else
2746 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2747
2748 /* Set up for repetition, or handle the non-repeated case */
2749
2750 REF_REPEAT:
2751 switch (*ecode)
2752 {
2753 case OP_CRSTAR:
2754 case OP_CRMINSTAR:
2755 case OP_CRPLUS:
2756 case OP_CRMINPLUS:
2757 case OP_CRQUERY:
2758 case OP_CRMINQUERY:
2759 c = *ecode++ - OP_CRSTAR;
2760 minimize = (c & 1) != 0;
2761 min = rep_min[c]; /* Pick up values from tables; */
2762 max = rep_max[c]; /* zero for max => infinity */
2763 if (max == 0) max = INT_MAX;
2764 break;
2765
2766 case OP_CRRANGE:
2767 case OP_CRMINRANGE:
2768 minimize = (*ecode == OP_CRMINRANGE);
2769 min = GET2(ecode, 1);
2770 max = GET2(ecode, 1 + IMM2_SIZE);
2771 if (max == 0) max = INT_MAX;
2772 ecode += 1 + 2 * IMM2_SIZE;
2773 break;
2774
2775 default: /* No repeat follows */
2776 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2777 {
2778 if (length == -2) eptr = md->end_subject; /* Partial match */
2779 CHECK_PARTIAL();
2780 RRETURN(MATCH_NOMATCH);
2781 }
2782 eptr += length;
2783 continue; /* With the main loop */
2784 }
2785
2786 /* Handle repeated back references. If the length of the reference is
2787 zero, just continue with the main loop. If the length is negative, it
2788 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2789 zero, we can continue at the same level without recursion. For any other
2790 minimum, carrying on will result in NOMATCH. */
2791
2792 if (length == 0) continue;
2793 if (length < 0 && min == 0) continue;
2794
2795 /* First, ensure the minimum number of matches are present. We get back
2796 the length of the reference string explicitly rather than passing the
2797 address of eptr, so that eptr can be a register variable. */
2798
2799 for (i = 1; i <= min; i++)
2800 {
2801 int slength;
2802 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2803 {
2804 if (slength == -2) eptr = md->end_subject; /* Partial match */
2805 CHECK_PARTIAL();
2806 RRETURN(MATCH_NOMATCH);
2807 }
2808 eptr += slength;
2809 }
2810
2811 /* If min = max, continue at the same level without recursion.
2812 They are not both allowed to be zero. */
2813
2814 if (min == max) continue;
2815
2816 /* If minimizing, keep trying and advancing the pointer */
2817
2818 if (minimize)
2819 {
2820 for (fi = min;; fi++)
2821 {
2822 int slength;
2823 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2825 if (fi >= max) RRETURN(MATCH_NOMATCH);
2826 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2827 {
2828 if (slength == -2) eptr = md->end_subject; /* Partial match */
2829 CHECK_PARTIAL();
2830 RRETURN(MATCH_NOMATCH);
2831 }
2832 eptr += slength;
2833 }
2834 /* Control never gets here */
2835 }
2836
2837 /* If maximizing, find the longest string and work backwards */
2838
2839 else
2840 {
2841 pp = eptr;
2842 for (i = min; i < max; i++)
2843 {
2844 int slength;
2845 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2846 {
2847 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2848 the soft partial matching case. */
2849
2850 if (slength == -2 && md->partial != 0 &&
2851 md->end_subject > md->start_used_ptr)
2852 {
2853 md->hitend = TRUE;
2854 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2855 }
2856 break;
2857 }
2858 eptr += slength;
2859 }
2860
2861 while (eptr >= pp)
2862 {
2863 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2864 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2865 eptr -= length;
2866 }
2867 RRETURN(MATCH_NOMATCH);
2868 }
2869 /* Control never gets here */
2870
2871 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2872 used when all the characters in the class have values in the range 0-255,
2873 and either the matching is caseful, or the characters are in the range
2874 0-127 when UTF-8 processing is enabled. The only difference between
2875 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2876 encountered.
2877
2878 First, look past the end of the item to see if there is repeat information
2879 following. Then obey similar code to character type repeats - written out
2880 again for speed. */
2881
2882 case OP_NCLASS:
2883 case OP_CLASS:
2884 {
2885 /* The data variable is saved across frames, so the byte map needs to
2886 be stored there. */
2887 #define BYTE_MAP ((pcre_uint8 *)data)
2888 data = ecode + 1; /* Save for matching */
2889 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2890
2891 switch (*ecode)
2892 {
2893 case OP_CRSTAR:
2894 case OP_CRMINSTAR:
2895 case OP_CRPLUS:
2896 case OP_CRMINPLUS:
2897 case OP_CRQUERY:
2898 case OP_CRMINQUERY:
2899 case OP_CRPOSSTAR:
2900 case OP_CRPOSPLUS:
2901 case OP_CRPOSQUERY:
2902 c = *ecode++ - OP_CRSTAR;
2903 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2904 else possessive = TRUE;
2905 min = rep_min[c]; /* Pick up values from tables; */
2906 max = rep_max[c]; /* zero for max => infinity */
2907 if (max == 0) max = INT_MAX;
2908 break;
2909
2910 case OP_CRRANGE:
2911 case OP_CRMINRANGE:
2912 case OP_CRPOSRANGE:
2913 minimize = (*ecode == OP_CRMINRANGE);
2914 possessive = (*ecode == OP_CRPOSRANGE);
2915 min = GET2(ecode, 1);
2916 max = GET2(ecode, 1 + IMM2_SIZE);
2917 if (max == 0) max = INT_MAX;
2918 ecode += 1 + 2 * IMM2_SIZE;
2919 break;
2920
2921 default: /* No repeat follows */
2922 min = max = 1;
2923 break;
2924 }
2925
2926 /* First, ensure the minimum number of matches are present. */
2927
2928 #ifdef SUPPORT_UTF
2929 if (utf)
2930 {
2931 for (i = 1; i <= min; i++)
2932 {
2933 if (eptr >= md->end_subject)
2934 {
2935 SCHECK_PARTIAL();
2936 RRETURN(MATCH_NOMATCH);
2937 }
2938 GETCHARINC(c, eptr);
2939 if (c > 255)
2940 {
2941 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2942 }
2943 else
2944 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2945 }
2946 }
2947 else
2948 #endif
2949 /* Not UTF mode */
2950 {
2951 for (i = 1; i <= min; i++)
2952 {
2953 if (eptr >= md->end_subject)
2954 {
2955 SCHECK_PARTIAL();
2956 RRETURN(MATCH_NOMATCH);
2957 }
2958 c = *eptr++;
2959 #ifndef COMPILE_PCRE8
2960 if (c > 255)
2961 {
2962 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2963 }
2964 else
2965 #endif
2966 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2967 }
2968 }
2969
2970 /* If max == min we can continue with the main loop without the
2971 need to recurse. */
2972
2973 if (min == max) continue;
2974
2975 /* If minimizing, keep testing the rest of the expression and advancing
2976 the pointer while it matches the class. */
2977
2978 if (minimize)
2979 {
2980 #ifdef SUPPORT_UTF
2981 if (utf)
2982 {
2983 for (fi = min;; fi++)
2984 {
2985 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2986 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2987 if (fi >= max) RRETURN(MATCH_NOMATCH);
2988 if (eptr >= md->end_subject)
2989 {
2990 SCHECK_PARTIAL();
2991 RRETURN(MATCH_NOMATCH);
2992 }
2993 GETCHARINC(c, eptr);
2994 if (c > 255)
2995 {
2996 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2997 }
2998 else
2999 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3000 }
3001 }
3002 else
3003 #endif
3004 /* Not UTF mode */
3005 {
3006 for (fi = min;; fi++)
3007 {
3008 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3010 if (fi >= max) RRETURN(MATCH_NOMATCH);
3011 if (eptr >= md->end_subject)
3012 {
3013 SCHECK_PARTIAL();
3014 RRETURN(MATCH_NOMATCH);
3015 }
3016 c = *eptr++;
3017 #ifndef COMPILE_PCRE8
3018 if (c > 255)
3019 {
3020 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3021 }
3022 else
3023 #endif
3024 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3025 }
3026 }
3027 /* Control never gets here */
3028 }
3029
3030 /* If maximizing, find the longest possible run, then work backwards. */
3031
3032 else
3033 {
3034 pp = eptr;
3035
3036 #ifdef SUPPORT_UTF
3037 if (utf)
3038 {
3039 for (i = min; i < max; i++)
3040 {
3041 int len = 1;
3042 if (eptr >= md->end_subject)
3043 {
3044 SCHECK_PARTIAL();
3045 break;
3046 }
3047 GETCHARLEN(c, eptr, len);
3048 if (c > 255)
3049 {
3050 if (op == OP_CLASS) break;
3051 }
3052 else
3053 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3054 eptr += len;
3055 }
3056
3057 if (possessive) continue; /* No backtracking */
3058
3059 for (;;)
3060 {
3061 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3062 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3063 if (eptr-- == pp) break; /* Stop if tried at original pos */
3064 BACKCHAR(eptr);
3065 }
3066 }
3067 else
3068 #endif
3069 /* Not UTF mode */
3070 {
3071 for (i = min; i < max; i++)
3072 {
3073 if (eptr >= md->end_subject)
3074 {
3075 SCHECK_PARTIAL();
3076 break;
3077 }
3078 c = *eptr;
3079 #ifndef COMPILE_PCRE8
3080 if (c > 255)
3081 {
3082 if (op == OP_CLASS) break;
3083 }
3084 else
3085 #endif
3086 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3087 eptr++;
3088 }
3089
3090 if (possessive) continue; /* No backtracking */
3091
3092 while (eptr >= pp)
3093 {
3094 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3095 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3096 eptr--;
3097 }
3098 }
3099
3100 RRETURN(MATCH_NOMATCH);
3101 }
3102 #undef BYTE_MAP
3103 }
3104 /* Control never gets here */
3105
3106
3107 /* Match an extended character class. In the 8-bit library, this opcode is
3108 encountered only when UTF-8 mode is supported. In the 16-bit and
3109 32-bit libraries, codepoints greater than 255 may be encountered even when
3110 UTF is not supported. */
3111
3112 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3113 case OP_XCLASS:
3114 {
3115 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3116 ecode += GET(ecode, 1); /* Advance past the item */
3117
3118 switch (*ecode)
3119 {
3120 case OP_CRSTAR:
3121 case OP_CRMINSTAR:
3122 case OP_CRPLUS:
3123 case OP_CRMINPLUS:
3124 case OP_CRQUERY:
3125 case OP_CRMINQUERY:
3126 case OP_CRPOSSTAR:
3127 case OP_CRPOSPLUS:
3128 case OP_CRPOSQUERY:
3129 c = *ecode++ - OP_CRSTAR;
3130 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3131 else possessive = TRUE;
3132 min = rep_min[c]; /* Pick up values from tables; */
3133 max = rep_max[c]; /* zero for max => infinity */
3134 if (max == 0) max = INT_MAX;
3135 break;
3136
3137 case OP_CRRANGE:
3138 case OP_CRMINRANGE:
3139 case OP_CRPOSRANGE:
3140 minimize = (*ecode == OP_CRMINRANGE);
3141 possessive = (*ecode == OP_CRPOSRANGE);
3142 min = GET2(ecode, 1);
3143 max = GET2(ecode, 1 + IMM2_SIZE);
3144 if (max == 0) max = INT_MAX;
3145 ecode += 1 + 2 * IMM2_SIZE;
3146 break;
3147
3148 default: /* No repeat follows */
3149 min = max = 1;
3150 break;
3151 }
3152
3153 /* First, ensure the minimum number of matches are present. */
3154
3155 for (i = 1; i <= min; i++)
3156 {
3157 if (eptr >= md->end_subject)
3158 {
3159 SCHECK_PARTIAL();
3160 RRETURN(MATCH_NOMATCH);
3161 }
3162 GETCHARINCTEST(c, eptr);
3163 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3164 }
3165
3166 /* If max == min we can continue with the main loop without the
3167 need to recurse. */
3168
3169 if (min == max) continue;
3170
3171 /* If minimizing, keep testing the rest of the expression and advancing
3172 the pointer while it matches the class. */
3173
3174 if (minimize)
3175 {
3176 for (fi = min;; fi++)
3177 {
3178 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3179 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3180 if (fi >= max) RRETURN(MATCH_NOMATCH);
3181 if (eptr >= md->end_subject)
3182 {
3183 SCHECK_PARTIAL();
3184 RRETURN(MATCH_NOMATCH);
3185 }
3186 GETCHARINCTEST(c, eptr);
3187 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3188 }
3189 /* Control never gets here */
3190 }
3191
3192 /* If maximizing, find the longest possible run, then work backwards. */
3193
3194 else
3195 {
3196 pp = eptr;
3197 for (i = min; i < max; i++)
3198 {
3199 int len = 1;
3200 if (eptr >= md->end_subject)
3201 {
3202 SCHECK_PARTIAL();
3203 break;
3204 }
3205 #ifdef SUPPORT_UTF
3206 GETCHARLENTEST(c, eptr, len);
3207 #else
3208 c = *eptr;
3209 #endif
3210 if (!PRIV(xclass)(c, data, utf)) break;
3211 eptr += len;
3212 }
3213
3214 if (possessive) continue; /* No backtracking */
3215
3216 for(;;)
3217 {
3218 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3219 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3220 if (eptr-- == pp) break; /* Stop if tried at original pos */
3221 #ifdef SUPPORT_UTF
3222 if (utf) BACKCHAR(eptr);
3223 #endif
3224 }
3225 RRETURN(MATCH_NOMATCH);
3226 }
3227
3228 /* Control never gets here */
3229 }
3230 #endif /* End of XCLASS */
3231
3232 /* Match a single character, casefully */
3233
3234 case OP_CHAR:
3235 #ifdef SUPPORT_UTF
3236 if (utf)
3237 {
3238 length = 1;
3239 ecode++;
3240 GETCHARLEN(fc, ecode, length);
3241 if (length > md->end_subject - eptr)
3242 {
3243 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3244 RRETURN(MATCH_NOMATCH);
3245 }
3246 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3247 }
3248 else
3249 #endif
3250 /* Not UTF mode */
3251 {
3252 if (md->end_subject - eptr < 1)
3253 {
3254 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3255 RRETURN(MATCH_NOMATCH);
3256 }
3257 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3258 ecode += 2;
3259 }
3260 break;
3261
3262 /* Match a single character, caselessly. If we are at the end of the
3263 subject, give up immediately. */
3264
3265 case OP_CHARI:
3266 if (eptr >= md->end_subject)
3267 {
3268 SCHECK_PARTIAL();
3269 RRETURN(MATCH_NOMATCH);
3270 }
3271
3272 #ifdef SUPPORT_UTF
3273 if (utf)
3274 {
3275 length = 1;
3276 ecode++;
3277 GETCHARLEN(fc, ecode, length);
3278
3279 /* If the pattern character's value is < 128, we have only one byte, and
3280 we know that its other case must also be one byte long, so we can use the
3281 fast lookup table. We know that there is at least one byte left in the
3282 subject. */
3283
3284 if (fc < 128)
3285 {
3286 pcre_uint32 cc = UCHAR21(eptr);
3287 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3288 ecode++;
3289 eptr++;
3290 }
3291
3292 /* Otherwise we must pick up the subject character. Note that we cannot
3293 use the value of "length" to check for sufficient bytes left, because the
3294 other case of the character may have more or fewer bytes. */
3295
3296 else
3297 {
3298 pcre_uint32 dc;
3299 GETCHARINC(dc, eptr);
3300 ecode += length;
3301
3302 /* If we have Unicode property support, we can use it to test the other
3303 case of the character, if there is one. */
3304
3305 if (fc != dc)
3306 {
3307 #ifdef SUPPORT_UCP
3308 if (dc != UCD_OTHERCASE(fc))
3309 #endif
3310 RRETURN(MATCH_NOMATCH);
3311 }
3312 }
3313 }
3314 else
3315 #endif /* SUPPORT_UTF */
3316
3317 /* Not UTF mode */
3318 {
3319 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3320 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3321 eptr++;
3322 ecode += 2;
3323 }
3324 break;
3325
3326 /* Match a single character repeatedly. */
3327
3328 case OP_EXACT:
3329 case OP_EXACTI:
3330 min = max = GET2(ecode, 1);
3331 ecode += 1 + IMM2_SIZE;
3332 goto REPEATCHAR;
3333
3334 case OP_POSUPTO:
3335 case OP_POSUPTOI:
3336 possessive = TRUE;
3337 /* Fall through */
3338
3339 case OP_UPTO:
3340 case OP_UPTOI:
3341 case OP_MINUPTO:
3342 case OP_MINUPTOI:
3343 min = 0;
3344 max = GET2(ecode, 1);
3345 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3346 ecode += 1 + IMM2_SIZE;
3347 goto REPEATCHAR;
3348
3349 case OP_POSSTAR:
3350 case OP_POSSTARI:
3351 possessive = TRUE;
3352 min = 0;
3353 max = INT_MAX;
3354 ecode++;
3355 goto REPEATCHAR;
3356
3357 case OP_POSPLUS:
3358 case OP_POSPLUSI:
3359 possessive = TRUE;
3360 min = 1;
3361 max = INT_MAX;
3362 ecode++;
3363 goto REPEATCHAR;
3364
3365 case OP_POSQUERY:
3366 case OP_POSQUERYI:
3367 possessive = TRUE;
3368 min = 0;
3369 max = 1;
3370 ecode++;
3371 goto REPEATCHAR;
3372
3373 case OP_STAR:
3374 case OP_STARI:
3375 case OP_MINSTAR:
3376 case OP_MINSTARI:
3377 case OP_PLUS:
3378 case OP_PLUSI:
3379 case OP_MINPLUS:
3380 case OP_MINPLUSI:
3381 case OP_QUERY:
3382 case OP_QUERYI:
3383 case OP_MINQUERY:
3384 case OP_MINQUERYI:
3385 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3386 minimize = (c & 1) != 0;
3387 min = rep_min[c]; /* Pick up values from tables; */
3388 max = rep_max[c]; /* zero for max => infinity */
3389 if (max == 0) max = INT_MAX;
3390
3391 /* Common code for all repeated single-character matches. We first check
3392 for the minimum number of characters. If the minimum equals the maximum, we
3393 are done. Otherwise, if minimizing, check the rest of the pattern for a
3394 match; if there isn't one, advance up to the maximum, one character at a
3395 time.
3396
3397 If maximizing, advance up to the maximum number of matching characters,
3398 until eptr is past the end of the maximum run. If possessive, we are
3399 then done (no backing up). Otherwise, match at this position; anything
3400 other than no match is immediately returned. For nomatch, back up one
3401 character, unless we are matching \R and the last thing matched was
3402 \r\n, in which case, back up two bytes. When we reach the first optional
3403 character position, we can save stack by doing a tail recurse.
3404
3405 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3406 for speed. */
3407
3408 REPEATCHAR:
3409 #ifdef SUPPORT_UTF
3410 if (utf)
3411 {
3412 length = 1;
3413 charptr = ecode;
3414 GETCHARLEN(fc, ecode, length);
3415 ecode += length;
3416
3417 /* Handle multibyte character matching specially here. There is
3418 support for caseless matching if UCP support is present. */
3419
3420 if (length > 1)
3421 {
3422 #ifdef SUPPORT_UCP
3423 pcre_uint32 othercase;
3424 if (op >= OP_STARI && /* Caseless */
3425 (othercase = UCD_OTHERCASE(fc)) != fc)
3426 oclength = PRIV(ord2utf)(othercase, occhars);
3427 else oclength = 0;
3428 #endif /* SUPPORT_UCP */
3429
3430 for (i = 1; i <= min; i++)
3431 {
3432 if (eptr <= md->end_subject - length &&
3433 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3434 #ifdef SUPPORT_UCP
3435 else if (oclength > 0 &&
3436 eptr <= md->end_subject - oclength &&
3437 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3438 #endif /* SUPPORT_UCP */
3439 else
3440 {
3441 CHECK_PARTIAL();
3442 RRETURN(MATCH_NOMATCH);
3443 }
3444 }
3445
3446 if (min == max) continue;
3447
3448 if (minimize)
3449 {
3450 for (fi = min;; fi++)
3451 {
3452 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3453 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3454 if (fi >= max) RRETURN(MATCH_NOMATCH);
3455 if (eptr <= md->end_subject - length &&
3456 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3457 #ifdef SUPPORT_UCP
3458 else if (oclength > 0 &&
3459 eptr <= md->end_subject - oclength &&
3460 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3461 #endif /* SUPPORT_UCP */
3462 else
3463 {
3464 CHECK_PARTIAL();
3465 RRETURN(MATCH_NOMATCH);
3466 }
3467 }
3468 /* Control never gets here */
3469 }
3470
3471 else /* Maximize */
3472 {
3473 pp = eptr;
3474 for (i = min; i < max; i++)
3475 {
3476 if (eptr <= md->end_subject - length &&
3477 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3478 #ifdef SUPPORT_UCP
3479 else if (oclength > 0 &&
3480 eptr <= md->end_subject - oclength &&
3481 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3482 #endif /* SUPPORT_UCP */
3483 else
3484 {
3485 CHECK_PARTIAL();
3486 break;
3487 }
3488 }
3489
3490 if (possessive) continue; /* No backtracking */
3491 for(;;)
3492 {
3493 if (eptr == pp) goto TAIL_RECURSE;
3494 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3495 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3496 #ifdef SUPPORT_UCP
3497 eptr--;
3498 BACKCHAR(eptr);
3499 #else /* without SUPPORT_UCP */
3500 eptr -= length;
3501 #endif /* SUPPORT_UCP */
3502 }
3503 }
3504 /* Control never gets here */
3505 }
3506
3507 /* If the length of a UTF-8 character is 1, we fall through here, and
3508 obey the code as for non-UTF-8 characters below, though in this case the
3509 value of fc will always be < 128. */
3510 }
3511 else
3512 #endif /* SUPPORT_UTF */
3513 /* When not in UTF-8 mode, load a single-byte character. */
3514 fc = *ecode++;
3515
3516 /* The value of fc at this point is always one character, though we may
3517 or may not be in UTF mode. The code is duplicated for the caseless and
3518 caseful cases, for speed, since matching characters is likely to be quite
3519 common. First, ensure the minimum number of matches are present. If min =
3520 max, continue at the same level without recursing. Otherwise, if
3521 minimizing, keep trying the rest of the expression and advancing one
3522 matching character if failing, up to the maximum. Alternatively, if
3523 maximizing, find the maximum number of characters and work backwards. */
3524
3525 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3526 max, (char *)eptr));
3527
3528 if (op >= OP_STARI) /* Caseless */
3529 {
3530 #ifdef COMPILE_PCRE8
3531 /* fc must be < 128 if UTF is enabled. */
3532 foc = md->fcc[fc];
3533 #else
3534 #ifdef SUPPORT_UTF
3535 #ifdef SUPPORT_UCP
3536 if (utf && fc > 127)
3537 foc = UCD_OTHERCASE(fc);
3538 #else
3539 if (utf && fc > 127)
3540 foc = fc;
3541 #endif /* SUPPORT_UCP */
3542 else
3543 #endif /* SUPPORT_UTF */
3544 foc = TABLE_GET(fc, md->fcc, fc);
3545 #endif /* COMPILE_PCRE8 */
3546
3547 for (i = 1; i <= min; i++)
3548 {
3549 pcre_uint32 cc; /* Faster than pcre_uchar */
3550 if (eptr >= md->end_subject)
3551 {
3552 SCHECK_PARTIAL();
3553 RRETURN(MATCH_NOMATCH);
3554 }
3555 cc = UCHAR21TEST(eptr);
3556 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3557 eptr++;
3558 }
3559 if (min == max) continue;
3560 if (minimize)
3561 {
3562 for (fi = min;; fi++)
3563 {
3564 pcre_uint32 cc; /* Faster than pcre_uchar */
3565 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3566 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3567 if (fi >= max) RRETURN(MATCH_NOMATCH);
3568 if (eptr >= md->end_subject)
3569 {
3570 SCHECK_PARTIAL();
3571 RRETURN(MATCH_NOMATCH);
3572 }
3573 cc = UCHAR21TEST(eptr);
3574 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3575 eptr++;
3576 }
3577 /* Control never gets here */
3578 }
3579 else /* Maximize */
3580 {
3581 pp = eptr;
3582 for (i = min; i < max; i++)
3583 {
3584 pcre_uint32 cc; /* Faster than pcre_uchar */
3585 if (eptr >= md->end_subject)
3586 {
3587 SCHECK_PARTIAL();
3588 break;
3589 }
3590 cc = UCHAR21TEST(eptr);
3591 if (fc != cc && foc != cc) break;
3592 eptr++;
3593 }
3594 if (possessive) continue; /* No backtracking */
3595 for (;;)
3596 {
3597 if (eptr == pp) goto TAIL_RECURSE;
3598 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3599 eptr--;
3600 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3601 }
3602 /* Control never gets here */
3603 }
3604 }
3605
3606 /* Caseful comparisons (includes all multi-byte characters) */
3607
3608 else
3609 {
3610 for (i = 1; i <= min; i++)
3611 {
3612 if (eptr >= md->end_subject)
3613 {
3614 SCHECK_PARTIAL();
3615 RRETURN(MATCH_NOMATCH);
3616 }
3617 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3618 }
3619
3620 if (min == max) continue;
3621
3622 if (minimize)
3623 {
3624 for (fi = min;; fi++)
3625 {
3626 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3628 if (fi >= max) RRETURN(MATCH_NOMATCH);
3629 if (eptr >= md->end_subject)
3630 {
3631 SCHECK_PARTIAL();
3632 RRETURN(MATCH_NOMATCH);
3633 }
3634 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3635 }
3636 /* Control never gets here */
3637 }
3638 else /* Maximize */
3639 {
3640 pp = eptr;
3641 for (i = min; i < max; i++)
3642 {
3643 if (eptr >= md->end_subject)
3644 {
3645 SCHECK_PARTIAL();
3646 break;
3647 }
3648 if (fc != UCHAR21TEST(eptr)) break;
3649 eptr++;
3650 }
3651 if (possessive) continue; /* No backtracking */
3652 for (;;)
3653 {
3654 if (eptr == pp) goto TAIL_RECURSE;
3655 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3656 eptr--;
3657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3658 }
3659 /* Control never gets here */
3660 }
3661 }
3662 /* Control never gets here */
3663
3664 /* Match a negated single one-byte character. The character we are
3665 checking can be multibyte. */
3666
3667 case OP_NOT:
3668 case OP_NOTI:
3669 if (eptr >= md->end_subject)
3670 {
3671 SCHECK_PARTIAL();
3672 RRETURN(MATCH_NOMATCH);
3673 }
3674 #ifdef SUPPORT_UTF
3675 if (utf)
3676 {
3677 register pcre_uint32 ch, och;
3678
3679 ecode++;
3680 GETCHARINC(ch, ecode);
3681 GETCHARINC(c, eptr);
3682
3683 if (op == OP_NOT)
3684 {
3685 if (ch == c) RRETURN(MATCH_NOMATCH);
3686 }
3687 else
3688 {
3689 #ifdef SUPPORT_UCP
3690 if (ch > 127)
3691 och = UCD_OTHERCASE(ch);
3692 #else
3693 if (ch > 127)
3694 och = ch;
3695 #endif /* SUPPORT_UCP */
3696 else
3697 och = TABLE_GET(ch, md->fcc, ch);
3698 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3699 }
3700 }
3701 else
3702 #endif
3703 {
3704 register pcre_uint32 ch = ecode[1];
3705 c = *eptr++;
3706 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3707 RRETURN(MATCH_NOMATCH);
3708 ecode += 2;
3709 }
3710 break;
3711
3712 /* Match a negated single one-byte character repeatedly. This is almost a
3713 repeat of the code for a repeated single character, but I haven't found a
3714 nice way of commoning these up that doesn't require a test of the
3715 positive/negative option for each character match. Maybe that wouldn't add
3716 very much to the time taken, but character matching *is* what this is all
3717 about... */
3718
3719 case OP_NOTEXACT:
3720 case OP_NOTEXACTI:
3721 min = max = GET2(ecode, 1);
3722 ecode += 1 + IMM2_SIZE;
3723 goto REPEATNOTCHAR;
3724
3725 case OP_NOTUPTO:
3726 case OP_NOTUPTOI:
3727 case OP_NOTMINUPTO:
3728 case OP_NOTMINUPTOI:
3729 min = 0;
3730 max = GET2(ecode, 1);
3731 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3732 ecode += 1 + IMM2_SIZE;
3733 goto REPEATNOTCHAR;
3734
3735 case OP_NOTPOSSTAR:
3736 case OP_NOTPOSSTARI:
3737 possessive = TRUE;
3738 min = 0;
3739 max = INT_MAX;
3740 ecode++;
3741 goto REPEATNOTCHAR;
3742
3743 case OP_NOTPOSPLUS:
3744 case OP_NOTPOSPLUSI:
3745 possessive = TRUE;
3746 min = 1;
3747 max = INT_MAX;
3748 ecode++;
3749 goto REPEATNOTCHAR;
3750
3751 case OP_NOTPOSQUERY:
3752 case OP_NOTPOSQUERYI:
3753 possessive = TRUE;
3754 min = 0;
3755 max = 1;
3756 ecode++;
3757 goto REPEATNOTCHAR;
3758
3759 case OP_NOTPOSUPTO:
3760 case OP_NOTPOSUPTOI:
3761 possessive = TRUE;
3762 min = 0;
3763 max = GET2(ecode, 1);
3764 ecode += 1 + IMM2_SIZE;
3765 goto REPEATNOTCHAR;
3766
3767 case OP_NOTSTAR:
3768 case OP_NOTSTARI:
3769 case OP_NOTMINSTAR:
3770 case OP_NOTMINSTARI:
3771 case OP_NOTPLUS:
3772 case OP_NOTPLUSI:
3773 case OP_NOTMINPLUS:
3774 case OP_NOTMINPLUSI:
3775 case OP_NOTQUERY:
3776 case OP_NOTQUERYI:
3777 case OP_NOTMINQUERY:
3778 case OP_NOTMINQUERYI:
3779 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3780 minimize = (c & 1) != 0;
3781 min = rep_min[c]; /* Pick up values from tables; */
3782 max = rep_max[c]; /* zero for max => infinity */
3783 if (max == 0) max = INT_MAX;
3784
3785 /* Common code for all repeated single-byte matches. */
3786
3787 REPEATNOTCHAR:
3788 GETCHARINCTEST(fc, ecode);
3789
3790 /* The code is duplicated for the caseless and caseful cases, for speed,
3791 since matching characters is likely to be quite common. First, ensure the
3792 minimum number of matches are present. If min = max, continue at the same
3793 level without recursing. Otherwise, if minimizing, keep trying the rest of
3794 the expression and advancing one matching character if failing, up to the
3795 maximum. Alternatively, if maximizing, find the maximum number of
3796 characters and work backwards. */
3797
3798 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3799 max, (char *)eptr));
3800
3801 if (op >= OP_NOTSTARI) /* Caseless */
3802 {
3803 #ifdef SUPPORT_UTF
3804 #ifdef SUPPORT_UCP
3805 if (utf && fc > 127)
3806 foc = UCD_OTHERCASE(fc);
3807 #else
3808 if (utf && fc > 127)
3809 foc = fc;
3810 #endif /* SUPPORT_UCP */
3811 else
3812 #endif /* SUPPORT_UTF */
3813 foc = TABLE_GET(fc, md->fcc, fc);
3814
3815 #ifdef SUPPORT_UTF
3816 if (utf)
3817 {
3818 register pcre_uint32 d;
3819 for (i = 1; i <= min; i++)
3820 {
3821 if (eptr >= md->end_subject)
3822 {
3823 SCHECK_PARTIAL();
3824 RRETURN(MATCH_NOMATCH);
3825 }
3826 GETCHARINC(d, eptr);
3827 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3828 }
3829 }
3830 else
3831 #endif /* SUPPORT_UTF */
3832 /* Not UTF mode */
3833 {
3834 for (i = 1; i <= min; i++)
3835 {
3836 if (eptr >= md->end_subject)
3837 {
3838 SCHECK_PARTIAL();
3839 RRETURN(MATCH_NOMATCH);
3840 }
3841 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3842 eptr++;
3843 }
3844 }
3845
3846 if (min == max) continue;
3847
3848 if (minimize)
3849 {
3850 #ifdef SUPPORT_UTF
3851 if (utf)
3852 {
3853 register pcre_uint32 d;
3854 for (fi = min;; fi++)
3855 {
3856 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858 if (fi >= max) RRETURN(MATCH_NOMATCH);
3859 if (eptr >= md->end_subject)
3860 {
3861 SCHECK_PARTIAL();
3862 RRETURN(MATCH_NOMATCH);
3863 }
3864 GETCHARINC(d, eptr);
3865 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3866 }
3867 }
3868 else
3869 #endif /*SUPPORT_UTF */
3870 /* Not UTF mode */
3871 {
3872 for (fi = min;; fi++)
3873 {
3874 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3875 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3876 if (fi >= max) RRETURN(MATCH_NOMATCH);
3877 if (eptr >= md->end_subject)
3878 {
3879 SCHECK_PARTIAL();
3880 RRETURN(MATCH_NOMATCH);
3881 }
3882 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3883 eptr++;
3884 }
3885 }
3886 /* Control never gets here */
3887 }
3888
3889 /* Maximize case */
3890
3891 else
3892 {
3893 pp = eptr;
3894
3895 #ifdef SUPPORT_UTF
3896 if (utf)
3897 {
3898 register pcre_uint32 d;
3899 for (i = min; i < max; i++)
3900 {
3901 int len = 1;
3902 if (eptr >= md->end_subject)
3903 {
3904 SCHECK_PARTIAL();
3905 break;
3906 }
3907 GETCHARLEN(d, eptr, len);
3908 if (fc == d || (unsigned int)foc == d) break;
3909 eptr += len;
3910 }
3911 if (possessive) continue; /* No backtracking */
3912 for(;;)
3913 {
3914 if (eptr == pp) goto TAIL_RECURSE;
3915 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3917 eptr--;
3918 BACKCHAR(eptr);
3919 }
3920 }
3921 else
3922 #endif /* SUPPORT_UTF */
3923 /* Not UTF mode */
3924 {
3925 for (i = min; i < max; i++)
3926 {
3927 if (eptr >= md->end_subject)
3928 {
3929 SCHECK_PARTIAL();
3930 break;
3931 }
3932 if (fc == *eptr || foc == *eptr) break;
3933 eptr++;
3934 }
3935 if (possessive) continue; /* No backtracking */
3936 for (;;)
3937 {
3938 if (eptr == pp) goto TAIL_RECURSE;
3939 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3941 eptr--;
3942 }
3943 }
3944 /* Control never gets here */
3945 }
3946 }
3947
3948 /* Caseful comparisons */
3949
3950 else
3951 {
3952 #ifdef SUPPORT_UTF
3953 if (utf)
3954 {
3955 register pcre_uint32 d;
3956 for (i = 1; i <= min; i++)
3957 {
3958 if (eptr >= md->end_subject)
3959 {
3960 SCHECK_PARTIAL();
3961 RRETURN(MATCH_NOMATCH);
3962 }
3963 GETCHARINC(d, eptr);
3964 if (fc == d) RRETURN(MATCH_NOMATCH);
3965 }
3966 }
3967 else
3968 #endif
3969 /* Not UTF mode */
3970 {
3971 for (i = 1; i <= min; i++)
3972 {
3973 if (eptr >= md->end_subject)
3974 {
3975 SCHECK_PARTIAL();
3976 RRETURN(MATCH_NOMATCH);
3977 }
3978 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3979 }
3980 }
3981
3982 if (min == max) continue;
3983
3984 if (minimize)
3985 {
3986 #ifdef SUPPORT_UTF
3987 if (utf)
3988 {
3989 register pcre_uint32 d;
3990 for (fi = min;; fi++)
3991 {
3992 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3993 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3994 if (fi >= max) RRETURN(MATCH_NOMATCH);
3995 if (eptr >= md->end_subject)
3996 {
3997 SCHECK_PARTIAL();
3998 RRETURN(MATCH_NOMATCH);
3999 }
4000 GETCHARINC(d, eptr);
4001 if (fc == d) RRETURN(MATCH_NOMATCH);
4002 }
4003 }
4004 else
4005 #endif
4006 /* Not UTF mode */
4007 {
4008 for (fi = min;; fi++)
4009 {
4010 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4011 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4012 if (fi >= max) RRETURN(MATCH_NOMATCH);
4013 if (eptr >= md->end_subject)
4014 {
4015 SCHECK_PARTIAL();
4016 RRETURN(MATCH_NOMATCH);
4017 }
4018 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4019 }
4020 }
4021 /* Control never gets here */
4022 }
4023
4024 /* Maximize case */
4025
4026 else
4027 {
4028 pp = eptr;
4029
4030 #ifdef SUPPORT_UTF
4031 if (utf)
4032 {
4033 register pcre_uint32 d;
4034 for (i = min; i < max; i++)
4035 {
4036 int len = 1;
4037 if (eptr >= md->end_subject)
4038 {
4039 SCHECK_PARTIAL();
4040 break;
4041 }
4042 GETCHARLEN(d, eptr, len);
4043 if (fc == d) break;
4044 eptr += len;
4045 }
4046 if (possessive) continue; /* No backtracking */
4047 for(;;)
4048 {
4049 if (eptr == pp) goto TAIL_RECURSE;
4050 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4052 eptr--;
4053 BACKCHAR(eptr);
4054 }
4055 }
4056 else
4057 #endif
4058 /* Not UTF mode */
4059 {
4060 for (i = min; i < max; i++)
4061 {
4062 if (eptr >= md->end_subject)
4063 {
4064 SCHECK_PARTIAL();
4065 break;
4066 }
4067 if (fc == *eptr) break;
4068 eptr++;
4069 }
4070 if (possessive) continue; /* No backtracking */
4071 for (;;)
4072 {
4073 if (eptr == pp) goto TAIL_RECURSE;
4074 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4075 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4076 eptr--;
4077 }
4078 }
4079 /* Control never gets here */
4080 }
4081 }
4082 /* Control never gets here */
4083
4084 /* Match a single character type repeatedly; several different opcodes
4085 share code. This is very similar to the code for single characters, but we
4086 repeat it in the interests of efficiency. */
4087
4088 case OP_TYPEEXACT:
4089 min = max = GET2(ecode, 1);
4090 minimize = TRUE;
4091 ecode += 1 + IMM2_SIZE;
4092 goto REPEATTYPE;
4093
4094 case OP_TYPEUPTO:
4095 case OP_TYPEMINUPTO:
4096 min = 0;
4097 max = GET2(ecode, 1);
4098 minimize = *ecode == OP_TYPEMINUPTO;
4099 ecode += 1 + IMM2_SIZE;
4100 goto REPEATTYPE;
4101
4102 case OP_TYPEPOSSTAR:
4103 possessive = TRUE;
4104 min = 0;
4105 max = INT_MAX;
4106 ecode++;
4107 goto REPEATTYPE;
4108
4109 case OP_TYPEPOSPLUS:
4110 possessive = TRUE;
4111 min = 1;
4112 max = INT_MAX;
4113 ecode++;
4114 goto REPEATTYPE;
4115
4116 case OP_TYPEPOSQUERY:
4117 possessive = TRUE;
4118 min = 0;
4119 max = 1;
4120 ecode++;
4121 goto REPEATTYPE;
4122
4123 case OP_TYPEPOSUPTO:
4124 possessive = TRUE;
4125 min = 0;
4126 max = GET2(ecode, 1);
4127 ecode += 1 + IMM2_SIZE;
4128 goto REPEATTYPE;
4129
4130 case OP_TYPESTAR:
4131 case OP_TYPEMINSTAR:
4132 case OP_TYPEPLUS:
4133 case OP_TYPEMINPLUS:
4134 case OP_TYPEQUERY:
4135 case OP_TYPEMINQUERY:
4136 c = *ecode++ - OP_TYPESTAR;
4137 minimize = (c & 1) != 0;
4138 min = rep_min[c]; /* Pick up values from tables; */
4139 max = rep_max[c]; /* zero for max => infinity */
4140 if (max == 0) max = INT_MAX;
4141
4142 /* Common code for all repeated single character type matches. Note that
4143 in UTF-8 mode, '.' matches a character of any length, but for the other
4144 character types, the valid characters are all one-byte long. */
4145
4146 REPEATTYPE:
4147 ctype = *ecode++; /* Code for the character type */
4148
4149 #ifdef SUPPORT_UCP
4150 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4151 {
4152 prop_fail_result = ctype == OP_NOTPROP;
4153 prop_type = *ecode++;
4154 prop_value = *ecode++;
4155 }
4156 else prop_type = -1;
4157 #endif
4158
4159 /* First, ensure the minimum number of matches are present. Use inline
4160 code for maximizing the speed, and do the type test once at the start
4161 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4162 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4163 and single-bytes. */
4164
4165 if (min > 0)
4166 {
4167 #ifdef SUPPORT_UCP
4168 if (prop_type >= 0)
4169 {
4170 switch(prop_type)
4171 {
4172 case PT_ANY:
4173 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4174 for (i = 1; i <= min; i++)
4175 {
4176 if (eptr >= md->end_subject)
4177 {
4178 SCHECK_PARTIAL();
4179 RRETURN(MATCH_NOMATCH);
4180 }
4181 GETCHARINCTEST(c, eptr);
4182 }
4183 break;
4184
4185 case PT_LAMP:
4186 for (i = 1; i <= min; i++)
4187 {
4188 int chartype;
4189 if (eptr >= md->end_subject)
4190 {
4191 SCHECK_PARTIAL();
4192 RRETURN(MATCH_NOMATCH);
4193 }
4194 GETCHARINCTEST(c, eptr);
4195 chartype = UCD_CHARTYPE(c);
4196 if ((chartype == ucp_Lu ||
4197 chartype == ucp_Ll ||
4198 chartype == ucp_Lt) == prop_fail_result)
4199 RRETURN(MATCH_NOMATCH);
4200 }
4201 break;
4202
4203 case PT_GC:
4204 for (i = 1; i <= min; i++)
4205 {
4206 if (eptr >= md->end_subject)
4207 {
4208 SCHECK_PARTIAL();
4209 RRETURN(MATCH_NOMATCH);
4210 }
4211 GETCHARINCTEST(c, eptr);
4212 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4213 RRETURN(MATCH_NOMATCH);
4214 }
4215 break;
4216
4217 case PT_PC:
4218 for (i = 1; i <= min; i++)
4219 {
4220 if (eptr >= md->end_subject)
4221 {
4222 SCHECK_PARTIAL();
4223 RRETURN(MATCH_NOMATCH);
4224 }
4225 GETCHARINCTEST(c, eptr);
4226 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4227 RRETURN(MATCH_NOMATCH);
4228 }
4229 break;
4230
4231 case PT_SC:
4232 for (i = 1; i <= min; i++)
4233 {
4234 if (eptr >= md->end_subject)
4235 {
4236 SCHECK_PARTIAL();
4237 RRETURN(MATCH_NOMATCH);
4238 }
4239 GETCHARINCTEST(c, eptr);
4240 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4241 RRETURN(MATCH_NOMATCH);
4242 }
4243 break;
4244
4245 case PT_ALNUM:
4246 for (i = 1; i <= min; i++)
4247 {
4248 int category;
4249 if (eptr >= md->end_subject)
4250 {
4251 SCHECK_PARTIAL();
4252 RRETURN(MATCH_NOMATCH);
4253 }
4254 GETCHARINCTEST(c, eptr);
4255 category = UCD_CATEGORY(c);
4256 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4257 RRETURN(MATCH_NOMATCH);
4258 }
4259 break;
4260
4261 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4262 which means that Perl space and POSIX space are now identical. PCRE
4263 was changed at release 8.34. */
4264
4265 case PT_SPACE: /* Perl space */
4266 case PT_PXSPACE: /* POSIX space */
4267 for (i = 1; i <= min; i++)
4268 {
4269 if (eptr >= md->end_subject)
4270 {
4271 SCHECK_PARTIAL();
4272 RRETURN(MATCH_NOMATCH);
4273 }
4274 GETCHARINCTEST(c, eptr);
4275 switch(c)
4276 {
4277 HSPACE_CASES:
4278 VSPACE_CASES:
4279 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4280 break;
4281
4282 default:
4283 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4284 RRETURN(MATCH_NOMATCH);
4285 break;
4286 }
4287 }
4288 break;
4289
4290 case PT_WORD:
4291 for (i = 1; i <= min; i++)
4292 {
4293 int category;
4294 if (eptr >= md->end_subject)
4295 {
4296 SCHECK_PARTIAL();
4297 RRETURN(MATCH_NOMATCH);
4298 }
4299 GETCHARINCTEST(c, eptr);
4300 category = UCD_CATEGORY(c);
4301 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4302 == prop_fail_result)
4303 RRETURN(MATCH_NOMATCH);
4304 }
4305 break;
4306
4307 case PT_CLIST:
4308 for (i = 1; i <= min; i++)
4309 {
4310 const pcre_uint32 *cp;
4311 if (eptr >= md->end_subject)
4312 {
4313 SCHECK_PARTIAL();
4314 RRETURN(MATCH_NOMATCH);
4315 }
4316 GETCHARINCTEST(c, eptr);
4317 cp = PRIV(ucd_caseless_sets) + prop_value;
4318 for (;;)
4319 {
4320 if (c < *cp)
4321 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4322 if (c == *cp++)
4323 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4324 }
4325 }
4326 break;
4327
4328 case PT_UCNC:
4329 for (i = 1; i <= min; i++)
4330 {
4331 if (eptr >= md->end_subject)
4332 {
4333 SCHECK_PARTIAL();
4334 RRETURN(MATCH_NOMATCH);
4335 }
4336 GETCHARINCTEST(c, eptr);
4337 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4338 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4339 c >= 0xe000) == prop_fail_result)
4340 RRETURN(MATCH_NOMATCH);
4341 }
4342 break;
4343
4344 /* This should not occur */
4345
4346 default:
4347 RRETURN(PCRE_ERROR_INTERNAL);
4348 }
4349 }
4350
4351 /* Match extended Unicode sequences. We will get here only if the
4352 support is in the binary; otherwise a compile-time error occurs. */
4353
4354 else if (ctype == OP_EXTUNI)
4355 {
4356 for (i = 1; i <= min; i++)
4357 {
4358 if (eptr >= md->end_subject)
4359 {
4360 SCHECK_PARTIAL();
4361 RRETURN(MATCH_NOMATCH);
4362 }
4363 else
4364 {
4365 int lgb, rgb;
4366 GETCHARINCTEST(c, eptr);
4367 lgb = UCD_GRAPHBREAK(c);
4368 while (eptr < md->end_subject)
4369 {
4370 int len = 1;
4371 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4372 rgb = UCD_GRAPHBREAK(c);
4373 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4374 lgb = rgb;
4375 eptr += len;
4376 }
4377 }
4378 CHECK_PARTIAL();
4379 }
4380 }
4381
4382 else
4383 #endif /* SUPPORT_UCP */
4384
4385 /* Handle all other cases when the coding is UTF-8 */
4386
4387 #ifdef SUPPORT_UTF
4388 if (utf) switch(ctype)
4389 {
4390 case OP_ANY:
4391 for (i = 1; i <= min; i++)
4392 {
4393 if (eptr >= md->end_subject)
4394 {
4395 SCHECK_PARTIAL();
4396 RRETURN(MATCH_NOMATCH);
4397 }
4398 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4399 if (md->partial != 0 &&
4400 eptr + 1 >= md->end_subject &&
4401 NLBLOCK->nltype == NLTYPE_FIXED &&
4402 NLBLOCK->nllen == 2 &&
4403 UCHAR21(eptr) == NLBLOCK->nl[0])
4404 {
4405 md->hitend = TRUE;
4406 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4407 }
4408 eptr++;
4409 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4410 }
4411 break;
4412
4413 case OP_ALLANY:
4414 for (i = 1; i <= min; i++)
4415 {
4416 if (eptr >= md->end_subject)
4417 {
4418 SCHECK_PARTIAL();
4419 RRETURN(MATCH_NOMATCH);
4420 }
4421 eptr++;
4422 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4423 }
4424 break;
4425
4426 case OP_ANYBYTE:
4427 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4428 eptr += min;
4429 break;
4430
4431 case OP_ANYNL:
4432 for (i = 1; i <= min; i++)
4433 {
4434 if (eptr >= md->end_subject)
4435 {
4436 SCHECK_PARTIAL();
4437 RRETURN(MATCH_NOMATCH);
4438 }
4439 GETCHARINC(c, eptr);
4440 switch(c)
4441 {
4442 default: RRETURN(MATCH_NOMATCH);
4443
4444 case CHAR_CR:
4445 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4446 break;
4447
4448 case CHAR_LF:
4449 break;
4450
4451 case CHAR_VT:
4452 case CHAR_FF:
4453 case CHAR_NEL:
4454 #ifndef EBCDIC
4455 case 0x2028:
4456 case 0x2029:
4457 #endif /* Not EBCDIC */
4458 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4459 break;
4460 }
4461 }
4462 break;
4463
4464 case OP_NOT_HSPACE:
4465 for (i = 1; i <= min; i++)
4466 {
4467 if (eptr >= md->end_subject)
4468 {
4469 SCHECK_PARTIAL();
4470 RRETURN(MATCH_NOMATCH);
4471 }
4472 GETCHARINC(c, eptr);
4473 switch(c)
4474 {
4475 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4476 default: break;
4477 }
4478 }
4479 break;
4480
4481 case OP_HSPACE:
4482 for (i = 1; i <= min; i++)
4483 {
4484 if (eptr >= md->end_subject)
4485 {
4486 SCHECK_PARTIAL();
4487 RRETURN(MATCH_NOMATCH);
4488 }
4489 GETCHARINC(c, eptr);
4490 switch(c)
4491 {
4492 HSPACE_CASES: break; /* Byte and multibyte cases */
4493 default: RRETURN(MATCH_NOMATCH);
4494 }
4495 }
4496 break;
4497
4498 case OP_NOT_VSPACE:
4499 for (i = 1; i <= min; i++)
4500 {
4501 if (eptr >= md->end_subject)
4502 {
4503 SCHECK_PARTIAL();
4504 RRETURN(MATCH_NOMATCH);
4505 }
4506 GETCHARINC(c, eptr);
4507 switch(c)
4508 {
4509 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4510 default: break;
4511 }
4512 }
4513 break;
4514
4515 case OP_VSPACE:
4516 for (i = 1; i <= min; i++)
4517 {
4518 if (eptr >= md->end_subject)
4519 {
4520 SCHECK_PARTIAL();
4521 RRETURN(MATCH_NOMATCH);
4522 }
4523 GETCHARINC(c, eptr);
4524 switch(c)
4525 {
4526 VSPACE_CASES: break;
4527 default: RRETURN(MATCH_NOMATCH);
4528 }
4529 }
4530 break;
4531
4532 case OP_NOT_DIGIT:
4533 for (i = 1; i <= min; i++)
4534 {
4535 if (eptr >= md->end_subject)
4536 {
4537 SCHECK_PARTIAL();
4538 RRETURN(MATCH_NOMATCH);
4539 }
4540 GETCHARINC(c, eptr);
4541 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4542 RRETURN(MATCH_NOMATCH);
4543 }
4544 break;
4545
4546 case OP_DIGIT:
4547 for (i = 1; i <= min; i++)
4548 {
4549 pcre_uint32 cc;
4550 if (eptr >= md->end_subject)
4551 {
4552 SCHECK_PARTIAL();
4553 RRETURN(MATCH_NOMATCH);
4554 }
4555 cc = UCHAR21(eptr);
4556 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4557 RRETURN(MATCH_NOMATCH);
4558 eptr++;
4559 /* No need to skip more bytes - we know it's a 1-byte character */
4560 }
4561 break;
4562
4563 case OP_NOT_WHITESPACE:
4564 for (i = 1; i <= min; i++)
4565 {
4566 pcre_uint32 cc;
4567 if (eptr >= md->end_subject)
4568 {
4569 SCHECK_PARTIAL();
4570 RRETURN(MATCH_NOMATCH);
4571 }
4572 cc = UCHAR21(eptr);
4573 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4574 RRETURN(MATCH_NOMATCH);
4575 eptr++;
4576 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4577 }
4578 break;
4579
4580 case OP_WHITESPACE:
4581 for (i = 1; i <= min; i++)
4582 {
4583 pcre_uint32 cc;
4584 if (eptr >= md->end_subject)
4585 {
4586 SCHECK_PARTIAL();
4587 RRETURN(MATCH_NOMATCH);
4588 }
4589 cc = UCHAR21(eptr);
4590 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4591 RRETURN(MATCH_NOMATCH);
4592 eptr++;
4593 /* No need to skip more bytes - we know it's a 1-byte character */
4594 }
4595 break;
4596
4597 case OP_NOT_WORDCHAR:
4598 for (i = 1; i <= min; i++)
4599 {
4600 pcre_uint32 cc;
4601 if (eptr >= md->end_subject)
4602 {
4603 SCHECK_PARTIAL();
4604 RRETURN(MATCH_NOMATCH);
4605 }
4606 cc = UCHAR21(eptr);
4607 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4608 RRETURN(MATCH_NOMATCH);
4609 eptr++;
4610 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4611 }
4612 break;
4613
4614 case OP_WORDCHAR:
4615 for (i = 1; i <= min; i++)
4616 {
4617 pcre_uint32 cc;
4618 if (eptr >= md->end_subject)
4619 {
4620 SCHECK_PARTIAL();
4621 RRETURN(MATCH_NOMATCH);
4622 }
4623 cc = UCHAR21(eptr);
4624 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4625 RRETURN(MATCH_NOMATCH);
4626 eptr++;
4627 /* No need to skip more bytes - we know it's a 1-byte character */
4628 }
4629 break;
4630
4631 default:
4632 RRETURN(PCRE_ERROR_INTERNAL);
4633 } /* End switch(ctype) */
4634
4635 else
4636 #endif /* SUPPORT_UTF */
4637
4638 /* Code for the non-UTF-8 case for minimum matching of operators other
4639 than OP_PROP and OP_NOTPROP. */
4640
4641 switch(ctype)
4642 {
4643 case OP_ANY:
4644 for (i = 1; i <= min; i++)
4645 {
4646 if (eptr >= md->end_subject)
4647 {
4648 SCHECK_PARTIAL();
4649 RRETURN(MATCH_NOMATCH);
4650 }
4651 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4652 if (md->partial != 0 &&
4653 eptr + 1 >= md->end_subject &&
4654 NLBLOCK->nltype == NLTYPE_FIXED &&
4655 NLBLOCK->nllen == 2 &&
4656 *eptr == NLBLOCK->nl[0])
4657 {
4658 md->hitend = TRUE;
4659 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4660 }
4661 eptr++;
4662 }
4663 break;
4664
4665 case OP_ALLANY:
4666 if (eptr > md->end_subject - min)
4667 {
4668 SCHECK_PARTIAL();
4669 RRETURN(MATCH_NOMATCH);
4670 }
4671 eptr += min;
4672 break;
4673
4674 case OP_ANYBYTE:
4675 if (eptr > md->end_subject - min)
4676 {
4677 SCHECK_PARTIAL();
4678 RRETURN(MATCH_NOMATCH);
4679 }
4680 eptr += min;
4681 break;
4682
4683 case OP_ANYNL:
4684 for (i = 1; i <= min; i++)
4685 {
4686 if (eptr >= md->end_subject)
4687 {
4688 SCHECK_PARTIAL();
4689 RRETURN(MATCH_NOMATCH);
4690 }
4691 switch(*eptr++)
4692 {
4693 default: RRETURN(MATCH_NOMATCH);
4694
4695 case CHAR_CR:
4696 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4697 break;
4698
4699 case CHAR_LF:
4700 break;
4701
4702 case CHAR_VT:
4703 case CHAR_FF:
4704 case CHAR_NEL:
4705 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4706 case 0x2028:
4707 case 0x2029:
4708 #endif
4709 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4710 break;
4711 }
4712 }
4713 break;
4714
4715 case OP_NOT_HSPACE:
4716 for (i = 1; i <= min; i++)
4717 {
4718 if (eptr >= md->end_subject)
4719 {
4720 SCHECK_PARTIAL();
4721 RRETURN(MATCH_NOMATCH);
4722 }
4723 switch(*eptr++)
4724 {
4725 default: break;
4726 HSPACE_BYTE_CASES:
4727 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4728 HSPACE_MULTIBYTE_CASES:
4729 #endif
4730 RRETURN(MATCH_NOMATCH);
4731 }
4732 }
4733 break;
4734
4735 case OP_HSPACE:
4736 for (i = 1; i <= min; i++)
4737 {
4738 if (eptr >= md->end_subject)
4739 {
4740 SCHECK_PARTIAL();
4741 RRETURN(MATCH_NOMATCH);
4742 }
4743 switch(*eptr++)
4744 {
4745 default: RRETURN(MATCH_NOMATCH);
4746 HSPACE_BYTE_CASES:
4747 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4748 HSPACE_MULTIBYTE_CASES:
4749 #endif
4750 break;
4751 }
4752 }
4753 break;
4754
4755 case OP_NOT_VSPACE:
4756 for (i = 1; i <= min; i++)
4757 {
4758 if (eptr >= md->end_subject)
4759 {
4760 SCHECK_PARTIAL();
4761 RRETURN(MATCH_NOMATCH);
4762 }
4763 switch(*eptr++)
4764 {
4765 VSPACE_BYTE_CASES:
4766 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4767 VSPACE_MULTIBYTE_CASES:
4768 #endif
4769 RRETURN(MATCH_NOMATCH);
4770 default: break;
4771 }
4772 }
4773 break;
4774
4775 case OP_VSPACE:
4776 for (i = 1; i <= min; i++)
4777 {
4778 if (eptr >= md->end_subject)
4779 {
4780 SCHECK_PARTIAL();
4781 RRETURN(MATCH_NOMATCH);
4782 }
4783 switch(*eptr++)
4784 {
4785 default: RRETURN(MATCH_NOMATCH);
4786 VSPACE_BYTE_CASES:
4787 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4788 VSPACE_MULTIBYTE_CASES:
4789 #endif
4790 break;
4791 }
4792 }
4793 break;
4794
4795 case OP_NOT_DIGIT:
4796 for (i = 1; i <= min; i++)
4797 {
4798 if (eptr >= md->end_subject)
4799 {
4800 SCHECK_PARTIAL();
4801 RRETURN(MATCH_NOMATCH);
4802 }
4803 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4804 RRETURN(MATCH_NOMATCH);
4805 eptr++;
4806 }
4807 break;
4808
4809 case OP_DIGIT:
4810 for (i = 1; i <= min; i++)
4811 {
4812 if (eptr >= md->end_subject)
4813 {
4814 SCHECK_PARTIAL();
4815 RRETURN(MATCH_NOMATCH);
4816 }
4817 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4818 RRETURN(MATCH_NOMATCH);
4819 eptr++;
4820 }
4821 break;
4822
4823 case OP_NOT_WHITESPACE:
4824 for (i = 1; i <= min; i++)
4825 {
4826 if (eptr >= md->end_subject)
4827 {
4828 SCHECK_PARTIAL();
4829 RRETURN(MATCH_NOMATCH);
4830 }
4831 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4832 RRETURN(MATCH_NOMATCH);
4833 eptr++;
4834 }
4835 break;
4836
4837 case OP_WHITESPACE:
4838 for (i = 1; i <= min; i++)
4839 {
4840 if (eptr >= md->end_subject)
4841 {
4842 SCHECK_PARTIAL();
4843 RRETURN(MATCH_NOMATCH);
4844 }
4845 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4846 RRETURN(MATCH_NOMATCH);
4847 eptr++;
4848 }
4849 break;
4850
4851 case OP_NOT_WORDCHAR:
4852 for (i = 1; i <= min; i++)
4853 {
4854 if (eptr >= md->end_subject)
4855 {
4856 SCHECK_PARTIAL();
4857 RRETURN(MATCH_NOMATCH);
4858 }
4859 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4860 RRETURN(MATCH_NOMATCH);
4861 eptr++;
4862 }
4863 break;
4864
4865 case OP_WORDCHAR:
4866 for (i = 1; i <= min; i++)
4867 {
4868 if (eptr >= md->end_subject)
4869 {
4870 SCHECK_PARTIAL();
4871 RRETURN(MATCH_NOMATCH);
4872 }
4873 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4874 RRETURN(MATCH_NOMATCH);
4875 eptr++;
4876 }
4877 break;
4878
4879 default:
4880 RRETURN(PCRE_ERROR_INTERNAL);
4881 }
4882 }
4883
4884 /* If min = max, continue at the same level without recursing */
4885
4886 if (min == max) continue;
4887
4888 /* If minimizing, we have to test the rest of the pattern before each
4889 subsequent match. Again, separate the UTF-8 case for speed, and also
4890 separate the UCP cases. */
4891
4892 if (minimize)
4893 {
4894 #ifdef SUPPORT_UCP
4895 if (prop_type >= 0)
4896 {
4897 switch(prop_type)
4898 {
4899 case PT_ANY:
4900 for (fi = min;; fi++)
4901 {
4902 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4904 if (fi >= max) RRETURN(MATCH_NOMATCH);
4905 if (eptr >= md->end_subject)
4906 {
4907 SCHECK_PARTIAL();
4908 RRETURN(MATCH_NOMATCH);
4909 }
4910 GETCHARINCTEST(c, eptr);
4911 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4912 }
4913 /* Control never gets here */
4914
4915 case PT_LAMP:
4916 for (fi = min;; fi++)
4917 {
4918 int chartype;
4919 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4921 if (fi >= max) RRETURN(MATCH_NOMATCH);
4922 if (eptr >= md->end_subject)
4923 {
4924 SCHECK_PARTIAL();
4925 RRETURN(MATCH_NOMATCH);
4926 }
4927 GETCHARINCTEST(c, eptr);
4928 chartype = UCD_CHARTYPE(c);
4929 if ((chartype == ucp_Lu ||
4930 chartype == ucp_Ll ||
4931 chartype == ucp_Lt) == prop_fail_result)
4932 RRETURN(MATCH_NOMATCH);
4933 }
4934 /* Control never gets here */
4935
4936 case PT_GC:
4937 for (fi = min;; fi++)
4938 {
4939 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4940 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4941 if (fi >= max) RRETURN(MATCH_NOMATCH);
4942 if (eptr >= md->end_subject)
4943 {
4944 SCHECK_PARTIAL();
4945 RRETURN(MATCH_NOMATCH);
4946 }
4947 GETCHARINCTEST(c, eptr);
4948 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4949 RRETURN(MATCH_NOMATCH);
4950 }
4951 /* Control never gets here */
4952
4953 case PT_PC:
4954 for (fi = min;; fi++)
4955 {
4956 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4958 if (fi >= max) RRETURN(MATCH_NOMATCH);
4959 if (eptr >= md->end_subject)
4960 {
4961 SCHECK_PARTIAL();
4962 RRETURN(MATCH_NOMATCH);
4963 }
4964 GETCHARINCTEST(c, eptr);
4965 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4966 RRETURN(MATCH_NOMATCH);
4967 }
4968 /* Control never gets here */
4969
4970 case PT_SC:
4971 for (fi = min;; fi++)
4972 {
4973 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4975 if (fi >= max) RRETURN(MATCH_NOMATCH);
4976 if (eptr >= md->end_subject)
4977 {
4978 SCHECK_PARTIAL();
4979 RRETURN(MATCH_NOMATCH);
4980 }
4981 GETCHARINCTEST(c, eptr);
4982 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4983 RRETURN(MATCH_NOMATCH);
4984 }
4985 /* Control never gets here */
4986
4987 case PT_ALNUM:
4988 for (fi = min;; fi++)
4989 {
4990 int category;
4991 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4992 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4993 if (fi >= max) RRETURN(MATCH_NOMATCH);
4994 if (eptr >= md->end_subject)
4995 {
4996 SCHECK_PARTIAL();
4997 RRETURN(MATCH_NOMATCH);
4998 }
4999 GETCHARINCTEST(c, eptr);
5000 category = UCD_CATEGORY(c);
5001 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5002 RRETURN(MATCH_NOMATCH);
5003 }
5004 /* Control never gets here */
5005
5006 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5007 which means that Perl space and POSIX space are now identical. PCRE
5008 was changed at release 8.34. */
5009
5010 case PT_SPACE: /* Perl space */
5011 case PT_PXSPACE: /* POSIX space */
5012 for (fi = min;; fi++)
5013 {
5014 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5015 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5016 if (fi >= max) RRETURN(MATCH_NOMATCH);
5017 if (eptr >= md->end_subject)
5018 {
5019 SCHECK_PARTIAL();
5020 RRETURN(MATCH_NOMATCH);
5021 }
5022 GETCHARINCTEST(c, eptr);
5023 switch(c)
5024 {
5025 HSPACE_CASES:
5026 VSPACE_CASES:
5027 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5028 break;
5029
5030 default:
5031 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5032 RRETURN(MATCH_NOMATCH);
5033 break;
5034 }
5035 }
5036 /* Control never gets here */
5037
5038 case PT_WORD:
5039 for (fi = min;; fi++)
5040 {
5041 int category;
5042 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5044 if (fi >= max) RRETURN(MATCH_NOMATCH);
5045 if (eptr >= md->end_subject)
5046 {
5047 SCHECK_PARTIAL();
5048 RRETURN(MATCH_NOMATCH);
5049 }
5050 GETCHARINCTEST(c, eptr);
5051 category = UCD_CATEGORY(c);
5052 if ((category == ucp_L ||
5053 category == ucp_N ||
5054 c == CHAR_UNDERSCORE)
5055 == prop_fail_result)
5056 RRETURN(MATCH_NOMATCH);
5057 }
5058 /* Control never gets here */
5059
5060 case PT_CLIST:
5061 for (fi = min;; fi++)
5062 {
5063 const pcre_uint32 *cp;
5064 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5065 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5066 if (fi >= max) RRETURN(MATCH_NOMATCH);
5067 if (eptr >= md->end_subject)
5068 {
5069 SCHECK_PARTIAL();
5070 RRETURN(MATCH_NOMATCH);
5071 }
5072 GETCHARINCTEST(c, eptr);
5073 cp = PRIV(ucd_caseless_sets) + prop_value;
5074 for (;;)
5075 {
5076 if (c < *cp)
5077 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5078 if (c == *cp++)
5079 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5080 }
5081 }
5082 /* Control never gets here */
5083
5084 case PT_UCNC:
5085 for (fi = min;; fi++)
5086 {
5087 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5088 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5089 if (fi >= max) RRETURN(MATCH_NOMATCH);
5090 if (eptr >= md->end_subject)
5091 {
5092 SCHECK_PARTIAL();
5093 RRETURN(MATCH_NOMATCH);
5094 }
5095 GETCHARINCTEST(c, eptr);
5096 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5097 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5098 c >= 0xe000) == prop_fail_result)
5099 RRETURN(MATCH_NOMATCH);
5100 }
5101 /* Control never gets here */
5102
5103 /* This should never occur */
5104 default:
5105 RRETURN(PCRE_ERROR_INTERNAL);
5106 }
5107 }
5108
5109 /* Match extended Unicode sequences. We will get here only if the
5110 support is in the binary; otherwise a compile-time error occurs. */
5111
5112 else if (ctype == OP_EXTUNI)
5113 {
5114 for (fi = min;; fi++)
5115 {
5116 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5117 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5118 if (fi >= max) RRETURN(MATCH_NOMATCH);
5119 if (eptr >= md->end_subject)
5120 {
5121 SCHECK_PARTIAL();
5122 RRETURN(MATCH_NOMATCH);
5123 }
5124 else
5125 {
5126 int lgb, rgb;
5127 GETCHARINCTEST(c, eptr);
5128 lgb = UCD_GRAPHBREAK(c);
5129 while (eptr < md->end_subject)
5130 {
5131 int len = 1;
5132 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5133 rgb = UCD_GRAPHBREAK(c);
5134 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5135 lgb = rgb;
5136 eptr += len;
5137 }
5138 }
5139 CHECK_PARTIAL();
5140 }
5141 }
5142 else
5143 #endif /* SUPPORT_UCP */
5144
5145 #ifdef SUPPORT_UTF
5146 if (utf)
5147 {
5148 for (fi = min;; fi++)
5149 {
5150 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5151 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5152 if (fi >= max) RRETURN(MATCH_NOMATCH);
5153 if (eptr >= md->end_subject)
5154 {
5155 SCHECK_PARTIAL();
5156 RRETURN(MATCH_NOMATCH);
5157 }
5158 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5159 RRETURN(MATCH_NOMATCH);
5160 GETCHARINC(c, eptr);
5161 switch(ctype)
5162 {
5163 case OP_ANY: /* This is the non-NL case */
5164 if (md->partial != 0 && /* Take care with CRLF partial */
5165 eptr >= md->end_subject &&
5166 NLBLOCK->nltype == NLTYPE_FIXED &&
5167 NLBLOCK->nllen == 2 &&
5168 c == NLBLOCK->nl[0])
5169 {
5170 md->hitend = TRUE;
5171 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5172 }
5173 break;
5174
5175 case OP_ALLANY:
5176 case OP_ANYBYTE:
5177 break;
5178
5179 case OP_ANYNL:
5180 switch(c)
5181 {
5182 default: RRETURN(MATCH_NOMATCH);
5183 case CHAR_CR:
5184 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5185 break;
5186
5187 case CHAR_LF:
5188 break;
5189
5190 case CHAR_VT:
5191 case CHAR_FF:
5192 case CHAR_NEL:
5193 #ifndef EBCDIC
5194 case 0x2028:
5195 case 0x2029:
5196 #endif /* Not EBCDIC */
5197 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5198 break;
5199 }
5200 break;
5201
5202 case OP_NOT_HSPACE:
5203 switch(c)
5204 {
5205 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5206 default: break;
5207 }
5208 break;
5209
5210 case OP_HSPACE:
5211 switch(c)
5212 {
5213 HSPACE_CASES: break;
5214 default: RRETURN(MATCH_NOMATCH);
5215 }
5216 break;
5217
5218 case OP_NOT_VSPACE:
5219 switch(c)
5220 {
5221 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5222 default: break;
5223 }
5224 break;
5225
5226 case OP_VSPACE:
5227 switch(c)
5228 {
5229 VSPACE_CASES: break;
5230 default: RRETURN(MATCH_NOMATCH);
5231 }
5232 break;
5233
5234 case OP_NOT_DIGIT:
5235 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5236 RRETURN(MATCH_NOMATCH);
5237 break;
5238
5239 case OP_DIGIT:
5240 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5241 RRETURN(MATCH_NOMATCH);
5242 break;
5243
5244 case OP_NOT_WHITESPACE:
5245 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5246 RRETURN(MATCH_NOMATCH);
5247 break;
5248
5249 case OP_WHITESPACE:
5250 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5251 RRETURN(MATCH_NOMATCH);
5252 break;
5253
5254 case OP_NOT_WORDCHAR:
5255 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5256 RRETURN(MATCH_NOMATCH);
5257 break;
5258
5259 case OP_WORDCHAR:
5260 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5261 RRETURN(MATCH_NOMATCH);
5262 break;
5263
5264 default:
5265 RRETURN(PCRE_ERROR_INTERNAL);
5266 }
5267 }
5268 }
5269 else
5270 #endif
5271 /* Not UTF mode */
5272 {
5273 for (fi = min;; fi++)
5274 {
5275 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5276 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5277 if (fi >= max) RRETURN(MATCH_NOMATCH);
5278 if (eptr >= md->end_subject)
5279 {
5280 SCHECK_PARTIAL();
5281 RRETURN(MATCH_NOMATCH);
5282 }
5283 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5284 RRETURN(MATCH_NOMATCH);
5285 c = *eptr++;
5286 switch(ctype)
5287 {
5288 case OP_ANY: /* This is the non-NL case */
5289 if (md->partial != 0 && /* Take care with CRLF partial */
5290 eptr >= md->end_subject &&
5291 NLBLOCK->nltype == NLTYPE_FIXED &&
5292 NLBLOCK->nllen == 2 &&
5293 c == NLBLOCK->nl[0])
5294 {
5295 md->hitend = TRUE;
5296 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5297 }
5298 break;
5299
5300 case OP_ALLANY:
5301 case OP_ANYBYTE:
5302 break;
5303
5304 case OP_ANYNL:
5305 switch(c)
5306 {
5307 default: RRETURN(MATCH_NOMATCH);
5308 case CHAR_CR:
5309 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5310 break;
5311
5312 case CHAR_LF:
5313 break;
5314
5315 case CHAR_VT:
5316 case CHAR_FF:
5317 case CHAR_NEL:
5318 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5319 case 0x2028:
5320 case 0x2029:
5321 #endif
5322 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5323 break;
5324 }
5325 break;
5326
5327 case OP_NOT_HSPACE:
5328 switch(c)
5329 {
5330 default: break;
5331 HSPACE_BYTE_CASES:
5332 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5333 HSPACE_MULTIBYTE_CASES:
5334 #endif
5335 RRETURN(MATCH_NOMATCH);
5336 }
5337 break;
5338
5339 case OP_HSPACE:
5340 switch(c)
5341 {
5342 default: RRETURN(MATCH_NOMATCH);
5343 HSPACE_BYTE_CASES:
5344 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5345 HSPACE_MULTIBYTE_CASES:
5346 #endif
5347 break;
5348 }
5349 break;
5350
5351 case OP_NOT_VSPACE:
5352 switch(c)
5353 {
5354 default: break;
5355 VSPACE_BYTE_CASES:
5356 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5357 VSPACE_MULTIBYTE_CASES:
5358 #endif
5359 RRETURN(MATCH_NOMATCH);
5360 }
5361 break;
5362
5363 case OP_VSPACE:
5364 switch(c)
5365 {
5366 default: RRETURN(MATCH_NOMATCH);
5367 VSPACE_BYTE_CASES:
5368 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5369 VSPACE_MULTIBYTE_CASES:
5370 #endif
5371 break;
5372 }
5373 break;
5374
5375 case OP_NOT_DIGIT:
5376 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5377 break;
5378
5379 case OP_DIGIT:
5380 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5381 break;
5382
5383 case OP_NOT_WHITESPACE:
5384 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5385 break;
5386
5387 case OP_WHITESPACE:
5388 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5389 break;
5390
5391 case OP_NOT_WORDCHAR:
5392 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5393 break;
5394
5395 case OP_WORDCHAR:
5396 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5397 break;
5398
5399 default:
5400 RRETURN(PCRE_ERROR_INTERNAL);
5401 }
5402 }
5403 }
5404 /* Control never gets here */
5405 }
5406
5407 /* If maximizing, it is worth using inline code for speed, doing the type
5408 test once at the start (i.e. keep it out of the loop). Again, keep the
5409 UTF-8 and UCP stuff separate. */
5410
5411 else
5412 {
5413 pp = eptr; /* Remember where we started */
5414
5415 #ifdef SUPPORT_UCP
5416 if (prop_type >= 0)
5417 {
5418 switch(prop_type)
5419 {
5420 case PT_ANY:
5421 for (i = min; i < max; i++)
5422 {
5423 int len = 1;
5424 if (eptr >= md->end_subject)
5425 {
5426 SCHECK_PARTIAL();
5427 break;
5428 }
5429 GETCHARLENTEST(c, eptr, len);
5430 if (prop_fail_result) break;
5431 eptr+= len;
5432 }
5433 break;
5434
5435 case PT_LAMP:
5436 for (i = min; i < max; i++)
5437 {
5438 int chartype;
5439 int len = 1;
5440 if (eptr >= md->end_subject)
5441 {
5442 SCHECK_PARTIAL();
5443 break;
5444 }
5445 GETCHARLENTEST(c, eptr, len);
5446 chartype = UCD_CHARTYPE(c);
5447 if ((chartype == ucp_Lu ||
5448 chartype == ucp_Ll ||
5449 chartype == ucp_Lt) == prop_fail_result)
5450 break;
5451 eptr+= len;
5452 }
5453 break;
5454
5455 case PT_GC:
5456 for (i = min; i < max; i++)
5457 {
5458 int len = 1;
5459 if (eptr >= md->end_subject)
5460 {
5461 SCHECK_PARTIAL();
5462 break;
5463 }
5464 GETCHARLENTEST(c, eptr, len);
5465 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5466 eptr+= len;
5467 }
5468 break;
5469
5470 case PT_PC:
5471 for (i = min; i < max; i++)
5472 {
5473 int len = 1;
5474 if (eptr >= md->end_subject)
5475 {
5476 SCHECK_PARTIAL();
5477 break;
5478 }
5479 GETCHARLENTEST(c, eptr, len);
5480 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5481 eptr+= len;
5482 }
5483 break;
5484
5485 case PT_SC:
5486 for (i = min; i < max; i++)
5487 {
5488 int len = 1;
5489 if (eptr >= md->end_subject)
5490 {
5491 SCHECK_PARTIAL();
5492 break;
5493 }
5494 GETCHARLENTEST(c, eptr, len);
5495 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5496 eptr+= len;
5497 }
5498 break;
5499
5500 case PT_ALNUM:
5501 for (i = min; i < max; i++)
5502 {
5503 int category;
5504 int len = 1;
5505 if (eptr >= md->end_subject)
5506 {
5507 SCHECK_PARTIAL();
5508 break;
5509 }
5510 GETCHARLENTEST(c, eptr, len);
5511 category = UCD_CATEGORY(c);
5512 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5513 break;
5514 eptr+= len;
5515 }
5516 break;
5517
5518 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5519 which means that Perl space and POSIX space are now identical. PCRE
5520 was changed at release 8.34. */
5521
5522 case PT_SPACE: /* Perl space */
5523 case PT_PXSPACE: /* POSIX space */
5524 for (i = min; i < max; i++)
5525 {
5526 int len = 1;
5527 if (eptr >= md->end_subject)
5528 {
5529 SCHECK_PARTIAL();
5530 break;
5531 }
5532 GETCHARLENTEST(c, eptr, len);
5533 switch(c)
5534 {
5535 HSPACE_CASES:
5536 VSPACE_CASES:
5537 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5538 break;
5539
5540 default:
5541 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5542 goto ENDLOOP99; /* Break the loop */
5543 break;
5544 }
5545 eptr+= len;
5546 }
5547 ENDLOOP99:
5548 break;
5549
5550 case PT_WORD:
5551 for (i = min; i < max; i++)
5552 {
5553 int category;
5554 int len = 1;
5555 if (eptr >= md->end_subject)
5556 {
5557 SCHECK_PARTIAL();
5558 break;
5559 }
5560 GETCHARLENTEST(c, eptr, len);
5561 category = UCD_CATEGORY(c);
5562 if ((category == ucp_L || category == ucp_N ||
5563 c == CHAR_UNDERSCORE) == prop_fail_result)
5564 break;
5565 eptr+= len;
5566 }
5567 break;
5568
5569 case PT_CLIST:
5570 for (i = min; i < max; i++)
5571 {
5572 const pcre_uint32 *cp;
5573 int len = 1;
5574 if (eptr >= md->end_subject)
5575 {
5576 SCHECK_PARTIAL();
5577 break;
5578 }
5579 GETCHARLENTEST(c, eptr, len);
5580 cp = PRIV(ucd_caseless_sets) + prop_value;
5581 for (;;)
5582 {
5583 if (c < *cp)
5584 { if (prop_fail_result) break; else goto GOT_MAX; }
5585 if (c == *cp++)
5586 { if (prop_fail_result) goto GOT_MAX; else break; }
5587 }
5588 eptr += len;
5589 }
5590 GOT_MAX:
5591 break;
5592
5593 case PT_UCNC:
5594 for (i = min; i < max; i++)
5595 {
5596 int len = 1;
5597 if (eptr >= md->end_subject)
5598 {
5599 SCHECK_PARTIAL();
5600 break;
5601 }
5602 GETCHARLENTEST(c, eptr, len);
5603 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5604 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5605 c >= 0xe000) == prop_fail_result)
5606 break;
5607 eptr += len;
5608 }
5609 break;
5610
5611 default:
5612 RRETURN(PCRE_ERROR_INTERNAL);
5613 }
5614
5615 /* eptr is now past the end of the maximum run */
5616
5617 if (possessive) continue; /* No backtracking */
5618 for(;;)
5619 {
5620 if (eptr == pp) goto TAIL_RECURSE;
5621 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5622 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5623 eptr--;
5624 if (utf) BACKCHAR(eptr);
5625 }
5626 }
5627
5628 /* Match extended Unicode grapheme clusters. We will get here only if the
5629 support is in the binary; otherwise a compile-time error occurs. */
5630
5631 else if (ctype == OP_EXTUNI)
5632 {
5633 for (i = min; i < max; i++)
5634 {
5635 if (eptr >= md->end_subject)
5636 {
5637 SCHECK_PARTIAL();
5638 break;
5639 }
5640 else
5641 {
5642 int lgb, rgb;
5643 GETCHARINCTEST(c, eptr);
5644 lgb = UCD_GRAPHBREAK(c);
5645 while (eptr < md->end_subject)
5646 {
5647 int len = 1;
5648 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5649 rgb = UCD_GRAPHBREAK(c);
5650 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5651 lgb = rgb;
5652 eptr += len;
5653 }
5654 }
5655 CHECK_PARTIAL();
5656 }
5657
5658 /* eptr is now past the end of the maximum run */
5659
5660 if (possessive) continue; /* No backtracking */
5661
5662 for(;;)
5663 {
5664 int lgb, rgb;
5665 PCRE_PUCHAR fptr;
5666
5667 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5668 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5670
5671 /* Backtracking over an extended grapheme cluster involves inspecting
5672 the previous two characters (if present) to see if a break is
5673 permitted between them. */
5674
5675 eptr--;
5676 if (!utf) c = *eptr; else
5677 {
5678 BACKCHAR(eptr);
5679 GETCHAR(c, eptr);
5680 }
5681 rgb = UCD_GRAPHBREAK(c);
5682
5683 for (;;)
5684 {
5685 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5686 fptr = eptr - 1;
5687 if (!utf) c = *fptr; else
5688 {
5689 BACKCHAR(fptr);
5690 GETCHAR(c, fptr);
5691 }
5692 lgb = UCD_GRAPHBREAK(c);
5693 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5694 eptr = fptr;
5695 rgb = lgb;
5696 }
5697 }
5698 }
5699
5700 else
5701 #endif /* SUPPORT_UCP */
5702
5703 #ifdef SUPPORT_UTF
5704 if (utf)
5705 {
5706 switch(ctype)
5707 {
5708 case OP_ANY:
5709 for (i = min; i < max; i++)
5710 {
5711 if (eptr >= md->end_subject)
5712 {
5713 SCHECK_PARTIAL();
5714 break;
5715 }
5716 if (IS_NEWLINE(eptr)) break;
5717 if (md->partial != 0 && /* Take care with CRLF partial */
5718 eptr + 1 >= md->end_subject &&
5719 NLBLOCK->nltype == NLTYPE_FIXED &&
5720 NLBLOCK->nllen == 2 &&
5721 UCHAR21(eptr) == NLBLOCK->nl[0])
5722 {
5723 md->hitend = TRUE;
5724 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5725 }
5726 eptr++;
5727 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5728 }
5729 break;
5730
5731 case OP_ALLANY:
5732 if (max < INT_MAX)
5733 {
5734 for (i = min; i < max; i++)
5735 {
5736 if (eptr >= md->end_subject)
5737 {
5738 SCHECK_PARTIAL();
5739 break;
5740 }
5741 eptr++;
5742 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5743 }
5744 }
5745 else
5746 {
5747 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5748 SCHECK_PARTIAL();
5749 }
5750 break;
5751
5752 /* The byte case is the same as non-UTF8 */
5753
5754 case OP_ANYBYTE:
5755 c = max - min;
5756 if (c > (unsigned int)(md->end_subject - eptr))
5757 {
5758 eptr = md->end_subject;
5759 SCHECK_PARTIAL();
5760 }
5761 else eptr += c;
5762 break;
5763
5764 case OP_ANYNL:
5765 for (i = min; i < max; i++)
5766 {
5767 int len = 1;
5768 if (eptr >= md->end_subject)
5769 {
5770 SCHECK_PARTIAL();
5771 break;
5772 }
5773 GETCHARLEN(c, eptr, len);
5774 if (c == CHAR_CR)
5775 {
5776 if (++eptr >= md->end_subject) break;
5777 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5778 }
5779 else
5780 {
5781 if (c != CHAR_LF &&
5782 (md->bsr_anycrlf ||
5783 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5784 #ifndef EBCDIC
5785 && c != 0x2028 && c != 0x2029
5786 #endif /* Not EBCDIC */
5787 )))
5788 break;
5789 eptr += len;
5790 }
5791 }
5792 break;
5793
5794 case OP_NOT_HSPACE:
5795 case OP_HSPACE:
5796 for (i = min; i < max; i++)
5797 {
5798 BOOL gotspace;
5799 int len = 1;
5800 if (eptr >= md->end_subject)
5801 {
5802 SCHECK_PARTIAL();
5803 break;
5804 }
5805 GETCHARLEN(c, eptr, len);
5806 switch(c)
5807 {
5808 HSPACE_CASES: gotspace = TRUE; break;
5809 default: gotspace = FALSE; break;
5810 }
5811 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5812 eptr += len;
5813 }
5814 break;
5815
5816 case OP_NOT_VSPACE:
5817 case OP_VSPACE:
5818 for (i = min; i < max; i++)
5819 {
5820 BOOL gotspace;
5821 int len = 1;
5822 if (eptr >= md->end_subject)
5823 {
5824 SCHECK_PARTIAL();
5825 break;
5826 }
5827 GETCHARLEN(c, eptr, len);
5828 switch(c)
5829 {
5830 VSPACE_CASES: gotspace = TRUE; break;
5831 default: gotspace = FALSE; break;
5832 }
5833 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5834 eptr += len;
5835 }
5836 break;
5837
5838 case OP_NOT_DIGIT:
5839 for (i = min; i < max; i++)
5840 {
5841 int len = 1;
5842 if (eptr >= md->end_subject)
5843 {
5844 SCHECK_PARTIAL();
5845 break;
5846 }
5847 GETCHARLEN(c, eptr, len);
5848 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5849 eptr+= len;
5850 }
5851 break;
5852
5853 case OP_DIGIT:
5854 for (i = min; i < max; i++)
5855 {
5856 int len = 1;
5857 if (eptr >= md->end_subject)
5858 {
5859 SCHECK_PARTIAL();
5860 break;
5861 }
5862 GETCHARLEN(c, eptr, len);
5863 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5864 eptr+= len;
5865 }
5866 break;
5867
5868 case OP_NOT_WHITESPACE:
5869 for (i = min; i < max; i++)
5870 {
5871 int len = 1;
5872 if (eptr >= md->end_subject)
5873 {
5874 SCHECK_PARTIAL();
5875 break;
5876 }
5877 GETCHARLEN(c, eptr, len);
5878 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5879 eptr+= len;
5880 }
5881 break;
5882
5883 case OP_WHITESPACE:
5884 for (i = min; i < max; i++)
5885 {
5886 int len = 1;
5887 if (eptr >= md->end_subject)
5888 {
5889 SCHECK_PARTIAL();
5890 break;
5891 }
5892 GETCHARLEN(c, eptr, len);
5893 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5894 eptr+= len;
5895 }
5896 break;
5897
5898 case OP_NOT_WORDCHAR:
5899 for (i = min; i < max; i++)
5900 {
5901 int len = 1;
5902 if (eptr >= md->end_subject)
5903 {
5904 SCHECK_PARTIAL();
5905 break;
5906 }
5907 GETCHARLEN(c, eptr, len);
5908 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5909 eptr+= len;
5910 }
5911 break;
5912
5913 case OP_WORDCHAR:
5914 for (i = min; i < max; i++)
5915 {
5916 int len = 1;
5917 if (eptr >= md->end_subject)
5918 {
5919 SCHECK_PARTIAL();
5920 break;
5921 }
5922 GETCHARLEN(c, eptr, len);
5923 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5924 eptr+= len;
5925 }
5926 break;
5927
5928 default:
5929 RRETURN(PCRE_ERROR_INTERNAL);
5930 }
5931
5932 if (possessive) continue; /* No backtracking */
5933 for(;;)
5934 {
5935 if (eptr == pp) goto TAIL_RECURSE;
5936 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5937 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5938 eptr--;
5939 BACKCHAR(eptr);
5940 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5941 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5942 }
5943 }
5944 else
5945 #endif /* SUPPORT_UTF */
5946 /* Not UTF mode */
5947 {
5948 switch(ctype)
5949 {
5950 case OP_ANY:
5951 for (i = min; i < max; i++)
5952 {
5953 if (eptr >= md->end_subject)
5954 {
5955 SCHECK_PARTIAL();
5956 break;
5957 }
5958 if (IS_NEWLINE(eptr)) break;
5959 if (md->partial != 0 && /* Take care with CRLF partial */
5960 eptr + 1 >= md->end_subject &&
5961 NLBLOCK->nltype == NLTYPE_FIXED &&
5962 NLBLOCK->nllen == 2 &&
5963 *eptr == NLBLOCK->nl[0])
5964 {
5965 md->hitend = TRUE;
5966 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5967 }
5968 eptr++;
5969 }
5970 break;
5971
5972 case OP_ALLANY:
5973 case OP_ANYBYTE:
5974 c = max - min;
5975 if (c > (unsigned int)(md->end_subject - eptr))
5976 {
5977 eptr = md->end_subject;
5978 SCHECK_PARTIAL();
5979 }
5980 else eptr += c;
5981 break;
5982
5983 case OP_ANYNL:
5984 for (i = min; i < max; i++)
5985 {
5986 if (eptr >= md->end_subject)
5987 {
5988 SCHECK_PARTIAL();
5989 break;
5990 }
5991 c = *eptr;
5992 if (c == CHAR_CR)
5993 {
5994 if (++eptr >= md->end_subject) break;
5995 if (*eptr == CHAR_LF) eptr++;
5996 }
5997 else
5998 {
5999 if (c != CHAR_LF && (md->bsr_anycrlf ||
6000 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
6001 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6002 && c != 0x2028 && c != 0x2029
6003 #endif
6004 ))) break;
6005 eptr++;
6006 }
6007 }
6008 break;
6009
6010 case OP_NOT_HSPACE:
6011 for (i = min; i < max; i++)
6012 {
6013 if (eptr >= md->end_subject)
6014 {
6015 SCHECK_PARTIAL();
6016 break;
6017 }
6018 switch(*eptr)
6019 {
6020 default: eptr++; break;
6021 HSPACE_BYTE_CASES:
6022 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6023 HSPACE_MULTIBYTE_CASES:
6024 #endif
6025 goto ENDLOOP00;
6026 }
6027 }
6028 ENDLOOP00:
6029 break;
6030
6031 case OP_HSPACE:
6032 for (i = min; i < max; i++)
6033 {
6034 if (eptr >= md->end_subject)
6035 {
6036 SCHECK_PARTIAL();
6037 break;
6038 }
6039 switch(*eptr)
6040 {
6041 default: goto ENDLOOP01;
6042 HSPACE_BYTE_CASES:
6043 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6044 HSPACE_MULTIBYTE_CASES:
6045 #endif
6046 eptr++; break;
6047 }
6048 }
6049 ENDLOOP01:
6050 break;
6051
6052 case OP_NOT_VSPACE:
6053 for (i = min; i < max; i++)
6054 {
6055 if (eptr >= md->end_subject)
6056 {
6057 SCHECK_PARTIAL();
6058 break;
6059 }
6060 switch(*eptr)
6061 {
6062 default: eptr++; break;
6063 VSPACE_BYTE_CASES:
6064 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6065 VSPACE_MULTIBYTE_CASES:
6066 #endif
6067 goto ENDLOOP02;
6068 }
6069 }
6070 ENDLOOP02:
6071 break;
6072
6073 case OP_VSPACE:
6074 for (i = min; i < max; i++)
6075 {
6076 if (eptr >= md->end_subject)
6077 {
6078 SCHECK_PARTIAL();
6079 break;
6080 }
6081 switch(*eptr)
6082 {
6083 default: goto ENDLOOP03;
6084 VSPACE_BYTE_CASES:
6085 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6086 VSPACE_MULTIBYTE_CASES:
6087 #endif
6088 eptr++; break;
6089 }
6090 }
6091 ENDLOOP03:
6092 break;
6093
6094 case OP_NOT_DIGIT:
6095 for (i = min; i < max; i++)
6096 {
6097 if (eptr >= md->end_subject)
6098 {
6099 SCHECK_PARTIAL();
6100 break;
6101 }
6102 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6103 eptr++;
6104 }
6105 break;
6106
6107 case OP_DIGIT:
6108 for (i = min; i < max; i++)
6109 {
6110 if (eptr >= md->end_subject)
6111 {
6112 SCHECK_PARTIAL();
6113 break;
6114 }
6115 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6116 eptr++;
6117 }
6118 break;
6119
6120 case OP_NOT_WHITESPACE:
6121 for (i = min; i < max; i++)
6122 {
6123 if (eptr >= md->end_subject)
6124 {
6125 SCHECK_PARTIAL();
6126 break;
6127 }
6128 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6129 eptr++;
6130 }
6131 break;
6132
6133 case OP_WHITESPACE:
6134 for (i = min; i < max; i++)
6135 {
6136 if (eptr >= md->end_subject)
6137 {
6138 SCHECK_PARTIAL();
6139 break;
6140 }
6141 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6142 eptr++;
6143 }
6144 break;
6145
6146 case OP_NOT_WORDCHAR:
6147 for (i = min; i < max; i++)
6148 {
6149 if (eptr >= md->end_subject)
6150 {
6151 SCHECK_PARTIAL();
6152 break;
6153 }
6154 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6155 eptr++;
6156 }
6157 break;
6158
6159 case OP_WORDCHAR:
6160 for (i = min; i < max; i++)
6161 {
6162 if (eptr >= md->end_subject)
6163 {
6164 SCHECK_PARTIAL();
6165 break;
6166 }
6167 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6168 eptr++;
6169 }
6170 break;
6171
6172 default:
6173 RRETURN(PCRE_ERROR_INTERNAL);
6174 }
6175
6176 if (possessive) continue; /* No backtracking */
6177 for (;;)
6178 {
6179 if (eptr == pp) goto TAIL_RECURSE;
6180 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6181 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6182 eptr--;
6183 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6184 eptr[-1] == CHAR_CR) eptr--;
6185 }
6186 }
6187
6188 /* Control never gets here */
6189 }
6190
6191 /* There's been some horrible disaster. Arrival here can only mean there is
6192 something seriously wrong in the code above or the OP_xxx definitions. */
6193
6194 default:
6195 DPRINTF(("Unknown opcode %d\n", *ecode));
6196 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6197 }
6198
6199 /* Do not stick any code in here without much thought; it is assumed
6200 that "continue" in the code above comes out to here to repeat the main
6201 loop. */
6202
6203 } /* End of main loop */
6204 /* Control never reaches here */
6205
6206
6207 /* When compiling to use the heap rather than the stack for recursive calls to
6208 match(), the RRETURN() macro jumps here. The number that is saved in
6209 frame->Xwhere indicates which label we actually want to return to. */
6210
6211 #ifdef NO_RECURSE
6212 #define LBL(val) case val: goto L_RM##val;
6213 HEAP_RETURN:
6214 switch (frame->Xwhere)
6215 {
6216 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6217 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6218 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6219 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6220 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6221 LBL(65) LBL(66)
6222 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6223 LBL(20) LBL(21)
6224 #endif
6225 #ifdef SUPPORT_UTF
6226 LBL(16) LBL(18)
6227 LBL(22) LBL(23) LBL(28) LBL(30)
6228 LBL(32) LBL(34) LBL(42) LBL(46)
6229 #ifdef SUPPORT_UCP
6230 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6231 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6232 #endif /* SUPPORT_UCP */
6233 #endif /* SUPPORT_UTF */
6234 default:
6235 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6236 return PCRE_ERROR_INTERNAL;
6237 }
6238 #undef LBL
6239 #endif /* NO_RECURSE */
6240 }
6241
6242
6243 /***************************************************************************
6244 ****************************************************************************
6245 RECURSION IN THE match() FUNCTION
6246
6247 Undefine all the macros that were defined above to handle this. */
6248
6249 #ifdef NO_RECURSE /* The names in this section are macros only in the NO_RECURSE build */
6250 #undef eptr
6251 #undef ecode
6252 #undef mstart
6253 #undef offset_top
6254 #undef eptrb
6255 #undef flags
6256
6257 #undef callpat
6258 #undef charptr
6259 #undef data
6260 #undef next
6261 #undef pp
6262 #undef prev
6263 #undef saved_eptr
6264
6265 #undef new_recursive
6266
6267 #undef cur_is_word
6268 #undef condition
6269 #undef prev_is_word
6270
6271 #undef ctype
6272 #undef length
6273 #undef max
6274 #undef min
6275 #undef number
6276 #undef offset
6277 #undef op
6278 #undef save_capture_last
6279 #undef save_offset1
6280 #undef save_offset2
6281 #undef save_offset3
6282 #undef stacksave
6283
6284 #undef newptrb
6285
6286 #endif /* NO_RECURSE */
6287
6288 /* These two are defined as macros in both cases, that is, whether or not
   NO_RECURSE is defined, so they are undefined unconditionally. */
6289
6290 #undef fc
6291 #undef fi
6292
6293 /***************************************************************************
6294 ***************************************************************************/
6295
6296
6297 #ifdef NO_RECURSE
6298 /*************************************************
6299 * Release allocated heap frames *
6300 *************************************************/
6301
6302 /* Hand back all the heap frames chained from the base frame via Xnextframe.
6303 The base frame lives on the machine stack, so it is skipped, never freed.
6304
6305 Argument: the address of the base frame
6306 Returns: nothing
6307 */
6308
6309 static void
6310 release_match_heapframes (heapframe *frame_base)
6311 {
6312 heapframe *victim;      /* Frame about to be released */
6313 heapframe *following;   /* Its successor, fetched before the release */
6314 for (victim = frame_base->Xnextframe; victim != NULL; victim = following)
6315   {
6316   following = victim->Xnextframe;  /* Read the link while the frame is valid */
6317   (PUBL(stack_free))(victim);
6318   }
6319 }
6320 #endif
6321
6322
6323 /*************************************************
6324 * Execute a Regular Expression *
6325 *************************************************/
6326
6327 /* This function applies a compiled re to a subject string and picks out
6328 portions of the string if it matches. Two elements in the vector are set for
6329 each substring: the offsets to the start and end of the substring.
6330
6331 Arguments:
6332 argument_re points to the compiled expression
6333 extra_data points to extra data or is NULL
6334 subject points to the subject string
6335 length length of subject string (may contain binary zeros)
6336 start_offset where to start in the subject string
6337 options option bits
6338 offsets points to a vector of ints to be filled in with offsets
6339 offsetcount the number of elements in the vector
6340
6341 Returns: > 0 => success; value is the number of elements filled in
6342 = 0 => success, but offsets is not big enough
6343 -1 => failed to match
6344 < -1 => some kind of unexpected problem
6345 */
6346
6347 #if defined COMPILE_PCRE8
6348 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6349 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6350 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6351 int offsetcount)
6352 #elif defined COMPILE_PCRE16
6353 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6354 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6355 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6356 int offsetcount)
6357 #elif defined COMPILE_PCRE32
6358 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6359 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6360 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6361 int offsetcount)
6362 #endif
6363 {
6364 int rc, ocount, arg_offset_max;
6365 int newline;
6366 BOOL using_temporary_offsets = FALSE;
6367 BOOL anchored;
6368 BOOL startline;
6369 BOOL firstline;
6370 BOOL utf;
6371 BOOL has_first_char = FALSE;
6372 BOOL has_req_char = FALSE;
6373 pcre_uchar first_char = 0;
6374 pcre_uchar first_char2 = 0;
6375 pcre_uchar req_char = 0;
6376 pcre_uchar req_char2 = 0;
6377 match_data match_block;
6378 match_data *md = &match_block;
6379 const pcre_uint8 *tables;
6380 const pcre_uint8 *start_bits = NULL;
6381 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6382 PCRE_PUCHAR end_subject;
6383 PCRE_PUCHAR start_partial = NULL;
6384 PCRE_PUCHAR match_partial = NULL;
6385 PCRE_PUCHAR req_char_ptr = start_match - 1;
6386
6387 const pcre_study_data *study;
6388 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6389
6390 #ifdef NO_RECURSE
6391 heapframe frame_zero;
6392 frame_zero.Xprevframe = NULL; /* Marks the top level */
6393 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6394 md->match_frames_base = &frame_zero;
6395 #endif
6396
6397 /* Check for the special magic call that measures the size of the stack used
6398 per recursive call of match(). Without the funny casting for sizeof, a Windows
6399 compiler gave this error: "unary minus operator applied to unsigned type,
6400 result still unsigned". Hopefully the cast fixes that. */
6401
6402 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6403 start_offset == -999)
6404 #ifdef NO_RECURSE
6405 return -((int)sizeof(heapframe));
6406 #else
6407 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6408 #endif
6409
6410 /* Plausibility checks */
6411
6412 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6413 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6414 return PCRE_ERROR_NULL;
6415 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6416 if (length < 0) return PCRE_ERROR_BADLENGTH;
6417 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6418
6419 /* Check that the first field in the block is the magic number. If it is not,
6420 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6421 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6422 means that the pattern is likely compiled with different endianness. */
6423
6424 if (re->magic_number != MAGIC_NUMBER)
6425 return re->magic_number == REVERSED_MAGIC_NUMBER?
6426 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6427 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6428
6429 /* These two settings are used in the code for checking a UTF-8 string that
6430 follows immediately afterwards. Other values in the md block are used only
6431 during "normal" pcre_exec() processing, not when the JIT support is in use,
6432 so they are set up later. */
6433
6434 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6435 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6436 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6437 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6438
6439 /* Check a UTF-8 string if required. Pass back the character offset and error
6440 code for an invalid string if a results vector is available. */
6441
6442 #ifdef SUPPORT_UTF
6443 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6444 {
6445 int erroroffset;
6446 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6447 if (errorcode != 0)
6448 {
6449 if (offsetcount >= 2)
6450 {
6451 offsets[0] = erroroffset;
6452 offsets[1] = errorcode;
6453 }
6454 #if defined COMPILE_PCRE8
6455 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6456 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6457 #elif defined COMPILE_PCRE16
6458 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6459 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6460 #elif defined COMPILE_PCRE32
6461 return PCRE_ERROR_BADUTF32;
6462 #endif
6463 }
6464 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6465 /* Check that a start_offset points to the start of a UTF character. */
6466 if (start_offset > 0 && start_offset < length &&
6467 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6468 return PCRE_ERROR_BADUTF8_OFFSET;
6469 #endif
6470 }
6471 #endif
6472
6473 /* If the pattern was successfully studied with JIT support, run the JIT
6474 executable instead of the rest of this function. Most options must be set at
6475 compile time for the JIT code to be usable. Fallback to the normal code path if
6476 an unsupported flag is set. */
6477
6478 #ifdef SUPPORT_JIT
6479 if (extra_data != NULL
6480 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6481 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6482 && extra_data->executable_jit != NULL
6483 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6484 {
6485 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6486 start_offset, options, offsets, offsetcount);
6487
6488 /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6489 mode is not compiled. In this case we simply fall back to the interpreter. */
6490
6491 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6492 }
6493 #endif
6494
6495 /* Carry on with non-JIT matching. This information is for finding all the
6496 numbers associated with a given name, for condition testing. */
6497
6498 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6499 md->name_count = re->name_count;
6500 md->name_entry_size = re->name_entry_size;
6501
6502 /* Fish out the optional data from the extra_data structure, first setting
6503 the default values. */
6504
6505 study = NULL;
6506 md->match_limit = MATCH_LIMIT;
6507 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6508 md->callout_data = NULL;
6509
6510 /* The table pointer is always in native byte order. */
6511
6512 tables = re->tables;
6513
6514 /* The two limit values override the defaults, whatever their value. */
6515
6516 if (extra_data != NULL)
6517 {
6518 unsigned long int flags = extra_data->flags;
6519 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6520 study = (const pcre_study_data *)extra_data->study_data;
6521 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6522 md->match_limit = extra_data->match_limit;
6523 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6524 md->match_limit_recursion = extra_data->match_limit_recursion;
6525 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6526 md->callout_data = extra_data->callout_data;
6527 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6528 }
6529
6530 /* Limits in the regex override only if they are smaller. */
6531
6532 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6533 md->match_limit = re->limit_match;
6534
6535 if ((re->flags & PCRE_RLSET) != 0 &&
6536 re->limit_recursion < md->match_limit_recursion)
6537 md->match_limit_recursion = re->limit_recursion;
6538
6539 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6540 is a feature that makes it possible to save compiled regex and re-use them
6541 in other programs later. */
6542
6543 if (tables == NULL) tables = PRIV(default_tables);
6544
6545 /* Set up other data */
6546
6547 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6548 startline = (re->flags & PCRE_STARTLINE) != 0;
6549 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6550
6551 /* The code starts after the real_pcre block and the capture name table. */
6552
6553 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6554 re->name_count * re->name_entry_size;
6555
6556 md->start_subject = (PCRE_PUCHAR)subject;
6557 md->start_offset = start_offset;
6558 md->end_subject = md->start_subject + length;
6559 end_subject = md->end_subject;
6560
6561 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6562 md->use_ucp = (re->options & PCRE_UCP) != 0;
6563 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6564 md->ignore_skip_arg = 0;
6565
6566 /* Some options are unpacked into BOOL variables in the hope that testing
6567 them will be faster than individual option bits. */
6568
6569 md->notbol = (options & PCRE_NOTBOL) != 0;
6570 md->noteol = (options & PCRE_NOTEOL) != 0;
6571 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6572 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6573
6574 md->hitend = FALSE;
6575 md->mark = md->nomatch_mark = NULL; /* In case never set */
6576
6577 md->recursive = NULL; /* No recursion at top level */
6578 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6579
6580 md->lcc = tables + lcc_offset;
6581 md->fcc = tables + fcc_offset;
6582 md->ctypes = tables + ctypes_offset;
6583
6584 /* Handle different \R options. */
6585
6586 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6587 {
6588 case 0:
6589 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6590 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6591 else
6592 #ifdef BSR_ANYCRLF
6593 md->bsr_anycrlf = TRUE;
6594 #else
6595 md->bsr_anycrlf = FALSE;
6596 #endif
6597 break;
6598
6599 case PCRE_BSR_ANYCRLF:
6600 md->bsr_anycrlf = TRUE;
6601 break;
6602
6603 case PCRE_BSR_UNICODE:
6604 md->bsr_anycrlf = FALSE;
6605 break;
6606
6607 default: return PCRE_ERROR_BADNEWLINE;
6608 }
6609
6610 /* Handle different types of newline. The three bits give eight cases. If
6611 nothing is set at run time, whatever was used at compile time applies. */
6612
6613 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6614 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6615 {
6616 case 0: newline = NEWLINE; break; /* Compile-time default */
6617 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6618 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6619 case PCRE_NEWLINE_CR+
6620 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6621 case PCRE_NEWLINE_ANY: newline = -1; break;
6622 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6623 default: return PCRE_ERROR_BADNEWLINE;
6624 }
6625
6626 if (newline == -2)
6627 {
6628 md->nltype = NLTYPE_ANYCRLF;
6629 }
6630 else if (newline < 0)
6631 {
6632 md->nltype = NLTYPE_ANY;
6633 }
6634 else
6635 {
6636 md->nltype = NLTYPE_FIXED;
6637 if (newline > 255)
6638 {
6639 md->nllen = 2;
6640 md->nl[0] = (newline >> 8) & 255;
6641 md->nl[1] = newline & 255;
6642 }
6643 else
6644 {
6645 md->nllen = 1;
6646 md->nl[0] = newline;
6647 }
6648 }
6649
6650 /* Partial matching was originally supported only for a restricted set of
6651 regexes; from release 8.00 there are no restrictions, but the bits are still
6652 defined (though never set). So there's no harm in leaving this code. */
6653
6654 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6655 return PCRE_ERROR_BADPARTIAL;
6656
6657 /* If the expression has got more back references than the offsets supplied can
6658 hold, we get a temporary chunk of working store to use during the matching.
6659 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6660 of 3. */
6661
6662 ocount = offsetcount - (offsetcount % 3);
6663 arg_offset_max = (2*ocount)/3;
6664
6665 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6666 {
6667 ocount = re->top_backref * 3 + 3;
6668 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6669 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6670 using_temporary_offsets = TRUE;
6671 DPRINTF(("Got memory to hold back references\n"));
6672 }
6673 else md->offset_vector = offsets;
6674 md->offset_end = ocount;
6675 md->offset_max = (2*ocount)/3;
6676 md->capture_last = 0;
6677
6678 /* Reset the working variable associated with each extraction. These should
6679 never be used unless previously set, but they get saved and restored, and so we
6680 initialize them to avoid reading uninitialized locations. Also, unset the
6681 offsets for the matched string. This is really just for tidiness with callouts,
6682 in case they inspect these fields. */
6683
6684 if (md->offset_vector != NULL)
6685 {
6686 register int *iptr = md->offset_vector + ocount;
6687 register int *iend = iptr - re->top_bracket;
6688 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6689 while (--iptr >= iend) *iptr = -1;
6690 md->offset_vector[0] = md->offset_vector[1] = -1;
6691 }
6692
6693 /* Set up the first character to match, if available. The first_char value is
6694 never set for an anchored regular expression, but the anchoring may be forced
6695 at run time, so we have to test for anchoring. The first char may be unset for
6696 an unanchored pattern, of course. If there's no first char and the pattern was
6697 studied, there may be a bitmap of possible first characters. */
6698
6699 if (!anchored)
6700 {
6701 if ((re->flags & PCRE_FIRSTSET) != 0)
6702 {
6703 has_first_char = TRUE;
6704 first_char = first_char2 = (pcre_uchar)(re->first_char);
6705 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6706 {
6707 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6708 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6709 if (utf && first_char > 127)
6710 first_char2 = UCD_OTHERCASE(first_char);
6711 #endif
6712 }
6713 }
6714 else
6715 if (!startline && study != NULL &&
6716 (study->flags & PCRE_STUDY_MAPPED) != 0)
6717 start_bits = study->start_bits;
6718 }
6719
6720 /* For anchored or unanchored matches, there may be a "last known required
6721 character" set. */
6722
6723 if ((re->flags & PCRE_REQCHSET) != 0)
6724 {
6725 has_req_char = TRUE;
6726 req_char = req_char2 = (pcre_uchar)(re->req_char);
6727 if ((re->flags & PCRE_RCH_CASELESS) != 0)