/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 1525 - (show annotations) (download)
Wed Feb 11 16:48:35 2015 UTC (2 weeks, 2 days ago) by ph10
File MIME type: text/plain
File size: 217867 byte(s)
Fix bug that did not allow zero case for (a)*+ when ovector was too small to 
capture.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2014 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40 /* This module contains pcre_exec(), the externally visible function that does
41 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42 possible. There are also some static supporting functions. */
43
44 #ifdef HAVE_CONFIG_H
45 #include "config.h"
46 #endif
47
48 #define NLBLOCK md /* Block containing newline information */
49 #define PSSTART start_subject /* Field containing processed string start */
50 #define PSEND end_subject /* Field containing processed string end */
51
52 #include "pcre_internal.h"
53
54 /* Undefine some potentially clashing cpp symbols */
55
56 #undef min
57 #undef max
58
59 /* The md->capture_last field uses the lower 16 bits for the last captured
60 substring (which can never be greater than 65535) and a bit in the top half
61 to mean "capture vector overflowed". This odd way of doing things was
62 implemented when it was realized that preserving and restoring the overflow bit
63 whenever the last capture number was saved/restored made for a neater
64 interface, and doing it this way saved on (a) another variable, which would
65 have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66 separate set of save/restore instructions. The following defines are used in
67 implementing this. */
68
#define CAPLMASK    0x0000ffff    /* Low half: number of the last capture */
#define OVFLMASK    0xffff0000    /* High half: reserved for the overflow flag */
#define OVFLBIT     0x00010000    /* The one bit that means "vector overflowed" */
72
/* Special kinds of call to match(), signalled through
md->match_function_type. Passing them this way rather than as an argument
saves another stack variable, as stack usage is to be discouraged. */

#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Error results are the externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Internal-only results of match(). They are made sufficiently negative to
stay clear of the external error codes. */

#define MATCH_ACCEPT         (-999)
#define MATCH_KETRPOS        (-998)
#define MATCH_ONCE           (-997)

/* The next 5 must be kept together and in sequence, so that a test for any
one of them can be written as a range check between MATCH_BACKTRACK_MIN and
MATCH_BACKTRACK_MAX. */

#define MATCH_COMMIT         (-996)
#define MATCH_PRUNE          (-995)
#define MATCH_SKIP           (-994)
#define MATCH_SKIP_ARG       (-993)
#define MATCH_THEN           (-992)
#define MATCH_BACKTRACK_MAX  MATCH_THEN
#define MATCH_BACKTRACK_MIN  MATCH_COMMIT

/* Upper limit on the number of ints of offset data that are saved on the stack
for recursive calls; a bigger offset vector is saved using malloc instead. Keep
this a multiple of 3, because the offset vector is always a multiple of 3
long. */

#define REC_STACK_SAVE_MAX   30
107
/* Minimum and maximum repeat counts for the common repeats, held as two
parallel 11-entry tables; in rep_max, 0 => infinity. NOTE(review): the tables
are presumably indexed by a repeat opcode relative to the first one - confirm
against the use sites, which are outside this part of the file. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0,
  0, 0,
  0, 1, 0,
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1,
  0, 0,
  0, 0, 1,
};
112
113 #ifdef PCRE_DEBUG
114 /*************************************************
115 * Debugging function to print chars *
116 *************************************************/
117
118 /* Print a sequence of chars in printable format, stopping at the end of the
119 subject if the requested.
120
121 Arguments:
122 p points to characters
123 length number to print
124 is_subject TRUE if printing from within md->start_subject
125 md pointer to matching data block, if is_subject is TRUE
126
127 Returns: nothing
128 */
129
130 static void
131 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
132 {
133 pcre_uint32 c;
134 BOOL utf = md->utf;
135 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
136 while (length-- > 0)
137 if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c);
138 }
139 #endif
140
141
142
143 /*************************************************
144 * Match a back-reference *
145 *************************************************/
146
147 /* Normally, if a back reference hasn't been set, the length that is passed is
148 negative, so the match always fails. However, in JavaScript compatibility mode,
149 the length passed is zero. Note that in caseless UTF-8 mode, the number of
150 subject bytes matched may be different to the number of reference bytes.
151
152 Arguments:
153 offset index into the offset vector
154 eptr pointer into the subject
155 length length of reference to be matched (number of bytes)
156 md points to match data block
157 caseless TRUE if caseless
158
159 Returns: >= 0 the number of subject bytes matched
160 -1 no match
161 -2 partial match; always given if at end subject
162 */
163
164 static int
165 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166 BOOL caseless)
167 {
168 PCRE_PUCHAR eptr_start = eptr;
169 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
170 #if defined SUPPORT_UTF && defined SUPPORT_UCP
171 BOOL utf = md->utf;
172 #endif
173
174 #ifdef PCRE_DEBUG
175 if (eptr >= md->end_subject)
176 printf("matching subject <null>");
177 else
178 {
179 printf("matching subject ");
180 pchars(eptr, length, TRUE, md);
181 }
182 printf(" against backref ");
183 pchars(p, length, FALSE, md);
184 printf("\n");
185 #endif
186
187 /* Always fail if reference not set (and not JavaScript compatible - in that
188 case the length is passed as zero). */
189
190 if (length < 0) return -1;
191
192 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193 properly if Unicode properties are supported. Otherwise, we can check only
194 ASCII characters. */
195
196 if (caseless)
197 {
198 #if defined SUPPORT_UTF && defined SUPPORT_UCP
199 if (utf)
200 {
201 /* Match characters up to the end of the reference. NOTE: the number of
202 data units matched may differ, because in UTF-8 there are some characters
203 whose upper and lower case versions code have different numbers of bytes.
204 For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205 (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206 sequence of two of the latter. It is important, therefore, to check the
207 length along the reference, not along the subject (earlier code did this
208 wrong). */
209
210 PCRE_PUCHAR endptr = p + length;
211 while (p < endptr)
212 {
213 pcre_uint32 c, d;
214 const ucd_record *ur;
215 if (eptr >= md->end_subject) return -2; /* Partial match */
216 GETCHARINC(c, eptr);
217 GETCHARINC(d, p);
218 ur = GET_UCD(d);
219 if (c != d && c != d + ur->other_case)
220 {
221 const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset;
222 for (;;)
223 {
224 if (c < *pp) return -1;
225 if (c == *pp++) break;
226 }
227 }
228 }
229 }
230 else
231 #endif
232
233 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234 is no UCP support. */
235 {
236 while (length-- > 0)
237 {
238 pcre_uint32 cc, cp;
239 if (eptr >= md->end_subject) return -2; /* Partial match */
240 cc = UCHAR21TEST(eptr);
241 cp = UCHAR21TEST(p);
242 if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1;
243 p++;
244 eptr++;
245 }
246 }
247 }
248
249 /* In the caseful case, we can just compare the bytes, whether or not we
250 are in UTF-8 mode. */
251
252 else
253 {
254 while (length-- > 0)
255 {
256 if (eptr >= md->end_subject) return -2; /* Partial match */
257 if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1;
258 }
259 }
260
261 return (int)(eptr - eptr_start);
262 }
263
264
265
266 /***************************************************************************
267 ****************************************************************************
268 RECURSION IN THE match() FUNCTION
269
270 The match() function is highly recursive, though not every recursive call
271 increases the recursive depth. Nevertheless, some regular expressions can cause
272 it to recurse to a great depth. I was writing for Unix, so I just let it call
273 itself recursively. This uses the stack for saving everything that has to be
274 saved for a recursive call. On Unix, the stack can be large, and this works
275 fine.
276
277 It turns out that on some non-Unix-like systems there are problems with
278 programs that use a lot of stack. (This despite the fact that every last chip
279 has oodles of memory these days, and techniques for extending the stack have
280 been known for decades.) So....
281
282 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283 calls by keeping local variables that need to be preserved in blocks of memory
284 obtained from malloc() instead of on the stack. Macros are used to
285 achieve this so that the actual code doesn't look very different to what it
286 always used to.
287
288 The original heap-recursive code used longjmp(). However, it seems that this
289 can be very slow on some operating systems. Following a suggestion from Stan
290 Switzer, the use of longjmp() has been abolished, at the cost of having to
291 provide a unique number for each call to RMATCH. There is no way of generating
292 a sequence of numbers at compile time in C. I have given them names, to make
293 them stand out more clearly.
294
295 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297 tests. Furthermore, not using longjmp() means that local dynamic variables
298 don't have indeterminate values; this has meant that the frame size can be
299 reduced because the result can be "passed back" by straight setting of the
300 variable instead of being passed in the frame.
301 ****************************************************************************
302 ***************************************************************************/
303
/* One identifier for each RMATCH call, numbered consecutively from 1. When
this list is changed, the code at HEAP_RETURN below must be updated in sync. */

enum {
  RM1 = 1,
         RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,   RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17,  RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25,  RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33,  RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49,  RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57,  RM58, RM59, RM60, RM61, RM62, RM63, RM64,
  RM65,  RM66, RM67
};
314
315 /* These versions of the macros use the stack, as normal. There are debugging
316 versions and production versions. Note that the "rw" argument of RMATCH isn't
317 actually used in this definition. */
318
319 #ifndef NO_RECURSE
320 #define REGISTER register
321
#ifndef PCRE_DEBUG

/* Production versions: RMATCH is a direct recursive call of match() one level
deeper, leaving the result in rrc, and RRETURN is a plain return. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra, rb, mstart, rc, rd, re, rdepth + 1)

#define RRETURN(ra) return ra

#else  /* PCRE_DEBUG is defined */

/* Debugging versions: the same call and return, each wrapped in trace output
that reports the source line involved. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra, rb, mstart, rc, rd, re, rdepth + 1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d\n", ra, __LINE__); \
  return ra; \
  }

#endif  /* PCRE_DEBUG */
339
340 #else
341
342
343 /* These versions of the macros manage a private stack on the heap. Note that
344 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345 argument of match(), which never changes. */
346
347 #define REGISTER
348
/* Heap version of RMATCH. The frame that follows the current one is reused if
it exists; otherwise a new one is obtained from the stack_malloc function and
linked on (PCRE_ERROR_NOMEMORY is returned if that fails). The resume point rw
is recorded in the current frame, the arguments for the "call" are stored in
the next frame, that frame is made current, and control goes to HEAP_RECURSE.
The label L_<rw> is the point at which execution continues when the "call"
comes back. All the argument expressions are stored before "frame" is changed,
so they refer to the calling frame. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *heapnext = frame->Xnextframe;\
  if (heapnext == NULL)\
    {\
    heapnext = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
    if (heapnext == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    heapnext->Xnextframe = NULL;\
    frame->Xnextframe = heapnext;\
    }\
  frame->Xwhere = rw;\
  heapnext->Xeptr = ra;\
  heapnext->Xecode = rb;\
  heapnext->Xmstart = mstart;\
  heapnext->Xoffset_top = rc;\
  heapnext->Xeptrb = re;\
  heapnext->Xrdepth = frame->Xrdepth + 1;\
  heapnext->Xprevframe = frame;\
  frame = heapnext;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
373
/* Heap version of RRETURN. Step back to the previous frame. If there is none,
this is the outermost "call", so really return from the function; otherwise
put the result in rrc and go to HEAP_RETURN. Note that ra is evaluated after
"frame" has been moved back. */

#define RRETURN(ra)\
  {\
  heapframe *finished = frame;\
  frame = finished->Xprevframe;\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
385
386
387 /* Structure for remembering the local variables in a private frame */
388
389 typedef struct heapframe {
390 struct heapframe *Xprevframe;
391 struct heapframe *Xnextframe;
392
393 /* Function arguments that may change */
394
395 PCRE_PUCHAR Xeptr;
396 const pcre_uchar *Xecode;
397 PCRE_PUCHAR Xmstart;
398 int Xoffset_top;
399 eptrblock *Xeptrb;
400 unsigned int Xrdepth;
401
402 /* Function local variables */
403
404 PCRE_PUCHAR Xcallpat;
405 #ifdef SUPPORT_UTF
406 PCRE_PUCHAR Xcharptr;
407 #endif
408 PCRE_PUCHAR Xdata;
409 PCRE_PUCHAR Xnext;
410 PCRE_PUCHAR Xpp;
411 PCRE_PUCHAR Xprev;
412 PCRE_PUCHAR Xsaved_eptr;
413
414 recursion_info Xnew_recursive;
415
416 BOOL Xcur_is_word;
417 BOOL Xcondition;
418 BOOL Xprev_is_word;
419
420 #ifdef SUPPORT_UCP
421 int Xprop_type;
422 unsigned int Xprop_value;
423 int Xprop_fail_result;
424 int Xoclength;
425 pcre_uchar Xocchars[6];
426 #endif
427
428 int Xcodelink;
429 int Xctype;
430 unsigned int Xfc;
431 int Xfi;
432 int Xlength;
433 int Xmax;
434 int Xmin;
435 unsigned int Xnumber;
436 int Xoffset;
437 unsigned int Xop;
438 pcre_int32 Xsave_capture_last;
439 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440 int Xstacksave[REC_STACK_SAVE_MAX];
441
442 eptrblock Xnewptrb;
443
444 /* Where to jump back to */
445
446 int Xwhere;
447
448 } heapframe;
449
450 #endif
451
452
453 /***************************************************************************
454 ***************************************************************************/
455
456
457
458 /*************************************************
459 * Match from current position *
460 *************************************************/
461
462 /* This function is called recursively in many circumstances. Whenever it
463 returns a negative (error) response, the outer incarnation must also return the
464 same response. */
465
466 /* These macros pack up tests that are used for partial matching, and which
467 appear several times in the code. We set the "hit end" flag if the pointer is
468 at the end of the subject and also past the start of the subject (i.e.
469 something has been matched). For hard partial matching, we then return
470 immediately. The second one is used when we already know we are past the end of
471 the subject. */
472
/* CHECK_PARTIAL: used where the end of the subject may or may not have been
reached. When partial matching is enabled, eptr is at or beyond the end of the
subject, and eptr is also past md->start_used_ptr, note that the end was hit;
for hard partial matching (md->partial > 1) return PCRE_ERROR_PARTIAL at
once. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }

/* SCHECK_PARTIAL: the same, but without the end-of-subject test, for places
where that is already known. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
    }
487
488
489 /* Performance note: It might be tempting to extract commonly used fields from
490 the md structure (e.g. utf, end_subject) into individual variables to improve
491 performance. Tests using gcc on a SPARC disproved this; in the first case, it
492 made performance worse.
493
494 Arguments:
495 eptr pointer to current character in subject
496 ecode pointer to current position in compiled code
497 mstart pointer to the current match start position (can be modified
498 by encountering \K)
499 offset_top current top pointer
500 md pointer to "static" info for the match
501 eptrb pointer to chain of blocks containing eptr at start of
502 brackets - for testing for empty matches
503 rdepth the recursion depth
504
505 Returns: MATCH_MATCH if matched ) these values are >= 0
506 MATCH_NOMATCH if failed to match )
507 a negative MATCH_xxx value for PRUNE, SKIP, etc
508 a negative PCRE_ERROR_xxx value if aborted by an error condition
509 (e.g. stopped by repeated call or recursion limit)
510 */
511
512 static int
513 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
515 unsigned int rdepth)
516 {
517 /* These variables do not need to be preserved over recursion in this function,
518 so they can be ordinary variables in all cases. Mark some of them with
519 "register" because they are used a lot in loops. */
520
521 register int rrc; /* Returns from recursive calls */
522 register int i; /* Used for loops not involving calls to RMATCH() */
523 register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
524 register BOOL utf; /* Local copy of UTF flag for speed */
525
526 BOOL minimize, possessive; /* Quantifier options */
527 BOOL caseless;
528 int condcode;
529
530 /* When recursion is not being used, all "local" variables that have to be
531 preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532 frame on the stack here; subsequent instantiations are obtained from the heap
533 whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534 the top-level on the stack rather than malloc-ing them all gives a performance
535 boost in many cases where there is not much "recursion". */
536
537 #ifdef NO_RECURSE
538 heapframe *frame = (heapframe *)md->match_frames_base;
539
540 /* Copy in the original argument variables */
541
542 frame->Xeptr = eptr;
543 frame->Xecode = ecode;
544 frame->Xmstart = mstart;
545 frame->Xoffset_top = offset_top;
546 frame->Xeptrb = eptrb;
547 frame->Xrdepth = rdepth;
548
549 /* This is where control jumps back to to effect "recursion" */
550
551 HEAP_RECURSE:
552
553 /* Macros make the argument variables come from the current frame */
554
555 #define eptr frame->Xeptr
556 #define ecode frame->Xecode
557 #define mstart frame->Xmstart
558 #define offset_top frame->Xoffset_top
559 #define eptrb frame->Xeptrb
560 #define rdepth frame->Xrdepth
561
562 /* Ditto for the local variables */
563
564 #ifdef SUPPORT_UTF
565 #define charptr frame->Xcharptr
566 #endif
567 #define callpat frame->Xcallpat
568 #define codelink frame->Xcodelink
569 #define data frame->Xdata
570 #define next frame->Xnext
571 #define pp frame->Xpp
572 #define prev frame->Xprev
573 #define saved_eptr frame->Xsaved_eptr
574
575 #define new_recursive frame->Xnew_recursive
576
577 #define cur_is_word frame->Xcur_is_word
578 #define condition frame->Xcondition
579 #define prev_is_word frame->Xprev_is_word
580
581 #ifdef SUPPORT_UCP
582 #define prop_type frame->Xprop_type
583 #define prop_value frame->Xprop_value
584 #define prop_fail_result frame->Xprop_fail_result
585 #define oclength frame->Xoclength
586 #define occhars frame->Xocchars
587 #endif
588
589 #define ctype frame->Xctype
590 #define fc frame->Xfc
591 #define fi frame->Xfi
592 #define length frame->Xlength
593 #define max frame->Xmax
594 #define min frame->Xmin
595 #define number frame->Xnumber
596 #define offset frame->Xoffset
597 #define op frame->Xop
598 #define save_capture_last frame->Xsave_capture_last
599 #define save_offset1 frame->Xsave_offset1
600 #define save_offset2 frame->Xsave_offset2
601 #define save_offset3 frame->Xsave_offset3
602 #define stacksave frame->Xstacksave
603
604 #define newptrb frame->Xnewptrb
605
606 /* When recursion is being used, local variables are allocated on the stack and
607 get preserved during recursion in the normal way. In this environment, fi and
608 i, and fc and c, can be the same variables. */
609
610 #else /* NO_RECURSE not defined */
611 #define fi i
612 #define fc c
613
614 /* Many of the following variables are used only in small blocks of the code.
615 My normal style of coding would have declared them within each of those blocks.
616 However, in order to accommodate the version of this code that uses an external
617 "stack" implemented on the heap, it is easier to declare them all here, so the
618 declarations can be cut out in a block. The only declarations within blocks
619 below are for variables that do not have to be preserved over a recursive call
620 to RMATCH(). */
621
622 #ifdef SUPPORT_UTF
623 const pcre_uchar *charptr;
624 #endif
625 const pcre_uchar *callpat;
626 const pcre_uchar *data;
627 const pcre_uchar *next;
628 PCRE_PUCHAR pp;
629 const pcre_uchar *prev;
630 PCRE_PUCHAR saved_eptr;
631
632 recursion_info new_recursive;
633
634 BOOL cur_is_word;
635 BOOL condition;
636 BOOL prev_is_word;
637
638 #ifdef SUPPORT_UCP
639 int prop_type;
640 unsigned int prop_value;
641 int prop_fail_result;
642 int oclength;
643 pcre_uchar occhars[6];
644 #endif
645
646 int codelink;
647 int ctype;
648 int length;
649 int max;
650 int min;
651 unsigned int number;
652 int offset;
653 unsigned int op;
654 pcre_int32 save_capture_last;
655 int save_offset1, save_offset2, save_offset3;
656 int stacksave[REC_STACK_SAVE_MAX];
657
658 eptrblock newptrb;
659
660 /* There is a special fudge for calling match() in a way that causes it to
661 measure the size of its basic stack frame when the stack is being used for
662 recursion. The second argument (ecode) being NULL triggers this behaviour. It
663 cannot normally ever be NULL. The return is the negated value of the frame
664 size. */
665
666 if (ecode == NULL)
667 {
668 if (rdepth == 0)
669 return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
670 else
671 {
672 int len = (char *)&rdepth - (char *)eptr;
673 return (len > 0)? -len : len;
674 }
675 }
676 #endif /* NO_RECURSE */
677
678 /* To save space on the stack and in the heap frame, I have doubled up on some
679 of the local variables that are used only in localised parts of the code, but
680 still need to be preserved over recursive calls of match(). These macros define
681 the alternative names that are used. */
682
683 #define allow_zero cur_is_word
684 #define cbegroup condition
685 #define code_offset codelink
686 #define condassert condition
687 #define matched_once prev_is_word
688 #define foc number
689 #define save_mark data
690
691 /* These statements are here to stop the compiler complaining about uninitialized
692 variables. */
693
694 #ifdef SUPPORT_UCP
695 prop_value = 0;
696 prop_fail_result = 0;
697 #endif
698
699
700 /* This label is used for tail recursion, which is used in a few cases even
701 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702 used. Thanks to Ian Taylor for noticing this possibility and sending the
703 original patch. */
704
705 TAIL_RECURSE:
706
707 /* OK, now we can get on with the real code of the function. Recursive calls
708 are specified by the macro RMATCH and RRETURN is used to return. When
709 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
710 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
711 defined). However, RMATCH isn't like a function call because it's quite a
712 complicated macro. It has to be used in one particular way. This shouldn't,
713 however, impact performance when true recursion is being used. */
714
715 #ifdef SUPPORT_UTF
716 utf = md->utf; /* Local copy of the flag */
717 #else
718 utf = FALSE;
719 #endif
720
721 /* First check that we haven't called match() too many times, or that we
722 haven't exceeded the recursive call limit. */
723
724 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
726
727 /* At the start of a group with an unlimited repeat that may match an empty
728 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729 done this way to save having to use another function argument, which would take
730 up space on the stack. See also MATCH_CONDASSERT below.
731
732 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733 such remembered pointers, to be checked when we hit the closing ket, in order
734 to break infinite loops that match no characters. When match() is called in
735 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736 NOT be used with tail recursion, because the memory block that is used is on
737 the stack, so a new one may be required for each match(). */
738
739 if (md->match_function_type == MATCH_CBEGROUP)
740 {
741 newptrb.epb_saved_eptr = eptr;
742 newptrb.epb_prev = eptrb;
743 eptrb = &newptrb;
744 md->match_function_type = 0;
745 }
746
747 /* Now start processing the opcodes. */
748
749 for (;;)
750 {
751 minimize = possessive = FALSE;
752 op = *ecode;
753
754 switch(op)
755 {
756 case OP_MARK:
757 md->nomatch_mark = ecode + 2;
758 md->mark = NULL; /* In case previously set by assertion */
759 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
760 eptrb, RM55);
761 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762 md->mark == NULL) md->mark = ecode + 2;
763
764 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765 argument, and we must check whether that argument matches this MARK's
766 argument. It is passed back in md->start_match_ptr (an overloading of that
767 variable). If it does match, we reset that variable to the current subject
768 position and return MATCH_SKIP. Otherwise, pass back the return code
769 unaltered. */
770
771 else if (rrc == MATCH_SKIP_ARG &&
772 STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
773 {
774 md->start_match_ptr = eptr;
775 RRETURN(MATCH_SKIP);
776 }
777 RRETURN(rrc);
778
779 case OP_FAIL:
780 RRETURN(MATCH_NOMATCH);
781
782 case OP_COMMIT:
783 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784 eptrb, RM52);
785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
786 RRETURN(MATCH_COMMIT);
787
788 case OP_PRUNE:
789 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790 eptrb, RM51);
791 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
792 RRETURN(MATCH_PRUNE);
793
794 case OP_PRUNE_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
798 eptrb, RM56);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 RRETURN(MATCH_PRUNE);
803
804 case OP_SKIP:
805 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
806 eptrb, RM53);
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 md->start_match_ptr = eptr; /* Pass back current position */
809 RRETURN(MATCH_SKIP);
810
811 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
812 nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813 not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814 that failed and any that precede it (either they also failed, or were not
815 triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816 SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817 set to the count of the one that failed. */
818
819 case OP_SKIP_ARG:
820 md->skip_arg_count++;
821 if (md->skip_arg_count <= md->ignore_skip_arg)
822 {
823 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
824 break;
825 }
826 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
827 eptrb, RM57);
828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
829
830 /* Pass back the current skip name by overloading md->start_match_ptr and
831 returning the special MATCH_SKIP_ARG return code. This will either be
832 caught by a matching MARK, or get to the top, where it causes a rematch
833 with md->ignore_skip_arg set to the value of md->skip_arg_count. */
834
835 md->start_match_ptr = ecode + 2;
836 RRETURN(MATCH_SKIP_ARG);
837
838 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839 the branch in which it occurs can be determined. Overload the start of
840 match pointer to do this. */
841
842 case OP_THEN:
843 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
844 eptrb, RM54);
845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
846 md->start_match_ptr = ecode;
847 RRETURN(MATCH_THEN);
848
849 case OP_THEN_ARG:
850 md->nomatch_mark = ecode + 2;
851 md->mark = NULL; /* In case previously set by assertion */
852 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
853 md, eptrb, RM58);
854 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855 md->mark == NULL) md->mark = ecode + 2;
856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
857 md->start_match_ptr = ecode;
858 RRETURN(MATCH_THEN);
859
860 /* Handle an atomic group that does not contain any capturing parentheses.
861 This can be handled like an assertion. Prior to 8.13, all atomic groups
862 were handled this way. In 8.13, the code was changed as below for ONCE, so
863 that backups pass through the group and thereby reset captured values.
864 However, this uses a lot more stack, so in 8.20, atomic groups that do not
865 contain any captures generate OP_ONCE_NC, which can be handled in the old,
866 less stack intensive way.
867
868 Check the alternative branches in turn - the matching won't pass the KET
869 for this kind of subpattern. If any one branch matches, we carry on as at
870 the end of a normal bracket, leaving the subject pointer, but resetting
871 the start-of-match value in case it was changed by \K. */
872
873 case OP_ONCE_NC:
874 prev = ecode;
875 saved_eptr = eptr;
876 save_mark = md->mark;
877 do
878 {
879 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
881 {
882 mstart = md->start_match_ptr;
883 break;
884 }
885 if (rrc == MATCH_THEN)
886 {
887 next = ecode + GET(ecode,1);
888 if (md->start_match_ptr < next &&
889 (*ecode == OP_ALT || *next == OP_ALT))
890 rrc = MATCH_NOMATCH;
891 }
892
893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894 ecode += GET(ecode,1);
895 md->mark = save_mark;
896 }
897 while (*ecode == OP_ALT);
898
899 /* If we hit the end of the group (which could be repeated), fail */
900
901 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
902
903 /* Continue as from after the group, updating the offsets high water
904 mark, since extracts may have been taken. */
905
906 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
907
908 offset_top = md->end_offset_top;
909 eptr = md->end_match_ptr;
910
911 /* For a non-repeating ket, just continue at this level. This also
912 happens for a repeating ket if no characters were matched in the group.
913 This is the forcible breaking of infinite loops as implemented in Perl
914 5.005. */
915
916 if (*ecode == OP_KET || eptr == saved_eptr)
917 {
918 ecode += 1+LINK_SIZE;
919 break;
920 }
921
922 /* The repeating kets try the rest of the pattern or restart from the
923 preceding bracket, in the appropriate order. The second "call" of match()
924 uses tail recursion, to avoid using another stack frame. */
925
926 if (*ecode == OP_KETRMIN)
927 {
928 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
930 ecode = prev;
931 goto TAIL_RECURSE;
932 }
933 else /* OP_KETRMAX */
934 {
935 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937 ecode += 1 + LINK_SIZE;
938 goto TAIL_RECURSE;
939 }
940 /* Control never gets here */
941
942 /* Handle a capturing bracket, other than those that are possessive with an
943 unlimited repeat. If there is space in the offset vector, save the current
944 subject position in the working slot at the top of the vector. We mustn't
945 change the current values of the data slot, because they may be set from a
946 previous iteration of this group, and be referred to by a reference inside
947 the group. A failure to match might occur after the group has succeeded,
948 if something later on doesn't match. For this reason, we need to restore
949 the working value and also the values of the final offsets, in case they
950 were set by a previous iteration of the same bracket.
951
952 If there isn't enough space in the offset vector, treat this as if it were
953 a non-capturing bracket. Don't worry about setting the flag for the error
954 case here; that is handled in the code for KET. */
955
956 case OP_CBRA:
957 case OP_SCBRA:
958 number = GET2(ecode, 1+LINK_SIZE);
959 offset = number << 1;
960
961 #ifdef PCRE_DEBUG
962 printf("start bracket %d\n", number);
963 printf("subject=");
964 pchars(eptr, 16, TRUE, md);
965 printf("\n");
966 #endif
967
968 if (offset < md->offset_max)
969 {
970 save_offset1 = md->offset_vector[offset];
971 save_offset2 = md->offset_vector[offset+1];
972 save_offset3 = md->offset_vector[md->offset_end - number];
973 save_capture_last = md->capture_last;
974 save_mark = md->mark;
975
976 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
977 md->offset_vector[md->offset_end - number] =
978 (int)(eptr - md->start_subject);
979
980 for (;;)
981 {
982 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
984 eptrb, RM1);
985 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
986
987 /* If we backed up to a THEN, check whether it is within the current
988 branch by comparing the address of the THEN that is passed back with
989 the end of the branch. If it is within the current branch, and the
990 branch is one of two or more alternatives (it either starts or ends
991 with OP_ALT), we have reached the limit of THEN's action, so convert
992 the return code to NOMATCH, which will cause normal backtracking to
993 happen from now on. Otherwise, THEN is passed back to an outer
994 alternative. This implements Perl's treatment of parenthesized groups,
995 where a group not containing | does not affect the current alternative,
996 that is, (X) is NOT the same as (X|(*F)). */
997
998 if (rrc == MATCH_THEN)
999 {
1000 next = ecode + GET(ecode,1);
1001 if (md->start_match_ptr < next &&
1002 (*ecode == OP_ALT || *next == OP_ALT))
1003 rrc = MATCH_NOMATCH;
1004 }
1005
1006 /* Anything other than NOMATCH is passed back. */
1007
1008 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1009 md->capture_last = save_capture_last;
1010 ecode += GET(ecode, 1);
1011 md->mark = save_mark;
1012 if (*ecode != OP_ALT) break;
1013 }
1014
1015 DPRINTF(("bracket %d failed\n", number));
1016 md->offset_vector[offset] = save_offset1;
1017 md->offset_vector[offset+1] = save_offset2;
1018 md->offset_vector[md->offset_end - number] = save_offset3;
1019
1020 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1021
1022 RRETURN(rrc);
1023 }
1024
1025 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026 as a non-capturing bracket. */
1027
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030
1031 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1032
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035
1036 /* Non-capturing or atomic group, except for possessive with unlimited
1037 repeat and ONCE group with no captures. Loop for all the alternatives.
1038
1039 When we get to the final alternative within the brackets, we used to return
1040 the result of a recursive call to match() whatever happened so it was
1041 possible to reduce stack usage by turning this into a tail recursion,
1042 except in the case of a possibly empty group. However, now that there is
1043 the possibility of (*THEN) occurring in the final alternative, this
1044 optimization is no longer always possible.
1045
1046 We can optimize if we know there are no (*THEN)s in the pattern; at present
1047 this is the best that can be done.
1048
1049 MATCH_ONCE is returned when the end of an atomic group is successfully
1050 reached, but subsequent matching fails. It passes back up the tree (causing
1051 captured values to be reset) until the original atomic group level is
1052 reached. This is tested by comparing md->once_target with the start of the
1053 group. At this point, the return is converted into MATCH_NOMATCH so that
1054 previous backup points can be taken. */
1055
1056 case OP_ONCE:
1057 case OP_BRA:
1058 case OP_SBRA:
1059 DPRINTF(("start non-capturing bracket\n"));
1060
1061 for (;;)
1062 {
1063 if (op >= OP_SBRA || op == OP_ONCE)
1064 md->match_function_type = MATCH_CBEGROUP;
1065
1066 /* If this is not a possibly empty group, and there are no (*THEN)s in
1067 the pattern, and this is the final alternative, optimize as described
1068 above. */
1069
1070 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1071 {
1072 ecode += PRIV(OP_lengths)[*ecode];
1073 goto TAIL_RECURSE;
1074 }
1075
1076 /* In all other cases, we have to make another call to match(). */
1077
1078 save_mark = md->mark;
1079 save_capture_last = md->capture_last;
1080 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1081 RM2);
1082
1083 /* See comment in the code for capturing groups above about handling
1084 THEN. */
1085
1086 if (rrc == MATCH_THEN)
1087 {
1088 next = ecode + GET(ecode,1);
1089 if (md->start_match_ptr < next &&
1090 (*ecode == OP_ALT || *next == OP_ALT))
1091 rrc = MATCH_NOMATCH;
1092 }
1093
1094 if (rrc != MATCH_NOMATCH)
1095 {
1096 if (rrc == MATCH_ONCE)
1097 {
1098 const pcre_uchar *scode = ecode;
1099 if (*scode != OP_ONCE) /* If not at start, find it */
1100 {
1101 while (*scode == OP_ALT) scode += GET(scode, 1);
1102 scode -= GET(scode, 1);
1103 }
1104 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1105 }
1106 RRETURN(rrc);
1107 }
1108 ecode += GET(ecode, 1);
1109 md->mark = save_mark;
1110 if (*ecode != OP_ALT) break;
1111 md->capture_last = save_capture_last;
1112 }
1113
1114 RRETURN(MATCH_NOMATCH);
1115
1116 /* Handle possessive capturing brackets with an unlimited repeat. We come
1117 here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118 handled similarly to the normal case above. However, the matching is
1119 different. The end of these brackets will always be OP_KETRPOS, which
1120 returns MATCH_KETRPOS without going further in the pattern. By this means
1121 we can handle the group by iteration rather than recursion, thereby
1122 reducing the amount of stack needed. */
1123
1124 case OP_CBRAPOS:
1125 case OP_SCBRAPOS:
1126 allow_zero = FALSE;
1127
1128 POSSESSIVE_CAPTURE:
1129 number = GET2(ecode, 1+LINK_SIZE);
1130 offset = number << 1;
1131
1132 #ifdef PCRE_DEBUG
1133 printf("start possessive bracket %d\n", number);
1134 printf("subject=");
1135 pchars(eptr, 16, TRUE, md);
1136 printf("\n");
1137 #endif
1138
1139 if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE;
1140
1141 matched_once = FALSE;
1142 code_offset = (int)(ecode - md->start_code);
1143
1144 save_offset1 = md->offset_vector[offset];
1145 save_offset2 = md->offset_vector[offset+1];
1146 save_offset3 = md->offset_vector[md->offset_end - number];
1147 save_capture_last = md->capture_last;
1148
1149 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1150
1151 /* Each time round the loop, save the current subject position for use
1152 when the group matches. For MATCH_MATCH, the group has matched, so we
1153 restart it with a new subject starting position, remembering that we had
1154 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155 usual. If we haven't matched any alternatives in any iteration, check to
1156 see if a previous iteration matched. If so, the group has matched;
1157 continue from afterwards. Otherwise it has failed; restore the previous
1158 capture values before returning NOMATCH. */
1159
1160 for (;;)
1161 {
1162 md->offset_vector[md->offset_end - number] =
1163 (int)(eptr - md->start_subject);
1164 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1166 eptrb, RM63);
1167 if (rrc == MATCH_KETRPOS)
1168 {
1169 offset_top = md->end_offset_top;
1170 ecode = md->start_code + code_offset;
1171 save_capture_last = md->capture_last;
1172 matched_once = TRUE;
1173 mstart = md->start_match_ptr; /* In case \K changed it */
1174 if (eptr == md->end_match_ptr) /* Matched an empty string */
1175 {
1176 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1177 break;
1178 }
1179 eptr = md->end_match_ptr;
1180 continue;
1181 }
1182
1183 /* See comment in the code for capturing groups above about handling
1184 THEN. */
1185
1186 if (rrc == MATCH_THEN)
1187 {
1188 next = ecode + GET(ecode,1);
1189 if (md->start_match_ptr < next &&
1190 (*ecode == OP_ALT || *next == OP_ALT))
1191 rrc = MATCH_NOMATCH;
1192 }
1193
1194 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1195 md->capture_last = save_capture_last;
1196 ecode += GET(ecode, 1);
1197 if (*ecode != OP_ALT) break;
1198 }
1199
1200 if (!matched_once)
1201 {
1202 md->offset_vector[offset] = save_offset1;
1203 md->offset_vector[offset+1] = save_offset2;
1204 md->offset_vector[md->offset_end - number] = save_offset3;
1205 }
1206
1207 if (allow_zero || matched_once)
1208 {
1209 ecode += 1 + LINK_SIZE;
1210 break;
1211 }
1212
1213 RRETURN(MATCH_NOMATCH);
1214
1215 /* Non-capturing possessive bracket with unlimited repeat. We come here
1216 from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1217 without the capturing complication. It is written out separately for speed
1218 and cleanliness. */
1219
1220 case OP_BRAPOS:
1221 case OP_SBRAPOS:
1222 allow_zero = FALSE;
1223
1224 POSSESSIVE_NON_CAPTURE:
1225 matched_once = FALSE;
1226 code_offset = (int)(ecode - md->start_code);
1227 save_capture_last = md->capture_last;
1228
1229 for (;;)
1230 {
1231 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1232 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1233 eptrb, RM48);
1234 if (rrc == MATCH_KETRPOS)
1235 {
1236 offset_top = md->end_offset_top;
1237 ecode = md->start_code + code_offset;
1238 matched_once = TRUE;
1239 mstart = md->start_match_ptr; /* In case \K reset it */
1240 if (eptr == md->end_match_ptr) /* Matched an empty string */
1241 {
1242 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1243 break;
1244 }
1245 eptr = md->end_match_ptr;
1246 continue;
1247 }
1248
1249 /* See comment in the code for capturing groups above about handling
1250 THEN. */
1251
1252 if (rrc == MATCH_THEN)
1253 {
1254 next = ecode + GET(ecode,1);
1255 if (md->start_match_ptr < next &&
1256 (*ecode == OP_ALT || *next == OP_ALT))
1257 rrc = MATCH_NOMATCH;
1258 }
1259
1260 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1261 ecode += GET(ecode, 1);
1262 if (*ecode != OP_ALT) break;
1263 md->capture_last = save_capture_last;
1264 }
1265
1266 if (matched_once || allow_zero)
1267 {
1268 ecode += 1 + LINK_SIZE;
1269 break;
1270 }
1271 RRETURN(MATCH_NOMATCH);
1272
1273 /* Control never reaches here. */
1274
1275 /* Conditional group: compilation checked that there are no more than two
1276 branches. If the condition is false, skipping the first branch takes us
1277 past the end of the item if there is only one branch, but that's exactly
1278 what we want. */
1279
1280 case OP_COND:
1281 case OP_SCOND:
1282
1283 /* The variable codelink will be added to ecode when the condition is
1284 false, to get to the second branch. Setting it to the offset to the ALT
1285 or KET, then incrementing ecode achieves this effect. We now have ecode
1286 pointing to the condition or callout. */
1287
1288 codelink = GET(ecode, 1); /* Offset to the second branch */
1289 ecode += 1 + LINK_SIZE; /* From this opcode */
1290
1291 /* Because of the way auto-callout works during compile, a callout item is
1292 inserted between OP_COND and an assertion condition. */
1293
1294 if (*ecode == OP_CALLOUT)
1295 {
1296 if (PUBL(callout) != NULL)
1297 {
1298 PUBL(callout_block) cb;
1299 cb.version = 2; /* Version 1 of the callout block */
1300 cb.callout_number = ecode[1];
1301 cb.offset_vector = md->offset_vector;
1302 #if defined COMPILE_PCRE8
1303 cb.subject = (PCRE_SPTR)md->start_subject;
1304 #elif defined COMPILE_PCRE16
1305 cb.subject = (PCRE_SPTR16)md->start_subject;
1306 #elif defined COMPILE_PCRE32
1307 cb.subject = (PCRE_SPTR32)md->start_subject;
1308 #endif
1309 cb.subject_length = (int)(md->end_subject - md->start_subject);
1310 cb.start_match = (int)(mstart - md->start_subject);
1311 cb.current_position = (int)(eptr - md->start_subject);
1312 cb.pattern_position = GET(ecode, 2);
1313 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1314 cb.capture_top = offset_top/2;
1315 cb.capture_last = md->capture_last & CAPLMASK;
1316 /* Internal change requires this for API compatibility. */
1317 if (cb.capture_last == 0) cb.capture_last = -1;
1318 cb.callout_data = md->callout_data;
1319 cb.mark = md->nomatch_mark;
1320 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1321 if (rrc < 0) RRETURN(rrc);
1322 }
1323
1324 /* Advance ecode past the callout, so it now points to the condition. We
1325 must adjust codelink so that the value of ecode+codelink is unchanged. */
1326
1327 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1328 codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1329 }
1330
1331 /* Test the various possible conditions */
1332
1333 condition = FALSE;
1334 switch(condcode = *ecode)
1335 {
1336 case OP_RREF: /* Numbered group recursion test */
1337 if (md->recursive != NULL) /* Not recursing => FALSE */
1338 {
1339 unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1340 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1341 }
1342 break;
1343
1344 case OP_DNRREF: /* Duplicate named group recursion test */
1345 if (md->recursive != NULL)
1346 {
1347 int count = GET2(ecode, 1 + IMM2_SIZE);
1348 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1349 while (count-- > 0)
1350 {
1351 unsigned int recno = GET2(slot, 0);
1352 condition = recno == md->recursive->group_num;
1353 if (condition) break;
1354 slot += md->name_entry_size;
1355 }
1356 }
1357 break;
1358
1359 case OP_CREF: /* Numbered group used test */
1360 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1361 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1362 break;
1363
1364 case OP_DNCREF: /* Duplicate named group used test */
1365 {
1366 int count = GET2(ecode, 1 + IMM2_SIZE);
1367 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1368 while (count-- > 0)
1369 {
1370 offset = GET2(slot, 0) << 1;
1371 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1372 if (condition) break;
1373 slot += md->name_entry_size;
1374 }
1375 }
1376 break;
1377
1378 case OP_DEF: /* DEFINE - always false */
1379 break;
1380
1381 /* The condition is an assertion. Call match() to evaluate it - setting
1382 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1383 of an assertion. */
1384
1385 default:
1386 md->match_function_type = MATCH_CONDASSERT;
1387 RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1388 if (rrc == MATCH_MATCH)
1389 {
1390 if (md->end_offset_top > offset_top)
1391 offset_top = md->end_offset_top; /* Captures may have happened */
1392 condition = TRUE;
1393
1394 /* Advance ecode past the assertion to the start of the first branch,
1395 but adjust it so that the general choosing code below works. If the
1396 assertion has a quantifier that allows zero repeats we must skip over
1397 the BRAZERO. This is a lunatic thing to do, but somebody did! */
1398
1399 if (*ecode == OP_BRAZERO) ecode++;
1400 ecode += GET(ecode, 1);
1401 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1402 ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1403 }
1404
1405 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1406 assertion; it is therefore treated as NOMATCH. Any other return is an
1407 error. */
1408
1409 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1410 {
1411 RRETURN(rrc); /* Need braces because of following else */
1412 }
1413 break;
1414 }
1415
1416 /* Choose branch according to the condition */
1417
1418 ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1419
1420 /* We are now at the branch that is to be obeyed. As there is only one, we
1421 can use tail recursion to avoid using another stack frame, except when
1422 there is unlimited repeat of a possibly empty group. In the latter case, a
1423 recursive call to match() is always required, unless the second alternative
1424 doesn't exist, in which case we can just plough on. Note that, for
1425 compatibility with Perl, the | in a conditional group is NOT treated as
1426 creating two alternatives. If a THEN is encountered in the branch, it
1427 propagates out to the enclosing alternative (unless nested in a deeper set
1428 of alternatives, of course). */
1429
1430 if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1431 {
1432 if (op != OP_SCOND)
1433 {
1434 goto TAIL_RECURSE;
1435 }
1436
1437 md->match_function_type = MATCH_CBEGROUP;
1438 RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1439 RRETURN(rrc);
1440 }
1441
1442 /* Condition false & no alternative; continue after the group. */
1443
1444 else
1445 {
1446 }
1447 break;
1448
1449
1450 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1451 to close any currently open capturing brackets. */
1452
1453 case OP_CLOSE:
1454 number = GET2(ecode, 1); /* Must be less than 65536 */
1455 offset = number << 1;
1456
1457 #ifdef PCRE_DEBUG
1458 printf("end bracket %d at *ACCEPT", number);
1459 printf("\n");
1460 #endif
1461
1462 md->capture_last = (md->capture_last & OVFLMASK) | number;
1463 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1464 {
1465 md->offset_vector[offset] =
1466 md->offset_vector[md->offset_end - number];
1467 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1468
1469 /* If this group is at or above the current highwater mark, ensure that
1470 any groups between the current high water mark and this group are marked
1471 unset and then update the high water mark. */
1472
1473 if (offset >= offset_top)
1474 {
1475 register int *iptr = md->offset_vector + offset_top;
1476 register int *iend = md->offset_vector + offset;
1477 while (iptr < iend) *iptr++ = -1;
1478 offset_top = offset + 2;
1479 }
1480 }
1481 ecode += 1 + IMM2_SIZE;
1482 break;
1483
1484
1485 /* End of the pattern, either real or forced. */
1486
1487 case OP_END:
1488 case OP_ACCEPT:
1489 case OP_ASSERT_ACCEPT:
1490
1491 /* If we have matched an empty string, fail if not in an assertion and not
1492 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1493 is set and we have matched at the start of the subject. In both cases,
1494 backtracking will then try other alternatives, if any. */
1495
1496 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1497 md->recursive == NULL &&
1498 (md->notempty ||
1499 (md->notempty_atstart &&
1500 mstart == md->start_subject + md->start_offset)))
1501 RRETURN(MATCH_NOMATCH);
1502
1503 /* Otherwise, we have a match. */
1504
1505 md->end_match_ptr = eptr; /* Record where we ended */
1506 md->end_offset_top = offset_top; /* and how many extracts were taken */
1507 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1508
1509 /* For some reason, the macros don't work properly if an expression is
1510 given as the argument to RRETURN when the heap is in use. */
1511
1512 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1513 RRETURN(rrc);
1514
1515 /* Assertion brackets. Check the alternative branches in turn - the
1516 matching won't pass the KET for an assertion. If any one branch matches,
1517 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1518 start of each branch to move the current point backwards, so the code at
1519 this level is identical to the lookahead case. When the assertion is part
1520 of a condition, we want to return immediately afterwards. The caller of
1521 this incarnation of the match() function will have set MATCH_CONDASSERT in
1522 md->match_function_type, and one of these opcodes will be the first opcode
1523 that is processed. We use a local variable that is preserved over calls to
1524 match() to remember this case. */
1525
1526 case OP_ASSERT:
1527 case OP_ASSERTBACK:
1528 save_mark = md->mark;
1529 if (md->match_function_type == MATCH_CONDASSERT)
1530 {
1531 condassert = TRUE;
1532 md->match_function_type = 0;
1533 }
1534 else condassert = FALSE;
1535
1536 /* Loop for each branch */
1537
1538 do
1539 {
1540 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1541
1542 /* A match means that the assertion is true; break out of the loop
1543 that matches its alternatives. */
1544
1545 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1546 {
1547 mstart = md->start_match_ptr; /* In case \K reset it */
1548 break;
1549 }
1550
1551 /* If not matched, restore the previous mark setting. */
1552
1553 md->mark = save_mark;
1554
1555 /* See comment in the code for capturing groups above about handling
1556 THEN. */
1557
1558 if (rrc == MATCH_THEN)
1559 {
1560 next = ecode + GET(ecode,1);
1561 if (md->start_match_ptr < next &&
1562 (*ecode == OP_ALT || *next == OP_ALT))
1563 rrc = MATCH_NOMATCH;
1564 }
1565
1566 /* Anything other than NOMATCH causes the entire assertion to fail,
1567 passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1568 uncaptured THEN, which means they take their normal effect. This
1569 consistent approach does not always have exactly the same effect as in
1570 Perl. */
1571
1572 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1573 ecode += GET(ecode, 1);
1574 }
1575 while (*ecode == OP_ALT); /* Continue for next alternative */
1576
1577 /* If we have tried all the alternative branches, the assertion has
1578 failed. If not, we broke out after a match. */
1579
1580 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1581
1582 /* If checking an assertion for a condition, return MATCH_MATCH. */
1583
1584 if (condassert) RRETURN(MATCH_MATCH);
1585
1586 /* Continue from after a successful assertion, updating the offsets high
1587 water mark, since extracts may have been taken during the assertion. */
1588
1589 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1590 ecode += 1 + LINK_SIZE;
1591 offset_top = md->end_offset_top;
1592 continue;
1593
1594 /* Negative assertion: all branches must fail to match for the assertion to
1595 succeed. */
1596
1597 case OP_ASSERT_NOT:
1598 case OP_ASSERTBACK_NOT:
1599 save_mark = md->mark;
1600 if (md->match_function_type == MATCH_CONDASSERT)
1601 {
1602 condassert = TRUE;
1603 md->match_function_type = 0;
1604 }
1605 else condassert = FALSE;
1606
1607 /* Loop for each alternative branch. */
1608
1609 do
1610 {
1611 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1612 md->mark = save_mark; /* Always restore the mark setting */
1613
1614 switch(rrc)
1615 {
1616 case MATCH_MATCH: /* A successful match means */
1617 case MATCH_ACCEPT: /* the assertion has failed. */
1618 RRETURN(MATCH_NOMATCH);
1619
1620 case MATCH_NOMATCH: /* Carry on with next branch */
1621 break;
1622
1623 /* See comment in the code for capturing groups above about handling
1624 THEN. */
1625
1626 case MATCH_THEN:
1627 next = ecode + GET(ecode,1);
1628 if (md->start_match_ptr < next &&
1629 (*ecode == OP_ALT || *next == OP_ALT))
1630 {
1631 rrc = MATCH_NOMATCH;
1632 break;
1633 }
1634 /* Otherwise fall through. */
1635
1636 /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1637 assertion to fail to match, without considering any more alternatives.
1638 Failing to match means the assertion is true. This is a consistent
1639 approach, but does not always have the same effect as in Perl. */
1640
1641 case MATCH_COMMIT:
1642 case MATCH_SKIP:
1643 case MATCH_SKIP_ARG:
1644 case MATCH_PRUNE:
1645 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1646 goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1647
1648 /* Anything else is an error */
1649
1650 default:
1651 RRETURN(rrc);
1652 }
1653
1654 /* Continue with next branch */
1655
1656 ecode += GET(ecode,1);
1657 }
1658 while (*ecode == OP_ALT);
1659
1660 /* All branches in the assertion failed to match. */
1661
1662 NEG_ASSERT_TRUE:
1663 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1664 ecode += 1 + LINK_SIZE; /* Continue with current branch */
1665 continue;
1666
1667 /* Move the subject pointer back. This occurs only at the start of
1668 each branch of a lookbehind assertion. If we are too close to the start to
1669 move back, this match function fails. When working with UTF-8 we move
1670 back a number of characters, not bytes. */
1671
1672 case OP_REVERSE:
1673 #ifdef SUPPORT_UTF
1674 if (utf)
1675 {
1676 i = GET(ecode, 1);
1677 while (i-- > 0)
1678 {
1679 eptr--;
1680 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1681 BACKCHAR(eptr);
1682 }
1683 }
1684 else
1685 #endif
1686
1687 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1688
1689 {
1690 eptr -= GET(ecode, 1);
1691 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1692 }
1693
1694 /* Save the earliest consulted character, then skip to next op code */
1695
1696 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1697 ecode += 1 + LINK_SIZE;
1698 break;
1699
1700 /* The callout item calls an external function, if one is provided, passing
1701 details of the match so far. This is mainly for debugging, though the
1702 function is able to force a failure. */
1703
1704 case OP_CALLOUT:
1705 if (PUBL(callout) != NULL)
1706 {
1707 PUBL(callout_block) cb;
1708 cb.version = 2; /* Version 1 of the callout block */
1709 cb.callout_number = ecode[1];
1710 cb.offset_vector = md->offset_vector;
1711 #if defined COMPILE_PCRE8
1712 cb.subject = (PCRE_SPTR)md->start_subject;
1713 #elif defined COMPILE_PCRE16
1714 cb.subject = (PCRE_SPTR16)md->start_subject;
1715 #elif defined COMPILE_PCRE32
1716 cb.subject = (PCRE_SPTR32)md->start_subject;
1717 #endif
1718 cb.subject_length = (int)(md->end_subject - md->start_subject);
1719 cb.start_match = (int)(mstart - md->start_subject);
1720 cb.current_position = (int)(eptr - md->start_subject);
1721 cb.pattern_position = GET(ecode, 2);
1722 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1723 cb.capture_top = offset_top/2;
1724 cb.capture_last = md->capture_last & CAPLMASK;
1725 /* Internal change requires this for API compatibility. */
1726 if (cb.capture_last == 0) cb.capture_last = -1;
1727 cb.callout_data = md->callout_data;
1728 cb.mark = md->nomatch_mark;
1729 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1730 if (rrc < 0) RRETURN(rrc);
1731 }
1732 ecode += 2 + 2*LINK_SIZE;
1733 break;
1734
1735 /* Recursion either matches the current regex, or some subexpression. The
1736 offset data is the offset to the starting bracket from the start of the
1737 whole pattern. (This is so that it works from duplicated subpatterns.)
1738
1739 The state of the capturing groups is preserved over recursion, and
1740 re-instated afterwards. We don't know how many are started and not yet
1741 finished (offset_top records the completed total) so we just have to save
1742 all the potential data. There may be up to 65535 such values, which is too
1743 large to put on the stack, but using malloc for small numbers seems
1744 expensive. As a compromise, the stack is used when there are no more than
1745 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1746
1747 There are also other values that have to be saved. We use a chained
1748 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1749 for the original version of this logic. It has, however, been hacked around
1750 a lot, so he is not to blame for the current way it works. */
1751
1752 case OP_RECURSE:
1753 {
1754 recursion_info *ri;
1755 unsigned int recno;
1756
1757 callpat = md->start_code + GET(ecode, 1);
1758 recno = (callpat == md->start_code)? 0 :
1759 GET2(callpat, 1 + LINK_SIZE);
1760
1761 /* Check for repeating a recursion without advancing the subject pointer.
1762 This should catch convoluted mutual recursions. (Some simple cases are
1763 caught at compile time.) */
1764
1765 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1766 if (recno == ri->group_num && eptr == ri->subject_position)
1767 RRETURN(PCRE_ERROR_RECURSELOOP);
1768
1769 /* Add to "recursing stack" */
1770
1771 new_recursive.group_num = recno;
1772 new_recursive.saved_capture_last = md->capture_last;
1773 new_recursive.subject_position = eptr;
1774 new_recursive.prevrec = md->recursive;
1775 md->recursive = &new_recursive;
1776
1777 /* Where to continue from afterwards */
1778
1779 ecode += 1 + LINK_SIZE;
1780
1781 /* Now save the offset data */
1782
1783 new_recursive.saved_max = md->offset_end;
1784 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1785 new_recursive.offset_save = stacksave;
1786 else
1787 {
1788 new_recursive.offset_save =
1789 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1790 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1791 }
1792 memcpy(new_recursive.offset_save, md->offset_vector,
1793 new_recursive.saved_max * sizeof(int));
1794
1795 /* OK, now we can do the recursion. After processing each alternative,
1796 restore the offset data and the last captured value. If there were nested
1797 recursions, md->recursive might be changed, so reset it before looping.
1798 */
1799
1800 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1801 cbegroup = (*callpat >= OP_SBRA);
1802 do
1803 {
1804 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1805 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1806 md, eptrb, RM6);
1807 memcpy(md->offset_vector, new_recursive.offset_save,
1808 new_recursive.saved_max * sizeof(int));
1809 md->capture_last = new_recursive.saved_capture_last;
1810 md->recursive = new_recursive.prevrec;
1811 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1812 {
1813 DPRINTF(("Recursion matched\n"));
1814 if (new_recursive.offset_save != stacksave)
1815 (PUBL(free))(new_recursive.offset_save);
1816
1817 /* Set where we got to in the subject, and reset the start in case
1818 it was changed by \K. This *is* propagated back out of a recursion,
1819 for Perl compatibility. */
1820
1821 eptr = md->end_match_ptr;
1822 mstart = md->start_match_ptr;
1823 goto RECURSION_MATCHED; /* Exit loop; end processing */
1824 }
1825
1826 /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1827 recursion; they cause a NOMATCH for the entire recursion. These codes
1828 are defined in a range that can be tested for. */
1829
1830 if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1831 {
1832 if (new_recursive.offset_save != stacksave)
1833 (PUBL(free))(new_recursive.offset_save);
1834 RRETURN(MATCH_NOMATCH);
1835 }
1836
1837 /* Any return code other than NOMATCH is an error. */
1838
1839 if (rrc != MATCH_NOMATCH)
1840 {
1841 DPRINTF(("Recursion gave error %d\n", rrc));
1842 if (new_recursive.offset_save != stacksave)
1843 (PUBL(free))(new_recursive.offset_save);
1844 RRETURN(rrc);
1845 }
1846
1847 md->recursive = &new_recursive;
1848 callpat += GET(callpat, 1);
1849 }
1850 while (*callpat == OP_ALT);
1851
1852 DPRINTF(("Recursion didn't match\n"));
1853 md->recursive = new_recursive.prevrec;
1854 if (new_recursive.offset_save != stacksave)
1855 (PUBL(free))(new_recursive.offset_save);
1856 RRETURN(MATCH_NOMATCH);
1857 }
1858
1859 RECURSION_MATCHED:
1860 break;
1861
1862 /* An alternation is the end of a branch; scan along to find the end of the
1863 bracketed group and go to there. */
1864
1865 case OP_ALT:
1866 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1867 break;
1868
1869 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1870 indicating that it may occur zero times. It may repeat infinitely, or not
1871 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1872 with fixed upper repeat limits are compiled as a number of copies, with the
1873 optional ones preceded by BRAZERO or BRAMINZERO. */
1874
1875 case OP_BRAZERO:
1876 next = ecode + 1;
1877 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1878 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1879 do next += GET(next, 1); while (*next == OP_ALT);
1880 ecode = next + 1 + LINK_SIZE;
1881 break;
1882
1883 case OP_BRAMINZERO:
1884 next = ecode + 1;
1885 do next += GET(next, 1); while (*next == OP_ALT);
1886 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1887 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1888 ecode++;
1889 break;
1890
1891 case OP_SKIPZERO:
1892 next = ecode+1;
1893 do next += GET(next,1); while (*next == OP_ALT);
1894 ecode = next + 1 + LINK_SIZE;
1895 break;
1896
1897 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1898 here; just jump to the group, with allow_zero set TRUE. */
1899
1900 case OP_BRAPOSZERO:
1901 op = *(++ecode);
1902 allow_zero = TRUE;
1903 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1904 goto POSSESSIVE_NON_CAPTURE;
1905
1906 /* End of a group, repeated or non-repeating. */
1907
1908 case OP_KET:
1909 case OP_KETRMIN:
1910 case OP_KETRMAX:
1911 case OP_KETRPOS:
1912 prev = ecode - GET(ecode, 1);
1913
1914 /* If this was a group that remembered the subject start, in order to break
1915 infinite repeats of empty string matches, retrieve the subject start from
1916 the chain. Otherwise, set it NULL. */
1917
1918 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1919 {
1920 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1921 eptrb = eptrb->epb_prev; /* Backup to previous group */
1922 }
1923 else saved_eptr = NULL;
1924
1925 /* If we are at the end of an assertion group or a non-capturing atomic
1926 group, stop matching and return MATCH_MATCH, but record the current high
1927 water mark for use by positive assertions. We also need to record the match
1928 start in case it was changed by \K. */
1929
1930 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1931 *prev == OP_ONCE_NC)
1932 {
1933 md->end_match_ptr = eptr; /* For ONCE_NC */
1934 md->end_offset_top = offset_top;
1935 md->start_match_ptr = mstart;
1936 RRETURN(MATCH_MATCH); /* Sets md->mark */
1937 }
1938
1939 /* For capturing groups we have to check the group number back at the start
1940 and if necessary complete handling an extraction by setting the offsets and
1941 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1942 into group 0, so it won't be picked up here. Instead, we catch it when the
1943 OP_END is reached. Other recursion is handled here. We just have to record
1944 the current subject position and start match pointer and give a MATCH
1945 return. */
1946
1947 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1948 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1949 {
1950 number = GET2(prev, 1+LINK_SIZE);
1951 offset = number << 1;
1952
1953 #ifdef PCRE_DEBUG
1954 printf("end bracket %d", number);
1955 printf("\n");
1956 #endif
1957
1958 /* Handle a recursively called group. */
1959
1960 if (md->recursive != NULL && md->recursive->group_num == number)
1961 {
1962 md->end_match_ptr = eptr;
1963 md->start_match_ptr = mstart;
1964 RRETURN(MATCH_MATCH);
1965 }
1966
1967 /* Deal with capturing */
1968
1969 md->capture_last = (md->capture_last & OVFLMASK) | number;
1970 if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1971 {
1972 /* If offset is greater than offset_top, it means that we are
1973 "skipping" a capturing group, and that group's offsets must be marked
1974 unset. In earlier versions of PCRE, all the offsets were unset at the
1975 start of matching, but this doesn't work because atomic groups and
1976 assertions can cause a value to be set that should later be unset.
1977 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1978 part of the atomic group, but this is not on the final matching path,
1979 so must be unset when 2 is set. (If there is no group 2, there is no
1980 problem, because offset_top will then be 2, indicating no capture.) */
1981
1982 if (offset > offset_top)
1983 {
1984 register int *iptr = md->offset_vector + offset_top;
1985 register int *iend = md->offset_vector + offset;
1986 while (iptr < iend) *iptr++ = -1;
1987 }
1988
1989 /* Now make the extraction */
1990
1991 md->offset_vector[offset] =
1992 md->offset_vector[md->offset_end - number];
1993 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1994 if (offset_top <= offset) offset_top = offset + 2;
1995 }
1996 }
1997
1998 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1999 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2000 at a time from the outer level, thus saving stack. This must precede the
2001 empty string test - in this case that test is done at the outer level. */
2002
2003 if (*ecode == OP_KETRPOS)
2004 {
2005 md->start_match_ptr = mstart; /* In case \K reset it */
2006 md->end_match_ptr = eptr;
2007 md->end_offset_top = offset_top;
2008 RRETURN(MATCH_KETRPOS);
2009 }
2010
2011 /* For an ordinary non-repeating ket, just continue at this level. This
2012 also happens for a repeating ket if no characters were matched in the
2013 group. This is the forcible breaking of infinite loops as implemented in
2014 Perl 5.005. For a non-repeating atomic group that includes captures,
2015 establish a backup point by processing the rest of the pattern at a lower
2016 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2017 original OP_ONCE level, thereby bypassing intermediate backup points, but
2018 resetting any captures that happened along the way. */
2019
2020 if (*ecode == OP_KET || eptr == saved_eptr)
2021 {
2022 if (*prev == OP_ONCE)
2023 {
2024 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2025 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2026 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2027 RRETURN(MATCH_ONCE);
2028 }
2029 ecode += 1 + LINK_SIZE; /* Carry on at this level */
2030 break;
2031 }
2032
2033 /* The normal repeating kets try the rest of the pattern or restart from
2034 the preceding bracket, in the appropriate order. In the second case, we can
2035 use tail recursion to avoid using another stack frame, unless we have an
2036 atomic group or an unlimited repeat of a group that can match an empty
2037 string. */
2038
2039 if (*ecode == OP_KETRMIN)
2040 {
2041 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
2042 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2043 if (*prev == OP_ONCE)
2044 {
2045 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2046 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2047 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2048 RRETURN(MATCH_ONCE);
2049 }
2050 if (*prev >= OP_SBRA) /* Could match an empty string */
2051 {
2052 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
2053 RRETURN(rrc);
2054 }
2055 ecode = prev;
2056 goto TAIL_RECURSE;
2057 }
2058 else /* OP_KETRMAX */
2059 {
2060 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2061 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
2062 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2063 if (*prev == OP_ONCE)
2064 {
2065 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2066 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 md->once_target = prev;
2068 RRETURN(MATCH_ONCE);
2069 }
2070 ecode += 1 + LINK_SIZE;
2071 goto TAIL_RECURSE;
2072 }
2073 /* Control never gets here */
2074
2075 /* Not multiline mode: start of subject assertion, unless notbol. */
2076
2077 case OP_CIRC:
2078 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2079
2080 /* Start of subject assertion */
2081
2082 case OP_SOD:
2083 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2084 ecode++;
2085 break;
2086
2087 /* Multiline mode: start of subject unless notbol, or after any newline. */
2088
2089 case OP_CIRCM:
2090 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2091 if (eptr != md->start_subject &&
2092 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2093 RRETURN(MATCH_NOMATCH);
2094 ecode++;
2095 break;
2096
2097 /* Start of match assertion */
2098
2099 case OP_SOM:
2100 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2101 ecode++;
2102 break;
2103
2104 /* Reset the start of match point */
2105
2106 case OP_SET_SOM:
2107 mstart = eptr;
2108 ecode++;
2109 break;
2110
2111 /* Multiline mode: assert before any newline, or before end of subject
2112 unless noteol is set. */
2113
2114 case OP_DOLLM:
2115 if (eptr < md->end_subject)
2116 {
2117 if (!IS_NEWLINE(eptr))
2118 {
2119 if (md->partial != 0 &&
2120 eptr + 1 >= md->end_subject &&
2121 NLBLOCK->nltype == NLTYPE_FIXED &&
2122 NLBLOCK->nllen == 2 &&
2123 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2124 {
2125 md->hitend = TRUE;
2126 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2127 }
2128 RRETURN(MATCH_NOMATCH);
2129 }
2130 }
2131 else
2132 {
2133 if (md->noteol) RRETURN(MATCH_NOMATCH);
2134 SCHECK_PARTIAL();
2135 }
2136 ecode++;
2137 break;
2138
2139 /* Not multiline mode: assert before a terminating newline or before end of
2140 subject unless noteol is set. */
2141
2142 case OP_DOLL:
2143 if (md->noteol) RRETURN(MATCH_NOMATCH);
2144 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2145
2146 /* ... else fall through for endonly */
2147
2148 /* End of subject assertion (\z) */
2149
2150 case OP_EOD:
2151 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2152 SCHECK_PARTIAL();
2153 ecode++;
2154 break;
2155
2156 /* End of subject or ending \n assertion (\Z) */
2157
2158 case OP_EODN:
2159 ASSERT_NL_OR_EOS:
2160 if (eptr < md->end_subject &&
2161 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2162 {
2163 if (md->partial != 0 &&
2164 eptr + 1 >= md->end_subject &&
2165 NLBLOCK->nltype == NLTYPE_FIXED &&
2166 NLBLOCK->nllen == 2 &&
2167 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2168 {
2169 md->hitend = TRUE;
2170 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2171 }
2172 RRETURN(MATCH_NOMATCH);
2173 }
2174
2175 /* Either at end of string or \n before end. */
2176
2177 SCHECK_PARTIAL();
2178 ecode++;
2179 break;
2180
2181 /* Word boundary assertions */
2182
2183 case OP_NOT_WORD_BOUNDARY:
2184 case OP_WORD_BOUNDARY:
2185 {
2186
2187 /* Find out if the previous and current characters are "word" characters.
2188 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2189 be "non-word" characters. Remember the earliest consulted character for
2190 partial matching. */
2191
2192 #ifdef SUPPORT_UTF
2193 if (utf)
2194 {
2195 /* Get status of previous character */
2196
2197 if (eptr == md->start_subject) prev_is_word = FALSE; else
2198 {
2199 PCRE_PUCHAR lastptr = eptr - 1;
2200 BACKCHAR(lastptr);
2201 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2202 GETCHAR(c, lastptr);
2203 #ifdef SUPPORT_UCP
2204 if (md->use_ucp)
2205 {
2206 if (c == '_') prev_is_word = TRUE; else
2207 {
2208 int cat = UCD_CATEGORY(c);
2209 prev_is_word = (cat == ucp_L || cat == ucp_N);
2210 }
2211 }
2212 else
2213 #endif
2214 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2215 }
2216
2217 /* Get status of next character */
2218
2219 if (eptr >= md->end_subject)
2220 {
2221 SCHECK_PARTIAL();
2222 cur_is_word = FALSE;
2223 }
2224 else
2225 {
2226 GETCHAR(c, eptr);
2227 #ifdef SUPPORT_UCP
2228 if (md->use_ucp)
2229 {
2230 if (c == '_') cur_is_word = TRUE; else
2231 {
2232 int cat = UCD_CATEGORY(c);
2233 cur_is_word = (cat == ucp_L || cat == ucp_N);
2234 }
2235 }
2236 else
2237 #endif
2238 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2239 }
2240 }
2241 else
2242 #endif
2243
2244 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2245 consistency with the behaviour of \w we do use it in this case. */
2246
2247 {
2248 /* Get status of previous character */
2249
2250 if (eptr == md->start_subject) prev_is_word = FALSE; else
2251 {
2252 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2253 #ifdef SUPPORT_UCP
2254 if (md->use_ucp)
2255 {
2256 c = eptr[-1];
2257 if (c == '_') prev_is_word = TRUE; else
2258 {
2259 int cat = UCD_CATEGORY(c);
2260 prev_is_word = (cat == ucp_L || cat == ucp_N);
2261 }
2262 }
2263 else
2264 #endif
2265 prev_is_word = MAX_255(eptr[-1])
2266 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2267 }
2268
2269 /* Get status of next character */
2270
2271 if (eptr >= md->end_subject)
2272 {
2273 SCHECK_PARTIAL();
2274 cur_is_word = FALSE;
2275 }
2276 else
2277 #ifdef SUPPORT_UCP
2278 if (md->use_ucp)
2279 {
2280 c = *eptr;
2281 if (c == '_') cur_is_word = TRUE; else
2282 {
2283 int cat = UCD_CATEGORY(c);
2284 cur_is_word = (cat == ucp_L || cat == ucp_N);
2285 }
2286 }
2287 else
2288 #endif
2289 cur_is_word = MAX_255(*eptr)
2290 && ((md->ctypes[*eptr] & ctype_word) != 0);
2291 }
2292
2293 /* Now see if the situation is what we want */
2294
2295 if ((*ecode++ == OP_WORD_BOUNDARY)?
2296 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2297 RRETURN(MATCH_NOMATCH);
2298 }
2299 break;
2300
2301 /* Match any single character type except newline; have to take care with
2302 CRLF newlines and partial matching. */
2303
2304 case OP_ANY:
2305 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2306 if (md->partial != 0 &&
2307 eptr + 1 >= md->end_subject &&
2308 NLBLOCK->nltype == NLTYPE_FIXED &&
2309 NLBLOCK->nllen == 2 &&
2310 UCHAR21TEST(eptr) == NLBLOCK->nl[0])
2311 {
2312 md->hitend = TRUE;
2313 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2314 }
2315
2316 /* Fall through */
2317
2318 /* Match any single character whatsoever. */
2319
2320 case OP_ALLANY:
2321 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2322 { /* not be updated before SCHECK_PARTIAL. */
2323 SCHECK_PARTIAL();
2324 RRETURN(MATCH_NOMATCH);
2325 }
2326 eptr++;
2327 #ifdef SUPPORT_UTF
2328 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2329 #endif
2330 ecode++;
2331 break;
2332
2333 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2334 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2335
2336 case OP_ANYBYTE:
2337 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2338 { /* not be updated before SCHECK_PARTIAL. */
2339 SCHECK_PARTIAL();
2340 RRETURN(MATCH_NOMATCH);
2341 }
2342 eptr++;
2343 ecode++;
2344 break;
2345
2346 case OP_NOT_DIGIT:
2347 if (eptr >= md->end_subject)
2348 {
2349 SCHECK_PARTIAL();
2350 RRETURN(MATCH_NOMATCH);
2351 }
2352 GETCHARINCTEST(c, eptr);
2353 if (
2354 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2355 c < 256 &&
2356 #endif
2357 (md->ctypes[c] & ctype_digit) != 0
2358 )
2359 RRETURN(MATCH_NOMATCH);
2360 ecode++;
2361 break;
2362
2363 case OP_DIGIT:
2364 if (eptr >= md->end_subject)
2365 {
2366 SCHECK_PARTIAL();
2367 RRETURN(MATCH_NOMATCH);
2368 }
2369 GETCHARINCTEST(c, eptr);
2370 if (
2371 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2372 c > 255 ||
2373 #endif
2374 (md->ctypes[c] & ctype_digit) == 0
2375 )
2376 RRETURN(MATCH_NOMATCH);
2377 ecode++;
2378 break;
2379
2380 case OP_NOT_WHITESPACE:
2381 if (eptr >= md->end_subject)
2382 {
2383 SCHECK_PARTIAL();
2384 RRETURN(MATCH_NOMATCH);
2385 }
2386 GETCHARINCTEST(c, eptr);
2387 if (
2388 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2389 c < 256 &&
2390 #endif
2391 (md->ctypes[c] & ctype_space) != 0
2392 )
2393 RRETURN(MATCH_NOMATCH);
2394 ecode++;
2395 break;
2396
2397 case OP_WHITESPACE:
2398 if (eptr >= md->end_subject)
2399 {
2400 SCHECK_PARTIAL();
2401 RRETURN(MATCH_NOMATCH);
2402 }
2403 GETCHARINCTEST(c, eptr);
2404 if (
2405 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2406 c > 255 ||
2407 #endif
2408 (md->ctypes[c] & ctype_space) == 0
2409 )
2410 RRETURN(MATCH_NOMATCH);
2411 ecode++;
2412 break;
2413
2414 case OP_NOT_WORDCHAR:
2415 if (eptr >= md->end_subject)
2416 {
2417 SCHECK_PARTIAL();
2418 RRETURN(MATCH_NOMATCH);
2419 }
2420 GETCHARINCTEST(c, eptr);
2421 if (
2422 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2423 c < 256 &&
2424 #endif
2425 (md->ctypes[c] & ctype_word) != 0
2426 )
2427 RRETURN(MATCH_NOMATCH);
2428 ecode++;
2429 break;
2430
2431 case OP_WORDCHAR:
2432 if (eptr >= md->end_subject)
2433 {
2434 SCHECK_PARTIAL();
2435 RRETURN(MATCH_NOMATCH);
2436 }
2437 GETCHARINCTEST(c, eptr);
2438 if (
2439 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2440 c > 255 ||
2441 #endif
2442 (md->ctypes[c] & ctype_word) == 0
2443 )
2444 RRETURN(MATCH_NOMATCH);
2445 ecode++;
2446 break;
2447
2448 case OP_ANYNL:
2449 if (eptr >= md->end_subject)
2450 {
2451 SCHECK_PARTIAL();
2452 RRETURN(MATCH_NOMATCH);
2453 }
2454 GETCHARINCTEST(c, eptr);
2455 switch(c)
2456 {
2457 default: RRETURN(MATCH_NOMATCH);
2458
2459 case CHAR_CR:
2460 if (eptr >= md->end_subject)
2461 {
2462 SCHECK_PARTIAL();
2463 }
2464 else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
2465 break;
2466
2467 case CHAR_LF:
2468 break;
2469
2470 case CHAR_VT:
2471 case CHAR_FF:
2472 case CHAR_NEL:
2473 #ifndef EBCDIC
2474 case 0x2028:
2475 case 0x2029:
2476 #endif /* Not EBCDIC */
2477 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2478 break;
2479 }
2480 ecode++;
2481 break;
2482
2483 case OP_NOT_HSPACE:
2484 if (eptr >= md->end_subject)
2485 {
2486 SCHECK_PARTIAL();
2487 RRETURN(MATCH_NOMATCH);
2488 }
2489 GETCHARINCTEST(c, eptr);
2490 switch(c)
2491 {
2492 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
2493 default: break;
2494 }
2495 ecode++;
2496 break;
2497
2498 case OP_HSPACE:
2499 if (eptr >= md->end_subject)
2500 {
2501 SCHECK_PARTIAL();
2502 RRETURN(MATCH_NOMATCH);
2503 }
2504 GETCHARINCTEST(c, eptr);
2505 switch(c)
2506 {
2507 HSPACE_CASES: break; /* Byte and multibyte cases */
2508 default: RRETURN(MATCH_NOMATCH);
2509 }
2510 ecode++;
2511 break;
2512
2513 case OP_NOT_VSPACE:
2514 if (eptr >= md->end_subject)
2515 {
2516 SCHECK_PARTIAL();
2517 RRETURN(MATCH_NOMATCH);
2518 }
2519 GETCHARINCTEST(c, eptr);
2520 switch(c)
2521 {
2522 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
2523 default: break;
2524 }
2525 ecode++;
2526 break;
2527
2528 case OP_VSPACE:
2529 if (eptr >= md->end_subject)
2530 {
2531 SCHECK_PARTIAL();
2532 RRETURN(MATCH_NOMATCH);
2533 }
2534 GETCHARINCTEST(c, eptr);
2535 switch(c)
2536 {
2537 VSPACE_CASES: break;
2538 default: RRETURN(MATCH_NOMATCH);
2539 }
2540 ecode++;
2541 break;
2542
2543 #ifdef SUPPORT_UCP
2544 /* Check the next character by Unicode property. We will get here only
2545 if the support is in the binary; otherwise a compile-time error occurs. */
2546
2547 case OP_PROP:
2548 case OP_NOTPROP:
2549 if (eptr >= md->end_subject)
2550 {
2551 SCHECK_PARTIAL();
2552 RRETURN(MATCH_NOMATCH);
2553 }
2554 GETCHARINCTEST(c, eptr);
2555 {
2556 const pcre_uint32 *cp;
2557 const ucd_record *prop = GET_UCD(c);
2558
2559 switch(ecode[1])
2560 {
2561 case PT_ANY:
2562 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2563 break;
2564
2565 case PT_LAMP:
2566 if ((prop->chartype == ucp_Lu ||
2567 prop->chartype == ucp_Ll ||
2568 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2569 RRETURN(MATCH_NOMATCH);
2570 break;
2571
2572 case PT_GC:
2573 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2574 RRETURN(MATCH_NOMATCH);
2575 break;
2576
2577 case PT_PC:
2578 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2579 RRETURN(MATCH_NOMATCH);
2580 break;
2581
2582 case PT_SC:
2583 if ((ecode[2] != prop->script) == (op == OP_PROP))
2584 RRETURN(MATCH_NOMATCH);
2585 break;
2586
2587 /* These are specials */
2588
2589 case PT_ALNUM:
2590 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2591 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2592 RRETURN(MATCH_NOMATCH);
2593 break;
2594
2595 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2596 which means that Perl space and POSIX space are now identical. PCRE
2597 was changed at release 8.34. */
2598
2599 case PT_SPACE: /* Perl space */
2600 case PT_PXSPACE: /* POSIX space */
2601 switch(c)
2602 {
2603 HSPACE_CASES:
2604 VSPACE_CASES:
2605 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2606 break;
2607
2608 default:
2609 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2610 (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2611 break;
2612 }
2613 break;
2614
2615 case PT_WORD:
2616 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2617 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2618 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2619 RRETURN(MATCH_NOMATCH);
2620 break;
2621
2622 case PT_CLIST:
2623 cp = PRIV(ucd_caseless_sets) + ecode[2];
2624 for (;;)
2625 {
2626 if (c < *cp)
2627 { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2628 if (c == *cp++)
2629 { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2630 }
2631 break;
2632
2633 case PT_UCNC:
2634 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2635 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2636 c >= 0xe000) == (op == OP_NOTPROP))
2637 RRETURN(MATCH_NOMATCH);
2638 break;
2639
2640 /* This should never occur */
2641
2642 default:
2643 RRETURN(PCRE_ERROR_INTERNAL);
2644 }
2645
2646 ecode += 3;
2647 }
2648 break;
2649
2650 /* Match an extended Unicode sequence. We will get here only if the support
2651 is in the binary; otherwise a compile-time error occurs. */
2652
2653 case OP_EXTUNI:
2654 if (eptr >= md->end_subject)
2655 {
2656 SCHECK_PARTIAL();
2657 RRETURN(MATCH_NOMATCH);
2658 }
2659 else
2660 {
2661 int lgb, rgb;
2662 GETCHARINCTEST(c, eptr);
2663 lgb = UCD_GRAPHBREAK(c);
2664 while (eptr < md->end_subject)
2665 {
2666 int len = 1;
2667 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2668 rgb = UCD_GRAPHBREAK(c);
2669 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2670 lgb = rgb;
2671 eptr += len;
2672 }
2673 }
2674 CHECK_PARTIAL();
2675 ecode++;
2676 break;
2677 #endif /* SUPPORT_UCP */
2678
2679
2680 /* Match a back reference, possibly repeatedly. Look past the end of the
2681 item to see if there is repeat information following. The code is similar
2682 to that for character classes, but repeated for efficiency. Then obey
2683 similar code to character type repeats - written out again for speed.
2684 However, if the referenced string is the empty string, always treat
2685 it as matched, any number of times (otherwise there could be infinite
2686 loops). If the reference is unset, there are two possibilities:
2687
2688 (a) In the default, Perl-compatible state, set the length negative;
2689 this ensures that every attempt at a match fails. We can't just fail
2690 here, because of the possibility of quantifiers with zero minima.
2691
2692 (b) If the JavaScript compatibility flag is set, set the length to zero
2693 so that the back reference matches an empty string.
2694
2695 Otherwise, set the length to the length of what was matched by the
2696 referenced subpattern.
2697
2698 The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2699 or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2700 and OP_DNREFI are used. In this case we must scan the list of groups to
2701 which the name refers, and use the first one that is set. */
2702
2703 case OP_DNREF:
2704 case OP_DNREFI:
2705 caseless = op == OP_DNREFI;
2706 {
2707 int count = GET2(ecode, 1+IMM2_SIZE);
2708 pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2709 ecode += 1 + 2*IMM2_SIZE;
2710
2711 /* Setting the default length first and initializing 'offset' avoids
2712 compiler warnings in the REF_REPEAT code. */
2713
2714 length = (md->jscript_compat)? 0 : -1;
2715 offset = 0;
2716
2717 while (count-- > 0)
2718 {
2719 offset = GET2(slot, 0) << 1;
2720 if (offset < offset_top && md->offset_vector[offset] >= 0)
2721 {
2722 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2723 break;
2724 }
2725 slot += md->name_entry_size;
2726 }
2727 }
2728 goto REF_REPEAT;
2729
2730 case OP_REF:
2731 case OP_REFI:
2732 caseless = op == OP_REFI;
2733 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2734 ecode += 1 + IMM2_SIZE;
2735 if (offset >= offset_top || md->offset_vector[offset] < 0)
2736 length = (md->jscript_compat)? 0 : -1;
2737 else
2738 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2739
2740 /* Set up for repetition, or handle the non-repeated case */
2741
2742 REF_REPEAT:
2743 switch (*ecode)
2744 {
2745 case OP_CRSTAR:
2746 case OP_CRMINSTAR:
2747 case OP_CRPLUS:
2748 case OP_CRMINPLUS:
2749 case OP_CRQUERY:
2750 case OP_CRMINQUERY:
2751 c = *ecode++ - OP_CRSTAR;
2752 minimize = (c & 1) != 0;
2753 min = rep_min[c]; /* Pick up values from tables; */
2754 max = rep_max[c]; /* zero for max => infinity */
2755 if (max == 0) max = INT_MAX;
2756 break;
2757
2758 case OP_CRRANGE:
2759 case OP_CRMINRANGE:
2760 minimize = (*ecode == OP_CRMINRANGE);
2761 min = GET2(ecode, 1);
2762 max = GET2(ecode, 1 + IMM2_SIZE);
2763 if (max == 0) max = INT_MAX;
2764 ecode += 1 + 2 * IMM2_SIZE;
2765 break;
2766
2767 default: /* No repeat follows */
2768 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2769 {
2770 if (length == -2) eptr = md->end_subject; /* Partial match */
2771 CHECK_PARTIAL();
2772 RRETURN(MATCH_NOMATCH);
2773 }
2774 eptr += length;
2775 continue; /* With the main loop */
2776 }
2777
2778 /* Handle repeated back references. If the length of the reference is
2779 zero, just continue with the main loop. If the length is negative, it
2780 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2781 zero, we can continue at the same level without recursion. For any other
2782 minimum, carrying on will result in NOMATCH. */
2783
2784 if (length == 0) continue;
2785 if (length < 0 && min == 0) continue;
2786
2787 /* First, ensure the minimum number of matches are present. We get back
2788 the length of the reference string explicitly rather than passing the
2789 address of eptr, so that eptr can be a register variable. */
2790
2791 for (i = 1; i <= min; i++)
2792 {
2793 int slength;
2794 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2795 {
2796 if (slength == -2) eptr = md->end_subject; /* Partial match */
2797 CHECK_PARTIAL();
2798 RRETURN(MATCH_NOMATCH);
2799 }
2800 eptr += slength;
2801 }
2802
2803 /* If min = max, continue at the same level without recursion.
2804 They are not both allowed to be zero. */
2805
2806 if (min == max) continue;
2807
2808 /* If minimizing, keep trying and advancing the pointer */
2809
2810 if (minimize)
2811 {
2812 for (fi = min;; fi++)
2813 {
2814 int slength;
2815 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2816 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2817 if (fi >= max) RRETURN(MATCH_NOMATCH);
2818 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2819 {
2820 if (slength == -2) eptr = md->end_subject; /* Partial match */
2821 CHECK_PARTIAL();
2822 RRETURN(MATCH_NOMATCH);
2823 }
2824 eptr += slength;
2825 }
2826 /* Control never gets here */
2827 }
2828
2829 /* If maximizing, find the longest string and work backwards */
2830
2831 else
2832 {
2833 pp = eptr;
2834 for (i = min; i < max; i++)
2835 {
2836 int slength;
2837 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2838 {
2839 /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2840 the soft partial matching case. */
2841
2842 if (slength == -2 && md->partial != 0 &&
2843 md->end_subject > md->start_used_ptr)
2844 {
2845 md->hitend = TRUE;
2846 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2847 }
2848 break;
2849 }
2850 eptr += slength;
2851 }
2852
2853 while (eptr >= pp)
2854 {
2855 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2856 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2857 eptr -= length;
2858 }
2859 RRETURN(MATCH_NOMATCH);
2860 }
2861 /* Control never gets here */
2862
2863 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2864 used when all the characters in the class have values in the range 0-255,
2865 and either the matching is caseful, or the characters are in the range
2866 0-127 when UTF-8 processing is enabled. The only difference between
2867 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2868 encountered.
2869
2870 First, look past the end of the item to see if there is repeat information
2871 following. Then obey similar code to character type repeats - written out
2872 again for speed. */
2873
2874 case OP_NCLASS:
2875 case OP_CLASS:
2876 {
2877 /* The data variable is saved across frames, so the byte map needs to
2878 be stored there. */
2879 #define BYTE_MAP ((pcre_uint8 *)data)
2880 data = ecode + 1; /* Save for matching */
2881 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2882
2883 switch (*ecode)
2884 {
2885 case OP_CRSTAR:
2886 case OP_CRMINSTAR:
2887 case OP_CRPLUS:
2888 case OP_CRMINPLUS:
2889 case OP_CRQUERY:
2890 case OP_CRMINQUERY:
2891 case OP_CRPOSSTAR:
2892 case OP_CRPOSPLUS:
2893 case OP_CRPOSQUERY:
2894 c = *ecode++ - OP_CRSTAR;
2895 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2896 else possessive = TRUE;
2897 min = rep_min[c]; /* Pick up values from tables; */
2898 max = rep_max[c]; /* zero for max => infinity */
2899 if (max == 0) max = INT_MAX;
2900 break;
2901
2902 case OP_CRRANGE:
2903 case OP_CRMINRANGE:
2904 case OP_CRPOSRANGE:
2905 minimize = (*ecode == OP_CRMINRANGE);
2906 possessive = (*ecode == OP_CRPOSRANGE);
2907 min = GET2(ecode, 1);
2908 max = GET2(ecode, 1 + IMM2_SIZE);
2909 if (max == 0) max = INT_MAX;
2910 ecode += 1 + 2 * IMM2_SIZE;
2911 break;
2912
2913 default: /* No repeat follows */
2914 min = max = 1;
2915 break;
2916 }
2917
2918 /* First, ensure the minimum number of matches are present. */
2919
2920 #ifdef SUPPORT_UTF
2921 if (utf)
2922 {
2923 for (i = 1; i <= min; i++)
2924 {
2925 if (eptr >= md->end_subject)
2926 {
2927 SCHECK_PARTIAL();
2928 RRETURN(MATCH_NOMATCH);
2929 }
2930 GETCHARINC(c, eptr);
2931 if (c > 255)
2932 {
2933 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2934 }
2935 else
2936 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2937 }
2938 }
2939 else
2940 #endif
2941 /* Not UTF mode */
2942 {
2943 for (i = 1; i <= min; i++)
2944 {
2945 if (eptr >= md->end_subject)
2946 {
2947 SCHECK_PARTIAL();
2948 RRETURN(MATCH_NOMATCH);
2949 }
2950 c = *eptr++;
2951 #ifndef COMPILE_PCRE8
2952 if (c > 255)
2953 {
2954 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2955 }
2956 else
2957 #endif
2958 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2959 }
2960 }
2961
2962 /* If max == min we can continue with the main loop without the
2963 need to recurse. */
2964
2965 if (min == max) continue;
2966
2967 /* If minimizing, keep testing the rest of the expression and advancing
2968 the pointer while it matches the class. */
2969
2970 if (minimize)
2971 {
2972 #ifdef SUPPORT_UTF
2973 if (utf)
2974 {
2975 for (fi = min;; fi++)
2976 {
2977 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2979 if (fi >= max) RRETURN(MATCH_NOMATCH);
2980 if (eptr >= md->end_subject)
2981 {
2982 SCHECK_PARTIAL();
2983 RRETURN(MATCH_NOMATCH);
2984 }
2985 GETCHARINC(c, eptr);
2986 if (c > 255)
2987 {
2988 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2989 }
2990 else
2991 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2992 }
2993 }
2994 else
2995 #endif
2996 /* Not UTF mode */
2997 {
2998 for (fi = min;; fi++)
2999 {
3000 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
3001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3002 if (fi >= max) RRETURN(MATCH_NOMATCH);
3003 if (eptr >= md->end_subject)
3004 {
3005 SCHECK_PARTIAL();
3006 RRETURN(MATCH_NOMATCH);
3007 }
3008 c = *eptr++;
3009 #ifndef COMPILE_PCRE8
3010 if (c > 255)
3011 {
3012 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3013 }
3014 else
3015 #endif
3016 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
3017 }
3018 }
3019 /* Control never gets here */
3020 }
3021
3022 /* If maximizing, find the longest possible run, then work backwards. */
3023
3024 else
3025 {
3026 pp = eptr;
3027
3028 #ifdef SUPPORT_UTF
3029 if (utf)
3030 {
3031 for (i = min; i < max; i++)
3032 {
3033 int len = 1;
3034 if (eptr >= md->end_subject)
3035 {
3036 SCHECK_PARTIAL();
3037 break;
3038 }
3039 GETCHARLEN(c, eptr, len);
3040 if (c > 255)
3041 {
3042 if (op == OP_CLASS) break;
3043 }
3044 else
3045 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3046 eptr += len;
3047 }
3048
3049 if (possessive) continue; /* No backtracking */
3050
3051 for (;;)
3052 {
3053 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
3054 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3055 if (eptr-- == pp) break; /* Stop if tried at original pos */
3056 BACKCHAR(eptr);
3057 }
3058 }
3059 else
3060 #endif
3061 /* Not UTF mode */
3062 {
3063 for (i = min; i < max; i++)
3064 {
3065 if (eptr >= md->end_subject)
3066 {
3067 SCHECK_PARTIAL();
3068 break;
3069 }
3070 c = *eptr;
3071 #ifndef COMPILE_PCRE8
3072 if (c > 255)
3073 {
3074 if (op == OP_CLASS) break;
3075 }
3076 else
3077 #endif
3078 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
3079 eptr++;
3080 }
3081
3082 if (possessive) continue; /* No backtracking */
3083
3084 while (eptr >= pp)
3085 {
3086 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
3087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3088 eptr--;
3089 }
3090 }
3091
3092 RRETURN(MATCH_NOMATCH);
3093 }
3094 #undef BYTE_MAP
3095 }
3096 /* Control never gets here */
3097
3098
3099 /* Match an extended character class. In the 8-bit library, this opcode is
3100 encountered only when UTF-8 mode is supported. In the 16-bit and
3101 32-bit libraries, codepoints greater than 255 may be encountered even when
3102 UTF is not supported. */
3103
3104 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3105 case OP_XCLASS:
3106 {
3107 data = ecode + 1 + LINK_SIZE; /* Save for matching */
3108 ecode += GET(ecode, 1); /* Advance past the item */
3109
3110 switch (*ecode)
3111 {
3112 case OP_CRSTAR:
3113 case OP_CRMINSTAR:
3114 case OP_CRPLUS:
3115 case OP_CRMINPLUS:
3116 case OP_CRQUERY:
3117 case OP_CRMINQUERY:
3118 case OP_CRPOSSTAR:
3119 case OP_CRPOSPLUS:
3120 case OP_CRPOSQUERY:
3121 c = *ecode++ - OP_CRSTAR;
3122 if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3123 else possessive = TRUE;
3124 min = rep_min[c]; /* Pick up values from tables; */
3125 max = rep_max[c]; /* zero for max => infinity */
3126 if (max == 0) max = INT_MAX;
3127 break;
3128
3129 case OP_CRRANGE:
3130 case OP_CRMINRANGE:
3131 case OP_CRPOSRANGE:
3132 minimize = (*ecode == OP_CRMINRANGE);
3133 possessive = (*ecode == OP_CRPOSRANGE);
3134 min = GET2(ecode, 1);
3135 max = GET2(ecode, 1 + IMM2_SIZE);
3136 if (max == 0) max = INT_MAX;
3137 ecode += 1 + 2 * IMM2_SIZE;
3138 break;
3139
3140 default: /* No repeat follows */
3141 min = max = 1;
3142 break;
3143 }
3144
3145 /* First, ensure the minimum number of matches are present. */
3146
3147 for (i = 1; i <= min; i++)
3148 {
3149 if (eptr >= md->end_subject)
3150 {
3151 SCHECK_PARTIAL();
3152 RRETURN(MATCH_NOMATCH);
3153 }
3154 GETCHARINCTEST(c, eptr);
3155 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3156 }
3157
3158 /* If max == min we can continue with the main loop without the
3159 need to recurse. */
3160
3161 if (min == max) continue;
3162
3163 /* If minimizing, keep testing the rest of the expression and advancing
3164 the pointer while it matches the class. */
3165
3166 if (minimize)
3167 {
3168 for (fi = min;; fi++)
3169 {
3170 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
3171 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3172 if (fi >= max) RRETURN(MATCH_NOMATCH);
3173 if (eptr >= md->end_subject)
3174 {
3175 SCHECK_PARTIAL();
3176 RRETURN(MATCH_NOMATCH);
3177 }
3178 GETCHARINCTEST(c, eptr);
3179 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3180 }
3181 /* Control never gets here */
3182 }
3183
3184 /* If maximizing, find the longest possible run, then work backwards. */
3185
3186 else
3187 {
3188 pp = eptr;
3189 for (i = min; i < max; i++)
3190 {
3191 int len = 1;
3192 if (eptr >= md->end_subject)
3193 {
3194 SCHECK_PARTIAL();
3195 break;
3196 }
3197 #ifdef SUPPORT_UTF
3198 GETCHARLENTEST(c, eptr, len);
3199 #else
3200 c = *eptr;
3201 #endif
3202 if (!PRIV(xclass)(c, data, utf)) break;
3203 eptr += len;
3204 }
3205
3206 if (possessive) continue; /* No backtracking */
3207
3208 for(;;)
3209 {
3210 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3211 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3212 if (eptr-- == pp) break; /* Stop if tried at original pos */
3213 #ifdef SUPPORT_UTF
3214 if (utf) BACKCHAR(eptr);
3215 #endif
3216 }
3217 RRETURN(MATCH_NOMATCH);
3218 }
3219
3220 /* Control never gets here */
3221 }
3222 #endif /* End of XCLASS */
3223
3224 /* Match a single character, casefully */
3225
3226 case OP_CHAR:
3227 #ifdef SUPPORT_UTF
3228 if (utf)
3229 {
3230 length = 1;
3231 ecode++;
3232 GETCHARLEN(fc, ecode, length);
3233 if (length > md->end_subject - eptr)
3234 {
3235 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3236 RRETURN(MATCH_NOMATCH);
3237 }
3238 while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
3239 }
3240 else
3241 #endif
3242 /* Not UTF mode */
3243 {
3244 if (md->end_subject - eptr < 1)
3245 {
3246 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3247 RRETURN(MATCH_NOMATCH);
3248 }
3249 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3250 ecode += 2;
3251 }
3252 break;
3253
3254 /* Match a single character, caselessly. If we are at the end of the
3255 subject, give up immediately. */
3256
3257 case OP_CHARI:
3258 if (eptr >= md->end_subject)
3259 {
3260 SCHECK_PARTIAL();
3261 RRETURN(MATCH_NOMATCH);
3262 }
3263
3264 #ifdef SUPPORT_UTF
3265 if (utf)
3266 {
3267 length = 1;
3268 ecode++;
3269 GETCHARLEN(fc, ecode, length);
3270
3271 /* If the pattern character's value is < 128, we have only one byte, and
3272 we know that its other case must also be one byte long, so we can use the
3273 fast lookup table. We know that there is at least one byte left in the
3274 subject. */
3275
3276 if (fc < 128)
3277 {
3278 pcre_uint32 cc = UCHAR21(eptr);
3279 if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
3280 ecode++;
3281 eptr++;
3282 }
3283
3284 /* Otherwise we must pick up the subject character. Note that we cannot
3285 use the value of "length" to check for sufficient bytes left, because the
3286 other case of the character may have more or fewer bytes. */
3287
3288 else
3289 {
3290 pcre_uint32 dc;
3291 GETCHARINC(dc, eptr);
3292 ecode += length;
3293
3294 /* If we have Unicode property support, we can use it to test the other
3295 case of the character, if there is one. */
3296
3297 if (fc != dc)
3298 {
3299 #ifdef SUPPORT_UCP
3300 if (dc != UCD_OTHERCASE(fc))
3301 #endif
3302 RRETURN(MATCH_NOMATCH);
3303 }
3304 }
3305 }
3306 else
3307 #endif /* SUPPORT_UTF */
3308
3309 /* Not UTF mode */
3310 {
3311 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3312 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3313 eptr++;
3314 ecode += 2;
3315 }
3316 break;
3317
3318 /* Match a single character repeatedly. */
3319
3320 case OP_EXACT:
3321 case OP_EXACTI:
3322 min = max = GET2(ecode, 1);
3323 ecode += 1 + IMM2_SIZE;
3324 goto REPEATCHAR;
3325
3326 case OP_POSUPTO:
3327 case OP_POSUPTOI:
3328 possessive = TRUE;
3329 /* Fall through */
3330
3331 case OP_UPTO:
3332 case OP_UPTOI:
3333 case OP_MINUPTO:
3334 case OP_MINUPTOI:
3335 min = 0;
3336 max = GET2(ecode, 1);
3337 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3338 ecode += 1 + IMM2_SIZE;
3339 goto REPEATCHAR;
3340
3341 case OP_POSSTAR:
3342 case OP_POSSTARI:
3343 possessive = TRUE;
3344 min = 0;
3345 max = INT_MAX;
3346 ecode++;
3347 goto REPEATCHAR;
3348
3349 case OP_POSPLUS:
3350 case OP_POSPLUSI:
3351 possessive = TRUE;
3352 min = 1;
3353 max = INT_MAX;
3354 ecode++;
3355 goto REPEATCHAR;
3356
3357 case OP_POSQUERY:
3358 case OP_POSQUERYI:
3359 possessive = TRUE;
3360 min = 0;
3361 max = 1;
3362 ecode++;
3363 goto REPEATCHAR;
3364
3365 case OP_STAR:
3366 case OP_STARI:
3367 case OP_MINSTAR:
3368 case OP_MINSTARI:
3369 case OP_PLUS:
3370 case OP_PLUSI:
3371 case OP_MINPLUS:
3372 case OP_MINPLUSI:
3373 case OP_QUERY:
3374 case OP_QUERYI:
3375 case OP_MINQUERY:
3376 case OP_MINQUERYI:
3377 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3378 minimize = (c & 1) != 0;
3379 min = rep_min[c]; /* Pick up values from tables; */
3380 max = rep_max[c]; /* zero for max => infinity */
3381 if (max == 0) max = INT_MAX;
3382
3383 /* Common code for all repeated single-character matches. We first check
3384 for the minimum number of characters. If the minimum equals the maximum, we
3385 are done. Otherwise, if minimizing, check the rest of the pattern for a
3386 match; if there isn't one, advance up to the maximum, one character at a
3387 time.
3388
3389 If maximizing, advance up to the maximum number of matching characters,
3390 until eptr is past the end of the maximum run. If possessive, we are
3391 then done (no backing up). Otherwise, match at this position; anything
3392 other than no match is immediately returned. For nomatch, back up one
3393 character, unless we are matching \R and the last thing matched was
3394 \r\n, in which case, back up two bytes. When we reach the first optional
3395 character position, we can save stack by doing a tail recurse.
3396
3397 The various UTF/non-UTF and caseful/caseless cases are handled separately,
3398 for speed. */
3399
3400 REPEATCHAR:
3401 #ifdef SUPPORT_UTF
3402 if (utf)
3403 {
3404 length = 1;
3405 charptr = ecode;
3406 GETCHARLEN(fc, ecode, length);
3407 ecode += length;
3408
3409 /* Handle multibyte character matching specially here. There is
3410 support for caseless matching if UCP support is present. */
3411
3412 if (length > 1)
3413 {
3414 #ifdef SUPPORT_UCP
3415 pcre_uint32 othercase;
3416 if (op >= OP_STARI && /* Caseless */
3417 (othercase = UCD_OTHERCASE(fc)) != fc)
3418 oclength = PRIV(ord2utf)(othercase, occhars);
3419 else oclength = 0;
3420 #endif /* SUPPORT_UCP */
3421
3422 for (i = 1; i <= min; i++)
3423 {
3424 if (eptr <= md->end_subject - length &&
3425 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3426 #ifdef SUPPORT_UCP
3427 else if (oclength > 0 &&
3428 eptr <= md->end_subject - oclength &&
3429 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3430 #endif /* SUPPORT_UCP */
3431 else
3432 {
3433 CHECK_PARTIAL();
3434 RRETURN(MATCH_NOMATCH);
3435 }
3436 }
3437
3438 if (min == max) continue;
3439
3440 if (minimize)
3441 {
3442 for (fi = min;; fi++)
3443 {
3444 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3445 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3446 if (fi >= max) RRETURN(MATCH_NOMATCH);
3447 if (eptr <= md->end_subject - length &&
3448 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3449 #ifdef SUPPORT_UCP
3450 else if (oclength > 0 &&
3451 eptr <= md->end_subject - oclength &&
3452 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3453 #endif /* SUPPORT_UCP */
3454 else
3455 {
3456 CHECK_PARTIAL();
3457 RRETURN(MATCH_NOMATCH);
3458 }
3459 }
3460 /* Control never gets here */
3461 }
3462
3463 else /* Maximize */
3464 {
3465 pp = eptr;
3466 for (i = min; i < max; i++)
3467 {
3468 if (eptr <= md->end_subject - length &&
3469 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3470 #ifdef SUPPORT_UCP
3471 else if (oclength > 0 &&
3472 eptr <= md->end_subject - oclength &&
3473 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3474 #endif /* SUPPORT_UCP */
3475 else
3476 {
3477 CHECK_PARTIAL();
3478 break;
3479 }
3480 }
3481
3482 if (possessive) continue; /* No backtracking */
3483 for(;;)
3484 {
3485 if (eptr == pp) goto TAIL_RECURSE;
3486 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3487 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3488 #ifdef SUPPORT_UCP
3489 eptr--;
3490 BACKCHAR(eptr);
3491 #else /* without SUPPORT_UCP */
3492 eptr -= length;
3493 #endif /* SUPPORT_UCP */
3494 }
3495 }
3496 /* Control never gets here */
3497 }
3498
3499 /* If the length of a UTF-8 character is 1, we fall through here, and
3500 obey the code as for non-UTF-8 characters below, though in this case the
3501 value of fc will always be < 128. */
3502 }
3503 else
3504 #endif /* SUPPORT_UTF */
3505 /* When not in UTF-8 mode, load a single-byte character. */
3506 fc = *ecode++;
3507
3508 /* The value of fc at this point is always one character, though we may
3509 or may not be in UTF mode. The code is duplicated for the caseless and
3510 caseful cases, for speed, since matching characters is likely to be quite
3511 common. First, ensure the minimum number of matches are present. If min =
3512 max, continue at the same level without recursing. Otherwise, if
3513 minimizing, keep trying the rest of the expression and advancing one
3514 matching character if failing, up to the maximum. Alternatively, if
3515 maximizing, find the maximum number of characters and work backwards. */
3516
3517 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3518 max, (char *)eptr));
3519
3520 if (op >= OP_STARI) /* Caseless */
3521 {
3522 #ifdef COMPILE_PCRE8
3523 /* fc must be < 128 if UTF is enabled. */
3524 foc = md->fcc[fc];
3525 #else
3526 #ifdef SUPPORT_UTF
3527 #ifdef SUPPORT_UCP
3528 if (utf && fc > 127)
3529 foc = UCD_OTHERCASE(fc);
3530 #else
3531 if (utf && fc > 127)
3532 foc = fc;
3533 #endif /* SUPPORT_UCP */
3534 else
3535 #endif /* SUPPORT_UTF */
3536 foc = TABLE_GET(fc, md->fcc, fc);
3537 #endif /* COMPILE_PCRE8 */
3538
3539 for (i = 1; i <= min; i++)
3540 {
3541 pcre_uint32 cc; /* Faster than pcre_uchar */
3542 if (eptr >= md->end_subject)
3543 {
3544 SCHECK_PARTIAL();
3545 RRETURN(MATCH_NOMATCH);
3546 }
3547 cc = UCHAR21TEST(eptr);
3548 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3549 eptr++;
3550 }
3551 if (min == max) continue;
3552 if (minimize)
3553 {
3554 for (fi = min;; fi++)
3555 {
3556 pcre_uint32 cc; /* Faster than pcre_uchar */
3557 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3559 if (fi >= max) RRETURN(MATCH_NOMATCH);
3560 if (eptr >= md->end_subject)
3561 {
3562 SCHECK_PARTIAL();
3563 RRETURN(MATCH_NOMATCH);
3564 }
3565 cc = UCHAR21TEST(eptr);
3566 if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
3567 eptr++;
3568 }
3569 /* Control never gets here */
3570 }
3571 else /* Maximize */
3572 {
3573 pp = eptr;
3574 for (i = min; i < max; i++)
3575 {
3576 pcre_uint32 cc; /* Faster than pcre_uchar */
3577 if (eptr >= md->end_subject)
3578 {
3579 SCHECK_PARTIAL();
3580 break;
3581 }
3582 cc = UCHAR21TEST(eptr);
3583 if (fc != cc && foc != cc) break;
3584 eptr++;
3585 }
3586 if (possessive) continue; /* No backtracking */
3587 for (;;)
3588 {
3589 if (eptr == pp) goto TAIL_RECURSE;
3590 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3591 eptr--;
3592 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3593 }
3594 /* Control never gets here */
3595 }
3596 }
3597
3598 /* Caseful comparisons (includes all multi-byte characters) */
3599
3600 else
3601 {
3602 for (i = 1; i <= min; i++)
3603 {
3604 if (eptr >= md->end_subject)
3605 {
3606 SCHECK_PARTIAL();
3607 RRETURN(MATCH_NOMATCH);
3608 }
3609 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3610 }
3611
3612 if (min == max) continue;
3613
3614 if (minimize)
3615 {
3616 for (fi = min;; fi++)
3617 {
3618 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3620 if (fi >= max) RRETURN(MATCH_NOMATCH);
3621 if (eptr >= md->end_subject)
3622 {
3623 SCHECK_PARTIAL();
3624 RRETURN(MATCH_NOMATCH);
3625 }
3626 if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
3627 }
3628 /* Control never gets here */
3629 }
3630 else /* Maximize */
3631 {
3632 pp = eptr;
3633 for (i = min; i < max; i++)
3634 {
3635 if (eptr >= md->end_subject)
3636 {
3637 SCHECK_PARTIAL();
3638 break;
3639 }
3640 if (fc != UCHAR21TEST(eptr)) break;
3641 eptr++;
3642 }
3643 if (possessive) continue; /* No backtracking */
3644 for (;;)
3645 {
3646 if (eptr == pp) goto TAIL_RECURSE;
3647 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3648 eptr--;
3649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3650 }
3651 /* Control never gets here */
3652 }
3653 }
3654 /* Control never gets here */
3655
3656 /* Match a negated single one-byte character. The character we are
3657 checking can be multibyte. */
3658
3659 case OP_NOT:
3660 case OP_NOTI:
3661 if (eptr >= md->end_subject)
3662 {
3663 SCHECK_PARTIAL();
3664 RRETURN(MATCH_NOMATCH);
3665 }
3666 #ifdef SUPPORT_UTF
3667 if (utf)
3668 {
3669 register pcre_uint32 ch, och;
3670
3671 ecode++;
3672 GETCHARINC(ch, ecode);
3673 GETCHARINC(c, eptr);
3674
3675 if (op == OP_NOT)
3676 {
3677 if (ch == c) RRETURN(MATCH_NOMATCH);
3678 }
3679 else
3680 {
3681 #ifdef SUPPORT_UCP
3682 if (ch > 127)
3683 och = UCD_OTHERCASE(ch);
3684 #else
3685 if (ch > 127)
3686 och = ch;
3687 #endif /* SUPPORT_UCP */
3688 else
3689 och = TABLE_GET(ch, md->fcc, ch);
3690 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3691 }
3692 }
3693 else
3694 #endif
3695 {
3696 register pcre_uint32 ch = ecode[1];
3697 c = *eptr++;
3698 if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3699 RRETURN(MATCH_NOMATCH);
3700 ecode += 2;
3701 }
3702 break;
3703
3704 /* Match a negated single one-byte character repeatedly. This is almost a
3705 repeat of the code for a repeated single character, but I haven't found a
3706 nice way of commoning these up that doesn't require a test of the
3707 positive/negative option for each character match. Maybe that wouldn't add
3708 very much to the time taken, but character matching *is* what this is all
3709 about... */
3710
3711 case OP_NOTEXACT:
3712 case OP_NOTEXACTI:
3713 min = max = GET2(ecode, 1);
3714 ecode += 1 + IMM2_SIZE;
3715 goto REPEATNOTCHAR;
3716
3717 case OP_NOTUPTO:
3718 case OP_NOTUPTOI:
3719 case OP_NOTMINUPTO:
3720 case OP_NOTMINUPTOI:
3721 min = 0;
3722 max = GET2(ecode, 1);
3723 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3724 ecode += 1 + IMM2_SIZE;
3725 goto REPEATNOTCHAR;
3726
3727 case OP_NOTPOSSTAR:
3728 case OP_NOTPOSSTARI:
3729 possessive = TRUE;
3730 min = 0;
3731 max = INT_MAX;
3732 ecode++;
3733 goto REPEATNOTCHAR;
3734
3735 case OP_NOTPOSPLUS:
3736 case OP_NOTPOSPLUSI:
3737 possessive = TRUE;
3738 min = 1;
3739 max = INT_MAX;
3740 ecode++;
3741 goto REPEATNOTCHAR;
3742
3743 case OP_NOTPOSQUERY:
3744 case OP_NOTPOSQUERYI:
3745 possessive = TRUE;
3746 min = 0;
3747 max = 1;
3748 ecode++;
3749 goto REPEATNOTCHAR;
3750
3751 case OP_NOTPOSUPTO:
3752 case OP_NOTPOSUPTOI:
3753 possessive = TRUE;
3754 min = 0;
3755 max = GET2(ecode, 1);
3756 ecode += 1 + IMM2_SIZE;
3757 goto REPEATNOTCHAR;
3758
3759 case OP_NOTSTAR:
3760 case OP_NOTSTARI:
3761 case OP_NOTMINSTAR:
3762 case OP_NOTMINSTARI:
3763 case OP_NOTPLUS:
3764 case OP_NOTPLUSI:
3765 case OP_NOTMINPLUS:
3766 case OP_NOTMINPLUSI:
3767 case OP_NOTQUERY:
3768 case OP_NOTQUERYI:
3769 case OP_NOTMINQUERY:
3770 case OP_NOTMINQUERYI:
3771 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3772 minimize = (c & 1) != 0;
3773 min = rep_min[c]; /* Pick up values from tables; */
3774 max = rep_max[c]; /* zero for max => infinity */
3775 if (max == 0) max = INT_MAX;
3776
3777 /* Common code for all repeated single-byte matches. */
3778
3779 REPEATNOTCHAR:
3780 GETCHARINCTEST(fc, ecode);
3781
3782 /* The code is duplicated for the caseless and caseful cases, for speed,
3783 since matching characters is likely to be quite common. First, ensure the
3784 minimum number of matches are present. If min = max, continue at the same
3785 level without recursing. Otherwise, if minimizing, keep trying the rest of
3786 the expression and advancing one matching character if failing, up to the
3787 maximum. Alternatively, if maximizing, find the maximum number of
3788 characters and work backwards. */
3789
3790 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3791 max, (char *)eptr));
3792
3793 if (op >= OP_NOTSTARI) /* Caseless */
3794 {
3795 #ifdef SUPPORT_UTF
3796 #ifdef SUPPORT_UCP
3797 if (utf && fc > 127)
3798 foc = UCD_OTHERCASE(fc);
3799 #else
3800 if (utf && fc > 127)
3801 foc = fc;
3802 #endif /* SUPPORT_UCP */
3803 else
3804 #endif /* SUPPORT_UTF */
3805 foc = TABLE_GET(fc, md->fcc, fc);
3806
3807 #ifdef SUPPORT_UTF
3808 if (utf)
3809 {
3810 register pcre_uint32 d;
3811 for (i = 1; i <= min; i++)
3812 {
3813 if (eptr >= md->end_subject)
3814 {
3815 SCHECK_PARTIAL();
3816 RRETURN(MATCH_NOMATCH);
3817 }
3818 GETCHARINC(d, eptr);
3819 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3820 }
3821 }
3822 else
3823 #endif /* SUPPORT_UTF */
3824 /* Not UTF mode */
3825 {
3826 for (i = 1; i <= min; i++)
3827 {
3828 if (eptr >= md->end_subject)
3829 {
3830 SCHECK_PARTIAL();
3831 RRETURN(MATCH_NOMATCH);
3832 }
3833 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3834 eptr++;
3835 }
3836 }
3837
3838 if (min == max) continue;
3839
3840 if (minimize)
3841 {
3842 #ifdef SUPPORT_UTF
3843 if (utf)
3844 {
3845 register pcre_uint32 d;
3846 for (fi = min;; fi++)
3847 {
3848 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3850 if (fi >= max) RRETURN(MATCH_NOMATCH);
3851 if (eptr >= md->end_subject)
3852 {
3853 SCHECK_PARTIAL();
3854 RRETURN(MATCH_NOMATCH);
3855 }
3856 GETCHARINC(d, eptr);
3857 if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
3858 }
3859 }
3860 else
3861 #endif /*SUPPORT_UTF */
3862 /* Not UTF mode */
3863 {
3864 for (fi = min;; fi++)
3865 {
3866 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3867 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3868 if (fi >= max) RRETURN(MATCH_NOMATCH);
3869 if (eptr >= md->end_subject)
3870 {
3871 SCHECK_PARTIAL();
3872 RRETURN(MATCH_NOMATCH);
3873 }
3874 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3875 eptr++;
3876 }
3877 }
3878 /* Control never gets here */
3879 }
3880
3881 /* Maximize case */
3882
3883 else
3884 {
3885 pp = eptr;
3886
3887 #ifdef SUPPORT_UTF
3888 if (utf)
3889 {
3890 register pcre_uint32 d;
3891 for (i = min; i < max; i++)
3892 {
3893 int len = 1;
3894 if (eptr >= md->end_subject)
3895 {
3896 SCHECK_PARTIAL();
3897 break;
3898 }
3899 GETCHARLEN(d, eptr, len);
3900 if (fc == d || (unsigned int)foc == d) break;
3901 eptr += len;
3902 }
3903 if (possessive) continue; /* No backtracking */
3904 for(;;)
3905 {
3906 if (eptr == pp) goto TAIL_RECURSE;
3907 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3908 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3909 eptr--;
3910 BACKCHAR(eptr);
3911 }
3912 }
3913 else
3914 #endif /* SUPPORT_UTF */
3915 /* Not UTF mode */
3916 {
3917 for (i = min; i < max; i++)
3918 {
3919 if (eptr >= md->end_subject)
3920 {
3921 SCHECK_PARTIAL();
3922 break;
3923 }
3924 if (fc == *eptr || foc == *eptr) break;
3925 eptr++;
3926 }
3927 if (possessive) continue; /* No backtracking */
3928 for (;;)
3929 {
3930 if (eptr == pp) goto TAIL_RECURSE;
3931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3933 eptr--;
3934 }
3935 }
3936 /* Control never gets here */
3937 }
3938 }
3939
3940 /* Caseful comparisons */
3941
3942 else
3943 {
3944 #ifdef SUPPORT_UTF
3945 if (utf)
3946 {
3947 register pcre_uint32 d;
3948 for (i = 1; i <= min; i++)
3949 {
3950 if (eptr >= md->end_subject)
3951 {
3952 SCHECK_PARTIAL();
3953 RRETURN(MATCH_NOMATCH);
3954 }
3955 GETCHARINC(d, eptr);
3956 if (fc == d) RRETURN(MATCH_NOMATCH);
3957 }
3958 }
3959 else
3960 #endif
3961 /* Not UTF mode */
3962 {
3963 for (i = 1; i <= min; i++)
3964 {
3965 if (eptr >= md->end_subject)
3966 {
3967 SCHECK_PARTIAL();
3968 RRETURN(MATCH_NOMATCH);
3969 }
3970 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3971 }
3972 }
3973
3974 if (min == max) continue;
3975
3976 if (minimize)
3977 {
3978 #ifdef SUPPORT_UTF
3979 if (utf)
3980 {
3981 register pcre_uint32 d;
3982 for (fi = min;; fi++)
3983 {
3984 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3985 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3986 if (fi >= max) RRETURN(MATCH_NOMATCH);
3987 if (eptr >= md->end_subject)
3988 {
3989 SCHECK_PARTIAL();
3990 RRETURN(MATCH_NOMATCH);
3991 }
3992 GETCHARINC(d, eptr);
3993 if (fc == d) RRETURN(MATCH_NOMATCH);
3994 }
3995 }
3996 else
3997 #endif
3998 /* Not UTF mode */
3999 {
4000 for (fi = min;; fi++)
4001 {
4002 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
4003 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4004 if (fi >= max) RRETURN(MATCH_NOMATCH);
4005 if (eptr >= md->end_subject)
4006 {
4007 SCHECK_PARTIAL();
4008 RRETURN(MATCH_NOMATCH);
4009 }
4010 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
4011 }
4012 }
4013 /* Control never gets here */
4014 }
4015
4016 /* Maximize case */
4017
4018 else
4019 {
4020 pp = eptr;
4021
4022 #ifdef SUPPORT_UTF
4023 if (utf)
4024 {
4025 register pcre_uint32 d;
4026 for (i = min; i < max; i++)
4027 {
4028 int len = 1;
4029 if (eptr >= md->end_subject)
4030 {
4031 SCHECK_PARTIAL();
4032 break;
4033 }
4034 GETCHARLEN(d, eptr, len);
4035 if (fc == d) break;
4036 eptr += len;
4037 }
4038 if (possessive) continue; /* No backtracking */
4039 for(;;)
4040 {
4041 if (eptr == pp) goto TAIL_RECURSE;
4042 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
4043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4044 eptr--;
4045 BACKCHAR(eptr);
4046 }
4047 }
4048 else
4049 #endif
4050 /* Not UTF mode */
4051 {
4052 for (i = min; i < max; i++)
4053 {
4054 if (eptr >= md->end_subject)
4055 {
4056 SCHECK_PARTIAL();
4057 break;
4058 }
4059 if (fc == *eptr) break;
4060 eptr++;
4061 }
4062 if (possessive) continue; /* No backtracking */
4063 for (;;)
4064 {
4065 if (eptr == pp) goto TAIL_RECURSE;
4066 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
4067 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4068 eptr--;
4069 }
4070 }
4071 /* Control never gets here */
4072 }
4073 }
4074 /* Control never gets here */
4075
4076 /* Match a single character type repeatedly; several different opcodes
4077 share code. This is very similar to the code for single characters, but we
4078 repeat it in the interests of efficiency. */
4079
4080 case OP_TYPEEXACT:
4081 min = max = GET2(ecode, 1);
4082 minimize = TRUE;
4083 ecode += 1 + IMM2_SIZE;
4084 goto REPEATTYPE;
4085
4086 case OP_TYPEUPTO:
4087 case OP_TYPEMINUPTO:
4088 min = 0;
4089 max = GET2(ecode, 1);
4090 minimize = *ecode == OP_TYPEMINUPTO;
4091 ecode += 1 + IMM2_SIZE;
4092 goto REPEATTYPE;
4093
4094 case OP_TYPEPOSSTAR:
4095 possessive = TRUE;
4096 min = 0;
4097 max = INT_MAX;
4098 ecode++;
4099 goto REPEATTYPE;
4100
4101 case OP_TYPEPOSPLUS:
4102 possessive = TRUE;
4103 min = 1;
4104 max = INT_MAX;
4105 ecode++;
4106 goto REPEATTYPE;
4107
4108 case OP_TYPEPOSQUERY:
4109 possessive = TRUE;
4110 min = 0;
4111 max = 1;
4112 ecode++;
4113 goto REPEATTYPE;
4114
4115 case OP_TYPEPOSUPTO:
4116 possessive = TRUE;
4117 min = 0;
4118 max = GET2(ecode, 1);
4119 ecode += 1 + IMM2_SIZE;
4120 goto REPEATTYPE;
4121
4122 case OP_TYPESTAR:
4123 case OP_TYPEMINSTAR:
4124 case OP_TYPEPLUS:
4125 case OP_TYPEMINPLUS:
4126 case OP_TYPEQUERY:
4127 case OP_TYPEMINQUERY:
4128 c = *ecode++ - OP_TYPESTAR;
4129 minimize = (c & 1) != 0;
4130 min = rep_min[c]; /* Pick up values from tables; */
4131 max = rep_max[c]; /* zero for max => infinity */
4132 if (max == 0) max = INT_MAX;
4133
4134 /* Common code for all repeated single character type matches. Note that
4135 in UTF-8 mode, '.' matches a character of any length, but for the other
4136 character types, the valid characters are all one-byte long. */
4137
4138 REPEATTYPE:
4139 ctype = *ecode++; /* Code for the character type */
4140
4141 #ifdef SUPPORT_UCP
4142 if (ctype == OP_PROP || ctype == OP_NOTPROP)
4143 {
4144 prop_fail_result = ctype == OP_NOTPROP;
4145 prop_type = *ecode++;
4146 prop_value = *ecode++;
4147 }
4148 else prop_type = -1;
4149 #endif
4150
4151 /* First, ensure the minimum number of matches are present. Use inline
4152 code for maximizing the speed, and do the type test once at the start
4153 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
4154 is tidier. Also separate the UCP code, which can be the same for both UTF-8
4155 and single-bytes. */
4156
4157 if (min > 0)
4158 {
4159 #ifdef SUPPORT_UCP
4160 if (prop_type >= 0)
4161 {
4162 switch(prop_type)
4163 {
4164 case PT_ANY:
4165 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4166 for (i = 1; i <= min; i++)
4167 {
4168 if (eptr >= md->end_subject)
4169 {
4170 SCHECK_PARTIAL();
4171 RRETURN(MATCH_NOMATCH);
4172 }
4173 GETCHARINCTEST(c, eptr);
4174 }
4175 break;
4176
4177 case PT_LAMP:
4178 for (i = 1; i <= min; i++)
4179 {
4180 int chartype;
4181 if (eptr >= md->end_subject)
4182 {
4183 SCHECK_PARTIAL();
4184 RRETURN(MATCH_NOMATCH);
4185 }
4186 GETCHARINCTEST(c, eptr);
4187 chartype = UCD_CHARTYPE(c);
4188 if ((chartype == ucp_Lu ||
4189 chartype == ucp_Ll ||
4190 chartype == ucp_Lt) == prop_fail_result)
4191 RRETURN(MATCH_NOMATCH);
4192 }
4193 break;
4194
4195 case PT_GC:
4196 for (i = 1; i <= min; i++)
4197 {
4198 if (eptr >= md->end_subject)
4199 {
4200 SCHECK_PARTIAL();
4201 RRETURN(MATCH_NOMATCH);
4202 }
4203 GETCHARINCTEST(c, eptr);
4204 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4205 RRETURN(MATCH_NOMATCH);
4206 }
4207 break;
4208
4209 case PT_PC:
4210 for (i = 1; i <= min; i++)
4211 {
4212 if (eptr >= md->end_subject)
4213 {
4214 SCHECK_PARTIAL();
4215 RRETURN(MATCH_NOMATCH);
4216 }
4217 GETCHARINCTEST(c, eptr);
4218 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4219 RRETURN(MATCH_NOMATCH);
4220 }
4221 break;
4222
4223 case PT_SC:
4224 for (i = 1; i <= min; i++)
4225 {
4226 if (eptr >= md->end_subject)
4227 {
4228 SCHECK_PARTIAL();
4229 RRETURN(MATCH_NOMATCH);
4230 }
4231 GETCHARINCTEST(c, eptr);
4232 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4233 RRETURN(MATCH_NOMATCH);
4234 }
4235 break;
4236
4237 case PT_ALNUM:
4238 for (i = 1; i <= min; i++)
4239 {
4240 int category;
4241 if (eptr >= md->end_subject)
4242 {
4243 SCHECK_PARTIAL();
4244 RRETURN(MATCH_NOMATCH);
4245 }
4246 GETCHARINCTEST(c, eptr);
4247 category = UCD_CATEGORY(c);
4248 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4249 RRETURN(MATCH_NOMATCH);
4250 }
4251 break;
4252
4253 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4254 which means that Perl space and POSIX space are now identical. PCRE
4255 was changed at release 8.34. */
4256
4257 case PT_SPACE: /* Perl space */
4258 case PT_PXSPACE: /* POSIX space */
4259 for (i = 1; i <= min; i++)
4260 {
4261 if (eptr >= md->end_subject)
4262 {
4263 SCHECK_PARTIAL();
4264 RRETURN(MATCH_NOMATCH);
4265 }
4266 GETCHARINCTEST(c, eptr);
4267 switch(c)
4268 {
4269 HSPACE_CASES:
4270 VSPACE_CASES:
4271 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4272 break;
4273
4274 default:
4275 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4276 RRETURN(MATCH_NOMATCH);
4277 break;
4278 }
4279 }
4280 break;
4281
4282 case PT_WORD:
4283 for (i = 1; i <= min; i++)
4284 {
4285 int category;
4286 if (eptr >= md->end_subject)
4287 {
4288 SCHECK_PARTIAL();
4289 RRETURN(MATCH_NOMATCH);
4290 }
4291 GETCHARINCTEST(c, eptr);
4292 category = UCD_CATEGORY(c);
4293 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4294 == prop_fail_result)
4295 RRETURN(MATCH_NOMATCH);
4296 }
4297 break;
4298
4299 case PT_CLIST:
4300 for (i = 1; i <= min; i++)
4301 {
4302 const pcre_uint32 *cp;
4303 if (eptr >= md->end_subject)
4304 {
4305 SCHECK_PARTIAL();
4306 RRETURN(MATCH_NOMATCH);
4307 }
4308 GETCHARINCTEST(c, eptr);
4309 cp = PRIV(ucd_caseless_sets) + prop_value;
4310 for (;;)
4311 {
4312 if (c < *cp)
4313 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4314 if (c == *cp++)
4315 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4316 }
4317 }
4318 break;
4319
4320 case PT_UCNC:
4321 for (i = 1; i <= min; i++)
4322 {
4323 if (eptr >= md->end_subject)
4324 {
4325 SCHECK_PARTIAL();
4326 RRETURN(MATCH_NOMATCH);
4327 }
4328 GETCHARINCTEST(c, eptr);
4329 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4330 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4331 c >= 0xe000) == prop_fail_result)
4332 RRETURN(MATCH_NOMATCH);
4333 }
4334 break;
4335
4336 /* This should not occur */
4337
4338 default:
4339 RRETURN(PCRE_ERROR_INTERNAL);
4340 }
4341 }
4342
4343 /* Match extended Unicode sequences. We will get here only if the
4344 support is in the binary; otherwise a compile-time error occurs. */
4345
4346 else if (ctype == OP_EXTUNI)
4347 {
4348 for (i = 1; i <= min; i++)
4349 {
4350 if (eptr >= md->end_subject)
4351 {
4352 SCHECK_PARTIAL();
4353 RRETURN(MATCH_NOMATCH);
4354 }
4355 else
4356 {
4357 int lgb, rgb;
4358 GETCHARINCTEST(c, eptr);
4359 lgb = UCD_GRAPHBREAK(c);
4360 while (eptr < md->end_subject)
4361 {
4362 int len = 1;
4363 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4364 rgb = UCD_GRAPHBREAK(c);
4365 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4366 lgb = rgb;
4367 eptr += len;
4368 }
4369 }
4370 CHECK_PARTIAL();
4371 }
4372 }
4373
4374 else
4375 #endif /* SUPPORT_UCP */
4376
4377 /* Handle all other cases when the coding is UTF-8 */
4378
4379 #ifdef SUPPORT_UTF
4380 if (utf) switch(ctype)
4381 {
4382 case OP_ANY:
4383 for (i = 1; i <= min; i++)
4384 {
4385 if (eptr >= md->end_subject)
4386 {
4387 SCHECK_PARTIAL();
4388 RRETURN(MATCH_NOMATCH);
4389 }
4390 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4391 if (md->partial != 0 &&
4392 eptr + 1 >= md->end_subject &&
4393 NLBLOCK->nltype == NLTYPE_FIXED &&
4394 NLBLOCK->nllen == 2 &&
4395 UCHAR21(eptr) == NLBLOCK->nl[0])
4396 {
4397 md->hitend = TRUE;
4398 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4399 }
4400 eptr++;
4401 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4402 }
4403 break;
4404
4405 case OP_ALLANY:
4406 for (i = 1; i <= min; i++)
4407 {
4408 if (eptr >= md->end_subject)
4409 {
4410 SCHECK_PARTIAL();
4411 RRETURN(MATCH_NOMATCH);
4412 }
4413 eptr++;
4414 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4415 }
4416 break;
4417
4418 case OP_ANYBYTE:
4419 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4420 eptr += min;
4421 break;
4422
4423 case OP_ANYNL:
4424 for (i = 1; i <= min; i++)
4425 {
4426 if (eptr >= md->end_subject)
4427 {
4428 SCHECK_PARTIAL();
4429 RRETURN(MATCH_NOMATCH);
4430 }
4431 GETCHARINC(c, eptr);
4432 switch(c)
4433 {
4434 default: RRETURN(MATCH_NOMATCH);
4435
4436 case CHAR_CR:
4437 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
4438 break;
4439
4440 case CHAR_LF:
4441 break;
4442
4443 case CHAR_VT:
4444 case CHAR_FF:
4445 case CHAR_NEL:
4446 #ifndef EBCDIC
4447 case 0x2028:
4448 case 0x2029:
4449 #endif /* Not EBCDIC */
4450 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4451 break;
4452 }
4453 }
4454 break;
4455
4456 case OP_NOT_HSPACE:
4457 for (i = 1; i <= min; i++)
4458 {
4459 if (eptr >= md->end_subject)
4460 {
4461 SCHECK_PARTIAL();
4462 RRETURN(MATCH_NOMATCH);
4463 }
4464 GETCHARINC(c, eptr);
4465 switch(c)
4466 {
4467 HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
4468 default: break;
4469 }
4470 }
4471 break;
4472
4473 case OP_HSPACE:
4474 for (i = 1; i <= min; i++)
4475 {
4476 if (eptr >= md->end_subject)
4477 {
4478 SCHECK_PARTIAL();
4479 RRETURN(MATCH_NOMATCH);
4480 }
4481 GETCHARINC(c, eptr);
4482 switch(c)
4483 {
4484 HSPACE_CASES: break; /* Byte and multibyte cases */
4485 default: RRETURN(MATCH_NOMATCH);
4486 }
4487 }
4488 break;
4489
4490 case OP_NOT_VSPACE:
4491 for (i = 1; i <= min; i++)
4492 {
4493 if (eptr >= md->end_subject)
4494 {
4495 SCHECK_PARTIAL();
4496 RRETURN(MATCH_NOMATCH);
4497 }
4498 GETCHARINC(c, eptr);
4499 switch(c)
4500 {
4501 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
4502 default: break;
4503 }
4504 }
4505 break;
4506
4507 case OP_VSPACE:
4508 for (i = 1; i <= min; i++)
4509 {
4510 if (eptr >= md->end_subject)
4511 {
4512 SCHECK_PARTIAL();
4513 RRETURN(MATCH_NOMATCH);
4514 }
4515 GETCHARINC(c, eptr);
4516 switch(c)
4517 {
4518 VSPACE_CASES: break;
4519 default: RRETURN(MATCH_NOMATCH);
4520 }
4521 }
4522 break;
4523
4524 case OP_NOT_DIGIT:
4525 for (i = 1; i <= min; i++)
4526 {
4527 if (eptr >= md->end_subject)
4528 {
4529 SCHECK_PARTIAL();
4530 RRETURN(MATCH_NOMATCH);
4531 }
4532 GETCHARINC(c, eptr);
4533 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4534 RRETURN(MATCH_NOMATCH);
4535 }
4536 break;
4537
4538 case OP_DIGIT:
4539 for (i = 1; i <= min; i++)
4540 {
4541 pcre_uint32 cc;
4542 if (eptr >= md->end_subject)
4543 {
4544 SCHECK_PARTIAL();
4545 RRETURN(MATCH_NOMATCH);
4546 }
4547 cc = UCHAR21(eptr);
4548 if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
4549 RRETURN(MATCH_NOMATCH);
4550 eptr++;
4551 /* No need to skip more bytes - we know it's a 1-byte character */
4552 }
4553 break;
4554
4555 case OP_NOT_WHITESPACE:
4556 for (i = 1; i <= min; i++)
4557 {
4558 pcre_uint32 cc;
4559 if (eptr >= md->end_subject)
4560 {
4561 SCHECK_PARTIAL();
4562 RRETURN(MATCH_NOMATCH);
4563 }
4564 cc = UCHAR21(eptr);
4565 if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
4566 RRETURN(MATCH_NOMATCH);
4567 eptr++;
4568 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4569 }
4570 break;
4571
4572 case OP_WHITESPACE:
4573 for (i = 1; i <= min; i++)
4574 {
4575 pcre_uint32 cc;
4576 if (eptr >= md->end_subject)
4577 {
4578 SCHECK_PARTIAL();
4579 RRETURN(MATCH_NOMATCH);
4580 }
4581 cc = UCHAR21(eptr);
4582 if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
4583 RRETURN(MATCH_NOMATCH);
4584 eptr++;
4585 /* No need to skip more bytes - we know it's a 1-byte character */
4586 }
4587 break;
4588
4589 case OP_NOT_WORDCHAR:
4590 for (i = 1; i <= min; i++)
4591 {
4592 pcre_uint32 cc;
4593 if (eptr >= md->end_subject)
4594 {
4595 SCHECK_PARTIAL();
4596 RRETURN(MATCH_NOMATCH);
4597 }
4598 cc = UCHAR21(eptr);
4599 if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
4600 RRETURN(MATCH_NOMATCH);
4601 eptr++;
4602 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4603 }
4604 break;
4605
4606 case OP_WORDCHAR:
4607 for (i = 1; i <= min; i++)
4608 {
4609 pcre_uint32 cc;
4610 if (eptr >= md->end_subject)
4611 {
4612 SCHECK_PARTIAL();
4613 RRETURN(MATCH_NOMATCH);
4614 }
4615 cc = UCHAR21(eptr);
4616 if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
4617 RRETURN(MATCH_NOMATCH);
4618 eptr++;
4619 /* No need to skip more bytes - we know it's a 1-byte character */
4620 }
4621 break;
4622
4623 default:
4624 RRETURN(PCRE_ERROR_INTERNAL);
4625 } /* End switch(ctype) */
4626
4627 else
4628 #endif /* SUPPORT_UTF */
4629
4630 /* Code for the non-UTF-8 case for minimum matching of operators other
4631 than OP_PROP and OP_NOTPROP. */
4632
4633 switch(ctype)
4634 {
4635 case OP_ANY:
4636 for (i = 1; i <= min; i++)
4637 {
4638 if (eptr >= md->end_subject)
4639 {
4640 SCHECK_PARTIAL();
4641 RRETURN(MATCH_NOMATCH);
4642 }
4643 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4644 if (md->partial != 0 &&
4645 eptr + 1 >= md->end_subject &&
4646 NLBLOCK->nltype == NLTYPE_FIXED &&
4647 NLBLOCK->nllen == 2 &&
4648 *eptr == NLBLOCK->nl[0])
4649 {
4650 md->hitend = TRUE;
4651 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4652 }
4653 eptr++;
4654 }
4655 break;
4656
4657 case OP_ALLANY:
4658 if (eptr > md->end_subject - min)
4659 {
4660 SCHECK_PARTIAL();
4661 RRETURN(MATCH_NOMATCH);
4662 }
4663 eptr += min;
4664 break;
4665
4666 case OP_ANYBYTE:
4667 if (eptr > md->end_subject - min)
4668 {
4669 SCHECK_PARTIAL();
4670 RRETURN(MATCH_NOMATCH);
4671 }
4672 eptr += min;
4673 break;
4674
4675 case OP_ANYNL:
4676 for (i = 1; i <= min; i++)
4677 {
4678 if (eptr >= md->end_subject)
4679 {
4680 SCHECK_PARTIAL();
4681 RRETURN(MATCH_NOMATCH);
4682 }
4683 switch(*eptr++)
4684 {
4685 default: RRETURN(MATCH_NOMATCH);
4686
4687 case CHAR_CR:
4688 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
4689 break;
4690
4691 case CHAR_LF:
4692 break;
4693
4694 case CHAR_VT:
4695 case CHAR_FF:
4696 case CHAR_NEL:
4697 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4698 case 0x2028:
4699 case 0x2029:
4700 #endif
4701 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4702 break;
4703 }
4704 }
4705 break;
4706
4707 case OP_NOT_HSPACE:
4708 for (i = 1; i <= min; i++)
4709 {
4710 if (eptr >= md->end_subject)
4711 {
4712 SCHECK_PARTIAL();
4713 RRETURN(MATCH_NOMATCH);
4714 }
4715 switch(*eptr++)
4716 {
4717 default: break;
4718 HSPACE_BYTE_CASES:
4719 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4720 HSPACE_MULTIBYTE_CASES:
4721 #endif
4722 RRETURN(MATCH_NOMATCH);
4723 }
4724 }
4725 break;
4726
4727 case OP_HSPACE:
4728 for (i = 1; i <= min; i++)
4729 {
4730 if (eptr >= md->end_subject)
4731 {
4732 SCHECK_PARTIAL();
4733 RRETURN(MATCH_NOMATCH);
4734 }
4735 switch(*eptr++)
4736 {
4737 default: RRETURN(MATCH_NOMATCH);
4738 HSPACE_BYTE_CASES:
4739 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4740 HSPACE_MULTIBYTE_CASES:
4741 #endif
4742 break;
4743 }
4744 }
4745 break;
4746
4747 case OP_NOT_VSPACE:
4748 for (i = 1; i <= min; i++)
4749 {
4750 if (eptr >= md->end_subject)
4751 {
4752 SCHECK_PARTIAL();
4753 RRETURN(MATCH_NOMATCH);
4754 }
4755 switch(*eptr++)
4756 {
4757 VSPACE_BYTE_CASES:
4758 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4759 VSPACE_MULTIBYTE_CASES:
4760 #endif
4761 RRETURN(MATCH_NOMATCH);
4762 default: break;
4763 }
4764 }
4765 break;
4766
4767 case OP_VSPACE:
4768 for (i = 1; i <= min; i++)
4769 {
4770 if (eptr >= md->end_subject)
4771 {
4772 SCHECK_PARTIAL();
4773 RRETURN(MATCH_NOMATCH);
4774 }
4775 switch(*eptr++)
4776 {
4777 default: RRETURN(MATCH_NOMATCH);
4778 VSPACE_BYTE_CASES:
4779 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4780 VSPACE_MULTIBYTE_CASES:
4781 #endif
4782 break;
4783 }
4784 }
4785 break;
4786
4787 case OP_NOT_DIGIT:
4788 for (i = 1; i <= min; i++)
4789 {
4790 if (eptr >= md->end_subject)
4791 {
4792 SCHECK_PARTIAL();
4793 RRETURN(MATCH_NOMATCH);
4794 }
4795 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4796 RRETURN(MATCH_NOMATCH);
4797 eptr++;
4798 }
4799 break;
4800
4801 case OP_DIGIT:
4802 for (i = 1; i <= min; i++)
4803 {
4804 if (eptr >= md->end_subject)
4805 {
4806 SCHECK_PARTIAL();
4807 RRETURN(MATCH_NOMATCH);
4808 }
4809 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4810 RRETURN(MATCH_NOMATCH);
4811 eptr++;
4812 }
4813 break;
4814
4815 case OP_NOT_WHITESPACE:
4816 for (i = 1; i <= min; i++)
4817 {
4818 if (eptr >= md->end_subject)
4819 {
4820 SCHECK_PARTIAL();
4821 RRETURN(MATCH_NOMATCH);
4822 }
4823 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4824 RRETURN(MATCH_NOMATCH);
4825 eptr++;
4826 }
4827 break;
4828
4829 case OP_WHITESPACE:
4830 for (i = 1; i <= min; i++)
4831 {
4832 if (eptr >= md->end_subject)
4833 {
4834 SCHECK_PARTIAL();
4835 RRETURN(MATCH_NOMATCH);
4836 }
4837 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4838 RRETURN(MATCH_NOMATCH);
4839 eptr++;
4840 }
4841 break;
4842
4843 case OP_NOT_WORDCHAR:
4844 for (i = 1; i <= min; i++)
4845 {
4846 if (eptr >= md->end_subject)
4847 {
4848 SCHECK_PARTIAL();
4849 RRETURN(MATCH_NOMATCH);
4850 }
4851 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4852 RRETURN(MATCH_NOMATCH);
4853 eptr++;
4854 }
4855 break;
4856
4857 case OP_WORDCHAR:
4858 for (i = 1; i <= min; i++)
4859 {
4860 if (eptr >= md->end_subject)
4861 {
4862 SCHECK_PARTIAL();
4863 RRETURN(MATCH_NOMATCH);
4864 }
4865 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4866 RRETURN(MATCH_NOMATCH);
4867 eptr++;
4868 }
4869 break;
4870
4871 default:
4872 RRETURN(PCRE_ERROR_INTERNAL);
4873 }
4874 }
4875
4876 /* If min = max, continue at the same level without recursing */
4877
4878 if (min == max) continue;
4879
4880 /* If minimizing, we have to test the rest of the pattern before each
4881 subsequent match. Again, separate the UTF-8 case for speed, and also
4882 separate the UCP cases. */
4883
4884 if (minimize)
4885 {
4886 #ifdef SUPPORT_UCP
4887 if (prop_type >= 0)
4888 {
4889 switch(prop_type)
4890 {
4891 case PT_ANY:
4892 for (fi = min;; fi++)
4893 {
4894 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4896 if (fi >= max) RRETURN(MATCH_NOMATCH);
4897 if (eptr >= md->end_subject)
4898 {
4899 SCHECK_PARTIAL();
4900 RRETURN(MATCH_NOMATCH);
4901 }
4902 GETCHARINCTEST(c, eptr);
4903 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4904 }
4905 /* Control never gets here */
4906
4907 case PT_LAMP:
4908 for (fi = min;; fi++)
4909 {
4910 int chartype;
4911 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4912 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4913 if (fi >= max) RRETURN(MATCH_NOMATCH);
4914 if (eptr >= md->end_subject)
4915 {
4916 SCHECK_PARTIAL();
4917 RRETURN(MATCH_NOMATCH);
4918 }
4919 GETCHARINCTEST(c, eptr);
4920 chartype = UCD_CHARTYPE(c);
4921 if ((chartype == ucp_Lu ||
4922 chartype == ucp_Ll ||
4923 chartype == ucp_Lt) == prop_fail_result)
4924 RRETURN(MATCH_NOMATCH);
4925 }
4926 /* Control never gets here */
4927
4928 case PT_GC:
4929 for (fi = min;; fi++)
4930 {
4931 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4932 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4933 if (fi >= max) RRETURN(MATCH_NOMATCH);
4934 if (eptr >= md->end_subject)
4935 {
4936 SCHECK_PARTIAL();
4937 RRETURN(MATCH_NOMATCH);
4938 }
4939 GETCHARINCTEST(c, eptr);
4940 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4941 RRETURN(MATCH_NOMATCH);
4942 }
4943 /* Control never gets here */
4944
4945 case PT_PC:
4946 for (fi = min;; fi++)
4947 {
4948 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4950 if (fi >= max) RRETURN(MATCH_NOMATCH);
4951 if (eptr >= md->end_subject)
4952 {
4953 SCHECK_PARTIAL();
4954 RRETURN(MATCH_NOMATCH);
4955 }
4956 GETCHARINCTEST(c, eptr);
4957 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4958 RRETURN(MATCH_NOMATCH);
4959 }
4960 /* Control never gets here */
4961
4962 case PT_SC:
4963 for (fi = min;; fi++)
4964 {
4965 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4966 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4967 if (fi >= max) RRETURN(MATCH_NOMATCH);
4968 if (eptr >= md->end_subject)
4969 {
4970 SCHECK_PARTIAL();
4971 RRETURN(MATCH_NOMATCH);
4972 }
4973 GETCHARINCTEST(c, eptr);
4974 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4975 RRETURN(MATCH_NOMATCH);
4976 }
4977 /* Control never gets here */
4978
4979 case PT_ALNUM:
4980 for (fi = min;; fi++)
4981 {
4982 int category;
4983 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4984 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4985 if (fi >= max) RRETURN(MATCH_NOMATCH);
4986 if (eptr >= md->end_subject)
4987 {
4988 SCHECK_PARTIAL();
4989 RRETURN(MATCH_NOMATCH);
4990 }
4991 GETCHARINCTEST(c, eptr);
4992 category = UCD_CATEGORY(c);
4993 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4994 RRETURN(MATCH_NOMATCH);
4995 }
4996 /* Control never gets here */
4997
4998 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4999 which means that Perl space and POSIX space are now identical. PCRE
5000 was changed at release 8.34. */
5001
5002 case PT_SPACE: /* Perl space */
5003 case PT_PXSPACE: /* POSIX space */
5004 for (fi = min;; fi++)
5005 {
5006 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
5007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5008 if (fi >= max) RRETURN(MATCH_NOMATCH);
5009 if (eptr >= md->end_subject)
5010 {
5011 SCHECK_PARTIAL();
5012 RRETURN(MATCH_NOMATCH);
5013 }
5014 GETCHARINCTEST(c, eptr);
5015 switch(c)
5016 {
5017 HSPACE_CASES:
5018 VSPACE_CASES:
5019 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5020 break;
5021
5022 default:
5023 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5024 RRETURN(MATCH_NOMATCH);
5025 break;
5026 }
5027 }
5028 /* Control never gets here */
5029
5030 case PT_WORD:
5031 for (fi = min;; fi++)
5032 {
5033 int category;
5034 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
5035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5036 if (fi >= max) RRETURN(MATCH_NOMATCH);
5037 if (eptr >= md->end_subject)
5038 {
5039 SCHECK_PARTIAL();
5040 RRETURN(MATCH_NOMATCH);
5041 }
5042 GETCHARINCTEST(c, eptr);
5043 category = UCD_CATEGORY(c);
5044 if ((category == ucp_L ||
5045 category == ucp_N ||
5046 c == CHAR_UNDERSCORE)
5047 == prop_fail_result)
5048 RRETURN(MATCH_NOMATCH);
5049 }
5050 /* Control never gets here */
5051
5052 case PT_CLIST:
5053 for (fi = min;; fi++)
5054 {
5055 const pcre_uint32 *cp;
5056 RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5057 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5058 if (fi >= max) RRETURN(MATCH_NOMATCH);
5059 if (eptr >= md->end_subject)
5060 {
5061 SCHECK_PARTIAL();
5062 RRETURN(MATCH_NOMATCH);
5063 }
5064 GETCHARINCTEST(c, eptr);
5065 cp = PRIV(ucd_caseless_sets) + prop_value;
5066 for (;;)
5067 {
5068 if (c < *cp)
5069 { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5070 if (c == *cp++)
5071 { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5072 }
5073 }
5074 /* Control never gets here */
5075
5076 case PT_UCNC:
5077 for (fi = min;; fi++)
5078 {
5079 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
5080 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5081 if (fi >= max) RRETURN(MATCH_NOMATCH);
5082 if (eptr >= md->end_subject)
5083 {
5084 SCHECK_PARTIAL();
5085 RRETURN(MATCH_NOMATCH);
5086 }
5087 GETCHARINCTEST(c, eptr);
5088 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5089 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5090 c >= 0xe000) == prop_fail_result)
5091 RRETURN(MATCH_NOMATCH);
5092 }
5093 /* Control never gets here */
5094
5095 /* This should never occur */
5096 default:
5097 RRETURN(PCRE_ERROR_INTERNAL);
5098 }
5099 }
5100
5101 /* Match extended Unicode sequences. We will get here only if the
5102 support is in the binary; otherwise a compile-time error occurs. */
5103
5104 else if (ctype == OP_EXTUNI)
5105 {
5106 for (fi = min;; fi++)
5107 {
5108 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
5109 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5110 if (fi >= max) RRETURN(MATCH_NOMATCH);
5111 if (eptr >= md->end_subject)
5112 {
5113 SCHECK_PARTIAL();
5114 RRETURN(MATCH_NOMATCH);
5115 }
5116 else
5117 {
5118 int lgb, rgb;
5119 GETCHARINCTEST(c, eptr);
5120 lgb = UCD_GRAPHBREAK(c);
5121 while (eptr < md->end_subject)
5122 {
5123 int len = 1;
5124 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5125 rgb = UCD_GRAPHBREAK(c);
5126 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5127 lgb = rgb;
5128 eptr += len;
5129 }
5130 }
5131 CHECK_PARTIAL();
5132 }
5133 }
5134 else
5135 #endif /* SUPPORT_UCP */
5136
5137 #ifdef SUPPORT_UTF
5138 if (utf)
5139 {
5140 for (fi = min;; fi++)
5141 {
5142 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
5143 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5144 if (fi >= max) RRETURN(MATCH_NOMATCH);
5145 if (eptr >= md->end_subject)
5146 {
5147 SCHECK_PARTIAL();
5148 RRETURN(MATCH_NOMATCH);
5149 }
5150 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5151 RRETURN(MATCH_NOMATCH);
5152 GETCHARINC(c, eptr);
5153 switch(ctype)
5154 {
5155 case OP_ANY: /* This is the non-NL case */
5156 if (md->partial != 0 && /* Take care with CRLF partial */
5157 eptr >= md->end_subject &&
5158 NLBLOCK->nltype == NLTYPE_FIXED &&
5159 NLBLOCK->nllen == 2 &&
5160 c == NLBLOCK->nl[0])
5161 {
5162 md->hitend = TRUE;
5163 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5164 }
5165 break;
5166
5167 case OP_ALLANY:
5168 case OP_ANYBYTE:
5169 break;
5170
5171 case OP_ANYNL:
5172 switch(c)
5173 {
5174 default: RRETURN(MATCH_NOMATCH);
5175 case CHAR_CR:
5176 if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
5177 break;
5178
5179 case CHAR_LF:
5180 break;
5181
5182 case CHAR_VT:
5183 case CHAR_FF:
5184 case CHAR_NEL:
5185 #ifndef EBCDIC
5186 case 0x2028:
5187 case 0x2029:
5188 #endif /* Not EBCDIC */
5189 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5190 break;
5191 }
5192 break;
5193
5194 case OP_NOT_HSPACE:
5195 switch(c)
5196 {
5197 HSPACE_CASES: RRETURN(MATCH_NOMATCH);
5198 default: break;
5199 }
5200 break;
5201
5202 case OP_HSPACE:
5203 switch(c)
5204 {
5205 HSPACE_CASES: break;
5206 default: RRETURN(MATCH_NOMATCH);
5207 }
5208 break;
5209
5210 case OP_NOT_VSPACE:
5211 switch(c)
5212 {
5213 VSPACE_CASES: RRETURN(MATCH_NOMATCH);
5214 default: break;
5215 }
5216 break;
5217
5218 case OP_VSPACE:
5219 switch(c)
5220 {
5221 VSPACE_CASES: break;
5222 default: RRETURN(MATCH_NOMATCH);
5223 }
5224 break;
5225
5226 case OP_NOT_DIGIT:
5227 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5228 RRETURN(MATCH_NOMATCH);
5229 break;
5230
5231 case OP_DIGIT:
5232 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5233 RRETURN(MATCH_NOMATCH);
5234 break;
5235
5236 case OP_NOT_WHITESPACE:
5237 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5238 RRETURN(MATCH_NOMATCH);
5239 break;
5240
5241 case OP_WHITESPACE:
5242 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5243 RRETURN(MATCH_NOMATCH);
5244 break;
5245
5246 case OP_NOT_WORDCHAR:
5247 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5248 RRETURN(MATCH_NOMATCH);
5249 break;
5250
5251 case OP_WORDCHAR:
5252 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5253 RRETURN(MATCH_NOMATCH);
5254 break;
5255
5256 default:
5257 RRETURN(PCRE_ERROR_INTERNAL);
5258 }
5259 }
5260 }
5261 else
5262 #endif
5263 /* Not UTF mode */
5264 {
5265 for (fi = min;; fi++)
5266 {
5267 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5268 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5269 if (fi >= max) RRETURN(MATCH_NOMATCH);
5270 if (eptr >= md->end_subject)
5271 {
5272 SCHECK_PARTIAL();
5273 RRETURN(MATCH_NOMATCH);
5274 }
5275 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5276 RRETURN(MATCH_NOMATCH);
5277 c = *eptr++;
5278 switch(ctype)
5279 {
5280 case OP_ANY: /* This is the non-NL case */
5281 if (md->partial != 0 && /* Take care with CRLF partial */
5282 eptr >= md->end_subject &&
5283 NLBLOCK->nltype == NLTYPE_FIXED &&
5284 NLBLOCK->nllen == 2 &&
5285 c == NLBLOCK->nl[0])
5286 {
5287 md->hitend = TRUE;
5288 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5289 }
5290 break;
5291
5292 case OP_ALLANY:
5293 case OP_ANYBYTE:
5294 break;
5295
5296 case OP_ANYNL:
5297 switch(c)
5298 {
5299 default: RRETURN(MATCH_NOMATCH);
5300 case CHAR_CR:
5301 if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
5302 break;
5303
5304 case CHAR_LF:
5305 break;
5306
5307 case CHAR_VT:
5308 case CHAR_FF:
5309 case CHAR_NEL:
5310 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5311 case 0x2028:
5312 case 0x2029:
5313 #endif
5314 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5315 break;
5316 }
5317 break;
5318
5319 case OP_NOT_HSPACE:
5320 switch(c)
5321 {
5322 default: break;
5323 HSPACE_BYTE_CASES:
5324 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5325 HSPACE_MULTIBYTE_CASES:
5326 #endif
5327 RRETURN(MATCH_NOMATCH);
5328 }
5329 break;
5330
5331 case OP_HSPACE:
5332 switch(c)
5333 {
5334 default: RRETURN(MATCH_NOMATCH);
5335 HSPACE_BYTE_CASES:
5336 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5337 HSPACE_MULTIBYTE_CASES:
5338 #endif
5339 break;
5340 }
5341 break;
5342
5343 case OP_NOT_VSPACE:
5344 switch(c)
5345 {
5346 default: break;
5347 VSPACE_BYTE_CASES:
5348 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5349 VSPACE_MULTIBYTE_CASES:
5350 #endif
5351 RRETURN(MATCH_NOMATCH);
5352 }
5353 break;
5354
5355 case OP_VSPACE:
5356 switch(c)
5357 {
5358 default: RRETURN(MATCH_NOMATCH);
5359 VSPACE_BYTE_CASES:
5360 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5361 VSPACE_MULTIBYTE_CASES:
5362 #endif
5363 break;
5364 }
5365 break;
5366
5367 case OP_NOT_DIGIT:
5368 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5369 break;
5370
5371 case OP_DIGIT:
5372 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5373 break;
5374
5375 case OP_NOT_WHITESPACE:
5376 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5377 break;
5378
5379 case OP_WHITESPACE:
5380 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5381 break;
5382
5383 case OP_NOT_WORDCHAR:
5384 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5385 break;
5386
5387 case OP_WORDCHAR:
5388 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5389 break;
5390
5391 default:
5392 RRETURN(PCRE_ERROR_INTERNAL);
5393 }
5394 }
5395 }
5396 /* Control never gets here */
5397 }
5398
5399 /* If maximizing, it is worth using inline code for speed, doing the type
5400 test once at the start (i.e. keep it out of the loop). Again, keep the
5401 UTF-8 and UCP stuff separate. */
5402
5403 else
5404 {
5405 pp = eptr; /* Remember where we started */
5406
5407 #ifdef SUPPORT_UCP
5408 if (prop_type >= 0)
5409 {
5410 switch(prop_type)
5411 {
5412 case PT_ANY:
5413 for (i = min; i < max; i++)
5414 {
5415 int len = 1;
5416 if (eptr >= md->end_subject)
5417 {
5418 SCHECK_PARTIAL();
5419 break;
5420 }
5421 GETCHARLENTEST(c, eptr, len);
5422 if (prop_fail_result) break;
5423 eptr+= len;
5424 }
5425 break;
5426
5427 case PT_LAMP:
5428 for (i = min; i < max; i++)
5429 {
5430 int chartype;
5431 int len = 1;
5432 if (eptr >= md->end_subject)
5433 {
5434 SCHECK_PARTIAL();
5435 break;
5436 }
5437 GETCHARLENTEST(c, eptr, len);
5438 chartype = UCD_CHARTYPE(c);
5439 if ((chartype == ucp_Lu ||
5440 chartype == ucp_Ll ||
5441 chartype == ucp_Lt) == prop_fail_result)
5442 break;
5443 eptr+= len;
5444 }
5445 break;
5446
5447 case PT_GC:
5448 for (i = min; i < max; i++)
5449 {
5450 int len = 1;
5451 if (eptr >= md->end_subject)
5452 {
5453 SCHECK_PARTIAL();
5454 break;
5455 }
5456 GETCHARLENTEST(c, eptr, len);
5457 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5458 eptr+= len;
5459 }
5460 break;
5461
5462 case PT_PC:
5463 for (i = min; i < max; i++)
5464 {
5465 int len = 1;
5466 if (eptr >= md->end_subject)
5467 {
5468 SCHECK_PARTIAL();
5469 break;
5470 }
5471 GETCHARLENTEST(c, eptr, len);
5472 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5473 eptr+= len;
5474 }
5475 break;
5476
5477 case PT_SC:
5478 for (i = min; i < max; i++)
5479 {
5480 int len = 1;
5481 if (eptr >= md->end_subject)
5482 {
5483 SCHECK_PARTIAL();
5484 break;
5485 }
5486 GETCHARLENTEST(c, eptr, len);
5487 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5488 eptr+= len;
5489 }
5490 break;
5491
5492 case PT_ALNUM:
5493 for (i = min; i < max; i++)
5494 {
5495 int category;
5496 int len = 1;
5497 if (eptr >= md->end_subject)
5498 {
5499 SCHECK_PARTIAL();
5500 break;
5501 }
5502 GETCHARLENTEST(c, eptr, len);
5503 category = UCD_CATEGORY(c);
5504 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5505 break;
5506 eptr+= len;
5507 }
5508 break;
5509
5510 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5511 which means that Perl space and POSIX space are now identical. PCRE
5512 was changed at release 8.34. */
5513
5514 case PT_SPACE: /* Perl space */
5515 case PT_PXSPACE: /* POSIX space */
5516 for (i = min; i < max; i++)
5517 {
5518 int len = 1;
5519 if (eptr >= md->end_subject)
5520 {
5521 SCHECK_PARTIAL();
5522 break;
5523 }
5524 GETCHARLENTEST(c, eptr, len);
5525 switch(c)
5526 {
5527 HSPACE_CASES:
5528 VSPACE_CASES:
5529 if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
5530 break;
5531
5532 default:
5533 if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5534 goto ENDLOOP99; /* Break the loop */
5535 break;
5536 }
5537 eptr+= len;
5538 }
5539 ENDLOOP99:
5540 break;
5541
5542 case PT_WORD:
5543 for (i = min; i < max; i++)
5544 {
5545 int category;
5546 int len = 1;
5547 if (eptr >= md->end_subject)
5548 {
5549 SCHECK_PARTIAL();
5550 break;
5551 }
5552 GETCHARLENTEST(c, eptr, len);
5553 category = UCD_CATEGORY(c);
5554 if ((category == ucp_L || category == ucp_N ||
5555 c == CHAR_UNDERSCORE) == prop_fail_result)
5556 break;
5557 eptr+= len;
5558 }
5559 break;
5560
5561 case PT_CLIST:
5562 for (i = min; i < max; i++)
5563 {
5564 const pcre_uint32 *cp;
5565 int len = 1;
5566 if (eptr >= md->end_subject)
5567 {
5568 SCHECK_PARTIAL();
5569 break;
5570 }
5571 GETCHARLENTEST(c, eptr, len);
5572 cp = PRIV(ucd_caseless_sets) + prop_value;
5573 for (;;)
5574 {
5575 if (c < *cp)
5576 { if (prop_fail_result) break; else goto GOT_MAX; }
5577 if (c == *cp++)
5578 { if (prop_fail_result) goto GOT_MAX; else break; }
5579 }
5580 eptr += len;
5581 }
5582 GOT_MAX:
5583 break;
5584
5585 case PT_UCNC:
5586 for (i = min; i < max; i++)
5587 {
5588 int len = 1;
5589 if (eptr >= md->end_subject)
5590 {
5591 SCHECK_PARTIAL();
5592 break;
5593 }
5594 GETCHARLENTEST(c, eptr, len);
5595 if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5596 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5597 c >= 0xe000) == prop_fail_result)
5598 break;
5599 eptr += len;
5600 }
5601 break;
5602
5603 default:
5604 RRETURN(PCRE_ERROR_INTERNAL);
5605 }
5606
5607 /* eptr is now past the end of the maximum run */
5608
5609 if (possessive) continue; /* No backtracking */
5610 for(;;)
5611 {
5612 if (eptr == pp) goto TAIL_RECURSE;
5613 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5614 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5615 eptr--;
5616 if (utf) BACKCHAR(eptr);
5617 }
5618 }
5619
5620 /* Match extended Unicode grapheme clusters. We will get here only if the
5621 support is in the binary; otherwise a compile-time error occurs. */
5622
5623 else if (ctype == OP_EXTUNI)
5624 {
5625 for (i = min; i < max; i++)
5626 {
5627 if (eptr >= md->end_subject)
5628 {
5629 SCHECK_PARTIAL();
5630 break;
5631 }
5632 else
5633 {
5634 int lgb, rgb;
5635 GETCHARINCTEST(c, eptr);
5636 lgb = UCD_GRAPHBREAK(c);
5637 while (eptr < md->end_subject)
5638 {
5639 int len = 1;
5640 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5641 rgb = UCD_GRAPHBREAK(c);
5642 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5643 lgb = rgb;
5644 eptr += len;
5645 }
5646 }
5647 CHECK_PARTIAL();
5648 }
5649
5650 /* eptr is now past the end of the maximum run */
5651
5652 if (possessive) continue; /* No backtracking */
5653
5654 for(;;)
5655 {
5656 int lgb, rgb;
5657 PCRE_PUCHAR fptr;
5658
5659 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5660 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5662
5663 /* Backtracking over an extended grapheme cluster involves inspecting
5664 the previous two characters (if present) to see if a break is
5665 permitted between them. */
5666
5667 eptr--;
5668 if (!utf) c = *eptr; else
5669 {
5670 BACKCHAR(eptr);
5671 GETCHAR(c, eptr);
5672 }
5673 rgb = UCD_GRAPHBREAK(c);
5674
5675 for (;;)
5676 {
5677 if (eptr == pp) goto TAIL_RECURSE; /* At start of char run */
5678 fptr = eptr - 1;
5679 if (!utf) c = *fptr; else
5680 {
5681 BACKCHAR(fptr);
5682 GETCHAR(c, fptr);
5683 }
5684 lgb = UCD_GRAPHBREAK(c);
5685 if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5686 eptr = fptr;
5687 rgb = lgb;
5688 }
5689 }
5690 }
5691
5692 else
5693 #endif /* SUPPORT_UCP */
5694
5695 #ifdef SUPPORT_UTF
5696 if (utf)
5697 {
5698 switch(ctype)
5699 {
5700 case OP_ANY:
5701 for (i = min; i < max; i++)
5702 {
5703 if (eptr >= md->end_subject)
5704 {
5705 SCHECK_PARTIAL();
5706 break;
5707 }
5708 if (IS_NEWLINE(eptr)) break;
5709 if (md->partial != 0 && /* Take care with CRLF partial */
5710 eptr + 1 >= md->end_subject &&
5711 NLBLOCK->nltype == NLTYPE_FIXED &&
5712 NLBLOCK->nllen == 2 &&
5713 UCHAR21(eptr) == NLBLOCK->nl[0])
5714 {
5715 md->hitend = TRUE;
5716 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5717 }
5718 eptr++;
5719 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5720 }
5721 break;
5722
5723 case OP_ALLANY:
5724 if (max < INT_MAX)
5725 {
5726 for (i = min; i < max; i++)
5727 {
5728 if (eptr >= md->end_subject)
5729 {
5730 SCHECK_PARTIAL();
5731 break;
5732 }
5733 eptr++;
5734 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5735 }
5736 }
5737 else
5738 {
5739 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5740 SCHECK_PARTIAL();
5741 }
5742 break;
5743
5744 /* The byte case is the same as non-UTF8 */
5745
5746 case OP_ANYBYTE:
5747 c = max - min;
5748 if (c > (unsigned int)(md->end_subject - eptr))
5749 {
5750 eptr = md->end_subject;
5751 SCHECK_PARTIAL();
5752 }
5753 else eptr += c;
5754 break;
5755
5756 case OP_ANYNL:
5757 for (i = min; i < max; i++)
5758 {
5759 int len = 1;
5760 if (eptr >= md->end_subject)
5761 {
5762 SCHECK_PARTIAL();
5763 break;
5764 }
5765 GETCHARLEN(c, eptr, len);
5766 if (c == CHAR_CR)
5767 {
5768 if (++eptr >= md->end_subject) break;
5769 if (UCHAR21(eptr) == CHAR_LF) eptr++;
5770 }
5771 else
5772 {
5773 if (c != CHAR_LF &&
5774 (md->bsr_anycrlf ||
5775 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5776 #ifndef EBCDIC
5777 && c != 0x2028 && c != 0x2029
5778 #endif /* Not EBCDIC */
5779 )))
5780 break;
5781 eptr += len;
5782 }
5783 }
5784 break;
5785
5786 case OP_NOT_HSPACE:
5787 case OP_HSPACE:
5788 for (i = min; i < max; i++)
5789 {
5790 BOOL gotspace;
5791 int len = 1;
5792 if (eptr >= md->end_subject)
5793 {
5794 SCHECK_PARTIAL();
5795 break;
5796 }
5797 GETCHARLEN(c, eptr, len);
5798 switch(c)
5799 {
5800 HSPACE_CASES: gotspace = TRUE; break;
5801 default: gotspace = FALSE; break;
5802 }
5803 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5804 eptr += len;
5805 }
5806 break;
5807
5808 case OP_NOT_VSPACE:
5809 case OP_VSPACE:
5810 for (i = min; i < max; i++)
5811 {
5812 BOOL gotspace;
5813 int len = 1;
5814 if (eptr >= md->end_subject)
5815 {
5816 SCHECK_PARTIAL();
5817 break;
5818 }
5819 GETCHARLEN(c, eptr, len);
5820 switch(c)
5821 {
5822 VSPACE_CASES: gotspace = TRUE; break;
5823 default: gotspace = FALSE; break;
5824 }
5825 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5826 eptr += len;
5827 }
5828 break;
5829
5830 case OP_NOT_DIGIT:
5831 for (i = min; i < max; i++)
5832 {
5833 int len = 1;
5834 if (eptr >= md->end_subject)
5835 {
5836 SCHECK_PARTIAL();
5837 break;
5838 }
5839 GETCHARLEN(c, eptr, len);
5840 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5841 eptr+= len;
5842 }
5843 break;
5844
5845 case OP_DIGIT:
5846 for (i = min; i < max; i++)
5847 {
5848 int len = 1;
5849 if (eptr >= md->end_subject)
5850 {
5851 SCHECK_PARTIAL();
5852 break;
5853 }
5854 GETCHARLEN(c, eptr, len);
5855 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5856 eptr+= len;
5857 }
5858 break;
5859
5860 case OP_NOT_WHITESPACE:
5861 for (i = min; i < max; i++)
5862 {
5863 int len = 1;
5864 if (eptr >= md->end_subject)
5865 {
5866 SCHECK_PARTIAL();
5867 break;
5868 }
5869 GETCHARLEN(c, eptr, len);
5870 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5871 eptr+= len;
5872 }
5873 break;
5874
5875 case OP_WHITESPACE:
5876 for (i = min; i < max; i++)
5877 {
5878 int len = 1;
5879 if (eptr >= md->end_subject)
5880 {
5881 SCHECK_PARTIAL();
5882 break;
5883 }
5884 GETCHARLEN(c, eptr, len);
5885 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5886 eptr+= len;
5887 }
5888 break;
5889
5890 case OP_NOT_WORDCHAR:
5891 for (i = min; i < max; i++)
5892 {
5893 int len = 1;
5894 if (eptr >= md->end_subject)
5895 {
5896 SCHECK_PARTIAL();
5897 break;
5898 }
5899 GETCHARLEN(c, eptr, len);
5900 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5901 eptr+= len;
5902 }
5903 break;
5904
5905 case OP_WORDCHAR:
5906 for (i = min; i < max; i++)
5907 {
5908 int len = 1;
5909 if (eptr >= md->end_subject)
5910 {
5911 SCHECK_PARTIAL();
5912 break;
5913 }
5914 GETCHARLEN(c, eptr, len);
5915 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5916 eptr+= len;
5917 }
5918 break;
5919
5920 default:
5921 RRETURN(PCRE_ERROR_INTERNAL);
5922 }
5923
5924 if (possessive) continue; /* No backtracking */
5925 for(;;)
5926 {
5927 if (eptr == pp) goto TAIL_RECURSE;
5928 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5929 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5930 eptr--;
5931 BACKCHAR(eptr);
5932 if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5933 UCHAR21(eptr - 1) == CHAR_CR) eptr--;
5934 }
5935 }
5936 else
5937 #endif /* SUPPORT_UTF */
5938 /* Not UTF mode */
5939 {
5940 switch(ctype)
5941 {
5942 case OP_ANY:
5943 for (i = min; i < max; i++)
5944 {
5945 if (eptr >= md->end_subject)
5946 {
5947 SCHECK_PARTIAL();
5948 break;
5949 }
5950 if (IS_NEWLINE(eptr)) break;
5951 if (md->partial != 0 && /* Take care with CRLF partial */
5952 eptr + 1 >= md->end_subject &&
5953 NLBLOCK->nltype == NLTYPE_FIXED &&
5954 NLBLOCK->nllen == 2 &&
5955 *eptr == NLBLOCK->nl[0])
5956 {
5957 md->hitend = TRUE;
5958 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5959 }
5960 eptr++;
5961 }
5962 break;
5963
5964 case OP_ALLANY:
5965 case OP_ANYBYTE:
5966 c = max - min;
5967 if (c > (unsigned int)(md->end_subject - eptr))
5968 {
5969 eptr = md->end_subject;
5970 SCHECK_PARTIAL();
5971 }
5972 else eptr += c;
5973 break;
5974
5975 case OP_ANYNL:
5976 for (i = min; i < max; i++)
5977 {
5978 if (eptr >= md->end_subject)
5979 {
5980 SCHECK_PARTIAL();
5981 break;
5982 }
5983 c = *eptr;
5984 if (c == CHAR_CR)
5985 {
5986 if (++eptr >= md->end_subject) break;
5987 if (*eptr == CHAR_LF) eptr++;
5988 }
5989 else
5990 {
5991 if (c != CHAR_LF && (md->bsr_anycrlf ||
5992 (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5993 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5994 && c != 0x2028 && c != 0x2029
5995 #endif
5996 ))) break;
5997 eptr++;
5998 }
5999 }
6000 break;
6001
6002 case OP_NOT_HSPACE:
6003 for (i = min; i < max; i++)
6004 {
6005 if (eptr >= md->end_subject)
6006 {
6007 SCHECK_PARTIAL();
6008 break;
6009 }
6010 switch(*eptr)
6011 {
6012 default: eptr++; break;
6013 HSPACE_BYTE_CASES:
6014 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6015 HSPACE_MULTIBYTE_CASES:
6016 #endif
6017 goto ENDLOOP00;
6018 }
6019 }
6020 ENDLOOP00:
6021 break;
6022
6023 case OP_HSPACE:
6024 for (i = min; i < max; i++)
6025 {
6026 if (eptr >= md->end_subject)
6027 {
6028 SCHECK_PARTIAL();
6029 break;
6030 }
6031 switch(*eptr)
6032 {
6033 default: goto ENDLOOP01;
6034 HSPACE_BYTE_CASES:
6035 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6036 HSPACE_MULTIBYTE_CASES:
6037 #endif
6038 eptr++; break;
6039 }
6040 }
6041 ENDLOOP01:
6042 break;
6043
6044 case OP_NOT_VSPACE:
6045 for (i = min; i < max; i++)
6046 {
6047 if (eptr >= md->end_subject)
6048 {
6049 SCHECK_PARTIAL();
6050 break;
6051 }
6052 switch(*eptr)
6053 {
6054 default: eptr++; break;
6055 VSPACE_BYTE_CASES:
6056 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6057 VSPACE_MULTIBYTE_CASES:
6058 #endif
6059 goto ENDLOOP02;
6060 }
6061 }
6062 ENDLOOP02:
6063 break;
6064
6065 case OP_VSPACE:
6066 for (i = min; i < max; i++)
6067 {
6068 if (eptr >= md->end_subject)
6069 {
6070 SCHECK_PARTIAL();
6071 break;
6072 }
6073 switch(*eptr)
6074 {
6075 default: goto ENDLOOP03;
6076 VSPACE_BYTE_CASES:
6077 #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6078 VSPACE_MULTIBYTE_CASES:
6079 #endif
6080 eptr++; break;
6081 }
6082 }
6083 ENDLOOP03:
6084 break;
6085
6086 case OP_NOT_DIGIT:
6087 for (i = min; i < max; i++)
6088 {
6089 if (eptr >= md->end_subject)
6090 {
6091 SCHECK_PARTIAL();
6092 break;
6093 }
6094 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
6095 eptr++;
6096 }
6097 break;
6098
6099 case OP_DIGIT:
6100 for (i = min; i < max; i++)
6101 {
6102 if (eptr >= md->end_subject)
6103 {
6104 SCHECK_PARTIAL();
6105 break;
6106 }
6107 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
6108 eptr++;
6109 }
6110 break;
6111
6112 case OP_NOT_WHITESPACE:
6113 for (i = min; i < max; i++)
6114 {
6115 if (eptr >= md->end_subject)
6116 {
6117 SCHECK_PARTIAL();
6118 break;
6119 }
6120 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
6121 eptr++;
6122 }
6123 break;
6124
6125 case OP_WHITESPACE:
6126 for (i = min; i < max; i++)
6127 {
6128 if (eptr >= md->end_subject)
6129 {
6130 SCHECK_PARTIAL();
6131 break;
6132 }
6133 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
6134 eptr++;
6135 }
6136 break;
6137
6138 case OP_NOT_WORDCHAR:
6139 for (i = min; i < max; i++)
6140 {
6141 if (eptr >= md->end_subject)
6142 {
6143 SCHECK_PARTIAL();
6144 break;
6145 }
6146 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
6147 eptr++;
6148 }
6149 break;
6150
6151 case OP_WORDCHAR:
6152 for (i = min; i < max; i++)
6153 {
6154 if (eptr >= md->end_subject)
6155 {
6156 SCHECK_PARTIAL();
6157 break;
6158 }
6159 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
6160 eptr++;
6161 }
6162 break;
6163
6164 default:
6165 RRETURN(PCRE_ERROR_INTERNAL);
6166 }
6167
6168 if (possessive) continue; /* No backtracking */
6169 for (;;)
6170 {
6171 if (eptr == pp) goto TAIL_RECURSE;
6172 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6173 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
6174 eptr--;
6175 if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6176 eptr[-1] == CHAR_CR) eptr--;
6177 }
6178 }
6179
6180 /* Control never gets here */
6181 }
6182
6183 /* There's been some horrible disaster. Arrival here can only mean there is
6184 something seriously wrong in the code above or the OP_xxx definitions. */
6185
6186 default:
6187 DPRINTF(("Unknown opcode %d\n", *ecode));
6188 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6189 }
6190
6191 /* Do not stick any code in here without much thought; it is assumed
6192 that "continue" in the code above comes out to here to repeat the main
6193 loop. */
6194
6195 } /* End of main loop */
6196 /* Control never reaches here */
6197
6198
6199 /* When compiling to use the heap rather than the stack for recursive calls to
6200 match(), the RRETURN() macro jumps here. The number that is saved in
6201 frame->Xwhere indicates which label we actually want to return to. */
6202
6203 #ifdef NO_RECURSE
6204 #define LBL(val) case val: goto L_RM##val;
6205 HEAP_RETURN:
6206 switch (frame->Xwhere)
6207 {
6208 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6209 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6210 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6211 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6212 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6213 LBL(65) LBL(66)
6214 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6215 LBL(20) LBL(21)
6216 #endif
6217 #ifdef SUPPORT_UTF
6218 LBL(16) LBL(18)
6219 LBL(22) LBL(23) LBL(28) LBL(30)
6220 LBL(32) LBL(34) LBL(42) LBL(46)
6221 #ifdef SUPPORT_UCP
6222 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6223 LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
6224 #endif /* SUPPORT_UCP */
6225 #endif /* SUPPORT_UTF */
6226 default:
6227 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6228 return PCRE_ERROR_INTERNAL;
6229 }
6230 #undef LBL
6231 #endif /* NO_RECURSE */
6232 }
6233
6234
6235 /***************************************************************************
6236 ****************************************************************************
6237 RECURSION IN THE match() FUNCTION
6238
6239 Undefine all the macros that were defined above to handle this. */
6240
6241 #ifdef NO_RECURSE
6242 #undef eptr
6243 #undef ecode
6244 #undef mstart
6245 #undef offset_top
6246 #undef eptrb
6247 #undef flags
6248
6249 #undef callpat
6250 #undef charptr
6251 #undef data
6252 #undef next
6253 #undef pp
6254 #undef prev
6255 #undef saved_eptr
6256
6257 #undef new_recursive
6258
6259 #undef cur_is_word
6260 #undef condition
6261 #undef prev_is_word
6262
6263 #undef ctype
6264 #undef length
6265 #undef max
6266 #undef min
6267 #undef number
6268 #undef offset
6269 #undef op
6270 #undef save_capture_last
6271 #undef save_offset1
6272 #undef save_offset2
6273 #undef save_offset3
6274 #undef stacksave
6275
6276 #undef newptrb
6277
6278 #endif
6279
6280 /* These two are defined as macros in both cases */
6281
6282 #undef fc
6283 #undef fi
6284
6285 /***************************************************************************
6286 ***************************************************************************/
6287
6288
6289 #ifdef NO_RECURSE
6290 /*************************************************
6291 * Release allocated heap frames *
6292 *************************************************/
6293
6294 /* This function releases all the allocated frames. The base frame is on the
6295 machine stack, and so must not be freed.
6296
6297 Argument: the address of the base frame
6298 Returns: nothing
6299 */
6300
6301 static void
6302 release_match_heapframes (heapframe *frame_base)
6303 {
6304 heapframe *nextframe = frame_base->Xnextframe;
6305 while (nextframe != NULL)
6306 {
6307 heapframe *oldframe = nextframe;
6308 nextframe = nextframe->Xnextframe;
6309 (PUBL(stack_free))(oldframe);
6310 }
6311 }
6312 #endif
6313
6314
6315 /*************************************************
6316 * Execute a Regular Expression *
6317 *************************************************/
6318
6319 /* This function applies a compiled re to a subject string and picks out
6320 portions of the string if it matches. Two elements in the vector are set for
6321 each substring: the offsets to the start and end of the substring.
6322
6323 Arguments:
6324 argument_re points to the compiled expression
6325 extra_data points to extra data or is NULL
6326 subject points to the subject string
6327 length length of subject string (may contain binary zeros)
6328 start_offset where to start in the subject string
6329 options option bits
6330 offsets points to a vector of ints to be filled in with offsets
6331 offsetcount the number of elements in the vector
6332
6333 Returns: > 0 => success; value is the number of elements filled in
6334 = 0 => success, but offsets is not big enough
6335 -1 => failed to match
6336 < -1 => some kind of unexpected problem
6337 */
6338
6339 #if defined COMPILE_PCRE8
6340 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6341 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6342 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6343 int offsetcount)
6344 #elif defined COMPILE_PCRE16
6345 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6346 pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6347 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6348 int offsetcount)
6349 #elif defined COMPILE_PCRE32
6350 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6351 pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6352 PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6353 int offsetcount)
6354 #endif
6355 {
6356 int rc, ocount, arg_offset_max;
6357 int newline;
6358 BOOL using_temporary_offsets = FALSE;
6359 BOOL anchored;
6360 BOOL startline;
6361 BOOL firstline;
6362 BOOL utf;
6363 BOOL has_first_char = FALSE;
6364 BOOL has_req_char = FALSE;
6365 pcre_uchar first_char = 0;
6366 pcre_uchar first_char2 = 0;
6367 pcre_uchar req_char = 0;
6368 pcre_uchar req_char2 = 0;
6369 match_data match_block;
6370 match_data *md = &match_block;
6371 const pcre_uint8 *tables;
6372 const pcre_uint8 *start_bits = NULL;
6373 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6374 PCRE_PUCHAR end_subject;
6375 PCRE_PUCHAR start_partial = NULL;
6376 PCRE_PUCHAR match_partial = NULL;
6377 PCRE_PUCHAR req_char_ptr = start_match - 1;
6378
6379 const pcre_study_data *study;
6380 const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6381
6382 #ifdef NO_RECURSE
6383 heapframe frame_zero;
6384 frame_zero.Xprevframe = NULL; /* Marks the top level */
6385 frame_zero.Xnextframe = NULL; /* None are allocated yet */
6386 md->match_frames_base = &frame_zero;
6387 #endif
6388
6389 /* Check for the special magic call that measures the size of the stack used
6390 per recursive call of match(). Without the funny casting for sizeof, a Windows
6391 compiler gave this error: "unary minus operator applied to unsigned type,
6392 result still unsigned". Hopefully the cast fixes that. */
6393
6394 if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6395 start_offset == -999)
6396 #ifdef NO_RECURSE
6397 return -((int)sizeof(heapframe));
6398 #else
6399 return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6400 #endif
6401
6402 /* Plausibility checks */
6403
6404 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6405 if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6406 return PCRE_ERROR_NULL;
6407 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6408 if (length < 0) return PCRE_ERROR_BADLENGTH;
6409 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6410
6411 /* Check that the first field in the block is the magic number. If it is not,
6412 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6413 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6414 means that the pattern is likely compiled with different endianness. */
6415
6416 if (re->magic_number != MAGIC_NUMBER)
6417 return re->magic_number == REVERSED_MAGIC_NUMBER?
6418 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6419 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6420
6421 /* These two settings are used in the code for checking a UTF-8 string that
6422 follows immediately afterwards. Other values in the md block are used only
6423 during "normal" pcre_exec() processing, not when the JIT support is in use,
6424 so they are set up later. */
6425
6426 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6427 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6428 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6429 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6430
6431 /* Check a UTF-8 string if required. Pass back the character offset and error
6432 code for an invalid string if a results vector is available. */
6433
6434 #ifdef SUPPORT_UTF
6435 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6436 {
6437 int erroroffset;
6438 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6439 if (errorcode != 0)
6440 {
6441 if (offsetcount >= 2)
6442 {
6443 offsets[0] = erroroffset;
6444 offsets[1] = errorcode;
6445 }
6446 #if defined COMPILE_PCRE8
6447 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6448 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6449 #elif defined COMPILE_PCRE16
6450 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6451 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6452 #elif defined COMPILE_PCRE32
6453 return PCRE_ERROR_BADUTF32;
6454 #endif
6455 }
6456 #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
6457 /* Check that a start_offset points to the start of a UTF character. */
6458 if (start_offset > 0 && start_offset < length &&
6459 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6460 return PCRE_ERROR_BADUTF8_OFFSET;
6461 #endif
6462 }
6463 #endif
6464
6465 /* If the pattern was successfully studied with JIT support, run the JIT
6466 executable instead of the rest of this function. Most options must be set at
6467 compile time for the JIT code to be usable. Fallback to the normal code path if
6468 an unsupported flag is set. */
6469
6470 #ifdef SUPPORT_JIT
6471 if (extra_data != NULL
6472 && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6473 PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
6474 && extra_data->executable_jit != NULL
6475 && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6476 {
6477 rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6478 start_offset, options, offsets, offsetcount);
6479
6480 /* PCRE_ERROR_JIT_BADOPTION means the selected normal or partial matching mode
6481 is not compiled. In this case we simply fall back to the interpreter. */
6482
6483 if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6484 }
6485 #endif
6486
6487 /* Carry on with non-JIT matching. This information is for finding all the
6488 numbers associated with a given name, for condition testing. */
6489
6490 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6491 md->name_count = re->name_count;
6492 md->name_entry_size = re->name_entry_size;
6493
6494 /* Fish out the optional data from the extra_data structure, first setting
6495 the default values. */
6496
6497 study = NULL;
6498 md->match_limit = MATCH_LIMIT;
6499 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6500 md->callout_data = NULL;
6501
6502 /* The table pointer is always in native byte order. */
6503
6504 tables = re->tables;
6505
6506 /* The two limit values override the defaults, whatever their value. */
6507
6508 if (extra_data != NULL)
6509 {
6510 unsigned long int flags = extra_data->flags;
6511 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6512 study = (const pcre_study_data *)extra_data->study_data;
6513 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6514 md->match_limit = extra_data->match_limit;
6515 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6516 md->match_limit_recursion = extra_data->match_limit_recursion;
6517 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6518 md->callout_data = extra_data->callout_data;
6519 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6520 }
6521
6522 /* Limits in the regex override only if they are smaller. */
6523
6524 if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6525 md->match_limit = re->limit_match;
6526
6527 if ((re->flags & PCRE_RLSET) != 0 &&
6528 re->limit_recursion < md->match_limit_recursion)
6529 md->match_limit_recursion = re->limit_recursion;
6530
6531 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6532 is a feature that makes it possible to save compiled regex and re-use them
6533 in other programs later. */
6534
6535 if (tables == NULL) tables = PRIV(default_tables);
6536
6537 /* Set up other data */
6538
6539 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6540 startline = (re->flags & PCRE_STARTLINE) != 0;
6541 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6542
6543 /* The code starts after the real_pcre block and the capture name table. */
6544
6545 md->start_code = (const pcre_uchar *)re + re->name_table_offset +
6546 re->name_count * re->name_entry_size;
6547
6548 md->start_subject = (PCRE_PUCHAR)subject;
6549 md->start_offset = start_offset;
6550 md->end_subject = md->start_subject + length;
6551 end_subject = md->end_subject;
6552
6553 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6554 md->use_ucp = (re->options & PCRE_UCP) != 0;
6555 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6556 md->ignore_skip_arg = 0;
6557
6558 /* Some options are unpacked into BOOL variables in the hope that testing
6559 them will be faster than individual option bits. */
6560
6561 md->notbol = (options & PCRE_NOTBOL) != 0;
6562 md->noteol = (options & PCRE_NOTEOL) != 0;
6563 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6564 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6565
6566 md->hitend = FALSE;
6567 md->mark = md->nomatch_mark = NULL; /* In case never set */
6568
6569 md->recursive = NULL; /* No recursion at top level */
6570 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6571
6572 md->lcc = tables + lcc_offset;
6573 md->fcc = tables + fcc_offset;
6574 md->ctypes = tables + ctypes_offset;
6575
6576 /* Handle different \R options. */
6577
6578 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6579 {
6580 case 0:
6581 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6582 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6583 else
6584 #ifdef BSR_ANYCRLF
6585 md->bsr_anycrlf = TRUE;
6586 #else
6587 md->bsr_anycrlf = FALSE;
6588 #endif
6589 break;
6590
6591 case PCRE_BSR_ANYCRLF:
6592 md->bsr_anycrlf = TRUE;
6593 break;
6594
6595 case PCRE_BSR_UNICODE:
6596 md->bsr_anycrlf = FALSE;
6597 break;
6598
6599 default: return PCRE_ERROR_BADNEWLINE;
6600 }
6601
6602 /* Handle different types of newline. The three bits give eight cases. If
6603 nothing is set at run time, whatever was used at compile time applies. */
6604
6605 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6606 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6607 {
6608 case 0: newline = NEWLINE; break; /* Compile-time default */
6609 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6610 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6611 case PCRE_NEWLINE_CR+
6612 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6613 case PCRE_NEWLINE_ANY: newline = -1; break;
6614 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6615 default: return PCRE_ERROR_BADNEWLINE;
6616 }
6617
6618 if (newline == -2)
6619 {
6620 md->nltype = NLTYPE_ANYCRLF;
6621 }
6622 else if (newline < 0)
6623 {
6624 md->nltype = NLTYPE_ANY;
6625 }
6626 else
6627 {
6628 md->nltype = NLTYPE_FIXED;
6629 if (newline > 255)
6630 {
6631 md->nllen = 2;
6632 md->nl[0] = (newline >> 8) & 255;
6633 md->nl[1] = newline & 255;
6634 }
6635 else
6636 {
6637 md->nllen = 1;
6638 md->nl[0] = newline;
6639 }
6640 }
6641
6642 /* Partial matching was originally supported only for a restricted set of
6643 regexes; from release 8.00 there are no restrictions, but the bits are still
6644 defined (though never set). So there's no harm in leaving this code. */
6645
6646 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6647 return PCRE_ERROR_BADPARTIAL;
6648
6649 /* If the expression has got more back references than the offsets supplied can
6650 hold, we get a temporary chunk of working store to use during the matching.
6651 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6652 of 3. */
6653
6654 ocount = offsetcount - (offsetcount % 3);
6655 arg_offset_max = (2*ocount)/3;
6656
6657 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6658 {
6659 ocount = re->top_backref * 3 + 3;
6660 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6661 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6662 using_temporary_offsets = TRUE;
6663 DPRINTF(("Got memory to hold back references\n"));
6664 }
6665 else md->offset_vector = offsets;
6666 md->offset_end = ocount;
6667 md->offset_max = (2*ocount)/3;
6668 md->capture_last = 0;
6669
6670 /* Reset the working variable associated with each extraction. These should
6671 never be used unless previously set, but they get saved and restored, and so we
6672 initialize them to avoid reading uninitialized locations. Also, unset the
6673 offsets for the matched string. This is really just for tidiness with callouts,
6674 in case they inspect these fields. */
6675
6676 if (md->offset_vector != NULL)
6677 {
6678 register int *iptr = md->offset_vector + ocount;
6679 register int *iend = iptr - re->top_bracket;
6680 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6681 while (--iptr >= iend) *iptr = -1;
6682 md->offset_vector[0] = md->offset_vector[1] = -1;
6683 }
6684
6685 /* Set up the first character to match, if available. The first_char value is
6686 never set for an anchored regular expression, but the anchoring may be forced
6687 at run time, so we have to test for anchoring. The first char may be unset for
6688 an unanchored pattern, of course. If there's no first char and the pattern was
6689 studied, there may be a bitmap of possible first characters. */
6690
6691 if (!anchored)
6692 {
6693 if ((re->flags & PCRE_FIRSTSET) != 0)
6694 {
6695 has_first_char = TRUE;
6696 first_char = first_char2 = (pcre_uchar)(re->first_char);
6697 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6698 {
6699 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6700 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6701 if (utf && first_char > 127)
6702 first_char2 = UCD_OTHERCASE(first_char);
6703 #endif
6704 }
6705 }
6706 else
6707 if (!startline && study != NULL &&
6708 (study->flags & PCRE_STUDY_MAPPED) != 0)
6709 start_bits = study->start_bits;
6710 }
6711
6712 /* For anchored or unanchored matches, there may be a "last known required
6713 character" set. */
6714
6715 if ((re->flags & PCRE_REQCHSET) != 0)
6716 {
6717 has_req_char = TRUE;
6718 req_char = req_char2 = (pcre_uchar)(re->req_char);
6719 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6720 {
6721 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6722 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6723 if (utf && req_char > 127)
6724 req_char2 = UCD_OTHERCASE(req_char);
6725 #endif
6726 }
6727 }
6728
6729
6730 /* ==========================================================================*/
6731
6732 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6733 the loop runs just once. */
6734
6735 for(;;)
6736 {
6737 PCRE_PUCHAR save_end_subject = end_subject;
6738 PCRE_PUCHAR new_start_match;
6739
6740 /* If firstline is TRUE, the start of the match is constrained to the first
6741 line of a multiline string. That is, the match must be before or at the first
6742 newline. Implement this by temporarily adjusting end_subject so that we stop
6743 scanning at a newline. If the match fails at the newline, later code breaks
6744 this loop. */
6745
6746 if (firstline)
6747 {
6748 PCRE_PUCHAR t = start_match;
6749 #ifdef SUPPORT_UTF
6750 if (utf)
6751 {
6752 while (t < md->end_subject && !IS_NEWLINE(t))
6753 {
6754 t++;
6755 ACROSSCHAR(t < end_subject, *t, t++);