/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 842 - (show annotations) (download)
Sat Dec 31 15:19:04 2011 UTC (2 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 211344 byte(s)
Typos in PrepareRelease; include 16-bit in make distcheck; fix RunTest and 
RunGrepTest when building in non-source directory; problem in pcretest when no 
UTF or 16-bit support; other tidies consequent on building a tarball.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2012 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
/* match() tests md->match_function_type for MATCH_CBEGROUP on entry and resets
the field to 0 once that request has been consumed. */
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
/* Both values are >= 0; every other return from match() is negative. */
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. */
75
/* The notes on individual codes describe how the backtracking-verb opcode
handlers in match() produce them. Several of them pass extra information back
by overloading md->start_match_ptr. */
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998) /* OP_COMMIT: rest failed; overrides PRUNE, SKIP, THEN */
78 #define MATCH_KETRPOS (-997)
79 #define MATCH_ONCE (-996) /* Backing up through an atomic group */
80 #define MATCH_PRUNE (-995) /* OP_PRUNE[_ARG]: rest failed; overrides THEN */
81 #define MATCH_SKIP (-994) /* OP_SKIP: md->start_match_ptr is the subject position */
82 #define MATCH_SKIP_ARG (-993) /* OP_SKIP_ARG: md->start_match_ptr is the skip name */
83 #define MATCH_THEN (-992) /* OP_THEN[_ARG]: md->start_match_ptr is the opcode address */
84
85 /* Maximum number of ints of offset to save on the stack for recursive calls.
86 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87 because the offset vector is always a multiple of 3 long. */
88
89 #define REC_STACK_SAVE_MAX 30
90
91 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92
/* Two parallel six-entry tables. NOTE(review): the index is presumably the
repeat opcode minus the first repeat opcode (i.e. *, *?, +, +?, ?, ??) - the
lookup is not in this part of the file, so confirm against the use in match(). */
93 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95
96
97
98 #ifdef PCRE_DEBUG
99 /*************************************************
100 * Debugging function to print chars *
101 *************************************************/
102
103 /* Print a sequence of chars in printable format, stopping at the end of the
104 subject if requested.
105
106 Arguments:
107 p points to characters
108 length number to print
109 is_subject TRUE if printing from within md->start_subject
110 md pointer to matching data block, if is_subject is TRUE
111
112 Returns: nothing
113 */
114
115 static void
116 pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
117 {
118 unsigned int c;
119 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120 while (length-- > 0)
121 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122 }
123 #endif
124
125
126
127 /*************************************************
128 * Match a back-reference *
129 *************************************************/
130
131 /* Normally, if a back reference hasn't been set, the length that is passed is
132 negative, so the match always fails. However, in JavaScript compatibility mode,
133 the length passed is zero. Note that in caseless UTF-8 mode, the number of
134 subject bytes matched may be different to the number of reference bytes.
135
136 Arguments:
137 offset index into the offset vector
138 eptr pointer into the subject
139 length length of reference to be matched (number of bytes)
140 md points to match data block
141 caseless TRUE if caseless
142
143 Returns: < 0 if not matched, otherwise the number of subject bytes matched
144 */
145
146 static int
147 match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
148 BOOL caseless)
149 {
150 PCRE_PUCHAR eptr_start = eptr;
151 register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
152
153 #ifdef PCRE_DEBUG
154 if (eptr >= md->end_subject)
155 printf("matching subject <null>");
156 else
157 {
158 printf("matching subject ");
159 pchars(eptr, length, TRUE, md);
160 }
161 printf(" against backref ");
162 pchars(p, length, FALSE, md);
163 printf("\n");
164 #endif
165
166 /* Always fail if reference not set (and not JavaScript compatible). */
167
168 if (length < 0) return -1;
169
170 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171 properly if Unicode properties are supported. Otherwise, we can check only
172 ASCII characters. */
173
174 if (caseless)
175 {
176 #ifdef SUPPORT_UTF
177 #ifdef SUPPORT_UCP
178 if (md->utf)
179 {
180 /* Match characters up to the end of the reference. NOTE: the number of
181 bytes matched may differ, because there are some characters whose upper and
182 lower case versions code as different numbers of bytes. For example, U+023A
183 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
184 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
185 the latter. It is important, therefore, to check the length along the
186 reference, not along the subject (earlier code did this wrong). */
187
188 PCRE_PUCHAR endptr = p + length;
189 while (p < endptr)
190 {
191 int c, d;
192 if (eptr >= md->end_subject) return -1;
193 GETCHARINC(c, eptr);
194 GETCHARINC(d, p);
195 if (c != d && c != UCD_OTHERCASE(d)) return -1;
196 }
197 }
198 else
199 #endif
200 #endif
201
202 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203 is no UCP support. */
204 {
205 if (eptr + length > md->end_subject) return -1;
206 while (length-- > 0)
207 {
208 if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
209 p++;
210 eptr++;
211 }
212 }
213 }
214
215 /* In the caseful case, we can just compare the bytes, whether or not we
216 are in UTF-8 mode. */
217
218 else
219 {
220 if (eptr + length > md->end_subject) return -1;
221 while (length-- > 0) if (*p++ != *eptr++) return -1;
222 }
223
224 return (int)(eptr - eptr_start);
225 }
226
227
228
229 /***************************************************************************
230 ****************************************************************************
231 RECURSION IN THE match() FUNCTION
232
233 The match() function is highly recursive, though not every recursive call
234 increases the recursive depth. Nevertheless, some regular expressions can cause
235 it to recurse to a great depth. I was writing for Unix, so I just let it call
236 itself recursively. This uses the stack for saving everything that has to be
237 saved for a recursive call. On Unix, the stack can be large, and this works
238 fine.
239
240 It turns out that on some non-Unix-like systems there are problems with
241 programs that use a lot of stack. (This despite the fact that every last chip
242 has oodles of memory these days, and techniques for extending the stack have
243 been known for decades.) So....
244
245 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246 calls by keeping local variables that need to be preserved in blocks of memory
247 obtained from malloc() instead of on the stack. Macros are used to
248 achieve this so that the actual code doesn't look very different to what it
249 always used to.
250
251 The original heap-recursive code used longjmp(). However, it seems that this
252 can be very slow on some operating systems. Following a suggestion from Stan
253 Switzer, the use of longjmp() has been abolished, at the cost of having to
254 provide a unique number for each call to RMATCH. There is no way of generating
255 a sequence of numbers at compile time in C. I have given them names, to make
256 them stand out more clearly.
257
258 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260 tests. Furthermore, not using longjmp() means that local dynamic variables
261 don't have indeterminate values; this has meant that the frame size can be
262 reduced because the result can be "passed back" by straight setting of the
263 variable instead of being passed in the frame.
264 ****************************************************************************
265 ***************************************************************************/
266
267 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268 below must be updated in sync. */
269
/* RM1 is pinned to 1 and the remaining enumerators follow consecutively, so
every RMATCH call site has a distinct non-zero identifier. In the NO_RECURSE
build that identifier is stored in the calling frame's Xwhere field and is also
pasted into the name of the resume label (L_RMn). */
270 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
271 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
272 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
273 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
274 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
275 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
276 RM61, RM62, RM63, RM64, RM65, RM66 };
277
278 /* These versions of the macros use the stack, as normal. There are debugging
279 versions and production versions. Note that the "rw" argument of RMATCH isn't
280 actually used in this definition. */
281
282 #ifndef NO_RECURSE
/* REGISTER qualifies the eptr and ecode parameters of match(); in this build
it expands to the "register" storage class. */
283 #define REGISTER register
284
285 #ifdef PCRE_DEBUG
/* Debug build. RMATCH calls match() one level deeper (rdepth+1), passing the
caller's current mstart, and leaves the result in the caller's rrc variable;
both macros trace to stdout. Each expands to a brace block rather than a single
statement, so take care when using them as the body of an if that has an else. */
286 #define RMATCH(ra,rb,rc,rd,re,rw) \
287 { \
288 printf("match() called in line %d\n", __LINE__); \
289 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
290 printf("to line %d\n", __LINE__); \
291 }
292 #define RRETURN(ra) \
293 { \
294 printf("match() returned %d from line %d ", ra, __LINE__); \
295 return ra; \
296 }
297 #else
/* Production build: a plain recursive call whose result is assigned to rrc,
and a plain return. */
298 #define RMATCH(ra,rb,rc,rd,re,rw) \
299 rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
300 #define RRETURN(ra) return ra
301 #endif
302
303 #else
304
305
306 /* These versions of the macros manage a private stack on the heap. Note that
307 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308 argument of match(), which never changes. */
309
/* REGISTER expands to nothing in this build. NOTE(review): presumably because
eptr and ecode are redirected to heap-frame fields by macros inside match() -
confirm. */
310 #define REGISTER
311
/* Heap-frame "call". Steps performed:
  - obtain a new heapframe from PUBL(stack_malloc); on failure leave via
    RRETURN(PCRE_ERROR_NOMEMORY);
  - record the call-site identifier rw in the CURRENT frame's Xwhere;
  - fill the new frame's argument fields from ra, rb, rc and re, copy the
    current mstart, and set Xrdepth to one more than the current depth;
  - link the new frame to the current one through Xprevframe, make it the
    current frame, and jump to HEAP_RECURSE.
The label L_<rw> that follows the goto is the point at which execution resumes
for this call site. */
312 #define RMATCH(ra,rb,rc,rd,re,rw)\
313 {\
314 heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
315 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
316 frame->Xwhere = rw; \
317 newframe->Xeptr = ra;\
318 newframe->Xecode = rb;\
319 newframe->Xmstart = mstart;\
320 newframe->Xoffset_top = rc;\
321 newframe->Xeptrb = re;\
322 newframe->Xrdepth = frame->Xrdepth + 1;\
323 newframe->Xprevframe = frame;\
324 frame = newframe;\
325 DPRINTF(("restarting from line %d\n", __LINE__));\
326 goto HEAP_RECURSE;\
327 L_##rw:\
328 DPRINTF(("jumped back to line %d\n", __LINE__));\
329 }
330
/* Heap-frame "return". Pops and frees the current frame. If a previous frame
exists, the result is placed in rrc and control jumps to HEAP_RETURN so that
the suspended call site can be resumed; otherwise the top level has finished
and match() really returns.

The NULL test at the start covers the case where the very first frame could not
be allocated: match() invokes RRETURN(PCRE_ERROR_NOMEMORY) with frame == NULL,
and without the test the macro would dereference and free a null pointer. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe == NULL) return ra;\
  frame = oldframe->Xprevframe;\
  (PUBL(stack_free))(oldframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
343
344
345 /* Structure for remembering the local variables in a private frame */
346
/* One frame of the private heap "stack" used when NO_RECURSE is defined. Each
field Xname stands in for the variable "name" of match(); inside match() a set
of #defines maps the plain names onto the current frame's fields. */
347 typedef struct heapframe {
/* Link to the calling frame; NULL marks the top-level frame. */
348 struct heapframe *Xprevframe;
349
350 /* Function arguments that may change */
351
352 PCRE_PUCHAR Xeptr;
353 const pcre_uchar *Xecode;
354 PCRE_PUCHAR Xmstart;
355 int Xoffset_top;
356 eptrblock *Xeptrb;
357 unsigned int Xrdepth;
358
359 /* Function local variables */
360
361 PCRE_PUCHAR Xcallpat;
362 #ifdef SUPPORT_UTF
363 PCRE_PUCHAR Xcharptr;
364 #endif
365 PCRE_PUCHAR Xdata;
366 PCRE_PUCHAR Xnext;
367 PCRE_PUCHAR Xpp;
368 PCRE_PUCHAR Xprev;
369 PCRE_PUCHAR Xsaved_eptr;
370
371 recursion_info Xnew_recursive;
372
/* These three BOOLs (and Xcodelink and Xnumber below) are also used under
alternative names defined inside match(): allow_zero, cbegroup/condassert,
matched_once, code_offset and foc. */
373 BOOL Xcur_is_word;
374 BOOL Xcondition;
375 BOOL Xprev_is_word;
376
377 #ifdef SUPPORT_UCP
378 int Xprop_type;
379 int Xprop_value;
380 int Xprop_fail_result;
381 int Xoclength;
382 pcre_uchar Xocchars[6];
383 #endif
384
385 int Xcodelink;
386 int Xctype;
/* In the recursive build fc and fi are aliases for c and i; here they need
their own storage. */
387 unsigned int Xfc;
388 int Xfi;
389 int Xlength;
390 int Xmax;
391 int Xmin;
392 int Xnumber;
393 int Xoffset;
394 int Xop;
395 int Xsave_capture_last;
396 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397 int Xstacksave[REC_STACK_SAVE_MAX];
398
/* Storage for the eptrblock that match() chains in when it is entered with
md->match_function_type set to MATCH_CBEGROUP. */
399 eptrblock Xnewptrb;
400
401 /* Where to jump back to */
402
/* Holds the RMn identifier of the RMATCH call made from this frame. */
403 int Xwhere;
404
405 } heapframe;
406
407 #endif
408
409
410 /***************************************************************************
411 ***************************************************************************/
412
413
414
415 /*************************************************
416 * Match from current position *
417 *************************************************/
418
419 /* This function is called recursively in many circumstances. Whenever it
420 returns a negative (error) response, the outer incarnation must also return the
421 same response. */
422
423 /* These macros pack up tests that are used for partial matching, and which
424 appear several times in the code. We set the "hit end" flag if the pointer is
425 at the end of the subject and also past the start of the subject (i.e.
426 something has been matched). For hard partial matching, we then return
427 immediately. The second one is used when we already know we are past the end of
428 the subject. */
429
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0), eptr has
reached md->end_subject, and eptr is beyond md->start_used_ptr, record the fact
in md->hitend; if md->partial > 1, leave match() at once with
PCRE_ERROR_PARTIAL. The expansion starts with a bare "if" and uses RRETURN, so
it is only usable as a complete statement inside match(). */
430 #define CHECK_PARTIAL()\
431 if (md->partial != 0 && eptr >= md->end_subject && \
432 eptr > md->start_used_ptr) \
433 { \
434 md->hitend = TRUE; \
435 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
436 }
437
/* SCHECK_PARTIAL: the same, minus the end-of-subject test, for call sites that
have already established that eptr is at or past the end. */
438 #define SCHECK_PARTIAL()\
439 if (md->partial != 0 && eptr > md->start_used_ptr) \
440 { \
441 md->hitend = TRUE; \
442 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
443 }
444
445
446 /* Performance note: It might be tempting to extract commonly used fields from
447 the md structure (e.g. utf, end_subject) into individual variables to improve
448 performance. Tests using gcc on a SPARC disproved this; in the first case, it
449 made performance worse.
450
451 Arguments:
452 eptr pointer to current character in subject
453 ecode pointer to current position in compiled code
454 mstart pointer to the current match start position (can be modified
455 by encountering \K)
456 offset_top current top pointer
457 md pointer to "static" info for the match
458 eptrb pointer to chain of blocks containing eptr at start of
459 brackets - for testing for empty matches
460 rdepth the recursion depth
461
462 Returns: MATCH_MATCH if matched ) these values are >= 0
463 MATCH_NOMATCH if failed to match )
464 a negative MATCH_xxx value for PRUNE, SKIP, etc
465 a negative PCRE_ERROR_xxx value if aborted by an error condition
466 (e.g. stopped by repeated call or recursion limit)
467 */
468
469 static int
470 match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
471 PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
472 unsigned int rdepth)
473 {
474 /* These variables do not need to be preserved over recursion in this function,
475 so they can be ordinary variables in all cases. Mark some of them with
476 "register" because they are used a lot in loops. */
477
478 register int rrc; /* Returns from recursive calls */
479 register int i; /* Used for loops not involving calls to RMATCH() */
480 register unsigned int c; /* Character values not kept over RMATCH() calls */
481 register BOOL utf; /* Local copy of UTF flag for speed */
482
483 BOOL minimize, possessive; /* Quantifier options */
484 BOOL caseless;
485 int condcode;
486
487 /* When recursion is not being used, all "local" variables that have to be
488 preserved over calls to RMATCH() are part of a "frame" which is obtained from
489 heap storage. Set up the top-level frame here; others are obtained from the
490 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
491
492 #ifdef NO_RECURSE
493 heapframe *frame = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));
494 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
495 frame->Xprevframe = NULL; /* Marks the top level */
496
497 /* Copy in the original argument variables */
498
499 frame->Xeptr = eptr;
500 frame->Xecode = ecode;
501 frame->Xmstart = mstart;
502 frame->Xoffset_top = offset_top;
503 frame->Xeptrb = eptrb;
504 frame->Xrdepth = rdepth;
505
506 /* This is where control jumps back to in order to effect "recursion" */
507
508 HEAP_RECURSE:
509
510 /* Macros make the argument variables come from the current frame */
511
512 #define eptr frame->Xeptr
513 #define ecode frame->Xecode
514 #define mstart frame->Xmstart
515 #define offset_top frame->Xoffset_top
516 #define eptrb frame->Xeptrb
517 #define rdepth frame->Xrdepth
518
519 /* Ditto for the local variables */
520
521 #ifdef SUPPORT_UTF
522 #define charptr frame->Xcharptr
523 #endif
524 #define callpat frame->Xcallpat
525 #define codelink frame->Xcodelink
526 #define data frame->Xdata
527 #define next frame->Xnext
528 #define pp frame->Xpp
529 #define prev frame->Xprev
530 #define saved_eptr frame->Xsaved_eptr
531
532 #define new_recursive frame->Xnew_recursive
533
534 #define cur_is_word frame->Xcur_is_word
535 #define condition frame->Xcondition
536 #define prev_is_word frame->Xprev_is_word
537
538 #ifdef SUPPORT_UCP
539 #define prop_type frame->Xprop_type
540 #define prop_value frame->Xprop_value
541 #define prop_fail_result frame->Xprop_fail_result
542 #define oclength frame->Xoclength
543 #define occhars frame->Xocchars
544 #endif
545
546 #define ctype frame->Xctype
547 #define fc frame->Xfc
548 #define fi frame->Xfi
549 #define length frame->Xlength
550 #define max frame->Xmax
551 #define min frame->Xmin
552 #define number frame->Xnumber
553 #define offset frame->Xoffset
554 #define op frame->Xop
555 #define save_capture_last frame->Xsave_capture_last
556 #define save_offset1 frame->Xsave_offset1
557 #define save_offset2 frame->Xsave_offset2
558 #define save_offset3 frame->Xsave_offset3
559 #define stacksave frame->Xstacksave
560
561 #define newptrb frame->Xnewptrb
562
563 /* When recursion is being used, local variables are allocated on the stack and
564 get preserved during recursion in the normal way. In this environment, fi and
565 i, and fc and c, can be the same variables. */
566
567 #else /* NO_RECURSE not defined */
568 #define fi i
569 #define fc c
570
571 /* Many of the following variables are used only in small blocks of the code.
572 My normal style of coding would have declared them within each of those blocks.
573 However, in order to accommodate the version of this code that uses an external
574 "stack" implemented on the heap, it is easier to declare them all here, so the
575 declarations can be cut out in a block. The only declarations within blocks
576 below are for variables that do not have to be preserved over a recursive call
577 to RMATCH(). */
578
579 #ifdef SUPPORT_UTF
580 const pcre_uchar *charptr;
581 #endif
582 const pcre_uchar *callpat;
583 const pcre_uchar *data;
584 const pcre_uchar *next;
585 PCRE_PUCHAR pp;
586 const pcre_uchar *prev;
587 PCRE_PUCHAR saved_eptr;
588
589 recursion_info new_recursive;
590
591 BOOL cur_is_word;
592 BOOL condition;
593 BOOL prev_is_word;
594
595 #ifdef SUPPORT_UCP
596 int prop_type;
597 int prop_value;
598 int prop_fail_result;
599 int oclength;
600 pcre_uchar occhars[6];
601 #endif
602
603 int codelink;
604 int ctype;
605 int length;
606 int max;
607 int min;
608 int number;
609 int offset;
610 int op;
611 int save_capture_last;
612 int save_offset1, save_offset2, save_offset3;
613 int stacksave[REC_STACK_SAVE_MAX];
614
615 eptrblock newptrb;
616 #endif /* NO_RECURSE */
617
618 /* To save space on the stack and in the heap frame, I have doubled up on some
619 of the local variables that are used only in localised parts of the code, but
620 still need to be preserved over recursive calls of match(). These macros define
621 the alternative names that are used. */
622
623 #define allow_zero cur_is_word
624 #define cbegroup condition
625 #define code_offset codelink
626 #define condassert condition
627 #define matched_once prev_is_word
628 #define foc number
629
630 /* These statements are here to stop the compiler complaining about uninitialized
631 variables. */
632
633 #ifdef SUPPORT_UCP
634 prop_value = 0;
635 prop_fail_result = 0;
636 #endif
637
638
639 /* This label is used for tail recursion, which is used in a few cases even
640 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
641 used. Thanks to Ian Taylor for noticing this possibility and sending the
642 original patch. */
643
644 TAIL_RECURSE:
645
646 /* OK, now we can get on with the real code of the function. Recursive calls
647 are specified by the macro RMATCH and RRETURN is used to return. When
648 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
649 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
650 defined). However, RMATCH isn't like a function call because it's quite a
651 complicated macro. It has to be used in one particular way. This shouldn't,
652 however, impact performance when true recursion is being used. */
653
654 #ifdef SUPPORT_UTF
655 utf = md->utf; /* Local copy of the flag */
656 #else
657 utf = FALSE;
658 #endif
659
660 /* First check that we haven't called match() too many times, or that we
661 haven't exceeded the recursive call limit. */
662
663 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
664 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
665
666 /* At the start of a group with an unlimited repeat that may match an empty
667 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
668 done this way to save having to use another function argument, which would take
669 up space on the stack. See also MATCH_CONDASSERT below.
670
671 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
672 such remembered pointers, to be checked when we hit the closing ket, in order
673 to break infinite loops that match no characters. When match() is called in
674 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
675 NOT be used with tail recursion, because the memory block that is used is on
676 the stack, so a new one may be required for each match(). */
677
678 if (md->match_function_type == MATCH_CBEGROUP)
679 {
680 newptrb.epb_saved_eptr = eptr;
681 newptrb.epb_prev = eptrb;
682 eptrb = &newptrb;
683 md->match_function_type = 0;
684 }
685
686 /* Now start processing the opcodes. */
687
688 for (;;)
689 {
690 minimize = possessive = FALSE;
691 op = *ecode;
692
693 switch(op)
694 {
695 case OP_MARK:
696 md->nomatch_mark = ecode + 2;
697 md->mark = NULL; /* In case previously set by assertion */
698 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
699 eptrb, RM55);
700 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
701 md->mark == NULL) md->mark = ecode + 2;
702
703 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
704 argument, and we must check whether that argument matches this MARK's
705 argument. It is passed back in md->start_match_ptr (an overloading of that
706 variable). If it does match, we reset that variable to the current subject
707 position and return MATCH_SKIP. Otherwise, pass back the return code
708 unaltered. */
709
710 else if (rrc == MATCH_SKIP_ARG &&
711 STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
712 {
713 md->start_match_ptr = eptr;
714 RRETURN(MATCH_SKIP);
715 }
716 RRETURN(rrc);
717
718 case OP_FAIL:
719 RRETURN(MATCH_NOMATCH);
720
721 /* COMMIT overrides PRUNE, SKIP, and THEN */
722
723 case OP_COMMIT:
724 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
725 eptrb, RM52);
726 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
727 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
728 rrc != MATCH_THEN)
729 RRETURN(rrc);
730 RRETURN(MATCH_COMMIT);
731
732 /* PRUNE overrides THEN */
733
734 case OP_PRUNE:
735 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
736 eptrb, RM51);
737 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
738 RRETURN(MATCH_PRUNE);
739
740 case OP_PRUNE_ARG:
741 md->nomatch_mark = ecode + 2;
742 md->mark = NULL; /* In case previously set by assertion */
743 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
744 eptrb, RM56);
745 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
746 md->mark == NULL) md->mark = ecode + 2;
747 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
748 RRETURN(MATCH_PRUNE);
749
750 /* SKIP overrides PRUNE and THEN */
751
752 case OP_SKIP:
753 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
754 eptrb, RM53);
755 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
756 RRETURN(rrc);
757 md->start_match_ptr = eptr; /* Pass back current position */
758 RRETURN(MATCH_SKIP);
759
760 /* Note that, for Perl compatibility, SKIP with an argument does NOT set
761 nomatch_mark. There is a flag that disables this opcode when re-matching a
762 pattern that ended with a SKIP for which there was not a matching MARK. */
763
764 case OP_SKIP_ARG:
765 if (md->ignore_skip_arg)
766 {
767 ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
768 break;
769 }
770 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
771 eptrb, RM57);
772 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
773 RRETURN(rrc);
774
775 /* Pass back the current skip name by overloading md->start_match_ptr and
776 returning the special MATCH_SKIP_ARG return code. This will either be
777 caught by a matching MARK, or get to the top, where it causes a rematch
778 with the md->ignore_skip_arg flag set. */
779
780 md->start_match_ptr = ecode + 2;
781 RRETURN(MATCH_SKIP_ARG);
782
783 /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
784 the branch in which it occurs can be determined. Overload the start of
785 match pointer to do this. */
786
787 case OP_THEN:
788 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
789 eptrb, RM54);
790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
791 md->start_match_ptr = ecode;
792 RRETURN(MATCH_THEN);
793
794 case OP_THEN_ARG:
795 md->nomatch_mark = ecode + 2;
796 md->mark = NULL; /* In case previously set by assertion */
797 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
798 md, eptrb, RM58);
799 if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800 md->mark == NULL) md->mark = ecode + 2;
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode;
803 RRETURN(MATCH_THEN);
804
805 /* Handle an atomic group that does not contain any capturing parentheses.
806 This can be handled like an assertion. Prior to 8.13, all atomic groups
807 were handled this way. In 8.13, the code was changed as below for ONCE, so
808 that backups pass through the group and thereby reset captured values.
809 However, this uses a lot more stack, so in 8.20, atomic groups that do not
810 contain any captures generate OP_ONCE_NC, which can be handled in the old,
811 less stack intensive way.
812
813 Check the alternative branches in turn - the matching won't pass the KET
814 for this kind of subpattern. If any one branch matches, we carry on as at
815 the end of a normal bracket, leaving the subject pointer, but resetting
816 the start-of-match value in case it was changed by \K. */
817
818 case OP_ONCE_NC:
819 prev = ecode;
820 saved_eptr = eptr;
821 do
822 {
823 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
824 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
825 {
826 mstart = md->start_match_ptr;
827 break;
828 }
829 if (rrc == MATCH_THEN)
830 {
831 next = ecode + GET(ecode,1);
832 if (md->start_match_ptr < next &&
833 (*ecode == OP_ALT || *next == OP_ALT))
834 rrc = MATCH_NOMATCH;
835 }
836
837 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
838 ecode += GET(ecode,1);
839 }
840 while (*ecode == OP_ALT);
841
842 /* If hit the end of the group (which could be repeated), fail */
843
844 if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
845
846 /* Continue as from after the group, updating the offsets high water
847 mark, since extracts may have been taken. */
848
849 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
850
851 offset_top = md->end_offset_top;
852 eptr = md->end_match_ptr;
853
854 /* For a non-repeating ket, just continue at this level. This also
855 happens for a repeating ket if no characters were matched in the group.
856 This is the forcible breaking of infinite loops as implemented in Perl
857 5.005. */
858
859 if (*ecode == OP_KET || eptr == saved_eptr)
860 {
861 ecode += 1+LINK_SIZE;
862 break;
863 }
864
865 /* The repeating kets try the rest of the pattern or restart from the
866 preceding bracket, in the appropriate order. The second "call" of match()
867 uses tail recursion, to avoid using another stack frame. */
868
869 if (*ecode == OP_KETRMIN)
870 {
871 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
873 ecode = prev;
874 goto TAIL_RECURSE;
875 }
876 else /* OP_KETRMAX */
877 {
878 md->match_function_type = MATCH_CBEGROUP;
879 RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
880 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
881 ecode += 1 + LINK_SIZE;
882 goto TAIL_RECURSE;
883 }
884 /* Control never gets here */
885
886 /* Handle a capturing bracket, other than those that are possessive with an
887 unlimited repeat. If there is space in the offset vector, save the current
888 subject position in the working slot at the top of the vector. We mustn't
889 change the current values of the data slot, because they may be set from a
890 previous iteration of this group, and be referred to by a reference inside
891 the group. A failure to match might occur after the group has succeeded,
892 if something later on doesn't match. For this reason, we need to restore
893 the working value and also the values of the final offsets, in case they
894 were set by a previous iteration of the same bracket.
895
896 If there isn't enough space in the offset vector, treat this as if it were
897 a non-capturing bracket. Don't worry about setting the flag for the error
898 case here; that is handled in the code for KET. */
899
900 case OP_CBRA:
901 case OP_SCBRA:
902 number = GET2(ecode, 1+LINK_SIZE);
903 offset = number << 1;
904
905 #ifdef PCRE_DEBUG
906 printf("start bracket %d\n", number);
907 printf("subject=");
908 pchars(eptr, 16, TRUE, md);
909 printf("\n");
910 #endif
911
912 if (offset < md->offset_max)
913 {
914 save_offset1 = md->offset_vector[offset];
915 save_offset2 = md->offset_vector[offset+1];
916 save_offset3 = md->offset_vector[md->offset_end - number];
917 save_capture_last = md->capture_last;
918
919 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
920 md->offset_vector[md->offset_end - number] =
921 (int)(eptr - md->start_subject);
922
923 for (;;)
924 {
925 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
926 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
927 eptrb, RM1);
928 if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
929
930 /* If we backed up to a THEN, check whether it is within the current
931 branch by comparing the address of the THEN that is passed back with
932 the end of the branch. If it is within the current branch, and the
933 branch is one of two or more alternatives (it either starts or ends
934 with OP_ALT), we have reached the limit of THEN's action, so convert
935 the return code to NOMATCH, which will cause normal backtracking to
936 happen from now on. Otherwise, THEN is passed back to an outer
937 alternative. This implements Perl's treatment of parenthesized groups,
938 where a group not containing | does not affect the current alternative,
939 that is, (X) is NOT the same as (X|(*F)). */
940
941 if (rrc == MATCH_THEN)
942 {
943 next = ecode + GET(ecode,1);
944 if (md->start_match_ptr < next &&
945 (*ecode == OP_ALT || *next == OP_ALT))
946 rrc = MATCH_NOMATCH;
947 }
948
949 /* Anything other than NOMATCH is passed back. */
950
951 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
952 md->capture_last = save_capture_last;
953 ecode += GET(ecode, 1);
954 if (*ecode != OP_ALT) break;
955 }
956
957 DPRINTF(("bracket %d failed\n", number));
958 md->offset_vector[offset] = save_offset1;
959 md->offset_vector[offset+1] = save_offset2;
960 md->offset_vector[md->offset_end - number] = save_offset3;
961
962 /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
963
964 RRETURN(rrc);
965 }
966
967 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
968 as a non-capturing bracket. */
969
970 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
971 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
972
973 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
974
975 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
976 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
977
978 /* Non-capturing or atomic group, except for possessive with unlimited
979 repeat and ONCE group with no captures. Loop for all the alternatives.
980
981 When we get to the final alternative within the brackets, we used to return
982 the result of a recursive call to match() whatever happened so it was
983 possible to reduce stack usage by turning this into a tail recursion,
984 except in the case of a possibly empty group. However, now that there is
985 the possibility of (*THEN) occurring in the final alternative, this
986 optimization is no longer always possible.
987
988 We can optimize if we know there are no (*THEN)s in the pattern; at present
989 this is the best that can be done.
990
991 MATCH_ONCE is returned when the end of an atomic group is successfully
992 reached, but subsequent matching fails. It passes back up the tree (causing
993 captured values to be reset) until the original atomic group level is
994 reached. This is tested by comparing md->once_target with the start of the
995 group. At this point, the return is converted into MATCH_NOMATCH so that
996 previous backup points can be taken. */
997
998 case OP_ONCE:
999 case OP_BRA:
1000 case OP_SBRA:
1001 DPRINTF(("start non-capturing bracket\n"));
1002
1003 for (;;)
1004 {
1005 if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
1006
1007 /* If this is not a possibly empty group, and there are no (*THEN)s in
1008 the pattern, and this is the final alternative, optimize as described
1009 above. */
1010
1011 else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1012 {
1013 ecode += PRIV(OP_lengths)[*ecode];
1014 goto TAIL_RECURSE;
1015 }
1016
1017 /* In all other cases, we have to make another call to match(). */
1018
1019 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1020 RM2);
1021
1022 /* See comment in the code for capturing groups above about handling
1023 THEN. */
1024
1025 if (rrc == MATCH_THEN)
1026 {
1027 next = ecode + GET(ecode,1);
1028 if (md->start_match_ptr < next &&
1029 (*ecode == OP_ALT || *next == OP_ALT))
1030 rrc = MATCH_NOMATCH;
1031 }
1032
1033 if (rrc != MATCH_NOMATCH)
1034 {
1035 if (rrc == MATCH_ONCE)
1036 {
1037 const pcre_uchar *scode = ecode;
1038 if (*scode != OP_ONCE) /* If not at start, find it */
1039 {
1040 while (*scode == OP_ALT) scode += GET(scode, 1);
1041 scode -= GET(scode, 1);
1042 }
1043 if (md->once_target == scode) rrc = MATCH_NOMATCH;
1044 }
1045 RRETURN(rrc);
1046 }
1047 ecode += GET(ecode, 1);
1048 if (*ecode != OP_ALT) break;
1049 }
1050
1051 RRETURN(MATCH_NOMATCH);
1052
1053 /* Handle possessive capturing brackets with an unlimited repeat. We come
1054 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
1055 handled similarly to the normal case above. However, the matching is
1056 different. The end of these brackets will always be OP_KETRPOS, which
1057 returns MATCH_KETRPOS without going further in the pattern. By this means
1058 we can handle the group by iteration rather than recursion, thereby
1059 reducing the amount of stack needed. */
1060
1061 case OP_CBRAPOS:
1062 case OP_SCBRAPOS:
1063 allow_zero = FALSE;
1064
1065 POSSESSIVE_CAPTURE:
1066 number = GET2(ecode, 1+LINK_SIZE);
1067 offset = number << 1;
1068
1069 #ifdef PCRE_DEBUG
1070 printf("start possessive bracket %d\n", number);
1071 printf("subject=");
1072 pchars(eptr, 16, TRUE, md);
1073 printf("\n");
1074 #endif
1075
1076 if (offset < md->offset_max)
1077 {
1078 matched_once = FALSE;
1079 code_offset = (int)(ecode - md->start_code);
1080
1081 save_offset1 = md->offset_vector[offset];
1082 save_offset2 = md->offset_vector[offset+1];
1083 save_offset3 = md->offset_vector[md->offset_end - number];
1084 save_capture_last = md->capture_last;
1085
1086 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1087
1088 /* Each time round the loop, save the current subject position for use
1089 when the group matches. For MATCH_MATCH, the group has matched, so we
1090 restart it with a new subject starting position, remembering that we had
1091 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1092 usual. If we haven't matched any alternatives in any iteration, check to
1093 see if a previous iteration matched. If so, the group has matched;
1094 continue from afterwards. Otherwise it has failed; restore the previous
1095 capture values before returning NOMATCH. */
1096
1097 for (;;)
1098 {
1099 md->offset_vector[md->offset_end - number] =
1100 (int)(eptr - md->start_subject);
1101 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1102 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1103 eptrb, RM63);
1104 if (rrc == MATCH_KETRPOS)
1105 {
1106 offset_top = md->end_offset_top;
1107 eptr = md->end_match_ptr;
1108 ecode = md->start_code + code_offset;
1109 save_capture_last = md->capture_last;
1110 matched_once = TRUE;
1111 continue;
1112 }
1113
1114 /* See comment in the code for capturing groups above about handling
1115 THEN. */
1116
1117 if (rrc == MATCH_THEN)
1118 {
1119 next = ecode + GET(ecode,1);
1120 if (md->start_match_ptr < next &&
1121 (*ecode == OP_ALT || *next == OP_ALT))
1122 rrc = MATCH_NOMATCH;
1123 }
1124
1125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1126 md->capture_last = save_capture_last;
1127 ecode += GET(ecode, 1);
1128 if (*ecode != OP_ALT) break;
1129 }
1130
1131 if (!matched_once)
1132 {
1133 md->offset_vector[offset] = save_offset1;
1134 md->offset_vector[offset+1] = save_offset2;
1135 md->offset_vector[md->offset_end - number] = save_offset3;
1136 }
1137
1138 if (allow_zero || matched_once)
1139 {
1140 ecode += 1 + LINK_SIZE;
1141 break;
1142 }
1143
1144 RRETURN(MATCH_NOMATCH);
1145 }
1146
1147 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1148 as a non-capturing bracket. */
1149
1150 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1151 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1152
1153 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1154
1155 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1156 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1157
1158 /* Non-capturing possessive bracket with unlimited repeat. We come here
1159 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1160 without the capturing complication. It is written out separately for speed
1161 and cleanliness. */
1162
1163 case OP_BRAPOS:
1164 case OP_SBRAPOS:
1165 allow_zero = FALSE;
1166
1167 POSSESSIVE_NON_CAPTURE:
1168 matched_once = FALSE;
1169 code_offset = (int)(ecode - md->start_code);
1170
1171 for (;;)
1172 {
1173 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1174 RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1175 eptrb, RM48);
1176 if (rrc == MATCH_KETRPOS)
1177 {
1178 offset_top = md->end_offset_top;
1179 eptr = md->end_match_ptr;
1180 ecode = md->start_code + code_offset;
1181 matched_once = TRUE;
1182 continue;
1183 }
1184
1185 /* See comment in the code for capturing groups above about handling
1186 THEN. */
1187
1188 if (rrc == MATCH_THEN)
1189 {
1190 next = ecode + GET(ecode,1);
1191 if (md->start_match_ptr < next &&
1192 (*ecode == OP_ALT || *next == OP_ALT))
1193 rrc = MATCH_NOMATCH;
1194 }
1195
1196 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197 ecode += GET(ecode, 1);
1198 if (*ecode != OP_ALT) break;
1199 }
1200
1201 if (matched_once || allow_zero)
1202 {
1203 ecode += 1 + LINK_SIZE;
1204 break;
1205 }
1206 RRETURN(MATCH_NOMATCH);
1207
1208 /* Control never reaches here. */
1209
1210 /* Conditional group: compilation checked that there are no more than
1211 two branches. If the condition is false, skipping the first branch takes us
1212 past the end if there is only one branch, but that's OK because that is
1213 exactly what going to the ket would do. */
1214
1215 case OP_COND:
1216 case OP_SCOND:
1217 codelink = GET(ecode, 1);
1218
1219 /* Because of the way auto-callout works during compile, a callout item is
1220 inserted between OP_COND and an assertion condition. */
1221
1222 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1223 {
1224 if (PUBL(callout) != NULL)
1225 {
1226 pcre_callout_block cb;
1227 cb.version = 2; /* Version 2 of the callout block */
1228 cb.callout_number = ecode[LINK_SIZE+2];
1229 cb.offset_vector = md->offset_vector;
1230 cb.subject = (PCRE_SPTR)md->start_subject;
1231 cb.subject_length = (int)(md->end_subject - md->start_subject);
1232 cb.start_match = (int)(mstart - md->start_subject);
1233 cb.current_position = (int)(eptr - md->start_subject);
1234 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1235 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1236 cb.capture_top = offset_top/2;
1237 cb.capture_last = md->capture_last;
1238 cb.callout_data = md->callout_data;
1239 cb.mark = md->nomatch_mark;
1240 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1241 if (rrc < 0) RRETURN(rrc);
1242 }
1243 ecode += PRIV(OP_lengths)[OP_CALLOUT];
1244 }
1245
1246 condcode = ecode[LINK_SIZE+1];
1247
1248 /* Now see what the actual condition is */
1249
1250 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1251 {
1252 if (md->recursive == NULL) /* Not recursing => FALSE */
1253 {
1254 condition = FALSE;
1255 ecode += GET(ecode, 1);
1256 }
1257 else
1258 {
1259 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1260 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1261
1262 /* If the test is for recursion into a specific subpattern, and it is
1263 false, but the test was set up by name, scan the table to see if the
1264 name refers to any other numbers, and test them. The condition is true
1265 if any one is set. */
1266
1267 if (!condition && condcode == OP_NRREF)
1268 {
1269 pcre_uchar *slotA = md->name_table;
1270 for (i = 0; i < md->name_count; i++)
1271 {
1272 if (GET2(slotA, 0) == recno) break;
1273 slotA += md->name_entry_size;
1274 }
1275
1276 /* Found a name for the number - there can be only one; duplicate
1277 names for different numbers are allowed, but not vice versa. First
1278 scan down for duplicates. */
1279
1280 if (i < md->name_count)
1281 {
1282 pcre_uchar *slotB = slotA;
1283 while (slotB > md->name_table)
1284 {
1285 slotB -= md->name_entry_size;
1286 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1287 {
1288 condition = GET2(slotB, 0) == md->recursive->group_num;
1289 if (condition) break;
1290 }
1291 else break;
1292 }
1293
1294 /* Scan up for duplicates */
1295
1296 if (!condition)
1297 {
1298 slotB = slotA;
1299 for (i++; i < md->name_count; i++)
1300 {
1301 slotB += md->name_entry_size;
1302 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1303 {
1304 condition = GET2(slotB, 0) == md->recursive->group_num;
1305 if (condition) break;
1306 }
1307 else break;
1308 }
1309 }
1310 }
1311 }
1312
1313 /* Choose branch according to the condition */
1314
1315 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1316 }
1317 }
1318
1319 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1320 {
1321 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1322 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1323
1324 /* If the numbered capture is unset, but the reference was by name,
1325 scan the table to see if the name refers to any other numbers, and test
1326 them. The condition is true if any one is set. This is tediously similar
1327 to the code above, but not close enough to try to amalgamate. */
1328
1329 if (!condition && condcode == OP_NCREF)
1330 {
1331 int refno = offset >> 1;
1332 pcre_uchar *slotA = md->name_table;
1333
1334 for (i = 0; i < md->name_count; i++)
1335 {
1336 if (GET2(slotA, 0) == refno) break;
1337 slotA += md->name_entry_size;
1338 }
1339
1340 /* Found a name for the number - there can be only one; duplicate names
1341 for different numbers are allowed, but not vice versa. First scan down
1342 for duplicates. */
1343
1344 if (i < md->name_count)
1345 {
1346 pcre_uchar *slotB = slotA;
1347 while (slotB > md->name_table)
1348 {
1349 slotB -= md->name_entry_size;
1350 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1351 {
1352 offset = GET2(slotB, 0) << 1;
1353 condition = offset < offset_top &&
1354 md->offset_vector[offset] >= 0;
1355 if (condition) break;
1356 }
1357 else break;
1358 }
1359
1360 /* Scan up for duplicates */
1361
1362 if (!condition)
1363 {
1364 slotB = slotA;
1365 for (i++; i < md->name_count; i++)
1366 {
1367 slotB += md->name_entry_size;
1368 if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1369 {
1370 offset = GET2(slotB, 0) << 1;
1371 condition = offset < offset_top &&
1372 md->offset_vector[offset] >= 0;
1373 if (condition) break;
1374 }
1375 else break;
1376 }
1377 }
1378 }
1379 }
1380
1381 /* Choose branch according to the condition */
1382
1383 ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1384 }
1385
1386 else if (condcode == OP_DEF) /* DEFINE - always false */
1387 {
1388 condition = FALSE;
1389 ecode += GET(ecode, 1);
1390 }
1391
1392 /* The condition is an assertion. Call match() to evaluate it - setting
1393 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1394 an assertion. */
1395
1396 else
1397 {
1398 md->match_function_type = MATCH_CONDASSERT;
1399 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1400 if (rrc == MATCH_MATCH)
1401 {
1402 if (md->end_offset_top > offset_top)
1403 offset_top = md->end_offset_top; /* Captures may have happened */
1404 condition = TRUE;
1405 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1406 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1407 }
1408
1409 /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1410 assertion; it is therefore treated as NOMATCH. */
1411
1412 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1413 {
1414 RRETURN(rrc); /* Need braces because of following else */
1415 }
1416 else
1417 {
1418 condition = FALSE;
1419 ecode += codelink;
1420 }
1421 }
1422
1423 /* We are now at the branch that is to be obeyed. As there is only one, can
1424 use tail recursion to avoid using another stack frame, except when there is
1425 unlimited repeat of a possibly empty group. In the latter case, a recursive
1426 call to match() is always required, unless the second alternative doesn't
1427 exist, in which case we can just plough on. Note that, for compatibility
1428 with Perl, the | in a conditional group is NOT treated as creating two
1429 alternatives. If a THEN is encountered in the branch, it propagates out to
1430 the enclosing alternative (unless nested in a deeper set of alternatives,
1431 of course). */
1432
1433 if (condition || *ecode == OP_ALT)
1434 {
1435 if (op != OP_SCOND)
1436 {
1437 ecode += 1 + LINK_SIZE;
1438 goto TAIL_RECURSE;
1439 }
1440
1441 md->match_function_type = MATCH_CBEGROUP;
1442 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1443 RRETURN(rrc);
1444 }
1445
1446 /* Condition false & no alternative; continue after the group. */
1447
1448 else
1449 {
1450 ecode += 1 + LINK_SIZE;
1451 }
1452 break;
1453
1454
1455 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1456 to close any currently open capturing brackets. */
1457
1458 case OP_CLOSE:
1459 number = GET2(ecode, 1);
1460 offset = number << 1;
1461
1462 #ifdef PCRE_DEBUG
1463 printf("end bracket %d at *ACCEPT", number);
1464 printf("\n");
1465 #endif
1466
1467 md->capture_last = number;
1468 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1469 {
1470 md->offset_vector[offset] =
1471 md->offset_vector[md->offset_end - number];
1472 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1473 if (offset_top <= offset) offset_top = offset + 2;
1474 }
1475 ecode += 1 + IMM2_SIZE;
1476 break;
1477
1478
1479 /* End of the pattern, either real or forced. */
1480
1481 case OP_END:
1482 case OP_ACCEPT:
1483 case OP_ASSERT_ACCEPT:
1484
1485 /* If we have matched an empty string, fail if not in an assertion and not
1486 in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1487 is set and we have matched at the start of the subject. In both cases,
1488 backtracking will then try other alternatives, if any. */
1489
1490 if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1491 md->recursive == NULL &&
1492 (md->notempty ||
1493 (md->notempty_atstart &&
1494 mstart == md->start_subject + md->start_offset)))
1495 RRETURN(MATCH_NOMATCH);
1496
1497 /* Otherwise, we have a match. */
1498
1499 md->end_match_ptr = eptr; /* Record where we ended */
1500 md->end_offset_top = offset_top; /* and how many extracts were taken */
1501 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1502
1503 /* For some reason, the macros don't work properly if an expression is
1504 given as the argument to RRETURN when the heap is in use. */
1505
1506 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1507 RRETURN(rrc);
1508
1509 /* Assertion brackets. Check the alternative branches in turn - the
1510 matching won't pass the KET for an assertion. If any one branch matches,
1511 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1512 start of each branch to move the current point backwards, so the code at
1513 this level is identical to the lookahead case. When the assertion is part
1514 of a condition, we want to return immediately afterwards. The caller of
1515 this incarnation of the match() function will have set MATCH_CONDASSERT in
1516 md->match_function_type, and one of these opcodes will be the first opcode
1517 that is processed. We use a local variable that is preserved over calls to
1518 match() to remember this case. */
1519
1520 case OP_ASSERT:
1521 case OP_ASSERTBACK:
1522 if (md->match_function_type == MATCH_CONDASSERT)
1523 {
1524 condassert = TRUE;
1525 md->match_function_type = 0;
1526 }
1527 else condassert = FALSE;
1528
1529 do
1530 {
1531 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1532 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1533 {
1534 mstart = md->start_match_ptr; /* In case \K reset it */
1535 break;
1536 }
1537
1538 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1539 as NOMATCH. */
1540
1541 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1542 ecode += GET(ecode, 1);
1543 }
1544 while (*ecode == OP_ALT);
1545
1546 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1547
1548 /* If checking an assertion for a condition, return MATCH_MATCH. */
1549
1550 if (condassert) RRETURN(MATCH_MATCH);
1551
1552 /* Continue from after the assertion, updating the offsets high water
1553 mark, since extracts may have been taken during the assertion. */
1554
1555 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1556 ecode += 1 + LINK_SIZE;
1557 offset_top = md->end_offset_top;
1558 continue;
1559
1560 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1561 PRUNE, or COMMIT means we must assume failure without checking subsequent
1562 branches. */
1563
1564 case OP_ASSERT_NOT:
1565 case OP_ASSERTBACK_NOT:
1566 if (md->match_function_type == MATCH_CONDASSERT)
1567 {
1568 condassert = TRUE;
1569 md->match_function_type = 0;
1570 }
1571 else condassert = FALSE;
1572
1573 do
1574 {
1575 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1576 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1577 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1578 {
1579 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1580 break;
1581 }
1582
1583 /* PCRE does not allow THEN to escape beyond an assertion; it is treated
1584 as NOMATCH. */
1585
1586 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1587 ecode += GET(ecode,1);
1588 }
1589 while (*ecode == OP_ALT);
1590
1591 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1592
1593 ecode += 1 + LINK_SIZE;
1594 continue;
1595
1596 /* Move the subject pointer back. This occurs only at the start of
1597 each branch of a lookbehind assertion. If we are too close to the start to
1598 move back, this match function fails. When working with UTF-8 we move
1599 back a number of characters, not bytes. */
1600
1601 case OP_REVERSE:
1602 #ifdef SUPPORT_UTF
1603 if (utf)
1604 {
1605 i = GET(ecode, 1);
1606 while (i-- > 0)
1607 {
1608 eptr--;
1609 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1610 BACKCHAR(eptr);
1611 }
1612 }
1613 else
1614 #endif
1615
1616 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1617
1618 {
1619 eptr -= GET(ecode, 1);
1620 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1621 }
1622
1623 /* Save the earliest consulted character, then skip to next op code */
1624
1625 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1626 ecode += 1 + LINK_SIZE;
1627 break;
1628
1629 /* The callout item calls an external function, if one is provided, passing
1630 details of the match so far. This is mainly for debugging, though the
1631 function is able to force a failure. */
1632
1633 case OP_CALLOUT:
1634 if (PUBL(callout) != NULL)
1635 {
1636 pcre_callout_block cb;
1637 cb.version = 2; /* Version 2 of the callout block */
1638 cb.callout_number = ecode[1];
1639 cb.offset_vector = md->offset_vector;
1640 cb.subject = (PCRE_SPTR)md->start_subject;
1641 cb.subject_length = (int)(md->end_subject - md->start_subject);
1642 cb.start_match = (int)(mstart - md->start_subject);
1643 cb.current_position = (int)(eptr - md->start_subject);
1644 cb.pattern_position = GET(ecode, 2);
1645 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1646 cb.capture_top = offset_top/2;
1647 cb.capture_last = md->capture_last;
1648 cb.callout_data = md->callout_data;
1649 cb.mark = md->nomatch_mark;
1650 if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1651 if (rrc < 0) RRETURN(rrc);
1652 }
1653 ecode += 2 + 2*LINK_SIZE;
1654 break;
1655
1656 /* Recursion either matches the current regex, or some subexpression. The
1657 offset data is the offset to the starting bracket from the start of the
1658 whole pattern. (This is so that it works from duplicated subpatterns.)
1659
1660 The state of the capturing groups is preserved over recursion, and
1661 re-instated afterwards. We don't know how many are started and not yet
1662 finished (offset_top records the completed total) so we just have to save
1663 all the potential data. There may be up to 65535 such values, which is too
1664 large to put on the stack, but using malloc for small numbers seems
1665 expensive. As a compromise, the stack is used when there are no more than
1666 REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1667
1668 There are also other values that have to be saved. We use a chained
1669 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1670 for the original version of this logic. It has, however, been hacked around
1671 a lot, so he is not to blame for the current way it works. */
1672
1673 case OP_RECURSE:
1674 {
1675 recursion_info *ri;
1676 int recno;
1677
1678 callpat = md->start_code + GET(ecode, 1);
1679 recno = (callpat == md->start_code)? 0 :
1680 GET2(callpat, 1 + LINK_SIZE);
1681
1682 /* Check for repeating a recursion without advancing the subject pointer.
1683 This should catch convoluted mutual recursions. (Some simple cases are
1684 caught at compile time.) */
1685
1686 for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1687 if (recno == ri->group_num && eptr == ri->subject_position)
1688 RRETURN(PCRE_ERROR_RECURSELOOP);
1689
1690 /* Add to "recursing stack" */
1691
1692 new_recursive.group_num = recno;
1693 new_recursive.subject_position = eptr;
1694 new_recursive.prevrec = md->recursive;
1695 md->recursive = &new_recursive;
1696
1697 /* Where to continue from afterwards */
1698
1699 ecode += 1 + LINK_SIZE;
1700
1701 /* Now save the offset data */
1702
1703 new_recursive.saved_max = md->offset_end;
1704 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1705 new_recursive.offset_save = stacksave;
1706 else
1707 {
1708 new_recursive.offset_save =
1709 (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1710 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1711 }
1712 memcpy(new_recursive.offset_save, md->offset_vector,
1713 new_recursive.saved_max * sizeof(int));
1714
1715 /* OK, now we can do the recursion. After processing each alternative,
1716 restore the offset data. If there were nested recursions, md->recursive
1717 might be changed, so reset it before looping. */
1718
1719 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1720 cbegroup = (*callpat >= OP_SBRA);
1721 do
1722 {
1723 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1724 RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1725 md, eptrb, RM6);
1726 memcpy(md->offset_vector, new_recursive.offset_save,
1727 new_recursive.saved_max * sizeof(int));
1728 md->recursive = new_recursive.prevrec;
1729 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1730 {
1731 DPRINTF(("Recursion matched\n"));
1732 if (new_recursive.offset_save != stacksave)
1733 (PUBL(free))(new_recursive.offset_save);
1734
1735 /* Set where we got to in the subject, and reset the start in case
1736 it was changed by \K. This *is* propagated back out of a recursion,
1737 for Perl compatibility. */
1738
1739 eptr = md->end_match_ptr;
1740 mstart = md->start_match_ptr;
1741 goto RECURSION_MATCHED; /* Exit loop; end processing */
1742 }
1743
1744 /* PCRE does not allow THEN to escape beyond a recursion; it is treated
1745 as NOMATCH. */
1746
1747 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1748 {
1749 DPRINTF(("Recursion gave error %d\n", rrc));
1750 if (new_recursive.offset_save != stacksave)
1751 (PUBL(free))(new_recursive.offset_save);
1752 RRETURN(rrc);
1753 }
1754
1755 md->recursive = &new_recursive;
1756 callpat += GET(callpat, 1);
1757 }
1758 while (*callpat == OP_ALT);
1759
1760 DPRINTF(("Recursion didn't match\n"));
1761 md->recursive = new_recursive.prevrec;
1762 if (new_recursive.offset_save != stacksave)
1763 (PUBL(free))(new_recursive.offset_save);
1764 RRETURN(MATCH_NOMATCH);
1765 }
1766
1767 RECURSION_MATCHED:
1768 break;
1769
1770 /* An alternation is the end of a branch; scan along to find the end of the
1771 bracketed group and go to there. */
1772
1773 case OP_ALT:
1774 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1775 break;
1776
1777 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1778 indicating that it may occur zero times. It may repeat infinitely, or not
1779 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1780 with fixed upper repeat limits are compiled as a number of copies, with the
1781 optional ones preceded by BRAZERO or BRAMINZERO. */
1782
1783 case OP_BRAZERO:
1784 next = ecode + 1;
1785 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1786 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1787 do next += GET(next, 1); while (*next == OP_ALT);
1788 ecode = next + 1 + LINK_SIZE;
1789 break;
1790
1791 case OP_BRAMINZERO:
1792 next = ecode + 1;
1793 do next += GET(next, 1); while (*next == OP_ALT);
1794 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1795 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1796 ecode++;
1797 break;
1798
1799 case OP_SKIPZERO:
1800 next = ecode+1;
1801 do next += GET(next,1); while (*next == OP_ALT);
1802 ecode = next + 1 + LINK_SIZE;
1803 break;
1804
1805 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1806 here; just jump to the group, with allow_zero set TRUE. */
1807
1808 case OP_BRAPOSZERO:
1809 op = *(++ecode);
1810 allow_zero = TRUE;
1811 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1812 goto POSSESSIVE_NON_CAPTURE;
1813
1814 /* End of a group, repeated or non-repeating. */
1815
1816 case OP_KET:
1817 case OP_KETRMIN:
1818 case OP_KETRMAX:
1819 case OP_KETRPOS:
1820 prev = ecode - GET(ecode, 1);
1821
1822 /* If this was a group that remembered the subject start, in order to break
1823 infinite repeats of empty string matches, retrieve the subject start from
1824 the chain. Otherwise, set it NULL. */
1825
1826 if (*prev >= OP_SBRA || *prev == OP_ONCE)
1827 {
1828 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1829 eptrb = eptrb->epb_prev; /* Backup to previous group */
1830 }
1831 else saved_eptr = NULL;
1832
1833 /* If we are at the end of an assertion group or a non-capturing atomic
1834 group, stop matching and return MATCH_MATCH, but record the current high
1835 water mark for use by positive assertions. We also need to record the match
1836 start in case it was changed by \K. */
1837
1838 if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1839 *prev == OP_ONCE_NC)
1840 {
1841 md->end_match_ptr = eptr; /* For ONCE_NC */
1842 md->end_offset_top = offset_top;
1843 md->start_match_ptr = mstart;
1844 RRETURN(MATCH_MATCH); /* Sets md->mark */
1845 }
1846
1847 /* For capturing groups we have to check the group number back at the start
1848 and if necessary complete handling an extraction by setting the offsets and
1849 bumping the high water mark. Whole-pattern recursion is coded as a recurse
1850 into group 0, so it won't be picked up here. Instead, we catch it when the
1851 OP_END is reached. Other recursion is handled here. We just have to record
1852 the current subject position and start match pointer and give a MATCH
1853 return. */
1854
1855 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1856 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1857 {
1858 number = GET2(prev, 1+LINK_SIZE);
1859 offset = number << 1;
1860
1861 #ifdef PCRE_DEBUG
1862 printf("end bracket %d", number);
1863 printf("\n");
1864 #endif
1865
1866 /* Handle a recursively called group. */
1867
1868 if (md->recursive != NULL && md->recursive->group_num == number)
1869 {
1870 md->end_match_ptr = eptr;
1871 md->start_match_ptr = mstart;
1872 RRETURN(MATCH_MATCH);
1873 }
1874
1875 /* Deal with capturing */
1876
1877 md->capture_last = number;
1878 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1879 {
1880 /* If offset is greater than offset_top, it means that we are
1881 "skipping" a capturing group, and that group's offsets must be marked
1882 unset. In earlier versions of PCRE, all the offsets were unset at the
1883 start of matching, but this doesn't work because atomic groups and
1884 assertions can cause a value to be set that should later be unset.
1885 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1886 part of the atomic group, but this is not on the final matching path,
1887 so must be unset when 2 is set. (If there is no group 2, there is no
1888 problem, because offset_top will then be 2, indicating no capture.) */
1889
1890 if (offset > offset_top)
1891 {
1892 register int *iptr = md->offset_vector + offset_top;
1893 register int *iend = md->offset_vector + offset;
1894 while (iptr < iend) *iptr++ = -1;
1895 }
1896
1897 /* Now make the extraction */
1898
1899 md->offset_vector[offset] =
1900 md->offset_vector[md->offset_end - number];
1901 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1902 if (offset_top <= offset) offset_top = offset + 2;
1903 }
1904 }
1905
1906 /* For an ordinary non-repeating ket, just continue at this level. This
1907 also happens for a repeating ket if no characters were matched in the
1908 group. This is the forcible breaking of infinite loops as implemented in
1909 Perl 5.005. For a non-repeating atomic group that includes captures,
1910 establish a backup point by processing the rest of the pattern at a lower
1911 level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
1912 original OP_ONCE level, thereby bypassing intermediate backup points, but
1913 resetting any captures that happened along the way. */
1914
1915 if (*ecode == OP_KET || eptr == saved_eptr)
1916 {
1917 if (*prev == OP_ONCE)
1918 {
1919 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1920 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1921 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1922 RRETURN(MATCH_ONCE);
1923 }
1924 ecode += 1 + LINK_SIZE; /* Carry on at this level */
1925 break;
1926 }
1927
1928 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1929 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1930 at a time from the outer level, thus saving stack. */
1931
1932 if (*ecode == OP_KETRPOS)
1933 {
1934 md->end_match_ptr = eptr;
1935 md->end_offset_top = offset_top;
1936 RRETURN(MATCH_KETRPOS);
1937 }
1938
1939 /* The normal repeating kets try the rest of the pattern or restart from
1940 the preceding bracket, in the appropriate order. In the second case, we can
1941 use tail recursion to avoid using another stack frame, unless we have an
1942 atomic group or an unlimited repeat of a group that can match an empty
1943 string. */
1944
1945 if (*ecode == OP_KETRMIN)
1946 {
1947 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1948 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1949 if (*prev == OP_ONCE)
1950 {
1951 RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
1952 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1953 md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
1954 RRETURN(MATCH_ONCE);
1955 }
1956 if (*prev >= OP_SBRA) /* Could match an empty string */
1957 {
1958 md->match_function_type = MATCH_CBEGROUP;
1959 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1960 RRETURN(rrc);
1961 }
1962 ecode = prev;
1963 goto TAIL_RECURSE;
1964 }
1965 else /* OP_KETRMAX */
1966 {
1967 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1968 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1969 if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1971 if (*prev == OP_ONCE)
1972 {
1973 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
1974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1975 md->once_target = prev;
1976 RRETURN(MATCH_ONCE);
1977 }
1978 ecode += 1 + LINK_SIZE;
1979 goto TAIL_RECURSE;
1980 }
1981 /* Control never gets here */
1982
1983 /* Not multiline mode: start of subject assertion, unless notbol. */
1984
1985 case OP_CIRC:
1986 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1987
1988 /* Start of subject assertion */
1989
1990 case OP_SOD:
1991 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1992 ecode++;
1993 break;
1994
1995 /* Multiline mode: start of subject unless notbol, or after any newline. */
1996
1997 case OP_CIRCM:
1998 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1999 if (eptr != md->start_subject &&
2000 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2001 RRETURN(MATCH_NOMATCH);
2002 ecode++;
2003 break;
2004
2005 /* Start of match assertion */
2006
2007 case OP_SOM:
2008 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
2009 ecode++;
2010 break;
2011
2012 /* Reset the start of match point */
2013
2014 case OP_SET_SOM:
2015 mstart = eptr;
2016 ecode++;
2017 break;
2018
2019 /* Multiline mode: assert before any newline, or before end of subject
2020 unless noteol is set. */
2021
2022 case OP_DOLLM:
2023 if (eptr < md->end_subject)
2024 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
2025 else
2026 {
2027 if (md->noteol) RRETURN(MATCH_NOMATCH);
2028 SCHECK_PARTIAL();
2029 }
2030 ecode++;
2031 break;
2032
2033 /* Not multiline mode: assert before a terminating newline or before end of
2034 subject unless noteol is set. */
2035
2036 case OP_DOLL:
2037 if (md->noteol) RRETURN(MATCH_NOMATCH);
2038 if (!md->endonly) goto ASSERT_NL_OR_EOS;
2039
2040 /* ... else fall through for endonly */
2041
2042 /* End of subject assertion (\z) */
2043
2044 case OP_EOD:
2045 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
2046 SCHECK_PARTIAL();
2047 ecode++;
2048 break;
2049
2050 /* End of subject or ending \n assertion (\Z) */
2051
2052 case OP_EODN:
2053 ASSERT_NL_OR_EOS:
2054 if (eptr < md->end_subject &&
2055 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
2056 RRETURN(MATCH_NOMATCH);
2057
2058 /* Either at end of string or \n before end. */
2059
2060 SCHECK_PARTIAL();
2061 ecode++;
2062 break;
2063
2064 /* Word boundary assertions */
2065
2066 case OP_NOT_WORD_BOUNDARY:
2067 case OP_WORD_BOUNDARY:
2068 {
2069
2070 /* Find out if the previous and current characters are "word" characters.
2071 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
2072 be "non-word" characters. Remember the earliest consulted character for
2073 partial matching. */
2074
2075 #ifdef SUPPORT_UTF
2076 if (utf)
2077 {
2078 /* Get status of previous character */
2079
2080 if (eptr == md->start_subject) prev_is_word = FALSE; else
2081 {
2082 PCRE_PUCHAR lastptr = eptr - 1;
2083 BACKCHAR(lastptr);
2084 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
2085 GETCHAR(c, lastptr);
2086 #ifdef SUPPORT_UCP
2087 if (md->use_ucp)
2088 {
2089 if (c == '_') prev_is_word = TRUE; else
2090 {
2091 int cat = UCD_CATEGORY(c);
2092 prev_is_word = (cat == ucp_L || cat == ucp_N);
2093 }
2094 }
2095 else
2096 #endif
2097 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2098 }
2099
2100 /* Get status of next character */
2101
2102 if (eptr >= md->end_subject)
2103 {
2104 SCHECK_PARTIAL();
2105 cur_is_word = FALSE;
2106 }
2107 else
2108 {
2109 GETCHAR(c, eptr);
2110 #ifdef SUPPORT_UCP
2111 if (md->use_ucp)
2112 {
2113 if (c == '_') cur_is_word = TRUE; else
2114 {
2115 int cat = UCD_CATEGORY(c);
2116 cur_is_word = (cat == ucp_L || cat == ucp_N);
2117 }
2118 }
2119 else
2120 #endif
2121 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2122 }
2123 }
2124 else
2125 #endif
2126
2127 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2128 consistency with the behaviour of \w we do use it in this case. */
2129
2130 {
2131 /* Get status of previous character */
2132
2133 if (eptr == md->start_subject) prev_is_word = FALSE; else
2134 {
2135 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2136 #ifdef SUPPORT_UCP
2137 if (md->use_ucp)
2138 {
2139 c = eptr[-1];
2140 if (c == '_') prev_is_word = TRUE; else
2141 {
2142 int cat = UCD_CATEGORY(c);
2143 prev_is_word = (cat == ucp_L || cat == ucp_N);
2144 }
2145 }
2146 else
2147 #endif
2148 prev_is_word = MAX_255(eptr[-1])
2149 && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2150 }
2151
2152 /* Get status of next character */
2153
2154 if (eptr >= md->end_subject)
2155 {
2156 SCHECK_PARTIAL();
2157 cur_is_word = FALSE;
2158 }
2159 else
2160 #ifdef SUPPORT_UCP
2161 if (md->use_ucp)
2162 {
2163 c = *eptr;
2164 if (c == '_') cur_is_word = TRUE; else
2165 {
2166 int cat = UCD_CATEGORY(c);
2167 cur_is_word = (cat == ucp_L || cat == ucp_N);
2168 }
2169 }
2170 else
2171 #endif
2172 cur_is_word = MAX_255(*eptr)
2173 && ((md->ctypes[*eptr] & ctype_word) != 0);
2174 }
2175
2176 /* Now see if the situation is what we want */
2177
2178 if ((*ecode++ == OP_WORD_BOUNDARY)?
2179 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2180 RRETURN(MATCH_NOMATCH);
2181 }
2182 break;
2183
2184 /* Match a single character type; inline for speed */
2185
2186 case OP_ANY:
2187 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2188 /* Fall through */
2189
2190 case OP_ALLANY:
2191 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2192 { /* not be updated before SCHECK_PARTIAL. */
2193 SCHECK_PARTIAL();
2194 RRETURN(MATCH_NOMATCH);
2195 }
2196 eptr++;
2197 #ifdef SUPPORT_UTF
2198 if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2199 #endif
2200 ecode++;
2201 break;
2202
2203 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2204 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2205
2206 case OP_ANYBYTE:
2207 if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2208 { /* not be updated before SCHECK_PARTIAL. */
2209 SCHECK_PARTIAL();
2210 RRETURN(MATCH_NOMATCH);
2211 }
2212 eptr++;
2213 ecode++;
2214 break;
2215
2216 case OP_NOT_DIGIT:
2217 if (eptr >= md->end_subject)
2218 {
2219 SCHECK_PARTIAL();
2220 RRETURN(MATCH_NOMATCH);
2221 }
2222 GETCHARINCTEST(c, eptr);
2223 if (
2224 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2225 c < 256 &&
2226 #endif
2227 (md->ctypes[c] & ctype_digit) != 0
2228 )
2229 RRETURN(MATCH_NOMATCH);
2230 ecode++;
2231 break;
2232
2233 case OP_DIGIT:
2234 if (eptr >= md->end_subject)
2235 {
2236 SCHECK_PARTIAL();
2237 RRETURN(MATCH_NOMATCH);
2238 }
2239 GETCHARINCTEST(c, eptr);
2240 if (
2241 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2242 c > 255 ||
2243 #endif
2244 (md->ctypes[c] & ctype_digit) == 0
2245 )
2246 RRETURN(MATCH_NOMATCH);
2247 ecode++;
2248 break;
2249
2250 case OP_NOT_WHITESPACE:
2251 if (eptr >= md->end_subject)
2252 {
2253 SCHECK_PARTIAL();
2254 RRETURN(MATCH_NOMATCH);
2255 }
2256 GETCHARINCTEST(c, eptr);
2257 if (
2258 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2259 c < 256 &&
2260 #endif
2261 (md->ctypes[c] & ctype_space) != 0
2262 )
2263 RRETURN(MATCH_NOMATCH);
2264 ecode++;
2265 break;
2266
2267 case OP_WHITESPACE:
2268 if (eptr >= md->end_subject)
2269 {
2270 SCHECK_PARTIAL();
2271 RRETURN(MATCH_NOMATCH);
2272 }
2273 GETCHARINCTEST(c, eptr);
2274 if (
2275 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2276 c > 255 ||
2277 #endif
2278 (md->ctypes[c] & ctype_space) == 0
2279 )
2280 RRETURN(MATCH_NOMATCH);
2281 ecode++;
2282 break;
2283
2284 case OP_NOT_WORDCHAR:
2285 if (eptr >= md->end_subject)
2286 {
2287 SCHECK_PARTIAL();
2288 RRETURN(MATCH_NOMATCH);
2289 }
2290 GETCHARINCTEST(c, eptr);
2291 if (
2292 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2293 c < 256 &&
2294 #endif
2295 (md->ctypes[c] & ctype_word) != 0
2296 )
2297 RRETURN(MATCH_NOMATCH);
2298 ecode++;
2299 break;
2300
2301 case OP_WORDCHAR:
2302 if (eptr >= md->end_subject)
2303 {
2304 SCHECK_PARTIAL();
2305 RRETURN(MATCH_NOMATCH);
2306 }
2307 GETCHARINCTEST(c, eptr);
2308 if (
2309 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2310 c > 255 ||
2311 #endif
2312 (md->ctypes[c] & ctype_word) == 0
2313 )
2314 RRETURN(MATCH_NOMATCH);
2315 ecode++;
2316 break;
2317
2318 case OP_ANYNL:
2319 if (eptr >= md->end_subject)
2320 {
2321 SCHECK_PARTIAL();
2322 RRETURN(MATCH_NOMATCH);
2323 }
2324 GETCHARINCTEST(c, eptr);
2325 switch(c)
2326 {
2327 default: RRETURN(MATCH_NOMATCH);
2328
2329 case 0x000d:
2330 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2331 break;
2332
2333 case 0x000a:
2334 break;
2335
2336 case 0x000b:
2337 case 0x000c:
2338 case 0x0085:
2339 case 0x2028:
2340 case 0x2029:
2341 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2342 break;
2343 }
2344 ecode++;
2345 break;
2346
2347 case OP_NOT_HSPACE:
2348 if (eptr >= md->end_subject)
2349 {
2350 SCHECK_PARTIAL();
2351 RRETURN(MATCH_NOMATCH);
2352 }
2353 GETCHARINCTEST(c, eptr);
2354 switch(c)
2355 {
2356 default: break;
2357 case 0x09: /* HT */
2358 case 0x20: /* SPACE */
2359 case 0xa0: /* NBSP */
2360 case 0x1680: /* OGHAM SPACE MARK */
2361 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2362 case 0x2000: /* EN QUAD */
2363 case 0x2001: /* EM QUAD */
2364 case 0x2002: /* EN SPACE */
2365 case 0x2003: /* EM SPACE */
2366 case 0x2004: /* THREE-PER-EM SPACE */
2367 case 0x2005: /* FOUR-PER-EM SPACE */
2368 case 0x2006: /* SIX-PER-EM SPACE */
2369 case 0x2007: /* FIGURE SPACE */
2370 case 0x2008: /* PUNCTUATION SPACE */
2371 case 0x2009: /* THIN SPACE */
2372 case 0x200A: /* HAIR SPACE */
2373 case 0x202f: /* NARROW NO-BREAK SPACE */
2374 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2375 case 0x3000: /* IDEOGRAPHIC SPACE */
2376 RRETURN(MATCH_NOMATCH);
2377 }
2378 ecode++;
2379 break;
2380
2381 case OP_HSPACE:
2382 if (eptr >= md->end_subject)
2383 {
2384 SCHECK_PARTIAL();
2385 RRETURN(MATCH_NOMATCH);
2386 }
2387 GETCHARINCTEST(c, eptr);
2388 switch(c)
2389 {
2390 default: RRETURN(MATCH_NOMATCH);
2391 case 0x09: /* HT */
2392 case 0x20: /* SPACE */
2393 case 0xa0: /* NBSP */
2394 case 0x1680: /* OGHAM SPACE MARK */
2395 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2396 case 0x2000: /* EN QUAD */
2397 case 0x2001: /* EM QUAD */
2398 case 0x2002: /* EN SPACE */
2399 case 0x2003: /* EM SPACE */
2400 case 0x2004: /* THREE-PER-EM SPACE */
2401 case 0x2005: /* FOUR-PER-EM SPACE */
2402 case 0x2006: /* SIX-PER-EM SPACE */
2403 case 0x2007: /* FIGURE SPACE */
2404 case 0x2008: /* PUNCTUATION SPACE */
2405 case 0x2009: /* THIN SPACE */
2406 case 0x200A: /* HAIR SPACE */
2407 case 0x202f: /* NARROW NO-BREAK SPACE */
2408 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2409 case 0x3000: /* IDEOGRAPHIC SPACE */
2410 break;
2411 }
2412 ecode++;
2413 break;
2414
2415 case OP_NOT_VSPACE:
2416 if (eptr >= md->end_subject)
2417 {
2418 SCHECK_PARTIAL();
2419 RRETURN(MATCH_NOMATCH);
2420 }
2421 GETCHARINCTEST(c, eptr);
2422 switch(c)
2423 {
2424 default: break;
2425 case 0x0a: /* LF */
2426 case 0x0b: /* VT */
2427 case 0x0c: /* FF */
2428 case 0x0d: /* CR */
2429 case 0x85: /* NEL */
2430 case 0x2028: /* LINE SEPARATOR */
2431 case 0x2029: /* PARAGRAPH SEPARATOR */
2432 RRETURN(MATCH_NOMATCH);
2433 }
2434 ecode++;
2435 break;
2436
2437 case OP_VSPACE:
2438 if (eptr >= md->end_subject)
2439 {
2440 SCHECK_PARTIAL();
2441 RRETURN(MATCH_NOMATCH);
2442 }
2443 GETCHARINCTEST(c, eptr);
2444 switch(c)
2445 {
2446 default: RRETURN(MATCH_NOMATCH);
2447 case 0x0a: /* LF */
2448 case 0x0b: /* VT */
2449 case 0x0c: /* FF */
2450 case 0x0d: /* CR */
2451 case 0x85: /* NEL */
2452 case 0x2028: /* LINE SEPARATOR */
2453 case 0x2029: /* PARAGRAPH SEPARATOR */
2454 break;
2455 }
2456 ecode++;
2457 break;
2458
2459 #ifdef SUPPORT_UCP
2460 /* Check the next character by Unicode property. We will get here only
2461 if the support is in the binary; otherwise a compile-time error occurs. */
2462
2463 case OP_PROP:
2464 case OP_NOTPROP:
2465 if (eptr >= md->end_subject)
2466 {
2467 SCHECK_PARTIAL();
2468 RRETURN(MATCH_NOMATCH);
2469 }
2470 GETCHARINCTEST(c, eptr);
2471 {
2472 const ucd_record *prop = GET_UCD(c);
2473
2474 switch(ecode[1])
2475 {
2476 case PT_ANY:
2477 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2478 break;
2479
2480 case PT_LAMP:
2481 if ((prop->chartype == ucp_Lu ||
2482 prop->chartype == ucp_Ll ||
2483 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2484 RRETURN(MATCH_NOMATCH);
2485 break;
2486
2487 case PT_GC:
2488 if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2489 RRETURN(MATCH_NOMATCH);
2490 break;
2491
2492 case PT_PC:
2493 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2494 RRETURN(MATCH_NOMATCH);
2495 break;
2496
2497 case PT_SC:
2498 if ((ecode[2] != prop->script) == (op == OP_PROP))
2499 RRETURN(MATCH_NOMATCH);
2500 break;
2501
2502 /* These are specials */
2503
2504 case PT_ALNUM:
2505 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2506 PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2507 RRETURN(MATCH_NOMATCH);
2508 break;
2509
2510 case PT_SPACE: /* Perl space */
2511 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2512 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2513 == (op == OP_NOTPROP))
2514 RRETURN(MATCH_NOMATCH);
2515 break;
2516
2517 case PT_PXSPACE: /* POSIX space */
2518 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
2519 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2520 c == CHAR_FF || c == CHAR_CR)
2521 == (op == OP_NOTPROP))
2522 RRETURN(MATCH_NOMATCH);
2523 break;
2524
2525 case PT_WORD:
2526 if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2527 PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
2528 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2529 RRETURN(MATCH_NOMATCH);
2530 break;
2531
2532 /* This should never occur */
2533
2534 default:
2535 RRETURN(PCRE_ERROR_INTERNAL);
2536 }
2537
2538 ecode += 3;
2539 }
2540 break;
2541
2542 /* Match an extended Unicode sequence. We will get here only if the support
2543 is in the binary; otherwise a compile-time error occurs. */
2544
2545 case OP_EXTUNI:
2546 if (eptr >= md->end_subject)
2547 {
2548 SCHECK_PARTIAL();
2549 RRETURN(MATCH_NOMATCH);
2550 }
2551 GETCHARINCTEST(c, eptr);
2552 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
2553 while (eptr < md->end_subject)
2554 {
2555 int len = 1;
2556 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2557 if (UCD_CATEGORY(c) != ucp_M) break;
2558 eptr += len;
2559 }
2560 ecode++;
2561 break;
2562 #endif
2563
2564
2565 /* Match a back reference, possibly repeatedly. Look past the end of the
2566 item to see if there is repeat information following. The code is similar
2567 to that for character classes, but repeated for efficiency. Then obey
2568 similar code to character type repeats - written out again for speed.
2569 However, if the referenced string is the empty string, always treat
2570 it as matched, any number of times (otherwise there could be infinite
2571 loops). */
2572
2573 case OP_REF:
2574 case OP_REFI:
2575 caseless = op == OP_REFI;
2576 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2577 ecode += 1 + IMM2_SIZE;
2578
2579 /* If the reference is unset, there are two possibilities:
2580
2581 (a) In the default, Perl-compatible state, set the length negative;
2582 this ensures that every attempt at a match fails. We can't just fail
2583 here, because of the possibility of quantifiers with zero minima.
2584
2585 (b) If the JavaScript compatibility flag is set, set the length to zero
2586 so that the back reference matches an empty string.
2587
2588 Otherwise, set the length to the length of what was matched by the
2589 referenced subpattern. */
2590
2591 if (offset >= offset_top || md->offset_vector[offset] < 0)
2592 length = (md->jscript_compat)? 0 : -1;
2593 else
2594 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2595
2596 /* Set up for repetition, or handle the non-repeated case */
2597
2598 switch (*ecode)
2599 {
2600 case OP_CRSTAR:
2601 case OP_CRMINSTAR:
2602 case OP_CRPLUS:
2603 case OP_CRMINPLUS:
2604 case OP_CRQUERY:
2605 case OP_CRMINQUERY:
2606 c = *ecode++ - OP_CRSTAR;
2607 minimize = (c & 1) != 0;
2608 min = rep_min[c]; /* Pick up values from tables; */
2609 max = rep_max[c]; /* zero for max => infinity */
2610 if (max == 0) max = INT_MAX;
2611 break;
2612
2613 case OP_CRRANGE:
2614 case OP_CRMINRANGE:
2615 minimize = (*ecode == OP_CRMINRANGE);
2616 min = GET2(ecode, 1);
2617 max = GET2(ecode, 1 + IMM2_SIZE);
2618 if (max == 0) max = INT_MAX;
2619 ecode += 1 + 2 * IMM2_SIZE;
2620 break;
2621
2622 default: /* No repeat follows */
2623 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2624 {
2625 CHECK_PARTIAL();
2626 RRETURN(MATCH_NOMATCH);
2627 }
2628 eptr += length;
2629 continue; /* With the main loop */
2630 }
2631
2632 /* Handle repeated back references. If the length of the reference is
2633 zero, just continue with the main loop. If the length is negative, it
2634 means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2635 zero, we can continue at the same level without recursion. For any other
2636 minimum, carrying on will result in NOMATCH. */
2637
2638 if (length == 0) continue;
2639 if (length < 0 && min == 0) continue;
2640
2641 /* First, ensure the minimum number of matches are present. We get back
2642 the length of the reference string explicitly rather than passing the
2643 address of eptr, so that eptr can be a register variable. */
2644
2645 for (i = 1; i <= min; i++)
2646 {
2647 int slength;
2648 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2649 {
2650 CHECK_PARTIAL();
2651 RRETURN(MATCH_NOMATCH);
2652 }
2653 eptr += slength;
2654 }
2655
2656 /* If min = max, continue at the same level without recursion.
2657 They are not both allowed to be zero. */
2658
2659 if (min == max) continue;
2660
2661 /* If minimizing, keep trying and advancing the pointer */
2662
2663 if (minimize)
2664 {
2665 for (fi = min;; fi++)
2666 {
2667 int slength;
2668 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2670 if (fi >= max) RRETURN(MATCH_NOMATCH);
2671 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2672 {
2673 CHECK_PARTIAL();
2674 RRETURN(MATCH_NOMATCH);
2675 }
2676 eptr += slength;
2677 }
2678 /* Control never gets here */
2679 }
2680
2681 /* If maximizing, find the longest string and work backwards */
2682
2683 else
2684 {
2685 pp = eptr;
2686 for (i = min; i < max; i++)
2687 {
2688 int slength;
2689 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2690 {
2691 CHECK_PARTIAL();
2692 break;
2693 }
2694 eptr += slength;
2695 }
2696 while (eptr >= pp)
2697 {
2698 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2699 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2700 eptr -= length;
2701 }
2702 RRETURN(MATCH_NOMATCH);
2703 }
2704 /* Control never gets here */
2705
2706 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2707 used when all the characters in the class have values in the range 0-255,
2708 and either the matching is caseful, or the characters are in the range
2709 0-127 when UTF-8 processing is enabled. The only difference between
2710 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2711 encountered.
2712
2713 First, look past the end of the item to see if there is repeat information
2714 following. Then obey similar code to character type repeats - written out
2715 again for speed. */
2716
2717 case OP_NCLASS:
2718 case OP_CLASS:
2719 {
2720 /* The data variable is saved across frames, so the byte map needs to
2721 be stored there. */
2722 #define BYTE_MAP ((pcre_uint8 *)data)
2723 data = ecode + 1; /* Save for matching */
2724 ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
2725
2726 switch (*ecode)
2727 {
2728 case OP_CRSTAR:
2729 case OP_CRMINSTAR:
2730 case OP_CRPLUS:
2731 case OP_CRMINPLUS:
2732 case OP_CRQUERY:
2733 case OP_CRMINQUERY:
2734 c = *ecode++ - OP_CRSTAR;
2735 minimize = (c & 1) != 0;
2736 min = rep_min[c]; /* Pick up values from tables; */
2737 max = rep_max[c]; /* zero for max => infinity */
2738 if (max == 0) max = INT_MAX;
2739 break;
2740
2741 case OP_CRRANGE:
2742 case OP_CRMINRANGE:
2743 minimize = (*ecode == OP_CRMINRANGE);
2744 min = GET2(ecode, 1);
2745 max = GET2(ecode, 1 + IMM2_SIZE);
2746 if (max == 0) max = INT_MAX;
2747 ecode += 1 + 2 * IMM2_SIZE;
2748 break;
2749
2750 default: /* No repeat follows */
2751 min = max = 1;
2752 break;
2753 }
2754
2755 /* First, ensure the minimum number of matches are present. */
2756
2757 #ifdef SUPPORT_UTF
2758 if (utf)
2759 {
2760 for (i = 1; i <= min; i++)
2761 {
2762 if (eptr >= md->end_subject)
2763 {
2764 SCHECK_PARTIAL();
2765 RRETURN(MATCH_NOMATCH);
2766 }
2767 GETCHARINC(c, eptr);
2768 if (c > 255)
2769 {
2770 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2771 }
2772 else
2773 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2774 }
2775 }
2776 else
2777 #endif
2778 /* Not UTF mode */
2779 {
2780 for (i = 1; i <= min; i++)
2781 {
2782 if (eptr >= md->end_subject)
2783 {
2784 SCHECK_PARTIAL();
2785 RRETURN(MATCH_NOMATCH);
2786 }
2787 c = *eptr++;
2788 #ifndef COMPILE_PCRE8
2789 if (c > 255)
2790 {
2791 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2792 }
2793 else
2794 #endif
2795 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2796 }
2797 }
2798
2799 /* If max == min we can continue with the main loop without the
2800 need to recurse. */
2801
2802 if (min == max) continue;
2803
2804 /* If minimizing, keep testing the rest of the expression and advancing
2805 the pointer while it matches the class. */
2806
2807 if (minimize)
2808 {
2809 #ifdef SUPPORT_UTF
2810 if (utf)
2811 {
2812 for (fi = min;; fi++)
2813 {
2814 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2816 if (fi >= max) RRETURN(MATCH_NOMATCH);
2817 if (eptr >= md->end_subject)
2818 {
2819 SCHECK_PARTIAL();
2820 RRETURN(MATCH_NOMATCH);
2821 }
2822 GETCHARINC(c, eptr);
2823 if (c > 255)
2824 {
2825 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2826 }
2827 else
2828 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2829 }
2830 }
2831 else
2832 #endif
2833 /* Not UTF mode */
2834 {
2835 for (fi = min;; fi++)
2836 {
2837 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2838 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2839 if (fi >= max) RRETURN(MATCH_NOMATCH);
2840 if (eptr >= md->end_subject)
2841 {
2842 SCHECK_PARTIAL();
2843 RRETURN(MATCH_NOMATCH);
2844 }
2845 c = *eptr++;
2846 #ifndef COMPILE_PCRE8
2847 if (c > 255)
2848 {
2849 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2850 }
2851 else
2852 #endif
2853 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2854 }
2855 }
2856 /* Control never gets here */
2857 }
2858
2859 /* If maximizing, find the longest possible run, then work backwards. */
2860
2861 else
2862 {
2863 pp = eptr;
2864
2865 #ifdef SUPPORT_UTF
2866 if (utf)
2867 {
2868 for (i = min; i < max; i++)
2869 {
2870 int len = 1;
2871 if (eptr >= md->end_subject)
2872 {
2873 SCHECK_PARTIAL();
2874 break;
2875 }
2876 GETCHARLEN(c, eptr, len);
2877 if (c > 255)
2878 {
2879 if (op == OP_CLASS) break;
2880 }
2881 else
2882 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2883 eptr += len;
2884 }
2885 for (;;)
2886 {
2887 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2888 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2889 if (eptr-- == pp) break; /* Stop if tried at original pos */
2890 BACKCHAR(eptr);
2891 }
2892 }
2893 else
2894 #endif
2895 /* Not UTF mode */
2896 {
2897 for (i = min; i < max; i++)
2898 {
2899 if (eptr >= md->end_subject)
2900 {
2901 SCHECK_PARTIAL();
2902 break;
2903 }
2904 c = *eptr;
2905 #ifndef COMPILE_PCRE8
2906 if (c > 255)
2907 {
2908 if (op == OP_CLASS) break;
2909 }
2910 else
2911 #endif
2912 if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
2913 eptr++;
2914 }
2915 while (eptr >= pp)
2916 {
2917 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2919 eptr--;
2920 }
2921 }
2922
2923 RRETURN(MATCH_NOMATCH);
2924 }
2925 #undef BYTE_MAP
2926 }
2927 /* Control never gets here */
2928
2929
2930 /* Match an extended character class. This opcode is encountered only
2931 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2932 mode, because Unicode properties are supported in non-UTF-8 mode. */
2933
2934 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2935 case OP_XCLASS:
2936 {
2937 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2938 ecode += GET(ecode, 1); /* Advance past the item */
2939
2940 switch (*ecode)
2941 {
2942 case OP_CRSTAR:
2943 case OP_CRMINSTAR:
2944 case OP_CRPLUS:
2945 case OP_CRMINPLUS:
2946 case OP_CRQUERY:
2947 case OP_CRMINQUERY:
2948 c = *ecode++ - OP_CRSTAR;
2949 minimize = (c & 1) != 0;
2950 min = rep_min[c]; /* Pick up values from tables; */
2951 max = rep_max[c]; /* zero for max => infinity */
2952 if (max == 0) max = INT_MAX;
2953 break;
2954
2955 case OP_CRRANGE:
2956 case OP_CRMINRANGE:
2957 minimize = (*ecode == OP_CRMINRANGE);
2958 min = GET2(ecode, 1);
2959 max = GET2(ecode, 1 + IMM2_SIZE);
2960 if (max == 0) max = INT_MAX;
2961 ecode += 1 + 2 * IMM2_SIZE;
2962 break;
2963
2964 default: /* No repeat follows */
2965 min = max = 1;
2966 break;
2967 }
2968
2969 /* First, ensure the minimum number of matches are present. */
2970
2971 for (i = 1; i <= min; i++)
2972 {
2973 if (eptr >= md->end_subject)
2974 {
2975 SCHECK_PARTIAL();
2976 RRETURN(MATCH_NOMATCH);
2977 }
2978 GETCHARINCTEST(c, eptr);
2979 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
2980 }
2981
2982 /* If max == min we can continue with the main loop without the
2983 need to recurse. */
2984
2985 if (min == max) continue;
2986
2987 /* If minimizing, keep testing the rest of the expression and advancing
2988 the pointer while it matches the class. */
2989
2990 if (minimize)
2991 {
2992 for (fi = min;; fi++)
2993 {
2994 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2996 if (fi >= max) RRETURN(MATCH_NOMATCH);
2997 if (eptr >= md->end_subject)
2998 {
2999 SCHECK_PARTIAL();
3000 RRETURN(MATCH_NOMATCH);
3001 }
3002 GETCHARINCTEST(c, eptr);
3003 if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
3004 }
3005 /* Control never gets here */
3006 }
3007
3008 /* If maximizing, find the longest possible run, then work backwards. */
3009
3010 else
3011 {
3012 pp = eptr;
3013 for (i = min; i < max; i++)
3014 {
3015 int len = 1;
3016 if (eptr >= md->end_subject)
3017 {
3018 SCHECK_PARTIAL();
3019 break;
3020 }
3021 #ifdef SUPPORT_UTF
3022 GETCHARLENTEST(c, eptr, len);
3023 #else
3024 c = *eptr;
3025 #endif
3026 if (!PRIV(xclass)(c, data, utf)) break;
3027 eptr += len;
3028 }
3029 for(;;)
3030 {
3031 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
3032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3033 if (eptr-- == pp) break; /* Stop if tried at original pos */
3034 #ifdef SUPPORT_UTF
3035 if (utf) BACKCHAR(eptr);
3036 #endif
3037 }
3038 RRETURN(MATCH_NOMATCH);
3039 }
3040
3041 /* Control never gets here */
3042 }
3043 #endif /* End of XCLASS */
3044
3045 /* Match a single character, casefully */
3046
3047 case OP_CHAR:
3048 #ifdef SUPPORT_UTF
3049 if (utf)
3050 {
3051 length = 1;
3052 ecode++;
3053 GETCHARLEN(fc, ecode, length);
3054 if (length > md->end_subject - eptr)
3055 {
3056 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
3057 RRETURN(MATCH_NOMATCH);
3058 }
3059 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
3060 }
3061 else
3062 #endif
3063 /* Not UTF mode */
3064 {
3065 if (md->end_subject - eptr < 1)
3066 {
3067 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3068 RRETURN(MATCH_NOMATCH);
3069 }
3070 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
3071 ecode += 2;
3072 }
3073 break;
3074
3075 /* Match a single character, caselessly. If we are at the end of the
3076 subject, give up immediately. */
3077
3078 case OP_CHARI:
3079 if (eptr >= md->end_subject)
3080 {
3081 SCHECK_PARTIAL();
3082 RRETURN(MATCH_NOMATCH);
3083 }
3084
3085 #ifdef SUPPORT_UTF
3086 if (utf)
3087 {
3088 length = 1;
3089 ecode++;
3090 GETCHARLEN(fc, ecode, length);
3091
3092 /* If the pattern character's value is < 128, we have only one byte, and
3093 we know that its other case must also be one byte long, so we can use the
3094 fast lookup table. We know that there is at least one byte left in the
3095 subject. */
3096
3097 if (fc < 128)
3098 {
3099 if (md->lcc[fc]
3100 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3101 ecode++;
3102 eptr++;
3103 }
3104
3105 /* Otherwise we must pick up the subject character. Note that we cannot
3106 use the value of "length" to check for sufficient bytes left, because the
3107 other case of the character may have more or fewer bytes. */
3108
3109 else
3110 {
3111 unsigned int dc;
3112 GETCHARINC(dc, eptr);
3113 ecode += length;
3114
3115 /* If we have Unicode property support, we can use it to test the other
3116 case of the character, if there is one. */
3117
3118 if (fc != dc)
3119 {
3120 #ifdef SUPPORT_UCP
3121 if (dc != UCD_OTHERCASE(fc))
3122 #endif
3123 RRETURN(MATCH_NOMATCH);
3124 }
3125 }
3126 }
3127 else
3128 #endif /* SUPPORT_UTF */
3129
3130 /* Not UTF mode */
3131 {
3132 if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3133 != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3134 eptr++;
3135 ecode += 2;
3136 }
3137 break;
3138
3139 /* Match a single character repeatedly. */
3140
3141 case OP_EXACT:
3142 case OP_EXACTI:
3143 min = max = GET2(ecode, 1);
3144 ecode += 1 + IMM2_SIZE;
3145 goto REPEATCHAR;
3146
3147 case OP_POSUPTO:
3148 case OP_POSUPTOI:
3149 possessive = TRUE;
3150 /* Fall through */
3151
3152 case OP_UPTO:
3153 case OP_UPTOI:
3154 case OP_MINUPTO:
3155 case OP_MINUPTOI:
3156 min = 0;
3157 max = GET2(ecode, 1);
3158 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3159 ecode += 1 + IMM2_SIZE;
3160 goto REPEATCHAR;
3161
3162 case OP_POSSTAR:
3163 case OP_POSSTARI:
3164 possessive = TRUE;
3165 min = 0;
3166 max = INT_MAX;
3167 ecode++;
3168 goto REPEATCHAR;
3169
3170 case OP_POSPLUS:
3171 case OP_POSPLUSI:
3172 possessive = TRUE;
3173 min = 1;
3174 max = INT_MAX;
3175 ecode++;
3176 goto REPEATCHAR;
3177
3178 case OP_POSQUERY:
3179 case OP_POSQUERYI:
3180 possessive = TRUE;
3181 min = 0;
3182 max = 1;
3183 ecode++;
3184 goto REPEATCHAR;
3185
3186 case OP_STAR:
3187 case OP_STARI:
3188 case OP_MINSTAR:
3189 case OP_MINSTARI:
3190 case OP_PLUS:
3191 case OP_PLUSI:
3192 case OP_MINPLUS:
3193 case OP_MINPLUSI:
3194 case OP_QUERY:
3195 case OP_QUERYI:
3196 case OP_MINQUERY:
3197 case OP_MINQUERYI:
3198 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3199 minimize = (c & 1) != 0;
3200 min = rep_min[c]; /* Pick up values from tables; */
3201 max = rep_max[c]; /* zero for max => infinity */
3202 if (max == 0) max = INT_MAX;
3203
3204 /* Common code for all repeated single-character matches. */
3205
3206 REPEATCHAR:
3207 #ifdef SUPPORT_UTF
3208 if (utf)
3209 {
3210 length = 1;
3211 charptr = ecode;
3212 GETCHARLEN(fc, ecode, length);
3213 ecode += length;
3214
3215 /* Handle multibyte character matching specially here. There is
3216 support for caseless matching if UCP support is present. */
3217
3218 if (length > 1)
3219 {
3220 #ifdef SUPPORT_UCP
3221 unsigned int othercase;
3222 if (op >= OP_STARI && /* Caseless */
3223 (othercase = UCD_OTHERCASE(fc)) != fc)
3224 oclength = PRIV(ord2utf)(othercase, occhars);
3225 else oclength = 0;
3226 #endif /* SUPPORT_UCP */
3227
3228 for (i = 1; i <= min; i++)
3229 {
3230 if (eptr <= md->end_subject - length &&
3231 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3232 #ifdef SUPPORT_UCP
3233 else if (oclength > 0 &&
3234 eptr <= md->end_subject - oclength &&
3235 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3236 #endif /* SUPPORT_UCP */
3237 else
3238 {
3239 CHECK_PARTIAL();
3240 RRETURN(MATCH_NOMATCH);
3241 }
3242 }
3243
3244 if (min == max) continue;
3245
3246 if (minimize)
3247 {
3248 for (fi = min;; fi++)
3249 {
3250 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3251 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3252 if (fi >= max) RRETURN(MATCH_NOMATCH);
3253 if (eptr <= md->end_subject - length &&
3254 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3255 #ifdef SUPPORT_UCP
3256 else if (oclength > 0 &&
3257 eptr <= md->end_subject - oclength &&
3258 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3259 #endif /* SUPPORT_UCP */
3260 else
3261 {
3262 CHECK_PARTIAL();
3263 RRETURN(MATCH_NOMATCH);
3264 }
3265 }
3266 /* Control never gets here */
3267 }
3268
3269 else /* Maximize */
3270 {
3271 pp = eptr;
3272 for (i = min; i < max; i++)
3273 {
3274 if (eptr <= md->end_subject - length &&
3275 memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
3276 #ifdef SUPPORT_UCP
3277 else if (oclength > 0 &&
3278 eptr <= md->end_subject - oclength &&
3279 memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
3280 #endif /* SUPPORT_UCP */
3281 else
3282 {
3283 CHECK_PARTIAL();
3284 break;
3285 }
3286 }
3287
3288 if (possessive) continue;
3289
3290 for(;;)
3291 {
3292 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3293 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3294 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
3295 #ifdef SUPPORT_UCP
3296 eptr--;
3297 BACKCHAR(eptr);
3298 #else /* without SUPPORT_UCP */
3299 eptr -= length;
3300 #endif /* SUPPORT_UCP */
3301 }
3302 }
3303 /* Control never gets here */
3304 }
3305
3306 /* If the length of a UTF-8 character is 1, we fall through here, and
3307 obey the code as for non-UTF-8 characters below, though in this case the
3308 value of fc will always be < 128. */
3309 }
3310 else
3311 #endif /* SUPPORT_UTF */
3312 /* When not in UTF-8 mode, load a single-byte character. */
3313 fc = *ecode++;
3314
3315 /* The value of fc at this point is always one character, though we may
3316 or may not be in UTF mode. The code is duplicated for the caseless and
3317 caseful cases, for speed, since matching characters is likely to be quite
3318 common. First, ensure the minimum number of matches are present. If min =
3319 max, continue at the same level without recursing. Otherwise, if
3320 minimizing, keep trying the rest of the expression and advancing one
3321 matching character if failing, up to the maximum. Alternatively, if
3322 maximizing, find the maximum number of characters and work backwards. */
3323
3324 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3325 max, eptr));
3326
3327 if (op >= OP_STARI) /* Caseless */
3328 {
3329 #ifdef COMPILE_PCRE8
3330 /* fc must be < 128 if UTF is enabled. */
3331 foc = md->fcc[fc];
3332 #else
3333 #ifdef SUPPORT_UTF
3334 #ifdef SUPPORT_UCP
3335 if (utf && fc > 127)
3336 foc = UCD_OTHERCASE(fc);
3337 #else
3338 if (utf && fc > 127)
3339 foc = fc;
3340 #endif /* SUPPORT_UCP */
3341 else
3342 #endif /* SUPPORT_UTF */
3343 foc = TABLE_GET(fc, md->fcc, fc);
3344 #endif /* COMPILE_PCRE8 */
3345
3346 for (i = 1; i <= min; i++)
3347 {
3348 if (eptr >= md->end_subject)
3349 {
3350 SCHECK_PARTIAL();
3351 RRETURN(MATCH_NOMATCH);
3352 }
3353 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3354 eptr++;
3355 }
3356 if (min == max) continue;
3357 if (minimize)
3358 {
3359 for (fi = min;; fi++)
3360 {
3361 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3362 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3363 if (fi >= max) RRETURN(MATCH_NOMATCH);
3364 if (eptr >= md->end_subject)
3365 {
3366 SCHECK_PARTIAL();
3367 RRETURN(MATCH_NOMATCH);
3368 }
3369 if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
3370 eptr++;
3371 }
3372 /* Control never gets here */
3373 }
3374 else /* Maximize */
3375 {
3376 pp = eptr;
3377 for (i = min; i < max; i++)
3378 {
3379 if (eptr >= md->end_subject)
3380 {
3381 SCHECK_PARTIAL();
3382 break;
3383 }
3384 if (fc != *eptr && foc != *eptr) break;
3385 eptr++;
3386 }
3387
3388 if (possessive) continue;
3389
3390 while (eptr >= pp)
3391 {
3392 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3393 eptr--;
3394 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3395 }
3396 RRETURN(MATCH_NOMATCH);
3397 }
3398 /* Control never gets here */
3399 }
3400
3401 /* Caseful comparisons (includes all multi-byte characters) */
3402
3403 else
3404 {
3405 for (i = 1; i <= min; i++)
3406 {
3407 if (eptr >= md->end_subject)
3408 {
3409 SCHECK_PARTIAL();
3410 RRETURN(MATCH_NOMATCH);
3411 }
3412 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3413 }
3414
3415 if (min == max) continue;
3416
3417 if (minimize)
3418 {
3419 for (fi = min;; fi++)
3420 {
3421 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3422 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3423 if (fi >= max) RRETURN(MATCH_NOMATCH);
3424 if (eptr >= md->end_subject)
3425 {
3426 SCHECK_PARTIAL();
3427 RRETURN(MATCH_NOMATCH);
3428 }
3429 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
3430 }
3431 /* Control never gets here */
3432 }
3433 else /* Maximize */
3434 {
3435 pp = eptr;
3436 for (i = min; i < max; i++)
3437 {
3438 if (eptr >= md->end_subject)
3439 {
3440 SCHECK_PARTIAL();
3441 break;
3442 }
3443 if (fc != *eptr) break;
3444 eptr++;
3445 }
3446 if (possessive) continue;
3447
3448 while (eptr >= pp)
3449 {
3450 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3451 eptr--;
3452 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3453 }
3454 RRETURN(MATCH_NOMATCH);
3455 }
3456 }
3457 /* Control never gets here */
3458
3459 /* Match a negated single one-byte character. The character we are
3460 checking can be multibyte. */
3461
3462 case OP_NOT:
3463 case OP_NOTI:
3464 if (eptr >= md->end_subject)
3465 {
3466 SCHECK_PARTIAL();
3467 RRETURN(MATCH_NOMATCH);
3468 }
3469 ecode++;
3470 GETCHARINCTEST(c, eptr);
3471 if (op == OP_NOTI) /* The caseless case */
3472 {
3473 register int ch, och;
3474 ch = *ecode++;
3475 #ifdef COMPILE_PCRE8
3476 /* ch must be < 128 if UTF is enabled. */
3477 och = md->fcc[ch];
3478 #else
3479 #ifdef SUPPORT_UTF
3480 #ifdef SUPPORT_UCP
3481 if (utf && ch > 127)
3482 och = UCD_OTHERCASE(ch);
3483 #else
3484 if (utf && ch > 127)
3485 och = ch;
3486 #endif /* SUPPORT_UCP */
3487 else
3488 #endif /* SUPPORT_UTF */
3489 och = TABLE_GET(ch, md->fcc, ch);
3490 #endif /* COMPILE_PCRE8 */
3491 if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3492 }
3493 else /* Caseful */
3494 {
3495 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
3496 }
3497 break;
3498
3499 /* Match a negated single one-byte character repeatedly. This is almost a
3500 repeat of the code for a repeated single character, but I haven't found a
3501 nice way of commoning these up that doesn't require a test of the
3502 positive/negative option for each character match. Maybe that wouldn't add
3503 very much to the time taken, but character matching *is* what this is all
3504 about... */
3505
3506 case OP_NOTEXACT:
3507 case OP_NOTEXACTI:
3508 min = max = GET2(ecode, 1);
3509 ecode += 1 + IMM2_SIZE;
3510 goto REPEATNOTCHAR;
3511
3512 case OP_NOTUPTO:
3513 case OP_NOTUPTOI:
3514 case OP_NOTMINUPTO:
3515 case OP_NOTMINUPTOI:
3516 min = 0;
3517 max = GET2(ecode, 1);
3518 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3519 ecode += 1 + IMM2_SIZE;
3520 goto REPEATNOTCHAR;
3521
3522 case OP_NOTPOSSTAR:
3523 case OP_NOTPOSSTARI:
3524 possessive = TRUE;
3525 min = 0;
3526 max = INT_MAX;
3527 ecode++;
3528 goto REPEATNOTCHAR;
3529
3530 case OP_NOTPOSPLUS:
3531 case OP_NOTPOSPLUSI:
3532 possessive = TRUE;
3533 min = 1;
3534 max = INT_MAX;
3535 ecode++;
3536 goto REPEATNOTCHAR;
3537
3538 case OP_NOTPOSQUERY:
3539 case OP_NOTPOSQUERYI:
3540 possessive = TRUE;
3541 min = 0;
3542 max = 1;
3543 ecode++;
3544 goto REPEATNOTCHAR;
3545
3546 case OP_NOTPOSUPTO:
3547 case OP_NOTPOSUPTOI:
3548 possessive = TRUE;
3549 min = 0;
3550 max = GET2(ecode, 1);
3551 ecode += 1 + IMM2_SIZE;
3552 goto REPEATNOTCHAR;
3553
3554 case OP_NOTSTAR:
3555 case OP_NOTSTARI:
3556 case OP_NOTMINSTAR:
3557 case OP_NOTMINSTARI:
3558 case OP_NOTPLUS:
3559 case OP_NOTPLUSI:
3560 case OP_NOTMINPLUS:
3561 case OP_NOTMINPLUSI:
3562 case OP_NOTQUERY:
3563 case OP_NOTQUERYI:
3564 case OP_NOTMINQUERY:
3565 case OP_NOTMINQUERYI:
3566 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3567 minimize = (c & 1) != 0;
3568 min = rep_min[c]; /* Pick up values from tables; */
3569 max = rep_max[c]; /* zero for max => infinity */
3570 if (max == 0) max = INT_MAX;
3571
3572 /* Common code for all repeated single-byte matches. */
3573
3574 REPEATNOTCHAR:
3575 fc = *ecode++;
3576
3577 /* The code is duplicated for the caseless and caseful cases, for speed,
3578 since matching characters is likely to be quite common. First, ensure the
3579 minimum number of matches are present. If min = max, continue at the same
3580 level without recursing. Otherwise, if minimizing, keep trying the rest of
3581 the expression and advancing one matching character if failing, up to the
3582 maximum. Alternatively, if maximizing, find the maximum number of
3583 characters and work backwards. */
3584
3585 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3586 max, eptr));
3587
3588 if (op >= OP_NOTSTARI) /* Caseless */
3589 {
3590 #ifdef COMPILE_PCRE8
3591 /* fc must be < 128 if UTF is enabled. */
3592 foc = md->fcc[fc];
3593 #else
3594 #ifdef SUPPORT_UTF
3595 #ifdef SUPPORT_UCP
3596 if (utf && fc > 127)
3597 foc = UCD_OTHERCASE(fc);
3598 #else
3599 if (utf && fc > 127)
3600 foc = fc;
3601 #endif /* SUPPORT_UCP */
3602 else
3603 #endif /* SUPPORT_UTF */
3604 foc = TABLE_GET(fc, md->fcc, fc);
3605 #endif /* COMPILE_PCRE8 */
3606
3607 #ifdef SUPPORT_UTF
3608 if (utf)
3609 {
3610 register unsigned int d;
3611 for (i = 1; i <= min; i++)
3612 {
3613 if (eptr >= md->end_subject)
3614 {
3615 SCHECK_PARTIAL();
3616 RRETURN(MATCH_NOMATCH);
3617 }
3618 GETCHARINC(d, eptr);
3619 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3620 }
3621 }
3622 else
3623 #endif
3624 /* Not UTF mode */
3625 {
3626 for (i = 1; i <= min; i++)
3627 {
3628 if (eptr >= md->end_subject)
3629 {
3630 SCHECK_PARTIAL();
3631 RRETURN(MATCH_NOMATCH);
3632 }
3633 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3634 eptr++;
3635 }
3636 }
3637
3638 if (min == max) continue;
3639
3640 if (minimize)
3641 {
3642 #ifdef SUPPORT_UTF
3643 if (utf)
3644 {
3645 register unsigned int d;
3646 for (fi = min;; fi++)
3647 {
3648 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3650 if (fi >= max) RRETURN(MATCH_NOMATCH);
3651 if (eptr >= md->end_subject)
3652 {
3653 SCHECK_PARTIAL();
3654 RRETURN(MATCH_NOMATCH);
3655 }
3656 GETCHARINC(d, eptr);
3657 if (fc == d || foc == d) RRETURN(MATCH_NOMATCH);
3658 }
3659 }
3660 else
3661 #endif
3662 /* Not UTF mode */
3663 {
3664 for (fi = min;; fi++)
3665 {
3666 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3668 if (fi >= max) RRETURN(MATCH_NOMATCH);
3669 if (eptr >= md->end_subject)
3670 {
3671 SCHECK_PARTIAL();
3672 RRETURN(MATCH_NOMATCH);
3673 }
3674 if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3675 eptr++;
3676 }
3677 }
3678 /* Control never gets here */
3679 }
3680
3681 /* Maximize case */
3682
3683 else
3684 {
3685 pp = eptr;
3686
3687 #ifdef SUPPORT_UTF
3688 if (utf)
3689 {
3690 register unsigned int d;
3691 for (i = min; i < max; i++)
3692 {
3693 int len = 1;
3694 if (eptr >= md->end_subject)
3695 {
3696 SCHECK_PARTIAL();
3697 break;
3698 }
3699 GETCHARLEN(d, eptr, len);
3700 if (fc == d || foc == d) break;
3701 eptr += len;
3702 }
3703 if (possessive) continue;
3704 for(;;)
3705 {
3706 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3707 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3708 if (eptr-- == pp) break; /* Stop if tried at original pos */
3709 BACKCHAR(eptr);
3710 }
3711 }
3712 else
3713 #endif
3714 /* Not UTF mode */
3715 {
3716 for (i = min; i < max; i++)
3717 {
3718 if (eptr >= md->end_subject)
3719 {
3720 SCHECK_PARTIAL();
3721 break;
3722 }
3723 if (fc == *eptr || foc == *eptr) break;
3724 eptr++;
3725 }
3726 if (possessive) continue;
3727 while (eptr >= pp)
3728 {
3729 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3730 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3731 eptr--;
3732 }
3733 }
3734
3735 RRETURN(MATCH_NOMATCH);
3736 }
3737 /* Control never gets here */
3738 }
3739
3740 /* Caseful comparisons */
3741
3742 else
3743 {
3744 #ifdef SUPPORT_UTF
3745 if (utf)
3746 {
3747 register unsigned int d;
3748 for (i = 1; i <= min; i++)
3749 {
3750 if (eptr >= md->end_subject)
3751 {
3752 SCHECK_PARTIAL();
3753 RRETURN(MATCH_NOMATCH);
3754 }
3755 GETCHARINC(d, eptr);
3756 if (fc == d) RRETURN(MATCH_NOMATCH);
3757 }
3758 }
3759 else
3760 #endif
3761 /* Not UTF mode */
3762 {
3763 for (i = 1; i <= min; i++)
3764 {
3765 if (eptr >= md->end_subject)
3766 {
3767 SCHECK_PARTIAL();
3768 RRETURN(MATCH_NOMATCH);
3769 }
3770 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3771 }
3772 }
3773
3774 if (min == max) continue;
3775
3776 if (minimize)
3777 {
3778 #ifdef SUPPORT_UTF
3779 if (utf)
3780 {
3781 register unsigned int d;
3782 for (fi = min;; fi++)
3783 {
3784 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3785 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3786 if (fi >= max) RRETURN(MATCH_NOMATCH);
3787 if (eptr >= md->end_subject)
3788 {
3789 SCHECK_PARTIAL();
3790 RRETURN(MATCH_NOMATCH);
3791 }
3792 GETCHARINC(d, eptr);
3793 if (fc == d) RRETURN(MATCH_NOMATCH);
3794 }
3795 }
3796 else
3797 #endif
3798 /* Not UTF mode */
3799 {
3800 for (fi = min;; fi++)
3801 {
3802 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3803 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3804 if (fi >= max) RRETURN(MATCH_NOMATCH);
3805 if (eptr >= md->end_subject)
3806 {
3807 SCHECK_PARTIAL();
3808 RRETURN(MATCH_NOMATCH);
3809 }
3810 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3811 }
3812 }
3813 /* Control never gets here */
3814 }
3815
3816 /* Maximize case */
3817
3818 else
3819 {
3820 pp = eptr;
3821
3822 #ifdef SUPPORT_UTF
3823 if (utf)
3824 {
3825 register unsigned int d;
3826 for (i = min; i < max; i++)
3827 {
3828 int len = 1;
3829 if (eptr >= md->end_subject)
3830 {
3831 SCHECK_PARTIAL();
3832 break;
3833 }
3834 GETCHARLEN(d, eptr, len);
3835 if (fc == d) break;
3836 eptr += len;
3837 }
3838 if (possessive) continue;
3839 for(;;)
3840 {
3841 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3842 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3843 if (eptr-- == pp) break; /* Stop if tried at original pos */
3844 BACKCHAR(eptr);
3845 }
3846 }
3847 else
3848 #endif
3849 /* Not UTF mode */
3850 {
3851 for (i = min; i < max; i++)
3852 {
3853 if (eptr >= md->end_subject)
3854 {
3855 SCHECK_PARTIAL();
3856 break;
3857 }
3858 if (fc == *eptr) break;
3859 eptr++;
3860 }
3861 if (possessive) continue;
3862 while (eptr >= pp)
3863 {
3864 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3866 eptr--;
3867 }
3868 }
3869
3870 RRETURN(MATCH_NOMATCH);
3871 }
3872 }
3873 /* Control never gets here */
3874
3875 /* Match a single character type repeatedly; several different opcodes
3876 share code. This is very similar to the code for single characters, but we
3877 repeat it in the interests of efficiency. */
3878
3879 case OP_TYPEEXACT:
3880 min = max = GET2(ecode, 1);
3881 minimize = TRUE;
3882 ecode += 1 + IMM2_SIZE;
3883 goto REPEATTYPE;
3884
3885 case OP_TYPEUPTO:
3886 case OP_TYPEMINUPTO:
3887 min = 0;
3888 max = GET2(ecode, 1);
3889 minimize = *ecode == OP_TYPEMINUPTO;
3890 ecode += 1 + IMM2_SIZE;
3891 goto REPEATTYPE;
3892
3893 case OP_TYPEPOSSTAR:
3894 possessive = TRUE;
3895 min = 0;
3896 max = INT_MAX;
3897 ecode++;
3898 goto REPEATTYPE;
3899
3900 case OP_TYPEPOSPLUS:
3901 possessive = TRUE;
3902 min = 1;
3903 max = INT_MAX;
3904 ecode++;
3905 goto REPEATTYPE;
3906
3907 case OP_TYPEPOSQUERY:
3908 possessive = TRUE;
3909 min = 0;
3910 max = 1;
3911 ecode++;
3912 goto REPEATTYPE;
3913
3914 case OP_TYPEPOSUPTO:
3915 possessive = TRUE;
3916 min = 0;
3917 max = GET2(ecode, 1);
3918 ecode += 1 + IMM2_SIZE;
3919 goto REPEATTYPE;
3920
3921 case OP_TYPESTAR:
3922 case OP_TYPEMINSTAR:
3923 case OP_TYPEPLUS:
3924 case OP_TYPEMINPLUS:
3925 case OP_TYPEQUERY:
3926 case OP_TYPEMINQUERY:
3927 c = *ecode++ - OP_TYPESTAR;
3928 minimize = (c & 1) != 0;
3929 min = rep_min[c]; /* Pick up values from tables; */
3930 max = rep_max[c]; /* zero for max => infinity */
3931 if (max == 0) max = INT_MAX;
3932
3933 /* Common code for all repeated single character type matches. Note that
3934 in UTF-8 mode, '.' matches a character of any length, but for the other
3935 character types, the valid characters are all one-byte long. */
3936
3937 REPEATTYPE:
3938 ctype = *ecode++; /* Code for the character type */
3939
3940 #ifdef SUPPORT_UCP
3941 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3942 {
3943 prop_fail_result = ctype == OP_NOTPROP;
3944 prop_type = *ecode++;
3945 prop_value = *ecode++;
3946 }
3947 else prop_type = -1;
3948 #endif
3949
3950 /* First, ensure the minimum number of matches are present. Use inline
3951 code for maximizing the speed, and do the type test once at the start
3952 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3953 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3954 and single-bytes. */
3955
3956 if (min > 0)
3957 {
3958 #ifdef SUPPORT_UCP
3959 if (prop_type >= 0)
3960 {
3961 switch(prop_type)
3962 {
3963 case PT_ANY:
3964 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3965 for (i = 1; i <= min; i++)
3966 {
3967 if (eptr >= md->end_subject)
3968 {
3969 SCHECK_PARTIAL();
3970 RRETURN(MATCH_NOMATCH);
3971 }
3972 GETCHARINCTEST(c, eptr);
3973 }
3974 break;
3975
3976 case PT_LAMP:
3977 for (i = 1; i <= min; i++)
3978 {
3979 int chartype;
3980 if (eptr >= md->end_subject)
3981 {
3982 SCHECK_PARTIAL();
3983 RRETURN(MATCH_NOMATCH);
3984 }
3985 GETCHARINCTEST(c, eptr);
3986 chartype = UCD_CHARTYPE(c);
3987 if ((chartype == ucp_Lu ||
3988 chartype == ucp_Ll ||
3989 chartype == ucp_Lt) == prop_fail_result)
3990 RRETURN(MATCH_NOMATCH);
3991 }
3992 break;
3993
3994 case PT_GC:
3995 for (i = 1; i <= min; i++)
3996 {
3997 if (eptr >= md->end_subject)
3998 {
3999 SCHECK_PARTIAL();
4000 RRETURN(MATCH_NOMATCH);
4001 }
4002 GETCHARINCTEST(c, eptr);
4003 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4004 RRETURN(MATCH_NOMATCH);
4005 }
4006 break;
4007
4008 case PT_PC:
4009 for (i = 1; i <= min; i++)
4010 {
4011 if (eptr >= md->end_subject)
4012 {
4013 SCHECK_PARTIAL();
4014 RRETURN(MATCH_NOMATCH);
4015 }
4016 GETCHARINCTEST(c, eptr);
4017 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4018 RRETURN(MATCH_NOMATCH);
4019 }
4020 break;
4021
4022 case PT_SC:
4023 for (i = 1; i <= min; i++)
4024 {
4025 if (eptr >= md->end_subject)
4026 {
4027 SCHECK_PARTIAL();
4028 RRETURN(MATCH_NOMATCH);
4029 }
4030 GETCHARINCTEST(c, eptr);
4031 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4032 RRETURN(MATCH_NOMATCH);
4033 }
4034 break;
4035
4036 case PT_ALNUM:
4037 for (i = 1; i <= min; i++)
4038 {
4039 int category;
4040 if (eptr >= md->end_subject)
4041 {
4042 SCHECK_PARTIAL();
4043 RRETURN(MATCH_NOMATCH);
4044 }
4045 GETCHARINCTEST(c, eptr);
4046 category = UCD_CATEGORY(c);
4047 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4048 RRETURN(MATCH_NOMATCH);
4049 }
4050 break;
4051
4052 case PT_SPACE: /* Perl space */
4053 for (i = 1; i <= min; i++)
4054 {
4055 if (eptr >= md->end_subject)
4056 {
4057 SCHECK_PARTIAL();
4058 RRETURN(MATCH_NOMATCH);
4059 }
4060 GETCHARINCTEST(c, eptr);
4061 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4062 c == CHAR_FF || c == CHAR_CR)
4063 == prop_fail_result)
4064 RRETURN(MATCH_NOMATCH);
4065 }
4066 break;
4067
4068 case PT_PXSPACE: /* POSIX space */
4069 for (i = 1; i <= min; i++)
4070 {
4071 if (eptr >= md->end_subject)
4072 {
4073 SCHECK_PARTIAL();
4074 RRETURN(MATCH_NOMATCH);
4075 }
4076 GETCHARINCTEST(c, eptr);
4077 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4078 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4079 == prop_fail_result)
4080 RRETURN(MATCH_NOMATCH);
4081 }
4082 break;
4083
4084 case PT_WORD:
4085 for (i = 1; i <= min; i++)
4086 {
4087 int category;
4088 if (eptr >= md->end_subject)
4089 {
4090 SCHECK_PARTIAL();
4091 RRETURN(MATCH_NOMATCH);
4092 }
4093 GETCHARINCTEST(c, eptr);
4094 category = UCD_CATEGORY(c);
4095 if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
4096 == prop_fail_result)
4097 RRETURN(MATCH_NOMATCH);
4098 }
4099 break;
4100
4101 /* This should not occur */
4102
4103 default:
4104 RRETURN(PCRE_ERROR_INTERNAL);
4105 }
4106 }
4107
4108 /* Match extended Unicode sequences. We will get here only if the
4109 support is in the binary; otherwise a compile-time error occurs. */
4110
4111 else if (ctype == OP_EXTUNI)
4112 {
4113 for (i = 1; i <= min; i++)
4114 {
4115 if (eptr >= md->end_subject)
4116 {
4117 SCHECK_PARTIAL();
4118 RRETURN(MATCH_NOMATCH);
4119 }
4120 GETCHARINCTEST(c, eptr);
4121 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4122 while (eptr < md->end_subject)
4123 {
4124 int len = 1;
4125 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4126 if (UCD_CATEGORY(c) != ucp_M) break;
4127 eptr += len;
4128 }
4129 }
4130 }
4131
4132 else
4133 #endif /* SUPPORT_UCP */
4134
4135 /* Handle all other cases when the coding is UTF-8 */
4136
4137 #ifdef SUPPORT_UTF
4138 if (utf) switch(ctype)
4139 {
4140 case OP_ANY:
4141 for (i = 1; i <= min; i++)
4142 {
4143 if (eptr >= md->end_subject)
4144 {
4145 SCHECK_PARTIAL();
4146 RRETURN(MATCH_NOMATCH);
4147 }
4148 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4149 eptr++;
4150 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4151 }
4152 break;
4153
4154 case OP_ALLANY:
4155 for (i = 1; i <= min; i++)
4156 {
4157 if (eptr >= md->end_subject)
4158 {
4159 SCHECK_PARTIAL();
4160 RRETURN(MATCH_NOMATCH);
4161 }
4162 eptr++;
4163 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4164 }
4165 break;
4166
4167 case OP_ANYBYTE:
4168 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
4169 eptr += min;
4170 break;
4171
4172 case OP_ANYNL:
4173 for (i = 1; i <= min; i++)
4174 {
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 RRETURN(MATCH_NOMATCH);
4179 }
4180 GETCHARINC(c, eptr);
4181 switch(c)
4182 {
4183 default: RRETURN(MATCH_NOMATCH);
4184
4185 case 0x000d:
4186 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4187 break;
4188
4189 case 0x000a:
4190 break;
4191
4192 case 0x000b:
4193 case 0x000c:
4194 case 0x0085:
4195 case 0x2028:
4196 case 0x2029:
4197 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4198 break;
4199 }
4200 }
4201 break;
4202
4203 case OP_NOT_HSPACE:
4204 for (i = 1; i <= min; i++)
4205 {
4206 if (eptr >= md->end_subject)
4207 {
4208 SCHECK_PARTIAL();
4209 RRETURN(MATCH_NOMATCH);
4210 }
4211 GETCHARINC(c, eptr);
4212 switch(c)
4213 {
4214 default: break;
4215 case 0x09: /* HT */
4216 case 0x20: /* SPACE */
4217 case 0xa0: /* NBSP */
4218 case 0x1680: /* OGHAM SPACE MARK */
4219 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4220 case 0x2000: /* EN QUAD */
4221 case 0x2001: /* EM QUAD */
4222 case 0x2002: /* EN SPACE */
4223 case 0x2003: /* EM SPACE */
4224 case 0x2004: /* THREE-PER-EM SPACE */
4225 case 0x2005: /* FOUR-PER-EM SPACE */
4226 case 0x2006: /* SIX-PER-EM SPACE */
4227 case 0x2007: /* FIGURE SPACE */
4228 case 0x2008: /* PUNCTUATION SPACE */
4229 case 0x2009: /* THIN SPACE */
4230 case 0x200A: /* HAIR SPACE */
4231 case 0x202f: /* NARROW NO-BREAK SPACE */
4232 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4233 case 0x3000: /* IDEOGRAPHIC SPACE */
4234 RRETURN(MATCH_NOMATCH);
4235 }
4236 }
4237 break;
4238
4239 case OP_HSPACE:
4240 for (i = 1; i <= min; i++)
4241 {
4242 if (eptr >= md->end_subject)
4243 {
4244 SCHECK_PARTIAL();
4245 RRETURN(MATCH_NOMATCH);
4246 }
4247 GETCHARINC(c, eptr);
4248 switch(c)
4249 {
4250 default: RRETURN(MATCH_NOMATCH);
4251 case 0x09: /* HT */
4252 case 0x20: /* SPACE */
4253 case 0xa0: /* NBSP */
4254 case 0x1680: /* OGHAM SPACE MARK */
4255 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4256 case 0x2000: /* EN QUAD */
4257 case 0x2001: /* EM QUAD */
4258 case 0x2002: /* EN SPACE */
4259 case 0x2003: /* EM SPACE */
4260 case 0x2004: /* THREE-PER-EM SPACE */
4261 case 0x2005: /* FOUR-PER-EM SPACE */
4262 case 0x2006: /* SIX-PER-EM SPACE */
4263 case 0x2007: /* FIGURE SPACE */
4264 case 0x2008: /* PUNCTUATION SPACE */
4265 case 0x2009: /* THIN SPACE */
4266 case 0x200A: /* HAIR SPACE */
4267 case 0x202f: /* NARROW NO-BREAK SPACE */
4268 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4269 case 0x3000: /* IDEOGRAPHIC SPACE */
4270 break;
4271 }
4272 }
4273 break;
4274
4275 case OP_NOT_VSPACE:
4276 for (i = 1; i <= min; i++)
4277 {
4278 if (eptr >= md->end_subject)
4279 {
4280 SCHECK_PARTIAL();
4281 RRETURN(MATCH_NOMATCH);
4282 }
4283 GETCHARINC(c, eptr);
4284 switch(c)
4285 {
4286 default: break;
4287 case 0x0a: /* LF */
4288 case 0x0b: /* VT */
4289 case 0x0c: /* FF */
4290 case 0x0d: /* CR */
4291 case 0x85: /* NEL */
4292 case 0x2028: /* LINE SEPARATOR */
4293 case 0x2029: /* PARAGRAPH SEPARATOR */
4294 RRETURN(MATCH_NOMATCH);
4295 }
4296 }
4297 break;
4298
4299 case OP_VSPACE:
4300 for (i = 1; i <= min; i++)
4301 {
4302 if (eptr >= md->end_subject)
4303 {
4304 SCHECK_PARTIAL();
4305 RRETURN(MATCH_NOMATCH);
4306 }
4307 GETCHARINC(c, eptr);
4308 switch(c)
4309 {
4310 default: RRETURN(MATCH_NOMATCH);
4311 case 0x0a: /* LF */
4312 case 0x0b: /* VT */
4313 case 0x0c: /* FF */
4314 case 0x0d: /* CR */
4315 case 0x85: /* NEL */
4316 case 0x2028: /* LINE SEPARATOR */
4317 case 0x2029: /* PARAGRAPH SEPARATOR */
4318 break;
4319 }
4320 }
4321 break;
4322
4323 case OP_NOT_DIGIT:
4324 for (i = 1; i <= min; i++)
4325 {
4326 if (eptr >= md->end_subject)
4327 {
4328 SCHECK_PARTIAL();
4329 RRETURN(MATCH_NOMATCH);
4330 }
4331 GETCHARINC(c, eptr);
4332 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4333 RRETURN(MATCH_NOMATCH);
4334 }
4335 break;
4336
4337 case OP_DIGIT:
4338 for (i = 1; i <= min; i++)
4339 {
4340 if (eptr >= md->end_subject)
4341 {
4342 SCHECK_PARTIAL();
4343 RRETURN(MATCH_NOMATCH);
4344 }
4345 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
4346 RRETURN(MATCH_NOMATCH);
4347 eptr++;
4348 /* No need to skip more bytes - we know it's a 1-byte character */
4349 }
4350 break;
4351
4352 case OP_NOT_WHITESPACE:
4353 for (i = 1; i <= min; i++)
4354 {
4355 if (eptr >= md->end_subject)
4356 {
4357 SCHECK_PARTIAL();
4358 RRETURN(MATCH_NOMATCH);
4359 }
4360 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4361 RRETURN(MATCH_NOMATCH);
4362 eptr++;
4363 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4364 }
4365 break;
4366
4367 case OP_WHITESPACE:
4368 for (i = 1; i <= min; i++)
4369 {
4370 if (eptr >= md->end_subject)
4371 {
4372 SCHECK_PARTIAL();
4373 RRETURN(MATCH_NOMATCH);
4374 }
4375 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
4376 RRETURN(MATCH_NOMATCH);
4377 eptr++;
4378 /* No need to skip more bytes - we know it's a 1-byte character */
4379 }
4380 break;
4381
4382 case OP_NOT_WORDCHAR:
4383 for (i = 1; i <= min; i++)
4384 {
4385 if (eptr >= md->end_subject)
4386 {
4387 SCHECK_PARTIAL();
4388 RRETURN(MATCH_NOMATCH);
4389 }
4390 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4391 RRETURN(MATCH_NOMATCH);
4392 eptr++;
4393 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
4394 }
4395 break;
4396
4397 case OP_WORDCHAR:
4398 for (i = 1; i <= min; i++)
4399 {
4400 if (eptr >= md->end_subject)
4401 {
4402 SCHECK_PARTIAL();
4403 RRETURN(MATCH_NOMATCH);
4404 }
4405 if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
4406 RRETURN(MATCH_NOMATCH);
4407 eptr++;
4408 /* No need to skip more bytes - we know it's a 1-byte character */
4409 }
4410 break;
4411
4412 default:
4413 RRETURN(PCRE_ERROR_INTERNAL);
4414 } /* End switch(ctype) */
4415
4416 else
4417 #endif /* SUPPORT_UTF */
4418
4419 /* Code for the non-UTF-8 case for minimum matching of operators other
4420 than OP_PROP and OP_NOTPROP. */
4421
4422 switch(ctype)
4423 {
4424 case OP_ANY:
4425 for (i = 1; i <= min; i++)
4426 {
4427 if (eptr >= md->end_subject)
4428 {
4429 SCHECK_PARTIAL();
4430 RRETURN(MATCH_NOMATCH);
4431 }
4432 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
4433 eptr++;
4434 }
4435 break;
4436
4437 case OP_ALLANY:
4438 if (eptr > md->end_subject - min)
4439 {
4440 SCHECK_PARTIAL();
4441 RRETURN(MATCH_NOMATCH);
4442 }
4443 eptr += min;
4444 break;
4445
4446 case OP_ANYBYTE:
4447 if (eptr > md->end_subject - min)
4448 {
4449 SCHECK_PARTIAL();
4450 RRETURN(MATCH_NOMATCH);
4451 }
4452 eptr += min;
4453 break;
4454
4455 case OP_ANYNL:
4456 for (i = 1; i <= min; i++)
4457 {
4458 if (eptr >= md->end_subject)
4459 {
4460 SCHECK_PARTIAL();
4461 RRETURN(MATCH_NOMATCH);
4462 }
4463 switch(*eptr++)
4464 {
4465 default: RRETURN(MATCH_NOMATCH);
4466
4467 case 0x000d:
4468 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4469 break;
4470
4471 case 0x000a:
4472 break;
4473
4474 case 0x000b:
4475 case 0x000c:
4476 case 0x0085:
4477 #ifdef COMPILE_PCRE16
4478 case 0x2028:
4479 case 0x2029:
4480 #endif
4481 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4482 break;
4483 }
4484 }
4485 break;
4486
4487 case OP_NOT_HSPACE:
4488 for (i = 1; i <= min; i++)
4489 {
4490 if (eptr >= md->end_subject)
4491 {
4492 SCHECK_PARTIAL();
4493 RRETURN(MATCH_NOMATCH);
4494 }
4495 switch(*eptr++)
4496 {
4497 default: break;
4498 case 0x09: /* HT */
4499 case 0x20: /* SPACE */
4500 case 0xa0: /* NBSP */
4501 #ifdef COMPILE_PCRE16
4502 case 0x1680: /* OGHAM SPACE MARK */
4503 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4504 case 0x2000: /* EN QUAD */
4505 case 0x2001: /* EM QUAD */
4506 case 0x2002: /* EN SPACE */
4507 case 0x2003: /* EM SPACE */
4508 case 0x2004: /* THREE-PER-EM SPACE */
4509 case 0x2005: /* FOUR-PER-EM SPACE */
4510 case 0x2006: /* SIX-PER-EM SPACE */
4511 case 0x2007: /* FIGURE SPACE */
4512 case 0x2008: /* PUNCTUATION SPACE */
4513 case 0x2009: /* THIN SPACE */
4514 case 0x200A: /* HAIR SPACE */
4515 case 0x202f: /* NARROW NO-BREAK SPACE */
4516 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4517 case 0x3000: /* IDEOGRAPHIC SPACE */
4518 #endif
4519 RRETURN(MATCH_NOMATCH);
4520 }
4521 }
4522 break;
4523
4524 case OP_HSPACE:
4525 for (i = 1; i <= min; i++)
4526 {
4527 if (eptr >= md->end_subject)
4528 {
4529 SCHECK_PARTIAL();
4530 RRETURN(MATCH_NOMATCH);
4531 }
4532 switch(*eptr++)
4533 {
4534 default: RRETURN(MATCH_NOMATCH);
4535 case 0x09: /* HT */
4536 case 0x20: /* SPACE */
4537 case 0xa0: /* NBSP */
4538 #ifdef COMPILE_PCRE16
4539 case 0x1680: /* OGHAM SPACE MARK */
4540 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4541 case 0x2000: /* EN QUAD */
4542 case 0x2001: /* EM QUAD */
4543 case 0x2002: /* EN SPACE */
4544 case 0x2003: /* EM SPACE */
4545 case 0x2004: /* THREE-PER-EM SPACE */
4546 case 0x2005: /* FOUR-PER-EM SPACE */
4547 case 0x2006: /* SIX-PER-EM SPACE */
4548 case 0x2007: /* FIGURE SPACE */
4549 case 0x2008: /* PUNCTUATION SPACE */
4550 case 0x2009: /* THIN SPACE */
4551 case 0x200A: /* HAIR SPACE */
4552 case 0x202f: /* NARROW NO-BREAK SPACE */
4553 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4554 case 0x3000: /* IDEOGRAPHIC SPACE */
4555 #endif
4556 break;
4557 }
4558 }
4559 break;
4560
4561 case OP_NOT_VSPACE:
4562 for (i = 1; i <= min; i++)
4563 {
4564 if (eptr >= md->end_subject)
4565 {
4566 SCHECK_PARTIAL();
4567 RRETURN(MATCH_NOMATCH);
4568 }
4569 switch(*eptr++)
4570 {
4571 default: break;
4572 case 0x0a: /* LF */
4573 case 0x0b: /* VT */
4574 case 0x0c: /* FF */
4575 case 0x0d: /* CR */
4576 case 0x85: /* NEL */
4577 #ifdef COMPILE_PCRE16
4578 case 0x2028: /* LINE SEPARATOR */
4579 case 0x2029: /* PARAGRAPH SEPARATOR */
4580 #endif
4581 RRETURN(MATCH_NOMATCH);
4582 }
4583 }
4584 break;
4585
4586 case OP_VSPACE:
4587 for (i = 1; i <= min; i++)
4588 {
4589 if (eptr >= md->end_subject)
4590 {
4591 SCHECK_PARTIAL();
4592 RRETURN(MATCH_NOMATCH);
4593 }
4594 switch(*eptr++)
4595 {
4596 default: RRETURN(MATCH_NOMATCH);
4597 case 0x0a: /* LF */
4598 case 0x0b: /* VT */
4599 case 0x0c: /* FF */
4600 case 0x0d: /* CR */
4601 case 0x85: /* NEL */
4602 #ifdef COMPILE_PCRE16
4603 case 0x2028: /* LINE SEPARATOR */
4604 case 0x2029: /* PARAGRAPH SEPARATOR */
4605 #endif
4606 break;
4607 }
4608 }
4609 break;
4610
4611 case OP_NOT_DIGIT:
4612 for (i = 1; i <= min; i++)
4613 {
4614 if (eptr >= md->end_subject)
4615 {
4616 SCHECK_PARTIAL();
4617 RRETURN(MATCH_NOMATCH);
4618 }
4619 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4620 RRETURN(MATCH_NOMATCH);
4621 eptr++;
4622 }
4623 break;
4624
4625 case OP_DIGIT:
4626 for (i = 1; i <= min; i++)
4627 {
4628 if (eptr >= md->end_subject)
4629 {
4630 SCHECK_PARTIAL();
4631 RRETURN(MATCH_NOMATCH);
4632 }
4633 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4634 RRETURN(MATCH_NOMATCH);
4635 eptr++;
4636 }
4637 break;
4638
4639 case OP_NOT_WHITESPACE:
4640 for (i = 1; i <= min; i++)
4641 {
4642 if (eptr >= md->end_subject)
4643 {
4644 SCHECK_PARTIAL();
4645 RRETURN(MATCH_NOMATCH);
4646 }
4647 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4648 RRETURN(MATCH_NOMATCH);
4649 eptr++;
4650 }
4651 break;
4652
4653 case OP_WHITESPACE:
4654 for (i = 1; i <= min; i++)
4655 {
4656 if (eptr >= md->end_subject)
4657 {
4658 SCHECK_PARTIAL();
4659 RRETURN(MATCH_NOMATCH);
4660 }
4661 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4662 RRETURN(MATCH_NOMATCH);
4663 eptr++;
4664 }
4665 break;
4666
4667 case OP_NOT_WORDCHAR:
4668 for (i = 1; i <= min; i++)
4669 {
4670 if (eptr >= md->end_subject)
4671 {
4672 SCHECK_PARTIAL();
4673 RRETURN(MATCH_NOMATCH);
4674 }
4675 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4676 RRETURN(MATCH_NOMATCH);
4677 eptr++;
4678 }
4679 break;
4680
4681 case OP_WORDCHAR:
4682 for (i = 1; i <= min; i++)
4683 {
4684 if (eptr >= md->end_subject)
4685 {
4686 SCHECK_PARTIAL();
4687 RRETURN(MATCH_NOMATCH);
4688 }
4689 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4690 RRETURN(MATCH_NOMATCH);
4691 eptr++;
4692 }
4693 break;
4694
4695 default:
4696 RRETURN(PCRE_ERROR_INTERNAL);
4697 }
4698 }
4699
4700 /* If min = max, continue at the same level without recursing */
4701
4702 if (min == max) continue;
4703
4704 /* If minimizing, we have to test the rest of the pattern before each
4705 subsequent match. Again, separate the UTF-8 case for speed, and also
4706 separate the UCP cases. */
4707
4708 if (minimize)
4709 {
4710 #ifdef SUPPORT_UCP
4711 if (prop_type >= 0)
4712 {
4713 switch(prop_type)
4714 {
4715 case PT_ANY:
4716 for (fi = min;; fi++)
4717 {
4718 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4720 if (fi >= max) RRETURN(MATCH_NOMATCH);
4721 if (eptr >= md->end_subject)
4722 {
4723 SCHECK_PARTIAL();
4724 RRETURN(MATCH_NOMATCH);
4725 }
4726 GETCHARINCTEST(c, eptr);
4727 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4728 }
4729 /* Control never gets here */
4730
4731 case PT_LAMP:
4732 for (fi = min;; fi++)
4733 {
4734 int chartype;
4735 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4736 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4737 if (fi >= max) RRETURN(MATCH_NOMATCH);
4738 if (eptr >= md->end_subject)
4739 {
4740 SCHECK_PARTIAL();
4741 RRETURN(MATCH_NOMATCH);
4742 }
4743 GETCHARINCTEST(c, eptr);
4744 chartype = UCD_CHARTYPE(c);
4745 if ((chartype == ucp_Lu ||
4746 chartype == ucp_Ll ||
4747 chartype == ucp_Lt) == prop_fail_result)
4748 RRETURN(MATCH_NOMATCH);
4749 }
4750 /* Control never gets here */
4751
4752 case PT_GC:
4753 for (fi = min;; fi++)
4754 {
4755 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4756 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4757 if (fi >= max) RRETURN(MATCH_NOMATCH);
4758 if (eptr >= md->end_subject)
4759 {
4760 SCHECK_PARTIAL();
4761 RRETURN(MATCH_NOMATCH);
4762 }
4763 GETCHARINCTEST(c, eptr);
4764 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4765 RRETURN(MATCH_NOMATCH);
4766 }
4767 /* Control never gets here */
4768
4769 case PT_PC:
4770 for (fi = min;; fi++)
4771 {
4772 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4774 if (fi >= max) RRETURN(MATCH_NOMATCH);
4775 if (eptr >= md->end_subject)
4776 {
4777 SCHECK_PARTIAL();
4778 RRETURN(MATCH_NOMATCH);
4779 }
4780 GETCHARINCTEST(c, eptr);
4781 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4782 RRETURN(MATCH_NOMATCH);
4783 }
4784 /* Control never gets here */
4785
4786 case PT_SC:
4787 for (fi = min;; fi++)
4788 {
4789 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4791 if (fi >= max) RRETURN(MATCH_NOMATCH);
4792 if (eptr >= md->end_subject)
4793 {
4794 SCHECK_PARTIAL();
4795 RRETURN(MATCH_NOMATCH);
4796 }
4797 GETCHARINCTEST(c, eptr);
4798 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4799 RRETURN(MATCH_NOMATCH);
4800 }
4801 /* Control never gets here */
4802
4803 case PT_ALNUM:
4804 for (fi = min;; fi++)
4805 {
4806 int category;
4807 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4809 if (fi >= max) RRETURN(MATCH_NOMATCH);
4810 if (eptr >= md->end_subject)
4811 {
4812 SCHECK_PARTIAL();
4813 RRETURN(MATCH_NOMATCH);
4814 }
4815 GETCHARINCTEST(c, eptr);
4816 category = UCD_CATEGORY(c);
4817 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4818 RRETURN(MATCH_NOMATCH);
4819 }
4820 /* Control never gets here */
4821
4822 case PT_SPACE: /* Perl space */
4823 for (fi = min;; fi++)
4824 {
4825 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4826 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4827 if (fi >= max) RRETURN(MATCH_NOMATCH);
4828 if (eptr >= md->end_subject)
4829 {
4830 SCHECK_PARTIAL();
4831 RRETURN(MATCH_NOMATCH);
4832 }
4833 GETCHARINCTEST(c, eptr);
4834 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4835 c == CHAR_FF || c == CHAR_CR)
4836 == prop_fail_result)
4837 RRETURN(MATCH_NOMATCH);
4838 }
4839 /* Control never gets here */
4840
4841 case PT_PXSPACE: /* POSIX space */
4842 for (fi = min;; fi++)
4843 {
4844 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4845 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4846 if (fi >= max) RRETURN(MATCH_NOMATCH);
4847 if (eptr >= md->end_subject)
4848 {
4849 SCHECK_PARTIAL();
4850 RRETURN(MATCH_NOMATCH);
4851 }
4852 GETCHARINCTEST(c, eptr);
4853 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4854 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4855 == prop_fail_result)
4856 RRETURN(MATCH_NOMATCH);
4857 }
4858 /* Control never gets here */
4859
4860 case PT_WORD:
4861 for (fi = min;; fi++)
4862 {
4863 int category;
4864 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4865 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4866 if (fi >= max) RRETURN(MATCH_NOMATCH);
4867 if (eptr >= md->end_subject)
4868 {
4869 SCHECK_PARTIAL();
4870 RRETURN(MATCH_NOMATCH);
4871 }
4872 GETCHARINCTEST(c, eptr);
4873 category = UCD_CATEGORY(c);
4874 if ((category == ucp_L ||
4875 category == ucp_N ||
4876 c == CHAR_UNDERSCORE)
4877 == prop_fail_result)
4878 RRETURN(MATCH_NOMATCH);
4879 }
4880 /* Control never gets here */
4881
4882 /* This should never occur */
4883
4884 default:
4885 RRETURN(PCRE_ERROR_INTERNAL);
4886 }
4887 }
4888
4889 /* Match extended Unicode sequences. We will get here only if the
4890 support is in the binary; otherwise a compile-time error occurs. */
4891
4892 else if (ctype == OP_EXTUNI)
4893 {
4894 for (fi = min;; fi++)
4895 {
4896 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4897 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4898 if (fi >= max) RRETURN(MATCH_NOMATCH);
4899 if (eptr >= md->end_subject)
4900 {
4901 SCHECK_PARTIAL();
4902 RRETURN(MATCH_NOMATCH);
4903 }
4904 GETCHARINCTEST(c, eptr);
4905 if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
4906 while (eptr < md->end_subject)
4907 {
4908 int len = 1;
4909 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4910 if (UCD_CATEGORY(c) != ucp_M) break;
4911 eptr += len;
4912 }
4913 }
4914 }
4915 else
4916 #endif /* SUPPORT_UCP */
4917
4918 #ifdef SUPPORT_UTF
4919 if (utf)
4920 {
4921 for (fi = min;; fi++)
4922 {
4923 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4925 if (fi >= max) RRETURN(MATCH_NOMATCH);
4926 if (eptr >= md->end_subject)
4927 {
4928 SCHECK_PARTIAL();
4929 RRETURN(MATCH_NOMATCH);
4930 }
4931 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4932 RRETURN(MATCH_NOMATCH);
4933 GETCHARINC(c, eptr);
4934 switch(ctype)
4935 {
4936 case OP_ANY: /* This is the non-NL case */
4937 case OP_ALLANY:
4938 case OP_ANYBYTE:
4939 break;
4940
4941 case OP_ANYNL:
4942 switch(c)
4943 {
4944 default: RRETURN(MATCH_NOMATCH);
4945 case 0x000d:
4946 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4947 break;
4948 case 0x000a:
4949 break;
4950
4951 case 0x000b:
4952 case 0x000c:
4953 case 0x0085:
4954 case 0x2028:
4955 case 0x2029:
4956 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4957 break;
4958 }
4959 break;
4960
4961 case OP_NOT_HSPACE:
4962 switch(c)
4963 {
4964 default: break;
4965 case 0x09: /* HT */
4966 case 0x20: /* SPACE */
4967 case 0xa0: /* NBSP */
4968 case 0x1680: /* OGHAM SPACE MARK */
4969 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4970 case 0x2000: /* EN QUAD */
4971 case 0x2001: /* EM QUAD */
4972 case 0x2002: /* EN SPACE */
4973 case 0x2003: /* EM SPACE */
4974 case 0x2004: /* THREE-PER-EM SPACE */
4975 case 0x2005: /* FOUR-PER-EM SPACE */
4976 case 0x2006: /* SIX-PER-EM SPACE */
4977 case 0x2007: /* FIGURE SPACE */
4978 case 0x2008: /* PUNCTUATION SPACE */
4979 case 0x2009: /* THIN SPACE */
4980 case 0x200A: /* HAIR SPACE */
4981 case 0x202f: /* NARROW NO-BREAK SPACE */
4982 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4983 case 0x3000: /* IDEOGRAPHIC SPACE */
4984 RRETURN(MATCH_NOMATCH);
4985 }
4986 break;
4987
4988 case OP_HSPACE:
4989 switch(c)
4990 {
4991 default: RRETURN(MATCH_NOMATCH);
4992 case 0x09: /* HT */
4993 case 0x20: /* SPACE */
4994 case 0xa0: /* NBSP */
4995 case 0x1680: /* OGHAM SPACE MARK */
4996 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4997 case 0x2000: /* EN QUAD */
4998 case 0x2001: /* EM QUAD */
4999 case 0x2002: /* EN SPACE */
5000 case 0x2003: /* EM SPACE */
5001 case 0x2004: /* THREE-PER-EM SPACE */
5002 case 0x2005: /* FOUR-PER-EM SPACE */
5003 case 0x2006: /* SIX-PER-EM SPACE */
5004 case 0x2007: /* FIGURE SPACE */
5005 case 0x2008: /* PUNCTUATION SPACE */
5006 case 0x2009: /* THIN SPACE */
5007 case 0x200A: /* HAIR SPACE */
5008 case 0x202f: /* NARROW NO-BREAK SPACE */
5009 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5010 case 0x3000: /* IDEOGRAPHIC SPACE */
5011 break;
5012 }
5013 break;
5014
5015 case OP_NOT_VSPACE:
5016 switch(c)
5017 {
5018 default: break;
5019 case 0x0a: /* LF */
5020 case 0x0b: /* VT */
5021 case 0x0c: /* FF */
5022 case 0x0d: /* CR */
5023 case 0x85: /* NEL */
5024 case 0x2028: /* LINE SEPARATOR */
5025 case 0x2029: /* PARAGRAPH SEPARATOR */
5026 RRETURN(MATCH_NOMATCH);
5027 }
5028 break;
5029
5030 case OP_VSPACE:
5031 switch(c)
5032 {
5033 default: RRETURN(MATCH_NOMATCH);
5034 case 0x0a: /* LF */
5035 case 0x0b: /* VT */
5036 case 0x0c: /* FF */
5037 case 0x0d: /* CR */
5038 case 0x85: /* NEL */
5039 case 0x2028: /* LINE SEPARATOR */
5040 case 0x2029: /* PARAGRAPH SEPARATOR */
5041 break;
5042 }
5043 break;
5044
5045 case OP_NOT_DIGIT:
5046 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
5047 RRETURN(MATCH_NOMATCH);
5048 break;
5049
5050 case OP_DIGIT:
5051 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
5052 RRETURN(MATCH_NOMATCH);
5053 break;
5054
5055 case OP_NOT_WHITESPACE:
5056 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
5057 RRETURN(MATCH_NOMATCH);
5058 break;
5059
5060 case OP_WHITESPACE:
5061 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5062 RRETURN(MATCH_NOMATCH);
5063 break;
5064
5065 case OP_NOT_WORDCHAR:
5066 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
5067 RRETURN(MATCH_NOMATCH);
5068 break;
5069
5070 case OP_WORDCHAR:
5071 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
5072 RRETURN(MATCH_NOMATCH);
5073 break;
5074
5075 default:
5076 RRETURN(PCRE_ERROR_INTERNAL);
5077 }
5078 }
5079 }
5080 else
5081 #endif
5082 /* Not UTF mode */
5083 {
5084 for (fi = min;; fi++)
5085 {
5086 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
5087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5088 if (fi >= max) RRETURN(MATCH_NOMATCH);
5089 if (eptr >= md->end_subject)
5090 {
5091 SCHECK_PARTIAL();
5092 RRETURN(MATCH_NOMATCH);
5093 }
5094 if (ctype == OP_ANY && IS_NEWLINE(eptr))
5095 RRETURN(MATCH_NOMATCH);
5096 c = *eptr++;
5097 switch(ctype)
5098 {
5099 case OP_ANY: /* This is the non-NL case */
5100 case OP_ALLANY:
5101 case OP_ANYBYTE:
5102 break;
5103
5104 case OP_ANYNL:
5105 switch(c)
5106 {
5107 default: RRETURN(MATCH_NOMATCH);
5108 case 0x000d:
5109 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5110 break;
5111
5112 case 0x000a:
5113 break;
5114
5115 case 0x000b:
5116 case 0x000c:
5117 case 0x0085:
5118 #ifdef COMPILE_PCRE16
5119 case 0x2028:
5120 case 0x2029:
5121 #endif
5122 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
5123 break;
5124 }
5125 break;
5126
5127 case OP_NOT_HSPACE:
5128 switch(c)
5129 {
5130 default: break;
5131 case 0x09: /* HT */
5132 case 0x20: /* SPACE */
5133 case 0xa0: /* NBSP */
5134 #ifdef COMPILE_PCRE16
5135 case 0x1680: /* OGHAM SPACE MARK */
5136 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5137 case 0x2000: /* EN QUAD */
5138 case 0x2001: /* EM QUAD */
5139 case 0x2002: /* EN SPACE */
5140 case 0x2003: /* EM SPACE */
5141 case 0x2004: /* THREE-PER-EM SPACE */
5142 case 0x2005: /* FOUR-PER-EM SPACE */
5143 case 0x2006: /* SIX-PER-EM SPACE */
5144 case 0x2007: /* FIGURE SPACE */
5145 case 0x2008: /* PUNCTUATION SPACE */
5146 case 0x2009: /* THIN SPACE */
5147 case 0x200A: /* HAIR SPACE */
5148 case 0x202f: /* NARROW NO-BREAK SPACE */
5149 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5150 case 0x3000: /* IDEOGRAPHIC SPACE */
5151 #endif
5152 RRETURN(MATCH_NOMATCH);
5153 }
5154 break;
5155
5156 case OP_HSPACE:
5157 switch(c)
5158 {
5159 default: RRETURN(MATCH_NOMATCH);
5160 case 0x09: /* HT */
5161 case 0x20: /* SPACE */
5162 case 0xa0: /* NBSP */
5163 #ifdef COMPILE_PCRE16
5164 case 0x1680: /* OGHAM SPACE MARK */
5165 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5166 case 0x2000: /* EN QUAD */
5167 case 0x2001: /* EM QUAD */
5168 case 0x2002: /* EN SPACE */
5169 case 0x2003: /* EM SPACE */
5170 case 0x2004: /* THREE-PER-EM SPACE */
5171 case 0x2005: /* FOUR-PER-EM SPACE */
5172 case 0x2006: /* SIX-PER-EM SPACE */
5173 case 0x2007: /* FIGURE SPACE */
5174 case 0x2008: /* PUNCTUATION SPACE */
5175 case 0x2009: /* THIN SPACE */
5176 case 0x200A: /* HAIR SPACE */
5177 case 0x202f: /* NARROW NO-BREAK SPACE */
5178 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5179 case 0x3000: /* IDEOGRAPHIC SPACE */
5180 #endif
5181 break;
5182 }
5183 break;
5184
5185 case OP_NOT_VSPACE:
5186 switch(c)
5187 {
5188 default: break;
5189 case 0x0a: /* LF */
5190 case 0x0b: /* VT */
5191 case 0x0c: /* FF */
5192 case 0x0d: /* CR */
5193 case 0x85: /* NEL */
5194 #ifdef COMPILE_PCRE16
5195 case 0x2028: /* LINE SEPARATOR */
5196 case 0x2029: /* PARAGRAPH SEPARATOR */
5197 #endif
5198 RRETURN(MATCH_NOMATCH);
5199 }
5200 break;
5201
5202 case OP_VSPACE:
5203 switch(c)
5204 {
5205 default: RRETURN(MATCH_NOMATCH);
5206 case 0x0a: /* LF */
5207 case 0x0b: /* VT */
5208 case 0x0c: /* FF */
5209 case 0x0d: /* CR */
5210 case 0x85: /* NEL */
5211 #ifdef COMPILE_PCRE16
5212 case 0x2028: /* LINE SEPARATOR */
5213 case 0x2029: /* PARAGRAPH SEPARATOR */
5214 #endif
5215 break;
5216 }
5217 break;
5218
5219 case OP_NOT_DIGIT:
5220 if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
5221 break;
5222
5223 case OP_DIGIT:
5224 if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
5225 break;
5226
5227 case OP_NOT_WHITESPACE:
5228 if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
5229 break;
5230
5231 case OP_WHITESPACE:
5232 if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
5233 break;
5234
5235 case OP_NOT_WORDCHAR:
5236 if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
5237 break;
5238
5239 case OP_WORDCHAR:
5240 if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
5241 break;
5242
5243 default:
5244 RRETURN(PCRE_ERROR_INTERNAL);
5245 }
5246 }
5247 }
5248 /* Control never gets here */
5249 }
5250
5251 /* If maximizing, it is worth using inline code for speed, doing the type
5252 test once at the start (i.e. keep it out of the loop). Again, keep the
5253 UTF-8 and UCP stuff separate. */
5254
5255 else
5256 {
5257 pp = eptr; /* Remember where we started */
5258
5259 #ifdef SUPPORT_UCP
5260 if (prop_type >= 0)
5261 {
5262 switch(prop_type)
5263 {
5264 case PT_ANY:
5265 for (i = min; i < max; i++)
5266 {
5267 int len = 1;
5268 if (eptr >= md->end_subject)
5269 {
5270 SCHECK_PARTIAL();
5271 break;
5272 }
5273 GETCHARLENTEST(c, eptr, len);
5274 if (prop_fail_result) break;
5275 eptr+= len;
5276 }
5277 break;
5278
5279 case PT_LAMP:
5280 for (i = min; i < max; i++)
5281 {
5282 int chartype;
5283 int len = 1;
5284 if (eptr >= md->end_subject)
5285 {
5286 SCHECK_PARTIAL();
5287 break;
5288 }
5289 GETCHARLENTEST(c, eptr, len);
5290 chartype = UCD_CHARTYPE(c);
5291 if ((chartype == ucp_Lu ||
5292 chartype == ucp_Ll ||
5293 chartype == ucp_Lt) == prop_fail_result)
5294 break;
5295 eptr+= len;
5296 }
5297 break;
5298
5299 case PT_GC:
5300 for (i = min; i < max; i++)
5301 {
5302 int len = 1;
5303 if (eptr >= md->end_subject)
5304 {
5305 SCHECK_PARTIAL();
5306 break;
5307 }
5308 GETCHARLENTEST(c, eptr, len);
5309 if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
5310 eptr+= len;
5311 }
5312 break;
5313
5314 case PT_PC:
5315 for (i = min; i < max; i++)
5316 {
5317 int len = 1;
5318 if (eptr >= md->end_subject)
5319 {
5320 SCHECK_PARTIAL();
5321 break;
5322 }
5323 GETCHARLENTEST(c, eptr, len);
5324 if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
5325 eptr+= len;
5326 }
5327 break;
5328
5329 case PT_SC:
5330 for (i = min; i < max; i++)
5331 {
5332 int len = 1;
5333 if (eptr >= md->end_subject)
5334 {
5335 SCHECK_PARTIAL();
5336 break;
5337 }
5338 GETCHARLENTEST(c, eptr, len);
5339 if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
5340 eptr+= len;
5341 }
5342 break;
5343
5344 case PT_ALNUM:
5345 for (i = min; i < max; i++)
5346 {
5347 int category;
5348 int len = 1;
5349 if (eptr >= md->end_subject)
5350 {
5351 SCHECK_PARTIAL();
5352 break;
5353 }
5354 GETCHARLENTEST(c, eptr, len);
5355 category = UCD_CATEGORY(c);
5356 if ((category == ucp_L || category == ucp_N) == prop_fail_result)
5357 break;
5358 eptr+= len;
5359 }
5360 break;
5361
5362 case PT_SPACE: /* Perl space */
5363 for (i = min; i < max; i++)
5364 {
5365 int len = 1;
5366 if (eptr >= md->end_subject)
5367 {
5368 SCHECK_PARTIAL();
5369 break;
5370 }
5371 GETCHARLENTEST(c, eptr, len);
5372 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5373 c == CHAR_FF || c == CHAR_CR)
5374 == prop_fail_result)
5375 break;
5376 eptr+= len;
5377 }
5378 break;
5379
5380 case PT_PXSPACE: /* POSIX space */
5381 for (i = min; i < max; i++)
5382 {
5383 int len = 1;
5384 if (eptr >= md->end_subject)
5385 {
5386 SCHECK_PARTIAL();
5387 break;
5388 }
5389 GETCHARLENTEST(c, eptr, len);
5390 if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5391 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5392 == prop_fail_result)
5393 break;
5394 eptr+= len;
5395 }
5396 break;
5397
5398 case PT_WORD:
5399 for (i = min; i < max; i++)
5400 {
5401 int category;
5402 int len = 1;
5403 if (eptr >= md->end_subject)
5404 {
5405 SCHECK_PARTIAL();
5406 break;
5407 }
5408 GETCHARLENTEST(c, eptr, len);
5409 category = UCD_CATEGORY(c);
5410 if ((category == ucp_L || category == ucp_N ||
5411 c == CHAR_UNDERSCORE) == prop_fail_result)
5412 break;
5413 eptr+= len;
5414 }
5415 break;
5416
5417 default:
5418 RRETURN(PCRE_ERROR_INTERNAL);
5419 }
5420
5421 /* eptr is now past the end of the maximum run */
5422
5423 if (possessive) continue;
5424 for(;;)
5425 {
5426 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5427 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5428 if (eptr-- == pp) break; /* Stop if tried at original pos */
5429 if (utf) BACKCHAR(eptr);
5430 }
5431 }
5432
5433 /* Match extended Unicode sequences. We will get here only if the
5434 support is in the binary; otherwise a compile-time error occurs. */
5435
5436 else if (ctype == OP_EXTUNI)
5437 {
5438 for (i = min; i < max; i++)
5439 {
5440 int len = 1;
5441 if (eptr >= md->end_subject)
5442 {
5443 SCHECK_PARTIAL();
5444 break;
5445 }
5446 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5447 if (UCD_CATEGORY(c) == ucp_M) break;
5448 eptr += len;
5449 while (eptr < md->end_subject)
5450 {
5451 len = 1;
5452 if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5453 if (UCD_CATEGORY(c) != ucp_M) break;
5454 eptr += len;
5455 }
5456 }
5457
5458 /* eptr is now past the end of the maximum run */
5459
5460 if (possessive) continue;
5461
5462 for(;;)
5463 {
5464 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5466 if (eptr-- == pp) break; /* Stop if tried at original pos */
5467 for (;;) /* Move back over one extended */
5468 {
5469 if (!utf) c = *eptr; else
5470 {
5471 BACKCHAR(eptr);
5472 GETCHAR(c, eptr);
5473 }
5474 if (UCD_CATEGORY(c) != ucp_M) break;
5475 eptr--;
5476 }
5477 }
5478 }
5479
5480 else
5481 #endif /* SUPPORT_UCP */
5482
5483 #ifdef SUPPORT_UTF
5484 if (utf)
5485 {
5486 switch(ctype)
5487 {
5488 case OP_ANY:
5489 if (max < INT_MAX)
5490 {
5491 for (i = min; i < max; i++)
5492 {
5493 if (eptr >= md->end_subject)
5494 {
5495 SCHECK_PARTIAL();
5496 break;
5497 }
5498 if (IS_NEWLINE(eptr)) break;
5499 eptr++;
5500 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5501 }
5502 }
5503
5504 /* Handle unlimited UTF-8 repeat */
5505
5506 else
5507 {
5508 for (i = min; i < max; i++)
5509 {
5510 if (eptr >= md->end_subject)
5511 {
5512 SCHECK_PARTIAL();
5513 break;
5514 }
5515 if (IS_NEWLINE(eptr)) break;
5516 eptr++;
5517 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5518 }
5519 }
5520 break;
5521
5522 case OP_ALLANY:
5523 if (max < INT_MAX)
5524 {
5525 for (i = min; i < max; i++)
5526 {
5527 if (eptr >= md->end_subject)
5528 {
5529 SCHECK_PARTIAL();
5530 break;
5531 }
5532 eptr++;
5533 ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
5534 }
5535 }
5536 else
5537 {
5538 eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5539 SCHECK_PARTIAL();
5540 }
5541 break;
5542
5543 /* The byte case is the same as non-UTF8 */
5544
5545 case OP_ANYBYTE:
5546 c = max - min;
5547 if (c > (unsigned int)(md->end_subject - eptr))
5548 {
5549 eptr = md->end_subject;
5550 SCHECK_PARTIAL();
5551 }
5552 else eptr += c;
5553 break;
5554
5555 case OP_ANYNL:
5556 for (i = min; i < max; i++)
5557 {
5558 int len = 1;
5559 if (eptr >= md->end_subject)
5560 {
5561 SCHECK_PARTIAL();
5562 break;
5563 }
5564 GETCHARLEN(c, eptr, len);
5565 if (c == 0x000d)
5566 {
5567 if (++eptr >= md->end_subject) break;
5568 if (*eptr == 0x000a) eptr++;
5569 }
5570 else
5571 {
5572 if (c != 0x000a &&
5573 (md->bsr_anycrlf ||
5574 (c != 0x000b && c != 0x000c &&
5575 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5576 break;
5577 eptr += len;
5578 }
5579 }
5580 break;
5581
5582 case OP_NOT_HSPACE:
5583 case OP_HSPACE:
5584 for (i = min; i < max; i++)
5585 {
5586 BOOL gotspace;
5587 int len = 1;
5588 if (eptr >= md->end_subject)
5589 {
5590 SCHECK_PARTIAL();
5591 break;
5592 }
5593 GETCHARLEN(c, eptr, len);
5594 switch(c)
5595 {
5596 default: gotspace = FALSE; break;
5597 case 0x09: /* HT */
5598 case 0x20: /* SPACE */
5599 case 0xa0: /* NBSP */
5600 case 0x1680: /* OGHAM SPACE MARK */
5601 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5602 case 0x2000: /* EN QUAD */
5603 case 0x2001: /* EM QUAD */
5604 case 0x2002: /* EN SPACE */
5605 case 0x2003: /* EM SPACE */
5606 case 0x2004: /* THREE-PER-EM SPACE */
5607 case 0x2005: /* FOUR-PER-EM SPACE */
5608 case 0x2006: /* SIX-PER-EM SPACE */
5609 case 0x2007: /* FIGURE SPACE */
5610 case 0x2008: /* PUNCTUATION SPACE */
5611 case 0x2009: /* THIN SPACE */
5612 case 0x200A: /* HAIR SPACE */
5613 case 0x202f: /* NARROW NO-BREAK SPACE */
5614 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5615 case 0x3000: /* IDEOGRAPHIC SPACE */
5616 gotspace = TRUE;
5617 break;
5618 }
5619 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5620 eptr += len;
5621 }
5622 break;
5623
5624 case OP_NOT_VSPACE:
5625 case OP_VSPACE:
5626 for (i = min; i < max; i++)
5627 {
5628 BOOL gotspace;
5629 int len = 1;
5630 if (eptr >= md->end_subject)
5631 {
5632 SCHECK_PARTIAL();
5633 break;
5634 }
5635 GETCHARLEN(c, eptr, len);
5636 switch(c)
5637 {
5638 default: gotspace = FALSE; break;
5639 case 0x0a: /* LF */
5640 case 0x0b: /* VT */
5641 case 0x0c: /* FF */
5642 case 0x0d: /* CR */
5643 case 0x85: /* NEL */
5644 case 0x2028: /* LINE SEPARATOR */
5645 case 0x2029: /* PARAGRAPH SEPARATOR */
5646 gotspace = TRUE;
5647 break;
5648 }
5649 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5650 eptr += len;
5651 }
5652 break;
5653
5654 case OP_NOT_DIGIT:
5655 for (i = min; i < max; i++)
5656 {
5657 int len = 1;
5658 if (eptr >= md->end_subject)
5659 {
5660 SCHECK_PARTIAL();
5661 break;
5662 }
5663 GETCHARLEN(c, eptr, len);
5664 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5665 eptr+= len;
5666 }
5667 break;
5668
5669 case OP_DIGIT:
5670 for (i = min; i < max; i++)
5671 {
5672 int len = 1;
5673 if (eptr >= md->end_subject)
5674 {
5675 SCHECK_PARTIAL();
5676 break;
5677 }
5678 GETCHARLEN(c, eptr, len);
5679 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5680 eptr+= len;
5681 }
5682 break;
5683
5684 case OP_NOT_WHITESPACE:
5685 for (i = min; i < max; i++)
5686 {
5687 int len = 1;
5688 if (eptr >= md->end_subject)
5689 {
5690 SCHECK_PARTIAL();
5691 break;
5692 }
5693 GETCHARLEN(c, eptr, len);
5694 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5695 eptr+= len;
5696 }
5697 break;
5698
5699 case OP_WHITESPACE:
5700 for (i = min; i < max; i++)
5701 {
5702 int len = 1;
5703 if (eptr >= md->end_subject)
5704 {
5705 SCHECK_PARTIAL();
5706 break;
5707 }
5708 GETCHARLEN(c, eptr, len);
5709 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5710 eptr+= len;
5711 }
5712 break;
5713
5714 case OP_NOT_WORDCHAR:
5715 for (i = min; i < max; i++)
5716 {
5717 int len = 1;
5718 if (eptr >= md->end_subject)
5719 {
5720 SCHECK_PARTIAL();
5721 break;
5722 }
5723 GETCHARLEN(c, eptr, len);
5724 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5725 eptr+= len;
5726 }
5727 break;
5728
5729 case OP_WORDCHAR:
5730 for (i = min; i < max; i++)
5731 {
5732 int len = 1;
5733 if (eptr >= md->end_subject)
5734 {
5735 SCHECK_PARTIAL();
5736 break;
5737 }
5738 GETCHARLEN(c, eptr, len);
5739 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5740 eptr+= len;
5741 }
5742 break;
5743
5744 default:
5745 RRETURN(PCRE_ERROR_INTERNAL);
5746 }
5747
5748 /* eptr is now past the end of the maximum run. If possessive, we are
5749 done (no backing up). Otherwise, match at this position; anything other
5750 than no match is immediately returned. For nomatch, back up one
5751 character, unless we are matching \R and the last thing matched was
5752 \r\n, in which case, back up two bytes. */
5753
5754 if (possessive) continue;
5755 for(;;)
5756 {
5757 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5759 if (eptr-- == pp) break; /* Stop if tried at original pos */
5760 BACKCHAR(eptr);
5761 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5762 eptr[-1] == '\r') eptr--;
5763 }
5764 }
5765 else
5766 #endif /* SUPPORT_UTF */
5767 /* Not UTF mode */
5768 {
5769 switch(ctype)
5770 {
5771 case OP_ANY:
5772 for (i = min; i < max; i++)
5773 {
5774 if (eptr >= md->end_subject)
5775 {
5776 SCHECK_PARTIAL();
5777 break;
5778 }
5779 if (IS_NEWLINE(eptr)) break;
5780 eptr++;
5781 }
5782 break;
5783
5784 case OP_ALLANY:
5785 case OP_ANYBYTE:
5786 c = max - min;
5787 if (c > (unsigned int)(md->end_subject - eptr))
5788 {
5789 eptr = md->end_subject;
5790 SCHECK_PARTIAL();
5791 }
5792 else eptr += c;
5793 break;
5794
5795 case OP_ANYNL:
5796 for (i = min; i < max; i++)
5797 {
5798 if (eptr >= md->end_subject)
5799 {
5800 SCHECK_PARTIAL();
5801 break;
5802 }
5803 c = *eptr;
5804 if (c == 0x000d)
5805 {
5806 if (++eptr >= md->end_subject) break;
5807 if (*eptr == 0x000a) eptr++;
5808 }
5809 else
5810 {
5811 if (c != 0x000a && (md->bsr_anycrlf ||
5812 (c != 0x000b && c != 0x000c && c != 0x0085
5813 #ifdef COMPILE_PCRE16
5814 && c != 0x2028 && c != 0x2029
5815 #endif
5816 ))) break;
5817 eptr++;
5818 }
5819 }
5820 break;
5821
5822 case OP_NOT_HSPACE:
5823 for (i = min; i < max; i++)
5824 {
5825 if (eptr >= md->end_subject)
5826 {
5827 SCHECK_PARTIAL();
5828 break;
5829 }
5830 c = *eptr;
5831 if (c == 0x09 || c == 0x20 || c == 0xa0
5832 #ifdef COMPILE_PCRE16
5833 || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
5834 || c == 0x202f || c == 0x205f || c == 0x3000
5835 #endif
5836 ) break;
5837 eptr++;
5838 }
5839 break;
5840
5841 case OP_HSPACE:
5842 for (i = min; i < max; i++)
5843 {
5844 if (eptr >= md->end_subject)
5845 {
5846 SCHECK_PARTIAL();
5847 break;
5848 }
5849 c = *eptr;
5850 if (c != 0x09 && c != 0x20 && c != 0xa0
5851 #ifdef COMPILE_PCRE16
5852 && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
5853 && c != 0x202f && c != 0x205f && c != 0x3000
5854 #endif
5855 ) break;
5856 eptr++;
5857 }
5858 break;
5859
5860 case OP_NOT_VSPACE:
5861 for (i = min; i < max; i++)
5862 {
5863 if (eptr >= md->end_subject)
5864 {
5865 SCHECK_PARTIAL();
5866 break;
5867 }
5868 c = *eptr;
5869 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
5870 #ifdef COMPILE_PCRE16
5871 || c == 0x2028 || c == 0x2029
5872 #endif
5873 ) break;
5874 eptr++;
5875 }
5876 break;
5877
5878 case OP_VSPACE:
5879 for (i = min; i < max; i++)
5880 {
5881 if (eptr >= md->end_subject)
5882 {
5883 SCHECK_PARTIAL();
5884 break;
5885 }
5886 c = *eptr;
5887 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
5888 #ifdef COMPILE_PCRE16
5889 && c != 0x2028 && c != 0x2029
5890 #endif
5891 ) break;
5892 eptr++;
5893 }
5894 break;
5895
5896 case OP_NOT_DIGIT:
5897 for (i = min; i < max; i++)
5898 {
5899 if (eptr >= md->end_subject)
5900 {
5901 SCHECK_PARTIAL();
5902 break;
5903 }
5904 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
5905 eptr++;
5906 }
5907 break;
5908
5909 case OP_DIGIT:
5910 for (i = min; i < max; i++)
5911 {
5912 if (eptr >= md->end_subject)
5913 {
5914 SCHECK_PARTIAL();
5915 break;
5916 }
5917 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
5918 eptr++;
5919 }
5920 break;
5921
5922 case OP_NOT_WHITESPACE:
5923 for (i = min; i < max; i++)
5924 {
5925 if (eptr >= md->end_subject)
5926 {
5927 SCHECK_PARTIAL();
5928 break;
5929 }
5930 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
5931 eptr++;
5932 }
5933 break;
5934
5935 case OP_WHITESPACE:
5936 for (i = min; i < max; i++)
5937 {
5938 if (eptr >= md->end_subject)
5939 {
5940 SCHECK_PARTIAL();
5941 break;
5942 }
5943 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
5944 eptr++;
5945 }
5946 break;
5947
5948 case OP_NOT_WORDCHAR:
5949 for (i = min; i < max; i++)
5950 {
5951 if (eptr >= md->end_subject)
5952 {
5953 SCHECK_PARTIAL();
5954 break;
5955 }
5956 if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
5957 eptr++;
5958 }
5959 break;
5960
5961 case OP_WORDCHAR:
5962 for (i = min; i < max; i++)
5963 {
5964 if (eptr >= md->end_subject)
5965 {
5966 SCHECK_PARTIAL();
5967 break;
5968 }
5969 if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
5970 eptr++;
5971 }
5972 break;
5973
5974 default:
5975 RRETURN(PCRE_ERROR_INTERNAL);
5976 }
5977
5978 /* eptr is now past the end of the maximum run. If possessive, we are
5979 done (no backing up). Otherwise, match at this position; anything other
5980 than no match is immediately returned. For nomatch, back up one
5981 character (byte), unless we are matching \R and the last thing matched
5982 was \r\n, in which case, back up two bytes. */
5983
5984 if (possessive) continue;
5985 while (eptr >= pp)
5986 {
5987 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5989 eptr--;
5990 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5991 eptr[-1] == '\r') eptr--;
5992 }
5993 }
5994
5995 /* Get here if we can't make it match with any permitted repetitions */
5996
5997 RRETURN(MATCH_NOMATCH);
5998 }
5999 /* Control never gets here */
6000
6001 /* There's been some horrible disaster. Arrival here can only mean there is
6002 something seriously wrong in the code above or the OP_xxx definitions. */
6003
6004 default:
6005 DPRINTF(("Unknown opcode %d\n", *ecode));
6006 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6007 }
6008
6009 /* Do not stick any code in here without much thought; it is assumed
6010 that "continue" in the code above comes out to here to repeat the main
6011 loop. */
6012
6013 } /* End of main loop */
6014 /* Control never reaches here */
6015
6016
6017 /* When compiling to use the heap rather than the stack for recursive calls to
6018 match(), the RRETURN() macro jumps here. The number that is saved in
6019 frame->Xwhere indicates which label we actually want to return to. */
6020
6021 #ifdef NO_RECURSE
6022 #define LBL(val) case val: goto L_RM##val;
6023 HEAP_RETURN:
6024 switch (frame->Xwhere)
6025 {
6026 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6027 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6028 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6029 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
6030 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6031 LBL(65) LBL(66)
6032 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
6033 LBL(21)
6034 #endif
6035 #ifdef SUPPORT_UTF
6036 LBL(16) LBL(18) LBL(20)
6037 LBL(22) LBL(23) LBL(28) LBL(30)
6038 LBL(32) LBL(34) LBL(42) LBL(46)
6039 #ifdef SUPPORT_UCP
6040 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
6041 LBL(59) LBL(60) LBL(61) LBL(62)
6042 #endif /* SUPPORT_UCP */
6043 #endif /* SUPPORT_UTF */
6044 default:
6045 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6046
6047 printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
6048
6049 return PCRE_ERROR_INTERNAL;
6050 }
6051 #undef LBL
6052 #endif /* NO_RECURSE */
6053 }
6054
6055
6056 /***************************************************************************
6057 ****************************************************************************
6058 RECURSION IN THE match() FUNCTION
6059
6060 Undefine all the macros that were defined above to handle this. */
6061
6062 #ifdef NO_RECURSE /* these names are macros only in the heap-based (no recursion) build */
6063 #undef eptr
6064 #undef ecode
6065 #undef mstart
6066 #undef offset_top
6067 #undef eptrb
6068 #undef flags
6069
6070 #undef callpat
6071 #undef charptr
6072 #undef data
6073 #undef next
6074 #undef pp
6075 #undef prev
6076 #undef saved_eptr
6077
6078 #undef new_recursive
6079
6080 #undef cur_is_word
6081 #undef condition
6082 #undef prev_is_word
6083
6084 #undef ctype
6085 #undef length
6086 #undef max
6087 #undef min
6088 #undef number
6089 #undef offset
6090 #undef op
6091 #undef save_capture_last
6092 #undef save_offset1
6093 #undef save_offset2
6094 #undef save_offset3
6095 #undef stacksave
6096
6097 #undef newptrb
6098
6099 #endif /* NO_RECURSE */
6100
6101 /* These two are defined as macros in both cases */
6102
6103 #undef fc
6104 #undef fi
6105
6106 /***************************************************************************
6107 ***************************************************************************/
6108
6109
6110
6111 /*************************************************
6112 * Execute a Regular Expression *
6113 *************************************************/
6114
6115 /* This function applies a compiled re to a subject string and picks out
6116 portions of the string if it matches. Two elements in the vector are set for
6117 each substring: the offsets to the start and end of the substring.
6118
6119 Arguments:
6120 argument_re points to the compiled expression
6121 extra_data points to extra data or is NULL
6122 subject points to the subject string
6123 length length of subject string (may contain binary zeros)
6124 start_offset where to start in the subject string
6125 options option bits
6126 offsets points to a vector of ints to be filled in with offsets
6127 offsetcount the number of elements in the vector
6128
6129 Returns: > 0 => success; value is the number of elements filled in
6130 = 0 => success, but offsets is not big enough
6131 -1 => failed to match
6132 < -1 => some kind of unexpected problem
6133 */
6134
6135 #ifdef COMPILE_PCRE8
6136 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6137 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6138 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6139 int offsetcount)
6140 #else
6141 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6142 pcre16_exec(const pcre *argument_re, const pcre_extra *extra_data,
6143 PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6144 int offsetcount)
6145 #endif
6146 {
6147 int rc, ocount, arg_offset_max;
6148 int newline;
6149 BOOL using_temporary_offsets = FALSE;
6150 BOOL anchored;
6151 BOOL startline;
6152 BOOL firstline;
6153 BOOL utf;
6154 BOOL has_first_char = FALSE;
6155 BOOL has_req_char = FALSE;
6156 pcre_uchar first_char = 0;
6157 pcre_uchar first_char2 = 0;
6158 pcre_uchar req_char = 0;
6159 pcre_uchar req_char2 = 0;
6160 match_data match_block;
6161 match_data *md = &match_block;
6162 const pcre_uint8 *tables;
6163 const pcre_uint8 *start_bits = NULL;
6164 PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6165 PCRE_PUCHAR end_subject;
6166 PCRE_PUCHAR start_partial = NULL;
6167 PCRE_PUCHAR req_char_ptr = start_match - 1;
6168
6169 const pcre_study_data *study;
6170 const real_pcre *external_re = (const real_pcre *)argument_re;
6171 const real_pcre *re = external_re;
6172
6173 /* Plausibility checks */
6174
6175 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
6176 if (re == NULL || subject == NULL ||
6177 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
6178 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
6179 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
6180
6181 /* These two settings are used in the code for checking a UTF-8 string that
6182 follows immediately afterwards. Other values in the md block are used only
6183 during "normal" pcre_exec() processing, not when the JIT support is in use,
6184 so they are set up later. */
6185
6186 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6187 utf = md->utf = (re->options & PCRE_UTF8) != 0;
6188 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6189 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6190
6191 /* Check a UTF-8 or UTF-16 string if required. Pass back the character offset and error
6192 code for an invalid string if a results vector is available. */
6193
6194 #ifdef SUPPORT_UTF
6195 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6196 {
6197 int erroroffset;
6198 int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6199 if (errorcode != 0)
6200 {
6201 if (offsetcount >= 2)
6202 {
6203 offsets[0] = erroroffset;
6204 offsets[1] = errorcode;
6205 }
6206 #ifdef COMPILE_PCRE16
6207 return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6208 PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
6209 #else
6210 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6211 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6212 #endif
6213 }
6214
6215 /* Check that a start_offset points to the start of a UTF character. */
6216 if (start_offset > 0 && start_offset < length &&
6217 NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6218 return PCRE_ERROR_BADUTF8_OFFSET;
6219 }
6220 #endif
6221
6222 /* If the pattern was successfully studied with JIT support, run the JIT
6223 executable instead of the rest of this function. Most options must be set at
6224 compile time for the JIT code to be usable. Fall back to the normal code path if
6225 an unsupported flag is set. In particular, JIT does not support partial
6226 matching. */
6227
6228 #ifdef SUPPORT_JIT
6229 if (extra_data != NULL
6230 && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
6231 && extra_data->executable_jit != NULL
6232 && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
6233 && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
6234 PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
6235 return PRIV(jit_exec)(re, extra_data->executable_jit,
6236 (const pcre_uchar *)subject, length, start_offset, options,
6237 ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
6238 ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
6239 #endif
6240
6241 /* Carry on with non-JIT matching. This information is for finding all the
6242 numbers associated with a given name, for condition testing. */
6243
6244 md->name_table = (pcre_uchar *)re + re->name_table_offset;
6245 md->name_count = re->name_count;
6246 md->name_entry_size = re->name_entry_size;
6247
6248 /* Fish out the optional data from the extra_data structure, first setting
6249 the default values. */
6250
6251 study = NULL;
6252 md->match_limit = MATCH_LIMIT;
6253 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6254 md->callout_data = NULL;
6255
6256 /* The table pointer is always in native byte order. */
6257
6258 tables = external_re->tables;
6259
6260 if (extra_data != NULL)
6261 {
6262 register unsigned int flags = extra_data->flags;
6263 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6264 study = (const pcre_study_data *)extra_data->study_data;
6265 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6266 md->match_limit = extra_data->match_limit;
6267 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6268 md->match_limit_recursion = extra_data->match_limit_recursion;
6269 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6270 md->callout_data = extra_data->callout_data;
6271 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6272 }
6273
6274 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6275 is a feature that makes it possible to save compiled regex and re-use them
6276 in other programs later. */
6277
6278 if (tables == NULL) tables = PRIV(default_tables);
6279
6280 /* Check that the first field in the block is the magic number. If it is not,
6281 return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6282 REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6283 means that the pattern was probably compiled with a different endianness. */
6284
6285 if (re->magic_number != MAGIC_NUMBER)
6286 return re->magic_number == REVERSED_MAGIC_NUMBER?
6287 PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6288 if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6289
6290 /* Set up other data */
6291
6292 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6293 startline = (re->flags & PCRE_STARTLINE) != 0;
6294 firstline = (re->options & PCRE_FIRSTLINE) != 0;
6295
6296 /* The code starts after the real_pcre block and the capture name table. */
6297
6298 md->start_code = (const pcre_uchar *)external_re + re->name_table_offset +
6299 re->name_count * re->name_entry_size;
6300
6301 md->start_subject = (PCRE_PUCHAR)subject;
6302 md->start_offset = start_offset;
6303 md->end_subject = md->start_subject + length;
6304 end_subject = md->end_subject;
6305
6306 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
6307 md->use_ucp = (re->options & PCRE_UCP) != 0;
6308 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
6309 md->ignore_skip_arg = FALSE;
6310
6311 /* Some options are unpacked into BOOL variables in the hope that testing
6312 them will be faster than individual option bits. */
6313
6314 md->notbol = (options & PCRE_NOTBOL) != 0;
6315 md->noteol = (options & PCRE_NOTEOL) != 0;
6316 md->notempty = (options & PCRE_NOTEMPTY) != 0;
6317 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
6318
6319 md->hitend = FALSE;
6320 md->mark = md->nomatch_mark = NULL; /* In case never set */
6321
6322 md->recursive = NULL; /* No recursion at top level */
6323 md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
6324
6325 md->lcc = tables + lcc_offset;
6326 md->fcc = tables + fcc_offset;
6327 md->ctypes = tables + ctypes_offset;
6328
6329 /* Handle different \R options. */
6330
6331 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6332 {
6333 case 0:
6334 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6335 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6336 else
6337 #ifdef BSR_ANYCRLF
6338 md->bsr_anycrlf = TRUE;
6339 #else
6340 md->bsr_anycrlf = FALSE;
6341 #endif
6342 break;
6343
6344 case PCRE_BSR_ANYCRLF:
6345 md->bsr_anycrlf = TRUE;
6346 break;
6347
6348 case PCRE_BSR_UNICODE:
6349 md->bsr_anycrlf = FALSE;
6350 break;
6351
6352 default: return PCRE_ERROR_BADNEWLINE;
6353 }
6354
6355 /* Handle different types of newline. The three bits give eight cases. If
6356 nothing is set at run time, whatever was used at compile time applies. */
6357
6358 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6359 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6360 {
6361 case 0: newline = NEWLINE; break; /* Compile-time default */
6362 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6363 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6364 case PCRE_NEWLINE_CR+
6365 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6366 case PCRE_NEWLINE_ANY: newline = -1; break;
6367 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6368 default: return PCRE_ERROR_BADNEWLINE;
6369 }
6370
6371 if (newline == -2)
6372 {
6373 md->nltype = NLTYPE_ANYCRLF;
6374 }
6375 else if (newline < 0)
6376 {
6377 md->nltype = NLTYPE_ANY;
6378 }
6379 else
6380 {
6381 md->nltype = NLTYPE_FIXED;
6382 if (newline > 255)
6383 {
6384 md->nllen = 2;
6385 md->nl[0] = (newline >> 8) & 255;
6386 md->nl[1] = newline & 255;
6387 }
6388 else
6389 {
6390 md->nllen = 1;
6391 md->nl[0] = newline;
6392 }
6393 }
6394
6395 /* Partial matching was originally supported only for a restricted set of
6396 regexes; from release 8.00 there are no restrictions, but the bits are still
6397 defined (though never set). So there's no harm in leaving this code. */
6398
6399 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6400 return PCRE_ERROR_BADPARTIAL;
6401
6402 /* If the expression has got more back references than the offsets supplied can
6403 hold, we get a temporary chunk of working store to use during the matching.
6404 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6405 of 3. */
6406
6407 ocount = offsetcount - (offsetcount % 3);
6408 arg_offset_max = (2*ocount)/3;
6409
6410 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6411 {
6412 ocount = re->top_backref * 3 + 3;
6413 md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
6414 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6415 using_temporary_offsets = TRUE;
6416 DPRINTF(("Got memory to hold back references\n"));
6417 }
6418 else md->offset_vector = offsets;
6419
6420 md->offset_end = ocount;
6421 md->offset_max = (2*ocount)/3;
6422 md->offset_overflow = FALSE;
6423 md->capture_last = -1;
6424
6425 /* Reset the working variables associated with each extraction. These should
6426 never be used unless previously set, but they get saved and restored, and so we
6427 initialize them to avoid reading uninitialized locations. Also, unset the
6428 offsets for the matched string. This is really just for tidiness with callouts,
6429 in case they inspect these fields. */
6430
6431 if (md->offset_vector != NULL)
6432 {
6433 register int *iptr = md->offset_vector + ocount;
6434 register int *iend = iptr - re->top_bracket;
6435 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6436 while (--iptr >= iend) *iptr = -1;
6437 md->offset_vector[0] = md->offset_vector[1] = -1;
6438 }
6439
6440 /* Set up the first character to match, if available. The first_char value is
6441 never set for an anchored regular expression, but the anchoring may be forced
6442 at run time, so we have to test for anchoring. The first char may be unset for
6443 an unanchored pattern, of course. If there's no first char and the pattern was
6444 studied, there may be a bitmap of possible first characters. */
6445
6446 if (!anchored)
6447 {
6448 if ((re->flags & PCRE_FIRSTSET) != 0)
6449 {
6450 has_first_char = TRUE;
6451 first_char = first_char2 = re->first_char;
6452 if ((re->flags & PCRE_FCH_CASELESS) != 0)
6453 {
6454 first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6455 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6456 if (utf && first_char > 127)
6457 first_char2 = UCD_OTHERCASE(first_char);
6458 #endif
6459 }
6460 }
6461 else
6462 if (!startline && study != NULL &&
6463 (study->flags & PCRE_STUDY_MAPPED) != 0)
6464 start_bits = study->start_bits;
6465 }
6466
6467 /* For anchored or unanchored matches, there may be a "last known required
6468 character" set. */
6469
6470 if ((re->flags & PCRE_REQCHSET) != 0)
6471 {
6472 has_req_char = TRUE;
6473 req_char = req_char2 = re->req_char;
6474 if ((re->flags & PCRE_RCH_CASELESS) != 0)
6475 {
6476 req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6477 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6478 if (utf && req_char > 127)
6479 req_char2 = UCD_OTHERCASE(req_char);
6480 #endif
6481 }
6482 }
6483
6484
6485 /* ==========================================================================*/
6486
6487 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6488 the loop runs just once. */
6489
6490 for(;;)
6491 {
6492 PCRE_PUCHAR save_end_subject = end_subject;
6493 PCRE_PUCHAR new_start_match;
6494
6495 /* If firstline is TRUE, the start of the match is constrained to the first
6496 line of a multiline string. That is, the match must be before or at the first
6497 newline. Implement this by temporarily adjusting end_subject so that we stop
6498 scanning at a newline. If the match fails at the newline, later code breaks
6499 this loop. */
6500
6501 if (firstline)
6502 {
6503 PCRE_PUCHAR t = start_match;
6504 #ifdef SUPPORT_UTF
6505 if (utf)
6506 {
6507 while (t < md->end_subject && !IS_NEWLINE(t))
6508 {
6509 t++;
6510 ACROSSCHAR(t < end_subject, *t, t++);
6511 }
6512 }
6513 else
6514 #endif
6515 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6516 end_subject = t;
6517 }
6518
6519 /* There are some optimizations that avoid running the match if a known
6520 starting point is not found, or if a known later character is not present.
6521 However, there is an option that disables these, for testing and for ensuring
6522 that all callouts do actually occur. The option can be set in the regex by
6523 (*NO_START_OPT) or passed in match-time options. */
6524
6525 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6526 {
6527 /* Advance to a unique first char if there is one. */
6528
6529 if (has_first_char)
6530 {
6531 if (first_char != first_char2)
6532 while (start_match < end_subject &&
6533 *start_match != first_char && *start_match != first_char2)
6534 start_match++;
6535 else
6536 while (start_match < end_subject && *start_match != first_char)
6537 start_match++;
6538 }
6539
6540 /* Or to just after a linebreak for a multiline match */
6541
6542 else if (startline)
6543 {
6544 if (start_match > md->start_subject + start_offset)
6545 {
6546 #ifdef SUPPORT_UTF
6547 if (utf)
6548 {
6549 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6550 {
6551 start_match++;
6552 ACROSSCHAR(start_match < end_subject, *start_match,
6553 start_match++);
6554 }
6555 }
6556 else
6557 #endif
6558 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6559 start_match++;
6560
6561 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6562 and we are now at a LF, advance the match position by one more character.
6563 */
6564
6565 if (start_match[-1] == CHAR_CR &&
6566 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6567 start_match < end_subject &&
6568 *start_match == CHAR_NL)
6569 start_match++;
6570 }
6571 }
6572
6573 /* Or to a non-unique first byte after study */
6574
6575 else if (start_bits != NULL)
6576 {
6577 while (start_match < end_subject)
6578 {
6579 register unsigned int c = *start_match;
6580 #ifndef COMPILE_PCRE8
6581 if (c > 255) c = 255;
6582 #endif
6583 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6584 {
6585 start_match++;
6586 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
6587 /* In non 8-bit mode, the iteration will stop for
6588 characters > 255 at the beginning or not stop at all. */
6589 if (utf)
6590 ACROSSCHAR(start_match < end_subject, *start_match,
6591 start_match++);
6592 #endif
6593 }
6594 else break;
6595 }
6596 }
6597 } /* Starting optimizations */
6598
6599 /* Restore fudged end_subject */
6600
6601 end_subject = save_end_subject;
6602
6603 /* The following two optimizations are disabled for partial matching or if
6604 disabling is explicitly requested. */
6605
6606 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6607 {
6608 /* If the pattern was studied, a minimum subject length may be set. This is
6609 a lower bound; it is possible that no string of that length matches the
6610 pattern. Although the value is, strictly, in characters, we treat it as
6611 bytes to avoid spending too much time in this optimization. */
6612
6613 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6614 (pcre_uint32)(end_subject - start_match) < study->minlength)
6615 {
6616 rc = MATCH_NOMATCH;
6617 break;
6618 }
6619
6620 /* If req_char is set, we know that that character must appear in the
6621 subject for the match to succeed. If the first character is set, req_char
6622 must be later in the subject; otherwise the test starts at the match point.
6623 This optimization can save a huge amount of backtracking in patterns with
6624 nested unlimited repeats that aren't going to match. Writing separate code
6625 for cased/caseless versions makes it go faster, as does using an
6626 autoincrement and backing off on a match.
6627
6628 HOWEVER: when the subject string is very, very long, searching to its end
6629 can take a long time, and give bad performance on quite ordinary patterns.
6630 This showed up when somebody was matching something like /^\d+C/ on a
6631 32-megabyte string... so we don't do this when the string is sufficiently
6632 long. */
6633
6634 if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
6635 {
6636 register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
6637
6638 /* We don't need to repeat the search if we haven't yet reached the
6639 place we found it at last time. */
6640
6641 if (p > req_char_ptr)
6642 {
6643 if (req_char != req_char2)
6644 {
6645 while (p < end_subject)
6646 {
6647 register int pp = *p++;
6648 if (pp == req_char || pp == req_char2) { p--; break; }
6649 }
6650 }
6651 else
6652 {
6653 while (p < end_subject)
6654 {
6655 if (*p++ == req_char) { p--; break; }
6656 }
6657 }
6658
6659 /* If we can't find the required character, break the matching loop,
6660 forcing a match failure. */
6661
6662 if (p >= end_subject)
6663 {
6664 rc = MATCH_NOMATCH;
6665 break;
6666 }
6667
6668 /* If we have found the required character, save the point where we
6669 found it, so that we don't search again next time round the loop if
6670 the start hasn't passed this character yet. */
6671
6672 req_char_ptr = p;
6673 }
6674 }
6675 }
6676
6677 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6678 printf(">>>> Match against: ");
6679 pchars(start_match, end_subject - start_match, TRUE, md);
6680 printf("\n");
6681 #endif
6682
6683 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6684 first starting point for which a partial match was found. */
6685
6686 md->start_match_ptr = start_match;
6687 md->start_used_ptr = start_match;
6688 md->match_call_count = 0;
6689 md->match_function_type = 0;
6690 md->end_offset_top = 0;
6691 rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
6692 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6693
6694 switch(rc)
6695 {
6696 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6697 the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6698 entirely. The only way we can do that is to re-do the match at the same
6699 point, with a flag to force SKIP with an argument to be ignored. Just
6700 treating this case as NOMATCH does not work because it does not check other
6701 alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6702
6703 case MATCH_SKIP_ARG:
6704 new_start_match = start_match;
6705 md->ignore_skip_arg = TRUE;
6706 break;
6707
6708 /* SKIP passes back the next starting point explicitly, but if it is the
6709 same as the match we have just done, treat it as NOMATCH. */
6710
6711 case MATCH_SKIP:
6712 if (md->start_match_ptr != start_match)
6713 {
6714 new_start_match = md->start_match_ptr;
6715 break;
6716 }
6717 /* Fall through */
6718
6719 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6720 exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
6721
6722 case MATCH_NOMATCH:
6723 case MATCH_PRUNE:
6724 case MATCH_THEN:
6725 md->ignore_skip_arg = FALSE;
6726 new_start_match = start_match + 1;
6727 #ifdef SUPPORT_UTF
6728 if (utf)
6729 ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6730 new_start_match++);
6731 #endif
6732 break;