/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 614 - (show annotations) (download)
Sat Jul 9 10:48:16 2011 UTC (3 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 196987 byte(s)
Fix atomic group and assertion capturing problems.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
#define MATCH_CONDASSERT     1  /* Called to check a condition assertion */
#define MATCH_CBEGROUP       2  /* Could-be-empty unlimited repeat group */

/* Ordinary (non-error) results of match(). Genuine errors are reported with
the externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_MATCH          1
#define MATCH_NOMATCH        0

/* Internal-only results of match(). The values are chosen to be far enough
below zero that they cannot collide with the external error codes. */

#define MATCH_ACCEPT         (-999)
#define MATCH_COMMIT         (-998)
#define MATCH_KETRPOS        (-997)
#define MATCH_PRUNE          (-996)
#define MATCH_SKIP           (-995)
#define MATCH_SKIP_ARG       (-994)
#define MATCH_THEN           (-993)

/* Shorthand used at many return points: publish the current mark pointer in
the match data block, then return through RRETURN. */

#define MRRETURN(ra) \
  { \
  md->mark = markptr; \
  RRETURN(ra); \
  }

/* Upper bound on the number of offset-vector ints that are saved on the stack
for a recursive call; a larger vector is saved with malloc instead. Keep this
a multiple of 3, because the offset vector length is always a multiple of 3. */

#define REC_STACK_SAVE_MAX   30

/* Minimum and maximum counts for the common repeats, one table entry per
repeat form (presumably *, *?, +, +?, ?, ?? - confirm against the opcode
order). In rep_max, a value of 0 stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,     /* entries 0-1 */
  1, 1,     /* entries 2-3 */
  0, 0 };   /* entries 4-5 */

static const char rep_max[] = {
  0, 0,     /* entries 0-1: unlimited */
  0, 0,     /* entries 2-3: unlimited */
  1, 1 };   /* entries 4-5 */
102
103
104
105 #ifdef PCRE_DEBUG
106 /*************************************************
107 * Debugging function to print chars *
108 *************************************************/
109
110 /* Print a sequence of chars in printable format, stopping at the end of the
111 subject if requested.
112
113 Arguments:
114 p points to characters
115 length number to print
116 is_subject TRUE if printing from within md->start_subject
117 md pointer to matching data block, if is_subject is TRUE
118
119 Returns: nothing
120 */
121
122 static void
123 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124 {
125 unsigned int c;
126 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127 while (length-- > 0)
128 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129 }
130 #endif
131
132
133
134 /*************************************************
135 * Match a back-reference *
136 *************************************************/
137
138 /* Normally, if a back reference hasn't been set, the length that is passed is
139 negative, so the match always fails. However, in JavaScript compatibility mode,
140 the length passed is zero. Note that in caseless UTF-8 mode, the number of
141 subject bytes matched may be different to the number of reference bytes.
142
143 Arguments:
144 offset index into the offset vector
145 eptr pointer into the subject
146 length length of reference to be matched (number of bytes)
147 md points to match data block
148 caseless TRUE if caseless
149
150 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 */
152
153 static int
154 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 BOOL caseless)
156 {
157 USPTR eptr_start = eptr;
158 register USPTR p = md->start_subject + md->offset_vector[offset];
159
160 #ifdef PCRE_DEBUG
161 if (eptr >= md->end_subject)
162 printf("matching subject <null>");
163 else
164 {
165 printf("matching subject ");
166 pchars(eptr, length, TRUE, md);
167 }
168 printf(" against backref ");
169 pchars(p, length, FALSE, md);
170 printf("\n");
171 #endif
172
173 /* Always fail if reference not set (and not JavaScript compatible). */
174
175 if (length < 0) return -1;
176
177 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178 properly if Unicode properties are supported. Otherwise, we can check only
179 ASCII characters. */
180
181 if (caseless)
182 {
183 #ifdef SUPPORT_UTF8
184 #ifdef SUPPORT_UCP
185 if (md->utf8)
186 {
187 /* Match characters up to the end of the reference. NOTE: the number of
188 bytes matched may differ, because there are some characters whose upper and
189 lower case versions code as different numbers of bytes. For example, U+023A
190 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192 the latter. It is important, therefore, to check the length along the
193 reference, not along the subject (earlier code did this wrong). */
194
195 USPTR endptr = p + length;
196 while (p < endptr)
197 {
198 int c, d;
199 if (eptr >= md->end_subject) return -1;
200 GETCHARINC(c, eptr);
201 GETCHARINC(d, p);
202 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 }
204 }
205 else
206 #endif
207 #endif
208
209 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210 is no UCP support. */
211 {
212 if (eptr + length > md->end_subject) return -1;
213 while (length-- > 0)
214 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 if (eptr + length > md->end_subject) return -1;
224 while (length-- > 0) if (*p++ != *eptr++) return -1;
225 }
226
227 return eptr - eptr_start;
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
248 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249 calls by keeping local variables that need to be preserved in blocks of memory
250 obtained from malloc() instead of on the stack. Macros are used to
251 achieve this so that the actual code doesn't look very different to what it
252 always used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
270 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271 below must be updated in sync. */
272
/* One distinct, consecutive number (starting at 1) per RMATCH call site. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,  RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48,
  RM49, RM50, RM51, RM52, RM53, RM54, RM55, RM56,
  RM57, RM58, RM59, RM60, RM61, RM62, RM63
};
280
281 /* These versions of the macros use the stack, as normal. There are debugging
282 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 actually used in this definition. */
284
285 #ifndef NO_RECURSE
286 #define REGISTER register
287
#ifndef PCRE_DEBUG

/* Production versions: RMATCH is a plain recursive call to match() whose
result lands in rrc, and RRETURN is a plain return. The "rw" argument is
accepted but not used. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)

#define RRETURN(ra) return ra

#else

/* Debugging versions: identical in effect, but each call and each return is
traced on stdout together with the source line number. */

#define RMATCH(ra,rb,rc,rd,re,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif
305
306 #else
307
308
309 /* These versions of the macros manage a private stack on the heap. Note that
310 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311 argument of match(), which never changes. */
312
313 #define REGISTER
314
/* Heap-frame version of RMATCH. A new frame is obtained from
pcre_stack_malloc, the resume point "rw" is recorded in the current frame, the
call arguments are copied into the new frame, and control jumps to
HEAP_RECURSE. The label L_<rw> is where execution continues afterwards. If the
allocation fails, PCRE_ERROR_NOMEMORY is returned via RRETURN. The "rd"
argument is not used. */

#define RMATCH(ra,rb,rc,rd,re,rw)\
  {\
  heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xprevframe = frame;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xmarkptr = markptr;\
  newframe->Xoffset_top = rc;\
  newframe->Xeptrb = re;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
334
/* Heap-frame version of RRETURN. The current frame is popped and released
with pcre_stack_free. If a previous frame exists, the result is stored in rrc
and control jumps to HEAP_RETURN; otherwise the value is returned from the
function.

The pop is guarded against frame being NULL. match() invokes this macro when
its very first pcre_stack_malloc fails, and at that point there is no frame to
read Xprevframe from or to free; the value must simply be returned. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  if (oldframe != NULL)\
    {\
    frame = oldframe->Xprevframe;\
    (pcre_stack_free)(oldframe);\
    }\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
347
348
349 /* Structure for remembering the local variables in a private frame */
350
351 typedef struct heapframe {
352 struct heapframe *Xprevframe;
353
354 /* Function arguments that may change */
355
356 USPTR Xeptr;
357 const uschar *Xecode;
358 USPTR Xmstart;
359 USPTR Xmarkptr;
360 int Xoffset_top;
361 eptrblock *Xeptrb;
362 unsigned int Xrdepth;
363
364 /* Function local variables */
365
366 USPTR Xcallpat;
367 #ifdef SUPPORT_UTF8
368 USPTR Xcharptr;
369 #endif
370 USPTR Xdata;
371 USPTR Xnext;
372 USPTR Xpp;
373 USPTR Xprev;
374 USPTR Xsaved_eptr;
375
376 recursion_info Xnew_recursive;
377
378 BOOL Xcur_is_word;
379 BOOL Xcondition;
380 BOOL Xprev_is_word;
381
382 #ifdef SUPPORT_UCP
383 int Xprop_type;
384 int Xprop_value;
385 int Xprop_fail_result;
386 int Xprop_category;
387 int Xprop_chartype;
388 int Xprop_script;
389 int Xoclength;
390 uschar Xocchars[8];
391 #endif
392
393 int Xcodelink;
394 int Xctype;
395 unsigned int Xfc;
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
406
407 eptrblock Xnewptrb;
408
409 /* Where to jump back to */
410
411 int Xwhere;
412
413 } heapframe;
414
415 #endif
416
417
418 /***************************************************************************
419 ***************************************************************************/
420
421
422
423 /*************************************************
424 * Match from current position *
425 *************************************************/
426
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
430
431 /* These macros pack up tests that are used for partial matching, and which
432 appear several times in the code. We set the "hit end" flag if the pointer is
433 at the end of the subject and also past the start of the subject (i.e.
434 something has been matched). For hard partial matching, we then return
435 immediately. The second one is used when we already know we are past the end of
436 the subject. */
437
/* Partial-match test for use where eptr may or may not be at the end of the
subject: requires partial matching to be enabled, eptr to have reached
md->end_subject, and eptr to be beyond md->start_used_ptr. When all hold, the
"hit end" flag is set, and if md->partial is greater than 1 the macro returns
PCRE_ERROR_PARTIAL at once. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr >= md->end_subject && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }

/* Variant for places where eptr is already known to be at or past the end of
the subject, so the end-of-subject comparison is omitted. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && \
      eptr > md->start_used_ptr) \
    { \
    md->hitend = TRUE; \
    if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
    }
452
453
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf8, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
458
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 markptr pointer to the most recent MARK name, or NULL
465 offset_top current top pointer
466 md pointer to "static" info for the match
467 eptrb pointer to chain of blocks containing eptr at start of
468 brackets - for testing for empty matches
469 rdepth the recursion depth
470
471 Returns: MATCH_MATCH if matched ) these values are >= 0
472 MATCH_NOMATCH if failed to match )
473 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 (e.g. stopped by repeated call or recursion limit)
476 */
477
478 static int
479 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 unsigned int rdepth)
482 {
483 /* These variables do not need to be preserved over recursion in this function,
484 so they can be ordinary variables in all cases. Mark some of them with
485 "register" because they are used a lot in loops. */
486
487 register int rrc; /* Returns from recursive calls */
488 register int i; /* Used for loops not involving calls to RMATCH() */
489 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491
492 BOOL minimize, possessive; /* Quantifier options */
493 BOOL caseless;
494 int condcode;
495
496 /* When recursion is not being used, all "local" variables that have to be
497 preserved over calls to RMATCH() are part of a "frame" which is obtained from
498 heap storage. Set up the top-level frame here; others are obtained from the
499 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500
501 #ifdef NO_RECURSE
502 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 frame->Xprevframe = NULL; /* Marks the top level */
505
506 /* Copy in the original argument variables */
507
508 frame->Xeptr = eptr;
509 frame->Xecode = ecode;
510 frame->Xmstart = mstart;
511 frame->Xmarkptr = markptr;
512 frame->Xoffset_top = offset_top;
513 frame->Xeptrb = eptrb;
514 frame->Xrdepth = rdepth;
515
516 /* This is where control jumps back to to effect "recursion" */
517
518 HEAP_RECURSE:
519
520 /* Macros make the argument variables come from the current frame */
521
522 #define eptr frame->Xeptr
523 #define ecode frame->Xecode
524 #define mstart frame->Xmstart
525 #define markptr frame->Xmarkptr
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF8
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define prop_category frame->Xprop_category
554 #define prop_chartype frame->Xprop_chartype
555 #define prop_script frame->Xprop_script
556 #define oclength frame->Xoclength
557 #define occhars frame->Xocchars
558 #endif
559
560 #define ctype frame->Xctype
561 #define fc frame->Xfc
562 #define fi frame->Xfi
563 #define length frame->Xlength
564 #define max frame->Xmax
565 #define min frame->Xmin
566 #define number frame->Xnumber
567 #define offset frame->Xoffset
568 #define op frame->Xop
569 #define save_capture_last frame->Xsave_capture_last
570 #define save_offset1 frame->Xsave_offset1
571 #define save_offset2 frame->Xsave_offset2
572 #define save_offset3 frame->Xsave_offset3
573 #define stacksave frame->Xstacksave
574
575 #define newptrb frame->Xnewptrb
576
577 /* When recursion is being used, local variables are allocated on the stack and
578 get preserved during recursion in the normal way. In this environment, fi and
579 i, and fc and c, can be the same variables. */
580
581 #else /* NO_RECURSE not defined */
582 #define fi i
583 #define fc c
584
585 /* Many of the following variables are used only in small blocks of the code.
586 My normal style of coding would have declared them within each of those blocks.
587 However, in order to accommodate the version of this code that uses an external
588 "stack" implemented on the heap, it is easier to declare them all here, so the
589 declarations can be cut out in a block. The only declarations within blocks
590 below are for variables that do not have to be preserved over a recursive call
591 to RMATCH(). */
592
593 #ifdef SUPPORT_UTF8
594 const uschar *charptr;
595 #endif
596 const uschar *callpat;
597 const uschar *data;
598 const uschar *next;
599 USPTR pp;
600 const uschar *prev;
601 USPTR saved_eptr;
602
603 recursion_info new_recursive;
604
605 BOOL cur_is_word;
606 BOOL condition;
607 BOOL prev_is_word;
608
609 #ifdef SUPPORT_UCP
610 int prop_type;
611 int prop_value;
612 int prop_fail_result;
613 int prop_category;
614 int prop_chartype;
615 int prop_script;
616 int oclength;
617 uschar occhars[8];
618 #endif
619
620 int codelink;
621 int ctype;
622 int length;
623 int max;
624 int min;
625 int number;
626 int offset;
627 int op;
628 int save_capture_last;
629 int save_offset1, save_offset2, save_offset3;
630 int stacksave[REC_STACK_SAVE_MAX];
631
632 eptrblock newptrb;
633 #endif /* NO_RECURSE */
634
635 /* To save space on the stack and in the heap frame, I have doubled up on some
636 of the local variables that are used only in localised parts of the code, but
637 still need to be preserved over recursive calls of match(). These macros define
638 the alternative names that are used. */
639
640 #define allow_zero cur_is_word
641 #define cbegroup condition
642 #define code_offset codelink
643 #define condassert condition
644 #define matched_once prev_is_word
645
646 /* These statements are here to stop the compiler complaining about uninitialized
647 variables. */
648
649 #ifdef SUPPORT_UCP
650 prop_value = 0;
651 prop_fail_result = 0;
652 #endif
653
654
655 /* This label is used for tail recursion, which is used in a few cases even
656 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657 used. Thanks to Ian Taylor for noticing this possibility and sending the
658 original patch. */
659
660 TAIL_RECURSE:
661
662 /* OK, now we can get on with the real code of the function. Recursive calls
663 are specified by the macro RMATCH and RRETURN is used to return. When
664 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 defined). However, RMATCH isn't like a function call because it's quite a
667 complicated macro. It has to be used in one particular way. This shouldn't,
668 however, impact performance when true recursion is being used. */
669
670 #ifdef SUPPORT_UTF8
671 utf8 = md->utf8; /* Local copy of the flag */
672 #else
673 utf8 = FALSE;
674 #endif
675
676 /* First check that we haven't called match() too many times, or that we
677 haven't exceeded the recursive call limit. */
678
679 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681
682 /* At the start of a group with an unlimited repeat that may match an empty
683 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684 done this way to save having to use another function argument, which would take
685 up space on the stack. See also MATCH_CONDASSERT below.
686
687 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688 such remembered pointers, to be checked when we hit the closing ket, in order
689 to break infinite loops that match no characters. When match() is called in
690 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691 NOT be used with tail recursion, because the memory block that is used is on
692 the stack, so a new one may be required for each match(). */
693
694 if (md->match_function_type == MATCH_CBEGROUP)
695 {
696 newptrb.epb_saved_eptr = eptr;
697 newptrb.epb_prev = eptrb;
698 eptrb = &newptrb;
699 md->match_function_type = 0;
700 }
701
702 /* Now start processing the opcodes. */
703
704 for (;;)
705 {
706 minimize = possessive = FALSE;
707 op = *ecode;
708
709 switch(op)
710 {
711 case OP_MARK:
712 markptr = ecode + 2;
713 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 eptrb, RM55);
715
716 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717 argument, and we must check whether that argument matches this MARK's
718 argument. It is passed back in md->start_match_ptr (an overloading of that
719 variable). If it does match, we reset that variable to the current subject
720 position and return MATCH_SKIP. Otherwise, pass back the return code
721 unaltered. */
722
723 if (rrc == MATCH_SKIP_ARG &&
724 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725 {
726 md->start_match_ptr = eptr;
727 RRETURN(MATCH_SKIP);
728 }
729
730 if (md->mark == NULL) md->mark = markptr;
731 RRETURN(rrc);
732
733 case OP_FAIL:
734 MRRETURN(MATCH_NOMATCH);
735
736 /* COMMIT overrides PRUNE, SKIP, and THEN */
737
738 case OP_COMMIT:
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 eptrb, RM52);
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743 rrc != MATCH_THEN)
744 RRETURN(rrc);
745 MRRETURN(MATCH_COMMIT);
746
747 /* PRUNE overrides THEN */
748
749 case OP_PRUNE:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 eptrb, RM51);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 MRRETURN(MATCH_PRUNE);
754
755 case OP_PRUNE_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 eptrb, RM56);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_PRUNE);
761
762 /* SKIP overrides PRUNE and THEN */
763
764 case OP_SKIP:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 eptrb, RM53);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769 md->start_match_ptr = eptr; /* Pass back current position */
770 MRRETURN(MATCH_SKIP);
771
772 case OP_SKIP_ARG:
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it is treated the same
781 as PRUNE. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 the alt that is at the start of the current branch. This makes it possible
788 to skip back past alternatives that precede the THEN within the current
789 branch. */
790
791 case OP_THEN:
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 eptrb, RM54);
794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 md->start_match_ptr = ecode - GET(ecode, 1);
796 MRRETURN(MATCH_THEN);
797
798 case OP_THEN_ARG:
799 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 offset_top, md, eptrb, RM58);
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode - GET(ecode, 1);
803 md->mark = ecode + LINK_SIZE + 2;
804 RRETURN(MATCH_THEN);
805
806 /* Handle a capturing bracket, other than those that are possessive with an
807 unlimited repeat. If there is space in the offset vector, save the current
808 subject position in the working slot at the top of the vector. We mustn't
809 change the current values of the data slot, because they may be set from a
810 previous iteration of this group, and be referred to by a reference inside
811 the group. If we fail to match, we need to restore this value and also the
812 values of the final offsets, in case they were set by a previous iteration
813 of the same bracket.
814
815 If there isn't enough space in the offset vector, treat this as if it were
816 a non-capturing bracket. Don't worry about setting the flag for the error
817 case here; that is handled in the code for KET. */
818
819 case OP_CBRA:
820 case OP_SCBRA:
821 number = GET2(ecode, 1+LINK_SIZE);
822 offset = number << 1;
823
824 #ifdef PCRE_DEBUG
825 printf("start bracket %d\n", number);
826 printf("subject=");
827 pchars(eptr, 16, TRUE, md);
828 printf("\n");
829 #endif
830
831 if (offset < md->offset_max)
832 {
833 save_offset1 = md->offset_vector[offset];
834 save_offset2 = md->offset_vector[offset+1];
835 save_offset3 = md->offset_vector[md->offset_end - number];
836 save_capture_last = md->capture_last;
837
838 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 md->offset_vector[md->offset_end - number] =
840 (int)(eptr - md->start_subject);
841
842 for (;;)
843 {
844 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846 eptrb, RM1);
847 if (rrc != MATCH_NOMATCH &&
848 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849 RRETURN(rrc);
850
851 /* If md->end_offset_top is greater than offset_top, it means that the
852 branch we have just failed to match did manage to match some capturing
853 parentheses within an atomic group or an assertion. Although offset_top
854 reverts to its original value at this level, we must unset the captured
855 values in case a later match sets a higher capturing number. Example:
856 matching /((?>(a))b|(a)c)/ against "ac". This captures 3, but we need
857 to ensure that 2 - which was captured in the atomic matching - is
858 unset. */
859
860 if (md->end_offset_top > offset_top)
861 {
862 register int *iptr = md->offset_vector + offset_top;
863 register int *iend = md->offset_vector + md->end_offset_top;
864 while (iptr < iend) *iptr++ = -1;
865 }
866
867 md->capture_last = save_capture_last;
868 ecode += GET(ecode, 1);
869 if (*ecode != OP_ALT) break;
870 }
871
872 DPRINTF(("bracket %d failed\n", number));
873
874 md->offset_vector[offset] = save_offset1;
875 md->offset_vector[offset+1] = save_offset2;
876 md->offset_vector[md->offset_end - number] = save_offset3;
877
878 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
879 RRETURN(MATCH_NOMATCH);
880 }
881
882 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
883 as a non-capturing bracket. */
884
885 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
886 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
887
888 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
889
890 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
891 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
892
893 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
894 for all the alternatives. When we get to the final alternative within the
895 brackets, we used to return the result of a recursive call to match()
896 whatever happened so it was possible to reduce stack usage by turning this
897 into a tail recursion, except in the case of a possibly empty group.
898 However, now that there is the possibility of (*THEN) occurring in the final
899 alternative, this optimization is no longer possible. */
900
901 case OP_BRA:
902 case OP_SBRA:
903 DPRINTF(("start non-capturing bracket\n"));
904 for (;;)
905 {
906 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
907 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
908 RM2);
909 if (rrc != MATCH_NOMATCH &&
910 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
911 RRETURN(rrc);
912
913 /* See explanatory comment above under OP_CBRA. */
914
915 if (md->end_offset_top > offset_top)
916 {
917 register int *iptr = md->offset_vector + offset_top;
918 register int *iend = md->offset_vector + md->end_offset_top;
919 while (iptr < iend) *iptr++ = -1;
920 }
921
922 ecode += GET(ecode, 1);
923 if (*ecode != OP_ALT) break;
924 }
925
926 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
927 RRETURN(MATCH_NOMATCH);
928
929 /* Handle possessive capturing brackets with an unlimited repeat. We come
930 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
931 handled similarly to the normal case above. However, the matching is
932 different. The end of these brackets will always be OP_KETRPOS, which
933 returns MATCH_KETRPOS without going further in the pattern. By this means
934 we can handle the group by iteration rather than recursion, thereby
935 reducing the amount of stack needed. */
936
937 case OP_CBRAPOS:
938 case OP_SCBRAPOS:
939 allow_zero = FALSE;
940
941 POSSESSIVE_CAPTURE:
942 number = GET2(ecode, 1+LINK_SIZE);
943 offset = number << 1;
944
945 #ifdef PCRE_DEBUG
946 printf("start possessive bracket %d\n", number);
947 printf("subject=");
948 pchars(eptr, 16, TRUE, md);
949 printf("\n");
950 #endif
951
952 if (offset < md->offset_max)
953 {
954 matched_once = FALSE;
955 code_offset = ecode - md->start_code;
956
957 save_offset1 = md->offset_vector[offset];
958 save_offset2 = md->offset_vector[offset+1];
959 save_offset3 = md->offset_vector[md->offset_end - number];
960 save_capture_last = md->capture_last;
961
962 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
963
964 /* Each time round the loop, save the current subject position for use
965 when the group matches. For MATCH_MATCH, the group has matched, so we
966 restart it with a new subject starting position, remembering that we had
967 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
968 usual. If we haven't matched any alternatives in any iteration, check to
969 see if a previous iteration matched. If so, the group has matched;
970 continue from afterwards. Otherwise it has failed; restore the previous
971 capture values before returning NOMATCH. */
972
973 for (;;)
974 {
975 md->offset_vector[md->offset_end - number] =
976 (int)(eptr - md->start_subject);
977 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
978 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
979 eptrb, RM63);
980 if (rrc == MATCH_KETRPOS)
981 {
982 offset_top = md->end_offset_top;
983 eptr = md->end_match_ptr;
984 ecode = md->start_code + code_offset;
985 save_capture_last = md->capture_last;
986 matched_once = TRUE;
987 continue;
988 }
989 if (rrc != MATCH_NOMATCH &&
990 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
991 RRETURN(rrc);
992
993 /* See explanatory comment above under OP_CBRA. */
994
995 if (md->end_offset_top > offset_top)
996 {
997 register int *iptr = md->offset_vector + offset_top;
998 register int *iend = md->offset_vector + md->end_offset_top;
999 while (iptr < iend) *iptr++ = -1;
1000 }
1001
1002 md->capture_last = save_capture_last;
1003 ecode += GET(ecode, 1);
1004 if (*ecode != OP_ALT) break;
1005 }
1006
1007 if (!matched_once)
1008 {
1009 md->offset_vector[offset] = save_offset1;
1010 md->offset_vector[offset+1] = save_offset2;
1011 md->offset_vector[md->offset_end - number] = save_offset3;
1012 }
1013
1014 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
1015 if (allow_zero || matched_once)
1016 {
1017 ecode += 1 + LINK_SIZE;
1018 break;
1019 }
1020
1021 RRETURN(MATCH_NOMATCH);
1022 }
1023
1024 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1025 as a non-capturing bracket. */
1026
1027 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1028 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029
1030 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1031
1032 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1033 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034
1035 /* Non-capturing possessive bracket with unlimited repeat. We come here
1036 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1037 without the capturing complication. It is written out separately for speed
1038 and cleanliness. */
1039
1040 case OP_BRAPOS:
1041 case OP_SBRAPOS:
1042 allow_zero = FALSE;
1043
1044 POSSESSIVE_NON_CAPTURE:
1045 matched_once = FALSE;
1046 code_offset = ecode - md->start_code;
1047
1048 for (;;)
1049 {
1050 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1051 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1052 eptrb, RM48);
1053 if (rrc == MATCH_KETRPOS)
1054 {
1055 offset_top = md->end_offset_top;
1056 eptr = md->end_match_ptr;
1057 ecode = md->start_code + code_offset;
1058 matched_once = TRUE;
1059 continue;
1060 }
1061 if (rrc != MATCH_NOMATCH &&
1062 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1063 RRETURN(rrc);
1064
1065 /* See explanatory comment above under OP_CBRA. */
1066
1067 if (md->end_offset_top > offset_top)
1068 {
1069 register int *iptr = md->offset_vector + offset_top;
1070 register int *iend = md->offset_vector + md->end_offset_top;
1071 while (iptr < iend) *iptr++ = -1;
1072 }
1073
1074 ecode += GET(ecode, 1);
1075 if (*ecode != OP_ALT) break;
1076 }
1077
1078 if (matched_once || allow_zero)
1079 {
1080 ecode += 1 + LINK_SIZE;
1081 break;
1082 }
1083 RRETURN(MATCH_NOMATCH);
1084
1085 /* Control never reaches here. */
1086
1087 /* Conditional group: compilation checked that there are no more than
1088 two branches. If the condition is false, skipping the first branch takes us
1089 past the end if there is only one branch, but that's OK because that is
1090 exactly what going to the ket would do. */
1091
1092 case OP_COND:
1093 case OP_SCOND:
1094 codelink = GET(ecode, 1);
1095
1096 /* Because of the way auto-callout works during compile, a callout item is
1097 inserted between OP_COND and an assertion condition. */
1098
1099 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1100 {
1101 if (pcre_callout != NULL)
1102 {
1103 pcre_callout_block cb;
1104 cb.version = 1; /* Version 1 of the callout block */
1105 cb.callout_number = ecode[LINK_SIZE+2];
1106 cb.offset_vector = md->offset_vector;
1107 cb.subject = (PCRE_SPTR)md->start_subject;
1108 cb.subject_length = (int)(md->end_subject - md->start_subject);
1109 cb.start_match = (int)(mstart - md->start_subject);
1110 cb.current_position = (int)(eptr - md->start_subject);
1111 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1112 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1113 cb.capture_top = offset_top/2;
1114 cb.capture_last = md->capture_last;
1115 cb.callout_data = md->callout_data;
1116 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1117 if (rrc < 0) RRETURN(rrc);
1118 }
1119 ecode += _pcre_OP_lengths[OP_CALLOUT];
1120 }
1121
1122 condcode = ecode[LINK_SIZE+1];
1123
1124 /* Now see what the actual condition is */
1125
1126 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1127 {
1128 if (md->recursive == NULL) /* Not recursing => FALSE */
1129 {
1130 condition = FALSE;
1131 ecode += GET(ecode, 1);
1132 }
1133 else
1134 {
1135 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1136 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1137
1138 /* If the test is for recursion into a specific subpattern, and it is
1139 false, but the test was set up by name, scan the table to see if the
1140 name refers to any other numbers, and test them. The condition is true
1141 if any one is set. */
1142
1143 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1144 {
1145 uschar *slotA = md->name_table;
1146 for (i = 0; i < md->name_count; i++)
1147 {
1148 if (GET2(slotA, 0) == recno) break;
1149 slotA += md->name_entry_size;
1150 }
1151
1152 /* Found a name for the number - there can be only one; duplicate
1153 names for different numbers are allowed, but not vice versa. First
1154 scan down for duplicates. */
1155
1156 if (i < md->name_count)
1157 {
1158 uschar *slotB = slotA;
1159 while (slotB > md->name_table)
1160 {
1161 slotB -= md->name_entry_size;
1162 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1163 {
1164 condition = GET2(slotB, 0) == md->recursive->group_num;
1165 if (condition) break;
1166 }
1167 else break;
1168 }
1169
1170 /* Scan up for duplicates */
1171
1172 if (!condition)
1173 {
1174 slotB = slotA;
1175 for (i++; i < md->name_count; i++)
1176 {
1177 slotB += md->name_entry_size;
1178 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1179 {
1180 condition = GET2(slotB, 0) == md->recursive->group_num;
1181 if (condition) break;
1182 }
1183 else break;
1184 }
1185 }
1186 }
1187 }
1188
1189 /* Choose branch according to the condition */
1190
1191 ecode += condition? 3 : GET(ecode, 1);
1192 }
1193 }
1194
1195 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1196 {
1197 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1198 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1199
1200 /* If the numbered capture is unset, but the reference was by name,
1201 scan the table to see if the name refers to any other numbers, and test
1202 them. The condition is true if any one is set. This is tediously similar
1203 to the code above, but not close enough to try to amalgamate. */
1204
1205 if (!condition && condcode == OP_NCREF)
1206 {
1207 int refno = offset >> 1;
1208 uschar *slotA = md->name_table;
1209
1210 for (i = 0; i < md->name_count; i++)
1211 {
1212 if (GET2(slotA, 0) == refno) break;
1213 slotA += md->name_entry_size;
1214 }
1215
1216 /* Found a name for the number - there can be only one; duplicate names
1217 for different numbers are allowed, but not vice versa. First scan down
1218 for duplicates. */
1219
1220 if (i < md->name_count)
1221 {
1222 uschar *slotB = slotA;
1223 while (slotB > md->name_table)
1224 {
1225 slotB -= md->name_entry_size;
1226 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1227 {
1228 offset = GET2(slotB, 0) << 1;
1229 condition = offset < offset_top &&
1230 md->offset_vector[offset] >= 0;
1231 if (condition) break;
1232 }
1233 else break;
1234 }
1235
1236 /* Scan up for duplicates */
1237
1238 if (!condition)
1239 {
1240 slotB = slotA;
1241 for (i++; i < md->name_count; i++)
1242 {
1243 slotB += md->name_entry_size;
1244 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1245 {
1246 offset = GET2(slotB, 0) << 1;
1247 condition = offset < offset_top &&
1248 md->offset_vector[offset] >= 0;
1249 if (condition) break;
1250 }
1251 else break;
1252 }
1253 }
1254 }
1255 }
1256
1257 /* Choose branch according to the condition */
1258
1259 ecode += condition? 3 : GET(ecode, 1);
1260 }
1261
1262 else if (condcode == OP_DEF) /* DEFINE - always false */
1263 {
1264 condition = FALSE;
1265 ecode += GET(ecode, 1);
1266 }
1267
1268 /* The condition is an assertion. Call match() to evaluate it - setting
1269 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1270 an assertion. */
1271
1272 else
1273 {
1274 md->match_function_type = MATCH_CONDASSERT;
1275 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1276 if (rrc == MATCH_MATCH)
1277 {
1278 condition = TRUE;
1279 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1280 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1281 }
1282 else if (rrc != MATCH_NOMATCH &&
1283 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1284 {
1285 RRETURN(rrc); /* Need braces because of following else */
1286 }
1287 else
1288 {
1289 condition = FALSE;
1290 ecode += codelink;
1291 }
1292 }
1293
1294 /* We are now at the branch that is to be obeyed. As there is only one,
1295 we used to use tail recursion to avoid using another stack frame, except
1296 when there was unlimited repeat of a possibly empty group. However, that
1297 strategy no longer works because of the possibility of (*THEN) being
1298 encountered in the branch. A recursive call to match() is always required,
1299 unless the second alternative doesn't exist, in which case we can just
1300 plough on. */
1301
1302 if (condition || *ecode == OP_ALT)
1303 {
1304 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1305 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1306 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1307 rrc = MATCH_NOMATCH;
1308 RRETURN(rrc);
1309 }
1310 else /* Condition false & no alternative */
1311 {
1312 ecode += 1 + LINK_SIZE;
1313 }
1314 break;
1315
1316
1317 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1318 to close any currently open capturing brackets. */
1319
1320 case OP_CLOSE:
1321 number = GET2(ecode, 1);
1322 offset = number << 1;
1323
1324 #ifdef PCRE_DEBUG
1325 printf("end bracket %d at *ACCEPT", number);
1326 printf("\n");
1327 #endif
1328
1329 md->capture_last = number;
1330 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1331 {
1332 md->offset_vector[offset] =
1333 md->offset_vector[md->offset_end - number];
1334 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1335 if (offset_top <= offset) offset_top = offset + 2;
1336 }
1337 ecode += 3;
1338 break;
1339
1340
1341 /* End of the pattern, either real or forced. If we are in a recursion, we
1342 should restore the offsets appropriately, and if it's a top-level
1343 recursion, continue from after the call. */
1344
1345 case OP_ACCEPT:
1346 case OP_ASSERT_ACCEPT:
1347 case OP_END:
1348 if (md->recursive != NULL)
1349 {
1350 recursion_info *rec = md->recursive;
1351 md->recursive = rec->prevrec;
1352 memmove(md->offset_vector, rec->offset_save,
1353 rec->saved_max * sizeof(int));
1354 offset_top = rec->save_offset_top;
1355 if (rec->group_num == 0)
1356 {
1357 ecode = rec->after_call;
1358 break;
1359 }
1360 }
1361
1362 /* Otherwise, if we have matched an empty string, fail if not in an
1363 assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1364 is set and we have matched at the start of the subject. In both cases,
1365 backtracking will then try other alternatives, if any. */
1366
1367 else if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1368 (md->notempty ||
1369 (md->notempty_atstart &&
1370 mstart == md->start_subject + md->start_offset)))
1371 MRRETURN(MATCH_NOMATCH);
1372
1373 /* Otherwise, we have a match. */
1374
1375 md->end_match_ptr = eptr; /* Record where we ended */
1376 md->end_offset_top = offset_top; /* and how many extracts were taken */
1377 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1378
1379 /* For some reason, the macros don't work properly if an expression is
1380 given as the argument to MRRETURN when the heap is in use. */
1381
1382 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1383 MRRETURN(rrc);
1384
1385 /* Assertion brackets. Check the alternative branches in turn - the
1386 matching won't pass the KET for an assertion. If any one branch matches,
1387 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1388 start of each branch to move the current point backwards, so the code at
1389 this level is identical to the lookahead case. When the assertion is part
1390 of a condition, we want to return immediately afterwards. The caller of
1391 this incarnation of the match() function will have set MATCH_CONDASSERT in
1392 md->match_function_type, and one of these opcodes will be the first opcode
1393 that is processed. We use a local variable that is preserved over calls to
1394 match() to remember this case. */
1395
1396 case OP_ASSERT:
1397 case OP_ASSERTBACK:
1398 if (md->match_function_type == MATCH_CONDASSERT)
1399 {
1400 condassert = TRUE;
1401 md->match_function_type = 0;
1402 }
1403 else condassert = FALSE;
1404
1405 do
1406 {
1407 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1408 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1409 {
1410 mstart = md->start_match_ptr; /* In case \K reset it */
1411 break;
1412 }
1413 if (rrc != MATCH_NOMATCH &&
1414 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1415 RRETURN(rrc);
1416
1417 /* See explanatory comment above under OP_CBRA. */
1418
1419 if (md->end_offset_top > offset_top)
1420 {
1421 register int *iptr = md->offset_vector + offset_top;
1422 register int *iend = md->offset_vector + md->end_offset_top;
1423 while (iptr < iend) *iptr++ = -1;
1424 }
1425
1426 ecode += GET(ecode, 1);
1427 }
1428 while (*ecode == OP_ALT);
1429
1430 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1431
1432 /* If checking an assertion for a condition, return MATCH_MATCH. */
1433
1434 if (condassert) RRETURN(MATCH_MATCH);
1435
1436 /* Continue from after the assertion, updating the offsets high water
1437 mark, since extracts may have been taken during the assertion. */
1438
1439 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1440 ecode += 1 + LINK_SIZE;
1441 offset_top = md->end_offset_top;
1442 continue;
1443
1444 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1445 PRUNE, or COMMIT means we must assume failure without checking subsequent
1446 branches. */
1447
1448 case OP_ASSERT_NOT:
1449 case OP_ASSERTBACK_NOT:
1450 if (md->match_function_type == MATCH_CONDASSERT)
1451 {
1452 condassert = TRUE;
1453 md->match_function_type = 0;
1454 }
1455 else condassert = FALSE;
1456
1457 do
1458 {
1459 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1460 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1461 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1462 {
1463 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1464 break;
1465 }
1466 if (rrc != MATCH_NOMATCH &&
1467 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1468 RRETURN(rrc);
1469 ecode += GET(ecode,1);
1470 }
1471 while (*ecode == OP_ALT);
1472
1473 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1474
1475 ecode += 1 + LINK_SIZE;
1476 continue;
1477
1478 /* Move the subject pointer back. This occurs only at the start of
1479 each branch of a lookbehind assertion. If we are too close to the start to
1480 move back, this match function fails. When working with UTF-8 we move
1481 back a number of characters, not bytes. */
1482
1483 case OP_REVERSE:
1484 #ifdef SUPPORT_UTF8
1485 if (utf8)
1486 {
1487 i = GET(ecode, 1);
1488 while (i-- > 0)
1489 {
1490 eptr--;
1491 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1492 BACKCHAR(eptr);
1493 }
1494 }
1495 else
1496 #endif
1497
1498 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1499
1500 {
1501 eptr -= GET(ecode, 1);
1502 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1503 }
1504
1505 /* Save the earliest consulted character, then skip to next op code */
1506
1507 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1508 ecode += 1 + LINK_SIZE;
1509 break;
1510
1511 /* The callout item calls an external function, if one is provided, passing
1512 details of the match so far. This is mainly for debugging, though the
1513 function is able to force a failure. */
1514
1515 case OP_CALLOUT:
1516 if (pcre_callout != NULL)
1517 {
1518 pcre_callout_block cb;
1519 cb.version = 1; /* Version 1 of the callout block */
1520 cb.callout_number = ecode[1];
1521 cb.offset_vector = md->offset_vector;
1522 cb.subject = (PCRE_SPTR)md->start_subject;
1523 cb.subject_length = (int)(md->end_subject - md->start_subject);
1524 cb.start_match = (int)(mstart - md->start_subject);
1525 cb.current_position = (int)(eptr - md->start_subject);
1526 cb.pattern_position = GET(ecode, 2);
1527 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1528 cb.capture_top = offset_top/2;
1529 cb.capture_last = md->capture_last;
1530 cb.callout_data = md->callout_data;
1531 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1532 if (rrc < 0) RRETURN(rrc);
1533 }
1534 ecode += 2 + 2*LINK_SIZE;
1535 break;
1536
1537 /* Recursion either matches the current regex, or some subexpression. The
1538 offset data is the offset to the starting bracket from the start of the
1539 whole pattern. (This is so that it works from duplicated subpatterns.)
1540
1541 If there are any capturing brackets started but not finished, we have to
1542 save their starting points and reinstate them after the recursion. However,
1543 we don't know how many such there are (offset_top records the completed
1544 total) so we just have to save all the potential data. There may be up to
1545 65535 such values, which is too large to put on the stack, but using malloc
1546 for small numbers seems expensive. As a compromise, the stack is used when
1547 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1548 is used. If the malloc fails, this call of match() gives up straight away:
1549 it returns PCRE_ERROR_NOMEMORY (via RRETURN) instead of going on with the
1550 recursion.
1551
1552 There are also other values that have to be saved. We use a chained
1553 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1554 for the original version of this logic. */
1555
1556 case OP_RECURSE:
1557 {
1558 callpat = md->start_code + GET(ecode, 1);
1559 new_recursive.group_num = (callpat == md->start_code)? 0 :
1560 GET2(callpat, 1 + LINK_SIZE);
1561
1562 /* Add to "recursing stack" */
1563
1564 new_recursive.prevrec = md->recursive;
1565 md->recursive = &new_recursive;
1566
1567 /* Find where to continue from afterwards */
1568
1569 ecode += 1 + LINK_SIZE;
1570 new_recursive.after_call = ecode;
1571
1572 /* Now save the offset data. */
1573
1574 new_recursive.saved_max = md->offset_end;
1575 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1576 new_recursive.offset_save = stacksave;
1577 else
1578 {
1579 new_recursive.offset_save =
1580 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1581 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1582 }
1583
1584 memcpy(new_recursive.offset_save, md->offset_vector,
1585 new_recursive.saved_max * sizeof(int));
1586 new_recursive.save_offset_top = offset_top;
1587
1588 /* OK, now we can do the recursion. For each top-level alternative we
1589 restore the offset and recursion data. */
1590
1591 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1592 cbegroup = (*callpat >= OP_SBRA);
1593 do
1594 {
1595 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1596 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1597 md, eptrb, RM6);
1598 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1599 {
1600 DPRINTF(("Recursion matched\n"));
1601 md->recursive = new_recursive.prevrec;
1602 if (new_recursive.offset_save != stacksave)
1603 (pcre_free)(new_recursive.offset_save);
1604 MRRETURN(MATCH_MATCH);
1605 }
1606 else if (rrc != MATCH_NOMATCH &&
1607 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1608 {
1609 DPRINTF(("Recursion gave error %d\n", rrc));
1610 if (new_recursive.offset_save != stacksave)
1611 (pcre_free)(new_recursive.offset_save);
1612 RRETURN(rrc);
1613 }
1614
1615 md->recursive = &new_recursive;
1616 memcpy(md->offset_vector, new_recursive.offset_save,
1617 new_recursive.saved_max * sizeof(int));
1618 callpat += GET(callpat, 1);
1619 }
1620 while (*callpat == OP_ALT);
1621
1622 DPRINTF(("Recursion didn't match\n"));
1623 md->recursive = new_recursive.prevrec;
1624 if (new_recursive.offset_save != stacksave)
1625 (pcre_free)(new_recursive.offset_save);
1626 MRRETURN(MATCH_NOMATCH);
1627 }
1628 /* Control never reaches here */
1629
1630 /* "Once" brackets are like assertion brackets except that after a match,
1631 the point in the subject string is not moved back. Thus there can never be
1632 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1633 Check the alternative branches in turn - the matching won't pass the KET
1634 for this kind of subpattern. If any one branch matches, we carry on as at
1635 the end of a normal bracket, leaving the subject pointer, but resetting
1636 the start-of-match value in case it was changed by \K. */
1637
1638 case OP_ONCE:
1639 prev = ecode;
1640 saved_eptr = eptr;
1641
1642 do
1643 {
1644 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1645 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1646 {
1647 mstart = md->start_match_ptr;
1648 break;
1649 }
1650 if (rrc != MATCH_NOMATCH &&
1651 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1652 RRETURN(rrc);
1653
1654 /* See explanatory comment above under OP_CBRA. */
1655
1656 if (md->end_offset_top > offset_top)
1657 {
1658 register int *iptr = md->offset_vector + offset_top;
1659 register int *iend = md->offset_vector + md->end_offset_top;
1660 while (iptr < iend) *iptr++ = -1;
1661 }
1662
1663 ecode += GET(ecode,1);
1664 }
1665 while (*ecode == OP_ALT);
1666
1667 /* If we hit the end of the group (which could be repeated), fail */
1668
1669 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1670
1671 /* Continue after the group, updating the offsets high water mark, since
1672 extracts may have been taken. */
1673
1674 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1675
1676 offset_top = md->end_offset_top;
1677 eptr = md->end_match_ptr;
1678
1679 /* For a non-repeating ket, just continue at this level. This also
1680 happens for a repeating ket if no characters were matched in the group.
1681 This is the forcible breaking of infinite loops as implemented in Perl
1682 5.005. If there is an options reset, it will get obeyed in the normal
1683 course of events. */
1684
1685 if (*ecode == OP_KET || eptr == saved_eptr)
1686 {
1687 ecode += 1+LINK_SIZE;
1688 break;
1689 }
1690
1691 /* The repeating kets try the rest of the pattern or restart from the
1692 preceding bracket, in the appropriate order. The second "call" of match()
1693 uses tail recursion, to avoid using another stack frame. */
1694
1695 if (*ecode == OP_KETRMIN)
1696 {
1697 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1698 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1699 ecode = prev;
1700 goto TAIL_RECURSE;
1701 }
1702 else /* OP_KETRMAX */
1703 {
1704 md->match_function_type = MATCH_CBEGROUP;
1705 RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1706 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1707 ecode += 1 + LINK_SIZE;
1708 goto TAIL_RECURSE;
1709 }
1710 /* Control never gets here */
1711
1712 /* An alternation is the end of a branch; scan along to find the end of the
1713 bracketed group and go to there. */
1714
1715 case OP_ALT:
1716 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1717 break;
1718
1719 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1720 indicating that it may occur zero times. It may repeat infinitely, or not
1721 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1722 with fixed upper repeat limits are compiled as a number of copies, with the
1723 optional ones preceded by BRAZERO or BRAMINZERO. */
1724
1725 case OP_BRAZERO:
1726 next = ecode + 1;
1727 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1728 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1729 do next += GET(next, 1); while (*next == OP_ALT);
1730 ecode = next + 1 + LINK_SIZE;
1731 break;
1732
1733 case OP_BRAMINZERO:
1734 next = ecode + 1;
1735 do next += GET(next, 1); while (*next == OP_ALT);
1736 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1737 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1738 ecode++;
1739 break;
1740
1741 case OP_SKIPZERO:
1742 next = ecode+1;
1743 do next += GET(next,1); while (*next == OP_ALT);
1744 ecode = next + 1 + LINK_SIZE;
1745 break;
1746
1747 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1748 here; just jump to the group, with allow_zero set TRUE. */
1749
1750 case OP_BRAPOSZERO:
1751 op = *(++ecode);
1752 allow_zero = TRUE;
1753 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1754 goto POSSESSIVE_NON_CAPTURE;
1755
1756 /* End of a group, repeated or non-repeating. */
1757
1758 case OP_KET:
1759 case OP_KETRMIN:
1760 case OP_KETRMAX:
1761 case OP_KETRPOS:
1762 prev = ecode - GET(ecode, 1);
1763
1764 /* If this was a group that remembered the subject start, in order to break
1765 infinite repeats of empty string matches, retrieve the subject start from
1766 the chain. Otherwise, set it NULL. */
1767
1768 if (*prev >= OP_SBRA)
1769 {
1770 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1771 eptrb = eptrb->epb_prev; /* Backup to previous group */
1772 }
1773 else saved_eptr = NULL;
1774
1775 /* If we are at the end of an assertion group or an atomic group, stop
1776 matching and return MATCH_MATCH, but record the current high water mark for
1777 use by positive assertions. We also need to record the match start in case
1778 it was changed by \K. */
1779
1780 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1781 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1782 *prev == OP_ONCE)
1783 {
1784 md->end_match_ptr = eptr; /* For ONCE */
1785 md->end_offset_top = offset_top;
1786 md->start_match_ptr = mstart;
1787 MRRETURN(MATCH_MATCH);
1788 }
1789
1790 /* For capturing groups we have to check the group number back at the start
1791 and if necessary complete handling an extraction by setting the offsets and
1792 bumping the high water mark. Note that whole-pattern recursion is coded as
1793 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1794 when the OP_END is reached. Other recursion is handled here. */
1795
1796 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1797 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1798 {
1799 number = GET2(prev, 1+LINK_SIZE);
1800 offset = number << 1;
1801
1802 #ifdef PCRE_DEBUG
1803 printf("end bracket %d", number);
1804 printf("\n");
1805 #endif
1806
1807 md->capture_last = number;
1808 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1809 {
1810 md->offset_vector[offset] =
1811 md->offset_vector[md->offset_end - number];
1812 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1813 if (offset_top <= offset) offset_top = offset + 2;
1814 }
1815
1816 /* Handle a recursively called group. Restore the offsets
1817 appropriately and continue from after the call. */
1818
1819 if (md->recursive != NULL && md->recursive->group_num == number)
1820 {
1821 recursion_info *rec = md->recursive;
1822 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1823 md->recursive = rec->prevrec;
1824 memcpy(md->offset_vector, rec->offset_save,
1825 rec->saved_max * sizeof(int));
1826 offset_top = rec->save_offset_top;
1827 ecode = rec->after_call;
1828 break;
1829 }
1830 }
1831
1832 /* For a non-repeating ket, just continue at this level. This also
1833 happens for a repeating ket if no characters were matched in the group.
1834 This is the forcible breaking of infinite loops as implemented in Perl
1835 5.005. If there is an options reset, it will get obeyed in the normal
1836 course of events. */
1837
1838 if (*ecode == OP_KET || eptr == saved_eptr)
1839 {
1840 ecode += 1 + LINK_SIZE;
1841 break;
1842 }
1843
1844 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1845 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1846 at a time from the outer level, thus saving stack. */
1847
1848 if (*ecode == OP_KETRPOS)
1849 {
1850 md->end_match_ptr = eptr;
1851 md->end_offset_top = offset_top;
1852 RRETURN(MATCH_KETRPOS);
1853 }
1854
1855 /* The normal repeating kets try the rest of the pattern or restart from
1856 the preceding bracket, in the appropriate order. In the second case, we can
1857 use tail recursion to avoid using another stack frame, unless we have an
1858 unlimited repeat of a group that can match an empty string. */
1859
1860 if (*ecode == OP_KETRMIN)
1861 {
1862 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1863 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1864 if (*prev >= OP_SBRA) /* Could match an empty string */
1865 {
1866 md->match_function_type = MATCH_CBEGROUP;
1867 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1868 RRETURN(rrc);
1869 }
1870 ecode = prev;
1871 goto TAIL_RECURSE;
1872 }
1873 else /* OP_KETRMAX */
1874 {
1875 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1876 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1877 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1878 ecode += 1 + LINK_SIZE;
1879 goto TAIL_RECURSE;
1880 }
1881 /* Control never gets here */
1882
1883 /* Not multiline mode: start of subject assertion, unless notbol. */
1884
1885 case OP_CIRC:
1886 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1887
1888 /* Start of subject assertion */
1889
1890 case OP_SOD:
1891 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1892 ecode++;
1893 break;
1894
1895 /* Multiline mode: start of subject unless notbol, or after any newline. */
1896
1897 case OP_CIRCM:
1898 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1899 if (eptr != md->start_subject &&
1900 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1901 MRRETURN(MATCH_NOMATCH);
1902 ecode++;
1903 break;
1904
1905 /* Start of match assertion */
1906
1907 case OP_SOM:
1908 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1909 ecode++;
1910 break;
1911
1912 /* Reset the start of match point */
1913
1914 case OP_SET_SOM:
1915 mstart = eptr;
1916 ecode++;
1917 break;
1918
1919 /* Multiline mode: assert before any newline, or before end of subject
1920 unless noteol is set. */
1921
1922 case OP_DOLLM:
1923 if (eptr < md->end_subject)
1924 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1925 else
1926 {
1927 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1928 SCHECK_PARTIAL();
1929 }
1930 ecode++;
1931 break;
1932
1933 /* Not multiline mode: assert before a terminating newline or before end of
1934 subject unless noteol is set. */
1935
1936 case OP_DOLL:
1937 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1938 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1939
1940 /* ... else fall through for endonly */
1941
1942 /* End of subject assertion (\z) */
1943
1944 case OP_EOD:
1945 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1946 SCHECK_PARTIAL();
1947 ecode++;
1948 break;
1949
1950 /* End of subject or ending \n assertion (\Z) */
1951
1952 case OP_EODN:
1953 ASSERT_NL_OR_EOS:
1954 if (eptr < md->end_subject &&
1955 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1956 MRRETURN(MATCH_NOMATCH);
1957
1958 /* Either at end of string or \n before end. */
1959
1960 SCHECK_PARTIAL();
1961 ecode++;
1962 break;
1963
1964 /* Word boundary assertions */
1965
1966 case OP_NOT_WORD_BOUNDARY:
1967 case OP_WORD_BOUNDARY:
1968 {
1969
1970 /* Find out if the previous and current characters are "word" characters.
1971 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1972 be "non-word" characters. Remember the earliest consulted character for
1973 partial matching. */
1974
1975 #ifdef SUPPORT_UTF8
1976 if (utf8)
1977 {
1978 /* Get status of previous character */
1979
1980 if (eptr == md->start_subject) prev_is_word = FALSE; else
1981 {
1982 USPTR lastptr = eptr - 1;
1983 while((*lastptr & 0xc0) == 0x80) lastptr--;
1984 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1985 GETCHAR(c, lastptr);
1986 #ifdef SUPPORT_UCP
1987 if (md->use_ucp)
1988 {
1989 if (c == '_') prev_is_word = TRUE; else
1990 {
1991 int cat = UCD_CATEGORY(c);
1992 prev_is_word = (cat == ucp_L || cat == ucp_N);
1993 }
1994 }
1995 else
1996 #endif
1997 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1998 }
1999
2000 /* Get status of next character */
2001
2002 if (eptr >= md->end_subject)
2003 {
2004 SCHECK_PARTIAL();
2005 cur_is_word = FALSE;
2006 }
2007 else
2008 {
2009 GETCHAR(c, eptr);
2010 #ifdef SUPPORT_UCP
2011 if (md->use_ucp)
2012 {
2013 if (c == '_') cur_is_word = TRUE; else
2014 {
2015 int cat = UCD_CATEGORY(c);
2016 cur_is_word = (cat == ucp_L || cat == ucp_N);
2017 }
2018 }
2019 else
2020 #endif
2021 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2022 }
2023 }
2024 else
2025 #endif
2026
2027 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2028 consistency with the behaviour of \w we do use it in this case. */
2029
2030 {
2031 /* Get status of previous character */
2032
2033 if (eptr == md->start_subject) prev_is_word = FALSE; else
2034 {
2035 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2036 #ifdef SUPPORT_UCP
2037 if (md->use_ucp)
2038 {
2039 c = eptr[-1];
2040 if (c == '_') prev_is_word = TRUE; else
2041 {
2042 int cat = UCD_CATEGORY(c);
2043 prev_is_word = (cat == ucp_L || cat == ucp_N);
2044 }
2045 }
2046 else
2047 #endif
2048 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2049 }
2050
2051 /* Get status of next character */
2052
2053 if (eptr >= md->end_subject)
2054 {
2055 SCHECK_PARTIAL();
2056 cur_is_word = FALSE;
2057 }
2058 else
2059 #ifdef SUPPORT_UCP
2060 if (md->use_ucp)
2061 {
2062 c = *eptr;
2063 if (c == '_') cur_is_word = TRUE; else
2064 {
2065 int cat = UCD_CATEGORY(c);
2066 cur_is_word = (cat == ucp_L || cat == ucp_N);
2067 }
2068 }
2069 else
2070 #endif
2071 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2072 }
2073
2074 /* Now see if the situation is what we want */
2075
2076 if ((*ecode++ == OP_WORD_BOUNDARY)?
2077 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2078 MRRETURN(MATCH_NOMATCH);
2079 }
2080 break;
2081
2082 /* Match a single character type; inline for speed */
2083
2084 case OP_ANY:
2085 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2086 /* Fall through */
2087
2088 case OP_ALLANY:
2089 if (eptr++ >= md->end_subject)
2090 {
2091 SCHECK_PARTIAL();
2092 MRRETURN(MATCH_NOMATCH);
2093 }
2094 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2095 ecode++;
2096 break;
2097
2098 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2099 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2100
2101 case OP_ANYBYTE:
2102 if (eptr++ >= md->end_subject)
2103 {
2104 SCHECK_PARTIAL();
2105 MRRETURN(MATCH_NOMATCH);
2106 }
2107 ecode++;
2108 break;
2109
2110 case OP_NOT_DIGIT:
2111 if (eptr >= md->end_subject)
2112 {
2113 SCHECK_PARTIAL();
2114 MRRETURN(MATCH_NOMATCH);
2115 }
2116 GETCHARINCTEST(c, eptr);
2117 if (
2118 #ifdef SUPPORT_UTF8
2119 c < 256 &&
2120 #endif
2121 (md->ctypes[c] & ctype_digit) != 0
2122 )
2123 MRRETURN(MATCH_NOMATCH);
2124 ecode++;
2125 break;
2126
2127 case OP_DIGIT:
2128 if (eptr >= md->end_subject)
2129 {
2130 SCHECK_PARTIAL();
2131 MRRETURN(MATCH_NOMATCH);
2132 }
2133 GETCHARINCTEST(c, eptr);
2134 if (
2135 #ifdef SUPPORT_UTF8
2136 c >= 256 ||
2137 #endif
2138 (md->ctypes[c] & ctype_digit) == 0
2139 )
2140 MRRETURN(MATCH_NOMATCH);
2141 ecode++;
2142 break;
2143
2144 case OP_NOT_WHITESPACE:
2145 if (eptr >= md->end_subject)
2146 {
2147 SCHECK_PARTIAL();
2148 MRRETURN(MATCH_NOMATCH);
2149 }
2150 GETCHARINCTEST(c, eptr);
2151 if (
2152 #ifdef SUPPORT_UTF8
2153 c < 256 &&
2154 #endif
2155 (md->ctypes[c] & ctype_space) != 0
2156 )
2157 MRRETURN(MATCH_NOMATCH);
2158 ecode++;
2159 break;
2160
2161 case OP_WHITESPACE:
2162 if (eptr >= md->end_subject)
2163 {
2164 SCHECK_PARTIAL();
2165 MRRETURN(MATCH_NOMATCH);
2166 }
2167 GETCHARINCTEST(c, eptr);
2168 if (
2169 #ifdef SUPPORT_UTF8
2170 c >= 256 ||
2171 #endif
2172 (md->ctypes[c] & ctype_space) == 0
2173 )
2174 MRRETURN(MATCH_NOMATCH);
2175 ecode++;
2176 break;
2177
2178 case OP_NOT_WORDCHAR:
2179 if (eptr >= md->end_subject)
2180 {
2181 SCHECK_PARTIAL();
2182 MRRETURN(MATCH_NOMATCH);
2183 }
2184 GETCHARINCTEST(c, eptr);
2185 if (
2186 #ifdef SUPPORT_UTF8
2187 c < 256 &&
2188 #endif
2189 (md->ctypes[c] & ctype_word) != 0
2190 )
2191 MRRETURN(MATCH_NOMATCH);
2192 ecode++;
2193 break;
2194
2195 case OP_WORDCHAR:
2196 if (eptr >= md->end_subject)
2197 {
2198 SCHECK_PARTIAL();
2199 MRRETURN(MATCH_NOMATCH);
2200 }
2201 GETCHARINCTEST(c, eptr);
2202 if (
2203 #ifdef SUPPORT_UTF8
2204 c >= 256 ||
2205 #endif
2206 (md->ctypes[c] & ctype_word) == 0
2207 )
2208 MRRETURN(MATCH_NOMATCH);
2209 ecode++;
2210 break;
2211
2212 case OP_ANYNL:
2213 if (eptr >= md->end_subject)
2214 {
2215 SCHECK_PARTIAL();
2216 MRRETURN(MATCH_NOMATCH);
2217 }
2218 GETCHARINCTEST(c, eptr);
2219 switch(c)
2220 {
2221 default: MRRETURN(MATCH_NOMATCH);
2222
2223 case 0x000d:
2224 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2225 break;
2226
2227 case 0x000a:
2228 break;
2229
2230 case 0x000b:
2231 case 0x000c:
2232 case 0x0085:
2233 case 0x2028:
2234 case 0x2029:
2235 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2236 break;
2237 }
2238 ecode++;
2239 break;
2240
2241 case OP_NOT_HSPACE:
2242 if (eptr >= md->end_subject)
2243 {
2244 SCHECK_PARTIAL();
2245 MRRETURN(MATCH_NOMATCH);
2246 }
2247 GETCHARINCTEST(c, eptr);
2248 switch(c)
2249 {
2250 default: break;
2251 case 0x09: /* HT */
2252 case 0x20: /* SPACE */
2253 case 0xa0: /* NBSP */
2254 case 0x1680: /* OGHAM SPACE MARK */
2255 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2256 case 0x2000: /* EN QUAD */
2257 case 0x2001: /* EM QUAD */
2258 case 0x2002: /* EN SPACE */
2259 case 0x2003: /* EM SPACE */
2260 case 0x2004: /* THREE-PER-EM SPACE */
2261 case 0x2005: /* FOUR-PER-EM SPACE */
2262 case 0x2006: /* SIX-PER-EM SPACE */
2263 case 0x2007: /* FIGURE SPACE */
2264 case 0x2008: /* PUNCTUATION SPACE */
2265 case 0x2009: /* THIN SPACE */
2266 case 0x200A: /* HAIR SPACE */
2267 case 0x202f: /* NARROW NO-BREAK SPACE */
2268 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2269 case 0x3000: /* IDEOGRAPHIC SPACE */
2270 MRRETURN(MATCH_NOMATCH);
2271 }
2272 ecode++;
2273 break;
2274
2275 case OP_HSPACE:
2276 if (eptr >= md->end_subject)
2277 {
2278 SCHECK_PARTIAL();
2279 MRRETURN(MATCH_NOMATCH);
2280 }
2281 GETCHARINCTEST(c, eptr);
2282 switch(c)
2283 {
2284 default: MRRETURN(MATCH_NOMATCH);
2285 case 0x09: /* HT */
2286 case 0x20: /* SPACE */
2287 case 0xa0: /* NBSP */
2288 case 0x1680: /* OGHAM SPACE MARK */
2289 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2290 case 0x2000: /* EN QUAD */
2291 case 0x2001: /* EM QUAD */
2292 case 0x2002: /* EN SPACE */
2293 case 0x2003: /* EM SPACE */
2294 case 0x2004: /* THREE-PER-EM SPACE */
2295 case 0x2005: /* FOUR-PER-EM SPACE */
2296 case 0x2006: /* SIX-PER-EM SPACE */
2297 case 0x2007: /* FIGURE SPACE */
2298 case 0x2008: /* PUNCTUATION SPACE */
2299 case 0x2009: /* THIN SPACE */
2300 case 0x200A: /* HAIR SPACE */
2301 case 0x202f: /* NARROW NO-BREAK SPACE */
2302 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2303 case 0x3000: /* IDEOGRAPHIC SPACE */
2304 break;
2305 }
2306 ecode++;
2307 break;
2308
2309 case OP_NOT_VSPACE:
2310 if (eptr >= md->end_subject)
2311 {
2312 SCHECK_PARTIAL();
2313 MRRETURN(MATCH_NOMATCH);
2314 }
2315 GETCHARINCTEST(c, eptr);
2316 switch(c)
2317 {
2318 default: break;
2319 case 0x0a: /* LF */
2320 case 0x0b: /* VT */
2321 case 0x0c: /* FF */
2322 case 0x0d: /* CR */
2323 case 0x85: /* NEL */
2324 case 0x2028: /* LINE SEPARATOR */
2325 case 0x2029: /* PARAGRAPH SEPARATOR */
2326 MRRETURN(MATCH_NOMATCH);
2327 }
2328 ecode++;
2329 break;
2330
2331 case OP_VSPACE:
2332 if (eptr >= md->end_subject)
2333 {
2334 SCHECK_PARTIAL();
2335 MRRETURN(MATCH_NOMATCH);
2336 }
2337 GETCHARINCTEST(c, eptr);
2338 switch(c)
2339 {
2340 default: MRRETURN(MATCH_NOMATCH);
2341 case 0x0a: /* LF */
2342 case 0x0b: /* VT */
2343 case 0x0c: /* FF */
2344 case 0x0d: /* CR */
2345 case 0x85: /* NEL */
2346 case 0x2028: /* LINE SEPARATOR */
2347 case 0x2029: /* PARAGRAPH SEPARATOR */
2348 break;
2349 }
2350 ecode++;
2351 break;
2352
2353 #ifdef SUPPORT_UCP
2354 /* Check the next character by Unicode property. We will get here only
2355 if the support is in the binary; otherwise a compile-time error occurs. */
2356
2357 case OP_PROP:
2358 case OP_NOTPROP:
2359 if (eptr >= md->end_subject)
2360 {
2361 SCHECK_PARTIAL();
2362 MRRETURN(MATCH_NOMATCH);
2363 }
2364 GETCHARINCTEST(c, eptr);
2365 {
2366 const ucd_record *prop = GET_UCD(c);
2367
2368 switch(ecode[1])
2369 {
2370 case PT_ANY:
2371 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2372 break;
2373
2374 case PT_LAMP:
2375 if ((prop->chartype == ucp_Lu ||
2376 prop->chartype == ucp_Ll ||
2377 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2378 MRRETURN(MATCH_NOMATCH);
2379 break;
2380
2381 case PT_GC:
2382 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2383 MRRETURN(MATCH_NOMATCH);
2384 break;
2385
2386 case PT_PC:
2387 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2388 MRRETURN(MATCH_NOMATCH);
2389 break;
2390
2391 case PT_SC:
2392 if ((ecode[2] != prop->script) == (op == OP_PROP))
2393 MRRETURN(MATCH_NOMATCH);
2394 break;
2395
2396 /* These are specials */
2397
2398 case PT_ALNUM:
2399 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2400 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2401 MRRETURN(MATCH_NOMATCH);
2402 break;
2403
2404 case PT_SPACE: /* Perl space */
2405 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2406 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2407 == (op == OP_NOTPROP))
2408 MRRETURN(MATCH_NOMATCH);
2409 break;
2410
2411 case PT_PXSPACE: /* POSIX space */
2412 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2413 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2414 c == CHAR_FF || c == CHAR_CR)
2415 == (op == OP_NOTPROP))
2416 MRRETURN(MATCH_NOMATCH);
2417 break;
2418
2419 case PT_WORD:
2420 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2421 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2422 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2423 MRRETURN(MATCH_NOMATCH);
2424 break;
2425
2426 /* This should never occur */
2427
2428 default:
2429 RRETURN(PCRE_ERROR_INTERNAL);
2430 }
2431
2432 ecode += 3;
2433 }
2434 break;
2435
2436 /* Match an extended Unicode sequence. We will get here only if the support
2437 is in the binary; otherwise a compile-time error occurs. */
2438
2439 case OP_EXTUNI:
2440 if (eptr >= md->end_subject)
2441 {
2442 SCHECK_PARTIAL();
2443 MRRETURN(MATCH_NOMATCH);
2444 }
2445 GETCHARINCTEST(c, eptr);
2446 {
2447 int category = UCD_CATEGORY(c);
2448 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2449 while (eptr < md->end_subject)
2450 {
2451 int len = 1;
2452 if (!utf8) c = *eptr; else
2453 {
2454 GETCHARLEN(c, eptr, len);
2455 }
2456 category = UCD_CATEGORY(c);
2457 if (category != ucp_M) break;
2458 eptr += len;
2459 }
2460 }
2461 ecode++;
2462 break;
2463 #endif
2464
2465
2466 /* Match a back reference, possibly repeatedly. Look past the end of the
2467 item to see if there is repeat information following. The code is similar
2468 to that for character classes, but repeated for efficiency. Then obey
2469 similar code to character type repeats - written out again for speed.
2470 However, if the referenced string is the empty string, always treat
2471 it as matched, any number of times (otherwise there could be infinite
2472 loops). */
2473
2474 case OP_REF:
2475 case OP_REFI:
2476 caseless = op == OP_REFI;
2477 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2478 ecode += 3;
2479
2480 /* If the reference is unset, there are two possibilities:
2481
2482 (a) In the default, Perl-compatible state, set the length negative;
2483 this ensures that every attempt at a match fails. We can't just fail
2484 here, because of the possibility of quantifiers with zero minima.
2485
2486 (b) If the JavaScript compatibility flag is set, set the length to zero
2487 so that the back reference matches an empty string.
2488
2489 Otherwise, set the length to the length of what was matched by the
2490 referenced subpattern. */
2491
2492 if (offset >= offset_top || md->offset_vector[offset] < 0)
2493 length = (md->jscript_compat)? 0 : -1;
2494 else
2495 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2496
2497 /* Set up for repetition, or handle the non-repeated case */
2498
2499 switch (*ecode)
2500 {
2501 case OP_CRSTAR:
2502 case OP_CRMINSTAR:
2503 case OP_CRPLUS:
2504 case OP_CRMINPLUS:
2505 case OP_CRQUERY:
2506 case OP_CRMINQUERY:
2507 c = *ecode++ - OP_CRSTAR;
2508 minimize = (c & 1) != 0;
2509 min = rep_min[c]; /* Pick up values from tables; */
2510 max = rep_max[c]; /* zero for max => infinity */
2511 if (max == 0) max = INT_MAX;
2512 break;
2513
2514 case OP_CRRANGE:
2515 case OP_CRMINRANGE:
2516 minimize = (*ecode == OP_CRMINRANGE);
2517 min = GET2(ecode, 1);
2518 max = GET2(ecode, 3);
2519 if (max == 0) max = INT_MAX;
2520 ecode += 5;
2521 break;
2522
2523 default: /* No repeat follows */
2524 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2525 {
2526 CHECK_PARTIAL();
2527 MRRETURN(MATCH_NOMATCH);
2528 }
2529 eptr += length;
2530 continue; /* With the main loop */
2531 }
2532
2533 /* Handle repeated back references. If the length of the reference is
2534 zero, just continue with the main loop. */
2535
2536 if (length == 0) continue;
2537
2538 /* First, ensure the minimum number of matches are present. We get back
2539 the length of the reference string explicitly rather than passing the
2540 address of eptr, so that eptr can be a register variable. */
2541
2542 for (i = 1; i <= min; i++)
2543 {
2544 int slength;
2545 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2546 {
2547 CHECK_PARTIAL();
2548 MRRETURN(MATCH_NOMATCH);
2549 }
2550 eptr += slength;
2551 }
2552
2553 /* If min = max, continue at the same level without recursion.
2554 They are not both allowed to be zero. */
2555
2556 if (min == max) continue;
2557
2558 /* If minimizing, keep trying and advancing the pointer */
2559
2560 if (minimize)
2561 {
2562 for (fi = min;; fi++)
2563 {
2564 int slength;
2565 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2566 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2567 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2568 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2569 {
2570 CHECK_PARTIAL();
2571 MRRETURN(MATCH_NOMATCH);
2572 }
2573 eptr += slength;
2574 }
2575 /* Control never gets here */
2576 }
2577
2578 /* If maximizing, find the longest string and work backwards */
2579
2580 else
2581 {
2582 pp = eptr;
2583 for (i = min; i < max; i++)
2584 {
2585 int slength;
2586 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2587 {
2588 CHECK_PARTIAL();
2589 break;
2590 }
2591 eptr += slength;
2592 }
2593 while (eptr >= pp)
2594 {
2595 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2596 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2597 eptr -= length;
2598 }
2599 MRRETURN(MATCH_NOMATCH);
2600 }
2601 /* Control never gets here */
2602
2603 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2604 used when all the characters in the class have values in the range 0-255,
2605 and either the matching is caseful, or the characters are in the range
2606 0-127 when UTF-8 processing is enabled. The only difference between
2607 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2608 encountered.
2609
2610 First, look past the end of the item to see if there is repeat information
2611 following. Then obey similar code to character type repeats - written out
2612 again for speed. */
2613
2614 case OP_NCLASS:
2615 case OP_CLASS:
2616 {
2617 data = ecode + 1; /* Save for matching */
2618 ecode += 33; /* Advance past the item */
2619
2620 switch (*ecode)
2621 {
2622 case OP_CRSTAR:
2623 case OP_CRMINSTAR:
2624 case OP_CRPLUS:
2625 case OP_CRMINPLUS:
2626 case OP_CRQUERY:
2627 case OP_CRMINQUERY:
2628 c = *ecode++ - OP_CRSTAR;
2629 minimize = (c & 1) != 0;
2630 min = rep_min[c]; /* Pick up values from tables; */
2631 max = rep_max[c]; /* zero for max => infinity */
2632 if (max == 0) max = INT_MAX;
2633 break;
2634
2635 case OP_CRRANGE:
2636 case OP_CRMINRANGE:
2637 minimize = (*ecode == OP_CRMINRANGE);
2638 min = GET2(ecode, 1);
2639 max = GET2(ecode, 3);
2640 if (max == 0) max = INT_MAX;
2641 ecode += 5;
2642 break;
2643
2644 default: /* No repeat follows */
2645 min = max = 1;
2646 break;
2647 }
2648
2649 /* First, ensure the minimum number of matches are present. */
2650
2651 #ifdef SUPPORT_UTF8
2652 /* UTF-8 mode */
2653 if (utf8)
2654 {
2655 for (i = 1; i <= min; i++)
2656 {
2657 if (eptr >= md->end_subject)
2658 {
2659 SCHECK_PARTIAL();
2660 MRRETURN(MATCH_NOMATCH);
2661 }
2662 GETCHARINC(c, eptr);
2663 if (c > 255)
2664 {
2665 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2666 }
2667 else
2668 {
2669 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2670 }
2671 }
2672 }
2673 else
2674 #endif
2675 /* Not UTF-8 mode */
2676 {
2677 for (i = 1; i <= min; i++)
2678 {
2679 if (eptr >= md->end_subject)
2680 {
2681 SCHECK_PARTIAL();
2682 MRRETURN(MATCH_NOMATCH);
2683 }
2684 c = *eptr++;
2685 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2686 }
2687 }
2688
2689 /* If max == min we can continue with the main loop without the
2690 need to recurse. */
2691
2692 if (min == max) continue;
2693
2694 /* If minimizing, keep testing the rest of the expression and advancing
2695 the pointer while it matches the class. */
2696
2697 if (minimize)
2698 {
2699 #ifdef SUPPORT_UTF8
2700 /* UTF-8 mode */
2701 if (utf8)
2702 {
2703 for (fi = min;; fi++)
2704 {
2705 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2706 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2707 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2708 if (eptr >= md->end_subject)
2709 {
2710 SCHECK_PARTIAL();
2711 MRRETURN(MATCH_NOMATCH);
2712 }
2713 GETCHARINC(c, eptr);
2714 if (c > 255)
2715 {
2716 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2717 }
2718 else
2719 {
2720 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2721 }
2722 }
2723 }
2724 else
2725 #endif
2726 /* Not UTF-8 mode */
2727 {
2728 for (fi = min;; fi++)
2729 {
2730 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2731 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2732 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2733 if (eptr >= md->end_subject)
2734 {
2735 SCHECK_PARTIAL();
2736 MRRETURN(MATCH_NOMATCH);
2737 }
2738 c = *eptr++;
2739 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2740 }
2741 }
2742 /* Control never gets here */
2743 }
2744
2745 /* If maximizing, find the longest possible run, then work backwards. */
2746
2747 else
2748 {
2749 pp = eptr;
2750
2751 #ifdef SUPPORT_UTF8
2752 /* UTF-8 mode */
2753 if (utf8)
2754 {
2755 for (i = min; i < max; i++)
2756 {
2757 int len = 1;
2758 if (eptr >= md->end_subject)
2759 {
2760 SCHECK_PARTIAL();
2761 break;
2762 }
2763 GETCHARLEN(c, eptr, len);
2764 if (c > 255)
2765 {
2766 if (op == OP_CLASS) break;
2767 }
2768 else
2769 {
2770 if ((data[c/8] & (1 << (c&7))) == 0) break;
2771 }
2772 eptr += len;
2773 }
2774 for (;;)
2775 {
2776 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2777 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2778 if (eptr-- == pp) break; /* Stop if tried at original pos */
2779 BACKCHAR(eptr);
2780 }
2781 }
2782 else
2783 #endif
2784 /* Not UTF-8 mode */
2785 {
2786 for (i = min; i < max; i++)
2787 {
2788 if (eptr >= md->end_subject)
2789 {
2790 SCHECK_PARTIAL();
2791 break;
2792 }
2793 c = *eptr;
2794 if ((data[c/8] & (1 << (c&7))) == 0) break;
2795 eptr++;
2796 }
2797 while (eptr >= pp)
2798 {
2799 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2800 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2801 eptr--;
2802 }
2803 }
2804
2805 MRRETURN(MATCH_NOMATCH);
2806 }
2807 }
2808 /* Control never gets here */
2809
2810
2811 /* Match an extended character class. This opcode is encountered only
2812 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2813 mode, because Unicode properties are supported in non-UTF-8 mode. */
2814
2815 #ifdef SUPPORT_UTF8
2816 case OP_XCLASS:
2817 {
2818 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2819 ecode += GET(ecode, 1); /* Advance past the item */
2820
2821 switch (*ecode)
2822 {
2823 case OP_CRSTAR:
2824 case OP_CRMINSTAR:
2825 case OP_CRPLUS:
2826 case OP_CRMINPLUS:
2827 case OP_CRQUERY:
2828 case OP_CRMINQUERY:
2829 c = *ecode++ - OP_CRSTAR;
2830 minimize = (c & 1) != 0;
2831 min = rep_min[c]; /* Pick up values from tables; */
2832 max = rep_max[c]; /* zero for max => infinity */
2833 if (max == 0) max = INT_MAX;
2834 break;
2835
2836 case OP_CRRANGE:
2837 case OP_CRMINRANGE:
2838 minimize = (*ecode == OP_CRMINRANGE);
2839 min = GET2(ecode, 1);
2840 max = GET2(ecode, 3);
2841 if (max == 0) max = INT_MAX;
2842 ecode += 5;
2843 break;
2844
2845 default: /* No repeat follows */
2846 min = max = 1;
2847 break;
2848 }
2849
2850 /* First, ensure the minimum number of matches are present. */
2851
2852 for (i = 1; i <= min; i++)
2853 {
2854 if (eptr >= md->end_subject)
2855 {
2856 SCHECK_PARTIAL();
2857 MRRETURN(MATCH_NOMATCH);
2858 }
2859 GETCHARINCTEST(c, eptr);
2860 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2861 }
2862
2863 /* If max == min we can continue with the main loop without the
2864 need to recurse. */
2865
2866 if (min == max) continue;
2867
2868 /* If minimizing, keep testing the rest of the expression and advancing
2869 the pointer while it matches the class. */
2870
2871 if (minimize)
2872 {
2873 for (fi = min;; fi++)
2874 {
2875 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2877 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2878 if (eptr >= md->end_subject)
2879 {
2880 SCHECK_PARTIAL();
2881 MRRETURN(MATCH_NOMATCH);
2882 }
2883 GETCHARINCTEST(c, eptr);
2884 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2885 }
2886 /* Control never gets here */
2887 }
2888
2889 /* If maximizing, find the longest possible run, then work backwards. */
2890
2891 else
2892 {
2893 pp = eptr;
2894 for (i = min; i < max; i++)
2895 {
2896 int len = 1;
2897 if (eptr >= md->end_subject)
2898 {
2899 SCHECK_PARTIAL();
2900 break;
2901 }
2902 GETCHARLENTEST(c, eptr, len);
2903 if (!_pcre_xclass(c, data)) break;
2904 eptr += len;
2905 }
2906 for(;;)
2907 {
2908 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2909 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2910 if (eptr-- == pp) break; /* Stop if tried at original pos */
2911 if (utf8) BACKCHAR(eptr);
2912 }
2913 MRRETURN(MATCH_NOMATCH);
2914 }
2915
2916 /* Control never gets here */
2917 }
2918 #endif /* End of XCLASS */
2919
2920 /* Match a single character, casefully */
2921
2922 case OP_CHAR:
2923 #ifdef SUPPORT_UTF8
2924 if (utf8)
2925 {
2926 length = 1;
2927 ecode++;
2928 GETCHARLEN(fc, ecode, length);
2929 if (length > md->end_subject - eptr)
2930 {
2931 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2932 MRRETURN(MATCH_NOMATCH);
2933 }
2934 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2935 }
2936 else
2937 #endif
2938
2939 /* Non-UTF-8 mode */
2940 {
2941 if (md->end_subject - eptr < 1)
2942 {
2943 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2944 MRRETURN(MATCH_NOMATCH);
2945 }
2946 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2947 ecode += 2;
2948 }
2949 break;
2950
2951 /* Match a single character, caselessly */
2952
2953 case OP_CHARI:
2954 #ifdef SUPPORT_UTF8
2955 if (utf8)
2956 {
2957 length = 1;
2958 ecode++;
2959 GETCHARLEN(fc, ecode, length);
2960
2961 if (length > md->end_subject - eptr)
2962 {
2963 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2964 MRRETURN(MATCH_NOMATCH);
2965 }
2966
2967 /* If the pattern character's value is < 128, we have only one byte, and
2968 can use the fast lookup table. */
2969
2970 if (fc < 128)
2971 {
2972 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2973 }
2974
2975 /* Otherwise we must pick up the subject character */
2976
2977 else
2978 {
2979 unsigned int dc;
2980 GETCHARINC(dc, eptr);
2981 ecode += length;
2982
2983 /* If we have Unicode property support, we can use it to test the other
2984 case of the character, if there is one. */
2985
2986 if (fc != dc)
2987 {
2988 #ifdef SUPPORT_UCP
2989 if (dc != UCD_OTHERCASE(fc))
2990 #endif
2991 MRRETURN(MATCH_NOMATCH);
2992 }
2993 }
2994 }
2995 else
2996 #endif /* SUPPORT_UTF8 */
2997
2998 /* Non-UTF-8 mode */
2999 {
3000 if (md->end_subject - eptr < 1)
3001 {
3002 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
3003 MRRETURN(MATCH_NOMATCH);
3004 }
3005 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3006 ecode += 2;
3007 }
3008 break;
3009
3010 /* Match a single character repeatedly. */
3011
3012 case OP_EXACT:
3013 case OP_EXACTI:
3014 min = max = GET2(ecode, 1);
3015 ecode += 3;
3016 goto REPEATCHAR;
3017
3018 case OP_POSUPTO:
3019 case OP_POSUPTOI:
3020 possessive = TRUE;
3021 /* Fall through */
3022
3023 case OP_UPTO:
3024 case OP_UPTOI:
3025 case OP_MINUPTO:
3026 case OP_MINUPTOI:
3027 min = 0;
3028 max = GET2(ecode, 1);
3029 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3030 ecode += 3;
3031 goto REPEATCHAR;
3032
3033 case OP_POSSTAR:
3034 case OP_POSSTARI:
3035 possessive = TRUE;
3036 min = 0;
3037 max = INT_MAX;
3038 ecode++;
3039 goto REPEATCHAR;
3040
3041 case OP_POSPLUS:
3042 case OP_POSPLUSI:
3043 possessive = TRUE;
3044 min = 1;
3045 max = INT_MAX;
3046 ecode++;
3047 goto REPEATCHAR;
3048
3049 case OP_POSQUERY:
3050 case OP_POSQUERYI:
3051 possessive = TRUE;
3052 min = 0;
3053 max = 1;
3054 ecode++;
3055 goto REPEATCHAR;
3056
3057 case OP_STAR:
3058 case OP_STARI:
3059 case OP_MINSTAR:
3060 case OP_MINSTARI:
3061 case OP_PLUS:
3062 case OP_PLUSI:
3063 case OP_MINPLUS:
3064 case OP_MINPLUSI:
3065 case OP_QUERY:
3066 case OP_QUERYI:
3067 case OP_MINQUERY:
3068 case OP_MINQUERYI:
3069 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3070 minimize = (c & 1) != 0;
3071 min = rep_min[c]; /* Pick up values from tables; */
3072 max = rep_max[c]; /* zero for max => infinity */
3073 if (max == 0) max = INT_MAX;
3074
3075 /* Common code for all repeated single-character matches. */
3076
3077 REPEATCHAR:
3078 #ifdef SUPPORT_UTF8
3079 if (utf8)
3080 {
3081 length = 1;
3082 charptr = ecode;
3083 GETCHARLEN(fc, ecode, length);
3084 ecode += length;
3085
3086 /* Handle multibyte character matching specially here. There is
3087 support for caseless matching if UCP support is present. */
3088
3089 if (length > 1)
3090 {
3091 #ifdef SUPPORT_UCP
3092 unsigned int othercase;
3093 if (op >= OP_STARI && /* Caseless */
3094 (othercase = UCD_OTHERCASE(fc)) != fc)
3095 oclength = _pcre_ord2utf8(othercase, occhars);
3096 else oclength = 0;
3097 #endif /* SUPPORT_UCP */
3098
3099 for (i = 1; i <= min; i++)
3100 {
3101 if (eptr <= md->end_subject - length &&
3102 memcmp(eptr, charptr, length) == 0) eptr += length;
3103 #ifdef SUPPORT_UCP
3104 else if (oclength > 0 &&
3105 eptr <= md->end_subject - oclength &&
3106 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3107 #endif /* SUPPORT_UCP */
3108 else
3109 {
3110 CHECK_PARTIAL();
3111 MRRETURN(MATCH_NOMATCH);
3112 }
3113 }
3114
3115 if (min == max) continue;
3116
3117 if (minimize)
3118 {
3119 for (fi = min;; fi++)
3120 {
3121 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3122 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3123 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3124 if (eptr <= md->end_subject - length &&
3125 memcmp(eptr, charptr, length) == 0) eptr += length;
3126 #ifdef SUPPORT_UCP
3127 else if (oclength > 0 &&
3128 eptr <= md->end_subject - oclength &&
3129 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3130 #endif /* SUPPORT_UCP */
3131 else
3132 {
3133 CHECK_PARTIAL();
3134 MRRETURN(MATCH_NOMATCH);
3135 }
3136 }
3137 /* Control never gets here */
3138 }
3139
3140 else /* Maximize */
3141 {
3142 pp = eptr;
3143 for (i = min; i < max; i++)
3144 {
3145 if (eptr <= md->end_subject - length &&
3146 memcmp(eptr, charptr, length) == 0) eptr += length;
3147 #ifdef SUPPORT_UCP
3148 else if (oclength > 0 &&
3149 eptr <= md->end_subject - oclength &&
3150 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3151 #endif /* SUPPORT_UCP */
3152 else
3153 {
3154 CHECK_PARTIAL();
3155 break;
3156 }
3157 }
3158
3159 if (possessive) continue;
3160
3161 for(;;)
3162 {
3163 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3164 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3165 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3166 #ifdef SUPPORT_UCP
3167 eptr--;
3168 BACKCHAR(eptr);
3169 #else /* without SUPPORT_UCP */
3170 eptr -= length;
3171 #endif /* SUPPORT_UCP */
3172 }
3173 }
3174 /* Control never gets here */
3175 }
3176
3177 /* If the length of a UTF-8 character is 1, we fall through here, and
3178 obey the code as for non-UTF-8 characters below, though in this case the
3179 value of fc will always be < 128. */
3180 }
3181 else
3182 #endif /* SUPPORT_UTF8 */
3183
3184 /* When not in UTF-8 mode, load a single-byte character. */
3185
3186 fc = *ecode++;
3187
3188 /* The value of fc at this point is always less than 256, though we may or
3189 may not be in UTF-8 mode. The code is duplicated for the caseless and
3190 caseful cases, for speed, since matching characters is likely to be quite
3191 common. First, ensure the minimum number of matches are present. If min =
3192 max, continue at the same level without recursing. Otherwise, if
3193 minimizing, keep trying the rest of the expression and advancing one
3194 matching character if failing, up to the maximum. Alternatively, if
3195 maximizing, find the maximum number of characters and work backwards. */
3196
3197 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3198 max, eptr));
3199
3200 if (op >= OP_STARI) /* Caseless */
3201 {
3202 fc = md->lcc[fc];
3203 for (i = 1; i <= min; i++)
3204 {
3205 if (eptr >= md->end_subject)
3206 {
3207 SCHECK_PARTIAL();
3208 MRRETURN(MATCH_NOMATCH);
3209 }
3210 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3211 }
3212 if (min == max) continue;
3213 if (minimize)
3214 {
3215 for (fi = min;; fi++)
3216 {
3217 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3218 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3219 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3220 if (eptr >= md->end_subject)
3221 {
3222 SCHECK_PARTIAL();
3223 MRRETURN(MATCH_NOMATCH);
3224 }
3225 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3226 }
3227 /* Control never gets here */
3228 }
3229 else /* Maximize */
3230 {
3231 pp = eptr;
3232 for (i = min; i < max; i++)
3233 {
3234 if (eptr >= md->end_subject)
3235 {
3236 SCHECK_PARTIAL();
3237 break;
3238 }
3239 if (fc != md->lcc[*eptr]) break;
3240 eptr++;
3241 }
3242
3243 if (possessive) continue;
3244
3245 while (eptr >= pp)
3246 {
3247 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3248 eptr--;
3249 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3250 }
3251 MRRETURN(MATCH_NOMATCH);
3252 }
3253 /* Control never gets here */
3254 }
3255
3256 /* Caseful comparisons (single-byte characters only; multibyte characters are handled above) */
3257
3258 else
3259 {
3260 for (i = 1; i <= min; i++)
3261 {
3262 if (eptr >= md->end_subject)
3263 {
3264 SCHECK_PARTIAL();
3265 MRRETURN(MATCH_NOMATCH);
3266 }
3267 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3268 }
3269
3270 if (min == max) continue;
3271
3272 if (minimize)
3273 {
3274 for (fi = min;; fi++)
3275 {
3276 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3278 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3279 if (eptr >= md->end_subject)
3280 {
3281 SCHECK_PARTIAL();
3282 MRRETURN(MATCH_NOMATCH);
3283 }
3284 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3285 }
3286 /* Control never gets here */
3287 }
3288 else /* Maximize */
3289 {
3290 pp = eptr;
3291 for (i = min; i < max; i++)
3292 {
3293 if (eptr >= md->end_subject)
3294 {
3295 SCHECK_PARTIAL();
3296 break;
3297 }
3298 if (fc != *eptr) break;
3299 eptr++;
3300 }
3301 if (possessive) continue;
3302
3303 while (eptr >= pp)
3304 {
3305 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3306 eptr--;
3307 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3308 }
3309 MRRETURN(MATCH_NOMATCH);
3310 }
3311 }
3312 /* Control never gets here */
3313
3314 /* Match a negated single one-byte character. The pattern character is
3315 always one byte, but the subject character we are checking can be multibyte. */
3316
3317 case OP_NOT:
3318 case OP_NOTI:
3319 if (eptr >= md->end_subject)
3320 {
3321 SCHECK_PARTIAL();
3322 MRRETURN(MATCH_NOMATCH);
3323 }
3324 ecode++;
3325 GETCHARINCTEST(c, eptr);
3326 if (op == OP_NOTI) /* The caseless case */
3327 {
3328 #ifdef SUPPORT_UTF8
3329 if (c < 256)
3330 #endif
3331 c = md->lcc[c];
3332 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3333 }
3334 else /* Caseful */
3335 {
3336 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3337 }
3338 break;
3339
3340 /* Match a negated single one-byte character repeatedly. This is almost a
3341 repeat of the code for a repeated single character, but I haven't found a
3342 nice way of commoning these up that doesn't require a test of the
3343 positive/negative option for each character match. Maybe that wouldn't add
3344 very much to the time taken, but character matching *is* what this is all
3345 about... */
3346
3347 case OP_NOTEXACT:
3348 case OP_NOTEXACTI:
3349 min = max = GET2(ecode, 1);
3350 ecode += 3;
3351 goto REPEATNOTCHAR;
3352
3353 case OP_NOTUPTO:
3354 case OP_NOTUPTOI:
3355 case OP_NOTMINUPTO:
3356 case OP_NOTMINUPTOI:
3357 min = 0;
3358 max = GET2(ecode, 1);
3359 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3360 ecode += 3;
3361 goto REPEATNOTCHAR;
3362
3363 case OP_NOTPOSSTAR:
3364 case OP_NOTPOSSTARI:
3365 possessive = TRUE;
3366 min = 0;
3367 max = INT_MAX;
3368 ecode++;
3369 goto REPEATNOTCHAR;
3370
3371 case OP_NOTPOSPLUS:
3372 case OP_NOTPOSPLUSI:
3373 possessive = TRUE;
3374 min = 1;
3375 max = INT_MAX;
3376 ecode++;
3377 goto REPEATNOTCHAR;
3378
3379 case OP_NOTPOSQUERY:
3380 case OP_NOTPOSQUERYI:
3381 possessive = TRUE;
3382 min = 0;
3383 max = 1;
3384 ecode++;
3385 goto REPEATNOTCHAR;
3386
3387 case OP_NOTPOSUPTO:
3388 case OP_NOTPOSUPTOI:
3389 possessive = TRUE;
3390 min = 0;
3391 max = GET2(ecode, 1);
3392 ecode += 3;
3393 goto REPEATNOTCHAR;
3394
3395 case OP_NOTSTAR:
3396 case OP_NOTSTARI:
3397 case OP_NOTMINSTAR:
3398 case OP_NOTMINSTARI:
3399 case OP_NOTPLUS:
3400 case OP_NOTPLUSI:
3401 case OP_NOTMINPLUS:
3402 case OP_NOTMINPLUSI:
3403 case OP_NOTQUERY:
3404 case OP_NOTQUERYI:
3405 case OP_NOTMINQUERY:
3406 case OP_NOTMINQUERYI:
3407 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3408 minimize = (c & 1) != 0;
3409 min = rep_min[c]; /* Pick up values from tables; */
3410 max = rep_max[c]; /* zero for max => infinity */
3411 if (max == 0) max = INT_MAX;
3412
3413 /* Common code for all repeated negated single-byte matches. */
3414
3415 REPEATNOTCHAR:
3416 fc = *ecode++;
3417
3418 /* The code is duplicated for the caseless and caseful cases, for speed,
3419 since matching characters is likely to be quite common. First, ensure the
3420 minimum number of matches are present. If min = max, continue at the same
3421 level without recursing. Otherwise, if minimizing, keep trying the rest of
3422 the expression and advancing one matching character if failing, up to the
3423 maximum. Alternatively, if maximizing, find the maximum number of
3424 characters and work backwards. */
3425
3426 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3427 max, eptr));
3428
3429 if (op >= OP_NOTSTARI) /* Caseless */
3430 {
3431 fc = md->lcc[fc];
3432
3433 #ifdef SUPPORT_UTF8
3434 /* UTF-8 mode */
3435 if (utf8)
3436 {
3437 register unsigned int d;
3438 for (i = 1; i <= min; i++)
3439 {
3440 if (eptr >= md->end_subject)
3441 {
3442 SCHECK_PARTIAL();
3443 MRRETURN(MATCH_NOMATCH);
3444 }
3445 GETCHARINC(d, eptr);
3446 if (d < 256) d = md->lcc[d];
3447 if (fc == d) MRRETURN(MATCH_NOMATCH);
3448 }
3449 }
3450 else
3451 #endif
3452
3453 /* Not UTF-8 mode */
3454 {
3455 for (i = 1; i <= min; i++)
3456 {
3457 if (eptr >= md->end_subject)
3458 {
3459 SCHECK_PARTIAL();
3460 MRRETURN(MATCH_NOMATCH);
3461 }
3462 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3463 }
3464 }
3465
3466 if (min == max) continue;
3467
3468 if (minimize)
3469 {
3470 #ifdef SUPPORT_UTF8
3471 /* UTF-8 mode */
3472 if (utf8)
3473 {
3474 register unsigned int d;
3475 for (fi = min;; fi++)
3476 {
3477 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3478 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3480 if (eptr >= md->end_subject)
3481 {
3482 SCHECK_PARTIAL();
3483 MRRETURN(MATCH_NOMATCH);
3484 }
3485 GETCHARINC(d, eptr);
3486 if (d < 256) d = md->lcc[d];
3487 if (fc == d) MRRETURN(MATCH_NOMATCH);
3488 }
3489 }
3490 else
3491 #endif
3492 /* Not UTF-8 mode */
3493 {
3494 for (fi = min;; fi++)
3495 {
3496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3498 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3499 if (eptr >= md->end_subject)
3500 {
3501 SCHECK_PARTIAL();
3502 MRRETURN(MATCH_NOMATCH);
3503 }
3504 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3505 }
3506 }
3507 /* Control never gets here */
3508 }
3509
3510 /* Maximize case */
3511
3512 else
3513 {
3514 pp = eptr;
3515
3516 #ifdef SUPPORT_UTF8
3517 /* UTF-8 mode */
3518 if (utf8)
3519 {
3520 register unsigned int d;
3521 for (i = min; i < max; i++)
3522 {
3523 int len = 1;
3524 if (eptr >= md->end_subject)
3525 {
3526 SCHECK_PARTIAL();
3527 break;
3528 }
3529 GETCHARLEN(d, eptr, len);
3530 if (d < 256) d = md->lcc[d];
3531 if (fc == d) break;
3532 eptr += len;
3533 }
3534 if (possessive) continue;
3535 for(;;)
3536 {
3537 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3538 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3539 if (eptr-- == pp) break; /* Stop if tried at original pos */
3540 BACKCHAR(eptr);
3541 }
3542 }
3543 else
3544 #endif
3545 /* Not UTF-8 mode */
3546 {
3547 for (i = min; i < max; i++)
3548 {
3549 if (eptr >= md->end_subject)
3550 {
3551 SCHECK_PARTIAL();
3552 break;
3553 }
3554 if (fc == md->lcc[*eptr]) break;
3555 eptr++;
3556 }
3557 if (possessive) continue;
3558 while (eptr >= pp)
3559 {
3560 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3561 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3562 eptr--;
3563 }
3564 }
3565
3566 MRRETURN(MATCH_NOMATCH);
3567 }
3568 /* Control never gets here */
3569 }
3570
3571 /* Caseful comparisons */
3572
3573 else
3574 {
3575 #ifdef SUPPORT_UTF8
3576 /* UTF-8 mode */
3577 if (utf8)
3578 {
3579 register unsigned int d;
3580 for (i = 1; i <= min; i++)
3581 {
3582 if (eptr >= md->end_subject)
3583 {
3584 SCHECK_PARTIAL();
3585 MRRETURN(MATCH_NOMATCH);
3586 }
3587 GETCHARINC(d, eptr);
3588 if (fc == d) MRRETURN(MATCH_NOMATCH);
3589 }
3590 }
3591 else
3592 #endif
3593 /* Not UTF-8 mode */
3594 {
3595 for (i = 1; i <= min; i++)
3596 {
3597 if (eptr >= md->end_subject)
3598 {
3599 SCHECK_PARTIAL();
3600 MRRETURN(MATCH_NOMATCH);
3601 }
3602 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3603 }
3604 }
3605
3606 if (min == max) continue;
3607
3608 if (minimize)
3609 {
3610 #ifdef SUPPORT_UTF8
3611 /* UTF-8 mode */
3612 if (utf8)
3613 {
3614 register unsigned int d;
3615 for (fi = min;; fi++)
3616 {
3617 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3618 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3619 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3620 if (eptr >= md->end_subject)
3621 {
3622 SCHECK_PARTIAL();
3623 MRRETURN(MATCH_NOMATCH);
3624 }
3625 GETCHARINC(d, eptr);
3626 if (fc == d) MRRETURN(MATCH_NOMATCH);
3627 }
3628 }
3629 else
3630 #endif
3631 /* Not UTF-8 mode */
3632 {
3633 for (fi = min;; fi++)
3634 {
3635 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3636 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3637 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3638 if (eptr >= md->end_subject)
3639 {
3640 SCHECK_PARTIAL();
3641 MRRETURN(MATCH_NOMATCH);
3642 }
3643 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3644 }
3645 }
3646 /* Control never gets here */
3647 }
3648
3649 /* Maximize case */
3650
3651 else
3652 {
3653 pp = eptr;
3654
3655 #ifdef SUPPORT_UTF8
3656 /* UTF-8 mode */
3657 if (utf8)
3658 {
3659 register unsigned int d;
3660 for (i = min; i < max; i++)
3661 {
3662 int len = 1;
3663 if (eptr >= md->end_subject)
3664 {
3665 SCHECK_PARTIAL();
3666 break;
3667 }
3668 GETCHARLEN(d, eptr, len);
3669 if (fc == d) break;
3670 eptr += len;
3671 }
3672 if (possessive) continue;
3673 for(;;)
3674 {
3675 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3676 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3677 if (eptr-- == pp) break; /* Stop if tried at original pos */
3678 BACKCHAR(eptr);
3679 }
3680 }
3681 else
3682 #endif
3683 /* Not UTF-8 mode */
3684 {
3685 for (i = min; i < max; i++)
3686 {
3687 if (eptr >= md->end_subject)
3688 {
3689 SCHECK_PARTIAL();
3690 break;
3691 }
3692 if (fc == *eptr) break;
3693 eptr++;
3694 }
3695 if (possessive) continue;
3696 while (eptr >= pp)
3697 {
3698 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3699 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3700 eptr--;
3701 }
3702 }
3703
3704 MRRETURN(MATCH_NOMATCH);
3705 }
3706 }
3707 /* Control never gets here */
3708
3709 /* Match a single character type repeatedly; several different opcodes
3710 share code. This is very similar to the code for single characters, but we
3711 repeat it in the interests of efficiency. */
3712
3713 case OP_TYPEEXACT:
3714 min = max = GET2(ecode, 1);
3715 minimize = TRUE;
3716 ecode += 3;
3717 goto REPEATTYPE;
3718
3719 case OP_TYPEUPTO:
3720 case OP_TYPEMINUPTO:
3721 min = 0;
3722 max = GET2(ecode, 1);
3723 minimize = *ecode == OP_TYPEMINUPTO;
3724 ecode += 3;
3725 goto REPEATTYPE;
3726
3727 case OP_TYPEPOSSTAR:
3728 possessive = TRUE;
3729 min = 0;
3730 max = INT_MAX;
3731 ecode++;
3732 goto REPEATTYPE;
3733
3734 case OP_TYPEPOSPLUS:
3735 possessive = TRUE;
3736 min = 1;
3737 max = INT_MAX;
3738 ecode++;
3739 goto REPEATTYPE;
3740
3741 case OP_TYPEPOSQUERY:
3742 possessive = TRUE;
3743 min = 0;
3744 max = 1;
3745 ecode++;
3746 goto REPEATTYPE;
3747
3748 case OP_TYPEPOSUPTO:
3749 possessive = TRUE;
3750 min = 0;
3751 max = GET2(ecode, 1);
3752 ecode += 3;
3753 goto REPEATTYPE;
3754
3755 case OP_TYPESTAR:
3756 case OP_TYPEMINSTAR:
3757 case OP_TYPEPLUS:
3758 case OP_TYPEMINPLUS:
3759 case OP_TYPEQUERY:
3760 case OP_TYPEMINQUERY:
3761 c = *ecode++ - OP_TYPESTAR;
3762 minimize = (c & 1) != 0;
3763 min = rep_min[c]; /* Pick up values from tables; */
3764 max = rep_max[c]; /* zero for max => infinity */
3765 if (max == 0) max = INT_MAX;
3766
3767 /* Common code for all repeated single character type matches. Note that
3768 in UTF-8 mode, '.' matches a character of any length, but for the other
3769 character types, the valid characters are all one-byte long. */
3770
3771 REPEATTYPE:
3772 ctype = *ecode++; /* Code for the character type */
3773
3774 #ifdef SUPPORT_UCP
3775 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3776 {
3777 prop_fail_result = ctype == OP_NOTPROP;
3778 prop_type = *ecode++;
3779 prop_value = *ecode++;
3780 }
3781 else prop_type = -1;
3782 #endif
3783
3784 /* First, ensure the minimum number of matches are present. Use inline
3785 code for maximizing the speed, and do the type test once at the start
3786 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3787 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3788 and single-bytes. */
3789
3790 if (min > 0)
3791 {
3792 #ifdef SUPPORT_UCP
3793 if (prop_type >= 0)
3794 {
3795 switch(prop_type)
3796 {
3797 case PT_ANY:
3798 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3799 for (i = 1; i <= min; i++)
3800 {
3801 if (eptr >= md->end_subject)
3802 {
3803 SCHECK_PARTIAL();
3804 MRRETURN(MATCH_NOMATCH);
3805 }
3806 GETCHARINCTEST(c, eptr);
3807 }
3808 break;
3809
3810 case PT_LAMP:
3811 for (i = 1; i <= min; i++)
3812 {
3813 if (eptr >= md->end_subject)
3814 {
3815 SCHECK_PARTIAL();
3816 MRRETURN(MATCH_NOMATCH);
3817 }
3818 GETCHARINCTEST(c, eptr);
3819 prop_chartype = UCD_CHARTYPE(c);
3820 if ((prop_chartype == ucp_Lu ||
3821 prop_chartype == ucp_Ll ||
3822 prop_chartype == ucp_Lt) == prop_fail_result)
3823 MRRETURN(MATCH_NOMATCH);
3824 }
3825 break;
3826
3827 case PT_GC:
3828 for (i = 1; i <= min; i++)
3829 {
3830 if (eptr >= md->end_subject)
3831 {
3832 SCHECK_PARTIAL();
3833 MRRETURN(MATCH_NOMATCH);
3834 }
3835 GETCHARINCTEST(c, eptr);
3836 prop_category = UCD_CATEGORY(c);
3837 if ((prop_category == prop_value) == prop_fail_result)
3838 MRRETURN(MATCH_NOMATCH);
3839 }
3840 break;
3841
3842 case PT_PC:
3843 for (i = 1; i <= min; i++)
3844 {
3845 if (eptr >= md->end_subject)
3846 {
3847 SCHECK_PARTIAL();
3848 MRRETURN(MATCH_NOMATCH);
3849 }
3850 GETCHARINCTEST(c, eptr);
3851 prop_chartype = UCD_CHARTYPE(c);
3852 if ((prop_chartype == prop_value) == prop_fail_result)
3853 MRRETURN(MATCH_NOMATCH);
3854 }
3855 break;
3856
3857 case PT_SC:
3858 for (i = 1; i <= min; i++)
3859 {
3860 if (eptr >= md->end_subject)
3861 {
3862 SCHECK_PARTIAL();
3863 MRRETURN(MATCH_NOMATCH);
3864 }
3865 GETCHARINCTEST(c, eptr);
3866 prop_script = UCD_SCRIPT(c);
3867 if ((prop_script == prop_value) == prop_fail_result)
3868 MRRETURN(MATCH_NOMATCH);
3869 }
3870 break;
3871
3872 case PT_ALNUM:
3873 for (i = 1; i <= min; i++)
3874 {
3875 if (eptr >= md->end_subject)
3876 {
3877 SCHECK_PARTIAL();
3878 MRRETURN(MATCH_NOMATCH);
3879 }
3880 GETCHARINCTEST(c, eptr);
3881 prop_category = UCD_CATEGORY(c);
3882 if ((prop_category == ucp_L || prop_category == ucp_N)
3883 == prop_fail_result)
3884 MRRETURN(MATCH_NOMATCH);
3885 }
3886 break;
3887
3888 case PT_SPACE: /* Perl space */
3889 for (i = 1; i <= min; i++)
3890 {
3891 if (eptr >= md->end_subject)
3892 {
3893 SCHECK_PARTIAL();
3894 MRRETURN(MATCH_NOMATCH);
3895 }
3896 GETCHARINCTEST(c, eptr);
3897 prop_category = UCD_CATEGORY(c);
3898 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3899 c == CHAR_FF || c == CHAR_CR)
3900 == prop_fail_result)
3901 MRRETURN(MATCH_NOMATCH);
3902 }
3903 break;
3904
3905 case PT_PXSPACE: /* POSIX space */
3906 for (i = 1; i <= min; i++)
3907 {
3908 if (eptr >= md->end_subject)
3909 {
3910 SCHECK_PARTIAL();
3911 MRRETURN(MATCH_NOMATCH);
3912 }
3913 GETCHARINCTEST(c, eptr);
3914 prop_category = UCD_CATEGORY(c);
3915 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3916 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3917 == prop_fail_result)
3918 MRRETURN(MATCH_NOMATCH);
3919 }
3920 break;
3921
3922 case PT_WORD:
3923 for (i = 1; i <= min; i++)
3924 {
3925 if (eptr >= md->end_subject)
3926 {
3927 SCHECK_PARTIAL();
3928 MRRETURN(MATCH_NOMATCH);
3929 }
3930 GETCHARINCTEST(c, eptr);
3931 prop_category = UCD_CATEGORY(c);
3932 if ((prop_category == ucp_L || prop_category == ucp_N ||
3933 c == CHAR_UNDERSCORE)
3934 == prop_fail_result)
3935 MRRETURN(MATCH_NOMATCH);
3936 }
3937 break;
3938
3939 /* This should not occur */
3940
3941 default:
3942 RRETURN(PCRE_ERROR_INTERNAL);
3943 }
3944 }
3945
3946 /* Match extended Unicode sequences. We will get here only if the
3947 support is in the binary; otherwise a compile-time error occurs. */
3948
3949 else if (ctype == OP_EXTUNI)
3950 {
3951 for (i = 1; i <= min; i++)
3952 {
3953 if (eptr >= md->end_subject)
3954 {
3955 SCHECK_PARTIAL();
3956 MRRETURN(MATCH_NOMATCH);
3957 }
3958 GETCHARINCTEST(c, eptr);
3959 prop_category = UCD_CATEGORY(c);
3960 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3961 while (eptr < md->end_subject)
3962 {
3963 int len = 1;
3964 if (!utf8) c = *eptr;
3965 else { GETCHARLEN(c, eptr, len); }
3966 prop_category = UCD_CATEGORY(c);
3967 if (prop_category != ucp_M) break;
3968 eptr += len;
3969 }
3970 }
3971 }
3972
3973 else
3974 #endif /* SUPPORT_UCP */
3975
3976 /* Handle all other cases when the coding is UTF-8 */
3977
3978 #ifdef SUPPORT_UTF8
3979 if (utf8) switch(ctype)
3980 {
3981 case OP_ANY:
3982 for (i = 1; i <= min; i++)
3983 {
3984 if (eptr >= md->end_subject)
3985 {
3986 SCHECK_PARTIAL();
3987 MRRETURN(MATCH_NOMATCH);
3988 }
3989 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3990 eptr++;
3991 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3992 }
3993 break;
3994
3995 case OP_ALLANY:
3996 for (i = 1; i <= min; i++)
3997 {
3998 if (eptr >= md->end_subject)
3999 {
4000 SCHECK_PARTIAL();
4001 MRRETURN(MATCH_NOMATCH);
4002 }
4003 eptr++;
4004 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4005 }
4006 break;
4007
4008 case OP_ANYBYTE:
4009 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
4010 eptr += min;
4011 break;
4012
4013 case OP_ANYNL:
4014 for (i = 1; i <= min; i++)
4015 {
4016 if (eptr >= md->end_subject)
4017 {
4018 SCHECK_PARTIAL();
4019 MRRETURN(MATCH_NOMATCH);
4020 }
4021 GETCHARINC(c, eptr);
4022 switch(c)
4023 {
4024 default: MRRETURN(MATCH_NOMATCH);
4025
4026 case 0x000d:
4027 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4028 break;
4029
4030 case 0x000a:
4031 break;
4032
4033 case 0x000b:
4034 case 0x000c:
4035 case 0x0085:
4036 case 0x2028:
4037 case 0x2029:
4038 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4039 break;
4040 }
4041 }
4042 break;
4043
4044 case OP_NOT_HSPACE:
4045 for (i = 1; i <= min; i++)
4046 {
4047 if (eptr >= md->end_subject)
4048 {
4049 SCHECK_PARTIAL();
4050 MRRETURN(MATCH_NOMATCH);
4051 }
4052 GETCHARINC(c, eptr);
4053 switch(c)
4054 {
4055 default: break;
4056 case 0x09: /* HT */
4057 case 0x20: /* SPACE */
4058 case 0xa0: /* NBSP */
4059 case 0x1680: /* OGHAM SPACE MARK */
4060 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4061 case 0x2000: /* EN QUAD */
4062 case 0x2001: /* EM QUAD */
4063 case 0x2002: /* EN SPACE */
4064 case 0x2003: /* EM SPACE */
4065 case 0x2004: /* THREE-PER-EM SPACE */
4066 case 0x2005: /* FOUR-PER-EM SPACE */
4067 case 0x2006: /* SIX-PER-EM SPACE */
4068 case 0x2007: /* FIGURE SPACE */
4069 case 0x2008: /* PUNCTUATION SPACE */
4070 case 0x2009: /* THIN SPACE */
4071 case 0x200A: /* HAIR SPACE */
4072 case 0x202f: /* NARROW NO-BREAK SPACE */
4073 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4074 case 0x3000: /* IDEOGRAPHIC SPACE */
4075 MRRETURN(MATCH_NOMATCH);
4076 }
4077 }
4078 break;
4079
4080 case OP_HSPACE:
4081 for (i = 1; i <= min; i++)
4082 {
4083 if (eptr >= md->end_subject)
4084 {
4085 SCHECK_PARTIAL();
4086 MRRETURN(MATCH_NOMATCH);
4087 }
4088 GETCHARINC(c, eptr);
4089 switch(c)
4090 {
4091 default: MRRETURN(MATCH_NOMATCH);
4092 case 0x09: /* HT */
4093 case 0x20: /* SPACE */
4094 case 0xa0: /* NBSP */
4095 case 0x1680: /* OGHAM SPACE MARK */
4096 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4097 case 0x2000: /* EN QUAD */
4098 case 0x2001: /* EM QUAD */
4099 case 0x2002: /* EN SPACE */
4100 case 0x2003: /* EM SPACE */
4101 case 0x2004: /* THREE-PER-EM SPACE */
4102 case 0x2005: /* FOUR-PER-EM SPACE */
4103 case 0x2006: /* SIX-PER-EM SPACE */
4104 case 0x2007: /* FIGURE SPACE */
4105 case 0x2008: /* PUNCTUATION SPACE */
4106 case 0x2009: /* THIN SPACE */
4107 case 0x200A: /* HAIR SPACE */
4108 case 0x202f: /* NARROW NO-BREAK SPACE */
4109 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4110 case 0x3000: /* IDEOGRAPHIC SPACE */
4111 break;
4112 }
4113 }
4114 break;
4115
4116 case OP_NOT_VSPACE:
4117 for (i = 1; i <= min; i++)
4118 {
4119 if (eptr >= md->end_subject)
4120 {
4121 SCHECK_PARTIAL();
4122 MRRETURN(MATCH_NOMATCH);
4123 }
4124 GETCHARINC(c, eptr);
4125 switch(c)
4126 {
4127 default: break;
4128 case 0x0a: /* LF */
4129 case 0x0b: /* VT */
4130 case 0x0c: /* FF */
4131 case 0x0d: /* CR */
4132 case 0x85: /* NEL */
4133 case 0x2028: /* LINE SEPARATOR */
4134 case 0x2029: /* PARAGRAPH SEPARATOR */
4135 MRRETURN(MATCH_NOMATCH);
4136 }
4137 }
4138 break;
4139
4140 case OP_VSPACE:
4141 for (i = 1; i <= min; i++)
4142 {
4143 if (eptr >= md->end_subject)
4144 {
4145 SCHECK_PARTIAL();
4146 MRRETURN(MATCH_NOMATCH);
4147 }
4148 GETCHARINC(c, eptr);
4149 switch(c)
4150 {
4151 default: MRRETURN(MATCH_NOMATCH);
4152 case 0x0a: /* LF */
4153 case 0x0b: /* VT */
4154 case 0x0c: /* FF */
4155 case 0x0d: /* CR */
4156 case 0x85: /* NEL */
4157 case 0x2028: /* LINE SEPARATOR */
4158 case 0x2029: /* PARAGRAPH SEPARATOR */
4159 break;
4160 }
4161 }
4162 break;
4163
4164 case OP_NOT_DIGIT:
4165 for (i = 1; i <= min; i++)
4166 {
4167 if (eptr >= md->end_subject)
4168 {
4169 SCHECK_PARTIAL();
4170 MRRETURN(MATCH_NOMATCH);
4171 }
4172 GETCHARINC(c, eptr);
4173 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4174 MRRETURN(MATCH_NOMATCH);
4175 }
4176 break;
4177
4178 case OP_DIGIT:
4179 for (i = 1; i <= min; i++)
4180 {
4181 if (eptr >= md->end_subject)
4182 {
4183 SCHECK_PARTIAL();
4184 MRRETURN(MATCH_NOMATCH);
4185 }
4186 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4187 MRRETURN(MATCH_NOMATCH);
4188 /* No need to skip more bytes - we know it's a 1-byte character */
4189 }
4190 break;
4191
4192 case OP_NOT_WHITESPACE:
4193 for (i = 1; i <= min; i++)
4194 {
4195 if (eptr >= md->end_subject)
4196 {
4197 SCHECK_PARTIAL();
4198 MRRETURN(MATCH_NOMATCH);
4199 }
4200 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4201 MRRETURN(MATCH_NOMATCH);
4202 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4203 }
4204 break;
4205
4206 case OP_WHITESPACE:
4207 for (i = 1; i <= min; i++)
4208 {
4209 if (eptr >= md->end_subject)
4210 {
4211 SCHECK_PARTIAL();
4212 MRRETURN(MATCH_NOMATCH);
4213 }
4214 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4215 MRRETURN(MATCH_NOMATCH);
4216 /* No need to skip more bytes - we know it's a 1-byte character */
4217 }
4218 break;
4219
4220 case OP_NOT_WORDCHAR:
4221 for (i = 1; i <= min; i++)
4222 {
4223 if (eptr >= md->end_subject)
4224 {
4225 SCHECK_PARTIAL();
4226 MRRETURN(MATCH_NOMATCH);
4227 }
4228 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4229 MRRETURN(MATCH_NOMATCH);
4230 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4231 }
4232 break;
4233
4234 case OP_WORDCHAR:
4235 for (i = 1; i <= min; i++)
4236 {
4237 if (eptr >= md->end_subject)
4238 {
4239 SCHECK_PARTIAL();
4240 MRRETURN(MATCH_NOMATCH);
4241 }
4242 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4243 MRRETURN(MATCH_NOMATCH);
4244 /* No need to skip more bytes - we know it's a 1-byte character */
4245 }
4246 break;
4247
4248 default:
4249 RRETURN(PCRE_ERROR_INTERNAL);
4250 } /* End switch(ctype) */
4251
4252 else
4253 #endif /* SUPPORT_UTF8 */
4254
4255 /* Code for the non-UTF-8 case for minimum matching of operators other
4256 than OP_PROP and OP_NOTPROP. */
4257
4258 switch(ctype)
4259 {
4260 case OP_ANY:
4261 for (i = 1; i <= min; i++)
4262 {
4263 if (eptr >= md->end_subject)
4264 {
4265 SCHECK_PARTIAL();
4266 MRRETURN(MATCH_NOMATCH);
4267 }
4268 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4269 eptr++;
4270 }
4271 break;
4272
4273 case OP_ALLANY:
4274 if (eptr > md->end_subject - min)
4275 {
4276 SCHECK_PARTIAL();
4277 MRRETURN(MATCH_NOMATCH);
4278 }
4279 eptr += min;
4280 break;
4281
4282 case OP_ANYBYTE:
4283 if (eptr > md->end_subject - min)
4284 {
4285 SCHECK_PARTIAL();
4286 MRRETURN(MATCH_NOMATCH);
4287 }
4288 eptr += min;
4289 break;
4290
4291 case OP_ANYNL:
4292 for (i = 1; i <= min; i++)
4293 {
4294 if (eptr >= md->end_subject)
4295 {
4296 SCHECK_PARTIAL();
4297 MRRETURN(MATCH_NOMATCH);
4298 }
4299 switch(*eptr++)
4300 {
4301 default: MRRETURN(MATCH_NOMATCH);
4302
4303 case 0x000d:
4304 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4305 break;
4306
4307 case 0x000a:
4308 break;
4309
4310 case 0x000b:
4311 case 0x000c:
4312 case 0x0085:
4313 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4314 break;
4315 }
4316 }
4317 break;
4318
4319 case OP_NOT_HSPACE:
4320 for (i = 1; i <= min; i++)
4321 {
4322 if (eptr >= md->end_subject)
4323 {
4324 SCHECK_PARTIAL();
4325 MRRETURN(MATCH_NOMATCH);
4326 }
4327 switch(*eptr++)
4328 {
4329 default: break;
4330 case 0x09: /* HT */
4331 case 0x20: /* SPACE */
4332 case 0xa0: /* NBSP */
4333 MRRETURN(MATCH_NOMATCH);
4334 }
4335 }
4336 break;
4337
4338 case OP_HSPACE:
4339 for (i = 1; i <= min; i++)
4340 {
4341 if (eptr >= md->end_subject)
4342 {
4343 SCHECK_PARTIAL();
4344 MRRETURN(MATCH_NOMATCH);
4345 }
4346 switch(*eptr++)
4347 {
4348 default: MRRETURN(MATCH_NOMATCH);
4349 case 0x09: /* HT */
4350 case 0x20: /* SPACE */
4351 case 0xa0: /* NBSP */
4352 break;
4353 }
4354 }
4355 break;
4356
4357 case OP_NOT_VSPACE:
4358 for (i = 1; i <= min; i++)
4359 {
4360 if (eptr >= md->end_subject)
4361 {
4362 SCHECK_PARTIAL();
4363 MRRETURN(MATCH_NOMATCH);
4364 }
4365 switch(*eptr++)
4366 {
4367 default: break;
4368 case 0x0a: /* LF */
4369 case 0x0b: /* VT */
4370 case 0x0c: /* FF */
4371 case 0x0d: /* CR */
4372 case 0x85: /* NEL */
4373 MRRETURN(MATCH_NOMATCH);
4374 }
4375 }
4376 break;
4377
4378 case OP_VSPACE:
4379 for (i = 1; i <= min; i++)
4380 {
4381 if (eptr >= md->end_subject)
4382 {
4383 SCHECK_PARTIAL();
4384 MRRETURN(MATCH_NOMATCH);
4385 }
4386 switch(*eptr++)
4387 {
4388 default: MRRETURN(MATCH_NOMATCH);
4389 case 0x0a: /* LF */
4390 case 0x0b: /* VT */
4391 case 0x0c: /* FF */
4392 case 0x0d: /* CR */
4393 case 0x85: /* NEL */
4394 break;
4395 }
4396 }
4397 break;
4398
4399 case OP_NOT_DIGIT:
4400 for (i = 1; i <= min; i++)
4401 {
4402 if (eptr >= md->end_subject)
4403 {
4404 SCHECK_PARTIAL();
4405 MRRETURN(MATCH_NOMATCH);
4406 }
4407 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4408 }
4409 break;
4410
4411 case OP_DIGIT:
4412 for (i = 1; i <= min; i++)
4413 {
4414 if (eptr >= md->end_subject)
4415 {
4416 SCHECK_PARTIAL();
4417 MRRETURN(MATCH_NOMATCH);
4418 }
4419 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4420 }
4421 break;
4422
4423 case OP_NOT_WHITESPACE:
4424 for (i = 1; i <= min; i++)
4425 {
4426 if (eptr >= md->end_subject)
4427 {
4428 SCHECK_PARTIAL();
4429 MRRETURN(MATCH_NOMATCH);
4430 }
4431 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4432 }
4433 break;
4434
4435 case OP_WHITESPACE:
4436 for (i = 1; i <= min; i++)
4437 {
4438 if (eptr >= md->end_subject)
4439 {
4440 SCHECK_PARTIAL();
4441 MRRETURN(MATCH_NOMATCH);
4442 }
4443 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4444 }
4445 break;
4446
4447 case OP_NOT_WORDCHAR:
4448 for (i = 1; i <= min; i++)
4449 {
4450 if (eptr >= md->end_subject)
4451 {
4452 SCHECK_PARTIAL();
4453 MRRETURN(MATCH_NOMATCH);
4454 }
4455 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4456 MRRETURN(MATCH_NOMATCH);
4457 }
4458 break;
4459
4460 case OP_WORDCHAR:
4461 for (i = 1; i <= min; i++)
4462 {
4463 if (eptr >= md->end_subject)
4464 {
4465 SCHECK_PARTIAL();
4466 MRRETURN(MATCH_NOMATCH);
4467 }
4468 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4469 MRRETURN(MATCH_NOMATCH);
4470 }
4471 break;
4472
4473 default:
4474 RRETURN(PCRE_ERROR_INTERNAL);
4475 }
4476 }
4477
4478 /* If min = max, continue at the same level without recursing */
4479
4480 if (min == max) continue;
4481
4482 /* If minimizing, we have to test the rest of the pattern before each
4483 subsequent match. Again, separate the UTF-8 case for speed, and also
4484 separate the UCP cases. */
4485
4486 if (minimize)
4487 {
4488 #ifdef SUPPORT_UCP
4489 if (prop_type >= 0)
4490 {
4491 switch(prop_type)
4492 {
4493 case PT_ANY:
4494 for (fi = min;; fi++)
4495 {
4496 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4497 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4498 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4499 if (eptr >= md->end_subject)
4500 {
4501 SCHECK_PARTIAL();
4502 MRRETURN(MATCH_NOMATCH);
4503 }
4504 GETCHARINCTEST(c, eptr);
4505 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4506 }
4507 /* Control never gets here */
4508
4509 case PT_LAMP:
4510 for (fi = min;; fi++)
4511 {
4512 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4513 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4514 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4515 if (eptr >= md->end_subject)
4516 {
4517 SCHECK_PARTIAL();
4518 MRRETURN(MATCH_NOMATCH);
4519 }
4520 GETCHARINCTEST(c, eptr);
4521 prop_chartype = UCD_CHARTYPE(c);
4522 if ((prop_chartype == ucp_Lu ||
4523 prop_chartype == ucp_Ll ||
4524 prop_chartype == ucp_Lt) == prop_fail_result)
4525 MRRETURN(MATCH_NOMATCH);
4526 }
4527 /* Control never gets here */
4528
4529 case PT_GC:
4530 for (fi = min;; fi++)
4531 {
4532 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4533 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4534 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4535 if (eptr >= md->end_subject)
4536 {
4537 SCHECK_PARTIAL();
4538 MRRETURN(MATCH_NOMATCH);
4539 }
4540 GETCHARINCTEST(c, eptr);
4541 prop_category = UCD_CATEGORY(c);
4542 if ((prop_category == prop_value) == prop_fail_result)
4543 MRRETURN(MATCH_NOMATCH);
4544 }
4545 /* Control never gets here */
4546
4547 case PT_PC:
4548 for (fi = min;; fi++)
4549 {
4550 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4551 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4552 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4553 if (eptr >= md->end_subject)
4554 {
4555 SCHECK_PARTIAL();
4556 MRRETURN(MATCH_NOMATCH);
4557 }
4558 GETCHARINCTEST(c, eptr);
4559 prop_chartype = UCD_CHARTYPE(c);
4560 if ((prop_chartype == prop_value) == prop_fail_result)
4561 MRRETURN(MATCH_NOMATCH);
4562 }
4563 /* Control never gets here */
4564
4565 case PT_SC:
4566 for (fi = min;; fi++)
4567 {
4568 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4569 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4570 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4571 if (eptr >= md->end_subject)
4572 {
4573 SCHECK_PARTIAL();
4574 MRRETURN(MATCH_NOMATCH);
4575 }
4576 GETCHARINCTEST(c, eptr);
4577 prop_script = UCD_SCRIPT(c);
4578 if ((prop_script == prop_value) == prop_fail_result)
4579 MRRETURN(MATCH_NOMATCH);
4580 }
4581 /* Control never gets here */
4582
4583 case PT_ALNUM:
4584 for (fi = min;; fi++)
4585 {
4586 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4587 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4588 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4589 if (eptr >= md->end_subject)
4590 {
4591 SCHECK_PARTIAL();
4592 MRRETURN(MATCH_NOMATCH);
4593 }
4594 GETCHARINCTEST(c, eptr);
4595 prop_category = UCD_CATEGORY(c);
4596 if ((prop_category == ucp_L || prop_category == ucp_N)
4597 == prop_fail_result)
4598 MRRETURN(MATCH_NOMATCH);
4599 }
4600 /* Control never gets here */
4601
4602 case PT_SPACE: /* Perl space */
4603 for (fi = min;; fi++)
4604 {
4605 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4606 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4607 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4608 if (eptr >= md->end_subject)
4609 {
4610 SCHECK_PARTIAL();
4611 MRRETURN(MATCH_NOMATCH);
4612 }
4613 GETCHARINCTEST(c, eptr);
4614 prop_category = UCD_CATEGORY(c);
4615 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4616 c == CHAR_FF || c == CHAR_CR)
4617 == prop_fail_result)
4618 MRRETURN(MATCH_NOMATCH);
4619 }
4620 /* Control never gets here */
4621
4622 case PT_PXSPACE: /* POSIX space */
4623 for (fi = min;; fi++)
4624 {
4625 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4626 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4627 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4628 if (eptr >= md->end_subject)
4629 {
4630 SCHECK_PARTIAL();
4631 MRRETURN(MATCH_NOMATCH);
4632 }
4633 GETCHARINCTEST(c, eptr);
4634 prop_category = UCD_CATEGORY(c);
4635 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4636 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4637 == prop_fail_result)
4638 MRRETURN(MATCH_NOMATCH);
4639 }
4640 /* Control never gets here */
4641
4642 case PT_WORD:
4643 for (fi = min;; fi++)
4644 {
4645 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4646 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4647 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4648 if (eptr >= md->end_subject)
4649 {
4650 SCHECK_PARTIAL();
4651 MRRETURN(MATCH_NOMATCH);
4652 }
4653 GETCHARINCTEST(c, eptr);
4654 prop_category = UCD_CATEGORY(c);
4655 if ((prop_category == ucp_L ||
4656 prop_category == ucp_N ||
4657 c == CHAR_UNDERSCORE)
4658 == prop_fail_result)
4659 MRRETURN(MATCH_NOMATCH);
4660 }
4661 /* Control never gets here */
4662
4663 /* This should never occur */
4664
4665 default:
4666 RRETURN(PCRE_ERROR_INTERNAL);
4667 }
4668 }
4669
4670 /* Match extended Unicode sequences. We will get here only if the
4671 support is in the binary; otherwise a compile-time error occurs. */
4672
4673 else if (ctype == OP_EXTUNI)
4674 {
4675 for (fi = min;; fi++)
4676 {
4677 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4679 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4680 if (eptr >= md->end_subject)
4681 {
4682 SCHECK_PARTIAL();
4683 MRRETURN(MATCH_NOMATCH);
4684 }
4685 GETCHARINCTEST(c, eptr);
4686 prop_category = UCD_CATEGORY(c);
4687 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4688 while (eptr < md->end_subject)
4689 {
4690 int len = 1;
4691 if (!utf8) c = *eptr;
4692 else { GETCHARLEN(c, eptr, len); }
4693 prop_category = UCD_CATEGORY(c);
4694 if (prop_category != ucp_M) break;
4695 eptr += len;
4696 }
4697 }
4698 }
4699
4700 else
4701 #endif /* SUPPORT_UCP */
4702
4703 #ifdef SUPPORT_UTF8
4704 /* UTF-8 mode */
4705 if (utf8)
4706 {
4707 for (fi = min;; fi++)
4708 {
4709 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4710 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4711 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4712 if (eptr >= md->end_subject)
4713 {
4714 SCHECK_PARTIAL();
4715 MRRETURN(MATCH_NOMATCH);
4716 }
4717 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4718 MRRETURN(MATCH_NOMATCH);
4719 GETCHARINC(c, eptr);
4720 switch(ctype)
4721 {
4722 case OP_ANY: /* This is the non-NL case */
4723 case OP_ALLANY:
4724 case OP_ANYBYTE:
4725 break;
4726
4727 case OP_ANYNL:
4728 switch(c)
4729 {
4730 default: MRRETURN(MATCH_NOMATCH);
4731 case 0x000d:
4732 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4733 break;
4734 case 0x000a:
4735 break;
4736
4737 case 0x000b:
4738 case 0x000c:
4739 case 0x0085:
4740 case 0x2028:
4741 case 0x2029:
4742 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4743 break;
4744 }
4745 break;
4746
4747 case OP_NOT_HSPACE:
4748 switch(c)
4749 {
4750 default: break;
4751 case 0x09: /* HT */
4752 case 0x20: /* SPACE */
4753 case 0xa0: /* NBSP */
4754 case 0x1680: /* OGHAM SPACE MARK */
4755 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4756 case 0x2000: /* EN QUAD */
4757 case 0x2001: /* EM QUAD */
4758 case 0x2002: /* EN SPACE */
4759 case 0x2003: /* EM SPACE */
4760 case 0x2004: /* THREE-PER-EM SPACE */
4761 case 0x2005: /* FOUR-PER-EM SPACE */
4762 case 0x2006: /* SIX-PER-EM SPACE */
4763 case 0x2007: /* FIGURE SPACE */
4764 case 0x2008: /* PUNCTUATION SPACE */
4765 case 0x2009: /* THIN SPACE */
4766 case 0x200A: /* HAIR SPACE */
4767 case 0x202f: /* NARROW NO-BREAK SPACE */
4768 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4769 case 0x3000: /* IDEOGRAPHIC SPACE */
4770 MRRETURN(MATCH_NOMATCH);
4771 }
4772 break;
4773
4774 case OP_HSPACE:
4775 switch(c)
4776 {
4777 default: MRRETURN(MATCH_NOMATCH);
4778 case 0x09: /* HT */
4779 case 0x20: /* SPACE */
4780 case 0xa0: /* NBSP */
4781 case 0x1680: /* OGHAM SPACE MARK */
4782 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4783 case 0x2000: /* EN QUAD */
4784 case 0x2001: /* EM QUAD */
4785 case 0x2002: /* EN SPACE */
4786 case 0x2003: /* EM SPACE */
4787 case 0x2004: /* THREE-PER-EM SPACE */
4788 case 0x2005: /* FOUR-PER-EM SPACE */
4789 case 0x2006: /* SIX-PER-EM SPACE */
4790 case 0x2007: /* FIGURE SPACE */
4791 case 0x2008: /* PUNCTUATION SPACE */
4792 case 0x2009: /* THIN SPACE */
4793 case 0x200A: /* HAIR SPACE */
4794 case 0x202f: /* NARROW NO-BREAK SPACE */
4795 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4796 case 0x3000: /* IDEOGRAPHIC SPACE */
4797 break;
4798 }
4799 break;
4800
4801 case OP_NOT_VSPACE:
4802 switch(c)
4803 {
4804 default: break;
4805 case 0x0a: /* LF */
4806 case 0x0b: /* VT */
4807 case 0x0c: /* FF */
4808 case 0x0d: /* CR */
4809 case 0x85: /* NEL */
4810 case 0x2028: /* LINE SEPARATOR */
4811 case 0x2029: /* PARAGRAPH SEPARATOR */
4812 MRRETURN(MATCH_NOMATCH);
4813 }
4814 break;
4815
4816 case OP_VSPACE:
4817 switch(c)
4818 {
4819 default: MRRETURN(MATCH_NOMATCH);
4820 case 0x0a: /* LF */
4821 case 0x0b: /* VT */
4822 case 0x0c: /* FF */
4823 case 0x0d: /* CR */
4824 case 0x85: /* NEL */
4825 case 0x2028: /* LINE SEPARATOR */
4826 case 0x2029: /* PARAGRAPH SEPARATOR */
4827 break;
4828 }
4829 break;
4830
4831 case OP_NOT_DIGIT:
4832 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4833 MRRETURN(MATCH_NOMATCH);
4834 break;
4835
4836 case OP_DIGIT:
4837 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4838 MRRETURN(MATCH_NOMATCH);
4839 break;
4840
4841 case OP_NOT_WHITESPACE:
4842 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4843 MRRETURN(MATCH_NOMATCH);
4844 break;
4845
4846 case OP_WHITESPACE:
4847 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4848 MRRETURN(MATCH_NOMATCH);
4849 break;
4850
4851 case OP_NOT_WORDCHAR:
4852 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4853 MRRETURN(MATCH_NOMATCH);
4854 break;
4855
4856 case OP_WORDCHAR:
4857 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4858 MRRETURN(MATCH_NOMATCH);
4859 break;
4860
4861 default:
4862 RRETURN(PCRE_ERROR_INTERNAL);
4863 }
4864 }
4865 }
4866 else
4867 #endif
4868 /* Not UTF-8 mode */
4869 {
4870 for (fi = min;; fi++)
4871 {
4872 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4873 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4874 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4875 if (eptr >= md->end_subject)
4876 {
4877 SCHECK_PARTIAL();
4878 MRRETURN(MATCH_NOMATCH);
4879 }
4880 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4881 MRRETURN(MATCH_NOMATCH);
4882 c = *eptr++;
4883 switch(ctype)
4884 {
4885 case OP_ANY: /* This is the non-NL case */
4886 case OP_ALLANY:
4887 case OP_ANYBYTE:
4888 break;
4889
4890 case OP_ANYNL:
4891 switch(c)
4892 {
4893 default: MRRETURN(MATCH_NOMATCH);
4894 case 0x000d:
4895 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4896 break;
4897
4898 case 0x000a:
4899 break;
4900
4901 case 0x000b:
4902 case 0x000c:
4903 case 0x0085:
4904 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4905 break;
4906 }
4907 break;
4908
4909 case OP_NOT_HSPACE:
4910 switch(c)
4911 {
4912 default: break;
4913 case 0x09: /* HT */
4914 case 0x20: /* SPACE */
4915 case 0xa0: /* NBSP */
4916 MRRETURN(MATCH_NOMATCH);
4917 }
4918 break;
4919
4920 case OP_HSPACE:
4921 switch(c)
4922 {
4923 default: MRRETURN(MATCH_NOMATCH);
4924 case 0x09: /* HT */
4925 case 0x20: /* SPACE */
4926 case 0xa0: /* NBSP */
4927 break;
4928 }
4929 break;
4930
4931 case OP_NOT_VSPACE:
4932 switch(c)
4933 {
4934 default: break;
4935 case 0x0a: /* LF */
4936 case 0x0b: /* VT */
4937 case 0x0c: /* FF */
4938 case 0x0d: /* CR */
4939 case 0x85: /* NEL */
4940 MRRETURN(MATCH_NOMATCH);
4941 }
4942 break;
4943
4944 case OP_VSPACE:
4945 switch(c)
4946 {
4947 default: MRRETURN(MATCH_NOMATCH);
4948 case 0x0a: /* LF */
4949 case 0x0b: /* VT */
4950 case 0x0c: /* FF */
4951 case 0x0d: /* CR */
4952 case 0x85: /* NEL */
4953 break;
4954 }
4955 break;
4956
4957 case OP_NOT_DIGIT:
4958 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4959 break;
4960
4961 case OP_DIGIT:
4962 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4963 break;
4964
4965 case OP_NOT_WHITESPACE:
4966 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4967 break;
4968
4969 case OP_WHITESPACE:
4970 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4971 break;
4972
4973 case OP_NOT_WORDCHAR:
4974 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4975 break;
4976
4977 case OP_WORDCHAR:
4978 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4979 break;
4980
4981 default:
4982 RRETURN(PCRE_ERROR_INTERNAL);
4983 }
4984 }
4985 }
4986 /* Control never gets here */
4987 }
4988
4989 /* If maximizing, it is worth using inline code for speed, doing the type
4990 test once at the start (i.e. keep it out of the loop). Again, keep the
4991 UTF-8 and UCP stuff separate. */
4992
4993 else
4994 {
4995 pp = eptr; /* Remember where we started */
4996
4997 #ifdef SUPPORT_UCP
4998 if (prop_type >= 0)
4999 {
5000 switch(prop_type)
5001 {
5002 case PT_ANY:
5003 for (i = min; i < max; i++)
5004 {
5005 int len = 1;
5006 if (eptr >= md->end_subject)
5007 {
5008 SCHECK_PARTIAL();
5009 break;
5010 }
5011 GETCHARLENTEST(c, eptr, len);
5012 if (prop_fail_result) break;
5013 eptr+= len;
5014 }
5015 break;
5016
5017 case PT_LAMP:
5018 for (i = min; i < max; i++)
5019 {
5020 int len = 1;
5021 if (eptr >= md->end_subject)
5022 {
5023 SCHECK_PARTIAL();
5024 break;
5025 }
5026 GETCHARLENTEST(c, eptr, len);
5027 prop_chartype = UCD_CHARTYPE(c);
5028 if ((prop_chartype == ucp_Lu ||
5029 prop_chartype == ucp_Ll ||
5030 prop_chartype == ucp_Lt) == prop_fail_result)
5031 break;
5032 eptr+= len;
5033 }
5034 break;
5035
5036 case PT_GC:
5037 for (i = min; i < max; i++)
5038 {
5039 int len = 1;
5040 if (eptr >= md->end_subject)
5041 {
5042 SCHECK_PARTIAL();
5043 break;
5044 }
5045 GETCHARLENTEST(c, eptr, len);
5046 prop_category = UCD_CATEGORY(c);
5047 if ((prop_category == prop_value) == prop_fail_result)
5048 break;
5049 eptr+= len;
5050 }
5051 break;
5052
5053 case PT_PC:
5054 for (i = min; i < max; i++)
5055 {
5056 int len = 1;
5057 if (eptr >= md->end_subject)
5058 {
5059 SCHECK_PARTIAL();
5060 break;
5061 }
5062 GETCHARLENTEST(c, eptr, len);
5063 prop_chartype = UCD_CHARTYPE(c);
5064 if ((prop_chartype == prop_value) == prop_fail_result)
5065 break;
5066 eptr+= len;
5067 }
5068 break;
5069
5070 case PT_SC:
5071 for (i = min; i < max; i++)
5072 {
5073 int len = 1;
5074 if (eptr >= md->end_subject)
5075 {
5076 SCHECK_PARTIAL();
5077 break;
5078 }
5079 GETCHARLENTEST(c, eptr, len);
5080 prop_script = UCD_SCRIPT(c);
5081 if ((prop_script == prop_value) == prop_fail_result)
5082 break;
5083 eptr+= len;
5084 }
5085 break;
5086
5087 case PT_ALNUM:
5088 for (i = min; i < max; i++)
5089 {
5090 int len = 1;
5091 if (eptr >= md->end_subject)
5092 {
5093 SCHECK_PARTIAL();
5094 break;
5095 }
5096 GETCHARLENTEST(c, eptr, len);
5097 prop_category = UCD_CATEGORY(c);
5098 if ((prop_category == ucp_L || prop_category == ucp_N)
5099 == prop_fail_result)
5100 break;
5101 eptr+= len;
5102 }
5103 break;
5104
5105 case PT_SPACE: /* Perl space */
5106 for (i = min; i < max; i++)
5107 {
5108 int len = 1;
5109 if (eptr >= md->end_subject)
5110 {
5111 SCHECK_PARTIAL();
5112 break;
5113 }
5114 GETCHARLENTEST(c, eptr, len);
5115 prop_category = UCD_CATEGORY(c);
5116 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5117 c == CHAR_FF || c == CHAR_CR)
5118 == prop_fail_result)
5119 break;
5120 eptr+= len;
5121 }
5122 break;
5123
5124 case PT_PXSPACE: /* POSIX space */
5125 for (i = min; i < max; i++)
5126 {
5127 int len = 1;
5128 if (eptr >= md->end_subject)
5129 {
5130 SCHECK_PARTIAL();
5131 break;
5132 }
5133 GETCHARLENTEST(c, eptr, len);
5134 prop_category = UCD_CATEGORY(c);
5135 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5136 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5137 == prop_fail_result)
5138 break;
5139 eptr+= len;
5140 }
5141 break;
5142
5143 case PT_WORD:
5144 for (i = min; i < max; i++)
5145 {
5146 int len = 1;
5147 if (eptr >= md->end_subject)
5148 {
5149 SCHECK_PARTIAL();
5150 break;
5151 }
5152 GETCHARLENTEST(c, eptr, len);
5153 prop_category = UCD_CATEGORY(c);
5154 if ((prop_category == ucp_L || prop_category == ucp_N ||
5155 c == CHAR_UNDERSCORE) == prop_fail_result)
5156 break;
5157 eptr+= len;
5158 }
5159 break;
5160
5161 default:
5162 RRETURN(PCRE_ERROR_INTERNAL);
5163 }
5164
5165 /* eptr is now past the end of the maximum run */
5166
5167 if (possessive) continue;
5168 for(;;)
5169 {
5170 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5171 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5172 if (eptr-- == pp) break; /* Stop if tried at original pos */
5173 if (utf8) BACKCHAR(eptr);
5174 }
5175 }
5176
5177 /* Match extended Unicode sequences. We will get here only if the
5178 support is in the binary; otherwise a compile-time error occurs. */
5179
5180 else if (ctype == OP_EXTUNI)
5181 {
5182 for (i = min; i < max; i++)
5183 {
5184 if (eptr >= md->end_subject)
5185 {
5186 SCHECK_PARTIAL();
5187 break;
5188 }
5189 GETCHARINCTEST(c, eptr);
5190 prop_category = UCD_CATEGORY(c);
5191 if (prop_category == ucp_M) break;
5192 while (eptr < md->end_subject)
5193 {
5194 int len = 1;
5195 if (!utf8) c = *eptr; else
5196 {
5197 GETCHARLEN(c, eptr, len);
5198 }
5199 prop_category = UCD_CATEGORY(c);
5200 if (prop_category != ucp_M) break;
5201 eptr += len;
5202 }
5203 }
5204
5205 /* eptr is now past the end of the maximum run */
5206
5207 if (possessive) continue;
5208
5209 for(;;)
5210 {
5211 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5212 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5213 if (eptr-- == pp) break; /* Stop if tried at original pos */
5214 for (;;) /* Move back over one extended */
5215 {
5216 int len = 1;
5217 if (!utf8) c = *eptr; else
5218 {
5219 BACKCHAR(eptr);
5220 GETCHARLEN(c, eptr, len);
5221 }
5222 prop_category = UCD_CATEGORY(c);
5223 if (prop_category != ucp_M) break;
5224 eptr--;
5225 }
5226 }
5227 }
5228
5229 else
5230 #endif /* SUPPORT_UCP */
5231
5232 #ifdef SUPPORT_UTF8
5233 /* UTF-8 mode */
5234
5235 if (utf8)
5236 {
5237 switch(ctype)
5238 {
5239 case OP_ANY:
5240 if (max < INT_MAX)
5241 {
5242 for (i = min; i < max; i++)
5243 {
5244 if (eptr >= md->end_subject)
5245 {
5246 SCHECK_PARTIAL();
5247 break;
5248 }
5249 if (IS_NEWLINE(eptr)) break;
5250 eptr++;
5251 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5252 }
5253 }
5254
5255 /* Handle unlimited UTF-8 repeat */
5256
5257 else
5258 {
5259 for (i = min; i < max; i++)
5260 {
5261 if (eptr >= md->end_subject)
5262 {
5263 SCHECK_PARTIAL();
5264 break;
5265 }
5266 if (IS_NEWLINE(eptr)) break;
5267 eptr++;
5268 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5269 }
5270 }
5271 break;
5272
5273 case OP_ALLANY:
5274 if (max < INT_MAX)
5275 {
5276 for (i = min; i < max; i++)
5277 {
5278 if (eptr >= md->end_subject)
5279 {
5280 SCHECK_PARTIAL();
5281 break;
5282 }
5283 eptr++;
5284 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5285 }
5286 }
5287 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5288 break;
5289
5290 /* The byte case is the same as non-UTF8 */
5291
5292 case OP_ANYBYTE:
5293 c = max - min;
5294 if (c > (unsigned int)(md->end_subject - eptr))
5295 {
5296 eptr = md->end_subject;
5297 SCHECK_PARTIAL();
5298 }
5299 else eptr += c;
5300 break;
5301
5302 case OP_ANYNL:
5303 for (i = min; i < max; i++)
5304 {
5305 int len = 1;
5306 if (eptr >= md->end_subject)
5307 {
5308 SCHECK_PARTIAL();
5309 break;
5310 }
5311 GETCHARLEN(c, eptr, len);
5312 if (c == 0x000d)
5313 {
5314 if (++eptr >= md->end_subject) break;
5315 if (*eptr == 0x000a) eptr++;
5316 }
5317 else
5318 {
5319 if (c != 0x000a &&
5320 (md->bsr_anycrlf ||
5321 (c != 0x000b && c != 0x000c &&
5322 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5323 break;
5324 eptr += len;
5325 }
5326 }
5327 break;
5328
5329 case OP_NOT_HSPACE:
5330 case OP_HSPACE:
5331 for (i = min; i < max; i++)
5332 {
5333 BOOL gotspace;
5334 int len = 1;
5335 if (eptr >= md->end_subject)
5336 {
5337 SCHECK_PARTIAL();
5338 break;
5339 }
5340 GETCHARLEN(c, eptr, len);
5341 switch(c)
5342 {
5343 default: gotspace = FALSE; break;
5344 case 0x09: /* HT */
5345 case 0x20: /* SPACE */
5346 case 0xa0: /* NBSP */
5347 case 0x1680: /* OGHAM SPACE MARK */
5348 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5349 case 0x2000: /* EN QUAD */
5350 case 0x2001: /* EM QUAD */
5351 case 0x2002: /* EN SPACE */
5352 case 0x2003: /* EM SPACE */
5353 case 0x2004: /* THREE-PER-EM SPACE */
5354 case 0x2005: /* FOUR-PER-EM SPACE */
5355 case 0x2006: /* SIX-PER-EM SPACE */
5356 case 0x2007: /* FIGURE SPACE */
5357 case 0x2008: /* PUNCTUATION SPACE */
5358 case 0x2009: /* THIN SPACE */
5359 case 0x200A: /* HAIR SPACE */
5360 case 0x202f: /* NARROW NO-BREAK SPACE */
5361 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5362 case 0x3000: /* IDEOGRAPHIC SPACE */
5363 gotspace = TRUE;
5364 break;
5365 }
5366 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5367 eptr += len;
5368 }
5369 break;
5370
5371 case OP_NOT_VSPACE:
5372 case OP_VSPACE:
5373 for (i = min; i < max; i++)
5374 {
5375 BOOL gotspace;
5376 int len = 1;
5377 if (eptr >= md->end_subject)
5378 {
5379 SCHECK_PARTIAL();
5380 break;
5381 }
5382 GETCHARLEN(c, eptr, len);
5383 switch(c)
5384 {
5385 default: gotspace = FALSE; break;
5386 case 0x0a: /* LF */
5387 case 0x0b: /* VT */
5388 case 0x0c: /* FF */
5389 case 0x0d: /* CR */
5390 case 0x85: /* NEL */
5391 case 0x2028: /* LINE SEPARATOR */
5392 case 0x2029: /* PARAGRAPH SEPARATOR */
5393 gotspace = TRUE;
5394 break;
5395 }
5396 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5397 eptr += len;
5398 }
5399 break;
5400
5401 case OP_NOT_DIGIT:
5402 for (i = min; i < max; i++)
5403 {
5404 int len = 1;
5405 if (eptr >= md->end_subject)
5406 {
5407 SCHECK_PARTIAL();
5408 break;
5409 }
5410 GETCHARLEN(c, eptr, len);
5411 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5412 eptr+= len;
5413 }
5414 break;
5415
5416 case OP_DIGIT:
5417 for (i = min; i < max; i++)
5418 {
5419 int len = 1;
5420 if (eptr >= md->end_subject)
5421 {
5422 SCHECK_PARTIAL();
5423 break;
5424 }
5425 GETCHARLEN(c, eptr, len);
5426 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5427 eptr+= len;
5428 }
5429 break;
5430
5431 case OP_NOT_WHITESPACE:
5432 for (i = min; i < max; i++)
5433 {
5434 int len = 1;
5435 if (eptr >= md->end_subject)
5436 {
5437 SCHECK_PARTIAL();
5438 break;
5439 }
5440 GETCHARLEN(c, eptr, len);
5441 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5442 eptr+= len;
5443 }
5444 break;
5445
5446 case OP_WHITESPACE:
5447 for (i = min; i < max; i++)
5448 {
5449 int len = 1;
5450 if (eptr >= md->end_subject)
5451 {
5452 SCHECK_PARTIAL();
5453 break;
5454 }
5455 GETCHARLEN(c, eptr, len);
5456 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5457 eptr+= len;
5458 }
5459 break;
5460
5461 case OP_NOT_WORDCHAR:
5462 for (i = min; i < max; i++)
5463 {
5464 int len = 1;
5465 if (eptr >= md->end_subject)
5466 {
5467 SCHECK_PARTIAL();
5468 break;
5469 }
5470 GETCHARLEN(c, eptr, len);
5471 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5472 eptr+= len;
5473 }
5474 break;
5475
5476 case OP_WORDCHAR:
5477 for (i = min; i < max; i++)
5478 {
5479 int len = 1;
5480 if (eptr >= md->end_subject)
5481 {
5482 SCHECK_PARTIAL();
5483 break;
5484 }
5485 GETCHARLEN(c, eptr, len);
5486 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5487 eptr+= len;
5488 }
5489 break;
5490
5491 default:
5492 RRETURN(PCRE_ERROR_INTERNAL);
5493 }
5494
5495 /* eptr is now past the end of the maximum run. If possessive, we are
5496 done (no backing up). Otherwise, match at this position; anything other
5497 than no match is immediately returned. For nomatch, back up one
5498 character, unless we are matching \R and the last thing matched was
5499 \r\n, in which case, back up two bytes. */
5500
5501 if (possessive) continue;
5502 for(;;)
5503 {
5504 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5505 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5506 if (eptr-- == pp) break; /* Stop if tried at original pos */
5507 BACKCHAR(eptr);
5508 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5509 eptr[-1] == '\r') eptr--;
5510 }
5511 }
5512 else
5513 #endif /* SUPPORT_UTF8 */
5514
5515 /* Not UTF-8 mode */
5516 {
5517 switch(ctype)
5518 {
5519 case OP_ANY:
5520 for (i = min; i < max; i++)
5521 {
5522 if (eptr >= md->end_subject)
5523 {
5524 SCHECK_PARTIAL();
5525 break;
5526 }
5527 if (IS_NEWLINE(eptr)) break;
5528 eptr++;
5529 }
5530 break;
5531
5532 case OP_ALLANY:
5533 case OP_ANYBYTE:
5534 c = max - min;
5535 if (c > (unsigned int)(md->end_subject - eptr))
5536 {
5537 eptr = md->end_subject;
5538 SCHECK_PARTIAL();
5539 }
5540 else eptr += c;
5541 break;
5542
5543 case OP_ANYNL:
5544 for (i = min; i < max; i++)
5545 {
5546 if (eptr >= md->end_subject)
5547 {
5548 SCHECK_PARTIAL();
5549 break;
5550 }
5551 c = *eptr;
5552 if (c == 0x000d)
5553 {
5554 if (++eptr >= md->end_subject) break;
5555 if (*eptr == 0x000a) eptr++;
5556 }
5557 else
5558 {
5559 if (c != 0x000a &&
5560 (md->bsr_anycrlf ||
5561 (c != 0x000b && c != 0x000c && c != 0x0085)))
5562 break;
5563 eptr++;
5564 }
5565 }
5566 break;
5567
5568 case OP_NOT_HSPACE:
5569 for (i = min; i < max; i++)
5570 {
5571 if (eptr >= md->end_subject)
5572 {
5573 SCHECK_PARTIAL();
5574 break;
5575 }
5576 c = *eptr;
5577 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5578 eptr++;
5579 }
5580 break;
5581
5582 case OP_HSPACE:
5583 for (i = min; i < max; i++)
5584 {
5585 if (eptr >= md->end_subject)
5586 {
5587 SCHECK_PARTIAL();
5588 break;
5589 }
5590 c = *eptr;
5591 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5592 eptr++;
5593 }
5594 break;
5595
5596 case OP_NOT_VSPACE:
5597 for (i = min; i < max; i++)
5598 {
5599 if (eptr >= md->end_subject)
5600 {
5601 SCHECK_PARTIAL();
5602 break;
5603 }
5604 c = *eptr;
5605 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5606 break;
5607 eptr++;
5608 }
5609 break;
5610
5611 case OP_VSPACE:
5612 for (i = min; i < max; i++)
5613 {
5614 if (eptr >= md->end_subject)
5615 {
5616 SCHECK_PARTIAL();
5617 break;
5618 }
5619 c = *eptr;
5620 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5621 break;
5622 eptr++;
5623 }
5624 break;
5625
5626 case OP_NOT_DIGIT:
5627 for (i = min; i < max; i++)
5628 {
5629 if (eptr >= md->end_subject)
5630 {
5631 SCHECK_PARTIAL();
5632 break;
5633 }
5634 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5635 eptr++;
5636 }
5637 break;
5638
5639 case OP_DIGIT:
5640 for (i = min; i < max; i++)
5641 {
5642 if (eptr >= md->end_subject)
5643 {
5644 SCHECK_PARTIAL();
5645 break;
5646 }
5647 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5648 eptr++;
5649 }
5650 break;
5651
5652 case OP_NOT_WHITESPACE:
5653 for (i = min; i < max; i++)
5654 {
5655 if (eptr >= md->end_subject)
5656 {
5657 SCHECK_PARTIAL();
5658 break;
5659 }
5660 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5661 eptr++;
5662 }
5663 break;
5664
5665 case OP_WHITESPACE:
5666 for (i = min; i < max; i++)
5667 {
5668 if (eptr >= md->end_subject)
5669 {
5670 SCHECK_PARTIAL();
5671 break;
5672 }
5673 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5674 eptr++;
5675 }
5676 break;
5677
5678 case OP_NOT_WORDCHAR:
5679 for (i = min; i < max; i++)
5680 {
5681 if (eptr >= md->end_subject)
5682 {
5683 SCHECK_PARTIAL();
5684 break;
5685 }
5686 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5687 eptr++;
5688 }
5689 break;
5690
5691 case OP_WORDCHAR:
5692 for (i = min; i < max; i++)
5693 {
5694 if (eptr >= md->end_subject)
5695 {
5696 SCHECK_PARTIAL();
5697 break;
5698 }
5699 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5700 eptr++;
5701 }
5702 break;
5703
5704 default:
5705 RRETURN(PCRE_ERROR_INTERNAL);
5706 }
5707
5708 /* eptr is now past the end of the maximum run. If possessive, we are
5709 done (no backing up). Otherwise, match at this position; anything other
5710 than no match is immediately returned. For nomatch, back up one
5711 character (byte), unless we are matching \R and the last thing matched
5712 was \r\n, in which case, back up two bytes. */
5713
5714 if (possessive) continue;
5715 while (eptr >= pp)
5716 {
5717 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5718 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5719 eptr--;
5720 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5721 eptr[-1] == '\r') eptr--;
5722 }
5723 }
5724
5725 /* Get here if we can't make it match with any permitted repetitions */
5726
5727 MRRETURN(MATCH_NOMATCH);
5728 }
5729 /* Control never gets here */
5730
5731 /* There's been some horrible disaster. Arrival here can only mean there is
5732 something seriously wrong in the code above or the OP_xxx definitions. */
5733
5734 default:
5735 DPRINTF(("Unknown opcode %d\n", *ecode));
5736 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5737 }
5738
5739 /* Do not stick any code in here without much thought; it is assumed
5740 that "continue" in the code above comes out to here to repeat the main
5741 loop. */
5742
5743 } /* End of main loop */
5744 /* Control never reaches here */
5745
5746
5747 /* When compiling to use the heap rather than the stack for recursive calls to
5748 match(), the RRETURN() macro jumps here. The number that is saved in
5749 frame->Xwhere indicates which label we actually want to return to. */
5750
5751 #ifdef NO_RECURSE
5752 #define LBL(val) case val: goto L_RM##val;
5753 HEAP_RETURN:
5754 switch (frame->Xwhere)
5755 {
5756 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5757 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5758 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5759 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5760 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5761 #ifdef SUPPORT_UTF8
5762 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5763 LBL(32) LBL(34) LBL(42) LBL(46)
5764 #ifdef SUPPORT_UCP
5765 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5766 LBL(59) LBL(60) LBL(61) LBL(62)
5767 #endif /* SUPPORT_UCP */
5768 #endif /* SUPPORT_UTF8 */
5769 default:
5770 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5771 return PCRE_ERROR_INTERNAL;
5772 }
5773 #undef LBL
5774 #endif /* NO_RECURSE */
5775 }
5776
5777
5778 /***************************************************************************
5779 ****************************************************************************
5780 RECURSION IN THE match() FUNCTION
5781
5782 Undefine all the macros that were defined above to handle this. */
5783
5784 #ifdef NO_RECURSE /* build in which match() uses the heap, not the stack, for its recursive calls */
5785 #undef eptr /* NOTE(review): this first group looks like match()'s own argument names - confirm against the macro definitions */
5786 #undef ecode
5787 #undef mstart
5788 #undef offset_top
5789 #undef eptrb
5790 #undef flags /* pcre_exec() below declares its own local called flags */
5791
5792 #undef callpat
5793 #undef charptr
5794 #undef data
5795 #undef next
5796 #undef pp
5797 #undef prev
5798 #undef saved_eptr
5799
5800 #undef new_recursive
5801
5802 #undef cur_is_word
5803 #undef condition
5804 #undef prev_is_word
5805
5806 #undef ctype
5807 #undef length /* pcre_exec() below has a parameter called length */
5808 #undef max
5809 #undef min
5810 #undef number
5811 #undef offset
5812 #undef op
5813 #undef save_capture_last
5814 #undef save_offset1
5815 #undef save_offset2
5816 #undef save_offset3
5817 #undef stacksave
5818
5819 #undef newptrb
5820
5821 #endif /* NO_RECURSE */
5822
5823 /* fc and fi are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally here */
5824
5825 #undef fc
5826 #undef fi
5827
5828 /***************************************************************************
5829 ***************************************************************************/
5830
5831
5832
5833 /*************************************************
5834 * Execute a Regular Expression *
5835 *************************************************/
5836
5837 /* This function applies a compiled re to a subject string and picks out
5838 portions of the string if it matches. Two elements in the vector are set for
5839 each substring: the offsets to the start and end of the substring.
5840
5841 Arguments:
5842 argument_re points to the compiled expression
5843 extra_data points to extra data or is NULL
5844 subject points to the subject string
5845 length length of subject string (may contain binary zeros)
5846 start_offset where to start in the subject string
5847 options option bits
5848 offsets points to a vector of ints to be filled in with offsets
5849 offsetcount the number of elements in the vector
5850
5851 Returns: > 0 => success; value is one more than the highest numbered offset pair that was set
5852 = 0 => success, but offsets is not big enough
5853 -1 => failed to match
5854 < -1 => some kind of unexpected problem
5855 */
5856
5857 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5858 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5859 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5860 int offsetcount)
5861 {
5862 int rc, resetcount, ocount;
5863 int first_byte = -1;
5864 int req_byte = -1;
5865 int req_byte2 = -1;
5866 int newline;
5867 BOOL using_temporary_offsets = FALSE;
5868 BOOL anchored;
5869 BOOL startline;
5870 BOOL firstline;
5871 BOOL first_byte_caseless = FALSE;
5872 BOOL req_byte_caseless = FALSE;
5873 BOOL utf8;
5874 match_data match_block;
5875 match_data *md = &match_block;
5876 const uschar *tables;
5877 const uschar *start_bits = NULL;
5878 USPTR start_match = (USPTR)subject + start_offset;
5879 USPTR end_subject;
5880 USPTR start_partial = NULL;
5881 USPTR req_byte_ptr = start_match - 1;
5882
5883 pcre_study_data internal_study;
5884 const pcre_study_data *study;
5885
5886 real_pcre internal_re;
5887 const real_pcre *external_re = (const real_pcre *)argument_re;
5888 const real_pcre *re = external_re;
5889
5890 /* Plausibility checks */
5891
5892 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5893 if (re == NULL || subject == NULL ||
5894 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5895 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5896 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5897
5898 /* This information is for finding all the numbers associated with a given
5899 name, for condition testing. */
5900
5901 md->name_table = (uschar *)re + re->name_table_offset;
5902 md->name_count = re->name_count;
5903 md->name_entry_size = re->name_entry_size;
5904
5905 /* Fish out the optional data from the extra_data structure, first setting
5906 the default values. */
5907
5908 study = NULL;
5909 md->match_limit = MATCH_LIMIT;
5910 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5911 md->callout_data = NULL;
5912
5913 /* The table pointer is always in native byte order. */
5914
5915 tables = external_re->tables;
5916
5917 if (extra_data != NULL)
5918 {
5919 register unsigned int flags = extra_data->flags;
5920 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5921 study = (const pcre_study_data *)extra_data->study_data;
5922 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5923 md->match_limit = extra_data->match_limit;
5924 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5925 md->match_limit_recursion = extra_data->match_limit_recursion;
5926 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5927 md->callout_data = extra_data->callout_data;
5928 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5929 }
5930
5931 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5932 is a feature that makes it possible to save compiled regex and re-use them
5933 in other programs later. */
5934
5935 if (tables == NULL) tables = _pcre_default_tables;
5936
5937 /* Check that the first field in the block is the magic number. If it is not,
5938 test for a regex that was compiled on a host of opposite endianness. If this is
5939 the case, flipped values are put in internal_re and internal_study if there was
5940 study data too. */
5941
5942 if (re->magic_number != MAGIC_NUMBER)
5943 {
5944 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5945 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5946 if (study != NULL) study = &internal_study;
5947 }
5948
5949 /* Set up other data */
5950
5951 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5952 startline = (re->flags & PCRE_STARTLINE) != 0;
5953 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5954
5955 /* The code starts after the real_pcre block and the capture name table. */
5956
5957 md->start_code = (const uschar *)external_re + re->name_table_offset +
5958 re->name_count * re->name_entry_size;
5959
5960 md->start_subject = (USPTR)subject;
5961 md->start_offset = start_offset;
5962 md->end_subject = md->start_subject + length;
5963 end_subject = md->end_subject;
5964
5965 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5966 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5967 md->use_ucp = (re->options & PCRE_UCP) != 0;
5968 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5969
5970 /* Some options are unpacked into BOOL variables in the hope that testing
5971 them will be faster than individual option bits. */
5972
5973 md->notbol = (options & PCRE_NOTBOL) != 0;
5974 md->noteol = (options & PCRE_NOTEOL) != 0;
5975 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5976 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5977 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5978 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5979
5980
5981 md->hitend = FALSE;
5982 md->mark = NULL; /* In case never set */
5983
5984 md->recursive = NULL; /* No recursion at top level */
5985
5986 md->lcc = tables + lcc_offset;
5987 md->ctypes = tables + ctypes_offset;
5988
5989 /* Handle different \R options. */
5990
5991 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5992 {
5993 case 0:
5994 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5995 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5996 else
5997 #ifdef BSR_ANYCRLF
5998 md->bsr_anycrlf = TRUE;
5999 #else
6000 md->bsr_anycrlf = FALSE;
6001 #endif
6002 break;
6003
6004 case PCRE_BSR_ANYCRLF:
6005 md->bsr_anycrlf = TRUE;
6006 break;
6007
6008 case PCRE_BSR_UNICODE:
6009 md->bsr_anycrlf = FALSE;
6010 break;
6011
6012 default: return PCRE_ERROR_BADNEWLINE;
6013 }
6014
6015 /* Handle different types of newline. The three bits give eight cases. If
6016 nothing is set at run time, whatever was used at compile time applies. */
6017
6018 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6019 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6020 {
6021 case 0: newline = NEWLINE; break; /* Compile-time default */
6022 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6023 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6024 case PCRE_NEWLINE_CR+
6025 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6026 case PCRE_NEWLINE_ANY: newline = -1; break;
6027 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6028 default: return PCRE_ERROR_BADNEWLINE;
6029 }
6030
6031 if (newline == -2)
6032 {
6033 md->nltype = NLTYPE_ANYCRLF;
6034 }
6035 else if (newline < 0)
6036 {
6037 md->nltype = NLTYPE_ANY;
6038 }
6039 else
6040 {
6041 md->nltype = NLTYPE_FIXED;
6042 if (newline > 255)
6043 {
6044 md->nllen = 2;
6045 md->nl[0] = (newline >> 8) & 255;
6046 md->nl[1] = newline & 255;
6047 }
6048 else
6049 {
6050 md->nllen = 1;
6051 md->nl[0] = newline;
6052 }
6053 }
6054
6055 /* Partial matching was originally supported only for a restricted set of
6056 regexes; from release 8.00 there are no restrictions, but the bits are still
6057 defined (though never set). So there's no harm in leaving this code. */
6058
6059 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6060 return PCRE_ERROR_BADPARTIAL;
6061
6062 /* Check a UTF-8 string if required. Pass back the character offset and error
6063 code for an invalid string if a results vector is available. */
6064
6065 #ifdef SUPPORT_UTF8
6066 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
6067 {
6068 int erroroffset;
6069 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
6070 if (errorcode != 0)
6071 {
6072 if (offsetcount >= 2)
6073 {
6074 offsets[0] = erroroffset;
6075 offsets[1] = errorcode;
6076 }
6077 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6078 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6079 }
6080
6081 /* Check that a start_offset points to the start of a UTF-8 character. */
6082
6083 if (start_offset > 0 && start_offset < length &&
6084 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6085 return PCRE_ERROR_BADUTF8_OFFSET;
6086 }
6087 #endif
6088
6089 /* If the expression has got more back references than the offsets supplied can
6090 hold, we get a temporary chunk of working store to use during the matching.
6091 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6092 of 3. */
6093
6094 ocount = offsetcount - (offsetcount % 3);
6095
6096 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6097 {
6098 ocount = re->top_backref * 3 + 3;
6099 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6100 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6101 using_temporary_offsets = TRUE;
6102 DPRINTF(("Got memory to hold back references\n"));
6103 }
6104 else md->offset_vector = offsets;
6105
6106 md->offset_end = ocount;
6107 md->offset_max = (2*ocount)/3;
6108 md->offset_overflow = FALSE;
6109 md->capture_last = -1;
6110
6111 /* Compute the minimum number of offsets that we need to reset each time. Doing
6112 this makes a huge difference to execution time when there aren't many brackets
6113 in the pattern. */
6114
6115 resetcount = 2 + re->top_bracket * 2;
6116 if (resetcount > offsetcount) resetcount = ocount;
6117
6118 /* Reset the working variable associated with each extraction. These should
6119 never be used unless previously set, but they get saved and restored, and so we
6120 initialize them to avoid reading uninitialized locations. */
6121
6122 if (md->offset_vector != NULL)
6123 {
6124 register int *iptr = md->offset_vector + ocount;
6125 register int *iend = iptr - resetcount/2 + 1;
6126 while (--iptr >= iend) *iptr = -1;
6127 }
6128
6129 /* Set up the first character to match, if available. The first_byte value is
6130 never set for an anchored regular expression, but the anchoring may be forced
6131 at run time, so we have to test for anchoring. The first char may be unset for
6132 an unanchored pattern, of course. If there's no first char and the pattern was
6133 studied, there may be a bitmap of possible first characters. */
6134
6135 if (!anchored)
6136 {
6137 if ((re->flags & PCRE_FIRSTSET) != 0)
6138 {
6139 first_byte = re->first_byte & 255;
6140 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6141 first_byte = md->lcc[first_byte];
6142 }
6143 else
6144 if (!startline && study != NULL &&
6145 (study->flags & PCRE_STUDY_MAPPED) != 0)
6146 start_bits = study->start_bits;
6147 }
6148
6149 /* For anchored or unanchored matches, there may be a "last known required
6150 character" set. */
6151
6152 if ((re->flags & PCRE_REQCHSET) != 0)
6153 {
6154 req_byte = re->req_byte & 255;
6155 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6156 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6157 }
6158
6159
6160 /* ==========================================================================*/
6161
6162 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6163 the loop runs just once. */
6164
6165 for(;;)
6166 {
6167 USPTR save_end_subject = end_subject;
6168 USPTR new_start_match;
6169
6170 /* Reset the maximum number of extractions we might see. */
6171
6172 if (md->offset_vector != NULL)
6173 {
6174 register int *iptr = md->offset_vector;
6175 register int *iend = iptr + resetcount;
6176 while (iptr < iend) *iptr++ = -1;
6177 }
6178
6179 /* If firstline is TRUE, the start of the match is constrained to the first
6180 line of a multiline string. That is, the match must be before or at the first
6181 newline. Implement this by temporarily adjusting end_subject so that we stop
6182 scanning at a newline. If the match fails at the newline, later code breaks
6183 this loop. */
6184
6185 if (firstline)
6186 {
6187 USPTR t = start_match;
6188 #ifdef SUPPORT_UTF8
6189 if (utf8)
6190 {
6191 while (t < md->end_subject && !IS_NEWLINE(t))
6192 {
6193 t++;
6194 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6195 }
6196 }
6197 else
6198 #endif
6199 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6200 end_subject = t;
6201 }
6202
6203 /* There are some optimizations that avoid running the match if a known
6204 starting point is not found, or if a known later character is not present.
6205 However, there is an option that disables these, for testing and for ensuring
6206 that all callouts do actually occur. The option can be set in the regex by
6207 (*NO_START_OPT) or passed in match-time options. */
6208
6209 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6210 {
6211 /* Advance to a unique first byte if there is one. */
6212
6213 if (first_byte >= 0)
6214 {
6215 if (first_byte_caseless)
6216 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6217 start_match++;
6218 else
6219 while (start_match < end_subject && *start_match != first_byte)
6220 start_match++;
6221 }
6222
6223 /* Or to just after a linebreak for a multiline match */
6224
6225 else if (startline)
6226 {
6227 if (start_match > md->start_subject + start_offset)
6228 {
6229 #ifdef SUPPORT_UTF8
6230 if (utf8)
6231 {
6232 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6233 {
6234 start_match++;
6235 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6236 start_match++;
6237 }
6238 }
6239 else
6240 #endif
6241 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6242 start_match++;
6243
6244 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6245 and we are now at a LF, advance the match position by one more character.
6246 */
6247
6248 if (start_match[-1] == CHAR_CR &&
6249 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6250 start_match < end_subject &&
6251 *start_match == CHAR_NL)
6252 start_match++;
6253 }
6254 }
6255
6256 /* Or to a non-unique first byte after study */
6257
6258 else if (start_bits != NULL)
6259 {
6260 while (start_match < end_subject)
6261 {
6262 register unsigned int c = *start_match;
6263 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6264 {
6265 start_match++;
6266 #ifdef SUPPORT_UTF8
6267 if (utf8)
6268 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6269 start_match++;
6270 #endif
6271 }
6272 else break;
6273 }
6274 }
6275 } /* Starting optimizations */
6276
6277 /* Restore fudged end_subject */
6278
6279 end_subject = save_end_subject;
6280
6281 /* The following two optimizations are disabled for partial matching or if
6282 disabling is explicitly requested. */
6283
6284 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6285 {
6286 /* If the pattern was studied, a minimum subject length may be set. This is
6287 a lower bound; no actual string of that length may actually match the
6288 pattern. Although the value is, strictly, in characters, we treat it as
6289 bytes to avoid spending too much time in this optimization. */
6290
6291 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6292 (pcre_uint32)(end_subject - start_match) < study->minlength)
6293 {
6294 rc = MATCH_NOMATCH;
6295 break;
6296 }
6297
6298 /* If req_byte is set, we know that that character must appear in the
6299 subject for the match to succeed. If the first character is set, req_byte
6300 must be later in the subject; otherwise the test starts at the match point.
6301 This optimization can save a huge amount of backtracking in patterns with
6302 nested unlimited repeats that aren't going to match. Writing separate code
6303 for cased/caseless versions makes it go faster, as does using an
6304 autoincrement and backing off on a match.
6305
6306 HOWEVER: when the subject string is very, very long, searching to its end
6307 can take a long time, and give bad performance on quite ordinary patterns.
6308 This showed up when somebody was matching something like /^\d+C/ on a
6309 32-megabyte string... so we don't do this when the string is sufficiently
6310 long. */
6311
6312 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6313 {
6314 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6315
6316 /* We don't need to repeat the search if we haven't yet reached the
6317 place we found it at last time. */
6318
6319 if (p > req_byte_ptr)
6320 {
6321 if (req_byte_caseless)
6322 {
6323 while (p < end_subject)
6324 {
6325 register int pp = *p++;
6326 if (pp == req_byte || pp == req_byte2) { p--; break; }
6327 }
6328 }
6329 else
6330 {
6331 while (p < end_subject)
6332 {
6333 if (*p++ == req_byte) { p--; break; }
6334 }
6335 }
6336
6337 /* If we can't find the required character, break the matching loop,
6338 forcing a match failure. */
6339
6340 if (p >= end_subject)
6341 {
6342 rc = MATCH_NOMATCH;
6343 break;
6344 }
6345
6346 /* If we have found the required character, save the point where we
6347 found it, so that we don't search again next time round the loop if
6348 the start hasn't passed this character yet. */
6349
6350 req_byte_ptr = p;
6351 }
6352 }
6353 }
6354
6355 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6356 printf(">>>> Match against: ");
6357 pchars(start_match, end_subject - start_match, TRUE, md);
6358 printf("\n");
6359 #endif
6360
6361 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6362 first starting point for which a partial match was found. */
6363
6364 md->start_match_ptr = start_match;
6365 md->start_used_ptr = start_match;
6366 md->match_call_count = 0;
6367 md->match_function_type = 0;
6368 md->end_offset_top = 0;
6369 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6370 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6371
6372 switch(rc)
6373 {
6374 /* SKIP passes back the next starting point explicitly, but if it is the
6375 same as the match we have just done, treat it as NOMATCH. */
6376
6377 case MATCH_SKIP:
6378 if (md->start_match_ptr != start_match)
6379 {
6380 new_start_match = md->start_match_ptr;
6381 break;
6382 }
6383 /* Fall through */
6384
6385 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6386 the SKIP's arg was not found. We also treat this as NOMATCH. */
6387
6388 case MATCH_SKIP_ARG:
6389 /* Fall through */
6390
6391 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6392 exactly like PRUNE. */
6393
6394 case MATCH_NOMATCH:
6395 case MATCH_PRUNE:
6396 case MATCH_THEN:
6397 new_start_match = start_match + 1;
6398 #ifdef SUPPORT_UTF8
6399 if (utf8)
6400 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6401 new_start_match++;
6402 #endif
6403 break;
6404
6405 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6406
6407 case MATCH_COMMIT:
6408 rc = MATCH_NOMATCH;
6409 goto ENDLOOP;
6410
6411 /* Any other return is either a match, or some kind of error. */
6412
6413 default:
6414 goto ENDLOOP;
6415 }
6416
6417 /* Control reaches here for the various types of "no match at this point"
6418 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6419
6420 rc = MATCH_NOMATCH;
6421
6422 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6423 newline in the subject (though it may continue over the newline). Therefore,
6424 if we have just failed to match, starting at a newline, do not continue. */
6425
6426 if (firstline && IS_NEWLINE(start_match)) break;
6427
6428 /* Advance to new matching position */
6429
6430 start_match = new_start_match;
6431
6432 /* Break the loop if the pattern is anchored or if we have passed the end of
6433 the subject. */
6434
6435 if (anchored || start_match > end_subject) break;
6436
6437 /* If we have just passed a CR and we are now at a LF, and the pattern does
6438 not contain any explicit matches for \r or \n, and the newline option is CRLF
6439 or ANY or ANYCRLF, advance the match position by one more character. */
6440
6441 if (start_match[-1] == CHAR_CR &&
6442 start_match < end_subject &&
6443 *start_match == CHAR_NL &&
6444 (re->flags & PCRE_HASCRORLF) == 0 &&
6445 (md->nltype == NLTYPE_ANY ||
6446 md->nltype == NLTYPE_ANYCRLF ||
6447 md->nllen == 2))
6448 start_match++;
6449
6450 md->mark = NULL; /* Reset for start of next match attempt */
6451 } /* End of for(;;) "bumpalong" loop */
6452
6453 /* ==========================================================================*/
6454
6455 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6456 conditions is true:
6457
6458 (1) The pattern is anchored or the match was failed by (*COMMIT);
6459
6460 (2) We are past the end of the subject;
6461
6462 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6463 this option requests that a match occur at or before the first newline in
6464 the subject.
6465
6466 When we have a match and the offset vector is big enough to deal with any
6467 backreferences, captured substring offsets will already be set up. In the case
6468 where we had to get some local store to hold offsets for backreference
6469 processing, copy those that we can. In this case there need not be overflow if
6470 certain parts of the pattern were not used, even though there are more
6471 capturing parentheses than vector slots. */
6472
6473 ENDLOOP:
6474
6475 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6476 {
6477 if (using_temporary_offsets)
6478 {
6479 if (offsetcount >= 4)
6480 {
6481 memcpy(offsets + 2, md->offset_vector + 2,
6482 (offsetcount - 2) * sizeof(int));
6483 DPRINTF(("Copied offsets from temporary memory\n"));
6484 }
6485 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6486 DPRINTF(("Freeing temporary memory\n"));
6487 (pcre_free)(md->offset_vector);
6488 }
6489
6490 /* Set the return code to the number of captured strings, or 0 if there are
6491 too many to fit into the vector. */
6492
6493 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6494
6495 /* If there is space, set up the whole thing as substring 0. The value of
6496 md->start_match_ptr might be modified if \K was encountered on the success
6497 matching path. */
6498
6499 if (offsetcount < 2) rc = 0; else
6500 {
6501 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6502 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6503 }
6504
6505 DPRINTF((">>>> returning %d\n", rc));
6506 goto RETURN_MARK;
6507 }
6508
6509 /* Control gets here if there has been an error, or if the overall match
6510 attempt has failed at all permitted starting positions. */
6511
6512 if (using_temporary_offsets)
6513 {
6514 DPRINTF(("Freeing temporary memory\n"));
6515 (pcre_free)(md->offset_vector);
6516 }
6517
6518 /* For anything other than nomatch or partial match, just return the code. */
6519
6520 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6521 {
6522 DPRINTF((">>>> error: returning %d\n", rc));
6523 return rc;
6524 }
6525
6526 /* Handle partial matches - disable any mark data */
6527
6528 if (start_partial != NULL)
6529 {
6530 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6531 md->mark = NULL;
6532 if (offsetcount > 1)
6533 {
6534 offsets[0] = (int)(start_partial - (USPTR)subject);
6535 offsets[1] = (int)(end_subject - (USPTR)subject);
6536 }
6537 rc = PCRE_ERROR_PARTIAL;
6538 }
6539
6540 /* This is the classic nomatch case */
6541
6542 else
6543 {
6544 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6545 rc = PCRE_ERROR_NOMATCH;
6546 }
6547
6548 /* Return the MARK data if it has been requested. */
6549
6550 RETURN_MARK:
6551
6552 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6553 *(extra_data->mark) = (unsigned char *)(md->mark);
6554 return rc;
6555 }
6556
6557 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12