/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 615 - (show annotations) (download)
Mon Jul 11 14:23:06 2011 UTC (2 years, 9 months ago) by ph10
File MIME type: text/plain
File size: 195321 byte(s)
A better patch for the atomic capturing not resetting bug.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2011 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Values for setting in md->match_function_type to indicate two special types
61 of call to match(). We do it this way to save on using another stack variable,
62 as stack usage is to be discouraged. */
63
64 #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
65 #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Special internal returns from the match() function. Make them sufficiently
74 negative to avoid the external error codes. The notes on the individual values
say where each one is produced, as far as the opcode handlers in match() show. */
75
76 #define MATCH_ACCEPT (-999)
77 #define MATCH_COMMIT (-998) /* Passed back by the OP_COMMIT handler */
78 #define MATCH_KETRPOS (-997) /* Passed back on reaching OP_KETRPOS */
79 #define MATCH_PRUNE (-996) /* Passed back by OP_PRUNE and OP_PRUNE_ARG */
80 #define MATCH_SKIP (-995) /* OP_SKIP, or OP_MARK when a SKIP name matches; position in md->start_match_ptr */
81 #define MATCH_SKIP_ARG (-994) /* OP_SKIP_ARG; the skip name is passed in md->start_match_ptr */
82 #define MATCH_THEN (-993) /* OP_THEN and OP_THEN_ARG; branch start is in md->start_match_ptr */
83
84 /* This is a convenience macro for code that occurs many times. It copies the
current mark pointer (markptr) into md->mark and then leaves match() through
RRETURN with the value "ra". The expansion is a brace-enclosed block that uses
the names md and markptr from the scope in which it is invoked. */
85
86 #define MRRETURN(ra) \
87 { \
88 md->mark = markptr; \
89 RRETURN(ra); \
90 }
91
92 /* Maximum number of ints of offset to save on the stack for recursive calls.
93 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
94 because the offset vector is always a multiple of 3 long. */
95
96 #define REC_STACK_SAVE_MAX 30
97
98 /* Min and max values for the common repeats; for the maxima, 0 => infinity.
NOTE(review): the six entries look like they correspond to the quantifiers
*, *?, +, +?, ?, ?? in that order, but the code that indexes these tables is not
visible in this part of the file - confirm against the users in match(). */
99
100 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
101 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
102
103
104
105 #ifdef PCRE_DEBUG
106 /*************************************************
107 * Debugging function to print chars *
108 *************************************************/
109
110 /* Print a sequence of chars in printable format, stopping at the end of the
111 subject if requested.
112
113 Arguments:
114 p points to characters
115 length number to print
116 is_subject TRUE if printing from within md->start_subject
117 md pointer to matching data block, if is_subject is TRUE
118
119 Returns: nothing
120 */
121
122 static void
123 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
124 {
125 unsigned int c;
126 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
127 while (length-- > 0)
128 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
129 }
130 #endif
131
132
133
134 /*************************************************
135 * Match a back-reference *
136 *************************************************/
137
138 /* Normally, if a back reference hasn't been set, the length that is passed is
139 negative, so the match always fails. However, in JavaScript compatibility mode,
140 the length passed is zero. Note that in caseless UTF-8 mode, the number of
141 subject bytes matched may be different to the number of reference bytes.
142
143 Arguments:
144 offset index into the offset vector
145 eptr pointer into the subject
146 length length of reference to be matched (number of bytes)
147 md points to match data block
148 caseless TRUE if caseless
149
150 Returns: < 0 if not matched, otherwise the number of subject bytes matched
151 */
152
153 static int
154 match_ref(int offset, register USPTR eptr, int length, match_data *md,
155 BOOL caseless)
156 {
157 USPTR eptr_start = eptr;
158 register USPTR p = md->start_subject + md->offset_vector[offset];
159
160 #ifdef PCRE_DEBUG
161 if (eptr >= md->end_subject)
162 printf("matching subject <null>");
163 else
164 {
165 printf("matching subject ");
166 pchars(eptr, length, TRUE, md);
167 }
168 printf(" against backref ");
169 pchars(p, length, FALSE, md);
170 printf("\n");
171 #endif
172
173 /* Always fail if reference not set (and not JavaScript compatible). */
174
175 if (length < 0) return -1;
176
177 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
178 properly if Unicode properties are supported. Otherwise, we can check only
179 ASCII characters. */
180
181 if (caseless)
182 {
183 #ifdef SUPPORT_UTF8
184 #ifdef SUPPORT_UCP
185 if (md->utf8)
186 {
187 /* Match characters up to the end of the reference. NOTE: the number of
188 bytes matched may differ, because there are some characters whose upper and
189 lower case versions code as different numbers of bytes. For example, U+023A
190 (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
191 a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
192 the latter. It is important, therefore, to check the length along the
193 reference, not along the subject (earlier code did this wrong). */
194
195 USPTR endptr = p + length;
196 while (p < endptr)
197 {
198 int c, d;
199 if (eptr >= md->end_subject) return -1;
200 GETCHARINC(c, eptr);
201 GETCHARINC(d, p);
202 if (c != d && c != UCD_OTHERCASE(d)) return -1;
203 }
204 }
205 else
206 #endif
207 #endif
208
209 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
210 is no UCP support. */
211 {
212 if (eptr + length > md->end_subject) return -1;
213 while (length-- > 0)
214 { if (md->lcc[*p++] != md->lcc[*eptr++]) return -1; }
215 }
216 }
217
218 /* In the caseful case, we can just compare the bytes, whether or not we
219 are in UTF-8 mode. */
220
221 else
222 {
223 if (eptr + length > md->end_subject) return -1;
224 while (length-- > 0) if (*p++ != *eptr++) return -1;
225 }
226
227 return eptr - eptr_start;
228 }
229
230
231
232 /***************************************************************************
233 ****************************************************************************
234 RECURSION IN THE match() FUNCTION
235
236 The match() function is highly recursive, though not every recursive call
237 increases the recursive depth. Nevertheless, some regular expressions can cause
238 it to recurse to a great depth. I was writing for Unix, so I just let it call
239 itself recursively. This uses the stack for saving everything that has to be
240 saved for a recursive call. On Unix, the stack can be large, and this works
241 fine.
242
243 It turns out that on some non-Unix-like systems there are problems with
244 programs that use a lot of stack. (This despite the fact that every last chip
245 has oodles of memory these days, and techniques for extending the stack have
246 been known for decades.) So....
247
248 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
249 calls by keeping local variables that need to be preserved in blocks of memory
250 obtained from malloc() instead of on the stack. Macros are used to
251 achieve this so that the actual code doesn't look very different to what it
252 always used to.
253
254 The original heap-recursive code used longjmp(). However, it seems that this
255 can be very slow on some operating systems. Following a suggestion from Stan
256 Switzer, the use of longjmp() has been abolished, at the cost of having to
257 provide a unique number for each call to RMATCH. There is no way of generating
258 a sequence of numbers at compile time in C. I have given them names, to make
259 them stand out more clearly.
260
261 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
262 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
263 tests. Furthermore, not using longjmp() means that local dynamic variables
264 don't have indeterminate values; this has meant that the frame size can be
265 reduced because the result can be "passed back" by straight setting of the
266 variable instead of being passed in the frame.
267 ****************************************************************************
268 ***************************************************************************/
269
270 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
271 below must be updated in sync. The values run consecutively from 1 (RM1) to
63 (RM63). Each one is passed as the "rw" argument of RMATCH; the heap version
of that macro stores it in the current frame's Xwhere field and pastes it into
the name of a label (L_RMn), so a given value can identify only one call. */
272
273 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
274 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
275 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
276 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
277 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
278 RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
279 RM61, RM62, RM63};
280
281 /* These versions of the macros use the stack, as normal. There are debugging
282 versions and production versions. Note that the "rw" argument of RMATCH isn't
283 actually used in this definition. */
284
285 #ifndef NO_RECURSE
286 #define REGISTER register
287
/* Debugging versions: RMATCH reports the source line before and after a real
recursive call to match(), and RRETURN reports the value being returned. Both
expand to brace-enclosed blocks. The call passes the caller's mstart and
markptr unchanged and increases rdepth by one; the result is left in rrc. */
288 #ifdef PCRE_DEBUG
289 #define RMATCH(ra,rb,rc,rd,re,rw) \
290 { \
291 printf("match() called in line %d\n", __LINE__); \
292 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1); \
293 printf("to line %d\n", __LINE__); \
294 }
295 #define RRETURN(ra) \
296 { \
297 printf("match() returned %d from line %d ", ra, __LINE__); \
298 return ra; \
299 }
300 #else
/* Production versions: RMATCH is just the recursive call with its result
assigned to rrc, and RRETURN is a plain return. Unlike the debugging versions,
these expand to a single statement without enclosing braces. */
301 #define RMATCH(ra,rb,rc,rd,re,rw) \
302 rrc = match(ra,rb,mstart,markptr,rc,rd,re,rdepth+1)
303 #define RRETURN(ra) return ra
304 #endif
305
306 #else
307
308
309 /* These versions of the macros manage a private stack on the heap. Note that
310 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
311 argument of match(), which never changes. */
312
313 #define REGISTER
314
/* Heap version of RMATCH: simulate a recursive call to match() without using
the C stack. A new heapframe is obtained with pcre_stack_malloc; if that fails,
PCRE_ERROR_NOMEMORY is passed back through RRETURN. The resume point "rw" is
recorded in the current frame's Xwhere, the values ra, rb, rc and re together
with the current mstart and markptr are copied into the new frame, its depth is
set to one more than the current depth, and control jumps to HEAP_RECURSE. The
label L_<rw> at the end of the expansion is the place to continue afterwards
(presumably reached from the HEAP_RETURN dispatch, which is not visible in this
part of the file); because a label is defined, each rw value can be used at
only one call site. The "rd" argument is ignored. */
315 #define RMATCH(ra,rb,rc,rd,re,rw)\
316 {\
317 heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
318 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
319 frame->Xwhere = rw; \
320 newframe->Xeptr = ra;\
321 newframe->Xecode = rb;\
322 newframe->Xmstart = mstart;\
323 newframe->Xmarkptr = markptr;\
324 newframe->Xoffset_top = rc;\
325 newframe->Xeptrb = re;\
326 newframe->Xrdepth = frame->Xrdepth + 1;\
327 newframe->Xprevframe = frame;\
328 frame = newframe;\
329 DPRINTF(("restarting from line %d\n", __LINE__));\
330 goto HEAP_RECURSE;\
331 L_##rw:\
332 DPRINTF(("jumped back to line %d\n", __LINE__));\
333 }
334
/* Heap version of RRETURN: pop the current frame and release it with
pcre_stack_free. If a previous frame exists, the value "ra" is placed in rrc
and control jumps to HEAP_RETURN so that the previous frame can resume;
otherwise the top level has been reached and ra is returned from match().
NOTE(review): the expansion dereferences frame unconditionally, yet the
allocation-failure check at the top of match() invokes RRETURN when frame is
NULL - confirm that this path cannot crash. */
335 #define RRETURN(ra)\
336 {\
337 heapframe *oldframe = frame;\
338 frame = oldframe->Xprevframe;\
339 (pcre_stack_free)(oldframe);\
340 if (frame != NULL)\
341 {\
342 rrc = ra;\
343 goto HEAP_RETURN;\
344 }\
345 return ra;\
346 }
347
348
349 /* Structure for remembering the local variables in a private frame */
350
/* One heapframe holds everything that a single "incarnation" of match() needs
to keep across an RMATCH when NO_RECURSE is defined: the changeable function
arguments, the local variables, and the point at which to resume. Frames are
chained through Xprevframe. Inside match(), macros map the ordinary variable
names (eptr, ecode, ...) onto the X-prefixed fields of the current frame. */
351 typedef struct heapframe {
352 struct heapframe *Xprevframe; /* Calling frame; NULL marks the top level */
353
354 /* Function arguments that may change */
355
356 USPTR Xeptr;
357 const uschar *Xecode;
358 USPTR Xmstart;
359 USPTR Xmarkptr;
360 int Xoffset_top;
361 eptrblock *Xeptrb;
362 unsigned int Xrdepth; /* RMATCH sets this to the caller's depth plus one */
363
364 /* Function local variables */
365
366 USPTR Xcallpat;
367 #ifdef SUPPORT_UTF8
368 USPTR Xcharptr;
369 #endif
370 USPTR Xdata;
371 USPTR Xnext;
372 USPTR Xpp;
373 USPTR Xprev;
374 USPTR Xsaved_eptr;
375
376 recursion_info Xnew_recursive;
377
378 BOOL Xcur_is_word; /* Doubles as allow_zero in match() */
379 BOOL Xcondition; /* Doubles as cbegroup and condassert in match() */
380 BOOL Xprev_is_word; /* Doubles as matched_once in match() */
381
382 #ifdef SUPPORT_UCP
383 int Xprop_type;
384 int Xprop_value;
385 int Xprop_fail_result;
386 int Xprop_category;
387 int Xprop_chartype;
388 int Xprop_script;
389 int Xoclength;
390 uschar Xocchars[8];
391 #endif
392
393 int Xcodelink; /* Doubles as code_offset in match() */
394 int Xctype;
395 unsigned int Xfc;
396 int Xfi;
397 int Xlength;
398 int Xmax;
399 int Xmin;
400 int Xnumber;
401 int Xoffset;
402 int Xop;
403 int Xsave_capture_last;
404 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
405 int Xstacksave[REC_STACK_SAVE_MAX];
406
407 eptrblock Xnewptrb;
408
409 /* Where to jump back to */
410
411 int Xwhere; /* RMn value stored by RMATCH just before it "recurses" */
412
413 } heapframe;
414
415 #endif
416
417
418 /***************************************************************************
419 ***************************************************************************/
420
421
422
423 /*************************************************
424 * Match from current position *
425 *************************************************/
426
427 /* This function is called recursively in many circumstances. Whenever it
428 returns a negative (error) response, the outer incarnation must also return the
429 same response. */
430
431 /* These macros pack up tests that are used for partial matching, and which
432 appear several times in the code. We set the "hit end" flag if the pointer is
433 at the end of the subject and also past the start of the subject (i.e.
434 something has been matched). For hard partial matching, we then return
435 immediately. The second one is used when we already know we are past the end of
436 the subject. */
437
/* CHECK_PARTIAL does nothing unless md->partial is non-zero, eptr has reached
md->end_subject, and eptr is beyond md->start_used_ptr. It then sets md->hitend
and, when md->partial is greater than 1, returns PCRE_ERROR_PARTIAL through
MRRETURN. It uses the names md, eptr and markptr from the invoking scope. */
438 #define CHECK_PARTIAL()\
439 if (md->partial != 0 && eptr >= md->end_subject && \
440 eptr > md->start_used_ptr) \
441 { \
442 md->hitend = TRUE; \
443 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
444 }
445
/* SCHECK_PARTIAL is the same except that it omits the end-of-subject test,
which the caller is expected to have made already. */
446 #define SCHECK_PARTIAL()\
447 if (md->partial != 0 && eptr > md->start_used_ptr) \
448 { \
449 md->hitend = TRUE; \
450 if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
451 }
452
453
454 /* Performance note: It might be tempting to extract commonly used fields from
455 the md structure (e.g. utf8, end_subject) into individual variables to improve
456 performance. Tests using gcc on a SPARC disproved this; in the first case, it
457 made performance worse.
458
459 Arguments:
460 eptr pointer to current character in subject
461 ecode pointer to current position in compiled code
462 mstart pointer to the current match start position (can be modified
463 by encountering \K)
464 markptr pointer to the most recent MARK name, or NULL
465 offset_top current top pointer
466 md pointer to "static" info for the match
467 eptrb pointer to chain of blocks containing eptr at start of
468 brackets - for testing for empty matches
469 rdepth the recursion depth
470
471 Returns: MATCH_MATCH if matched ) these values are >= 0
472 MATCH_NOMATCH if failed to match )
473 a negative MATCH_xxx value for PRUNE, SKIP, etc
474 a negative PCRE_ERROR_xxx value if aborted by an error condition
475 (e.g. stopped by repeated call or recursion limit)
476 */
477
478 static int
479 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
480 const uschar *markptr, int offset_top, match_data *md, eptrblock *eptrb,
481 unsigned int rdepth)
482 {
483 /* These variables do not need to be preserved over recursion in this function,
484 so they can be ordinary variables in all cases. Mark some of them with
485 "register" because they are used a lot in loops. */
486
487 register int rrc; /* Returns from recursive calls */
488 register int i; /* Used for loops not involving calls to RMATCH() */
489 register unsigned int c; /* Character values not kept over RMATCH() calls */
490 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
491
492 BOOL minimize, possessive; /* Quantifier options */
493 BOOL caseless;
494 int condcode;
495
496 /* When recursion is not being used, all "local" variables that have to be
497 preserved over calls to RMATCH() are part of a "frame" which is obtained from
498 heap storage. Set up the top-level frame here; others are obtained from the
499 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
500
501 #ifdef NO_RECURSE
502 heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
503 if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
504 frame->Xprevframe = NULL; /* Marks the top level */
505
506 /* Copy in the original argument variables */
507
508 frame->Xeptr = eptr;
509 frame->Xecode = ecode;
510 frame->Xmstart = mstart;
511 frame->Xmarkptr = markptr;
512 frame->Xoffset_top = offset_top;
513 frame->Xeptrb = eptrb;
514 frame->Xrdepth = rdepth;
515
516 /* This is where control jumps back to to effect "recursion" */
517
518 HEAP_RECURSE:
519
520 /* Macros make the argument variables come from the current frame */
521
522 #define eptr frame->Xeptr
523 #define ecode frame->Xecode
524 #define mstart frame->Xmstart
525 #define markptr frame->Xmarkptr
526 #define offset_top frame->Xoffset_top
527 #define eptrb frame->Xeptrb
528 #define rdepth frame->Xrdepth
529
530 /* Ditto for the local variables */
531
532 #ifdef SUPPORT_UTF8
533 #define charptr frame->Xcharptr
534 #endif
535 #define callpat frame->Xcallpat
536 #define codelink frame->Xcodelink
537 #define data frame->Xdata
538 #define next frame->Xnext
539 #define pp frame->Xpp
540 #define prev frame->Xprev
541 #define saved_eptr frame->Xsaved_eptr
542
543 #define new_recursive frame->Xnew_recursive
544
545 #define cur_is_word frame->Xcur_is_word
546 #define condition frame->Xcondition
547 #define prev_is_word frame->Xprev_is_word
548
549 #ifdef SUPPORT_UCP
550 #define prop_type frame->Xprop_type
551 #define prop_value frame->Xprop_value
552 #define prop_fail_result frame->Xprop_fail_result
553 #define prop_category frame->Xprop_category
554 #define prop_chartype frame->Xprop_chartype
555 #define prop_script frame->Xprop_script
556 #define oclength frame->Xoclength
557 #define occhars frame->Xocchars
558 #endif
559
560 #define ctype frame->Xctype
561 #define fc frame->Xfc
562 #define fi frame->Xfi
563 #define length frame->Xlength
564 #define max frame->Xmax
565 #define min frame->Xmin
566 #define number frame->Xnumber
567 #define offset frame->Xoffset
568 #define op frame->Xop
569 #define save_capture_last frame->Xsave_capture_last
570 #define save_offset1 frame->Xsave_offset1
571 #define save_offset2 frame->Xsave_offset2
572 #define save_offset3 frame->Xsave_offset3
573 #define stacksave frame->Xstacksave
574
575 #define newptrb frame->Xnewptrb
576
577 /* When recursion is being used, local variables are allocated on the stack and
578 get preserved during recursion in the normal way. In this environment, fi and
579 i, and fc and c, can be the same variables. */
580
581 #else /* NO_RECURSE not defined */
582 #define fi i
583 #define fc c
584
585 /* Many of the following variables are used only in small blocks of the code.
586 My normal style of coding would have declared them within each of those blocks.
587 However, in order to accommodate the version of this code that uses an external
588 "stack" implemented on the heap, it is easier to declare them all here, so the
589 declarations can be cut out in a block. The only declarations within blocks
590 below are for variables that do not have to be preserved over a recursive call
591 to RMATCH(). */
592
593 #ifdef SUPPORT_UTF8
594 const uschar *charptr;
595 #endif
596 const uschar *callpat;
597 const uschar *data;
598 const uschar *next;
599 USPTR pp;
600 const uschar *prev;
601 USPTR saved_eptr;
602
603 recursion_info new_recursive;
604
605 BOOL cur_is_word;
606 BOOL condition;
607 BOOL prev_is_word;
608
609 #ifdef SUPPORT_UCP
610 int prop_type;
611 int prop_value;
612 int prop_fail_result;
613 int prop_category;
614 int prop_chartype;
615 int prop_script;
616 int oclength;
617 uschar occhars[8];
618 #endif
619
620 int codelink;
621 int ctype;
622 int length;
623 int max;
624 int min;
625 int number;
626 int offset;
627 int op;
628 int save_capture_last;
629 int save_offset1, save_offset2, save_offset3;
630 int stacksave[REC_STACK_SAVE_MAX];
631
632 eptrblock newptrb;
633 #endif /* NO_RECURSE */
634
635 /* To save space on the stack and in the heap frame, I have doubled up on some
636 of the local variables that are used only in localised parts of the code, but
637 still need to be preserved over recursive calls of match(). These macros define
638 the alternative names that are used. */
639
640 #define allow_zero cur_is_word
641 #define cbegroup condition
642 #define code_offset codelink
643 #define condassert condition
644 #define matched_once prev_is_word
645
646 /* These statements are here to stop the compiler complaining about uninitialized
647 variables. */
648
649 #ifdef SUPPORT_UCP
650 prop_value = 0;
651 prop_fail_result = 0;
652 #endif
653
654
655 /* This label is used for tail recursion, which is used in a few cases even
656 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
657 used. Thanks to Ian Taylor for noticing this possibility and sending the
658 original patch. */
659
660 TAIL_RECURSE:
661
662 /* OK, now we can get on with the real code of the function. Recursive calls
663 are specified by the macro RMATCH and RRETURN is used to return. When
664 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
665 and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
666 defined). However, RMATCH isn't like a function call because it's quite a
667 complicated macro. It has to be used in one particular way. This shouldn't,
668 however, impact performance when true recursion is being used. */
669
670 #ifdef SUPPORT_UTF8
671 utf8 = md->utf8; /* Local copy of the flag */
672 #else
673 utf8 = FALSE;
674 #endif
675
676 /* First check that we haven't called match() too many times, or that we
677 haven't exceeded the recursive call limit. */
678
679 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
680 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
681
682 /* At the start of a group with an unlimited repeat that may match an empty
683 string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
684 done this way to save having to use another function argument, which would take
685 up space on the stack. See also MATCH_CONDASSERT below.
686
687 When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
688 such remembered pointers, to be checked when we hit the closing ket, in order
689 to break infinite loops that match no characters. When match() is called in
690 other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
691 NOT be used with tail recursion, because the memory block that is used is on
692 the stack, so a new one may be required for each match(). */
693
694 if (md->match_function_type == MATCH_CBEGROUP)
695 {
696 newptrb.epb_saved_eptr = eptr;
697 newptrb.epb_prev = eptrb;
698 eptrb = &newptrb;
699 md->match_function_type = 0;
700 }
701
702 /* Now start processing the opcodes. */
703
704 for (;;)
705 {
706 minimize = possessive = FALSE;
707 op = *ecode;
708
709 switch(op)
710 {
711 case OP_MARK:
712 markptr = ecode + 2;
713 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
714 eptrb, RM55);
715
716 /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
717 argument, and we must check whether that argument matches this MARK's
718 argument. It is passed back in md->start_match_ptr (an overloading of that
719 variable). If it does match, we reset that variable to the current subject
720 position and return MATCH_SKIP. Otherwise, pass back the return code
721 unaltered. */
722
723 if (rrc == MATCH_SKIP_ARG &&
724 strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
725 {
726 md->start_match_ptr = eptr;
727 RRETURN(MATCH_SKIP);
728 }
729
730 if (md->mark == NULL) md->mark = markptr;
731 RRETURN(rrc);
732
733 case OP_FAIL:
734 MRRETURN(MATCH_NOMATCH);
735
736 /* COMMIT overrides PRUNE, SKIP, and THEN */
737
738 case OP_COMMIT:
739 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
740 eptrb, RM52);
741 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
742 rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
743 rrc != MATCH_THEN)
744 RRETURN(rrc);
745 MRRETURN(MATCH_COMMIT);
746
747 /* PRUNE overrides THEN */
748
749 case OP_PRUNE:
750 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
751 eptrb, RM51);
752 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 MRRETURN(MATCH_PRUNE);
754
755 case OP_PRUNE_ARG:
756 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
757 eptrb, RM56);
758 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
759 md->mark = ecode + 2;
760 RRETURN(MATCH_PRUNE);
761
762 /* SKIP overrides PRUNE and THEN */
763
764 case OP_SKIP:
765 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
766 eptrb, RM53);
767 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
768 RRETURN(rrc);
769 md->start_match_ptr = eptr; /* Pass back current position */
770 MRRETURN(MATCH_SKIP);
771
772 case OP_SKIP_ARG:
773 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
774 eptrb, RM57);
775 if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776 RRETURN(rrc);
777
778 /* Pass back the current skip name by overloading md->start_match_ptr and
779 returning the special MATCH_SKIP_ARG return code. This will either be
780 caught by a matching MARK, or get to the top, where it is treated the same
781 as PRUNE. */
782
783 md->start_match_ptr = ecode + 2;
784 RRETURN(MATCH_SKIP_ARG);
785
786 /* For THEN (and THEN_ARG) we pass back the address of the bracket or
787 the alt that is at the start of the current branch. This makes it possible
788 to skip back past alternatives that precede the THEN within the current
789 branch. */
790
791 case OP_THEN:
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
793 eptrb, RM54);
794 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
795 md->start_match_ptr = ecode - GET(ecode, 1);
796 MRRETURN(MATCH_THEN);
797
798 case OP_THEN_ARG:
799 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
800 offset_top, md, eptrb, RM58);
801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
802 md->start_match_ptr = ecode - GET(ecode, 1);
803 md->mark = ecode + LINK_SIZE + 2;
804 RRETURN(MATCH_THEN);
805
806 /* Handle a capturing bracket, other than those that are possessive with an
807 unlimited repeat. If there is space in the offset vector, save the current
808 subject position in the working slot at the top of the vector. We mustn't
809 change the current values of the data slot, because they may be set from a
810 previous iteration of this group, and be referred to by a reference inside
811 the group. If we fail to match, we need to restore this value and also the
812 values of the final offsets, in case they were set by a previous iteration
813 of the same bracket.
814
815 If there isn't enough space in the offset vector, treat this as if it were
816 a non-capturing bracket. Don't worry about setting the flag for the error
817 case here; that is handled in the code for KET. */
818
819 case OP_CBRA:
820 case OP_SCBRA:
821 number = GET2(ecode, 1+LINK_SIZE);
822 offset = number << 1;
823
824 #ifdef PCRE_DEBUG
825 printf("start bracket %d\n", number);
826 printf("subject=");
827 pchars(eptr, 16, TRUE, md);
828 printf("\n");
829 #endif
830
831 if (offset < md->offset_max)
832 {
833 save_offset1 = md->offset_vector[offset];
834 save_offset2 = md->offset_vector[offset+1];
835 save_offset3 = md->offset_vector[md->offset_end - number];
836 save_capture_last = md->capture_last;
837
838 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
839 md->offset_vector[md->offset_end - number] =
840 (int)(eptr - md->start_subject);
841
842 for (;;)
843 {
844 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
845 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
846 eptrb, RM1);
847 if (rrc != MATCH_NOMATCH &&
848 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
849 RRETURN(rrc);
850 md->capture_last = save_capture_last;
851 ecode += GET(ecode, 1);
852 if (*ecode != OP_ALT) break;
853 }
854
855 DPRINTF(("bracket %d failed\n", number));
856
857 md->offset_vector[offset] = save_offset1;
858 md->offset_vector[offset+1] = save_offset2;
859 md->offset_vector[md->offset_end - number] = save_offset3;
860
861 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
862 RRETURN(MATCH_NOMATCH);
863 }
864
865 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
866 as a non-capturing bracket. */
867
868 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
869 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
870
871 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
872
873 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
874 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
875
876 /* Non-capturing bracket, except for possessive with unlimited repeat. Loop
877 for all the alternatives. When we get to the final alternative within the
878 brackets, we used to return the result of a recursive call to match()
879 whatever happened so it was possible to reduce stack usage by turning this
880 into a tail recursion, except in the case of a possibly empty group.
881 However, now that there is the possibility of (*THEN) occurring in the final
882 alternative, this optimization is no longer possible. */
883
884 case OP_BRA:
885 case OP_SBRA:
886 DPRINTF(("start non-capturing bracket\n"));
887 for (;;)
888 {
889 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
890 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, eptrb,
891 RM2);
892 if (rrc != MATCH_NOMATCH &&
893 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
894 RRETURN(rrc);
895 ecode += GET(ecode, 1);
896 if (*ecode != OP_ALT) break;
897 }
898
899 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
900 RRETURN(MATCH_NOMATCH);
901
902 /* Handle possessive capturing brackets with an unlimited repeat. We come
903 here from BRAPOSZERO with allow_zero set TRUE. The offset_vector values are
904 handled similarly to the normal case above. However, the matching is
905 different. The end of these brackets will always be OP_KETRPOS, which
906 returns MATCH_KETRPOS without going further in the pattern. By this means
907 we can handle the group by iteration rather than recursion, thereby
908 reducing the amount of stack needed. */
909
910 case OP_CBRAPOS:
911 case OP_SCBRAPOS:
912 allow_zero = FALSE;
913
914 POSSESSIVE_CAPTURE:
915 number = GET2(ecode, 1+LINK_SIZE);
916 offset = number << 1;
917
918 #ifdef PCRE_DEBUG
919 printf("start possessive bracket %d\n", number);
920 printf("subject=");
921 pchars(eptr, 16, TRUE, md);
922 printf("\n");
923 #endif
924
925 if (offset < md->offset_max)
926 {
927 matched_once = FALSE;
928 code_offset = ecode - md->start_code;
929
930 save_offset1 = md->offset_vector[offset];
931 save_offset2 = md->offset_vector[offset+1];
932 save_offset3 = md->offset_vector[md->offset_end - number];
933 save_capture_last = md->capture_last;
934
935 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
936
937 /* Each time round the loop, save the current subject position for use
938 when the group matches. For MATCH_MATCH, the group has matched, so we
939 restart it with a new subject starting position, remembering that we had
940 at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
941 usual. If we haven't matched any alternatives in any iteration, check to
942 see if a previous iteration matched. If so, the group has matched;
943 continue from afterwards. Otherwise it has failed; restore the previous
944 capture values before returning NOMATCH. */
945
946 for (;;)
947 {
948 md->offset_vector[md->offset_end - number] =
949 (int)(eptr - md->start_subject);
950 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
951 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
952 eptrb, RM63);
953 if (rrc == MATCH_KETRPOS)
954 {
955 offset_top = md->end_offset_top;
956 eptr = md->end_match_ptr;
957 ecode = md->start_code + code_offset;
958 save_capture_last = md->capture_last;
959 matched_once = TRUE;
960 continue;
961 }
962 if (rrc != MATCH_NOMATCH &&
963 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
964 RRETURN(rrc);
965 md->capture_last = save_capture_last;
966 ecode += GET(ecode, 1);
967 if (*ecode != OP_ALT) break;
968 }
969
970 if (!matched_once)
971 {
972 md->offset_vector[offset] = save_offset1;
973 md->offset_vector[offset+1] = save_offset2;
974 md->offset_vector[md->offset_end - number] = save_offset3;
975 }
976
977 if (rrc != MATCH_THEN && md->mark == NULL) md->mark = markptr;
978 if (allow_zero || matched_once)
979 {
980 ecode += 1 + LINK_SIZE;
981 break;
982 }
983
984 RRETURN(MATCH_NOMATCH);
985 }
986
987 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
988 as a non-capturing bracket. */
989
990 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
991 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
992
993 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
994
995 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
996 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
997
998 /* Non-capturing possessive bracket with unlimited repeat. We come here
999 from BRAPOSZERO with allow_zero = TRUE. The code is similar to the above,
1000 without the capturing complication. It is written out separately for speed
1001 and cleanliness. */
1002
1003 case OP_BRAPOS:
1004 case OP_SBRAPOS:
1005 allow_zero = FALSE;
1006
1007 POSSESSIVE_NON_CAPTURE:
1008 matched_once = FALSE;
1009 code_offset = ecode - md->start_code;
1010
1011 for (;;)
1012 {
1013 if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1014 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
1015 eptrb, RM48);
1016 if (rrc == MATCH_KETRPOS)
1017 {
1018 offset_top = md->end_offset_top;
1019 eptr = md->end_match_ptr;
1020 ecode = md->start_code + code_offset;
1021 matched_once = TRUE;
1022 continue;
1023 }
1024 if (rrc != MATCH_NOMATCH &&
1025 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1026 RRETURN(rrc);
1027 ecode += GET(ecode, 1);
1028 if (*ecode != OP_ALT) break;
1029 }
1030
1031 if (matched_once || allow_zero)
1032 {
1033 ecode += 1 + LINK_SIZE;
1034 break;
1035 }
1036 RRETURN(MATCH_NOMATCH);
1037
1038 /* Control never reaches here. */
1039
1040 /* Conditional group: compilation checked that there are no more than
1041 two branches. If the condition is false, skipping the first branch takes us
1042 past the end if there is only one branch, but that's OK because that is
1043 exactly what going to the ket would do. */
1044
1045 case OP_COND:
1046 case OP_SCOND:
1047 codelink = GET(ecode, 1);
1048
1049 /* Because of the way auto-callout works during compile, a callout item is
1050 inserted between OP_COND and an assertion condition. */
1051
1052 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1053 {
1054 if (pcre_callout != NULL)
1055 {
1056 pcre_callout_block cb;
1057 cb.version = 1; /* Version 1 of the callout block */
1058 cb.callout_number = ecode[LINK_SIZE+2];
1059 cb.offset_vector = md->offset_vector;
1060 cb.subject = (PCRE_SPTR)md->start_subject;
1061 cb.subject_length = (int)(md->end_subject - md->start_subject);
1062 cb.start_match = (int)(mstart - md->start_subject);
1063 cb.current_position = (int)(eptr - md->start_subject);
1064 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1065 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1066 cb.capture_top = offset_top/2;
1067 cb.capture_last = md->capture_last;
1068 cb.callout_data = md->callout_data;
1069 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1070 if (rrc < 0) RRETURN(rrc);
1071 }
1072 ecode += _pcre_OP_lengths[OP_CALLOUT];
1073 }
1074
1075 condcode = ecode[LINK_SIZE+1];
1076
1077 /* Now see what the actual condition is */
1078
1079 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1080 {
1081 if (md->recursive == NULL) /* Not recursing => FALSE */
1082 {
1083 condition = FALSE;
1084 ecode += GET(ecode, 1);
1085 }
1086 else
1087 {
1088 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1089 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1090
1091 /* If the test is for recursion into a specific subpattern, and it is
1092 false, but the test was set up by name, scan the table to see if the
1093 name refers to any other numbers, and test them. The condition is true
1094 if any one is set. */
1095
1096 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
1097 {
1098 uschar *slotA = md->name_table;
1099 for (i = 0; i < md->name_count; i++)
1100 {
1101 if (GET2(slotA, 0) == recno) break;
1102 slotA += md->name_entry_size;
1103 }
1104
1105 /* Found a name for the number - there can be only one; duplicate
1106 names for different numbers are allowed, but not vice versa. First
1107 scan down for duplicates. */
1108
1109 if (i < md->name_count)
1110 {
1111 uschar *slotB = slotA;
1112 while (slotB > md->name_table)
1113 {
1114 slotB -= md->name_entry_size;
1115 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1116 {
1117 condition = GET2(slotB, 0) == md->recursive->group_num;
1118 if (condition) break;
1119 }
1120 else break;
1121 }
1122
1123 /* Scan up for duplicates */
1124
1125 if (!condition)
1126 {
1127 slotB = slotA;
1128 for (i++; i < md->name_count; i++)
1129 {
1130 slotB += md->name_entry_size;
1131 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1132 {
1133 condition = GET2(slotB, 0) == md->recursive->group_num;
1134 if (condition) break;
1135 }
1136 else break;
1137 }
1138 }
1139 }
1140 }
1141
1142 /* Choose branch according to the condition */
1143
1144 ecode += condition? 3 : GET(ecode, 1);
1145 }
1146 }
1147
1148 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1149 {
1150 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1151 condition = offset < offset_top && md->offset_vector[offset] >= 0;
1152
1153 /* If the numbered capture is unset, but the reference was by name,
1154 scan the table to see if the name refers to any other numbers, and test
1155 them. The condition is true if any one is set. This is tediously similar
1156 to the code above, but not close enough to try to amalgamate. */
1157
1158 if (!condition && condcode == OP_NCREF)
1159 {
1160 int refno = offset >> 1;
1161 uschar *slotA = md->name_table;
1162
1163 for (i = 0; i < md->name_count; i++)
1164 {
1165 if (GET2(slotA, 0) == refno) break;
1166 slotA += md->name_entry_size;
1167 }
1168
1169 /* Found a name for the number - there can be only one; duplicate names
1170 for different numbers are allowed, but not vice versa. First scan down
1171 for duplicates. */
1172
1173 if (i < md->name_count)
1174 {
1175 uschar *slotB = slotA;
1176 while (slotB > md->name_table)
1177 {
1178 slotB -= md->name_entry_size;
1179 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1180 {
1181 offset = GET2(slotB, 0) << 1;
1182 condition = offset < offset_top &&
1183 md->offset_vector[offset] >= 0;
1184 if (condition) break;
1185 }
1186 else break;
1187 }
1188
1189 /* Scan up for duplicates */
1190
1191 if (!condition)
1192 {
1193 slotB = slotA;
1194 for (i++; i < md->name_count; i++)
1195 {
1196 slotB += md->name_entry_size;
1197 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1198 {
1199 offset = GET2(slotB, 0) << 1;
1200 condition = offset < offset_top &&
1201 md->offset_vector[offset] >= 0;
1202 if (condition) break;
1203 }
1204 else break;
1205 }
1206 }
1207 }
1208 }
1209
1210 /* Choose branch according to the condition */
1211
1212 ecode += condition? 3 : GET(ecode, 1);
1213 }
1214
1215 else if (condcode == OP_DEF) /* DEFINE - always false */
1216 {
1217 condition = FALSE;
1218 ecode += GET(ecode, 1);
1219 }
1220
1221 /* The condition is an assertion. Call match() to evaluate it - setting
1222 md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
1223 an assertion. */
1224
1225 else
1226 {
1227 md->match_function_type = MATCH_CONDASSERT;
1228 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1229 if (rrc == MATCH_MATCH)
1230 {
1231 condition = TRUE;
1232 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1233 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1234 }
1235 else if (rrc != MATCH_NOMATCH &&
1236 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1237 {
1238 RRETURN(rrc); /* Need braces because of following else */
1239 }
1240 else
1241 {
1242 condition = FALSE;
1243 ecode += codelink;
1244 }
1245 }
1246
1247 /* We are now at the branch that is to be obeyed. As there is only one,
1248 we used to use tail recursion to avoid using another stack frame, except
1249 when there was unlimited repeat of a possibly empty group. However, that
1250 strategy no longer works because of the possibility of (*THEN) being
1251 encountered in the branch. A recursive call to match() is always required,
1252 unless the second alternative doesn't exist, in which case we can just
1253 plough on. */
1254
1255 if (condition || *ecode == OP_ALT)
1256 {
1257 if (op == OP_SCOND) md->match_function_type = MATCH_CBEGROUP;
1258 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
1259 if (rrc == MATCH_THEN && md->start_match_ptr == ecode)
1260 rrc = MATCH_NOMATCH;
1261 RRETURN(rrc);
1262 }
1263 else /* Condition false & no alternative */
1264 {
1265 ecode += 1 + LINK_SIZE;
1266 }
1267 break;
1268
1269
1270 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1271 to close any currently open capturing brackets. */
1272
1273 case OP_CLOSE:
1274 number = GET2(ecode, 1);
1275 offset = number << 1;
1276
1277 #ifdef PCRE_DEBUG
1278 printf("end bracket %d at *ACCEPT", number);
1279 printf("\n");
1280 #endif
1281
1282 md->capture_last = number;
1283 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1284 {
1285 md->offset_vector[offset] =
1286 md->offset_vector[md->offset_end - number];
1287 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1288 if (offset_top <= offset) offset_top = offset + 2;
1289 }
1290 ecode += 3;
1291 break;
1292
1293
1294 /* End of the pattern, either real or forced. If we are in a recursion, we
1295 should restore the offsets appropriately, and if it's a top-level
1296 recursion, continue from after the call. */
1297
1298 case OP_ACCEPT:
1299 case OP_ASSERT_ACCEPT:
1300 case OP_END:
1301 if (md->recursive != NULL)
1302 {
1303 recursion_info *rec = md->recursive;
1304 md->recursive = rec->prevrec;
1305 memmove(md->offset_vector, rec->offset_save,
1306 rec->saved_max * sizeof(int));
1307 offset_top = rec->save_offset_top;
1308 if (rec->group_num == 0)
1309 {
1310 ecode = rec->after_call;
1311 break;
1312 }
1313 }
1314
1315 /* Otherwise, if we have matched an empty string, fail if not in an
1316 assertion and if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1317 is set and we have matched at the start of the subject. In both cases,
1318 backtracking will then try other alternatives, if any. */
1319
1320 else if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1321 (md->notempty ||
1322 (md->notempty_atstart &&
1323 mstart == md->start_subject + md->start_offset)))
1324 MRRETURN(MATCH_NOMATCH);
1325
1326 /* Otherwise, we have a match. */
1327
1328 md->end_match_ptr = eptr; /* Record where we ended */
1329 md->end_offset_top = offset_top; /* and how many extracts were taken */
1330 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1331
1332 /* For some reason, the macros don't work properly if an expression is
1333 given as the argument to MRRETURN when the heap is in use. */
1334
1335 rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1336 MRRETURN(rrc);
1337
1338 /* Assertion brackets. Check the alternative branches in turn - the
1339 matching won't pass the KET for an assertion. If any one branch matches,
1340 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1341 start of each branch to move the current point backwards, so the code at
1342 this level is identical to the lookahead case. When the assertion is part
1343 of a condition, we want to return immediately afterwards. The caller of
1344 this incarnation of the match() function will have set MATCH_CONDASSERT in
1345 md->match_function_type, and one of these opcodes will be the first opcode
1346 that is processed. We use a local variable that is preserved over calls to
1347 match() to remember this case. */
1348
1349 case OP_ASSERT:
1350 case OP_ASSERTBACK:
1351 if (md->match_function_type == MATCH_CONDASSERT)
1352 {
1353 condassert = TRUE;
1354 md->match_function_type = 0;
1355 }
1356 else condassert = FALSE;
1357
1358 do
1359 {
1360 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1361 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1362 {
1363 mstart = md->start_match_ptr; /* In case \K reset it */
1364 break;
1365 }
1366 if (rrc != MATCH_NOMATCH &&
1367 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1368 RRETURN(rrc);
1369 ecode += GET(ecode, 1);
1370 }
1371 while (*ecode == OP_ALT);
1372
1373 if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1374
1375 /* If checking an assertion for a condition, return MATCH_MATCH. */
1376
1377 if (condassert) RRETURN(MATCH_MATCH);
1378
1379 /* Continue from after the assertion, updating the offsets high water
1380 mark, since extracts may have been taken during the assertion. */
1381
1382 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1383 ecode += 1 + LINK_SIZE;
1384 offset_top = md->end_offset_top;
1385 continue;
1386
1387 /* Negative assertion: all branches must fail to match. Encountering SKIP,
1388 PRUNE, or COMMIT means we must assume failure without checking subsequent
1389 branches. */
1390
1391 case OP_ASSERT_NOT:
1392 case OP_ASSERTBACK_NOT:
1393 if (md->match_function_type == MATCH_CONDASSERT)
1394 {
1395 condassert = TRUE;
1396 md->match_function_type = 0;
1397 }
1398 else condassert = FALSE;
1399
1400 do
1401 {
1402 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1403 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1404 if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1405 {
1406 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1407 break;
1408 }
1409 if (rrc != MATCH_NOMATCH &&
1410 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1411 RRETURN(rrc);
1412 ecode += GET(ecode,1);
1413 }
1414 while (*ecode == OP_ALT);
1415
1416 if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1417
1418 ecode += 1 + LINK_SIZE;
1419 continue;
1420
1421 /* Move the subject pointer back. This occurs only at the start of
1422 each branch of a lookbehind assertion. If we are too close to the start to
1423 move back, this match function fails. When working with UTF-8 we move
1424 back a number of characters, not bytes. */
1425
1426 case OP_REVERSE:
1427 #ifdef SUPPORT_UTF8
1428 if (utf8)
1429 {
1430 i = GET(ecode, 1);
1431 while (i-- > 0)
1432 {
1433 eptr--;
1434 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1435 BACKCHAR(eptr);
1436 }
1437 }
1438 else
1439 #endif
1440
1441 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1442
1443 {
1444 eptr -= GET(ecode, 1);
1445 if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1446 }
1447
1448 /* Save the earliest consulted character, then skip to next op code */
1449
1450 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1451 ecode += 1 + LINK_SIZE;
1452 break;
1453
1454 /* The callout item calls an external function, if one is provided, passing
1455 details of the match so far. This is mainly for debugging, though the
1456 function is able to force a failure. */
1457
1458 case OP_CALLOUT:
1459 if (pcre_callout != NULL)
1460 {
1461 pcre_callout_block cb;
1462 cb.version = 1; /* Version 1 of the callout block */
1463 cb.callout_number = ecode[1];
1464 cb.offset_vector = md->offset_vector;
1465 cb.subject = (PCRE_SPTR)md->start_subject;
1466 cb.subject_length = (int)(md->end_subject - md->start_subject);
1467 cb.start_match = (int)(mstart - md->start_subject);
1468 cb.current_position = (int)(eptr - md->start_subject);
1469 cb.pattern_position = GET(ecode, 2);
1470 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1471 cb.capture_top = offset_top/2;
1472 cb.capture_last = md->capture_last;
1473 cb.callout_data = md->callout_data;
1474 if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1475 if (rrc < 0) RRETURN(rrc);
1476 }
1477 ecode += 2 + 2*LINK_SIZE;
1478 break;
1479
1480 /* Recursion either matches the current regex, or some subexpression. The
1481 offset data is the offset to the starting bracket from the start of the
1482 whole pattern. (This is so that it works from duplicated subpatterns.)
1483
1484 If there are any capturing brackets started but not finished, we have to
1485 save their starting points and reinstate them after the recursion. However,
1486 we don't know how many such there are (offset_top records the completed
1487 total) so we just have to save all the potential data. There may be up to
1488 65535 such values, which is too large to put on the stack, but using malloc
1489 for small numbers seems expensive. As a compromise, the stack is used when
1490 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1491 is used. A problem is what to do if the malloc fails ... there is no way of
1492 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1493 values on the stack, and accept that the rest may be wrong.
1494
1495 There are also other values that have to be saved. We use a chained
1496 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1497 for the original version of this logic. */
1498
1499 case OP_RECURSE:
1500 {
1501 callpat = md->start_code + GET(ecode, 1);
1502 new_recursive.group_num = (callpat == md->start_code)? 0 :
1503 GET2(callpat, 1 + LINK_SIZE);
1504
1505 /* Add to "recursing stack" */
1506
1507 new_recursive.prevrec = md->recursive;
1508 md->recursive = &new_recursive;
1509
1510 /* Find where to continue from afterwards */
1511
1512 ecode += 1 + LINK_SIZE;
1513 new_recursive.after_call = ecode;
1514
1515 /* Now save the offset data. */
1516
1517 new_recursive.saved_max = md->offset_end;
1518 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1519 new_recursive.offset_save = stacksave;
1520 else
1521 {
1522 new_recursive.offset_save =
1523 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1524 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1525 }
1526
1527 memcpy(new_recursive.offset_save, md->offset_vector,
1528 new_recursive.saved_max * sizeof(int));
1529 new_recursive.save_offset_top = offset_top;
1530
1531 /* OK, now we can do the recursion. For each top-level alternative we
1532 restore the offset and recursion data. */
1533
1534 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1535 cbegroup = (*callpat >= OP_SBRA);
1536 do
1537 {
1538 if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1539 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1540 md, eptrb, RM6);
1541 if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1542 {
1543 DPRINTF(("Recursion matched\n"));
1544 md->recursive = new_recursive.prevrec;
1545 if (new_recursive.offset_save != stacksave)
1546 (pcre_free)(new_recursive.offset_save);
1547 MRRETURN(MATCH_MATCH);
1548 }
1549 else if (rrc != MATCH_NOMATCH &&
1550 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1551 {
1552 DPRINTF(("Recursion gave error %d\n", rrc));
1553 if (new_recursive.offset_save != stacksave)
1554 (pcre_free)(new_recursive.offset_save);
1555 RRETURN(rrc);
1556 }
1557
1558 md->recursive = &new_recursive;
1559 memcpy(md->offset_vector, new_recursive.offset_save,
1560 new_recursive.saved_max * sizeof(int));
1561 callpat += GET(callpat, 1);
1562 }
1563 while (*callpat == OP_ALT);
1564
1565 DPRINTF(("Recursion didn't match\n"));
1566 md->recursive = new_recursive.prevrec;
1567 if (new_recursive.offset_save != stacksave)
1568 (pcre_free)(new_recursive.offset_save);
1569 MRRETURN(MATCH_NOMATCH);
1570 }
1571 /* Control never reaches here */
1572
1573 /* "Once" brackets are like assertion brackets except that after a match,
1574 the point in the subject string is not moved back. Thus there can never be
1575 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1576 Check the alternative branches in turn - the matching won't pass the KET
1577 for this kind of subpattern. If any one branch matches, we carry on as at
1578 the end of a normal bracket, leaving the subject pointer, but resetting
1579 the start-of-match value in case it was changed by \K. */
1580
1581 case OP_ONCE:
1582 prev = ecode;
1583 saved_eptr = eptr;
1584
1585 do
1586 {
1587 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1588 if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1589 {
1590 mstart = md->start_match_ptr;
1591 break;
1592 }
1593 if (rrc != MATCH_NOMATCH &&
1594 (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1595 RRETURN(rrc);
1596 ecode += GET(ecode,1);
1597 }
1598 while (*ecode == OP_ALT);
1599
1600 /* If we hit the end of the group (which could be repeated), fail */
1601
1602 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1603
1604 /* Continue after the group, updating the offsets high water mark, since
1605 extracts may have been taken. */
1606
1607 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1608
1609 offset_top = md->end_offset_top;
1610 eptr = md->end_match_ptr;
1611
1612 /* For a non-repeating ket, just continue at this level. This also
1613 happens for a repeating ket if no characters were matched in the group.
1614 This is the forcible breaking of infinite loops as implemented in Perl
1615 5.005. If there is an options reset, it will get obeyed in the normal
1616 course of events. */
1617
1618 if (*ecode == OP_KET || eptr == saved_eptr)
1619 {
1620 ecode += 1+LINK_SIZE;
1621 break;
1622 }
1623
1624 /* The repeating kets try the rest of the pattern or restart from the
1625 preceding bracket, in the appropriate order. The second "call" of match()
1626 uses tail recursion, to avoid using another stack frame. */
1627
1628 if (*ecode == OP_KETRMIN)
1629 {
1630 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM8);
1631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1632 ecode = prev;
1633 goto TAIL_RECURSE;
1634 }
1635 else /* OP_KETRMAX */
1636 {
1637 md->match_function_type = MATCH_CBEGROUP;
1638 RMATCH(eptr, prev, offset_top, md, eptrb, RM9);
1639 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1640 ecode += 1 + LINK_SIZE;
1641 goto TAIL_RECURSE;
1642 }
1643 /* Control never gets here */
1644
1645 /* An alternation is the end of a branch; scan along to find the end of the
1646 bracketed group and go to there. */
1647
1648 case OP_ALT:
1649 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1650 break;
1651
1652 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1653 indicating that it may occur zero times. It may repeat infinitely, or not
1654 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1655 with fixed upper repeat limits are compiled as a number of copies, with the
1656 optional ones preceded by BRAZERO or BRAMINZERO. */
1657
1658 case OP_BRAZERO:
1659 next = ecode + 1;
1660 RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1662 do next += GET(next, 1); while (*next == OP_ALT);
1663 ecode = next + 1 + LINK_SIZE;
1664 break;
1665
1666 case OP_BRAMINZERO:
1667 next = ecode + 1;
1668 do next += GET(next, 1); while (*next == OP_ALT);
1669 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1670 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1671 ecode++;
1672 break;
1673
1674 case OP_SKIPZERO:
1675 next = ecode+1;
1676 do next += GET(next,1); while (*next == OP_ALT);
1677 ecode = next + 1 + LINK_SIZE;
1678 break;
1679
1680 /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1681 here; just jump to the group, with allow_zero set TRUE. */
1682
1683 case OP_BRAPOSZERO:
1684 op = *(++ecode);
1685 allow_zero = TRUE;
1686 if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1687 goto POSSESSIVE_NON_CAPTURE;
1688
1689 /* End of a group, repeated or non-repeating. */
1690
1691 case OP_KET:
1692 case OP_KETRMIN:
1693 case OP_KETRMAX:
1694 case OP_KETRPOS:
1695 prev = ecode - GET(ecode, 1);
1696
1697 /* If this was a group that remembered the subject start, in order to break
1698 infinite repeats of empty string matches, retrieve the subject start from
1699 the chain. Otherwise, set it NULL. */
1700
1701 if (*prev >= OP_SBRA)
1702 {
1703 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1704 eptrb = eptrb->epb_prev; /* Backup to previous group */
1705 }
1706 else saved_eptr = NULL;
1707
1708 /* If we are at the end of an assertion group or an atomic group, stop
1709 matching and return MATCH_MATCH, but record the current high water mark for
1710 use by positive assertions. We also need to record the match start in case
1711 it was changed by \K. */
1712
1713 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1714 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1715 *prev == OP_ONCE)
1716 {
1717 md->end_match_ptr = eptr; /* For ONCE */
1718 md->end_offset_top = offset_top;
1719 md->start_match_ptr = mstart;
1720 MRRETURN(MATCH_MATCH);
1721 }
1722
1723 /* For capturing groups we have to check the group number back at the start
1724 and if necessary complete handling an extraction by setting the offsets and
1725 bumping the high water mark. Note that whole-pattern recursion is coded as
1726 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1727 when the OP_END is reached. Other recursion is handled here. */
1728
1729 if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1730 *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1731 {
1732 number = GET2(prev, 1+LINK_SIZE);
1733 offset = number << 1;
1734
1735 #ifdef PCRE_DEBUG
1736 printf("end bracket %d", number);
1737 printf("\n");
1738 #endif
1739
1740 md->capture_last = number;
1741 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1742 {
1743 /* If offset is greater than offset_top, it means that we are
1744 "skipping" a capturing group, and that group's offsets must be marked
1745 unset. In earlier versions of PCRE, all the offsets were unset at the
1746 start of matching, but this doesn't work because atomic groups and
1747 assertions can cause a value to be set that should later be unset.
1748 Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1749 part of the atomic group, but this is not on the final matching path,
1750 so must be unset when 2 is set. (If there is no group 2, there is no
1751 problem, because offset_top will then be 2, indicating no capture.) */
1752
1753 if (offset > offset_top)
1754 {
1755 register int *iptr = md->offset_vector + offset_top;
1756 register int *iend = md->offset_vector + offset;
1757 while (iptr < iend) *iptr++ = -1;
1758 }
1759
1760 /* Now make the extraction */
1761
1762 md->offset_vector[offset] =
1763 md->offset_vector[md->offset_end - number];
1764 md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1765 if (offset_top <= offset) offset_top = offset + 2;
1766 }
1767
1768 /* Handle a recursively called group. Restore the offsets
1769 appropriately and continue from after the call. */
1770
1771 if (md->recursive != NULL && md->recursive->group_num == number)
1772 {
1773 recursion_info *rec = md->recursive;
1774 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1775 md->recursive = rec->prevrec;
1776 memcpy(md->offset_vector, rec->offset_save,
1777 rec->saved_max * sizeof(int));
1778 offset_top = rec->save_offset_top;
1779 ecode = rec->after_call;
1780 break;
1781 }
1782 }
1783
1784 /* For a non-repeating ket, just continue at this level. This also
1785 happens for a repeating ket if no characters were matched in the group.
1786 This is the forcible breaking of infinite loops as implemented in Perl
1787 5.005. If there is an options reset, it will get obeyed in the normal
1788 course of events. */
1789
1790 if (*ecode == OP_KET || eptr == saved_eptr)
1791 {
1792 ecode += 1 + LINK_SIZE;
1793 break;
1794 }
1795
1796 /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
1797 and return the MATCH_KETRPOS. This makes it possible to do the repeats one
1798 at a time from the outer level, thus saving stack. */
1799
1800 if (*ecode == OP_KETRPOS)
1801 {
1802 md->end_match_ptr = eptr;
1803 md->end_offset_top = offset_top;
1804 RRETURN(MATCH_KETRPOS);
1805 }
1806
1807 /* The normal repeating kets try the rest of the pattern or restart from
1808 the preceding bracket, in the appropriate order. In the second case, we can
1809 use tail recursion to avoid using another stack frame, unless we have an
1810 unlimited repeat of a group that can match an empty string. */
1811
1812 if (*ecode == OP_KETRMIN)
1813 {
1814 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
1815 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1816 if (*prev >= OP_SBRA) /* Could match an empty string */
1817 {
1818 md->match_function_type = MATCH_CBEGROUP;
1819 RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1820 RRETURN(rrc);
1821 }
1822 ecode = prev;
1823 goto TAIL_RECURSE;
1824 }
1825 else /* OP_KETRMAX */
1826 {
1827 if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1828 RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
1829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1830 ecode += 1 + LINK_SIZE;
1831 goto TAIL_RECURSE;
1832 }
1833 /* Control never gets here */
1834
1835 /* Not multiline mode: start of subject assertion, unless notbol. */
1836
1837 case OP_CIRC:
1838 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1839
1840 /* Start of subject assertion */
1841
1842 case OP_SOD:
1843 if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1844 ecode++;
1845 break;
1846
1847 /* Multiline mode: start of subject unless notbol, or after any newline. */
1848
1849 case OP_CIRCM:
1850 if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1851 if (eptr != md->start_subject &&
1852 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1853 MRRETURN(MATCH_NOMATCH);
1854 ecode++;
1855 break;
1856
1857 /* Start of match assertion */
1858
1859 case OP_SOM:
1860 if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1861 ecode++;
1862 break;
1863
1864 /* Reset the start of match point */
1865
1866 case OP_SET_SOM:
1867 mstart = eptr;
1868 ecode++;
1869 break;
1870
1871 /* Multiline mode: assert before any newline, or before end of subject
1872 unless noteol is set. */
1873
1874 case OP_DOLLM:
1875 if (eptr < md->end_subject)
1876 { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1877 else
1878 {
1879 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1880 SCHECK_PARTIAL();
1881 }
1882 ecode++;
1883 break;
1884
1885 /* Not multiline mode: assert before a terminating newline or before end of
1886 subject unless noteol is set. */
1887
1888 case OP_DOLL:
1889 if (md->noteol) MRRETURN(MATCH_NOMATCH);
1890 if (!md->endonly) goto ASSERT_NL_OR_EOS;
1891
1892 /* ... else fall through for endonly */
1893
1894 /* End of subject assertion (\z) */
1895
1896 case OP_EOD:
1897 if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1898 SCHECK_PARTIAL();
1899 ecode++;
1900 break;
1901
1902 /* End of subject or ending \n assertion (\Z) */
1903
1904 case OP_EODN:
1905 ASSERT_NL_OR_EOS:
1906 if (eptr < md->end_subject &&
1907 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1908 MRRETURN(MATCH_NOMATCH);
1909
1910 /* Either at end of string or \n before end. */
1911
1912 SCHECK_PARTIAL();
1913 ecode++;
1914 break;
1915
1916 /* Word boundary assertions */
1917
1918 case OP_NOT_WORD_BOUNDARY:
1919 case OP_WORD_BOUNDARY:
1920 {
1921
1922 /* Find out if the previous and current characters are "word" characters.
1923 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1924 be "non-word" characters. Remember the earliest consulted character for
1925 partial matching. */
1926
1927 #ifdef SUPPORT_UTF8
1928 if (utf8)
1929 {
1930 /* Get status of previous character */
1931
1932 if (eptr == md->start_subject) prev_is_word = FALSE; else
1933 {
1934 USPTR lastptr = eptr - 1;
1935 while((*lastptr & 0xc0) == 0x80) lastptr--;
1936 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1937 GETCHAR(c, lastptr);
1938 #ifdef SUPPORT_UCP
1939 if (md->use_ucp)
1940 {
1941 if (c == '_') prev_is_word = TRUE; else
1942 {
1943 int cat = UCD_CATEGORY(c);
1944 prev_is_word = (cat == ucp_L || cat == ucp_N);
1945 }
1946 }
1947 else
1948 #endif
1949 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1950 }
1951
1952 /* Get status of next character */
1953
1954 if (eptr >= md->end_subject)
1955 {
1956 SCHECK_PARTIAL();
1957 cur_is_word = FALSE;
1958 }
1959 else
1960 {
1961 GETCHAR(c, eptr);
1962 #ifdef SUPPORT_UCP
1963 if (md->use_ucp)
1964 {
1965 if (c == '_') cur_is_word = TRUE; else
1966 {
1967 int cat = UCD_CATEGORY(c);
1968 cur_is_word = (cat == ucp_L || cat == ucp_N);
1969 }
1970 }
1971 else
1972 #endif
1973 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1974 }
1975 }
1976 else
1977 #endif
1978
1979 /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1980 consistency with the behaviour of \w we do use it in this case. */
1981
1982 {
1983 /* Get status of previous character */
1984
1985 if (eptr == md->start_subject) prev_is_word = FALSE; else
1986 {
1987 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1988 #ifdef SUPPORT_UCP
1989 if (md->use_ucp)
1990 {
1991 c = eptr[-1];
1992 if (c == '_') prev_is_word = TRUE; else
1993 {
1994 int cat = UCD_CATEGORY(c);
1995 prev_is_word = (cat == ucp_L || cat == ucp_N);
1996 }
1997 }
1998 else
1999 #endif
2000 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
2001 }
2002
2003 /* Get status of next character */
2004
2005 if (eptr >= md->end_subject)
2006 {
2007 SCHECK_PARTIAL();
2008 cur_is_word = FALSE;
2009 }
2010 else
2011 #ifdef SUPPORT_UCP
2012 if (md->use_ucp)
2013 {
2014 c = *eptr;
2015 if (c == '_') cur_is_word = TRUE; else
2016 {
2017 int cat = UCD_CATEGORY(c);
2018 cur_is_word = (cat == ucp_L || cat == ucp_N);
2019 }
2020 }
2021 else
2022 #endif
2023 cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
2024 }
2025
2026 /* Now see if the situation is what we want */
2027
2028 if ((*ecode++ == OP_WORD_BOUNDARY)?
2029 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
2030 MRRETURN(MATCH_NOMATCH);
2031 }
2032 break;
2033
2034 /* Match a single character type; inline for speed */
2035
2036 case OP_ANY:
2037 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
2038 /* Fall through */
2039
2040 case OP_ALLANY:
2041 if (eptr++ >= md->end_subject)
2042 {
2043 SCHECK_PARTIAL();
2044 MRRETURN(MATCH_NOMATCH);
2045 }
2046 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2047 ecode++;
2048 break;
2049
2050 /* Match a single byte, even in UTF-8 mode. This opcode really does match
2051 any byte, even newline, independent of the setting of PCRE_DOTALL. */
2052
2053 case OP_ANYBYTE:
2054 if (eptr++ >= md->end_subject)
2055 {
2056 SCHECK_PARTIAL();
2057 MRRETURN(MATCH_NOMATCH);
2058 }
2059 ecode++;
2060 break;
2061
2062 case OP_NOT_DIGIT:
2063 if (eptr >= md->end_subject)
2064 {
2065 SCHECK_PARTIAL();
2066 MRRETURN(MATCH_NOMATCH);
2067 }
2068 GETCHARINCTEST(c, eptr);
2069 if (
2070 #ifdef SUPPORT_UTF8
2071 c < 256 &&
2072 #endif
2073 (md->ctypes[c] & ctype_digit) != 0
2074 )
2075 MRRETURN(MATCH_NOMATCH);
2076 ecode++;
2077 break;
2078
2079 case OP_DIGIT:
2080 if (eptr >= md->end_subject)
2081 {
2082 SCHECK_PARTIAL();
2083 MRRETURN(MATCH_NOMATCH);
2084 }
2085 GETCHARINCTEST(c, eptr);
2086 if (
2087 #ifdef SUPPORT_UTF8
2088 c >= 256 ||
2089 #endif
2090 (md->ctypes[c] & ctype_digit) == 0
2091 )
2092 MRRETURN(MATCH_NOMATCH);
2093 ecode++;
2094 break;
2095
2096 case OP_NOT_WHITESPACE:
2097 if (eptr >= md->end_subject)
2098 {
2099 SCHECK_PARTIAL();
2100 MRRETURN(MATCH_NOMATCH);
2101 }
2102 GETCHARINCTEST(c, eptr);
2103 if (
2104 #ifdef SUPPORT_UTF8
2105 c < 256 &&
2106 #endif
2107 (md->ctypes[c] & ctype_space) != 0
2108 )
2109 MRRETURN(MATCH_NOMATCH);
2110 ecode++;
2111 break;
2112
2113 case OP_WHITESPACE:
2114 if (eptr >= md->end_subject)
2115 {
2116 SCHECK_PARTIAL();
2117 MRRETURN(MATCH_NOMATCH);
2118 }
2119 GETCHARINCTEST(c, eptr);
2120 if (
2121 #ifdef SUPPORT_UTF8
2122 c >= 256 ||
2123 #endif
2124 (md->ctypes[c] & ctype_space) == 0
2125 )
2126 MRRETURN(MATCH_NOMATCH);
2127 ecode++;
2128 break;
2129
2130 case OP_NOT_WORDCHAR:
2131 if (eptr >= md->end_subject)
2132 {
2133 SCHECK_PARTIAL();
2134 MRRETURN(MATCH_NOMATCH);
2135 }
2136 GETCHARINCTEST(c, eptr);
2137 if (
2138 #ifdef SUPPORT_UTF8
2139 c < 256 &&
2140 #endif
2141 (md->ctypes[c] & ctype_word) != 0
2142 )
2143 MRRETURN(MATCH_NOMATCH);
2144 ecode++;
2145 break;
2146
2147 case OP_WORDCHAR:
2148 if (eptr >= md->end_subject)
2149 {
2150 SCHECK_PARTIAL();
2151 MRRETURN(MATCH_NOMATCH);
2152 }
2153 GETCHARINCTEST(c, eptr);
2154 if (
2155 #ifdef SUPPORT_UTF8
2156 c >= 256 ||
2157 #endif
2158 (md->ctypes[c] & ctype_word) == 0
2159 )
2160 MRRETURN(MATCH_NOMATCH);
2161 ecode++;
2162 break;
2163
2164 case OP_ANYNL:
2165 if (eptr >= md->end_subject)
2166 {
2167 SCHECK_PARTIAL();
2168 MRRETURN(MATCH_NOMATCH);
2169 }
2170 GETCHARINCTEST(c, eptr);
2171 switch(c)
2172 {
2173 default: MRRETURN(MATCH_NOMATCH);
2174
2175 case 0x000d:
2176 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2177 break;
2178
2179 case 0x000a:
2180 break;
2181
2182 case 0x000b:
2183 case 0x000c:
2184 case 0x0085:
2185 case 0x2028:
2186 case 0x2029:
2187 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
2188 break;
2189 }
2190 ecode++;
2191 break;
2192
2193 case OP_NOT_HSPACE:
2194 if (eptr >= md->end_subject)
2195 {
2196 SCHECK_PARTIAL();
2197 MRRETURN(MATCH_NOMATCH);
2198 }
2199 GETCHARINCTEST(c, eptr);
2200 switch(c)
2201 {
2202 default: break;
2203 case 0x09: /* HT */
2204 case 0x20: /* SPACE */
2205 case 0xa0: /* NBSP */
2206 case 0x1680: /* OGHAM SPACE MARK */
2207 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2208 case 0x2000: /* EN QUAD */
2209 case 0x2001: /* EM QUAD */
2210 case 0x2002: /* EN SPACE */
2211 case 0x2003: /* EM SPACE */
2212 case 0x2004: /* THREE-PER-EM SPACE */
2213 case 0x2005: /* FOUR-PER-EM SPACE */
2214 case 0x2006: /* SIX-PER-EM SPACE */
2215 case 0x2007: /* FIGURE SPACE */
2216 case 0x2008: /* PUNCTUATION SPACE */
2217 case 0x2009: /* THIN SPACE */
2218 case 0x200A: /* HAIR SPACE */
2219 case 0x202f: /* NARROW NO-BREAK SPACE */
2220 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2221 case 0x3000: /* IDEOGRAPHIC SPACE */
2222 MRRETURN(MATCH_NOMATCH);
2223 }
2224 ecode++;
2225 break;
2226
2227 case OP_HSPACE:
2228 if (eptr >= md->end_subject)
2229 {
2230 SCHECK_PARTIAL();
2231 MRRETURN(MATCH_NOMATCH);
2232 }
2233 GETCHARINCTEST(c, eptr);
2234 switch(c)
2235 {
2236 default: MRRETURN(MATCH_NOMATCH);
2237 case 0x09: /* HT */
2238 case 0x20: /* SPACE */
2239 case 0xa0: /* NBSP */
2240 case 0x1680: /* OGHAM SPACE MARK */
2241 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2242 case 0x2000: /* EN QUAD */
2243 case 0x2001: /* EM QUAD */
2244 case 0x2002: /* EN SPACE */
2245 case 0x2003: /* EM SPACE */
2246 case 0x2004: /* THREE-PER-EM SPACE */
2247 case 0x2005: /* FOUR-PER-EM SPACE */
2248 case 0x2006: /* SIX-PER-EM SPACE */
2249 case 0x2007: /* FIGURE SPACE */
2250 case 0x2008: /* PUNCTUATION SPACE */
2251 case 0x2009: /* THIN SPACE */
2252 case 0x200A: /* HAIR SPACE */
2253 case 0x202f: /* NARROW NO-BREAK SPACE */
2254 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2255 case 0x3000: /* IDEOGRAPHIC SPACE */
2256 break;
2257 }
2258 ecode++;
2259 break;
2260
2261 case OP_NOT_VSPACE:
2262 if (eptr >= md->end_subject)
2263 {
2264 SCHECK_PARTIAL();
2265 MRRETURN(MATCH_NOMATCH);
2266 }
2267 GETCHARINCTEST(c, eptr);
2268 switch(c)
2269 {
2270 default: break;
2271 case 0x0a: /* LF */
2272 case 0x0b: /* VT */
2273 case 0x0c: /* FF */
2274 case 0x0d: /* CR */
2275 case 0x85: /* NEL */
2276 case 0x2028: /* LINE SEPARATOR */
2277 case 0x2029: /* PARAGRAPH SEPARATOR */
2278 MRRETURN(MATCH_NOMATCH);
2279 }
2280 ecode++;
2281 break;
2282
2283 case OP_VSPACE:
2284 if (eptr >= md->end_subject)
2285 {
2286 SCHECK_PARTIAL();
2287 MRRETURN(MATCH_NOMATCH);
2288 }
2289 GETCHARINCTEST(c, eptr);
2290 switch(c)
2291 {
2292 default: MRRETURN(MATCH_NOMATCH);
2293 case 0x0a: /* LF */
2294 case 0x0b: /* VT */
2295 case 0x0c: /* FF */
2296 case 0x0d: /* CR */
2297 case 0x85: /* NEL */
2298 case 0x2028: /* LINE SEPARATOR */
2299 case 0x2029: /* PARAGRAPH SEPARATOR */
2300 break;
2301 }
2302 ecode++;
2303 break;
2304
2305 #ifdef SUPPORT_UCP
2306 /* Check the next character by Unicode property. We will get here only
2307 if the support is in the binary; otherwise a compile-time error occurs. */
2308
2309 case OP_PROP:
2310 case OP_NOTPROP:
2311 if (eptr >= md->end_subject)
2312 {
2313 SCHECK_PARTIAL();
2314 MRRETURN(MATCH_NOMATCH);
2315 }
2316 GETCHARINCTEST(c, eptr);
2317 {
2318 const ucd_record *prop = GET_UCD(c);
2319
2320 switch(ecode[1])
2321 {
2322 case PT_ANY:
2323 if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
2324 break;
2325
2326 case PT_LAMP:
2327 if ((prop->chartype == ucp_Lu ||
2328 prop->chartype == ucp_Ll ||
2329 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
2330 MRRETURN(MATCH_NOMATCH);
2331 break;
2332
2333 case PT_GC:
2334 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
2335 MRRETURN(MATCH_NOMATCH);
2336 break;
2337
2338 case PT_PC:
2339 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
2340 MRRETURN(MATCH_NOMATCH);
2341 break;
2342
2343 case PT_SC:
2344 if ((ecode[2] != prop->script) == (op == OP_PROP))
2345 MRRETURN(MATCH_NOMATCH);
2346 break;
2347
2348 /* These are specials */
2349
2350 case PT_ALNUM:
2351 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2352 _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2353 MRRETURN(MATCH_NOMATCH);
2354 break;
2355
2356 case PT_SPACE: /* Perl space */
2357 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2358 c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2359 == (op == OP_NOTPROP))
2360 MRRETURN(MATCH_NOMATCH);
2361 break;
2362
2363 case PT_PXSPACE: /* POSIX space */
2364 if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2365 c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2366 c == CHAR_FF || c == CHAR_CR)
2367 == (op == OP_NOTPROP))
2368 MRRETURN(MATCH_NOMATCH);
2369 break;
2370
2371 case PT_WORD:
2372 if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2373 _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2374 c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2375 MRRETURN(MATCH_NOMATCH);
2376 break;
2377
2378 /* This should never occur */
2379
2380 default:
2381 RRETURN(PCRE_ERROR_INTERNAL);
2382 }
2383
2384 ecode += 3;
2385 }
2386 break;
2387
2388 /* Match an extended Unicode sequence. We will get here only if the support
2389 is in the binary; otherwise a compile-time error occurs. */
2390
2391 case OP_EXTUNI:
2392 if (eptr >= md->end_subject)
2393 {
2394 SCHECK_PARTIAL();
2395 MRRETURN(MATCH_NOMATCH);
2396 }
2397 GETCHARINCTEST(c, eptr);
2398 {
2399 int category = UCD_CATEGORY(c);
2400 if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
2401 while (eptr < md->end_subject)
2402 {
2403 int len = 1;
2404 if (!utf8) c = *eptr; else
2405 {
2406 GETCHARLEN(c, eptr, len);
2407 }
2408 category = UCD_CATEGORY(c);
2409 if (category != ucp_M) break;
2410 eptr += len;
2411 }
2412 }
2413 ecode++;
2414 break;
2415 #endif
2416
2417
2418 /* Match a back reference, possibly repeatedly. Look past the end of the
2419 item to see if there is repeat information following. The code is similar
2420 to that for character classes, but repeated for efficiency. Then obey
2421 similar code to character type repeats - written out again for speed.
2422 However, if the referenced string is the empty string, always treat
2423 it as matched, any number of times (otherwise there could be infinite
2424 loops). */
2425
2426 case OP_REF:
2427 case OP_REFI:
2428 caseless = op == OP_REFI;
2429 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2430 ecode += 3;
2431
2432 /* If the reference is unset, there are two possibilities:
2433
2434 (a) In the default, Perl-compatible state, set the length negative;
2435 this ensures that every attempt at a match fails. We can't just fail
2436 here, because of the possibility of quantifiers with zero minima.
2437
2438 (b) If the JavaScript compatibility flag is set, set the length to zero
2439 so that the back reference matches an empty string.
2440
2441 Otherwise, set the length to the length of what was matched by the
2442 referenced subpattern. */
2443
2444 if (offset >= offset_top || md->offset_vector[offset] < 0)
2445 length = (md->jscript_compat)? 0 : -1;
2446 else
2447 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2448
2449 /* Set up for repetition, or handle the non-repeated case */
2450
2451 switch (*ecode)
2452 {
2453 case OP_CRSTAR:
2454 case OP_CRMINSTAR:
2455 case OP_CRPLUS:
2456 case OP_CRMINPLUS:
2457 case OP_CRQUERY:
2458 case OP_CRMINQUERY:
2459 c = *ecode++ - OP_CRSTAR;
2460 minimize = (c & 1) != 0;
2461 min = rep_min[c]; /* Pick up values from tables; */
2462 max = rep_max[c]; /* zero for max => infinity */
2463 if (max == 0) max = INT_MAX;
2464 break;
2465
2466 case OP_CRRANGE:
2467 case OP_CRMINRANGE:
2468 minimize = (*ecode == OP_CRMINRANGE);
2469 min = GET2(ecode, 1);
2470 max = GET2(ecode, 3);
2471 if (max == 0) max = INT_MAX;
2472 ecode += 5;
2473 break;
2474
2475 default: /* No repeat follows */
2476 if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2477 {
2478 CHECK_PARTIAL();
2479 MRRETURN(MATCH_NOMATCH);
2480 }
2481 eptr += length;
2482 continue; /* With the main loop */
2483 }
2484
2485 /* Handle repeated back references. If the length of the reference is
2486 zero, just continue with the main loop. */
2487
2488 if (length == 0) continue;
2489
2490 /* First, ensure the minimum number of matches are present. We get back
2491 the length of the reference string explicitly rather than passing the
2492 address of eptr, so that eptr can be a register variable. */
2493
2494 for (i = 1; i <= min; i++)
2495 {
2496 int slength;
2497 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2498 {
2499 CHECK_PARTIAL();
2500 MRRETURN(MATCH_NOMATCH);
2501 }
2502 eptr += slength;
2503 }
2504
2505 /* If min = max, continue at the same level without recursion.
2506 They are not both allowed to be zero. */
2507
2508 if (min == max) continue;
2509
2510 /* If minimizing, keep trying and advancing the pointer */
2511
2512 if (minimize)
2513 {
2514 for (fi = min;; fi++)
2515 {
2516 int slength;
2517 RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2518 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2519 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2520 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2521 {
2522 CHECK_PARTIAL();
2523 MRRETURN(MATCH_NOMATCH);
2524 }
2525 eptr += slength;
2526 }
2527 /* Control never gets here */
2528 }
2529
2530 /* If maximizing, find the longest string and work backwards */
2531
2532 else
2533 {
2534 pp = eptr;
2535 for (i = min; i < max; i++)
2536 {
2537 int slength;
2538 if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
2539 {
2540 CHECK_PARTIAL();
2541 break;
2542 }
2543 eptr += slength;
2544 }
2545 while (eptr >= pp)
2546 {
2547 RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2548 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2549 eptr -= length;
2550 }
2551 MRRETURN(MATCH_NOMATCH);
2552 }
2553 /* Control never gets here */
2554
2555 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2556 used when all the characters in the class have values in the range 0-255,
2557 and either the matching is caseful, or the characters are in the range
2558 0-127 when UTF-8 processing is enabled. The only difference between
2559 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2560 encountered.
2561
2562 First, look past the end of the item to see if there is repeat information
2563 following. Then obey similar code to character type repeats - written out
2564 again for speed. */
2565
2566 case OP_NCLASS:
2567 case OP_CLASS:
2568 {
2569 data = ecode + 1; /* Save for matching */
2570 ecode += 33; /* Advance past the item */
2571
2572 switch (*ecode)
2573 {
2574 case OP_CRSTAR:
2575 case OP_CRMINSTAR:
2576 case OP_CRPLUS:
2577 case OP_CRMINPLUS:
2578 case OP_CRQUERY:
2579 case OP_CRMINQUERY:
2580 c = *ecode++ - OP_CRSTAR;
2581 minimize = (c & 1) != 0;
2582 min = rep_min[c]; /* Pick up values from tables; */
2583 max = rep_max[c]; /* zero for max => infinity */
2584 if (max == 0) max = INT_MAX;
2585 break;
2586
2587 case OP_CRRANGE:
2588 case OP_CRMINRANGE:
2589 minimize = (*ecode == OP_CRMINRANGE);
2590 min = GET2(ecode, 1);
2591 max = GET2(ecode, 3);
2592 if (max == 0) max = INT_MAX;
2593 ecode += 5;
2594 break;
2595
2596 default: /* No repeat follows */
2597 min = max = 1;
2598 break;
2599 }
2600
2601 /* First, ensure the minimum number of matches are present. */
2602
2603 #ifdef SUPPORT_UTF8
2604 /* UTF-8 mode */
2605 if (utf8)
2606 {
2607 for (i = 1; i <= min; i++)
2608 {
2609 if (eptr >= md->end_subject)
2610 {
2611 SCHECK_PARTIAL();
2612 MRRETURN(MATCH_NOMATCH);
2613 }
2614 GETCHARINC(c, eptr);
2615 if (c > 255)
2616 {
2617 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2618 }
2619 else
2620 {
2621 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2622 }
2623 }
2624 }
2625 else
2626 #endif
2627 /* Not UTF-8 mode */
2628 {
2629 for (i = 1; i <= min; i++)
2630 {
2631 if (eptr >= md->end_subject)
2632 {
2633 SCHECK_PARTIAL();
2634 MRRETURN(MATCH_NOMATCH);
2635 }
2636 c = *eptr++;
2637 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2638 }
2639 }
2640
2641 /* If max == min we can continue with the main loop without the
2642 need to recurse. */
2643
2644 if (min == max) continue;
2645
2646 /* If minimizing, keep testing the rest of the expression and advancing
2647 the pointer while it matches the class. */
2648
2649 if (minimize)
2650 {
2651 #ifdef SUPPORT_UTF8
2652 /* UTF-8 mode */
2653 if (utf8)
2654 {
2655 for (fi = min;; fi++)
2656 {
2657 RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
2658 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2659 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2660 if (eptr >= md->end_subject)
2661 {
2662 SCHECK_PARTIAL();
2663 MRRETURN(MATCH_NOMATCH);
2664 }
2665 GETCHARINC(c, eptr);
2666 if (c > 255)
2667 {
2668 if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
2669 }
2670 else
2671 {
2672 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2673 }
2674 }
2675 }
2676 else
2677 #endif
2678 /* Not UTF-8 mode */
2679 {
2680 for (fi = min;; fi++)
2681 {
2682 RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
2683 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2684 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2685 if (eptr >= md->end_subject)
2686 {
2687 SCHECK_PARTIAL();
2688 MRRETURN(MATCH_NOMATCH);
2689 }
2690 c = *eptr++;
2691 if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
2692 }
2693 }
2694 /* Control never gets here */
2695 }
2696
2697 /* If maximizing, find the longest possible run, then work backwards. */
2698
2699 else
2700 {
2701 pp = eptr;
2702
2703 #ifdef SUPPORT_UTF8
2704 /* UTF-8 mode */
2705 if (utf8)
2706 {
2707 for (i = min; i < max; i++)
2708 {
2709 int len = 1;
2710 if (eptr >= md->end_subject)
2711 {
2712 SCHECK_PARTIAL();
2713 break;
2714 }
2715 GETCHARLEN(c, eptr, len);
2716 if (c > 255)
2717 {
2718 if (op == OP_CLASS) break;
2719 }
2720 else
2721 {
2722 if ((data[c/8] & (1 << (c&7))) == 0) break;
2723 }
2724 eptr += len;
2725 }
2726 for (;;)
2727 {
2728 RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
2729 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2730 if (eptr-- == pp) break; /* Stop if tried at original pos */
2731 BACKCHAR(eptr);
2732 }
2733 }
2734 else
2735 #endif
2736 /* Not UTF-8 mode */
2737 {
2738 for (i = min; i < max; i++)
2739 {
2740 if (eptr >= md->end_subject)
2741 {
2742 SCHECK_PARTIAL();
2743 break;
2744 }
2745 c = *eptr;
2746 if ((data[c/8] & (1 << (c&7))) == 0) break;
2747 eptr++;
2748 }
2749 while (eptr >= pp)
2750 {
2751 RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
2752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2753 eptr--;
2754 }
2755 }
2756
2757 MRRETURN(MATCH_NOMATCH);
2758 }
2759 }
2760 /* Control never gets here */
2761
2762
2763 /* Match an extended character class. This opcode is encountered only
2764 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2765 mode, because Unicode properties are supported in non-UTF-8 mode. */
2766
2767 #ifdef SUPPORT_UTF8
2768 case OP_XCLASS:
2769 {
2770 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2771 ecode += GET(ecode, 1); /* Advance past the item */
2772
2773 switch (*ecode)
2774 {
2775 case OP_CRSTAR:
2776 case OP_CRMINSTAR:
2777 case OP_CRPLUS:
2778 case OP_CRMINPLUS:
2779 case OP_CRQUERY:
2780 case OP_CRMINQUERY:
2781 c = *ecode++ - OP_CRSTAR;
2782 minimize = (c & 1) != 0;
2783 min = rep_min[c]; /* Pick up values from tables; */
2784 max = rep_max[c]; /* zero for max => infinity */
2785 if (max == 0) max = INT_MAX;
2786 break;
2787
2788 case OP_CRRANGE:
2789 case OP_CRMINRANGE:
2790 minimize = (*ecode == OP_CRMINRANGE);
2791 min = GET2(ecode, 1);
2792 max = GET2(ecode, 3);
2793 if (max == 0) max = INT_MAX;
2794 ecode += 5;
2795 break;
2796
2797 default: /* No repeat follows */
2798 min = max = 1;
2799 break;
2800 }
2801
2802 /* First, ensure the minimum number of matches are present. */
2803
2804 for (i = 1; i <= min; i++)
2805 {
2806 if (eptr >= md->end_subject)
2807 {
2808 SCHECK_PARTIAL();
2809 MRRETURN(MATCH_NOMATCH);
2810 }
2811 GETCHARINCTEST(c, eptr);
2812 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2813 }
2814
2815 /* If max == min we can continue with the main loop without the
2816 need to recurse. */
2817
2818 if (min == max) continue;
2819
2820 /* If minimizing, keep testing the rest of the expression and advancing
2821 the pointer while it matches the class. */
2822
2823 if (minimize)
2824 {
2825 for (fi = min;; fi++)
2826 {
2827 RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
2828 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2829 if (fi >= max) MRRETURN(MATCH_NOMATCH);
2830 if (eptr >= md->end_subject)
2831 {
2832 SCHECK_PARTIAL();
2833 MRRETURN(MATCH_NOMATCH);
2834 }
2835 GETCHARINCTEST(c, eptr);
2836 if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
2837 }
2838 /* Control never gets here */
2839 }
2840
2841 /* If maximizing, find the longest possible run, then work backwards. */
2842
2843 else
2844 {
2845 pp = eptr;
2846 for (i = min; i < max; i++)
2847 {
2848 int len = 1;
2849 if (eptr >= md->end_subject)
2850 {
2851 SCHECK_PARTIAL();
2852 break;
2853 }
2854 GETCHARLENTEST(c, eptr, len);
2855 if (!_pcre_xclass(c, data)) break;
2856 eptr += len;
2857 }
2858 for(;;)
2859 {
2860 RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
2861 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2862 if (eptr-- == pp) break; /* Stop if tried at original pos */
2863 if (utf8) BACKCHAR(eptr);
2864 }
2865 MRRETURN(MATCH_NOMATCH);
2866 }
2867
2868 /* Control never gets here */
2869 }
2870 #endif /* End of XCLASS */
2871
2872 /* Match a single character, casefully */
2873
2874 case OP_CHAR:
2875 #ifdef SUPPORT_UTF8
2876 if (utf8)
2877 {
2878 length = 1;
2879 ecode++;
2880 GETCHARLEN(fc, ecode, length);
2881 if (length > md->end_subject - eptr)
2882 {
2883 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2884 MRRETURN(MATCH_NOMATCH);
2885 }
2886 while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
2887 }
2888 else
2889 #endif
2890
2891 /* Non-UTF-8 mode */
2892 {
2893 if (md->end_subject - eptr < 1)
2894 {
2895 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2896 MRRETURN(MATCH_NOMATCH);
2897 }
2898 if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
2899 ecode += 2;
2900 }
2901 break;
2902
2903 /* Match a single character, caselessly */
2904
2905 case OP_CHARI:
2906 #ifdef SUPPORT_UTF8
2907 if (utf8)
2908 {
2909 length = 1;
2910 ecode++;
2911 GETCHARLEN(fc, ecode, length);
2912
2913 if (length > md->end_subject - eptr)
2914 {
2915 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2916 MRRETURN(MATCH_NOMATCH);
2917 }
2918
2919 /* If the pattern character's value is < 128, we have only one byte, and
2920 can use the fast lookup table. */
2921
2922 if (fc < 128)
2923 {
2924 if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2925 }
2926
2927 /* Otherwise we must pick up the subject character */
2928
2929 else
2930 {
2931 unsigned int dc;
2932 GETCHARINC(dc, eptr);
2933 ecode += length;
2934
2935 /* If we have Unicode property support, we can use it to test the other
2936 case of the character, if there is one. */
2937
2938 if (fc != dc)
2939 {
2940 #ifdef SUPPORT_UCP
2941 if (dc != UCD_OTHERCASE(fc))
2942 #endif
2943 MRRETURN(MATCH_NOMATCH);
2944 }
2945 }
2946 }
2947 else
2948 #endif /* SUPPORT_UTF8 */
2949
2950 /* Non-UTF-8 mode */
2951 {
2952 if (md->end_subject - eptr < 1)
2953 {
2954 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2955 MRRETURN(MATCH_NOMATCH);
2956 }
2957 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2958 ecode += 2;
2959 }
2960 break;
2961
2962 /* Match a single character repeatedly. */
2963
2964 case OP_EXACT:
2965 case OP_EXACTI:
2966 min = max = GET2(ecode, 1);
2967 ecode += 3;
2968 goto REPEATCHAR;
2969
2970 case OP_POSUPTO:
2971 case OP_POSUPTOI:
2972 possessive = TRUE;
2973 /* Fall through */
2974
2975 case OP_UPTO:
2976 case OP_UPTOI:
2977 case OP_MINUPTO:
2978 case OP_MINUPTOI:
2979 min = 0;
2980 max = GET2(ecode, 1);
2981 minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
2982 ecode += 3;
2983 goto REPEATCHAR;
2984
2985 case OP_POSSTAR:
2986 case OP_POSSTARI:
2987 possessive = TRUE;
2988 min = 0;
2989 max = INT_MAX;
2990 ecode++;
2991 goto REPEATCHAR;
2992
2993 case OP_POSPLUS:
2994 case OP_POSPLUSI:
2995 possessive = TRUE;
2996 min = 1;
2997 max = INT_MAX;
2998 ecode++;
2999 goto REPEATCHAR;
3000
3001 case OP_POSQUERY:
3002 case OP_POSQUERYI:
3003 possessive = TRUE;
3004 min = 0;
3005 max = 1;
3006 ecode++;
3007 goto REPEATCHAR;
3008
3009 case OP_STAR:
3010 case OP_STARI:
3011 case OP_MINSTAR:
3012 case OP_MINSTARI:
3013 case OP_PLUS:
3014 case OP_PLUSI:
3015 case OP_MINPLUS:
3016 case OP_MINPLUSI:
3017 case OP_QUERY:
3018 case OP_QUERYI:
3019 case OP_MINQUERY:
3020 case OP_MINQUERYI:
3021 c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
3022 minimize = (c & 1) != 0;
3023 min = rep_min[c]; /* Pick up values from tables; */
3024 max = rep_max[c]; /* zero for max => infinity */
3025 if (max == 0) max = INT_MAX;
3026
3027 /* Common code for all repeated single-character matches. */
3028
3029 REPEATCHAR:
3030 #ifdef SUPPORT_UTF8
3031 if (utf8)
3032 {
3033 length = 1;
3034 charptr = ecode;
3035 GETCHARLEN(fc, ecode, length);
3036 ecode += length;
3037
3038 /* Handle multibyte character matching specially here. There is
3039 support for caseless matching if UCP support is present. */
3040
3041 if (length > 1)
3042 {
3043 #ifdef SUPPORT_UCP
3044 unsigned int othercase;
3045 if (op >= OP_STARI && /* Caseless */
3046 (othercase = UCD_OTHERCASE(fc)) != fc)
3047 oclength = _pcre_ord2utf8(othercase, occhars);
3048 else oclength = 0;
3049 #endif /* SUPPORT_UCP */
3050
3051 for (i = 1; i <= min; i++)
3052 {
3053 if (eptr <= md->end_subject - length &&
3054 memcmp(eptr, charptr, length) == 0) eptr += length;
3055 #ifdef SUPPORT_UCP
3056 else if (oclength > 0 &&
3057 eptr <= md->end_subject - oclength &&
3058 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3059 #endif /* SUPPORT_UCP */
3060 else
3061 {
3062 CHECK_PARTIAL();
3063 MRRETURN(MATCH_NOMATCH);
3064 }
3065 }
3066
3067 if (min == max) continue;
3068
3069 if (minimize)
3070 {
3071 for (fi = min;; fi++)
3072 {
3073 RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
3074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3075 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3076 if (eptr <= md->end_subject - length &&
3077 memcmp(eptr, charptr, length) == 0) eptr += length;
3078 #ifdef SUPPORT_UCP
3079 else if (oclength > 0 &&
3080 eptr <= md->end_subject - oclength &&
3081 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3082 #endif /* SUPPORT_UCP */
3083 else
3084 {
3085 CHECK_PARTIAL();
3086 MRRETURN(MATCH_NOMATCH);
3087 }
3088 }
3089 /* Control never gets here */
3090 }
3091
3092 else /* Maximize */
3093 {
3094 pp = eptr;
3095 for (i = min; i < max; i++)
3096 {
3097 if (eptr <= md->end_subject - length &&
3098 memcmp(eptr, charptr, length) == 0) eptr += length;
3099 #ifdef SUPPORT_UCP
3100 else if (oclength > 0 &&
3101 eptr <= md->end_subject - oclength &&
3102 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
3103 #endif /* SUPPORT_UCP */
3104 else
3105 {
3106 CHECK_PARTIAL();
3107 break;
3108 }
3109 }
3110
3111 if (possessive) continue;
3112
3113 for(;;)
3114 {
3115 RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
3116 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3117 if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
3118 #ifdef SUPPORT_UCP
3119 eptr--;
3120 BACKCHAR(eptr);
3121 #else /* without SUPPORT_UCP */
3122 eptr -= length;
3123 #endif /* SUPPORT_UCP */
3124 }
3125 }
3126 /* Control never gets here */
3127 }
3128
3129 /* If the length of a UTF-8 character is 1, we fall through here, and
3130 obey the code as for non-UTF-8 characters below, though in this case the
3131 value of fc will always be < 128. */
3132 }
3133 else
3134 #endif /* SUPPORT_UTF8 */
3135
3136 /* When not in UTF-8 mode, load a single-byte character. */
3137
3138 fc = *ecode++;
3139
3140 /* The value of fc at this point is always less than 256, though we may or
3141 may not be in UTF-8 mode. The code is duplicated for the caseless and
3142 caseful cases, for speed, since matching characters is likely to be quite
3143 common. First, ensure the minimum number of matches are present. If min =
3144 max, continue at the same level without recursing. Otherwise, if
3145 minimizing, keep trying the rest of the expression and advancing one
3146 matching character if failing, up to the maximum. Alternatively, if
3147 maximizing, find the maximum number of characters and work backwards. */
3148
3149 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3150 max, eptr));
3151
3152 if (op >= OP_STARI) /* Caseless */
3153 {
3154 fc = md->lcc[fc];
3155 for (i = 1; i <= min; i++)
3156 {
3157 if (eptr >= md->end_subject)
3158 {
3159 SCHECK_PARTIAL();
3160 MRRETURN(MATCH_NOMATCH);
3161 }
3162 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3163 }
3164 if (min == max) continue;
3165 if (minimize)
3166 {
3167 for (fi = min;; fi++)
3168 {
3169 RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
3170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3171 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3172 if (eptr >= md->end_subject)
3173 {
3174 SCHECK_PARTIAL();
3175 MRRETURN(MATCH_NOMATCH);
3176 }
3177 if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3178 }
3179 /* Control never gets here */
3180 }
3181 else /* Maximize */
3182 {
3183 pp = eptr;
3184 for (i = min; i < max; i++)
3185 {
3186 if (eptr >= md->end_subject)
3187 {
3188 SCHECK_PARTIAL();
3189 break;
3190 }
3191 if (fc != md->lcc[*eptr]) break;
3192 eptr++;
3193 }
3194
3195 if (possessive) continue;
3196
3197 while (eptr >= pp)
3198 {
3199 RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
3200 eptr--;
3201 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3202 }
3203 MRRETURN(MATCH_NOMATCH);
3204 }
3205 /* Control never gets here */
3206 }
3207
3208 /* Caseful comparisons (includes all multi-byte characters) */
3209
3210 else
3211 {
3212 for (i = 1; i <= min; i++)
3213 {
3214 if (eptr >= md->end_subject)
3215 {
3216 SCHECK_PARTIAL();
3217 MRRETURN(MATCH_NOMATCH);
3218 }
3219 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3220 }
3221
3222 if (min == max) continue;
3223
3224 if (minimize)
3225 {
3226 for (fi = min;; fi++)
3227 {
3228 RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
3229 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3230 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3231 if (eptr >= md->end_subject)
3232 {
3233 SCHECK_PARTIAL();
3234 MRRETURN(MATCH_NOMATCH);
3235 }
3236 if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3237 }
3238 /* Control never gets here */
3239 }
3240 else /* Maximize */
3241 {
3242 pp = eptr;
3243 for (i = min; i < max; i++)
3244 {
3245 if (eptr >= md->end_subject)
3246 {
3247 SCHECK_PARTIAL();
3248 break;
3249 }
3250 if (fc != *eptr) break;
3251 eptr++;
3252 }
3253 if (possessive) continue;
3254
3255 while (eptr >= pp)
3256 {
3257 RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
3258 eptr--;
3259 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3260 }
3261 MRRETURN(MATCH_NOMATCH);
3262 }
3263 }
3264 /* Control never gets here */
3265
3266 /* Match a negated single one-byte character. The pattern character is one
3267 byte, but the subject character we are checking can be multibyte. */
3268
3269 case OP_NOT:
3270 case OP_NOTI:
3271 if (eptr >= md->end_subject)
3272 {
3273 SCHECK_PARTIAL();
3274 MRRETURN(MATCH_NOMATCH);
3275 }
3276 ecode++;
3277 GETCHARINCTEST(c, eptr);
3278 if (op == OP_NOTI) /* The caseless case */
3279 {
3280 #ifdef SUPPORT_UTF8
3281 if (c < 256)
3282 #endif
3283 c = md->lcc[c];
3284 if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
3285 }
3286 else /* Caseful */
3287 {
3288 if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
3289 }
3290 break;
3291
3292 /* Match a negated single one-byte character repeatedly. This is almost a
3293 repeat of the code for a repeated single character, but I haven't found a
3294 nice way of commoning these up that doesn't require a test of the
3295 positive/negative option for each character match. Maybe that wouldn't add
3296 very much to the time taken, but character matching *is* what this is all
3297 about... */
3298
3299 case OP_NOTEXACT:
3300 case OP_NOTEXACTI:
3301 min = max = GET2(ecode, 1);
3302 ecode += 3;
3303 goto REPEATNOTCHAR;
3304
3305 case OP_NOTUPTO:
3306 case OP_NOTUPTOI:
3307 case OP_NOTMINUPTO:
3308 case OP_NOTMINUPTOI:
3309 min = 0;
3310 max = GET2(ecode, 1);
3311 minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3312 ecode += 3;
3313 goto REPEATNOTCHAR;
3314
3315 case OP_NOTPOSSTAR:
3316 case OP_NOTPOSSTARI:
3317 possessive = TRUE;
3318 min = 0;
3319 max = INT_MAX;
3320 ecode++;
3321 goto REPEATNOTCHAR;
3322
3323 case OP_NOTPOSPLUS:
3324 case OP_NOTPOSPLUSI:
3325 possessive = TRUE;
3326 min = 1;
3327 max = INT_MAX;
3328 ecode++;
3329 goto REPEATNOTCHAR;
3330
3331 case OP_NOTPOSQUERY:
3332 case OP_NOTPOSQUERYI:
3333 possessive = TRUE;
3334 min = 0;
3335 max = 1;
3336 ecode++;
3337 goto REPEATNOTCHAR;
3338
3339 case OP_NOTPOSUPTO:
3340 case OP_NOTPOSUPTOI:
3341 possessive = TRUE;
3342 min = 0;
3343 max = GET2(ecode, 1);
3344 ecode += 3;
3345 goto REPEATNOTCHAR;
3346
3347 case OP_NOTSTAR:
3348 case OP_NOTSTARI:
3349 case OP_NOTMINSTAR:
3350 case OP_NOTMINSTARI:
3351 case OP_NOTPLUS:
3352 case OP_NOTPLUSI:
3353 case OP_NOTMINPLUS:
3354 case OP_NOTMINPLUSI:
3355 case OP_NOTQUERY:
3356 case OP_NOTQUERYI:
3357 case OP_NOTMINQUERY:
3358 case OP_NOTMINQUERYI:
3359 c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
3360 minimize = (c & 1) != 0;
3361 min = rep_min[c]; /* Pick up values from tables; */
3362 max = rep_max[c]; /* zero for max => infinity */
3363 if (max == 0) max = INT_MAX;
3364
3365 /* Common code for all repeated negated single-byte matches. */
3366
3367 REPEATNOTCHAR:
3368 fc = *ecode++;
3369
3370 /* The code is duplicated for the caseless and caseful cases, for speed,
3371 since matching characters is likely to be quite common. First, ensure the
3372 minimum number of matches are present. If min = max, continue at the same
3373 level without recursing. Otherwise, if minimizing, keep trying the rest of
3374 the expression and advancing one matching character if failing, up to the
3375 maximum. Alternatively, if maximizing, find the maximum number of
3376 characters and work backwards. */
3377
3378 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3379 max, eptr));
3380
3381 if (op >= OP_NOTSTARI) /* Caseless */
3382 {
3383 fc = md->lcc[fc];
3384
3385 #ifdef SUPPORT_UTF8
3386 /* UTF-8 mode */
3387 if (utf8)
3388 {
3389 register unsigned int d;
3390 for (i = 1; i <= min; i++)
3391 {
3392 if (eptr >= md->end_subject)
3393 {
3394 SCHECK_PARTIAL();
3395 MRRETURN(MATCH_NOMATCH);
3396 }
3397 GETCHARINC(d, eptr);
3398 if (d < 256) d = md->lcc[d];
3399 if (fc == d) MRRETURN(MATCH_NOMATCH);
3400 }
3401 }
3402 else
3403 #endif
3404
3405 /* Not UTF-8 mode */
3406 {
3407 for (i = 1; i <= min; i++)
3408 {
3409 if (eptr >= md->end_subject)
3410 {
3411 SCHECK_PARTIAL();
3412 MRRETURN(MATCH_NOMATCH);
3413 }
3414 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3415 }
3416 }
3417
3418 if (min == max) continue;
3419
3420 if (minimize)
3421 {
3422 #ifdef SUPPORT_UTF8
3423 /* UTF-8 mode */
3424 if (utf8)
3425 {
3426 register unsigned int d;
3427 for (fi = min;; fi++)
3428 {
3429 RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
3430 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3432 if (eptr >= md->end_subject)
3433 {
3434 SCHECK_PARTIAL();
3435 MRRETURN(MATCH_NOMATCH);
3436 }
3437 GETCHARINC(d, eptr);
3438 if (d < 256) d = md->lcc[d];
3439 if (fc == d) MRRETURN(MATCH_NOMATCH);
3440 }
3441 }
3442 else
3443 #endif
3444 /* Not UTF-8 mode */
3445 {
3446 for (fi = min;; fi++)
3447 {
3448 RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
3449 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3450 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3451 if (eptr >= md->end_subject)
3452 {
3453 SCHECK_PARTIAL();
3454 MRRETURN(MATCH_NOMATCH);
3455 }
3456 if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3457 }
3458 }
3459 /* Control never gets here */
3460 }
3461
3462 /* Maximize case */
3463
3464 else
3465 {
3466 pp = eptr;
3467
3468 #ifdef SUPPORT_UTF8
3469 /* UTF-8 mode */
3470 if (utf8)
3471 {
3472 register unsigned int d;
3473 for (i = min; i < max; i++)
3474 {
3475 int len = 1;
3476 if (eptr >= md->end_subject)
3477 {
3478 SCHECK_PARTIAL();
3479 break;
3480 }
3481 GETCHARLEN(d, eptr, len);
3482 if (d < 256) d = md->lcc[d];
3483 if (fc == d) break;
3484 eptr += len;
3485 }
3486 if (possessive) continue;
3487 for(;;)
3488 {
3489 RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
3490 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3491 if (eptr-- == pp) break; /* Stop if tried at original pos */
3492 BACKCHAR(eptr);
3493 }
3494 }
3495 else
3496 #endif
3497 /* Not UTF-8 mode */
3498 {
3499 for (i = min; i < max; i++)
3500 {
3501 if (eptr >= md->end_subject)
3502 {
3503 SCHECK_PARTIAL();
3504 break;
3505 }
3506 if (fc == md->lcc[*eptr]) break;
3507 eptr++;
3508 }
3509 if (possessive) continue;
3510 while (eptr >= pp)
3511 {
3512 RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
3513 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3514 eptr--;
3515 }
3516 }
3517
3518 MRRETURN(MATCH_NOMATCH);
3519 }
3520 /* Control never gets here */
3521 }
3522
3523 /* Caseful comparisons */
3524
3525 else
3526 {
3527 #ifdef SUPPORT_UTF8
3528 /* UTF-8 mode */
3529 if (utf8)
3530 {
3531 register unsigned int d;
3532 for (i = 1; i <= min; i++)
3533 {
3534 if (eptr >= md->end_subject)
3535 {
3536 SCHECK_PARTIAL();
3537 MRRETURN(MATCH_NOMATCH);
3538 }
3539 GETCHARINC(d, eptr);
3540 if (fc == d) MRRETURN(MATCH_NOMATCH);
3541 }
3542 }
3543 else
3544 #endif
3545 /* Not UTF-8 mode */
3546 {
3547 for (i = 1; i <= min; i++)
3548 {
3549 if (eptr >= md->end_subject)
3550 {
3551 SCHECK_PARTIAL();
3552 MRRETURN(MATCH_NOMATCH);
3553 }
3554 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3555 }
3556 }
3557
3558 if (min == max) continue;
3559
3560 if (minimize)
3561 {
3562 #ifdef SUPPORT_UTF8
3563 /* UTF-8 mode */
3564 if (utf8)
3565 {
3566 register unsigned int d;
3567 for (fi = min;; fi++)
3568 {
3569 RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
3570 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3571 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3572 if (eptr >= md->end_subject)
3573 {
3574 SCHECK_PARTIAL();
3575 MRRETURN(MATCH_NOMATCH);
3576 }
3577 GETCHARINC(d, eptr);
3578 if (fc == d) MRRETURN(MATCH_NOMATCH);
3579 }
3580 }
3581 else
3582 #endif
3583 /* Not UTF-8 mode */
3584 {
3585 for (fi = min;; fi++)
3586 {
3587 RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
3588 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3589 if (fi >= max) MRRETURN(MATCH_NOMATCH);
3590 if (eptr >= md->end_subject)
3591 {
3592 SCHECK_PARTIAL();
3593 MRRETURN(MATCH_NOMATCH);
3594 }
3595 if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3596 }
3597 }
3598 /* Control never gets here */
3599 }
3600
3601 /* Maximize case */
3602
3603 else
3604 {
3605 pp = eptr;
3606
3607 #ifdef SUPPORT_UTF8
3608 /* UTF-8 mode */
3609 if (utf8)
3610 {
3611 register unsigned int d;
3612 for (i = min; i < max; i++)
3613 {
3614 int len = 1;
3615 if (eptr >= md->end_subject)
3616 {
3617 SCHECK_PARTIAL();
3618 break;
3619 }
3620 GETCHARLEN(d, eptr, len);
3621 if (fc == d) break;
3622 eptr += len;
3623 }
3624 if (possessive) continue;
3625 for(;;)
3626 {
3627 RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
3628 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3629 if (eptr-- == pp) break; /* Stop if tried at original pos */
3630 BACKCHAR(eptr);
3631 }
3632 }
3633 else
3634 #endif
3635 /* Not UTF-8 mode */
3636 {
3637 for (i = min; i < max; i++)
3638 {
3639 if (eptr >= md->end_subject)
3640 {
3641 SCHECK_PARTIAL();
3642 break;
3643 }
3644 if (fc == *eptr) break;
3645 eptr++;
3646 }
3647 if (possessive) continue;
3648 while (eptr >= pp)
3649 {
3650 RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
3651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3652 eptr--;
3653 }
3654 }
3655
3656 MRRETURN(MATCH_NOMATCH);
3657 }
3658 }
3659 /* Control never gets here */
3660
3661 /* Match a single character type repeatedly; several different opcodes
3662 share code. This is very similar to the code for single characters, but we
3663 repeat it in the interests of efficiency. */
3664
3665 case OP_TYPEEXACT:
3666 min = max = GET2(ecode, 1);
3667 minimize = TRUE;
3668 ecode += 3;
3669 goto REPEATTYPE;
3670
3671 case OP_TYPEUPTO:
3672 case OP_TYPEMINUPTO:
3673 min = 0;
3674 max = GET2(ecode, 1);
3675 minimize = *ecode == OP_TYPEMINUPTO;
3676 ecode += 3;
3677 goto REPEATTYPE;
3678
3679 case OP_TYPEPOSSTAR:
3680 possessive = TRUE;
3681 min = 0;
3682 max = INT_MAX;
3683 ecode++;
3684 goto REPEATTYPE;
3685
3686 case OP_TYPEPOSPLUS:
3687 possessive = TRUE;
3688 min = 1;
3689 max = INT_MAX;
3690 ecode++;
3691 goto REPEATTYPE;
3692
3693 case OP_TYPEPOSQUERY:
3694 possessive = TRUE;
3695 min = 0;
3696 max = 1;
3697 ecode++;
3698 goto REPEATTYPE;
3699
3700 case OP_TYPEPOSUPTO:
3701 possessive = TRUE;
3702 min = 0;
3703 max = GET2(ecode, 1);
3704 ecode += 3;
3705 goto REPEATTYPE;
3706
3707 case OP_TYPESTAR:
3708 case OP_TYPEMINSTAR:
3709 case OP_TYPEPLUS:
3710 case OP_TYPEMINPLUS:
3711 case OP_TYPEQUERY:
3712 case OP_TYPEMINQUERY:
3713 c = *ecode++ - OP_TYPESTAR;
3714 minimize = (c & 1) != 0;
3715 min = rep_min[c]; /* Pick up values from tables; */
3716 max = rep_max[c]; /* zero for max => infinity */
3717 if (max == 0) max = INT_MAX;
3718
3719 /* Common code for all repeated single character type matches. Note that
3720 in UTF-8 mode, '.', the negated types, and the HSPACE/VSPACE/ANYNL types can
3721 match multibyte characters; OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR cannot. */
3722
3723 REPEATTYPE:
3724 ctype = *ecode++; /* Code for the character type */
3725
3726 #ifdef SUPPORT_UCP
3727 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3728 {
3729 prop_fail_result = ctype == OP_NOTPROP;
3730 prop_type = *ecode++;
3731 prop_value = *ecode++;
3732 }
3733 else prop_type = -1;
3734 #endif
3735
3736 /* First, ensure the minimum number of matches are present. Use inline
3737 code for maximizing the speed, and do the type test once at the start
3738 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3739 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3740 and single-bytes. */
3741
3742 if (min > 0)
3743 {
3744 #ifdef SUPPORT_UCP
3745 if (prop_type >= 0)
3746 {
3747 switch(prop_type)
3748 {
3749 case PT_ANY:
3750 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
3751 for (i = 1; i <= min; i++)
3752 {
3753 if (eptr >= md->end_subject)
3754 {
3755 SCHECK_PARTIAL();
3756 MRRETURN(MATCH_NOMATCH);
3757 }
3758 GETCHARINCTEST(c, eptr);
3759 }
3760 break;
3761
3762 case PT_LAMP:
3763 for (i = 1; i <= min; i++)
3764 {
3765 if (eptr >= md->end_subject)
3766 {
3767 SCHECK_PARTIAL();
3768 MRRETURN(MATCH_NOMATCH);
3769 }
3770 GETCHARINCTEST(c, eptr);
3771 prop_chartype = UCD_CHARTYPE(c);
3772 if ((prop_chartype == ucp_Lu ||
3773 prop_chartype == ucp_Ll ||
3774 prop_chartype == ucp_Lt) == prop_fail_result)
3775 MRRETURN(MATCH_NOMATCH);
3776 }
3777 break;
3778
3779 case PT_GC:
3780 for (i = 1; i <= min; i++)
3781 {
3782 if (eptr >= md->end_subject)
3783 {
3784 SCHECK_PARTIAL();
3785 MRRETURN(MATCH_NOMATCH);
3786 }
3787 GETCHARINCTEST(c, eptr);
3788 prop_category = UCD_CATEGORY(c);
3789 if ((prop_category == prop_value) == prop_fail_result)
3790 MRRETURN(MATCH_NOMATCH);
3791 }
3792 break;
3793
3794 case PT_PC:
3795 for (i = 1; i <= min; i++)
3796 {
3797 if (eptr >= md->end_subject)
3798 {
3799 SCHECK_PARTIAL();
3800 MRRETURN(MATCH_NOMATCH);
3801 }
3802 GETCHARINCTEST(c, eptr);
3803 prop_chartype = UCD_CHARTYPE(c);
3804 if ((prop_chartype == prop_value) == prop_fail_result)
3805 MRRETURN(MATCH_NOMATCH);
3806 }
3807 break;
3808
3809 case PT_SC:
3810 for (i = 1; i <= min; i++)
3811 {
3812 if (eptr >= md->end_subject)
3813 {
3814 SCHECK_PARTIAL();
3815 MRRETURN(MATCH_NOMATCH);
3816 }
3817 GETCHARINCTEST(c, eptr);
3818 prop_script = UCD_SCRIPT(c);
3819 if ((prop_script == prop_value) == prop_fail_result)
3820 MRRETURN(MATCH_NOMATCH);
3821 }
3822 break;
3823
3824 case PT_ALNUM:
3825 for (i = 1; i <= min; i++)
3826 {
3827 if (eptr >= md->end_subject)
3828 {
3829 SCHECK_PARTIAL();
3830 MRRETURN(MATCH_NOMATCH);
3831 }
3832 GETCHARINCTEST(c, eptr);
3833 prop_category = UCD_CATEGORY(c);
3834 if ((prop_category == ucp_L || prop_category == ucp_N)
3835 == prop_fail_result)
3836 MRRETURN(MATCH_NOMATCH);
3837 }
3838 break;
3839
3840 case PT_SPACE: /* Perl space */
3841 for (i = 1; i <= min; i++)
3842 {
3843 if (eptr >= md->end_subject)
3844 {
3845 SCHECK_PARTIAL();
3846 MRRETURN(MATCH_NOMATCH);
3847 }
3848 GETCHARINCTEST(c, eptr);
3849 prop_category = UCD_CATEGORY(c);
3850 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3851 c == CHAR_FF || c == CHAR_CR)
3852 == prop_fail_result)
3853 MRRETURN(MATCH_NOMATCH);
3854 }
3855 break;
3856
3857 case PT_PXSPACE: /* POSIX space */
3858 for (i = 1; i <= min; i++)
3859 {
3860 if (eptr >= md->end_subject)
3861 {
3862 SCHECK_PARTIAL();
3863 MRRETURN(MATCH_NOMATCH);
3864 }
3865 GETCHARINCTEST(c, eptr);
3866 prop_category = UCD_CATEGORY(c);
3867 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3868 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3869 == prop_fail_result)
3870 MRRETURN(MATCH_NOMATCH);
3871 }
3872 break;
3873
3874 case PT_WORD:
3875 for (i = 1; i <= min; i++)
3876 {
3877 if (eptr >= md->end_subject)
3878 {
3879 SCHECK_PARTIAL();
3880 MRRETURN(MATCH_NOMATCH);
3881 }
3882 GETCHARINCTEST(c, eptr);
3883 prop_category = UCD_CATEGORY(c);
3884 if ((prop_category == ucp_L || prop_category == ucp_N ||
3885 c == CHAR_UNDERSCORE)
3886 == prop_fail_result)
3887 MRRETURN(MATCH_NOMATCH);
3888 }
3889 break;
3890
3891 /* This should not occur */
3892
3893 default:
3894 RRETURN(PCRE_ERROR_INTERNAL);
3895 }
3896 }
3897
3898 /* Match extended Unicode sequences. We will get here only if the
3899 support is in the binary; otherwise a compile-time error occurs. */
3900
3901 else if (ctype == OP_EXTUNI)
3902 {
3903 for (i = 1; i <= min; i++)
3904 {
3905 if (eptr >= md->end_subject)
3906 {
3907 SCHECK_PARTIAL();
3908 MRRETURN(MATCH_NOMATCH);
3909 }
3910 GETCHARINCTEST(c, eptr);
3911 prop_category = UCD_CATEGORY(c);
3912 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
3913 while (eptr < md->end_subject)
3914 {
3915 int len = 1;
3916 if (!utf8) c = *eptr;
3917 else { GETCHARLEN(c, eptr, len); }
3918 prop_category = UCD_CATEGORY(c);
3919 if (prop_category != ucp_M) break;
3920 eptr += len;
3921 }
3922 }
3923 }
3924
3925 else
3926 #endif /* SUPPORT_UCP */
3927
3928 /* Handle all other cases when the coding is UTF-8 */
3929
3930 #ifdef SUPPORT_UTF8
3931 if (utf8) switch(ctype)
3932 {
3933 case OP_ANY:
3934 for (i = 1; i <= min; i++)
3935 {
3936 if (eptr >= md->end_subject)
3937 {
3938 SCHECK_PARTIAL();
3939 MRRETURN(MATCH_NOMATCH);
3940 }
3941 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
3942 eptr++;
3943 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3944 }
3945 break;
3946
3947 case OP_ALLANY:
3948 for (i = 1; i <= min; i++)
3949 {
3950 if (eptr >= md->end_subject)
3951 {
3952 SCHECK_PARTIAL();
3953 MRRETURN(MATCH_NOMATCH);
3954 }
3955 eptr++;
3956 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3957 }
3958 break;
3959
3960 case OP_ANYBYTE:
3961 if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
3962 eptr += min;
3963 break;
3964
3965 case OP_ANYNL:
3966 for (i = 1; i <= min; i++)
3967 {
3968 if (eptr >= md->end_subject)
3969 {
3970 SCHECK_PARTIAL();
3971 MRRETURN(MATCH_NOMATCH);
3972 }
3973 GETCHARINC(c, eptr);
3974 switch(c)
3975 {
3976 default: MRRETURN(MATCH_NOMATCH);
3977
3978 case 0x000d:
3979 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3980 break;
3981
3982 case 0x000a:
3983 break;
3984
3985 case 0x000b:
3986 case 0x000c:
3987 case 0x0085:
3988 case 0x2028:
3989 case 0x2029:
3990 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
3991 break;
3992 }
3993 }
3994 break;
3995
3996 case OP_NOT_HSPACE:
3997 for (i = 1; i <= min; i++)
3998 {
3999 if (eptr >= md->end_subject)
4000 {
4001 SCHECK_PARTIAL();
4002 MRRETURN(MATCH_NOMATCH);
4003 }
4004 GETCHARINC(c, eptr);
4005 switch(c)
4006 {
4007 default: break;
4008 case 0x09: /* HT */
4009 case 0x20: /* SPACE */
4010 case 0xa0: /* NBSP */
4011 case 0x1680: /* OGHAM SPACE MARK */
4012 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4013 case 0x2000: /* EN QUAD */
4014 case 0x2001: /* EM QUAD */
4015 case 0x2002: /* EN SPACE */
4016 case 0x2003: /* EM SPACE */
4017 case 0x2004: /* THREE-PER-EM SPACE */
4018 case 0x2005: /* FOUR-PER-EM SPACE */
4019 case 0x2006: /* SIX-PER-EM SPACE */
4020 case 0x2007: /* FIGURE SPACE */
4021 case 0x2008: /* PUNCTUATION SPACE */
4022 case 0x2009: /* THIN SPACE */
4023 case 0x200A: /* HAIR SPACE */
4024 case 0x202f: /* NARROW NO-BREAK SPACE */
4025 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4026 case 0x3000: /* IDEOGRAPHIC SPACE */
4027 MRRETURN(MATCH_NOMATCH);
4028 }
4029 }
4030 break;
4031
4032 case OP_HSPACE:
4033 for (i = 1; i <= min; i++)
4034 {
4035 if (eptr >= md->end_subject)
4036 {
4037 SCHECK_PARTIAL();
4038 MRRETURN(MATCH_NOMATCH);
4039 }
4040 GETCHARINC(c, eptr);
4041 switch(c)
4042 {
4043 default: MRRETURN(MATCH_NOMATCH);
4044 case 0x09: /* HT */
4045 case 0x20: /* SPACE */
4046 case 0xa0: /* NBSP */
4047 case 0x1680: /* OGHAM SPACE MARK */
4048 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4049 case 0x2000: /* EN QUAD */
4050 case 0x2001: /* EM QUAD */
4051 case 0x2002: /* EN SPACE */
4052 case 0x2003: /* EM SPACE */
4053 case 0x2004: /* THREE-PER-EM SPACE */
4054 case 0x2005: /* FOUR-PER-EM SPACE */
4055 case 0x2006: /* SIX-PER-EM SPACE */
4056 case 0x2007: /* FIGURE SPACE */
4057 case 0x2008: /* PUNCTUATION SPACE */
4058 case 0x2009: /* THIN SPACE */
4059 case 0x200A: /* HAIR SPACE */
4060 case 0x202f: /* NARROW NO-BREAK SPACE */
4061 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4062 case 0x3000: /* IDEOGRAPHIC SPACE */
4063 break;
4064 }
4065 }
4066 break;
4067
4068 case OP_NOT_VSPACE:
4069 for (i = 1; i <= min; i++)
4070 {
4071 if (eptr >= md->end_subject)
4072 {
4073 SCHECK_PARTIAL();
4074 MRRETURN(MATCH_NOMATCH);
4075 }
4076 GETCHARINC(c, eptr);
4077 switch(c)
4078 {
4079 default: break;
4080 case 0x0a: /* LF */
4081 case 0x0b: /* VT */
4082 case 0x0c: /* FF */
4083 case 0x0d: /* CR */
4084 case 0x85: /* NEL */
4085 case 0x2028: /* LINE SEPARATOR */
4086 case 0x2029: /* PARAGRAPH SEPARATOR */
4087 MRRETURN(MATCH_NOMATCH);
4088 }
4089 }
4090 break;
4091
4092 case OP_VSPACE:
4093 for (i = 1; i <= min; i++)
4094 {
4095 if (eptr >= md->end_subject)
4096 {
4097 SCHECK_PARTIAL();
4098 MRRETURN(MATCH_NOMATCH);
4099 }
4100 GETCHARINC(c, eptr);
4101 switch(c)
4102 {
4103 default: MRRETURN(MATCH_NOMATCH);
4104 case 0x0a: /* LF */
4105 case 0x0b: /* VT */
4106 case 0x0c: /* FF */
4107 case 0x0d: /* CR */
4108 case 0x85: /* NEL */
4109 case 0x2028: /* LINE SEPARATOR */
4110 case 0x2029: /* PARAGRAPH SEPARATOR */
4111 break;
4112 }
4113 }
4114 break;
4115
4116 case OP_NOT_DIGIT:
4117 for (i = 1; i <= min; i++)
4118 {
4119 if (eptr >= md->end_subject)
4120 {
4121 SCHECK_PARTIAL();
4122 MRRETURN(MATCH_NOMATCH);
4123 }
4124 GETCHARINC(c, eptr);
4125 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
4126 MRRETURN(MATCH_NOMATCH);
4127 }
4128 break;
4129
4130 case OP_DIGIT:
4131 for (i = 1; i <= min; i++)
4132 {
4133 if (eptr >= md->end_subject)
4134 {
4135 SCHECK_PARTIAL();
4136 MRRETURN(MATCH_NOMATCH);
4137 }
4138 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
4139 MRRETURN(MATCH_NOMATCH);
4140 /* No need to skip more bytes - we know it's a 1-byte character */
4141 }
4142 break;
4143
4144 case OP_NOT_WHITESPACE:
4145 for (i = 1; i <= min; i++)
4146 {
4147 if (eptr >= md->end_subject)
4148 {
4149 SCHECK_PARTIAL();
4150 MRRETURN(MATCH_NOMATCH);
4151 }
4152 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
4153 MRRETURN(MATCH_NOMATCH);
4154 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4155 }
4156 break;
4157
4158 case OP_WHITESPACE:
4159 for (i = 1; i <= min; i++)
4160 {
4161 if (eptr >= md->end_subject)
4162 {
4163 SCHECK_PARTIAL();
4164 MRRETURN(MATCH_NOMATCH);
4165 }
4166 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
4167 MRRETURN(MATCH_NOMATCH);
4168 /* No need to skip more bytes - we know it's a 1-byte character */
4169 }
4170 break;
4171
4172 case OP_NOT_WORDCHAR:
4173 for (i = 1; i <= min; i++)
4174 {
4175 if (eptr >= md->end_subject)
4176 {
4177 SCHECK_PARTIAL();
4178 MRRETURN(MATCH_NOMATCH);
4179 }
4180 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
4181 MRRETURN(MATCH_NOMATCH);
4182 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
4183 }
4184 break;
4185
4186 case OP_WORDCHAR:
4187 for (i = 1; i <= min; i++)
4188 {
4189 if (eptr >= md->end_subject)
4190 {
4191 SCHECK_PARTIAL();
4192 MRRETURN(MATCH_NOMATCH);
4193 }
4194 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
4195 MRRETURN(MATCH_NOMATCH);
4196 /* No need to skip more bytes - we know it's a 1-byte character */
4197 }
4198 break;
4199
4200 default:
4201 RRETURN(PCRE_ERROR_INTERNAL);
4202 } /* End switch(ctype) */
4203
4204 else
4205 #endif /* SUPPORT_UTF8 */
4206
4207 /* Code for the non-UTF-8 case for minimum matching of operators other
4208 than OP_PROP and OP_NOTPROP. */
4209
4210 switch(ctype)
4211 {
4212 case OP_ANY:
4213 for (i = 1; i <= min; i++)
4214 {
4215 if (eptr >= md->end_subject)
4216 {
4217 SCHECK_PARTIAL();
4218 MRRETURN(MATCH_NOMATCH);
4219 }
4220 if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
4221 eptr++;
4222 }
4223 break;
4224
4225 case OP_ALLANY:
4226 if (eptr > md->end_subject - min)
4227 {
4228 SCHECK_PARTIAL();
4229 MRRETURN(MATCH_NOMATCH);
4230 }
4231 eptr += min;
4232 break;
4233
4234 case OP_ANYBYTE:
4235 if (eptr > md->end_subject - min)
4236 {
4237 SCHECK_PARTIAL();
4238 MRRETURN(MATCH_NOMATCH);
4239 }
4240 eptr += min;
4241 break;
4242
4243 case OP_ANYNL:
4244 for (i = 1; i <= min; i++)
4245 {
4246 if (eptr >= md->end_subject)
4247 {
4248 SCHECK_PARTIAL();
4249 MRRETURN(MATCH_NOMATCH);
4250 }
4251 switch(*eptr++)
4252 {
4253 default: MRRETURN(MATCH_NOMATCH);
4254
4255 case 0x000d:
4256 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4257 break;
4258
4259 case 0x000a:
4260 break;
4261
4262 case 0x000b:
4263 case 0x000c:
4264 case 0x0085:
4265 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4266 break;
4267 }
4268 }
4269 break;
4270
4271 case OP_NOT_HSPACE:
4272 for (i = 1; i <= min; i++)
4273 {
4274 if (eptr >= md->end_subject)
4275 {
4276 SCHECK_PARTIAL();
4277 MRRETURN(MATCH_NOMATCH);
4278 }
4279 switch(*eptr++)
4280 {
4281 default: break;
4282 case 0x09: /* HT */
4283 case 0x20: /* SPACE */
4284 case 0xa0: /* NBSP */
4285 MRRETURN(MATCH_NOMATCH);
4286 }
4287 }
4288 break;
4289
4290 case OP_HSPACE:
4291 for (i = 1; i <= min; i++)
4292 {
4293 if (eptr >= md->end_subject)
4294 {
4295 SCHECK_PARTIAL();
4296 MRRETURN(MATCH_NOMATCH);
4297 }
4298 switch(*eptr++)
4299 {
4300 default: MRRETURN(MATCH_NOMATCH);
4301 case 0x09: /* HT */
4302 case 0x20: /* SPACE */
4303 case 0xa0: /* NBSP */
4304 break;
4305 }
4306 }
4307 break;
4308
4309 case OP_NOT_VSPACE:
4310 for (i = 1; i <= min; i++)
4311 {
4312 if (eptr >= md->end_subject)
4313 {
4314 SCHECK_PARTIAL();
4315 MRRETURN(MATCH_NOMATCH);
4316 }
4317 switch(*eptr++)
4318 {
4319 default: break;
4320 case 0x0a: /* LF */
4321 case 0x0b: /* VT */
4322 case 0x0c: /* FF */
4323 case 0x0d: /* CR */
4324 case 0x85: /* NEL */
4325 MRRETURN(MATCH_NOMATCH);
4326 }
4327 }
4328 break;
4329
4330 case OP_VSPACE:
4331 for (i = 1; i <= min; i++)
4332 {
4333 if (eptr >= md->end_subject)
4334 {
4335 SCHECK_PARTIAL();
4336 MRRETURN(MATCH_NOMATCH);
4337 }
4338 switch(*eptr++)
4339 {
4340 default: MRRETURN(MATCH_NOMATCH);
4341 case 0x0a: /* LF */
4342 case 0x0b: /* VT */
4343 case 0x0c: /* FF */
4344 case 0x0d: /* CR */
4345 case 0x85: /* NEL */
4346 break;
4347 }
4348 }
4349 break;
4350
4351 case OP_NOT_DIGIT:
4352 for (i = 1; i <= min; i++)
4353 {
4354 if (eptr >= md->end_subject)
4355 {
4356 SCHECK_PARTIAL();
4357 MRRETURN(MATCH_NOMATCH);
4358 }
4359 if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4360 }
4361 break;
4362
4363 case OP_DIGIT:
4364 for (i = 1; i <= min; i++)
4365 {
4366 if (eptr >= md->end_subject)
4367 {
4368 SCHECK_PARTIAL();
4369 MRRETURN(MATCH_NOMATCH);
4370 }
4371 if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4372 }
4373 break;
4374
4375 case OP_NOT_WHITESPACE:
4376 for (i = 1; i <= min; i++)
4377 {
4378 if (eptr >= md->end_subject)
4379 {
4380 SCHECK_PARTIAL();
4381 MRRETURN(MATCH_NOMATCH);
4382 }
4383 if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4384 }
4385 break;
4386
4387 case OP_WHITESPACE:
4388 for (i = 1; i <= min; i++)
4389 {
4390 if (eptr >= md->end_subject)
4391 {
4392 SCHECK_PARTIAL();
4393 MRRETURN(MATCH_NOMATCH);
4394 }
4395 if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4396 }
4397 break;
4398
4399 case OP_NOT_WORDCHAR:
4400 for (i = 1; i <= min; i++)
4401 {
4402 if (eptr >= md->end_subject)
4403 {
4404 SCHECK_PARTIAL();
4405 MRRETURN(MATCH_NOMATCH);
4406 }
4407 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4408 MRRETURN(MATCH_NOMATCH);
4409 }
4410 break;
4411
4412 case OP_WORDCHAR:
4413 for (i = 1; i <= min; i++)
4414 {
4415 if (eptr >= md->end_subject)
4416 {
4417 SCHECK_PARTIAL();
4418 MRRETURN(MATCH_NOMATCH);
4419 }
4420 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4421 MRRETURN(MATCH_NOMATCH);
4422 }
4423 break;
4424
4425 default:
4426 RRETURN(PCRE_ERROR_INTERNAL);
4427 }
4428 }
4429
4430 /* If min = max, continue at the same level without recursing */
4431
4432 if (min == max) continue;
4433
4434 /* If minimizing, we have to test the rest of the pattern before each
4435 subsequent match. Again, separate the UTF-8 case for speed, and also
4436 separate the UCP cases. */
4437
4438 if (minimize)
4439 {
4440 #ifdef SUPPORT_UCP
4441 if (prop_type >= 0)
4442 {
4443 switch(prop_type)
4444 {
4445 case PT_ANY:
4446 for (fi = min;; fi++)
4447 {
4448 RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
4449 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4450 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4451 if (eptr >= md->end_subject)
4452 {
4453 SCHECK_PARTIAL();
4454 MRRETURN(MATCH_NOMATCH);
4455 }
4456 GETCHARINCTEST(c, eptr);
4457 if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
4458 }
4459 /* Control never gets here */
4460
4461 case PT_LAMP:
4462 for (fi = min;; fi++)
4463 {
4464 RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
4465 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4466 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4467 if (eptr >= md->end_subject)
4468 {
4469 SCHECK_PARTIAL();
4470 MRRETURN(MATCH_NOMATCH);
4471 }
4472 GETCHARINCTEST(c, eptr);
4473 prop_chartype = UCD_CHARTYPE(c);
4474 if ((prop_chartype == ucp_Lu ||
4475 prop_chartype == ucp_Ll ||
4476 prop_chartype == ucp_Lt) == prop_fail_result)
4477 MRRETURN(MATCH_NOMATCH);
4478 }
4479 /* Control never gets here */
4480
4481 case PT_GC:
4482 for (fi = min;; fi++)
4483 {
4484 RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
4485 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4486 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 MRRETURN(MATCH_NOMATCH);
4491 }
4492 GETCHARINCTEST(c, eptr);
4493 prop_category = UCD_CATEGORY(c);
4494 if ((prop_category == prop_value) == prop_fail_result)
4495 MRRETURN(MATCH_NOMATCH);
4496 }
4497 /* Control never gets here */
4498
4499 case PT_PC:
4500 for (fi = min;; fi++)
4501 {
4502 RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
4503 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4504 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4505 if (eptr >= md->end_subject)
4506 {
4507 SCHECK_PARTIAL();
4508 MRRETURN(MATCH_NOMATCH);
4509 }
4510 GETCHARINCTEST(c, eptr);
4511 prop_chartype = UCD_CHARTYPE(c);
4512 if ((prop_chartype == prop_value) == prop_fail_result)
4513 MRRETURN(MATCH_NOMATCH);
4514 }
4515 /* Control never gets here */
4516
4517 case PT_SC:
4518 for (fi = min;; fi++)
4519 {
4520 RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
4521 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4522 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4523 if (eptr >= md->end_subject)
4524 {
4525 SCHECK_PARTIAL();
4526 MRRETURN(MATCH_NOMATCH);
4527 }
4528 GETCHARINCTEST(c, eptr);
4529 prop_script = UCD_SCRIPT(c);
4530 if ((prop_script == prop_value) == prop_fail_result)
4531 MRRETURN(MATCH_NOMATCH);
4532 }
4533 /* Control never gets here */
4534
4535 case PT_ALNUM:
4536 for (fi = min;; fi++)
4537 {
4538 RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
4539 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4540 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4541 if (eptr >= md->end_subject)
4542 {
4543 SCHECK_PARTIAL();
4544 MRRETURN(MATCH_NOMATCH);
4545 }
4546 GETCHARINCTEST(c, eptr);
4547 prop_category = UCD_CATEGORY(c);
4548 if ((prop_category == ucp_L || prop_category == ucp_N)
4549 == prop_fail_result)
4550 MRRETURN(MATCH_NOMATCH);
4551 }
4552 /* Control never gets here */
4553
4554 case PT_SPACE: /* Perl space */
4555 for (fi = min;; fi++)
4556 {
4557 RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
4558 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4559 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4560 if (eptr >= md->end_subject)
4561 {
4562 SCHECK_PARTIAL();
4563 MRRETURN(MATCH_NOMATCH);
4564 }
4565 GETCHARINCTEST(c, eptr);
4566 prop_category = UCD_CATEGORY(c);
4567 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4568 c == CHAR_FF || c == CHAR_CR)
4569 == prop_fail_result)
4570 MRRETURN(MATCH_NOMATCH);
4571 }
4572 /* Control never gets here */
4573
4574 case PT_PXSPACE: /* POSIX space */
4575 for (fi = min;; fi++)
4576 {
4577 RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
4578 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4579 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4580 if (eptr >= md->end_subject)
4581 {
4582 SCHECK_PARTIAL();
4583 MRRETURN(MATCH_NOMATCH);
4584 }
4585 GETCHARINCTEST(c, eptr);
4586 prop_category = UCD_CATEGORY(c);
4587 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4588 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4589 == prop_fail_result)
4590 MRRETURN(MATCH_NOMATCH);
4591 }
4592 /* Control never gets here */
4593
4594 case PT_WORD:
4595 for (fi = min;; fi++)
4596 {
4597 RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
4598 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4599 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4600 if (eptr >= md->end_subject)
4601 {
4602 SCHECK_PARTIAL();
4603 MRRETURN(MATCH_NOMATCH);
4604 }
4605 GETCHARINCTEST(c, eptr);
4606 prop_category = UCD_CATEGORY(c);
4607 if ((prop_category == ucp_L ||
4608 prop_category == ucp_N ||
4609 c == CHAR_UNDERSCORE)
4610 == prop_fail_result)
4611 MRRETURN(MATCH_NOMATCH);
4612 }
4613 /* Control never gets here */
4614
4615 /* This should never occur */
4616
4617 default:
4618 RRETURN(PCRE_ERROR_INTERNAL);
4619 }
4620 }
4621
4622 /* Match extended Unicode sequences. We will get here only if the
4623 support is in the binary; otherwise a compile-time error occurs. */
4624
4625 else if (ctype == OP_EXTUNI)
4626 {
4627 for (fi = min;; fi++)
4628 {
4629 RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
4630 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4631 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4632 if (eptr >= md->end_subject)
4633 {
4634 SCHECK_PARTIAL();
4635 MRRETURN(MATCH_NOMATCH);
4636 }
4637 GETCHARINCTEST(c, eptr);
4638 prop_category = UCD_CATEGORY(c);
4639 if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
4640 while (eptr < md->end_subject)
4641 {
4642 int len = 1;
4643 if (!utf8) c = *eptr;
4644 else { GETCHARLEN(c, eptr, len); }
4645 prop_category = UCD_CATEGORY(c);
4646 if (prop_category != ucp_M) break;
4647 eptr += len;
4648 }
4649 }
4650 }
4651
4652 else
4653 #endif /* SUPPORT_UCP */
4654
4655 #ifdef SUPPORT_UTF8
4656 /* UTF-8 mode */
4657 if (utf8)
4658 {
4659 for (fi = min;; fi++)
4660 {
4661 RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
4662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4663 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4664 if (eptr >= md->end_subject)
4665 {
4666 SCHECK_PARTIAL();
4667 MRRETURN(MATCH_NOMATCH);
4668 }
4669 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4670 MRRETURN(MATCH_NOMATCH);
4671 GETCHARINC(c, eptr);
4672 switch(ctype)
4673 {
4674 case OP_ANY: /* This is the non-NL case */
4675 case OP_ALLANY:
4676 case OP_ANYBYTE:
4677 break;
4678
4679 case OP_ANYNL:
4680 switch(c)
4681 {
4682 default: MRRETURN(MATCH_NOMATCH);
4683 case 0x000d:
4684 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4685 break;
4686 case 0x000a:
4687 break;
4688
4689 case 0x000b:
4690 case 0x000c:
4691 case 0x0085:
4692 case 0x2028:
4693 case 0x2029:
4694 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4695 break;
4696 }
4697 break;
4698
4699 case OP_NOT_HSPACE:
4700 switch(c)
4701 {
4702 default: break;
4703 case 0x09: /* HT */
4704 case 0x20: /* SPACE */
4705 case 0xa0: /* NBSP */
4706 case 0x1680: /* OGHAM SPACE MARK */
4707 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4708 case 0x2000: /* EN QUAD */
4709 case 0x2001: /* EM QUAD */
4710 case 0x2002: /* EN SPACE */
4711 case 0x2003: /* EM SPACE */
4712 case 0x2004: /* THREE-PER-EM SPACE */
4713 case 0x2005: /* FOUR-PER-EM SPACE */
4714 case 0x2006: /* SIX-PER-EM SPACE */
4715 case 0x2007: /* FIGURE SPACE */
4716 case 0x2008: /* PUNCTUATION SPACE */
4717 case 0x2009: /* THIN SPACE */
4718 case 0x200A: /* HAIR SPACE */
4719 case 0x202f: /* NARROW NO-BREAK SPACE */
4720 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4721 case 0x3000: /* IDEOGRAPHIC SPACE */
4722 MRRETURN(MATCH_NOMATCH);
4723 }
4724 break;
4725
4726 case OP_HSPACE:
4727 switch(c)
4728 {
4729 default: MRRETURN(MATCH_NOMATCH);
4730 case 0x09: /* HT */
4731 case 0x20: /* SPACE */
4732 case 0xa0: /* NBSP */
4733 case 0x1680: /* OGHAM SPACE MARK */
4734 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4735 case 0x2000: /* EN QUAD */
4736 case 0x2001: /* EM QUAD */
4737 case 0x2002: /* EN SPACE */
4738 case 0x2003: /* EM SPACE */
4739 case 0x2004: /* THREE-PER-EM SPACE */
4740 case 0x2005: /* FOUR-PER-EM SPACE */
4741 case 0x2006: /* SIX-PER-EM SPACE */
4742 case 0x2007: /* FIGURE SPACE */
4743 case 0x2008: /* PUNCTUATION SPACE */
4744 case 0x2009: /* THIN SPACE */
4745 case 0x200A: /* HAIR SPACE */
4746 case 0x202f: /* NARROW NO-BREAK SPACE */
4747 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4748 case 0x3000: /* IDEOGRAPHIC SPACE */
4749 break;
4750 }
4751 break;
4752
4753 case OP_NOT_VSPACE:
4754 switch(c)
4755 {
4756 default: break;
4757 case 0x0a: /* LF */
4758 case 0x0b: /* VT */
4759 case 0x0c: /* FF */
4760 case 0x0d: /* CR */
4761 case 0x85: /* NEL */
4762 case 0x2028: /* LINE SEPARATOR */
4763 case 0x2029: /* PARAGRAPH SEPARATOR */
4764 MRRETURN(MATCH_NOMATCH);
4765 }
4766 break;
4767
4768 case OP_VSPACE:
4769 switch(c)
4770 {
4771 default: MRRETURN(MATCH_NOMATCH);
4772 case 0x0a: /* LF */
4773 case 0x0b: /* VT */
4774 case 0x0c: /* FF */
4775 case 0x0d: /* CR */
4776 case 0x85: /* NEL */
4777 case 0x2028: /* LINE SEPARATOR */
4778 case 0x2029: /* PARAGRAPH SEPARATOR */
4779 break;
4780 }
4781 break;
4782
4783 case OP_NOT_DIGIT:
4784 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4785 MRRETURN(MATCH_NOMATCH);
4786 break;
4787
4788 case OP_DIGIT:
4789 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4790 MRRETURN(MATCH_NOMATCH);
4791 break;
4792
4793 case OP_NOT_WHITESPACE:
4794 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4795 MRRETURN(MATCH_NOMATCH);
4796 break;
4797
4798 case OP_WHITESPACE:
4799 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4800 MRRETURN(MATCH_NOMATCH);
4801 break;
4802
4803 case OP_NOT_WORDCHAR:
4804 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4805 MRRETURN(MATCH_NOMATCH);
4806 break;
4807
4808 case OP_WORDCHAR:
4809 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4810 MRRETURN(MATCH_NOMATCH);
4811 break;
4812
4813 default:
4814 RRETURN(PCRE_ERROR_INTERNAL);
4815 }
4816 }
4817 }
4818 else
4819 #endif
4820 /* Not UTF-8 mode */
4821 {
4822 for (fi = min;; fi++)
4823 {
4824 RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
4825 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4826 if (fi >= max) MRRETURN(MATCH_NOMATCH);
4827 if (eptr >= md->end_subject)
4828 {
4829 SCHECK_PARTIAL();
4830 MRRETURN(MATCH_NOMATCH);
4831 }
4832 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4833 MRRETURN(MATCH_NOMATCH);
4834 c = *eptr++;
4835 switch(ctype)
4836 {
4837 case OP_ANY: /* This is the non-NL case */
4838 case OP_ALLANY:
4839 case OP_ANYBYTE:
4840 break;
4841
4842 case OP_ANYNL:
4843 switch(c)
4844 {
4845 default: MRRETURN(MATCH_NOMATCH);
4846 case 0x000d:
4847 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4848 break;
4849
4850 case 0x000a:
4851 break;
4852
4853 case 0x000b:
4854 case 0x000c:
4855 case 0x0085:
4856 if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
4857 break;
4858 }
4859 break;
4860
4861 case OP_NOT_HSPACE:
4862 switch(c)
4863 {
4864 default: break;
4865 case 0x09: /* HT */
4866 case 0x20: /* SPACE */
4867 case 0xa0: /* NBSP */
4868 MRRETURN(MATCH_NOMATCH);
4869 }
4870 break;
4871
4872 case OP_HSPACE:
4873 switch(c)
4874 {
4875 default: MRRETURN(MATCH_NOMATCH);
4876 case 0x09: /* HT */
4877 case 0x20: /* SPACE */
4878 case 0xa0: /* NBSP */
4879 break;
4880 }
4881 break;
4882
4883 case OP_NOT_VSPACE:
4884 switch(c)
4885 {
4886 default: break;
4887 case 0x0a: /* LF */
4888 case 0x0b: /* VT */
4889 case 0x0c: /* FF */
4890 case 0x0d: /* CR */
4891 case 0x85: /* NEL */
4892 MRRETURN(MATCH_NOMATCH);
4893 }
4894 break;
4895
4896 case OP_VSPACE:
4897 switch(c)
4898 {
4899 default: MRRETURN(MATCH_NOMATCH);
4900 case 0x0a: /* LF */
4901 case 0x0b: /* VT */
4902 case 0x0c: /* FF */
4903 case 0x0d: /* CR */
4904 case 0x85: /* NEL */
4905 break;
4906 }
4907 break;
4908
4909 case OP_NOT_DIGIT:
4910 if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4911 break;
4912
4913 case OP_DIGIT:
4914 if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4915 break;
4916
4917 case OP_NOT_WHITESPACE:
4918 if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4919 break;
4920
4921 case OP_WHITESPACE:
4922 if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4923 break;
4924
4925 case OP_NOT_WORDCHAR:
4926 if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
4927 break;
4928
4929 case OP_WORDCHAR:
4930 if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
4931 break;
4932
4933 default:
4934 RRETURN(PCRE_ERROR_INTERNAL);
4935 }
4936 }
4937 }
4938 /* Control never gets here */
4939 }
4940
4941 /* If maximizing, it is worth using inline code for speed, doing the type
4942 test once at the start (i.e. keep it out of the loop). Again, keep the
4943 UTF-8 and UCP stuff separate. */
4944
4945 else
4946 {
4947 pp = eptr; /* Remember where we started */
4948
4949 #ifdef SUPPORT_UCP
4950 if (prop_type >= 0)
4951 {
4952 switch(prop_type)
4953 {
4954 case PT_ANY:
4955 for (i = min; i < max; i++)
4956 {
4957 int len = 1;
4958 if (eptr >= md->end_subject)
4959 {
4960 SCHECK_PARTIAL();
4961 break;
4962 }
4963 GETCHARLENTEST(c, eptr, len);
4964 if (prop_fail_result) break;
4965 eptr+= len;
4966 }
4967 break;
4968
4969 case PT_LAMP:
4970 for (i = min; i < max; i++)
4971 {
4972 int len = 1;
4973 if (eptr >= md->end_subject)
4974 {
4975 SCHECK_PARTIAL();
4976 break;
4977 }
4978 GETCHARLENTEST(c, eptr, len);
4979 prop_chartype = UCD_CHARTYPE(c);
4980 if ((prop_chartype == ucp_Lu ||
4981 prop_chartype == ucp_Ll ||
4982 prop_chartype == ucp_Lt) == prop_fail_result)
4983 break;
4984 eptr+= len;
4985 }
4986 break;
4987
4988 case PT_GC:
4989 for (i = min; i < max; i++)
4990 {
4991 int len = 1;
4992 if (eptr >= md->end_subject)
4993 {
4994 SCHECK_PARTIAL();
4995 break;
4996 }
4997 GETCHARLENTEST(c, eptr, len);
4998 prop_category = UCD_CATEGORY(c);
4999 if ((prop_category == prop_value) == prop_fail_result)
5000 break;
5001 eptr+= len;
5002 }
5003 break;
5004
5005 case PT_PC:
5006 for (i = min; i < max; i++)
5007 {
5008 int len = 1;
5009 if (eptr >= md->end_subject)
5010 {
5011 SCHECK_PARTIAL();
5012 break;
5013 }
5014 GETCHARLENTEST(c, eptr, len);
5015 prop_chartype = UCD_CHARTYPE(c);
5016 if ((prop_chartype == prop_value) == prop_fail_result)
5017 break;
5018 eptr+= len;
5019 }
5020 break;
5021
5022 case PT_SC:
5023 for (i = min; i < max; i++)
5024 {
5025 int len = 1;
5026 if (eptr >= md->end_subject)
5027 {
5028 SCHECK_PARTIAL();
5029 break;
5030 }
5031 GETCHARLENTEST(c, eptr, len);
5032 prop_script = UCD_SCRIPT(c);
5033 if ((prop_script == prop_value) == prop_fail_result)
5034 break;
5035 eptr+= len;
5036 }
5037 break;
5038
5039 case PT_ALNUM:
5040 for (i = min; i < max; i++)
5041 {
5042 int len = 1;
5043 if (eptr >= md->end_subject)
5044 {
5045 SCHECK_PARTIAL();
5046 break;
5047 }
5048 GETCHARLENTEST(c, eptr, len);
5049 prop_category = UCD_CATEGORY(c);
5050 if ((prop_category == ucp_L || prop_category == ucp_N)
5051 == prop_fail_result)
5052 break;
5053 eptr+= len;
5054 }
5055 break;
5056
5057 case PT_SPACE: /* Perl space */
5058 for (i = min; i < max; i++)
5059 {
5060 int len = 1;
5061 if (eptr >= md->end_subject)
5062 {
5063 SCHECK_PARTIAL();
5064 break;
5065 }
5066 GETCHARLENTEST(c, eptr, len);
5067 prop_category = UCD_CATEGORY(c);
5068 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5069 c == CHAR_FF || c == CHAR_CR)
5070 == prop_fail_result)
5071 break;
5072 eptr+= len;
5073 }
5074 break;
5075
5076 case PT_PXSPACE: /* POSIX space */
5077 for (i = min; i < max; i++)
5078 {
5079 int len = 1;
5080 if (eptr >= md->end_subject)
5081 {
5082 SCHECK_PARTIAL();
5083 break;
5084 }
5085 GETCHARLENTEST(c, eptr, len);
5086 prop_category = UCD_CATEGORY(c);
5087 if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
5088 c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5089 == prop_fail_result)
5090 break;
5091 eptr+= len;
5092 }
5093 break;
5094
5095 case PT_WORD:
5096 for (i = min; i < max; i++)
5097 {
5098 int len = 1;
5099 if (eptr >= md->end_subject)
5100 {
5101 SCHECK_PARTIAL();
5102 break;
5103 }
5104 GETCHARLENTEST(c, eptr, len);
5105 prop_category = UCD_CATEGORY(c);
5106 if ((prop_category == ucp_L || prop_category == ucp_N ||
5107 c == CHAR_UNDERSCORE) == prop_fail_result)
5108 break;
5109 eptr+= len;
5110 }
5111 break;
5112
5113 default:
5114 RRETURN(PCRE_ERROR_INTERNAL);
5115 }
5116
5117 /* eptr is now past the end of the maximum run */
5118
5119 if (possessive) continue;
5120 for(;;)
5121 {
5122 RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
5123 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5124 if (eptr-- == pp) break; /* Stop if tried at original pos */
5125 if (utf8) BACKCHAR(eptr);
5126 }
5127 }
5128
5129 /* Match extended Unicode sequences. We will get here only if the
5130 support is in the binary; otherwise a compile-time error occurs. */
5131
5132 else if (ctype == OP_EXTUNI)
5133 {
5134 for (i = min; i < max; i++)
5135 {
5136 if (eptr >= md->end_subject)
5137 {
5138 SCHECK_PARTIAL();
5139 break;
5140 }
5141 GETCHARINCTEST(c, eptr);
5142 prop_category = UCD_CATEGORY(c);
5143 if (prop_category == ucp_M) break;
5144 while (eptr < md->end_subject)
5145 {
5146 int len = 1;
5147 if (!utf8) c = *eptr; else
5148 {
5149 GETCHARLEN(c, eptr, len);
5150 }
5151 prop_category = UCD_CATEGORY(c);
5152 if (prop_category != ucp_M) break;
5153 eptr += len;
5154 }
5155 }
5156
5157 /* eptr is now past the end of the maximum run */
5158
5159 if (possessive) continue;
5160
5161 for(;;)
5162 {
5163 RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
5164 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5165 if (eptr-- == pp) break; /* Stop if tried at original pos */
5166 for (;;) /* Move back over one extended */
5167 {
5168 int len = 1;
5169 if (!utf8) c = *eptr; else
5170 {
5171 BACKCHAR(eptr);
5172 GETCHARLEN(c, eptr, len);
5173 }
5174 prop_category = UCD_CATEGORY(c);
5175 if (prop_category != ucp_M) break;
5176 eptr--;
5177 }
5178 }
5179 }
5180
5181 else
5182 #endif /* SUPPORT_UCP */
5183
5184 #ifdef SUPPORT_UTF8
5185 /* UTF-8 mode */
5186
5187 if (utf8)
5188 {
5189 switch(ctype)
5190 {
5191 case OP_ANY:
5192 if (max < INT_MAX)
5193 {
5194 for (i = min; i < max; i++)
5195 {
5196 if (eptr >= md->end_subject)
5197 {
5198 SCHECK_PARTIAL();
5199 break;
5200 }
5201 if (IS_NEWLINE(eptr)) break;
5202 eptr++;
5203 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5204 }
5205 }
5206
5207 /* Handle unlimited UTF-8 repeat */
5208
5209 else
5210 {
5211 for (i = min; i < max; i++)
5212 {
5213 if (eptr >= md->end_subject)
5214 {
5215 SCHECK_PARTIAL();
5216 break;
5217 }
5218 if (IS_NEWLINE(eptr)) break;
5219 eptr++;
5220 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5221 }
5222 }
5223 break;
5224
5225 case OP_ALLANY:
5226 if (max < INT_MAX)
5227 {
5228 for (i = min; i < max; i++)
5229 {
5230 if (eptr >= md->end_subject)
5231 {
5232 SCHECK_PARTIAL();
5233 break;
5234 }
5235 eptr++;
5236 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5237 }
5238 }
5239 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5240 break;
5241
5242 /* The byte case is the same as non-UTF8 */
5243
5244 case OP_ANYBYTE:
5245 c = max - min;
5246 if (c > (unsigned int)(md->end_subject - eptr))
5247 {
5248 eptr = md->end_subject;
5249 SCHECK_PARTIAL();
5250 }
5251 else eptr += c;
5252 break;
5253
5254 case OP_ANYNL:
5255 for (i = min; i < max; i++)
5256 {
5257 int len = 1;
5258 if (eptr >= md->end_subject)
5259 {
5260 SCHECK_PARTIAL();
5261 break;
5262 }
5263 GETCHARLEN(c, eptr, len);
5264 if (c == 0x000d)
5265 {
5266 if (++eptr >= md->end_subject) break;
5267 if (*eptr == 0x000a) eptr++;
5268 }
5269 else
5270 {
5271 if (c != 0x000a &&
5272 (md->bsr_anycrlf ||
5273 (c != 0x000b && c != 0x000c &&
5274 c != 0x0085 && c != 0x2028 && c != 0x2029)))
5275 break;
5276 eptr += len;
5277 }
5278 }
5279 break;
5280
5281 case OP_NOT_HSPACE:
5282 case OP_HSPACE:
5283 for (i = min; i < max; i++)
5284 {
5285 BOOL gotspace;
5286 int len = 1;
5287 if (eptr >= md->end_subject)
5288 {
5289 SCHECK_PARTIAL();
5290 break;
5291 }
5292 GETCHARLEN(c, eptr, len);
5293 switch(c)
5294 {
5295 default: gotspace = FALSE; break;
5296 case 0x09: /* HT */
5297 case 0x20: /* SPACE */
5298 case 0xa0: /* NBSP */
5299 case 0x1680: /* OGHAM SPACE MARK */
5300 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5301 case 0x2000: /* EN QUAD */
5302 case 0x2001: /* EM QUAD */
5303 case 0x2002: /* EN SPACE */
5304 case 0x2003: /* EM SPACE */
5305 case 0x2004: /* THREE-PER-EM SPACE */
5306 case 0x2005: /* FOUR-PER-EM SPACE */
5307 case 0x2006: /* SIX-PER-EM SPACE */
5308 case 0x2007: /* FIGURE SPACE */
5309 case 0x2008: /* PUNCTUATION SPACE */
5310 case 0x2009: /* THIN SPACE */
5311 case 0x200A: /* HAIR SPACE */
5312 case 0x202f: /* NARROW NO-BREAK SPACE */
5313 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5314 case 0x3000: /* IDEOGRAPHIC SPACE */
5315 gotspace = TRUE;
5316 break;
5317 }
5318 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5319 eptr += len;
5320 }
5321 break;
5322
5323 case OP_NOT_VSPACE:
5324 case OP_VSPACE:
5325 for (i = min; i < max; i++)
5326 {
5327 BOOL gotspace;
5328 int len = 1;
5329 if (eptr >= md->end_subject)
5330 {
5331 SCHECK_PARTIAL();
5332 break;
5333 }
5334 GETCHARLEN(c, eptr, len);
5335 switch(c)
5336 {
5337 default: gotspace = FALSE; break;
5338 case 0x0a: /* LF */
5339 case 0x0b: /* VT */
5340 case 0x0c: /* FF */
5341 case 0x0d: /* CR */
5342 case 0x85: /* NEL */
5343 case 0x2028: /* LINE SEPARATOR */
5344 case 0x2029: /* PARAGRAPH SEPARATOR */
5345 gotspace = TRUE;
5346 break;
5347 }
5348 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5349 eptr += len;
5350 }
5351 break;
5352
5353 case OP_NOT_DIGIT:
5354 for (i = min; i < max; i++)
5355 {
5356 int len = 1;
5357 if (eptr >= md->end_subject)
5358 {
5359 SCHECK_PARTIAL();
5360 break;
5361 }
5362 GETCHARLEN(c, eptr, len);
5363 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5364 eptr+= len;
5365 }
5366 break;
5367
5368 case OP_DIGIT:
5369 for (i = min; i < max; i++)
5370 {
5371 int len = 1;
5372 if (eptr >= md->end_subject)
5373 {
5374 SCHECK_PARTIAL();
5375 break;
5376 }
5377 GETCHARLEN(c, eptr, len);
5378 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5379 eptr+= len;
5380 }
5381 break;
5382
5383 case OP_NOT_WHITESPACE:
5384 for (i = min; i < max; i++)
5385 {
5386 int len = 1;
5387 if (eptr >= md->end_subject)
5388 {
5389 SCHECK_PARTIAL();
5390 break;
5391 }
5392 GETCHARLEN(c, eptr, len);
5393 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5394 eptr+= len;
5395 }
5396 break;
5397
5398 case OP_WHITESPACE:
5399 for (i = min; i < max; i++)
5400 {
5401 int len = 1;
5402 if (eptr >= md->end_subject)
5403 {
5404 SCHECK_PARTIAL();
5405 break;
5406 }
5407 GETCHARLEN(c, eptr, len);
5408 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5409 eptr+= len;
5410 }
5411 break;
5412
5413 case OP_NOT_WORDCHAR:
5414 for (i = min; i < max; i++)
5415 {
5416 int len = 1;
5417 if (eptr >= md->end_subject)
5418 {
5419 SCHECK_PARTIAL();
5420 break;
5421 }
5422 GETCHARLEN(c, eptr, len);
5423 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5424 eptr+= len;
5425 }
5426 break;
5427
5428 case OP_WORDCHAR:
5429 for (i = min; i < max; i++)
5430 {
5431 int len = 1;
5432 if (eptr >= md->end_subject)
5433 {
5434 SCHECK_PARTIAL();
5435 break;
5436 }
5437 GETCHARLEN(c, eptr, len);
5438 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5439 eptr+= len;
5440 }
5441 break;
5442
5443 default:
5444 RRETURN(PCRE_ERROR_INTERNAL);
5445 }
5446
5447 /* eptr is now past the end of the maximum run. If possessive, we are
5448 done (no backing up). Otherwise, match at this position; anything other
5449 than no match is immediately returned. For nomatch, back up one
5450 character, unless we are matching \R and the last thing matched was
5451 \r\n, in which case, back up two bytes. */
5452
5453 if (possessive) continue;
5454 for(;;)
5455 {
5456 RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
5457 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5458 if (eptr-- == pp) break; /* Stop if tried at original pos */
5459 BACKCHAR(eptr);
5460 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5461 eptr[-1] == '\r') eptr--;
5462 }
5463 }
5464 else
5465 #endif /* SUPPORT_UTF8 */
5466
5467 /* Not UTF-8 mode */
5468 {
5469 switch(ctype)
5470 {
5471 case OP_ANY:
5472 for (i = min; i < max; i++)
5473 {
5474 if (eptr >= md->end_subject)
5475 {
5476 SCHECK_PARTIAL();
5477 break;
5478 }
5479 if (IS_NEWLINE(eptr)) break;
5480 eptr++;
5481 }
5482 break;
5483
5484 case OP_ALLANY:
5485 case OP_ANYBYTE:
5486 c = max - min;
5487 if (c > (unsigned int)(md->end_subject - eptr))
5488 {
5489 eptr = md->end_subject;
5490 SCHECK_PARTIAL();
5491 }
5492 else eptr += c;
5493 break;
5494
5495 case OP_ANYNL:
5496 for (i = min; i < max; i++)
5497 {
5498 if (eptr >= md->end_subject)
5499 {
5500 SCHECK_PARTIAL();
5501 break;
5502 }
5503 c = *eptr;
5504 if (c == 0x000d)
5505 {
5506 if (++eptr >= md->end_subject) break;
5507 if (*eptr == 0x000a) eptr++;
5508 }
5509 else
5510 {
5511 if (c != 0x000a &&
5512 (md->bsr_anycrlf ||
5513 (c != 0x000b && c != 0x000c && c != 0x0085)))
5514 break;
5515 eptr++;
5516 }
5517 }
5518 break;
5519
5520 case OP_NOT_HSPACE:
5521 for (i = min; i < max; i++)
5522 {
5523 if (eptr >= md->end_subject)
5524 {
5525 SCHECK_PARTIAL();
5526 break;
5527 }
5528 c = *eptr;
5529 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5530 eptr++;
5531 }
5532 break;
5533
5534 case OP_HSPACE:
5535 for (i = min; i < max; i++)
5536 {
5537 if (eptr >= md->end_subject)
5538 {
5539 SCHECK_PARTIAL();
5540 break;
5541 }
5542 c = *eptr;
5543 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5544 eptr++;
5545 }
5546 break;
5547
5548 case OP_NOT_VSPACE:
5549 for (i = min; i < max; i++)
5550 {
5551 if (eptr >= md->end_subject)
5552 {
5553 SCHECK_PARTIAL();
5554 break;
5555 }
5556 c = *eptr;
5557 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5558 break;
5559 eptr++;
5560 }
5561 break;
5562
5563 case OP_VSPACE:
5564 for (i = min; i < max; i++)
5565 {
5566 if (eptr >= md->end_subject)
5567 {
5568 SCHECK_PARTIAL();
5569 break;
5570 }
5571 c = *eptr;
5572 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5573 break;
5574 eptr++;
5575 }
5576 break;
5577
5578 case OP_NOT_DIGIT:
5579 for (i = min; i < max; i++)
5580 {
5581 if (eptr >= md->end_subject)
5582 {
5583 SCHECK_PARTIAL();
5584 break;
5585 }
5586 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
5587 eptr++;
5588 }
5589 break;
5590
5591 case OP_DIGIT:
5592 for (i = min; i < max; i++)
5593 {
5594 if (eptr >= md->end_subject)
5595 {
5596 SCHECK_PARTIAL();
5597 break;
5598 }
5599 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
5600 eptr++;
5601 }
5602 break;
5603
5604 case OP_NOT_WHITESPACE:
5605 for (i = min; i < max; i++)
5606 {
5607 if (eptr >= md->end_subject)
5608 {
5609 SCHECK_PARTIAL();
5610 break;
5611 }
5612 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
5613 eptr++;
5614 }
5615 break;
5616
5617 case OP_WHITESPACE:
5618 for (i = min; i < max; i++)
5619 {
5620 if (eptr >= md->end_subject)
5621 {
5622 SCHECK_PARTIAL();
5623 break;
5624 }
5625 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
5626 eptr++;
5627 }
5628 break;
5629
5630 case OP_NOT_WORDCHAR:
5631 for (i = min; i < max; i++)
5632 {
5633 if (eptr >= md->end_subject)
5634 {
5635 SCHECK_PARTIAL();
5636 break;
5637 }
5638 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
5639 eptr++;
5640 }
5641 break;
5642
5643 case OP_WORDCHAR:
5644 for (i = min; i < max; i++)
5645 {
5646 if (eptr >= md->end_subject)
5647 {
5648 SCHECK_PARTIAL();
5649 break;
5650 }
5651 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
5652 eptr++;
5653 }
5654 break;
5655
5656 default:
5657 RRETURN(PCRE_ERROR_INTERNAL);
5658 }
5659
5660 /* eptr is now past the end of the maximum run. If possessive, we are
5661 done (no backing up). Otherwise, match at this position; anything other
5662 than no match is immediately returned. For nomatch, back up one
5663 character (byte), unless we are matching \R and the last thing matched
5664 was \r\n, in which case, back up two bytes. */
5665
5666 if (possessive) continue;
5667 while (eptr >= pp)
5668 {
5669 RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
5670 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5671 eptr--;
5672 if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
5673 eptr[-1] == '\r') eptr--;
5674 }
5675 }
5676
5677 /* Get here if we can't make it match with any permitted repetitions */
5678
5679 MRRETURN(MATCH_NOMATCH);
5680 }
5681 /* Control never gets here */
5682
5683 /* There's been some horrible disaster. Arrival here can only mean there is
5684 something seriously wrong in the code above or the OP_xxx definitions. */
5685
5686 default:
5687 DPRINTF(("Unknown opcode %d\n", *ecode));
5688 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5689 }
5690
5691 /* Do not stick any code in here without much thought; it is assumed
5692 that "continue" in the code above comes out to here to repeat the main
5693 loop. */
5694
5695 } /* End of main loop */
5696 /* Control never reaches here */
5697
5698
5699 /* When compiling to use the heap rather than the stack for recursive calls to
5700 match(), the RRETURN() macro jumps here. The number that is saved in
5701 frame->Xwhere indicates which label we actually want to return to. */
5702
5703 #ifdef NO_RECURSE
5704 #define LBL(val) case val: goto L_RM##val;
5705 HEAP_RETURN:
5706 switch (frame->Xwhere)
5707 {
5708 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5709 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5710 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5711 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5712 LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63)
5713 #ifdef SUPPORT_UTF8
5714 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5715 LBL(32) LBL(34) LBL(42) LBL(46)
5716 #ifdef SUPPORT_UCP
5717 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5718 LBL(59) LBL(60) LBL(61) LBL(62)
5719 #endif /* SUPPORT_UCP */
5720 #endif /* SUPPORT_UTF8 */
5721 default:
5722 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5723 return PCRE_ERROR_INTERNAL;
5724 }
5725 #undef LBL
5726 #endif /* NO_RECURSE */
5727 }
5728
5729
5730 /***************************************************************************
5731 ****************************************************************************
5732 RECURSION IN THE match() FUNCTION
5733
5734 Undefine all the macros that were defined above to handle this. */
5735
5736 #ifdef NO_RECURSE /* heap-based recursion build: undo the macros defined for match() */
5737 #undef eptr
5738 #undef ecode
5739 #undef mstart
5740 #undef offset_top
5741 #undef eptrb
5742 #undef flags
5743
5744 #undef callpat
5745 #undef charptr
5746 #undef data
5747 #undef next
5748 #undef pp
5749 #undef prev
5750 #undef saved_eptr
5751
5752 #undef new_recursive
5753
5754 #undef cur_is_word
5755 #undef condition
5756 #undef prev_is_word
5757
5758 #undef ctype
5759 #undef length
5760 #undef max
5761 #undef min
5762 #undef number
5763 #undef offset
5764 #undef op
5765 #undef save_capture_last
5766 #undef save_offset1
5767 #undef save_offset2
5768 #undef save_offset3
5769 #undef stacksave
5770
5771 #undef newptrb
5772
5773 #endif /* NO_RECURSE */
5774
5775 /* These two are defined as macros in both cases (with or without NO_RECURSE), so they are undefined unconditionally */
5776
5777 #undef fc
5778 #undef fi
5779
5780 /***************************************************************************
5781 ***************************************************************************/
5782
5783
5784
5785 /*************************************************
5786 * Execute a Regular Expression *
5787 *************************************************/
5788
5789 /* This function applies a compiled re to a subject string and picks out
5790 portions of the string if it matches. Two elements in the vector are set for
5791 each substring: the offsets to the start and end of the substring.
5792
5793 Arguments:
5794 argument_re points to the compiled expression
5795 extra_data points to extra data or is NULL
5796 subject points to the subject string
5797 length length of subject string (may contain binary zeros)
5798 start_offset where to start in the subject string
5799 options option bits
5800 offsets points to a vector of ints to be filled in with offsets
5801 offsetcount the number of elements in the vector
5802
5803 Returns: > 0 => success; value is the number of elements filled in
5804 = 0 => success, but offsets is not big enough
5805 -1 => failed to match
5806 < -1 => some kind of unexpected problem
5807 */
5808
5809 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5810 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5811 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5812 int offsetcount)
5813 {
5814 int rc, ocount;
5815 int first_byte = -1;
5816 int req_byte = -1;
5817 int req_byte2 = -1;
5818 int newline;
5819 BOOL using_temporary_offsets = FALSE;
5820 BOOL anchored;
5821 BOOL startline;
5822 BOOL firstline;
5823 BOOL first_byte_caseless = FALSE;
5824 BOOL req_byte_caseless = FALSE;
5825 BOOL utf8;
5826 match_data match_block;
5827 match_data *md = &match_block;
5828 const uschar *tables;
5829 const uschar *start_bits = NULL;
5830 USPTR start_match = (USPTR)subject + start_offset;
5831 USPTR end_subject;
5832 USPTR start_partial = NULL;
5833 USPTR req_byte_ptr = start_match - 1;
5834
5835 pcre_study_data internal_study;
5836 const pcre_study_data *study;
5837
5838 real_pcre internal_re;
5839 const real_pcre *external_re = (const real_pcre *)argument_re;
5840 const real_pcre *re = external_re;
5841
5842 /* Plausibility checks */
5843
5844 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5845 if (re == NULL || subject == NULL ||
5846 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5847 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5848 if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
5849
5850 /* This information is for finding all the numbers associated with a given
5851 name, for condition testing. */
5852
5853 md->name_table = (uschar *)re + re->name_table_offset;
5854 md->name_count = re->name_count;
5855 md->name_entry_size = re->name_entry_size;
5856
5857 /* Fish out the optional data from the extra_data structure, first setting
5858 the default values. */
5859
5860 study = NULL;
5861 md->match_limit = MATCH_LIMIT;
5862 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5863 md->callout_data = NULL;
5864
5865 /* The table pointer is always in native byte order. */
5866
5867 tables = external_re->tables;
5868
5869 if (extra_data != NULL)
5870 {
5871 register unsigned int flags = extra_data->flags;
5872 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5873 study = (const pcre_study_data *)extra_data->study_data;
5874 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5875 md->match_limit = extra_data->match_limit;
5876 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5877 md->match_limit_recursion = extra_data->match_limit_recursion;
5878 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5879 md->callout_data = extra_data->callout_data;
5880 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5881 }
5882
5883 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5884 is a feature that makes it possible to save compiled regex and re-use them
5885 in other programs later. */
5886
5887 if (tables == NULL) tables = _pcre_default_tables;
5888
5889 /* Check that the first field in the block is the magic number. If it is not,
5890 test for a regex that was compiled on a host of opposite endianness. If this is
5891 the case, flipped values are put in internal_re and internal_study if there was
5892 study data too. */
5893
5894 if (re->magic_number != MAGIC_NUMBER)
5895 {
5896 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5897 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5898 if (study != NULL) study = &internal_study;
5899 }
5900
5901 /* Set up other data */
5902
5903 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5904 startline = (re->flags & PCRE_STARTLINE) != 0;
5905 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5906
5907 /* The code starts after the real_pcre block and the capture name table. */
5908
5909 md->start_code = (const uschar *)external_re + re->name_table_offset +
5910 re->name_count * re->name_entry_size;
5911
5912 md->start_subject = (USPTR)subject;
5913 md->start_offset = start_offset;
5914 md->end_subject = md->start_subject + length;
5915 end_subject = md->end_subject;
5916
5917 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5918 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5919 md->use_ucp = (re->options & PCRE_UCP) != 0;
5920 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5921
5922 /* Some options are unpacked into BOOL variables in the hope that testing
5923 them will be faster than individual option bits. */
5924
5925 md->notbol = (options & PCRE_NOTBOL) != 0;
5926 md->noteol = (options & PCRE_NOTEOL) != 0;
5927 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5928 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5929 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5930 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5931
5932
5933 md->hitend = FALSE;
5934 md->mark = NULL; /* In case never set */
5935
5936 md->recursive = NULL; /* No recursion at top level */
5937
5938 md->lcc = tables + lcc_offset;
5939 md->ctypes = tables + ctypes_offset;
5940
5941 /* Handle different \R options. */
5942
5943 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5944 {
5945 case 0:
5946 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5947 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5948 else
5949 #ifdef BSR_ANYCRLF
5950 md->bsr_anycrlf = TRUE;
5951 #else
5952 md->bsr_anycrlf = FALSE;
5953 #endif
5954 break;
5955
5956 case PCRE_BSR_ANYCRLF:
5957 md->bsr_anycrlf = TRUE;
5958 break;
5959
5960 case PCRE_BSR_UNICODE:
5961 md->bsr_anycrlf = FALSE;
5962 break;
5963
5964 default: return PCRE_ERROR_BADNEWLINE;
5965 }
5966
5967 /* Handle different types of newline. The three bits give eight cases. If
5968 nothing is set at run time, whatever was used at compile time applies. */
5969
5970 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5971 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5972 {
5973 case 0: newline = NEWLINE; break; /* Compile-time default */
5974 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5975 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5976 case PCRE_NEWLINE_CR+
5977 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5978 case PCRE_NEWLINE_ANY: newline = -1; break;
5979 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5980 default: return PCRE_ERROR_BADNEWLINE;
5981 }
5982
5983 if (newline == -2)
5984 {
5985 md->nltype = NLTYPE_ANYCRLF;
5986 }
5987 else if (newline < 0)
5988 {
5989 md->nltype = NLTYPE_ANY;
5990 }
5991 else
5992 {
5993 md->nltype = NLTYPE_FIXED;
5994 if (newline > 255)
5995 {
5996 md->nllen = 2;
5997 md->nl[0] = (newline >> 8) & 255;
5998 md->nl[1] = newline & 255;
5999 }
6000 else
6001 {
6002 md->nllen = 1;
6003 md->nl[0] = newline;
6004 }
6005 }
6006
6007 /* Partial matching was originally supported only for a restricted set of
6008 regexes; from release 8.00 there are no restrictions, but the bits are still
6009 defined (though never set). So there's no harm in leaving this code. */
6010
6011 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6012 return PCRE_ERROR_BADPARTIAL;
6013
6014 /* Check a UTF-8 string if required. Pass back the character offset and error
6015 code for an invalid string if a results vector is available. */
6016
6017 #ifdef SUPPORT_UTF8
6018 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
6019 {
6020 int erroroffset;
6021 int errorcode = _pcre_valid_utf8((USPTR)subject, length, &erroroffset);
6022 if (errorcode != 0)
6023 {
6024 if (offsetcount >= 2)
6025 {
6026 offsets[0] = erroroffset;
6027 offsets[1] = errorcode;
6028 }
6029 return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6030 PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6031 }
6032
6033 /* Check that a start_offset points to the start of a UTF-8 character. */
6034
6035 if (start_offset > 0 && start_offset < length &&
6036 (((USPTR)subject)[start_offset] & 0xc0) == 0x80)
6037 return PCRE_ERROR_BADUTF8_OFFSET;
6038 }
6039 #endif
6040
6041 /* If the expression has got more back references than the offsets supplied can
6042 hold, we get a temporary chunk of working store to use during the matching.
6043 Otherwise, we can use the vector supplied, rounding down its size to a multiple
6044 of 3. */
6045
6046 ocount = offsetcount - (offsetcount % 3);
6047
6048 if (re->top_backref > 0 && re->top_backref >= ocount/3)
6049 {
6050 ocount = re->top_backref * 3 + 3;
6051 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
6052 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6053 using_temporary_offsets = TRUE;
6054 DPRINTF(("Got memory to hold back references\n"));
6055 }
6056 else md->offset_vector = offsets;
6057
6058 md->offset_end = ocount;
6059 md->offset_max = (2*ocount)/3;
6060 md->offset_overflow = FALSE;
6061 md->capture_last = -1;
6062
6063 /* Reset the working variable associated with each extraction. These should
6064 never be used unless previously set, but they get saved and restored, and so we
6065 initialize them to avoid reading uninitialized locations. Also, unset the
6066 offsets for the matched string. This is really just for tidiness with callouts,
6067 in case they inspect these fields. */
6068
6069 if (md->offset_vector != NULL)
6070 {
6071 register int *iptr = md->offset_vector + ocount;
6072 register int *iend = iptr - re->top_bracket;
6073 if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
6074 while (--iptr >= iend) *iptr = -1;
6075 md->offset_vector[0] = md->offset_vector[1] = -1;
6076 }
6077
6078 /* Set up the first character to match, if available. The first_byte value is
6079 never set for an anchored regular expression, but the anchoring may be forced
6080 at run time, so we have to test for anchoring. The first char may be unset for
6081 an unanchored pattern, of course. If there's no first char and the pattern was
6082 studied, there may be a bitmap of possible first characters. */
6083
6084 if (!anchored)
6085 {
6086 if ((re->flags & PCRE_FIRSTSET) != 0)
6087 {
6088 first_byte = re->first_byte & 255;
6089 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
6090 first_byte = md->lcc[first_byte];
6091 }
6092 else
6093 if (!startline && study != NULL &&
6094 (study->flags & PCRE_STUDY_MAPPED) != 0)
6095 start_bits = study->start_bits;
6096 }
6097
6098 /* For anchored or unanchored matches, there may be a "last known required
6099 character" set. */
6100
6101 if ((re->flags & PCRE_REQCHSET) != 0)
6102 {
6103 req_byte = re->req_byte & 255;
6104 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
6105 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
6106 }
6107
6108
6109
6110
6111 /* ==========================================================================*/
6112
6113 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6114 the loop runs just once. */
6115
6116 for(;;)
6117 {
6118 USPTR save_end_subject = end_subject;
6119 USPTR new_start_match;
6120
6121 /* If firstline is TRUE, the start of the match is constrained to the first
6122 line of a multiline string. That is, the match must be before or at the first
6123 newline. Implement this by temporarily adjusting end_subject so that we stop
6124 scanning at a newline. If the match fails at the newline, later code breaks
6125 this loop. */
6126
6127 if (firstline)
6128 {
6129 USPTR t = start_match;
6130 #ifdef SUPPORT_UTF8
6131 if (utf8)
6132 {
6133 while (t < md->end_subject && !IS_NEWLINE(t))
6134 {
6135 t++;
6136 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
6137 }
6138 }
6139 else
6140 #endif
6141 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6142 end_subject = t;
6143 }
6144
6145 /* There are some optimizations that avoid running the match if a known
6146 starting point is not found, or if a known later character is not present.
6147 However, there is an option that disables these, for testing and for ensuring
6148 that all callouts do actually occur. The option can be set in the regex by
6149 (*NO_START_OPT) or passed in match-time options. */
6150
6151 if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
6152 {
6153 /* Advance to a unique first byte if there is one. */
6154
6155 if (first_byte >= 0)
6156 {
6157 if (first_byte_caseless)
6158 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
6159 start_match++;
6160 else
6161 while (start_match < end_subject && *start_match != first_byte)
6162 start_match++;
6163 }
6164
6165 /* Or to just after a linebreak for a multiline match */
6166
6167 else if (startline)
6168 {
6169 if (start_match > md->start_subject + start_offset)
6170 {
6171 #ifdef SUPPORT_UTF8
6172 if (utf8)
6173 {
6174 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6175 {
6176 start_match++;
6177 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6178 start_match++;
6179 }
6180 }
6181 else
6182 #endif
6183 while (start_match < end_subject && !WAS_NEWLINE(start_match))
6184 start_match++;
6185
6186 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6187 and we are now at a LF, advance the match position by one more character.
6188 */
6189
6190 if (start_match[-1] == CHAR_CR &&
6191 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6192 start_match < end_subject &&
6193 *start_match == CHAR_NL)
6194 start_match++;
6195 }
6196 }
6197
6198 /* Or to a non-unique first byte after study */
6199
6200 else if (start_bits != NULL)
6201 {
6202 while (start_match < end_subject)
6203 {
6204 register unsigned int c = *start_match;
6205 if ((start_bits[c/8] & (1 << (c&7))) == 0)
6206 {
6207 start_match++;
6208 #ifdef SUPPORT_UTF8
6209 if (utf8)
6210 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6211 start_match++;
6212 #endif
6213 }
6214 else break;
6215 }
6216 }
6217 } /* Starting optimizations */
6218
6219 /* Restore fudged end_subject */
6220
6221 end_subject = save_end_subject;
6222
6223 /* The following two optimizations are disabled for partial matching or if
6224 disabling is explicitly requested. */
6225
6226 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6227 {
6228 /* If the pattern was studied, a minimum subject length may be set. This is
6229 a lower bound; no actual string of that length may actually match the
6230 pattern. Although the value is, strictly, in characters, we treat it as
6231 bytes to avoid spending too much time in this optimization. */
6232
6233 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6234 (pcre_uint32)(end_subject - start_match) < study->minlength)
6235 {
6236 rc = MATCH_NOMATCH;
6237 break;
6238 }
6239
6240 /* If req_byte is set, we know that that character must appear in the
6241 subject for the match to succeed. If the first character is set, req_byte
6242 must be later in the subject; otherwise the test starts at the match point.
6243 This optimization can save a huge amount of backtracking in patterns with
6244 nested unlimited repeats that aren't going to match. Writing separate code
6245 for cased/caseless versions makes it go faster, as does using an
6246 autoincrement and backing off on a match.
6247
6248 HOWEVER: when the subject string is very, very long, searching to its end
6249 can take a long time, and give bad performance on quite ordinary patterns.
6250 This showed up when somebody was matching something like /^\d+C/ on a
6251 32-megabyte string... so we don't do this when the string is sufficiently
6252 long. */
6253
6254 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
6255 {
6256 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6257
6258 /* We don't need to repeat the search if we haven't yet reached the
6259 place we found it at last time. */
6260
6261 if (p > req_byte_ptr)
6262 {
6263 if (req_byte_caseless)
6264 {
6265 while (p < end_subject)
6266 {
6267 register int pp = *p++;
6268 if (pp == req_byte || pp == req_byte2) { p--; break; }
6269 }
6270 }
6271 else
6272 {
6273 while (p < end_subject)
6274 {
6275 if (*p++ == req_byte) { p--; break; }
6276 }
6277 }
6278
6279 /* If we can't find the required character, break the matching loop,
6280 forcing a match failure. */
6281
6282 if (p >= end_subject)
6283 {
6284 rc = MATCH_NOMATCH;
6285 break;
6286 }
6287
6288 /* If we have found the required character, save the point where we
6289 found it, so that we don't search again next time round the loop if
6290 the start hasn't passed this character yet. */
6291
6292 req_byte_ptr = p;
6293 }
6294 }
6295 }
6296
6297 #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6298 printf(">>>> Match against: ");
6299 pchars(start_match, end_subject - start_match, TRUE, md);
6300 printf("\n");
6301 #endif
6302
6303 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6304 first starting point for which a partial match was found. */
6305
6306 md->start_match_ptr = start_match;
6307 md->start_used_ptr = start_match;
6308 md->match_call_count = 0;
6309 md->match_function_type = 0;
6310 md->end_offset_top = 0;
6311 rc = match(start_match, md->start_code, start_match, NULL, 2, md, NULL, 0);
6312 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
6313
6314 switch(rc)
6315 {
6316 /* SKIP passes back the next starting point explicitly, but if it is the
6317 same as the match we have just done, treat it as NOMATCH. */
6318
6319 case MATCH_SKIP:
6320 if (md->start_match_ptr != start_match)
6321 {
6322 new_start_match = md->start_match_ptr;
6323 break;
6324 }
6325 /* Fall through */
6326
6327 /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6328 the SKIP's arg was not found. We also treat this as NOMATCH. */
6329
6330 case MATCH_SKIP_ARG:
6331 /* Fall through */
6332
6333 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6334 exactly like PRUNE. */
6335
6336 case MATCH_NOMATCH:
6337 case MATCH_PRUNE:
6338 case MATCH_THEN:
6339 new_start_match = start_match + 1;
6340 #ifdef SUPPORT_UTF8
6341 if (utf8)
6342 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6343 new_start_match++;
6344 #endif
6345 break;
6346
6347 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6348
6349 case MATCH_COMMIT:
6350 rc = MATCH_NOMATCH;
6351 goto ENDLOOP;
6352
6353 /* Any other return is either a match, or some kind of error. */
6354
6355 default:
6356 goto ENDLOOP;
6357 }
6358
6359 /* Control reaches here for the various types of "no match at this point"
6360 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6361
6362 rc = MATCH_NOMATCH;
6363
6364 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6365 newline in the subject (though it may continue over the newline). Therefore,
6366 if we have just failed to match, starting at a newline, do not continue. */
6367
6368 if (firstline && IS_NEWLINE(start_match)) break;
6369
6370 /* Advance to new matching position */
6371
6372 start_match = new_start_match;
6373
6374 /* Break the loop if the pattern is anchored or if we have passed the end of
6375 the subject. */
6376
6377 if (anchored || start_match > end_subject) break;
6378
6379 /* If we have just passed a CR and we are now at a LF, and the pattern does
6380 not contain any explicit matches for \r or \n, and the newline option is CRLF
6381 or ANY or ANYCRLF, advance the match position by one more character. */
6382
6383 if (start_match[-1] == CHAR_CR &&
6384 start_match < end_subject &&
6385 *start_match == CHAR_NL &&
6386 (re->flags & PCRE_HASCRORLF) == 0 &&
6387 (md->nltype == NLTYPE_ANY ||
6388 md->nltype == NLTYPE_ANYCRLF ||
6389 md->nllen == 2))
6390 start_match++;
6391
6392 md->mark = NULL; /* Reset for start of next match attempt */
6393 } /* End of for(;;) "bumpalong" loop */
6394
6395 /* ==========================================================================*/
6396
6397 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6398 conditions is true:
6399
6400 (1) The pattern is anchored or the match was failed by (*COMMIT);
6401
6402 (2) We are past the end of the subject;
6403
6404 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6405 this option requests that a match occur at or before the first newline in
6406 the subject.
6407
6408 When we have a match and the offset vector is big enough to deal with any
6409 backreferences, captured substring offsets will already be set up. In the case
6410 where we had to get some local store to hold offsets for backreference
6411 processing, copy those that we can. In this case there need not be overflow if
6412 certain parts of the pattern were not used, even though there are more
6413 capturing parentheses than vector slots. */
6414
6415 ENDLOOP:
6416
6417 if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
6418 {
6419 if (using_temporary_offsets)
6420 {
6421 if (offsetcount >= 4)
6422 {
6423 memcpy(offsets + 2, md->offset_vector + 2,
6424 (offsetcount - 2) * sizeof(int));
6425 DPRINTF(("Copied offsets from temporary memory\n"));
6426 }
6427 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6428 DPRINTF(("Freeing temporary memory\n"));
6429 (pcre_free)(md->offset_vector);
6430 }
6431
6432 /* Set the return code to the number of captured strings, or 0 if there are
6433 too many to fit into the vector. */
6434
6435 rc = md->offset_overflow? 0 : md->end_offset_top/2;
6436
6437 /* If there is space, set up the whole thing as substring 0. The value of
6438 md->start_match_ptr might be modified if \K was encountered on the success
6439 matching path. */
6440
6441 if (offsetcount < 2) rc = 0; else
6442 {
6443 offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6444 offsets[1] = (int)(md->end_match_ptr - md->start_subject);
6445 }
6446
6447 DPRINTF((">>>> returning %d\n", rc));
6448 goto RETURN_MARK;
6449 }
6450
6451 /* Control gets here if there has been an error, or if the overall match
6452 attempt has failed at all permitted starting positions. */
6453
6454 if (using_temporary_offsets)
6455 {
6456 DPRINTF(("Freeing temporary memory\n"));
6457 (pcre_free)(md->offset_vector);
6458 }
6459
6460 /* For anything other than nomatch or partial match, just return the code. */
6461
6462 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
6463 {
6464 DPRINTF((">>>> error: returning %d\n", rc));
6465 return rc;
6466 }
6467
6468 /* Handle partial matches - disable any mark data */
6469
6470 if (start_partial != NULL)
6471 {
6472 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
6473 md->mark = NULL;
6474 if (offsetcount > 1)
6475 {
6476 offsets[0] = (int)(start_partial - (USPTR)subject);
6477 offsets[1] = (int)(end_subject - (USPTR)subject);
6478 }
6479 rc = PCRE_ERROR_PARTIAL;
6480 }
6481
6482 /* This is the classic nomatch case */
6483
6484 else
6485 {
6486 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
6487 rc = PCRE_ERROR_NOMATCH;
6488 }
6489
6490 /* Return the MARK data if it has been requested. */
6491
6492 RETURN_MARK:
6493
6494 if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6495 *(extra_data->mark) = (unsigned char *)(md->mark);
6496 return rc;
6497 }
6498
6499 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12