/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 463 - (show annotations) (download)
Sun Oct 18 10:02:46 2009 UTC (5 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 170608 byte(s)
Further tidies to partial matching.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
/* Minimum and maximum repeat counts for the six common repeat forms, held in
two parallel tables that share one index. A maximum of 0 stands for "no upper
limit". NOTE(review): the index is presumably the repeat opcode minus the
first opcode of its family - confirm against the uses inside match(). */

static const char rep_min[] = {
  0, 0,     /* entries 0 and 1 */
  1, 1,     /* entries 2 and 3 */
  0, 0      /* entries 4 and 5 */
};

static const char rep_max[] = {
  0, 0,     /* entries 0 and 1: unlimited */
  0, 0,     /* entries 2 and 3: unlimited */
  1, 1      /* entries 4 and 5: at most one */
};
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
/* Identifiers for the individual RMATCH call sites: RM1 is 1 and the values
run consecutively up to RM54. Each row below starts with an explicit anchor
value so that the numbering is easy to check by eye. */

enum {
  RM1  = 1,  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11 = 11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21 = 21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41 = 41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51 = 51, RM52, RM53, RM54
};
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
#ifndef DEBUG

/* Production versions: RMATCH is a genuine recursive call to match() whose
result is left in rrc, and RRETURN is a plain return. mstart and rdepth are
taken from the calling scope; the "rw" argument is ignored. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#else  /* DEBUG */

/* Debugging versions: the same call and return, bracketed by trace output
that reports the source line of the call or return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#endif  /* DEBUG */
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* RMATCH simulates a recursive call to match() without using the C stack.
It obtains a new heapframe from pcre_stack_malloc, records "rw" in the
current frame's Xwhere field, copies the would-be arguments into the new
frame (with the recursion depth one greater than the current frame's), links
the new frame back to the current one, makes it current, and jumps to
HEAP_RECURSE. The label L_<rw> that follows the jump is the point at which
execution continues once the simulated call has returned, with its result in
rrc. The "rd" argument (the md pointer) is not used.

If the frame cannot be allocated, the simulated call is not made: the current
frame returns PCRE_ERROR_NOMEMORY via RRETURN, so the failure propagates
outwards as an error return instead of dereferencing a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). The current frame is unlinked and
released with pcre_stack_free. If a previous frame exists, the value "ra" is
placed in rrc and control goes to HEAP_RETURN; if there is none (the released
frame was the top-level one), "ra" is returned from the real function. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 USPTR Xeptr;
326 const uschar *Xecode;
327 USPTR Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338 USPTR Xcharptr;
339 #endif
340 USPTR Xdata;
341 USPTR Xnext;
342 USPTR Xpp;
343 USPTR Xprev;
344 USPTR Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc;
368 int Xfi;
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb;
380
381 /* Where to jump back to */
382
383 int Xwhere;
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
403 /* These macros pack up tests that are used for partial matching, and which
404 appear several times in the code. We set the "hit end" flag if the pointer is
405 at the end of the subject and also past the start of the subject (i.e.
406 something has been matched). For hard partial matching, we then return
407 immediately. The second one is used when we already know we are past the end of
408 the subject. */
409
/* Action shared by the two partial-match tests below: note that the end of
the subject was hit and, when md->partial is greater than 1, return
PCRE_ERROR_PARTIAL immediately. */

#define NOTE_PARTIAL_HIT()\
  {\
  md->hitend = TRUE;\
  if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
  }

/* Full test: partial matching is enabled, eptr is at or beyond the end of
the subject, and eptr is beyond the match start. */

#define CHECK_PARTIAL()\
  if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
    NOTE_PARTIAL_HIT()

/* Short test, for use where the end-of-subject comparison has already been
made by the caller. */

#define SCHECK_PARTIAL()\
  if (md->partial != 0 && eptr > mstart)\
    NOTE_PARTIAL_HIT()
423
424
425 /* Performance note: It might be tempting to extract commonly used fields from
426 the md structure (e.g. utf8, end_subject) into individual variables to improve
427 performance. Tests using gcc on a SPARC disproved this; in the first case, it
428 made performance worse.
429
430 Arguments:
431 eptr pointer to current character in subject
432 ecode pointer to current position in compiled code
433 mstart pointer to the current match start position (can be modified
434 by encountering \K)
435 offset_top current top pointer
436 md pointer to "static" info for the match
437 ims current /i, /m, and /s options
438 eptrb pointer to chain of blocks containing eptr at start of
439 brackets - for testing for empty matches
440 flags can contain
441 match_condassert - this is an assertion condition
442 match_cbegroup - this is the start of an unlimited repeat
443 group that can match an empty string
444 rdepth the recursion depth
445
446 Returns: MATCH_MATCH if matched ) these values are >= 0
447 MATCH_NOMATCH if failed to match )
448 a negative PCRE_ERROR_xxx value if aborted by an error condition
449 (e.g. stopped by repeated call or recursion limit)
450 */
451
452 static int
453 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
454 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
455 int flags, unsigned int rdepth)
456 {
457 /* These variables do not need to be preserved over recursion in this function,
458 so they can be ordinary variables in all cases. Mark some of them with
459 "register" because they are used a lot in loops. */
460
461 register int rrc; /* Returns from recursive calls */
462 register int i; /* Used for loops not involving calls to RMATCH() */
463 register unsigned int c; /* Character values not kept over RMATCH() calls */
464 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
465
466 BOOL minimize, possessive; /* Quantifier options */
467 int condcode;
468
469 /* When recursion is not being used, all "local" variables that have to be
470 preserved over calls to RMATCH() are part of a "frame" which is obtained from
471 heap storage. Set up the top-level frame here; others are obtained from the
472 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
473
474 #ifdef NO_RECURSE
475 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
476 frame->Xprevframe = NULL; /* Marks the top level */
477
478 /* Copy in the original argument variables */
479
480 frame->Xeptr = eptr;
481 frame->Xecode = ecode;
482 frame->Xmstart = mstart;
483 frame->Xoffset_top = offset_top;
484 frame->Xims = ims;
485 frame->Xeptrb = eptrb;
486 frame->Xflags = flags;
487 frame->Xrdepth = rdepth;
488
489 /* This is where control jumps back to to effect "recursion" */
490
491 HEAP_RECURSE:
492
493 /* Macros make the argument variables come from the current frame */
494
495 #define eptr frame->Xeptr
496 #define ecode frame->Xecode
497 #define mstart frame->Xmstart
498 #define offset_top frame->Xoffset_top
499 #define ims frame->Xims
500 #define eptrb frame->Xeptrb
501 #define flags frame->Xflags
502 #define rdepth frame->Xrdepth
503
504 /* Ditto for the local variables */
505
506 #ifdef SUPPORT_UTF8
507 #define charptr frame->Xcharptr
508 #endif
509 #define callpat frame->Xcallpat
510 #define codelink frame->Xcodelink
511 #define data frame->Xdata
512 #define next frame->Xnext
513 #define pp frame->Xpp
514 #define prev frame->Xprev
515 #define saved_eptr frame->Xsaved_eptr
516
517 #define new_recursive frame->Xnew_recursive
518
519 #define cur_is_word frame->Xcur_is_word
520 #define condition frame->Xcondition
521 #define prev_is_word frame->Xprev_is_word
522
523 #define original_ims frame->Xoriginal_ims
524
525 #ifdef SUPPORT_UCP
526 #define prop_type frame->Xprop_type
527 #define prop_value frame->Xprop_value
528 #define prop_fail_result frame->Xprop_fail_result
529 #define prop_category frame->Xprop_category
530 #define prop_chartype frame->Xprop_chartype
531 #define prop_script frame->Xprop_script
532 #define oclength frame->Xoclength
533 #define occhars frame->Xocchars
534 #endif
535
536 #define ctype frame->Xctype
537 #define fc frame->Xfc
538 #define fi frame->Xfi
539 #define length frame->Xlength
540 #define max frame->Xmax
541 #define min frame->Xmin
542 #define number frame->Xnumber
543 #define offset frame->Xoffset
544 #define op frame->Xop
545 #define save_capture_last frame->Xsave_capture_last
546 #define save_offset1 frame->Xsave_offset1
547 #define save_offset2 frame->Xsave_offset2
548 #define save_offset3 frame->Xsave_offset3
549 #define stacksave frame->Xstacksave
550
551 #define newptrb frame->Xnewptrb
552
553 /* When recursion is being used, local variables are allocated on the stack and
554 get preserved during recursion in the normal way. In this environment, fi and
555 i, and fc and c, can be the same variables. */
556
557 #else /* NO_RECURSE not defined */
558 #define fi i
559 #define fc c
560
561
562 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
563 const uschar *charptr; /* in small blocks of the code. My normal */
564 #endif /* style of coding would have declared */
565 const uschar *callpat; /* them within each of those blocks. */
566 const uschar *data; /* However, in order to accommodate the */
567 const uschar *next; /* version of this code that uses an */
568 USPTR pp; /* external "stack" implemented on the */
569 const uschar *prev; /* heap, it is easier to declare them all */
570 USPTR saved_eptr; /* here, so the declarations can be cut */
571 /* out in a block. The only declarations */
572 recursion_info new_recursive; /* within blocks below are for variables */
573 /* that do not have to be preserved over */
574 BOOL cur_is_word; /* a recursive call to RMATCH(). */
575 BOOL condition;
576 BOOL prev_is_word;
577
578 unsigned long int original_ims;
579
580 #ifdef SUPPORT_UCP
581 int prop_type;
582 int prop_value;
583 int prop_fail_result;
584 int prop_category;
585 int prop_chartype;
586 int prop_script;
587 int oclength;
588 uschar occhars[8];
589 #endif
590
591 int codelink;
592 int ctype;
593 int length;
594 int max;
595 int min;
596 int number;
597 int offset;
598 int op;
599 int save_capture_last;
600 int save_offset1, save_offset2, save_offset3;
601 int stacksave[REC_STACK_SAVE_MAX];
602
603 eptrblock newptrb;
604 #endif /* NO_RECURSE */
605
606 /* These statements are here to stop the compiler complaining about uninitialized
607 variables. */
608
609 #ifdef SUPPORT_UCP
610 prop_value = 0;
611 prop_fail_result = 0;
612 #endif
613
614
615 /* This label is used for tail recursion, which is used in a few cases even
616 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
617 used. Thanks to Ian Taylor for noticing this possibility and sending the
618 original patch. */
619
620 TAIL_RECURSE:
621
622 /* OK, now we can get on with the real code of the function. Recursive calls
623 are specified by the macro RMATCH and RRETURN is used to return. When
624 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
625 and a "return", respectively (possibly with some debugging if DEBUG is
626 defined). However, RMATCH isn't like a function call because it's quite a
627 complicated macro. It has to be used in one particular way. This shouldn't,
628 however, impact performance when true recursion is being used. */
629
630 #ifdef SUPPORT_UTF8
631 utf8 = md->utf8; /* Local copy of the flag */
632 #else
633 utf8 = FALSE;
634 #endif
635
636 /* First check that we haven't called match() too many times, or that we
637 haven't exceeded the recursive call limit. */
638
639 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
640 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
641
642 original_ims = ims; /* Save for resetting on ')' */
643
644 /* At the start of a group with an unlimited repeat that may match an empty
645 string, the match_cbegroup flag is set. When this is the case, add the current
646 subject pointer to the chain of such remembered pointers, to be checked when we
647 hit the closing ket, in order to break infinite loops that match no characters.
648 When match() is called in other circumstances, don't add to the chain. The
649 match_cbegroup flag must NOT be used with tail recursion, because the memory
650 block that is used is on the stack, so a new one may be required for each
651 match(). */
652
653 if ((flags & match_cbegroup) != 0)
654 {
655 newptrb.epb_saved_eptr = eptr;
656 newptrb.epb_prev = eptrb;
657 eptrb = &newptrb;
658 }
659
660 /* Now start processing the opcodes. */
661
662 for (;;)
663 {
664 minimize = possessive = FALSE;
665 op = *ecode;
666
667 switch(op)
668 {
669 case OP_FAIL:
670 RRETURN(MATCH_NOMATCH);
671
672 case OP_PRUNE:
673 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
674 ims, eptrb, flags, RM51);
675 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676 RRETURN(MATCH_PRUNE);
677
678 case OP_COMMIT:
679 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
680 ims, eptrb, flags, RM52);
681 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 RRETURN(MATCH_COMMIT);
683
684 case OP_SKIP:
685 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
686 ims, eptrb, flags, RM53);
687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
688 md->start_match_ptr = eptr; /* Pass back current position */
689 RRETURN(MATCH_SKIP);
690
691 case OP_THEN:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM54);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 RRETURN(MATCH_THEN);
696
697 /* Handle a capturing bracket. If there is space in the offset vector, save
698 the current subject position in the working slot at the top of the vector.
699 We mustn't change the current values of the data slot, because they may be
700 set from a previous iteration of this group, and be referred to by a
701 reference inside the group.
702
703 If the bracket fails to match, we need to restore this value and also the
704 values of the final offsets, in case they were set by a previous iteration
705 of the same bracket.
706
707 If there isn't enough space in the offset vector, treat this as if it were
708 a non-capturing bracket. Don't worry about setting the flag for the error
709 case here; that is handled in the code for KET. */
710
711 case OP_CBRA:
712 case OP_SCBRA:
713 number = GET2(ecode, 1+LINK_SIZE);
714 offset = number << 1;
715
716 #ifdef DEBUG
717 printf("start bracket %d\n", number);
718 printf("subject=");
719 pchars(eptr, 16, TRUE, md);
720 printf("\n");
721 #endif
722
723 if (offset < md->offset_max)
724 {
725 save_offset1 = md->offset_vector[offset];
726 save_offset2 = md->offset_vector[offset+1];
727 save_offset3 = md->offset_vector[md->offset_end - number];
728 save_capture_last = md->capture_last;
729
730 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
731 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
732
733 flags = (op == OP_SCBRA)? match_cbegroup : 0;
734 do
735 {
736 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
737 ims, eptrb, flags, RM1);
738 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
739 md->capture_last = save_capture_last;
740 ecode += GET(ecode, 1);
741 }
742 while (*ecode == OP_ALT);
743
744 DPRINTF(("bracket %d failed\n", number));
745
746 md->offset_vector[offset] = save_offset1;
747 md->offset_vector[offset+1] = save_offset2;
748 md->offset_vector[md->offset_end - number] = save_offset3;
749
750 RRETURN(MATCH_NOMATCH);
751 }
752
753 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
754 as a non-capturing bracket. */
755
756 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
757 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
758
759 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
760
761 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
762 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
763
764 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
765 final alternative within the brackets, we would return the result of a
766 recursive call to match() whatever happened. We can reduce stack usage by
767 turning this into a tail recursion, except in the case when match_cbegroup
768 is set.*/
769
770 case OP_BRA:
771 case OP_SBRA:
772 DPRINTF(("start non-capturing bracket\n"));
773 flags = (op >= OP_SBRA)? match_cbegroup : 0;
774 for (;;)
775 {
776 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
777 {
778 if (flags == 0) /* Not a possibly empty group */
779 {
780 ecode += _pcre_OP_lengths[*ecode];
781 DPRINTF(("bracket 0 tail recursion\n"));
782 goto TAIL_RECURSE;
783 }
784
785 /* Possibly empty group; can't use tail recursion. */
786
787 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
788 eptrb, flags, RM48);
789 RRETURN(rrc);
790 }
791
792 /* For non-final alternatives, continue the loop for a NOMATCH result;
793 otherwise return. */
794
795 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
796 eptrb, flags, RM2);
797 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
798 ecode += GET(ecode, 1);
799 }
800 /* Control never reaches here. */
801
802 /* Conditional group: compilation checked that there are no more than
803 two branches. If the condition is false, skipping the first branch takes us
804 past the end if there is only one branch, but that's OK because that is
805 exactly what going to the ket would do. As there is only one branch to be
806 obeyed, we can use tail recursion to avoid using another stack frame. */
807
808 case OP_COND:
809 case OP_SCOND:
810 codelink= GET(ecode, 1);
811
812 /* Because of the way auto-callout works during compile, a callout item is
813 inserted between OP_COND and an assertion condition. */
814
815 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
816 {
817 if (pcre_callout != NULL)
818 {
819 pcre_callout_block cb;
820 cb.version = 1; /* Version 1 of the callout block */
821 cb.callout_number = ecode[LINK_SIZE+2];
822 cb.offset_vector = md->offset_vector;
823 cb.subject = (PCRE_SPTR)md->start_subject;
824 cb.subject_length = md->end_subject - md->start_subject;
825 cb.start_match = mstart - md->start_subject;
826 cb.current_position = eptr - md->start_subject;
827 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
828 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
829 cb.capture_top = offset_top/2;
830 cb.capture_last = md->capture_last;
831 cb.callout_data = md->callout_data;
832 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
833 if (rrc < 0) RRETURN(rrc);
834 }
835 ecode += _pcre_OP_lengths[OP_CALLOUT];
836 }
837
838 condcode = ecode[LINK_SIZE+1];
839
840 /* Now see what the actual condition is */
841
842 if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
843 {
844 if (md->recursive == NULL) /* Not recursing => FALSE */
845 {
846 condition = FALSE;
847 ecode += GET(ecode, 1);
848 }
849 else
850 {
851 int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
852 condition = (recno == RREF_ANY || recno == md->recursive->group_num);
853
854 /* If the test is for recursion into a specific subpattern, and it is
855 false, but the test was set up by name, scan the table to see if the
856 name refers to any other numbers, and test them. The condition is true
857 if any one is set. */
858
859 if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
860 {
861 uschar *slotA = md->name_table;
862 for (i = 0; i < md->name_count; i++)
863 {
864 if (GET2(slotA, 0) == recno) break;
865 slotA += md->name_entry_size;
866 }
867
868 /* Found a name for the number - there can be only one; duplicate
869 names for different numbers are allowed, but not vice versa. First
870 scan down for duplicates. */
871
872 if (i < md->name_count)
873 {
874 uschar *slotB = slotA;
875 while (slotB > md->name_table)
876 {
877 slotB -= md->name_entry_size;
878 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
879 {
880 condition = GET2(slotB, 0) == md->recursive->group_num;
881 if (condition) break;
882 }
883 else break;
884 }
885
886 /* Scan up for duplicates */
887
888 if (!condition)
889 {
890 slotB = slotA;
891 for (i++; i < md->name_count; i++)
892 {
893 slotB += md->name_entry_size;
894 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
895 {
896 condition = GET2(slotB, 0) == md->recursive->group_num;
897 if (condition) break;
898 }
899 else break;
900 }
901 }
902 }
903 }
904
905 /* Choose branch according to the condition */
906
907 ecode += condition? 3 : GET(ecode, 1);
908 }
909 }
910
911 else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
912 {
913 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
914 condition = offset < offset_top && md->offset_vector[offset] >= 0;
915
916 /* If the numbered capture is unset, but the reference was by name,
917 scan the table to see if the name refers to any other numbers, and test
918 them. The condition is true if any one is set. This is tediously similar
919 to the code above, but not close enough to try to amalgamate. */
920
921 if (!condition && condcode == OP_NCREF)
922 {
923 int refno = offset >> 1;
924 uschar *slotA = md->name_table;
925
926 for (i = 0; i < md->name_count; i++)
927 {
928 if (GET2(slotA, 0) == refno) break;
929 slotA += md->name_entry_size;
930 }
931
932 /* Found a name for the number - there can be only one; duplicate names
933 for different numbers are allowed, but not vice versa. First scan down
934 for duplicates. */
935
936 if (i < md->name_count)
937 {
938 uschar *slotB = slotA;
939 while (slotB > md->name_table)
940 {
941 slotB -= md->name_entry_size;
942 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
943 {
944 offset = GET2(slotB, 0) << 1;
945 condition = offset < offset_top &&
946 md->offset_vector[offset] >= 0;
947 if (condition) break;
948 }
949 else break;
950 }
951
952 /* Scan up for duplicates */
953
954 if (!condition)
955 {
956 slotB = slotA;
957 for (i++; i < md->name_count; i++)
958 {
959 slotB += md->name_entry_size;
960 if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
961 {
962 offset = GET2(slotB, 0) << 1;
963 condition = offset < offset_top &&
964 md->offset_vector[offset] >= 0;
965 if (condition) break;
966 }
967 else break;
968 }
969 }
970 }
971 }
972
973 /* Choose branch according to the condition */
974
975 ecode += condition? 3 : GET(ecode, 1);
976 }
977
978 else if (condcode == OP_DEF) /* DEFINE - always false */
979 {
980 condition = FALSE;
981 ecode += GET(ecode, 1);
982 }
983
984 /* The condition is an assertion. Call match() to evaluate it - setting
985 the final argument match_condassert causes it to stop at the end of an
986 assertion. */
987
988 else
989 {
990 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
991 match_condassert, RM3);
992 if (rrc == MATCH_MATCH)
993 {
994 condition = TRUE;
995 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
996 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
997 }
998 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
999 {
1000 RRETURN(rrc); /* Need braces because of following else */
1001 }
1002 else
1003 {
1004 condition = FALSE;
1005 ecode += codelink;
1006 }
1007 }
1008
1009 /* We are now at the branch that is to be obeyed. As there is only one,
1010 we can use tail recursion to avoid using another stack frame, except when
1011 match_cbegroup is required for an unlimited repeat of a possibly empty
1012 group. If the second alternative doesn't exist, we can just plough on. */
1013
1014 if (condition || *ecode == OP_ALT)
1015 {
1016 ecode += 1 + LINK_SIZE;
1017 if (op == OP_SCOND) /* Possibly empty group */
1018 {
1019 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1020 RRETURN(rrc);
1021 }
1022 else /* Group must match something */
1023 {
1024 flags = 0;
1025 goto TAIL_RECURSE;
1026 }
1027 }
1028 else /* Condition false & no alternative */
1029 {
1030 ecode += 1 + LINK_SIZE;
1031 }
1032 break;
1033
1034
1035 /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1036 to close any currently open capturing brackets. */
1037
1038 case OP_CLOSE:
1039 number = GET2(ecode, 1);
1040 offset = number << 1;
1041
1042 #ifdef DEBUG
1043 printf("end bracket %d at *ACCEPT", number);
1044 printf("\n");
1045 #endif
1046
1047 md->capture_last = number;
1048 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1049 {
1050 md->offset_vector[offset] =
1051 md->offset_vector[md->offset_end - number];
1052 md->offset_vector[offset+1] = eptr - md->start_subject;
1053 if (offset_top <= offset) offset_top = offset + 2;
1054 }
1055 ecode += 3;
1056 break;
1057
1058
1059 /* End of the pattern, either real or forced. If we are in a top-level
1060 recursion, we should restore the offsets appropriately and continue from
1061 after the call. */
1062
1063 case OP_ACCEPT:
1064 case OP_END:
1065 if (md->recursive != NULL && md->recursive->group_num == 0)
1066 {
1067 recursion_info *rec = md->recursive;
1068 DPRINTF(("End of pattern in a (?0) recursion\n"));
1069 md->recursive = rec->prevrec;
1070 memmove(md->offset_vector, rec->offset_save,
1071 rec->saved_max * sizeof(int));
1072 offset_top = rec->save_offset_top;
1073 mstart = rec->save_start;
1074 ims = original_ims;
1075 ecode = rec->after_call;
1076 break;
1077 }
1078
1079 /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1080 set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1081 the subject. In both cases, backtracking will then try other alternatives,
1082 if any. */
1083
1084 if (eptr == mstart &&
1085 (md->notempty ||
1086 (md->notempty_atstart &&
1087 mstart == md->start_subject + md->start_offset)))
1088 RRETURN(MATCH_NOMATCH);
1089
1090 /* Otherwise, we have a match. */
1091
1092 md->end_match_ptr = eptr; /* Record where we ended */
1093 md->end_offset_top = offset_top; /* and how many extracts were taken */
1094 md->start_match_ptr = mstart; /* and the start (\K can modify) */
1095 RRETURN(MATCH_MATCH);
1096
1097 /* Change option settings */
1098
1099 case OP_OPT:
1100 ims = ecode[1];
1101 ecode += 2;
1102 DPRINTF(("ims set to %02lx\n", ims));
1103 break;
1104
1105 /* Assertion brackets. Check the alternative branches in turn - the
1106 matching won't pass the KET for an assertion. If any one branch matches,
1107 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1108 start of each branch to move the current point backwards, so the code at
1109 this level is identical to the lookahead case. */
1110
1111 case OP_ASSERT:
1112 case OP_ASSERTBACK:
1113 do
1114 {
1115 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1116 RM4);
1117 if (rrc == MATCH_MATCH) break;
1118 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1119 ecode += GET(ecode, 1);
1120 }
1121 while (*ecode == OP_ALT);
1122 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1123
1124 /* If checking an assertion for a condition, return MATCH_MATCH. */
1125
1126 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1127
1128 /* Continue from after the assertion, updating the offsets high water
1129 mark, since extracts may have been taken during the assertion. */
1130
1131 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1132 ecode += 1 + LINK_SIZE;
1133 offset_top = md->end_offset_top;
1134 continue;
1135
1136 /* Negative assertion: all branches must fail to match */
1137
1138 case OP_ASSERT_NOT:
1139 case OP_ASSERTBACK_NOT:
1140 do
1141 {
1142 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1143 RM5);
1144 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
1145 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1146 ecode += GET(ecode,1);
1147 }
1148 while (*ecode == OP_ALT);
1149
1150 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1151
1152 ecode += 1 + LINK_SIZE;
1153 continue;
1154
1155 /* Move the subject pointer back. This occurs only at the start of
1156 each branch of a lookbehind assertion. If we are too close to the start to
1157 move back, this match function fails. When working with UTF-8 we move
1158 back a number of characters, not bytes. */
1159
1160 case OP_REVERSE:
1161 #ifdef SUPPORT_UTF8
1162 if (utf8)
1163 {
1164 i = GET(ecode, 1);
1165 while (i-- > 0)
1166 {
1167 eptr--;
1168 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1169 BACKCHAR(eptr);
1170 }
1171 }
1172 else
1173 #endif
1174
1175 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1176
1177 {
1178 eptr -= GET(ecode, 1);
1179 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1180 }
1181
1182 /* Save the earliest consulted character, then skip to next op code */
1183
1184 if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1185 ecode += 1 + LINK_SIZE;
1186 break;
1187
1188 /* The callout item calls an external function, if one is provided, passing
1189 details of the match so far. This is mainly for debugging, though the
1190 function is able to force a failure. */
1191
1192 case OP_CALLOUT:
1193 if (pcre_callout != NULL)
1194 {
1195 pcre_callout_block cb;
1196 cb.version = 1; /* Version 1 of the callout block */
1197 cb.callout_number = ecode[1];
1198 cb.offset_vector = md->offset_vector;
1199 cb.subject = (PCRE_SPTR)md->start_subject;
1200 cb.subject_length = md->end_subject - md->start_subject;
1201 cb.start_match = mstart - md->start_subject;
1202 cb.current_position = eptr - md->start_subject;
1203 cb.pattern_position = GET(ecode, 2);
1204 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1205 cb.capture_top = offset_top/2;
1206 cb.capture_last = md->capture_last;
1207 cb.callout_data = md->callout_data;
1208 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1209 if (rrc < 0) RRETURN(rrc);
1210 }
1211 ecode += 2 + 2*LINK_SIZE;
1212 break;
1213
1214 /* Recursion either matches the current regex, or some subexpression. The
1215 offset data is the offset to the starting bracket from the start of the
1216 whole pattern. (This is so that it works from duplicated subpatterns.)
1217
1218 If there are any capturing brackets started but not finished, we have to
1219 save their starting points and reinstate them after the recursion. However,
1220 we don't know how many such there are (offset_top records the completed
1221 total) so we just have to save all the potential data. There may be up to
1222 65535 such values, which is too large to put on the stack, but using malloc
1223 for small numbers seems expensive. As a compromise, the stack is used when
1224 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1225 is used. If the malloc fails, the code below gives up immediately by
1226 returning PCRE_ERROR_NOMEMORY, before any of the offset data has been
1227 saved.
1228
1229 There are also other values that have to be saved. We use a chained
1230 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1231 for the original version of this logic. */
1232
1233 case OP_RECURSE:
1234 {
1235 callpat = md->start_code + GET(ecode, 1);
1236 new_recursive.group_num = (callpat == md->start_code)? 0 :
1237 GET2(callpat, 1 + LINK_SIZE);
1238
1239 /* Add to "recursing stack" */
1240
1241 new_recursive.prevrec = md->recursive;
1242 md->recursive = &new_recursive;
1243
1244 /* Find where to continue from afterwards */
1245
1246 ecode += 1 + LINK_SIZE;
1247 new_recursive.after_call = ecode;
1248
1249 /* Now save the offset data. */
1250
1251 new_recursive.saved_max = md->offset_end;
1252 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1253 new_recursive.offset_save = stacksave;
1254 else
1255 {
1256 new_recursive.offset_save =
1257 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1258 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1259 }
1260
1261 memcpy(new_recursive.offset_save, md->offset_vector,
1262 new_recursive.saved_max * sizeof(int));
1263 new_recursive.save_start = mstart;
1264 new_recursive.save_offset_top = offset_top;
1265 mstart = eptr;
1266
1267 /* OK, now we can do the recursion. For each top-level alternative we
1268 restore the offset and recursion data. */
1269
1270 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1271 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1272 do
1273 {
1274 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1275 md, ims, eptrb, flags, RM6);
1276 if (rrc == MATCH_MATCH)
1277 {
1278 DPRINTF(("Recursion matched\n"));
1279 md->recursive = new_recursive.prevrec;
1280 if (new_recursive.offset_save != stacksave)
1281 (pcre_free)(new_recursive.offset_save);
1282 RRETURN(MATCH_MATCH);
1283 }
1284 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1285 {
1286 DPRINTF(("Recursion gave error %d\n", rrc));
1287 if (new_recursive.offset_save != stacksave)
1288 (pcre_free)(new_recursive.offset_save);
1289 RRETURN(rrc);
1290 }
1291
1292 md->recursive = &new_recursive;
1293 memcpy(md->offset_vector, new_recursive.offset_save,
1294 new_recursive.saved_max * sizeof(int));
1295 callpat += GET(callpat, 1);
1296 }
1297 while (*callpat == OP_ALT);
1298
1299 DPRINTF(("Recursion didn't match\n"));
1300 md->recursive = new_recursive.prevrec;
1301 if (new_recursive.offset_save != stacksave)
1302 (pcre_free)(new_recursive.offset_save);
1303 RRETURN(MATCH_NOMATCH);
1304 }
1305 /* Control never reaches here */
1306
1307 /* "Once" brackets are like assertion brackets except that after a match,
1308 the point in the subject string is not moved back. Thus there can never be
1309 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1310 Check the alternative branches in turn - the matching won't pass the KET
1311 for this kind of subpattern. If any one branch matches, we carry on as at
1312 the end of a normal bracket, leaving the subject pointer. */
1313
1314 case OP_ONCE:
1315 prev = ecode;
1316 saved_eptr = eptr;
1317
1318 do
1319 {
1320 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1321 if (rrc == MATCH_MATCH) break;
1322 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1323 ecode += GET(ecode,1);
1324 }
1325 while (*ecode == OP_ALT);
1326
1327 /* If we hit the end of the group (which could be repeated), fail */
1328
1329 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1330
1331 /* Continue as from after the assertion, updating the offsets high water
1332 mark, since extracts may have been taken. */
1333
1334 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1335
1336 offset_top = md->end_offset_top;
1337 eptr = md->end_match_ptr;
1338
1339 /* For a non-repeating ket, just continue at this level. This also
1340 happens for a repeating ket if no characters were matched in the group.
1341 This is the forcible breaking of infinite loops as implemented in Perl
1342 5.005. If there is an options reset, it will get obeyed in the normal
1343 course of events. */
1344
1345 if (*ecode == OP_KET || eptr == saved_eptr)
1346 {
1347 ecode += 1+LINK_SIZE;
1348 break;
1349 }
1350
1351 /* The repeating kets try the rest of the pattern or restart from the
1352 preceding bracket, in the appropriate order. The second "call" of match()
1353 uses tail recursion, to avoid using another stack frame. We need to reset
1354 any options that changed within the bracket before re-running it, so
1355 check the next opcode. */
1356
1357 if (ecode[1+LINK_SIZE] == OP_OPT)
1358 {
1359 ims = (ims & ~PCRE_IMS) | ecode[4];
1360 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1361 }
1362
1363 if (*ecode == OP_KETRMIN)
1364 {
1365 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1366 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1367 ecode = prev;
1368 flags = 0;
1369 goto TAIL_RECURSE;
1370 }
1371 else /* OP_KETRMAX */
1372 {
1373 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1374 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1375 ecode += 1 + LINK_SIZE;
1376 flags = 0;
1377 goto TAIL_RECURSE;
1378 }
1379 /* Control never gets here */
1380
1381 /* An alternation is the end of a branch; scan along to find the end of the
1382 bracketed group and go to there. */
1383
1384 case OP_ALT:
1385 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1386 break;
1387
1388 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1389 indicating that it may occur zero times. It may repeat infinitely, or not
1390 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1391 with fixed upper repeat limits are compiled as a number of copies, with the
1392 optional ones preceded by BRAZERO or BRAMINZERO. */
1393
1394 case OP_BRAZERO:
1395 {
1396 next = ecode+1;
1397 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1398 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1399 do next += GET(next,1); while (*next == OP_ALT);
1400 ecode = next + 1 + LINK_SIZE;
1401 }
1402 break;
1403
1404 case OP_BRAMINZERO:
1405 {
1406 next = ecode+1;
1407 do next += GET(next, 1); while (*next == OP_ALT);
1408 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1409 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1410 ecode++;
1411 }
1412 break;
1413
1414 case OP_SKIPZERO:
1415 {
1416 next = ecode+1;
1417 do next += GET(next,1); while (*next == OP_ALT);
1418 ecode = next + 1 + LINK_SIZE;
1419 }
1420 break;
1421
1422 /* End of a group, repeated or non-repeating. */
1423
1424 case OP_KET:
1425 case OP_KETRMIN:
1426 case OP_KETRMAX:
1427 prev = ecode - GET(ecode, 1);
1428
1429 /* If this was a group that remembered the subject start, in order to break
1430 infinite repeats of empty string matches, retrieve the subject start from
1431 the chain. Otherwise, set it NULL. */
1432
1433 if (*prev >= OP_SBRA)
1434 {
1435 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1436 eptrb = eptrb->epb_prev; /* Backup to previous group */
1437 }
1438 else saved_eptr = NULL;
1439
1440 /* If we are at the end of an assertion group, stop matching and return
1441 MATCH_MATCH, but record the current high water mark for use by positive
1442 assertions. Do this also for the "once" (atomic) groups. */
1443
1444 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1445 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1446 *prev == OP_ONCE)
1447 {
1448 md->end_match_ptr = eptr; /* For ONCE */
1449 md->end_offset_top = offset_top;
1450 RRETURN(MATCH_MATCH);
1451 }
1452
1453 /* For capturing groups we have to check the group number back at the start
1454 and if necessary complete handling an extraction by setting the offsets and
1455 bumping the high water mark. Note that whole-pattern recursion is coded as
1456 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1457 when the OP_END is reached. Other recursion is handled here. */
1458
1459 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1460 {
1461 number = GET2(prev, 1+LINK_SIZE);
1462 offset = number << 1;
1463
1464 #ifdef DEBUG
1465 printf("end bracket %d", number);
1466 printf("\n");
1467 #endif
1468
1469 md->capture_last = number;
1470 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1471 {
1472 md->offset_vector[offset] =
1473 md->offset_vector[md->offset_end - number];
1474 md->offset_vector[offset+1] = eptr - md->start_subject;
1475 if (offset_top <= offset) offset_top = offset + 2;
1476 }
1477
1478 /* Handle a recursively called group. Restore the offsets
1479 appropriately and continue from after the call. */
1480
1481 if (md->recursive != NULL && md->recursive->group_num == number)
1482 {
1483 recursion_info *rec = md->recursive;
1484 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1485 md->recursive = rec->prevrec;
1486 mstart = rec->save_start;
1487 memcpy(md->offset_vector, rec->offset_save,
1488 rec->saved_max * sizeof(int));
1489 offset_top = rec->save_offset_top;
1490 ecode = rec->after_call;
1491 ims = original_ims;
1492 break;
1493 }
1494 }
1495
1496 /* For both capturing and non-capturing groups, reset the value of the ims
1497 flags, in case they got changed during the group. */
1498
1499 ims = original_ims;
1500 DPRINTF(("ims reset to %02lx\n", ims));
1501
1502 /* For a non-repeating ket, just continue at this level. This also
1503 happens for a repeating ket if no characters were matched in the group.
1504 This is the forcible breaking of infinite loops as implemented in Perl
1505 5.005. If there is an options reset, it will get obeyed in the normal
1506 course of events. */
1507
1508 if (*ecode == OP_KET || eptr == saved_eptr)
1509 {
1510 ecode += 1 + LINK_SIZE;
1511 break;
1512 }
1513
1514 /* The repeating kets try the rest of the pattern or restart from the
1515 preceding bracket, in the appropriate order. In the second case, we can use
1516 tail recursion to avoid using another stack frame, unless we have an
1517 unlimited repeat of a group that can match an empty string. */
1518
1519 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1520
1521 if (*ecode == OP_KETRMIN)
1522 {
1523 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1524 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1525 if (flags != 0) /* Could match an empty string */
1526 {
1527 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1528 RRETURN(rrc);
1529 }
1530 ecode = prev;
1531 goto TAIL_RECURSE;
1532 }
1533 else /* OP_KETRMAX */
1534 {
1535 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1536 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1537 ecode += 1 + LINK_SIZE;
1538 flags = 0;
1539 goto TAIL_RECURSE;
1540 }
1541 /* Control never gets here */
1542
1543 /* Start of subject unless notbol, or after internal newline if multiline */
1544
1545 case OP_CIRC:
1546 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1547 if ((ims & PCRE_MULTILINE) != 0)
1548 {
1549 if (eptr != md->start_subject &&
1550 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1551 RRETURN(MATCH_NOMATCH);
1552 ecode++;
1553 break;
1554 }
1555 /* ... else fall through */
1556
1557 /* Start of subject assertion */
1558
1559 case OP_SOD:
1560 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1561 ecode++;
1562 break;
1563
1564 /* Start of match assertion */
1565
1566 case OP_SOM:
1567 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1568 ecode++;
1569 break;
1570
1571 /* Reset the start of match point */
1572
1573 case OP_SET_SOM:
1574 mstart = eptr;
1575 ecode++;
1576 break;
1577
1578 /* Assert before internal newline if multiline, or before a terminating
1579 newline unless endonly is set, else end of subject unless noteol is set. */
1580
1581 case OP_DOLL:
1582 if ((ims & PCRE_MULTILINE) != 0)
1583 {
1584 if (eptr < md->end_subject)
1585 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1586 else
1587 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1588 ecode++;
1589 break;
1590 }
1591 else
1592 {
1593 if (md->noteol) RRETURN(MATCH_NOMATCH);
1594 if (!md->endonly)
1595 {
1596 if (eptr != md->end_subject &&
1597 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1598 RRETURN(MATCH_NOMATCH);
1599 ecode++;
1600 break;
1601 }
1602 }
1603 /* ... else fall through for endonly */
1604
1605 /* End of subject assertion (\z) */
1606
1607 case OP_EOD:
1608 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1609 ecode++;
1610 break;
1611
1612 /* End of subject or ending \n assertion (\Z) */
1613
1614 case OP_EODN:
1615 if (eptr != md->end_subject &&
1616 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1617 RRETURN(MATCH_NOMATCH);
1618 ecode++;
1619 break;
1620
1621 /* Word boundary assertions */
1622
1623 case OP_NOT_WORD_BOUNDARY:
1624 case OP_WORD_BOUNDARY:
1625 {
1626
1627 /* Find out if the previous and current characters are "word" characters.
1628 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1629 be "non-word" characters. Remember the earliest consulted character for
1630 partial matching. */
1631
1632 #ifdef SUPPORT_UTF8
1633 if (utf8)
1634 {
1635 if (eptr == md->start_subject) prev_is_word = FALSE; else
1636 {
1637 USPTR lastptr = eptr - 1;
1638 while((*lastptr & 0xc0) == 0x80) lastptr--;
1639 if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1640 GETCHAR(c, lastptr);
1641 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1642 }
1643 if (eptr >= md->end_subject)
1644 {
1645 SCHECK_PARTIAL();
1646 cur_is_word = FALSE;
1647 }
1648 else
1649 {
1650 GETCHAR(c, eptr);
1651 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1652 }
1653 }
1654 else
1655 #endif
1656
1657 /* Not in UTF-8 mode */
1658
1659 {
1660 if (eptr == md->start_subject) prev_is_word = FALSE; else
1661 {
1662 if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1663 prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1664 }
1665 if (eptr >= md->end_subject)
1666 {
1667 SCHECK_PARTIAL();
1668 cur_is_word = FALSE;
1669 }
1670 else cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1671 }
1672
1673 /* Now see if the situation is what we want */
1674
1675 if ((*ecode++ == OP_WORD_BOUNDARY)?
1676 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1677 RRETURN(MATCH_NOMATCH);
1678 }
1679 break;
1680
1681 /* Match a single character type; inline for speed */
1682
1683 case OP_ANY:
1684 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1685 /* Fall through */
1686
1687 case OP_ALLANY:
1688 if (eptr++ >= md->end_subject)
1689 {
1690 SCHECK_PARTIAL();
1691 RRETURN(MATCH_NOMATCH);
1692 }
1693 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1694 ecode++;
1695 break;
1696
1697 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1698 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1699
1700 case OP_ANYBYTE:
1701 if (eptr++ >= md->end_subject)
1702 {
1703 SCHECK_PARTIAL();
1704 RRETURN(MATCH_NOMATCH);
1705 }
1706 ecode++;
1707 break;
1708
1709 case OP_NOT_DIGIT:
1710 if (eptr >= md->end_subject)
1711 {
1712 SCHECK_PARTIAL();
1713 RRETURN(MATCH_NOMATCH);
1714 }
1715 GETCHARINCTEST(c, eptr);
1716 if (
1717 #ifdef SUPPORT_UTF8
1718 c < 256 &&
1719 #endif
1720 (md->ctypes[c] & ctype_digit) != 0
1721 )
1722 RRETURN(MATCH_NOMATCH);
1723 ecode++;
1724 break;
1725
1726 case OP_DIGIT:
1727 if (eptr >= md->end_subject)
1728 {
1729 SCHECK_PARTIAL();
1730 RRETURN(MATCH_NOMATCH);
1731 }
1732 GETCHARINCTEST(c, eptr);
1733 if (
1734 #ifdef SUPPORT_UTF8
1735 c >= 256 ||
1736 #endif
1737 (md->ctypes[c] & ctype_digit) == 0
1738 )
1739 RRETURN(MATCH_NOMATCH);
1740 ecode++;
1741 break;
1742
1743 case OP_NOT_WHITESPACE:
1744 if (eptr >= md->end_subject)
1745 {
1746 SCHECK_PARTIAL();
1747 RRETURN(MATCH_NOMATCH);
1748 }
1749 GETCHARINCTEST(c, eptr);
1750 if (
1751 #ifdef SUPPORT_UTF8
1752 c < 256 &&
1753 #endif
1754 (md->ctypes[c] & ctype_space) != 0
1755 )
1756 RRETURN(MATCH_NOMATCH);
1757 ecode++;
1758 break;
1759
1760 case OP_WHITESPACE:
1761 if (eptr >= md->end_subject)
1762 {
1763 SCHECK_PARTIAL();
1764 RRETURN(MATCH_NOMATCH);
1765 }
1766 GETCHARINCTEST(c, eptr);
1767 if (
1768 #ifdef SUPPORT_UTF8
1769 c >= 256 ||
1770 #endif
1771 (md->ctypes[c] & ctype_space) == 0
1772 )
1773 RRETURN(MATCH_NOMATCH);
1774 ecode++;
1775 break;
1776
1777 case OP_NOT_WORDCHAR:
1778 if (eptr >= md->end_subject)
1779 {
1780 SCHECK_PARTIAL();
1781 RRETURN(MATCH_NOMATCH);
1782 }
1783 GETCHARINCTEST(c, eptr);
1784 if (
1785 #ifdef SUPPORT_UTF8
1786 c < 256 &&
1787 #endif
1788 (md->ctypes[c] & ctype_word) != 0
1789 )
1790 RRETURN(MATCH_NOMATCH);
1791 ecode++;
1792 break;
1793
1794 case OP_WORDCHAR:
1795 if (eptr >= md->end_subject)
1796 {
1797 SCHECK_PARTIAL();
1798 RRETURN(MATCH_NOMATCH);
1799 }
1800 GETCHARINCTEST(c, eptr);
1801 if (
1802 #ifdef SUPPORT_UTF8
1803 c >= 256 ||
1804 #endif
1805 (md->ctypes[c] & ctype_word) == 0
1806 )
1807 RRETURN(MATCH_NOMATCH);
1808 ecode++;
1809 break;
1810
1811 case OP_ANYNL:
1812 if (eptr >= md->end_subject)
1813 {
1814 SCHECK_PARTIAL();
1815 RRETURN(MATCH_NOMATCH);
1816 }
1817 GETCHARINCTEST(c, eptr);
1818 switch(c)
1819 {
1820 default: RRETURN(MATCH_NOMATCH);
1821 case 0x000d:
1822 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1823 break;
1824
1825 case 0x000a:
1826 break;
1827
1828 case 0x000b:
1829 case 0x000c:
1830 case 0x0085:
1831 case 0x2028:
1832 case 0x2029:
1833 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1834 break;
1835 }
1836 ecode++;
1837 break;
1838
1839 case OP_NOT_HSPACE:
1840 if (eptr >= md->end_subject)
1841 {
1842 SCHECK_PARTIAL();
1843 RRETURN(MATCH_NOMATCH);
1844 }
1845 GETCHARINCTEST(c, eptr);
1846 switch(c)
1847 {
1848 default: break;
1849 case 0x09: /* HT */
1850 case 0x20: /* SPACE */
1851 case 0xa0: /* NBSP */
1852 case 0x1680: /* OGHAM SPACE MARK */
1853 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1854 case 0x2000: /* EN QUAD */
1855 case 0x2001: /* EM QUAD */
1856 case 0x2002: /* EN SPACE */
1857 case 0x2003: /* EM SPACE */
1858 case 0x2004: /* THREE-PER-EM SPACE */
1859 case 0x2005: /* FOUR-PER-EM SPACE */
1860 case 0x2006: /* SIX-PER-EM SPACE */
1861 case 0x2007: /* FIGURE SPACE */
1862 case 0x2008: /* PUNCTUATION SPACE */
1863 case 0x2009: /* THIN SPACE */
1864 case 0x200A: /* HAIR SPACE */
1865 case 0x202f: /* NARROW NO-BREAK SPACE */
1866 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1867 case 0x3000: /* IDEOGRAPHIC SPACE */
1868 RRETURN(MATCH_NOMATCH);
1869 }
1870 ecode++;
1871 break;
1872
1873 case OP_HSPACE:
1874 if (eptr >= md->end_subject)
1875 {
1876 SCHECK_PARTIAL();
1877 RRETURN(MATCH_NOMATCH);
1878 }
1879 GETCHARINCTEST(c, eptr);
1880 switch(c)
1881 {
1882 default: RRETURN(MATCH_NOMATCH);
1883 case 0x09: /* HT */
1884 case 0x20: /* SPACE */
1885 case 0xa0: /* NBSP */
1886 case 0x1680: /* OGHAM SPACE MARK */
1887 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1888 case 0x2000: /* EN QUAD */
1889 case 0x2001: /* EM QUAD */
1890 case 0x2002: /* EN SPACE */
1891 case 0x2003: /* EM SPACE */
1892 case 0x2004: /* THREE-PER-EM SPACE */
1893 case 0x2005: /* FOUR-PER-EM SPACE */
1894 case 0x2006: /* SIX-PER-EM SPACE */
1895 case 0x2007: /* FIGURE SPACE */
1896 case 0x2008: /* PUNCTUATION SPACE */
1897 case 0x2009: /* THIN SPACE */
1898 case 0x200A: /* HAIR SPACE */
1899 case 0x202f: /* NARROW NO-BREAK SPACE */
1900 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1901 case 0x3000: /* IDEOGRAPHIC SPACE */
1902 break;
1903 }
1904 ecode++;
1905 break;
1906
1907 case OP_NOT_VSPACE:
1908 if (eptr >= md->end_subject)
1909 {
1910 SCHECK_PARTIAL();
1911 RRETURN(MATCH_NOMATCH);
1912 }
1913 GETCHARINCTEST(c, eptr);
1914 switch(c)
1915 {
1916 default: break;
1917 case 0x0a: /* LF */
1918 case 0x0b: /* VT */
1919 case 0x0c: /* FF */
1920 case 0x0d: /* CR */
1921 case 0x85: /* NEL */
1922 case 0x2028: /* LINE SEPARATOR */
1923 case 0x2029: /* PARAGRAPH SEPARATOR */
1924 RRETURN(MATCH_NOMATCH);
1925 }
1926 ecode++;
1927 break;
1928
1929 case OP_VSPACE:
1930 if (eptr >= md->end_subject)
1931 {
1932 SCHECK_PARTIAL();
1933 RRETURN(MATCH_NOMATCH);
1934 }
1935 GETCHARINCTEST(c, eptr);
1936 switch(c)
1937 {
1938 default: RRETURN(MATCH_NOMATCH);
1939 case 0x0a: /* LF */
1940 case 0x0b: /* VT */
1941 case 0x0c: /* FF */
1942 case 0x0d: /* CR */
1943 case 0x85: /* NEL */
1944 case 0x2028: /* LINE SEPARATOR */
1945 case 0x2029: /* PARAGRAPH SEPARATOR */
1946 break;
1947 }
1948 ecode++;
1949 break;
1950
1951 #ifdef SUPPORT_UCP
1952 /* Check the next character by Unicode property. We will get here only
1953 if the support is in the binary; otherwise a compile-time error occurs. */
1954
1955 case OP_PROP:
1956 case OP_NOTPROP:
1957 if (eptr >= md->end_subject)
1958 {
1959 SCHECK_PARTIAL();
1960 RRETURN(MATCH_NOMATCH);
1961 }
1962 GETCHARINCTEST(c, eptr);
1963 {
1964 const ucd_record *prop = GET_UCD(c);
1965
1966 switch(ecode[1])
1967 {
1968 case PT_ANY:
1969 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1970 break;
1971
1972 case PT_LAMP:
1973 if ((prop->chartype == ucp_Lu ||
1974 prop->chartype == ucp_Ll ||
1975 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1976 RRETURN(MATCH_NOMATCH);
1977 break;
1978
1979 case PT_GC:
1980 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1981 RRETURN(MATCH_NOMATCH);
1982 break;
1983
1984 case PT_PC:
1985 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1986 RRETURN(MATCH_NOMATCH);
1987 break;
1988
1989 case PT_SC:
1990 if ((ecode[2] != prop->script) == (op == OP_PROP))
1991 RRETURN(MATCH_NOMATCH);
1992 break;
1993
1994 default:
1995 RRETURN(PCRE_ERROR_INTERNAL);
1996 }
1997
1998 ecode += 3;
1999 }
2000 break;
2001
2002 /* Match an extended Unicode sequence. We will get here only if the support
2003 is in the binary; otherwise a compile-time error occurs. */
2004
2005 case OP_EXTUNI:
2006 if (eptr >= md->end_subject)
2007 {
2008 SCHECK_PARTIAL();
2009 RRETURN(MATCH_NOMATCH);
2010 }
2011 GETCHARINCTEST(c, eptr);
2012 {
2013 int category = UCD_CATEGORY(c);
2014 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
2015 while (eptr < md->end_subject)
2016 {
2017 int len = 1;
2018 if (!utf8) c = *eptr; else
2019 {
2020 GETCHARLEN(c, eptr, len);
2021 }
2022 category = UCD_CATEGORY(c);
2023 if (category != ucp_M) break;
2024 eptr += len;
2025 }
2026 }
2027 ecode++;
2028 break;
2029 #endif
2030
2031
2032 /* Match a back reference, possibly repeatedly. Look past the end of the
2033 item to see if there is repeat information following. The code is similar
2034 to that for character classes, but repeated for efficiency. Then obey
2035 similar code to character type repeats - written out again for speed.
2036 However, if the referenced string is the empty string, always treat
2037 it as matched, any number of times (otherwise there could be infinite
2038 loops). */
2039
2040 case OP_REF:
2041 {
2042 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2043 ecode += 3;
2044
2045 /* If the reference is unset, there are two possibilities:
2046
2047 (a) In the default, Perl-compatible state, set the length to be longer
2048 than the amount of subject left; this ensures that every attempt at a
2049 match fails. We can't just fail here, because of the possibility of
2050 quantifiers with zero minima.
2051
2052 (b) If the JavaScript compatibility flag is set, set the length to zero
2053 so that the back reference matches an empty string.
2054
2055 Otherwise, set the length to the length of what was matched by the
2056 referenced subpattern. */
2057
2058 if (offset >= offset_top || md->offset_vector[offset] < 0)
2059 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
2060 else
2061 length = md->offset_vector[offset+1] - md->offset_vector[offset];
2062
2063 /* Set up for repetition, or handle the non-repeated case */
2064
2065 switch (*ecode)
2066 {
2067 case OP_CRSTAR:
2068 case OP_CRMINSTAR:
2069 case OP_CRPLUS:
2070 case OP_CRMINPLUS:
2071 case OP_CRQUERY:
2072 case OP_CRMINQUERY:
2073 c = *ecode++ - OP_CRSTAR;
2074 minimize = (c & 1) != 0;
2075 min = rep_min[c]; /* Pick up values from tables; */
2076 max = rep_max[c]; /* zero for max => infinity */
2077 if (max == 0) max = INT_MAX;
2078 break;
2079
2080 case OP_CRRANGE:
2081 case OP_CRMINRANGE:
2082 minimize = (*ecode == OP_CRMINRANGE);
2083 min = GET2(ecode, 1);
2084 max = GET2(ecode, 3);
2085 if (max == 0) max = INT_MAX;
2086 ecode += 5;
2087 break;
2088
2089 default: /* No repeat follows */
2090 if (!match_ref(offset, eptr, length, md, ims))
2091 {
2092 CHECK_PARTIAL();
2093 RRETURN(MATCH_NOMATCH);
2094 }
2095 eptr += length;
2096 continue; /* With the main loop */
2097 }
2098
2099 /* If the length of the reference is zero, just continue with the
2100 main loop. */
2101
2102 if (length == 0) continue;
2103
2104 /* First, ensure the minimum number of matches are present. We get back
2105 the length of the reference string explicitly rather than passing the
2106 address of eptr, so that eptr can be a register variable. */
2107
2108 for (i = 1; i <= min; i++)
2109 {
2110 if (!match_ref(offset, eptr, length, md, ims))
2111 {
2112 CHECK_PARTIAL();
2113 RRETURN(MATCH_NOMATCH);
2114 }
2115 eptr += length;
2116 }
2117
2118 /* If min = max, continue at the same level without recursion.
2119 They are not both allowed to be zero. */
2120
2121 if (min == max) continue;
2122
2123 /* If minimizing, keep trying and advancing the pointer */
2124
2125 if (minimize)
2126 {
2127 for (fi = min;; fi++)
2128 {
2129 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2130 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2131 if (fi >= max) RRETURN(MATCH_NOMATCH);
2132 if (!match_ref(offset, eptr, length, md, ims))
2133 {
2134 CHECK_PARTIAL();
2135 RRETURN(MATCH_NOMATCH);
2136 }
2137 eptr += length;
2138 }
2139 /* Control never gets here */
2140 }
2141
2142 /* If maximizing, find the longest string and work backwards */
2143
2144 else
2145 {
2146 pp = eptr;
2147 for (i = min; i < max; i++)
2148 {
2149 if (!match_ref(offset, eptr, length, md, ims))
2150 {
2151 CHECK_PARTIAL();
2152 break;
2153 }
2154 eptr += length;
2155 }
2156 while (eptr >= pp)
2157 {
2158 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2159 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2160 eptr -= length;
2161 }
2162 RRETURN(MATCH_NOMATCH);
2163 }
2164 }
2165 /* Control never gets here */
2166
2167 /* Match a bit-mapped character class, possibly repeatedly. This op code is
2168 used when all the characters in the class have values in the range 0-255,
2169 and either the matching is caseful, or the characters are in the range
2170 0-127 when UTF-8 processing is enabled. The only difference between
2171 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2172 encountered.
2173
2174 First, look past the end of the item to see if there is repeat information
2175 following. Then obey similar code to character type repeats - written out
2176 again for speed. */
2177
2178 case OP_NCLASS:
2179 case OP_CLASS:
2180 {
2181 data = ecode + 1; /* Save for matching */
2182 ecode += 33; /* Advance past the item */
2183
2184 switch (*ecode)
2185 {
2186 case OP_CRSTAR:
2187 case OP_CRMINSTAR:
2188 case OP_CRPLUS:
2189 case OP_CRMINPLUS:
2190 case OP_CRQUERY:
2191 case OP_CRMINQUERY:
2192 c = *ecode++ - OP_CRSTAR;
2193 minimize = (c & 1) != 0;
2194 min = rep_min[c]; /* Pick up values from tables; */
2195 max = rep_max[c]; /* zero for max => infinity */
2196 if (max == 0) max = INT_MAX;
2197 break;
2198
2199 case OP_CRRANGE:
2200 case OP_CRMINRANGE:
2201 minimize = (*ecode == OP_CRMINRANGE);
2202 min = GET2(ecode, 1);
2203 max = GET2(ecode, 3);
2204 if (max == 0) max = INT_MAX;
2205 ecode += 5;
2206 break;
2207
2208 default: /* No repeat follows */
2209 min = max = 1;
2210 break;
2211 }
2212
2213 /* First, ensure the minimum number of matches are present. */
2214
2215 #ifdef SUPPORT_UTF8
2216 /* UTF-8 mode */
2217 if (utf8)
2218 {
2219 for (i = 1; i <= min; i++)
2220 {
2221 if (eptr >= md->end_subject)
2222 {
2223 SCHECK_PARTIAL();
2224 RRETURN(MATCH_NOMATCH);
2225 }
2226 GETCHARINC(c, eptr);
2227 if (c > 255)
2228 {
2229 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2230 }
2231 else
2232 {
2233 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2234 }
2235 }
2236 }
2237 else
2238 #endif
2239 /* Not UTF-8 mode */
2240 {
2241 for (i = 1; i <= min; i++)
2242 {
2243 if (eptr >= md->end_subject)
2244 {
2245 SCHECK_PARTIAL();
2246 RRETURN(MATCH_NOMATCH);
2247 }
2248 c = *eptr++;
2249 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2250 }
2251 }
2252
2253 /* If max == min we can continue with the main loop without the
2254 need to recurse. */
2255
2256 if (min == max) continue;
2257
2258 /* If minimizing, keep testing the rest of the expression and advancing
2259 the pointer while it matches the class. */
2260
2261 if (minimize)
2262 {
2263 #ifdef SUPPORT_UTF8
2264 /* UTF-8 mode */
2265 if (utf8)
2266 {
2267 for (fi = min;; fi++)
2268 {
2269 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2270 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2271 if (fi >= max) RRETURN(MATCH_NOMATCH);
2272 if (eptr >= md->end_subject)
2273 {
2274 SCHECK_PARTIAL();
2275 RRETURN(MATCH_NOMATCH);
2276 }
2277 GETCHARINC(c, eptr);
2278 if (c > 255)
2279 {
2280 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2281 }
2282 else
2283 {
2284 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2285 }
2286 }
2287 }
2288 else
2289 #endif
2290 /* Not UTF-8 mode */
2291 {
2292 for (fi = min;; fi++)
2293 {
2294 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2295 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2296 if (fi >= max) RRETURN(MATCH_NOMATCH);
2297 if (eptr >= md->end_subject)
2298 {
2299 SCHECK_PARTIAL();
2300 RRETURN(MATCH_NOMATCH);
2301 }
2302 c = *eptr++;
2303 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2304 }
2305 }
2306 /* Control never gets here */
2307 }
2308
2309 /* If maximizing, find the longest possible run, then work backwards. */
2310
2311 else
2312 {
2313 pp = eptr;
2314
2315 #ifdef SUPPORT_UTF8
2316 /* UTF-8 mode */
2317 if (utf8)
2318 {
2319 for (i = min; i < max; i++)
2320 {
2321 int len = 1;
2322 if (eptr >= md->end_subject)
2323 {
2324 SCHECK_PARTIAL();
2325 break;
2326 }
2327 GETCHARLEN(c, eptr, len);
2328 if (c > 255)
2329 {
2330 if (op == OP_CLASS) break;
2331 }
2332 else
2333 {
2334 if ((data[c/8] & (1 << (c&7))) == 0) break;
2335 }
2336 eptr += len;
2337 }
2338 for (;;)
2339 {
2340 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2341 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2342 if (eptr-- == pp) break; /* Stop if tried at original pos */
2343 BACKCHAR(eptr);
2344 }
2345 }
2346 else
2347 #endif
2348 /* Not UTF-8 mode */
2349 {
2350 for (i = min; i < max; i++)
2351 {
2352 if (eptr >= md->end_subject)
2353 {
2354 SCHECK_PARTIAL();
2355 break;
2356 }
2357 c = *eptr;
2358 if ((data[c/8] & (1 << (c&7))) == 0) break;
2359 eptr++;
2360 }
2361 while (eptr >= pp)
2362 {
2363 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2364 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2365 eptr--;
2366 }
2367 }
2368
2369 RRETURN(MATCH_NOMATCH);
2370 }
2371 }
2372 /* Control never gets here */
2373
2374
2375 /* Match an extended character class. This opcode is encountered only
2376 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2377 mode, because Unicode properties are supported in non-UTF-8 mode. */
2378
2379 #ifdef SUPPORT_UTF8
2380 case OP_XCLASS:
2381 {
2382 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2383 ecode += GET(ecode, 1); /* Advance past the item */
2384
2385 switch (*ecode)
2386 {
2387 case OP_CRSTAR:
2388 case OP_CRMINSTAR:
2389 case OP_CRPLUS:
2390 case OP_CRMINPLUS:
2391 case OP_CRQUERY:
2392 case OP_CRMINQUERY:
2393 c = *ecode++ - OP_CRSTAR;
2394 minimize = (c & 1) != 0;
2395 min = rep_min[c]; /* Pick up values from tables; */
2396 max = rep_max[c]; /* zero for max => infinity */
2397 if (max == 0) max = INT_MAX;
2398 break;
2399
2400 case OP_CRRANGE:
2401 case OP_CRMINRANGE:
2402 minimize = (*ecode == OP_CRMINRANGE);
2403 min = GET2(ecode, 1);
2404 max = GET2(ecode, 3);
2405 if (max == 0) max = INT_MAX;
2406 ecode += 5;
2407 break;
2408
2409 default: /* No repeat follows */
2410 min = max = 1;
2411 break;
2412 }
2413
2414 /* First, ensure the minimum number of matches are present. */
2415
2416 for (i = 1; i <= min; i++)
2417 {
2418 if (eptr >= md->end_subject)
2419 {
2420 SCHECK_PARTIAL();
2421 RRETURN(MATCH_NOMATCH);
2422 }
2423 GETCHARINCTEST(c, eptr);
2424 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2425 }
2426
2427 /* If max == min we can continue with the main loop without the
2428 need to recurse. */
2429
2430 if (min == max) continue;
2431
2432 /* If minimizing, keep testing the rest of the expression and advancing
2433 the pointer while it matches the class. */
2434
2435 if (minimize)
2436 {
2437 for (fi = min;; fi++)
2438 {
2439 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2440 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2441 if (fi >= max) RRETURN(MATCH_NOMATCH);
2442 if (eptr >= md->end_subject)
2443 {
2444 SCHECK_PARTIAL();
2445 RRETURN(MATCH_NOMATCH);
2446 }
2447 GETCHARINCTEST(c, eptr);
2448 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2449 }
2450 /* Control never gets here */
2451 }
2452
2453 /* If maximizing, find the longest possible run, then work backwards. */
2454
2455 else
2456 {
2457 pp = eptr;
2458 for (i = min; i < max; i++)
2459 {
2460 int len = 1;
2461 if (eptr >= md->end_subject)
2462 {
2463 SCHECK_PARTIAL();
2464 break;
2465 }
2466 GETCHARLENTEST(c, eptr, len);
2467 if (!_pcre_xclass(c, data)) break;
2468 eptr += len;
2469 }
2470 for(;;)
2471 {
2472 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2473 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2474 if (eptr-- == pp) break; /* Stop if tried at original pos */
2475 if (utf8) BACKCHAR(eptr);
2476 }
2477 RRETURN(MATCH_NOMATCH);
2478 }
2479
2480 /* Control never gets here */
2481 }
2482 #endif /* End of XCLASS */
2483
2484 /* Match a single character, casefully */
2485
2486 case OP_CHAR:
2487 #ifdef SUPPORT_UTF8
2488 if (utf8)
2489 {
2490 length = 1;
2491 ecode++;
2492 GETCHARLEN(fc, ecode, length);
2493 if (length > md->end_subject - eptr)
2494 {
2495 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2496 RRETURN(MATCH_NOMATCH);
2497 }
2498 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2499 }
2500 else
2501 #endif
2502
2503 /* Non-UTF-8 mode */
2504 {
2505 if (md->end_subject - eptr < 1)
2506 {
2507 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2508 RRETURN(MATCH_NOMATCH);
2509 }
2510 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2511 ecode += 2;
2512 }
2513 break;
2514
2515 /* Match a single character, caselessly */
2516
2517 case OP_CHARNC:
2518 #ifdef SUPPORT_UTF8
2519 if (utf8)
2520 {
2521 length = 1;
2522 ecode++;
2523 GETCHARLEN(fc, ecode, length);
2524
2525 if (length > md->end_subject - eptr)
2526 {
2527 CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2528 RRETURN(MATCH_NOMATCH);
2529 }
2530
2531 /* If the pattern character's value is < 128, we have only one byte, and
2532 can use the fast lookup table. */
2533
2534 if (fc < 128)
2535 {
2536 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2537 }
2538
2539 /* Otherwise we must pick up the subject character */
2540
2541 else
2542 {
2543 unsigned int dc;
2544 GETCHARINC(dc, eptr);
2545 ecode += length;
2546
2547 /* If we have Unicode property support, we can use it to test the other
2548 case of the character, if there is one. */
2549
2550 if (fc != dc)
2551 {
2552 #ifdef SUPPORT_UCP
2553 if (dc != UCD_OTHERCASE(fc))
2554 #endif
2555 RRETURN(MATCH_NOMATCH);
2556 }
2557 }
2558 }
2559 else
2560 #endif /* SUPPORT_UTF8 */
2561
2562 /* Non-UTF-8 mode */
2563 {
2564 if (md->end_subject - eptr < 1)
2565 {
2566 SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2567 RRETURN(MATCH_NOMATCH);
2568 }
2569 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2570 ecode += 2;
2571 }
2572 break;
2573
2574 /* Match a single character repeatedly. */
2575
2576 case OP_EXACT:
2577 min = max = GET2(ecode, 1);
2578 ecode += 3;
2579 goto REPEATCHAR;
2580
2581 case OP_POSUPTO:
2582 possessive = TRUE;
2583 /* Fall through */
2584
2585 case OP_UPTO:
2586 case OP_MINUPTO:
2587 min = 0;
2588 max = GET2(ecode, 1);
2589 minimize = *ecode == OP_MINUPTO;
2590 ecode += 3;
2591 goto REPEATCHAR;
2592
2593 case OP_POSSTAR:
2594 possessive = TRUE;
2595 min = 0;
2596 max = INT_MAX;
2597 ecode++;
2598 goto REPEATCHAR;
2599
2600 case OP_POSPLUS:
2601 possessive = TRUE;
2602 min = 1;
2603 max = INT_MAX;
2604 ecode++;
2605 goto REPEATCHAR;
2606
2607 case OP_POSQUERY:
2608 possessive = TRUE;
2609 min = 0;
2610 max = 1;
2611 ecode++;
2612 goto REPEATCHAR;
2613
2614 case OP_STAR:
2615 case OP_MINSTAR:
2616 case OP_PLUS:
2617 case OP_MINPLUS:
2618 case OP_QUERY:
2619 case OP_MINQUERY:
2620 c = *ecode++ - OP_STAR;
2621 minimize = (c & 1) != 0;
2622
2623 min = rep_min[c]; /* Pick up values from tables; */
2624 max = rep_max[c]; /* zero for max => infinity */
2625 if (max == 0) max = INT_MAX;
2626
2627 /* Common code for all repeated single-character matches. */
2628
2629 REPEATCHAR:
2630 #ifdef SUPPORT_UTF8
2631 if (utf8)
2632 {
2633 length = 1;
2634 charptr = ecode;
2635 GETCHARLEN(fc, ecode, length);
2636 ecode += length;
2637
2638 /* Handle multibyte character matching specially here. There is
2639 support for caseless matching if UCP support is present. */
2640
2641 if (length > 1)
2642 {
2643 #ifdef SUPPORT_UCP
2644 unsigned int othercase;
2645 if ((ims & PCRE_CASELESS) != 0 &&
2646 (othercase = UCD_OTHERCASE(fc)) != fc)
2647 oclength = _pcre_ord2utf8(othercase, occhars);
2648 else oclength = 0;
2649 #endif /* SUPPORT_UCP */
2650
2651 for (i = 1; i <= min; i++)
2652 {
2653 if (eptr <= md->end_subject - length &&
2654 memcmp(eptr, charptr, length) == 0) eptr += length;
2655 #ifdef SUPPORT_UCP
2656 else if (oclength > 0 &&
2657 eptr <= md->end_subject - oclength &&
2658 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2659 #endif /* SUPPORT_UCP */
2660 else
2661 {
2662 CHECK_PARTIAL();
2663 RRETURN(MATCH_NOMATCH);
2664 }
2665 }
2666
2667 if (min == max) continue;
2668
2669 if (minimize)
2670 {
2671 for (fi = min;; fi++)
2672 {
2673 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2674 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2675 if (fi >= max) RRETURN(MATCH_NOMATCH);
2676 if (eptr <= md->end_subject - length &&
2677 memcmp(eptr, charptr, length) == 0) eptr += length;
2678 #ifdef SUPPORT_UCP
2679 else if (oclength > 0 &&
2680 eptr <= md->end_subject - oclength &&
2681 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2682 #endif /* SUPPORT_UCP */
2683 else
2684 {
2685 CHECK_PARTIAL();
2686 RRETURN(MATCH_NOMATCH);
2687 }
2688 }
2689 /* Control never gets here */
2690 }
2691
2692 else /* Maximize */
2693 {
2694 pp = eptr;
2695 for (i = min; i < max; i++)
2696 {
2697 if (eptr <= md->end_subject - length &&
2698 memcmp(eptr, charptr, length) == 0) eptr += length;
2699 #ifdef SUPPORT_UCP
2700 else if (oclength > 0 &&
2701 eptr <= md->end_subject - oclength &&
2702 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2703 #endif /* SUPPORT_UCP */
2704 else
2705 {
2706 CHECK_PARTIAL();
2707 break;
2708 }
2709 }
2710
2711 if (possessive) continue;
2712
2713 for(;;)
2714 {
2715 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2716 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2717 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2718 #ifdef SUPPORT_UCP
2719 eptr--;
2720 BACKCHAR(eptr);
2721 #else /* without SUPPORT_UCP */
2722 eptr -= length;
2723 #endif /* SUPPORT_UCP */
2724 }
2725 }
2726 /* Control never gets here */
2727 }
2728
2729 /* If the length of a UTF-8 character is 1, we fall through here, and
2730 obey the code as for non-UTF-8 characters below, though in this case the
2731 value of fc will always be < 128. */
2732 }
2733 else
2734 #endif /* SUPPORT_UTF8 */
2735
2736 /* When not in UTF-8 mode, load a single-byte character. */
2737
2738 fc = *ecode++;
2739
2740 /* The value of fc at this point is always less than 256, though we may or
2741 may not be in UTF-8 mode. The code is duplicated for the caseless and
2742 caseful cases, for speed, since matching characters is likely to be quite
2743 common. First, ensure the minimum number of matches are present. If min =
2744 max, continue at the same level without recursing. Otherwise, if
2745 minimizing, keep trying the rest of the expression and advancing one
2746 matching character if failing, up to the maximum. Alternatively, if
2747 maximizing, find the maximum number of characters and work backwards. */
2748
2749 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2750 max, eptr));
2751
2752 if ((ims & PCRE_CASELESS) != 0)
2753 {
2754 fc = md->lcc[fc];
2755 for (i = 1; i <= min; i++)
2756 {
2757 if (eptr >= md->end_subject)
2758 {
2759 SCHECK_PARTIAL();
2760 RRETURN(MATCH_NOMATCH);
2761 }
2762 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2763 }
2764 if (min == max) continue;
2765 if (minimize)
2766 {
2767 for (fi = min;; fi++)
2768 {
2769 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2770 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2771 if (fi >= max) RRETURN(MATCH_NOMATCH);
2772 if (eptr >= md->end_subject)
2773 {
2774 SCHECK_PARTIAL();
2775 RRETURN(MATCH_NOMATCH);
2776 }
2777 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2778 }
2779 /* Control never gets here */
2780 }
2781 else /* Maximize */
2782 {
2783 pp = eptr;
2784 for (i = min; i < max; i++)
2785 {
2786 if (eptr >= md->end_subject)
2787 {
2788 SCHECK_PARTIAL();
2789 break;
2790 }
2791 if (fc != md->lcc[*eptr]) break;
2792 eptr++;
2793 }
2794
2795 if (possessive) continue;
2796
2797 while (eptr >= pp)
2798 {
2799 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2800 eptr--;
2801 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2802 }
2803 RRETURN(MATCH_NOMATCH);
2804 }
2805 /* Control never gets here */
2806 }
2807
2808 /* Caseful comparisons (includes all multi-byte characters) */
2809
2810 else
2811 {
2812 for (i = 1; i <= min; i++)
2813 {
2814 if (eptr >= md->end_subject)
2815 {
2816 SCHECK_PARTIAL();
2817 RRETURN(MATCH_NOMATCH);
2818 }
2819 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2820 }
2821
2822 if (min == max) continue;
2823
2824 if (minimize)
2825 {
2826 for (fi = min;; fi++)
2827 {
2828 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2830 if (fi >= max) RRETURN(MATCH_NOMATCH);
2831 if (eptr >= md->end_subject)
2832 {
2833 SCHECK_PARTIAL();
2834 RRETURN(MATCH_NOMATCH);
2835 }
2836 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2837 }
2838 /* Control never gets here */
2839 }
2840 else /* Maximize */
2841 {
2842 pp = eptr;
2843 for (i = min; i < max; i++)
2844 {
2845 if (eptr >= md->end_subject)
2846 {
2847 SCHECK_PARTIAL();
2848 break;
2849 }
2850 if (fc != *eptr) break;
2851 eptr++;
2852 }
2853 if (possessive) continue;
2854
2855 while (eptr >= pp)
2856 {
2857 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2858 eptr--;
2859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2860 }
2861 RRETURN(MATCH_NOMATCH);
2862 }
2863 }
2864 /* Control never gets here */
2865
2866 /* Match a negated single one-byte character. The subject character we are
2867 checking can be multibyte. */
2868
2869 case OP_NOT:
2870 if (eptr >= md->end_subject)
2871 {
2872 SCHECK_PARTIAL();
2873 RRETURN(MATCH_NOMATCH);
2874 }
2875 ecode++;
2876 GETCHARINCTEST(c, eptr);
2877 if ((ims & PCRE_CASELESS) != 0)
2878 {
2879 #ifdef SUPPORT_UTF8
2880 if (c < 256)
2881 #endif
2882 c = md->lcc[c];
2883 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2884 }
2885 else
2886 {
2887 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2888 }
2889 break;
2890
2891 /* Match a negated single one-byte character repeatedly. This is almost a
2892 repeat of the code for a repeated single character, but I haven't found a
2893 nice way of commoning these up that doesn't require a test of the
2894 positive/negative option for each character match. Maybe that wouldn't add
2895 very much to the time taken, but character matching *is* what this is all
2896 about... */
2897
2898 case OP_NOTEXACT:
2899 min = max = GET2(ecode, 1);
2900 ecode += 3;
2901 goto REPEATNOTCHAR;
2902
2903 case OP_NOTUPTO:
2904 case OP_NOTMINUPTO:
2905 min = 0;
2906 max = GET2(ecode, 1);
2907 minimize = *ecode == OP_NOTMINUPTO;
2908 ecode += 3;
2909 goto REPEATNOTCHAR;
2910
2911 case OP_NOTPOSSTAR:
2912 possessive = TRUE;
2913 min = 0;
2914 max = INT_MAX;
2915 ecode++;
2916 goto REPEATNOTCHAR;
2917
2918 case OP_NOTPOSPLUS:
2919 possessive = TRUE;
2920 min = 1;
2921 max = INT_MAX;
2922 ecode++;
2923 goto REPEATNOTCHAR;
2924
2925 case OP_NOTPOSQUERY:
2926 possessive = TRUE;
2927 min = 0;
2928 max = 1;
2929 ecode++;
2930 goto REPEATNOTCHAR;
2931
2932 case OP_NOTPOSUPTO:
2933 possessive = TRUE;
2934 min = 0;
2935 max = GET2(ecode, 1);
2936 ecode += 3;
2937 goto REPEATNOTCHAR;
2938
2939 case OP_NOTSTAR:
2940 case OP_NOTMINSTAR:
2941 case OP_NOTPLUS:
2942 case OP_NOTMINPLUS:
2943 case OP_NOTQUERY:
2944 case OP_NOTMINQUERY:
2945 c = *ecode++ - OP_NOTSTAR;
2946 minimize = (c & 1) != 0;
2947 min = rep_min[c]; /* Pick up values from tables; */
2948 max = rep_max[c]; /* zero for max => infinity */
2949 if (max == 0) max = INT_MAX;
2950
2951 /* Common code for all repeated single-byte matches. */
2952
2953 REPEATNOTCHAR:
2954 fc = *ecode++;
2955
2956 /* The code is duplicated for the caseless and caseful cases, for speed,
2957 since matching characters is likely to be quite common. First, ensure the
2958 minimum number of matches are present. If min = max, continue at the same
2959 level without recursing. Otherwise, if minimizing, keep trying the rest of
2960 the expression and advancing one matching character if failing, up to the
2961 maximum. Alternatively, if maximizing, find the maximum number of
2962 characters and work backwards. */
2963
2964 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2965 max, eptr));
2966
2967 if ((ims & PCRE_CASELESS) != 0)
2968 {
2969 fc = md->lcc[fc];
2970
2971 #ifdef SUPPORT_UTF8
2972 /* UTF-8 mode */
2973 if (utf8)
2974 {
2975 register unsigned int d;
2976 for (i = 1; i <= min; i++)
2977 {
2978 if (eptr >= md->end_subject)
2979 {
2980 SCHECK_PARTIAL();
2981 RRETURN(MATCH_NOMATCH);
2982 }
2983 GETCHARINC(d, eptr);
2984 if (d < 256) d = md->lcc[d];
2985 if (fc == d) RRETURN(MATCH_NOMATCH);
2986 }
2987 }
2988 else
2989 #endif
2990
2991 /* Not UTF-8 mode */
2992 {
2993 for (i = 1; i <= min; i++)
2994 {
2995 if (eptr >= md->end_subject)
2996 {
2997 SCHECK_PARTIAL();
2998 RRETURN(MATCH_NOMATCH);
2999 }
3000 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3001 }
3002 }
3003
3004 if (min == max) continue;
3005
3006 if (minimize)
3007 {
3008 #ifdef SUPPORT_UTF8
3009 /* UTF-8 mode */
3010 if (utf8)
3011 {
3012 register unsigned int d;
3013 for (fi = min;; fi++)
3014 {
3015 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3016 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3017 if (fi >= max) RRETURN(MATCH_NOMATCH);
3018 if (eptr >= md->end_subject)
3019 {
3020 SCHECK_PARTIAL();
3021 RRETURN(MATCH_NOMATCH);
3022 }
3023 GETCHARINC(d, eptr);
3024 if (d < 256) d = md->lcc[d];
3025 if (fc == d) RRETURN(MATCH_NOMATCH);
3026 }
3027 }
3028 else
3029 #endif
3030 /* Not UTF-8 mode */
3031 {
3032 for (fi = min;; fi++)
3033 {
3034 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3035 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3036 if (fi >= max) RRETURN(MATCH_NOMATCH);
3037 if (eptr >= md->end_subject)
3038 {
3039 SCHECK_PARTIAL();
3040 RRETURN(MATCH_NOMATCH);
3041 }
3042 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
3043 }
3044 }
3045 /* Control never gets here */
3046 }
3047
3048 /* Maximize case */
3049
3050 else
3051 {
3052 pp = eptr;
3053
3054 #ifdef SUPPORT_UTF8
3055 /* UTF-8 mode */
3056 if (utf8)
3057 {
3058 register unsigned int d;
3059 for (i = min; i < max; i++)
3060 {
3061 int len = 1;
3062 if (eptr >= md->end_subject)
3063 {
3064 SCHECK_PARTIAL();
3065 break;
3066 }
3067 GETCHARLEN(d, eptr, len);
3068 if (d < 256) d = md->lcc[d];
3069 if (fc == d) break;
3070 eptr += len;
3071 }
3072 if (possessive) continue;
3073 for(;;)
3074 {
3075 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3076 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3077 if (eptr-- == pp) break; /* Stop if tried at original pos */
3078 BACKCHAR(eptr);
3079 }
3080 }
3081 else
3082 #endif
3083 /* Not UTF-8 mode */
3084 {
3085 for (i = min; i < max; i++)
3086 {
3087 if (eptr >= md->end_subject)
3088 {
3089 SCHECK_PARTIAL();
3090 break;
3091 }
3092 if (fc == md->lcc[*eptr]) break;
3093 eptr++;
3094 }
3095 if (possessive) continue;
3096 while (eptr >= pp)
3097 {
3098 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3099 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3100 eptr--;
3101 }
3102 }
3103
3104 RRETURN(MATCH_NOMATCH);
3105 }
3106 /* Control never gets here */
3107 }
3108
3109 /* Caseful comparisons */
3110
3111 else
3112 {
3113 #ifdef SUPPORT_UTF8
3114 /* UTF-8 mode */
3115 if (utf8)
3116 {
3117 register unsigned int d;
3118 for (i = 1; i <= min; i++)
3119 {
3120 if (eptr >= md->end_subject)
3121 {
3122 SCHECK_PARTIAL();
3123 RRETURN(MATCH_NOMATCH);
3124 }
3125 GETCHARINC(d, eptr);
3126 if (fc == d) RRETURN(MATCH_NOMATCH);
3127 }
3128 }
3129 else
3130 #endif
3131 /* Not UTF-8 mode */
3132 {
3133 for (i = 1; i <= min; i++)
3134 {
3135 if (eptr >= md->end_subject)
3136 {
3137 SCHECK_PARTIAL();
3138 RRETURN(MATCH_NOMATCH);
3139 }
3140 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3141 }
3142 }
3143
3144 if (min == max) continue;
3145
3146 if (minimize)
3147 {
3148 #ifdef SUPPORT_UTF8
3149 /* UTF-8 mode */
3150 if (utf8)
3151 {
3152 register unsigned int d;
3153 for (fi = min;; fi++)
3154 {
3155 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3156 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3157 if (fi >= max) RRETURN(MATCH_NOMATCH);
3158 if (eptr >= md->end_subject)
3159 {
3160 SCHECK_PARTIAL();
3161 RRETURN(MATCH_NOMATCH);
3162 }
3163 GETCHARINC(d, eptr);
3164 if (fc == d) RRETURN(MATCH_NOMATCH);
3165 }
3166 }
3167 else
3168 #endif
3169 /* Not UTF-8 mode */
3170 {
3171 for (fi = min;; fi++)
3172 {
3173 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3174 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3175 if (fi >= max) RRETURN(MATCH_NOMATCH);
3176 if (eptr >= md->end_subject)
3177 {
3178 SCHECK_PARTIAL();
3179 RRETURN(MATCH_NOMATCH);
3180 }
3181 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
3182 }
3183 }
3184 /* Control never gets here */
3185 }
3186
3187 /* Maximize case */
3188
3189 else
3190 {
3191 pp = eptr;
3192
3193 #ifdef SUPPORT_UTF8
3194 /* UTF-8 mode */
3195 if (utf8)
3196 {
3197 register unsigned int d;
3198 for (i = min; i < max; i++)
3199 {
3200 int len = 1;
3201 if (eptr >= md->end_subject)
3202 {
3203 SCHECK_PARTIAL();
3204 break;
3205 }
3206 GETCHARLEN(d, eptr, len);
3207 if (fc == d) break;
3208 eptr += len;
3209 }
3210 if (possessive) continue;
3211 for(;;)
3212 {
3213 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3214 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3215 if (eptr-- == pp) break; /* Stop if tried at original pos */
3216 BACKCHAR(eptr);
3217 }
3218 }
3219 else
3220 #endif
3221 /* Not UTF-8 mode */
3222 {
3223 for (i = min; i < max; i++)
3224 {
3225 if (eptr >= md->end_subject)
3226 {
3227 SCHECK_PARTIAL();
3228 break;
3229 }
3230 if (fc == *eptr) break;
3231 eptr++;
3232 }
3233 if (possessive) continue;
3234 while (eptr >= pp)
3235 {
3236 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3237 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3238 eptr--;
3239 }
3240 }
3241
3242 RRETURN(MATCH_NOMATCH);
3243 }
3244 }
3245 /* Control never gets here */
3246
3247 /* Match a single character type repeatedly; several different opcodes
3248 share code. This is very similar to the code for single characters, but we
3249 repeat it in the interests of efficiency. */
3250
3251 case OP_TYPEEXACT:
3252 min = max = GET2(ecode, 1);
3253 minimize = TRUE;
3254 ecode += 3;
3255 goto REPEATTYPE;
3256
3257 case OP_TYPEUPTO:
3258 case OP_TYPEMINUPTO:
3259 min = 0;
3260 max = GET2(ecode, 1);
3261 minimize = *ecode == OP_TYPEMINUPTO;
3262 ecode += 3;
3263 goto REPEATTYPE;
3264
3265 case OP_TYPEPOSSTAR:
3266 possessive = TRUE;
3267 min = 0;
3268 max = INT_MAX;
3269 ecode++;
3270 goto REPEATTYPE;
3271
3272 case OP_TYPEPOSPLUS:
3273 possessive = TRUE;
3274 min = 1;
3275 max = INT_MAX;
3276 ecode++;
3277 goto REPEATTYPE;
3278
3279 case OP_TYPEPOSQUERY:
3280 possessive = TRUE;
3281 min = 0;
3282 max = 1;
3283 ecode++;
3284 goto REPEATTYPE;
3285
3286 case OP_TYPEPOSUPTO:
3287 possessive = TRUE;
3288 min = 0;
3289 max = GET2(ecode, 1);
3290 ecode += 3;
3291 goto REPEATTYPE;
3292
3293 case OP_TYPESTAR:
3294 case OP_TYPEMINSTAR:
3295 case OP_TYPEPLUS:
3296 case OP_TYPEMINPLUS:
3297 case OP_TYPEQUERY:
3298 case OP_TYPEMINQUERY:
3299 c = *ecode++ - OP_TYPESTAR;
3300 minimize = (c & 1) != 0;
3301 min = rep_min[c]; /* Pick up values from tables; */
3302 max = rep_max[c]; /* zero for max => infinity */
3303 if (max == 0) max = INT_MAX;
3304
3305 /* Common code for all repeated single character type matches. Note that
3306 in UTF-8 mode, '.' matches a character of any length, but for the other
3307 character types, the valid characters are all one-byte long. */
3308
3309 REPEATTYPE:
3310 ctype = *ecode++; /* Code for the character type */
3311
3312 #ifdef SUPPORT_UCP
3313 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3314 {
3315 prop_fail_result = ctype == OP_NOTPROP;
3316 prop_type = *ecode++;
3317 prop_value = *ecode++;
3318 }
3319 else prop_type = -1;
3320 #endif
3321
3322 /* First, ensure the minimum number of matches are present. Use inline
3323 code for maximizing the speed, and do the type test once at the start
3324 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3325 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3326 and single-bytes. */
3327
3328 if (min > 0)
3329 {
3330 #ifdef SUPPORT_UCP
3331 if (prop_type >= 0)
3332 {
3333 switch(prop_type)
3334 {
3335 case PT_ANY:
3336 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3337 for (i = 1; i <= min; i++)
3338 {
3339 if (eptr >= md->end_subject)
3340 {
3341 SCHECK_PARTIAL();
3342 RRETURN(MATCH_NOMATCH);
3343 }
3344 GETCHARINCTEST(c, eptr);
3345 }
3346 break;
3347
3348 case PT_LAMP:
3349 for (i = 1; i <= min; i++)
3350 {
3351 if (eptr >= md->end_subject)
3352 {
3353 SCHECK_PARTIAL();
3354 RRETURN(MATCH_NOMATCH);
3355 }
3356 GETCHARINCTEST(c, eptr);
3357 prop_chartype = UCD_CHARTYPE(c);
3358 if ((prop_chartype == ucp_Lu ||
3359 prop_chartype == ucp_Ll ||
3360 prop_chartype == ucp_Lt) == prop_fail_result)
3361 RRETURN(MATCH_NOMATCH);
3362 }
3363 break;
3364
3365 case PT_GC:
3366 for (i = 1; i <= min; i++)
3367 {
3368 if (eptr >= md->end_subject)
3369 {
3370 SCHECK_PARTIAL();
3371 RRETURN(MATCH_NOMATCH);
3372 }
3373 GETCHARINCTEST(c, eptr);
3374 prop_category = UCD_CATEGORY(c);
3375 if ((prop_category == prop_value) == prop_fail_result)
3376 RRETURN(MATCH_NOMATCH);
3377 }
3378 break;
3379
3380 case PT_PC:
3381 for (i = 1; i <= min; i++)
3382 {
3383 if (eptr >= md->end_subject)
3384 {
3385 SCHECK_PARTIAL();
3386 RRETURN(MATCH_NOMATCH);
3387 }
3388 GETCHARINCTEST(c, eptr);
3389 prop_chartype = UCD_CHARTYPE(c);
3390 if ((prop_chartype == prop_value) == prop_fail_result)
3391 RRETURN(MATCH_NOMATCH);
3392 }
3393 break;
3394
3395 case PT_SC:
3396 for (i = 1; i <= min; i++)
3397 {
3398 if (eptr >= md->end_subject)
3399 {
3400 SCHECK_PARTIAL();
3401 RRETURN(MATCH_NOMATCH);
3402 }
3403 GETCHARINCTEST(c, eptr);
3404 prop_script = UCD_SCRIPT(c);
3405 if ((prop_script == prop_value) == prop_fail_result)
3406 RRETURN(MATCH_NOMATCH);
3407 }
3408 break;
3409
3410 default:
3411 RRETURN(PCRE_ERROR_INTERNAL);
3412 }
3413 }
3414
3415 /* Match extended Unicode sequences. We will get here only if the
3416 support is in the binary; otherwise a compile-time error occurs. */
3417
3418 else if (ctype == OP_EXTUNI)
3419 {
3420 for (i = 1; i <= min; i++)
3421 {
3422 if (eptr >= md->end_subject)
3423 {
3424 SCHECK_PARTIAL();
3425 RRETURN(MATCH_NOMATCH);
3426 }
3427 GETCHARINCTEST(c, eptr);
3428 prop_category = UCD_CATEGORY(c);
3429 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3430 while (eptr < md->end_subject)
3431 {
3432 int len = 1;
3433 if (!utf8) c = *eptr;
3434 else { GETCHARLEN(c, eptr, len); }
3435 prop_category = UCD_CATEGORY(c);
3436 if (prop_category != ucp_M) break;
3437 eptr += len;
3438 }
3439 }
3440 }
3441
3442 else
3443 #endif /* SUPPORT_UCP */
3444
3445 /* Handle all other cases when the coding is UTF-8 */
3446
3447 #ifdef SUPPORT_UTF8
3448 if (utf8) switch(ctype)
3449 {
3450 case OP_ANY:
3451 for (i = 1; i <= min; i++)
3452 {
3453 if (eptr >= md->end_subject)
3454 {
3455 SCHECK_PARTIAL();
3456 RRETURN(MATCH_NOMATCH);
3457 }
3458 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3459 eptr++;
3460 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3461 }
3462 break;
3463
3464 case OP_ALLANY:
3465 for (i = 1; i <= min; i++)
3466 {
3467 if (eptr >= md->end_subject)
3468 {
3469 SCHECK_PARTIAL();
3470 RRETURN(MATCH_NOMATCH);
3471 }
3472 eptr++;
3473 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3474 }
3475 break;
3476
3477 case OP_ANYBYTE:
3478 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3479 eptr += min;
3480 break;
3481
3482 case OP_ANYNL:
3483 for (i = 1; i <= min; i++)
3484 {
3485 if (eptr >= md->end_subject)
3486 {
3487 SCHECK_PARTIAL();
3488 RRETURN(MATCH_NOMATCH);
3489 }
3490 GETCHARINC(c, eptr);
3491 switch(c)
3492 {
3493 default: RRETURN(MATCH_NOMATCH);
3494 case 0x000d:
3495 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3496 break;
3497
3498 case 0x000a:
3499 break;
3500
3501 case 0x000b:
3502 case 0x000c:
3503 case 0x0085:
3504 case 0x2028:
3505 case 0x2029:
3506 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3507 break;
3508 }
3509 }
3510 break;
3511
3512 case OP_NOT_HSPACE:
3513 for (i = 1; i <= min; i++)
3514 {
3515 if (eptr >= md->end_subject)
3516 {
3517 SCHECK_PARTIAL();
3518 RRETURN(MATCH_NOMATCH);
3519 }
3520 GETCHARINC(c, eptr);
3521 switch(c)
3522 {
3523 default: break;
3524 case 0x09: /* HT */
3525 case 0x20: /* SPACE */
3526 case 0xa0: /* NBSP */
3527 case 0x1680: /* OGHAM SPACE MARK */
3528 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3529 case 0x2000: /* EN QUAD */
3530 case 0x2001: /* EM QUAD */
3531 case 0x2002: /* EN SPACE */
3532 case 0x2003: /* EM SPACE */
3533 case 0x2004: /* THREE-PER-EM SPACE */
3534 case 0x2005: /* FOUR-PER-EM SPACE */
3535 case 0x2006: /* SIX-PER-EM SPACE */
3536 case 0x2007: /* FIGURE SPACE */
3537 case 0x2008: /* PUNCTUATION SPACE */
3538 case 0x2009: /* THIN SPACE */
3539 case 0x200A: /* HAIR SPACE */
3540 case 0x202f: /* NARROW NO-BREAK SPACE */
3541 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3542 case 0x3000: /* IDEOGRAPHIC SPACE */
3543 RRETURN(MATCH_NOMATCH);
3544 }
3545 }
3546 break;
3547
3548 case OP_HSPACE:
3549 for (i = 1; i <= min; i++)
3550 {
3551 if (eptr >= md->end_subject)
3552 {
3553 SCHECK_PARTIAL();
3554 RRETURN(MATCH_NOMATCH);
3555 }
3556 GETCHARINC(c, eptr);
3557 switch(c)
3558 {
3559 default: RRETURN(MATCH_NOMATCH);
3560 case 0x09: /* HT */
3561 case 0x20: /* SPACE */
3562 case 0xa0: /* NBSP */
3563 case 0x1680: /* OGHAM SPACE MARK */
3564 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3565 case 0x2000: /* EN QUAD */
3566 case 0x2001: /* EM QUAD */
3567 case 0x2002: /* EN SPACE */
3568 case 0x2003: /* EM SPACE */
3569 case 0x2004: /* THREE-PER-EM SPACE */
3570 case 0x2005: /* FOUR-PER-EM SPACE */
3571 case 0x2006: /* SIX-PER-EM SPACE */
3572 case 0x2007: /* FIGURE SPACE */
3573 case 0x2008: /* PUNCTUATION SPACE */
3574 case 0x2009: /* THIN SPACE */
3575 case 0x200A: /* HAIR SPACE */
3576 case 0x202f: /* NARROW NO-BREAK SPACE */
3577 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3578 case 0x3000: /* IDEOGRAPHIC SPACE */
3579 break;
3580 }
3581 }
3582 break;
3583
3584 case OP_NOT_VSPACE:
3585 for (i = 1; i <= min; i++)
3586 {
3587 if (eptr >= md->end_subject)
3588 {
3589 SCHECK_PARTIAL();
3590 RRETURN(MATCH_NOMATCH);
3591 }
3592 GETCHARINC(c, eptr);
3593 switch(c)
3594 {
3595 default: break;
3596 case 0x0a: /* LF */
3597 case 0x0b: /* VT */
3598 case 0x0c: /* FF */
3599 case 0x0d: /* CR */
3600 case 0x85: /* NEL */
3601 case 0x2028: /* LINE SEPARATOR */
3602 case 0x2029: /* PARAGRAPH SEPARATOR */
3603 RRETURN(MATCH_NOMATCH);
3604 }
3605 }
3606 break;
3607
3608 case OP_VSPACE:
3609 for (i = 1; i <= min; i++)
3610 {
3611 if (eptr >= md->end_subject)
3612 {
3613 SCHECK_PARTIAL();
3614 RRETURN(MATCH_NOMATCH);
3615 }
3616 GETCHARINC(c, eptr);
3617 switch(c)
3618 {
3619 default: RRETURN(MATCH_NOMATCH);
3620 case 0x0a: /* LF */
3621 case 0x0b: /* VT */
3622 case 0x0c: /* FF */
3623 case 0x0d: /* CR */
3624 case 0x85: /* NEL */
3625 case 0x2028: /* LINE SEPARATOR */
3626 case 0x2029: /* PARAGRAPH SEPARATOR */
3627 break;
3628 }
3629 }
3630 break;
3631
3632 case OP_NOT_DIGIT:
3633 for (i = 1; i <= min; i++)
3634 {
3635 if (eptr >= md->end_subject)
3636 {
3637 SCHECK_PARTIAL();
3638 RRETURN(MATCH_NOMATCH);
3639 }
3640 GETCHARINC(c, eptr);
3641 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3642 RRETURN(MATCH_NOMATCH);
3643 }
3644 break;
3645
3646 case OP_DIGIT:
3647 for (i = 1; i <= min; i++)
3648 {
3649 if (eptr >= md->end_subject)
3650 {
3651 SCHECK_PARTIAL();
3652 RRETURN(MATCH_NOMATCH);
3653 }
3654 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3655 RRETURN(MATCH_NOMATCH);
3656 /* No need to skip more bytes - we know it's a 1-byte character */
3657 }
3658 break;
3659
3660 case OP_NOT_WHITESPACE:
3661 for (i = 1; i <= min; i++)
3662 {
3663 if (eptr >= md->end_subject)
3664 {
3665 SCHECK_PARTIAL();
3666 RRETURN(MATCH_NOMATCH);
3667 }
3668 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3669 RRETURN(MATCH_NOMATCH);
3670 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3671 }
3672 break;
3673
3674 case OP_WHITESPACE:
3675 for (i = 1; i <= min; i++)
3676 {
3677 if (eptr >= md->end_subject)
3678 {
3679 SCHECK_PARTIAL();
3680 RRETURN(MATCH_NOMATCH);
3681 }
3682 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3683 RRETURN(MATCH_NOMATCH);
3684 /* No need to skip more bytes - we know it's a 1-byte character */
3685 }
3686 break;
3687
3688 case OP_NOT_WORDCHAR:
3689 for (i = 1; i <= min; i++)
3690 {
3691 if (eptr >= md->end_subject ||
3692 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3693 RRETURN(MATCH_NOMATCH);
3694 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3695 }
3696 break;
3697
3698 case OP_WORDCHAR:
3699 for (i = 1; i <= min; i++)
3700 {
3701 if (eptr >= md->end_subject)
3702 {
3703 SCHECK_PARTIAL();
3704 RRETURN(MATCH_NOMATCH);
3705 }
3706 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3707 RRETURN(MATCH_NOMATCH);
3708 /* No need to skip more bytes - we know it's a 1-byte character */
3709 }
3710 break;
3711
3712 default:
3713 RRETURN(PCRE_ERROR_INTERNAL);
3714 } /* End switch(ctype) */
3715
3716 else
3717 #endif /* SUPPORT_UTF8 */
3718
3719 /* Code for the non-UTF-8 case for minimum matching of operators other
3720 than OP_PROP and OP_NOTPROP. */
3721
3722 switch(ctype)
3723 {
3724 case OP_ANY:
3725 for (i = 1; i <= min; i++)
3726 {
3727 if (eptr >= md->end_subject)
3728 {
3729 SCHECK_PARTIAL();
3730 RRETURN(MATCH_NOMATCH);
3731 }
3732 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3733 eptr++;
3734 }
3735 break;
3736
3737 case OP_ALLANY:
3738 if (eptr > md->end_subject - min)
3739 {
3740 SCHECK_PARTIAL();
3741 RRETURN(MATCH_NOMATCH);
3742 }
3743 eptr += min;
3744 break;
3745
3746 case OP_ANYBYTE:
3747 if (eptr > md->end_subject - min)
3748 {
3749 SCHECK_PARTIAL();
3750 RRETURN(MATCH_NOMATCH);
3751 }
3752 eptr += min;
3753 break;
3754
3755 case OP_ANYNL:
3756 for (i = 1; i <= min; i++)
3757 {
3758 if (eptr >= md->end_subject)
3759 {
3760 SCHECK_PARTIAL();
3761 RRETURN(MATCH_NOMATCH);
3762 }
3763 switch(*eptr++)
3764 {
3765 default: RRETURN(MATCH_NOMATCH);
3766 case 0x000d:
3767 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3768 break;
3769 case 0x000a:
3770 break;
3771
3772 case 0x000b:
3773 case 0x000c:
3774 case 0x0085:
3775 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3776 break;
3777 }
3778 }
3779 break;
3780
3781 case OP_NOT_HSPACE:
3782 for (i = 1; i <= min; i++)
3783 {
3784 if (eptr >= md->end_subject)
3785 {
3786 SCHECK_PARTIAL();
3787 RRETURN(MATCH_NOMATCH);
3788 }
3789 switch(*eptr++)
3790 {
3791 default: break;
3792 case 0x09: /* HT */
3793 case 0x20: /* SPACE */
3794 case 0xa0: /* NBSP */
3795 RRETURN(MATCH_NOMATCH);
3796 }
3797 }
3798 break;
3799
3800 case OP_HSPACE:
3801 for (i = 1; i <= min; i++)
3802 {
3803 if (eptr >= md->end_subject)
3804 {
3805 SCHECK_PARTIAL();
3806 RRETURN(MATCH_NOMATCH);
3807 }
3808 switch(*eptr++)
3809 {
3810 default: RRETURN(MATCH_NOMATCH);
3811 case 0x09: /* HT */
3812 case 0x20: /* SPACE */
3813 case 0xa0: /* NBSP */
3814 break;
3815 }
3816 }
3817 break;
3818
3819 case OP_NOT_VSPACE:
3820 for (i = 1; i <= min; i++)
3821 {
3822 if (eptr >= md->end_subject)
3823 {
3824 SCHECK_PARTIAL();
3825 RRETURN(MATCH_NOMATCH);
3826 }
3827 switch(*eptr++)
3828 {
3829 default: break;
3830 case 0x0a: /* LF */
3831 case 0x0b: /* VT */
3832 case 0x0c: /* FF */
3833 case 0x0d: /* CR */
3834 case 0x85: /* NEL */
3835 RRETURN(MATCH_NOMATCH);
3836 }
3837 }
3838 break;
3839
3840 case OP_VSPACE:
3841 for (i = 1; i <= min; i++)
3842 {
3843 if (eptr >= md->end_subject)
3844 {
3845 SCHECK_PARTIAL();
3846 RRETURN(MATCH_NOMATCH);
3847 }
3848 switch(*eptr++)
3849 {
3850 default: RRETURN(MATCH_NOMATCH);
3851 case 0x0a: /* LF */
3852 case 0x0b: /* VT */
3853 case 0x0c: /* FF */
3854 case 0x0d: /* CR */
3855 case 0x85: /* NEL */
3856 break;
3857 }
3858 }
3859 break;
3860
3861 case OP_NOT_DIGIT:
3862 for (i = 1; i <= min; i++)
3863 {
3864 if (eptr >= md->end_subject)
3865 {
3866 SCHECK_PARTIAL();
3867 RRETURN(MATCH_NOMATCH);
3868 }
3869 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3870 }
3871 break;
3872
3873 case OP_DIGIT:
3874 for (i = 1; i <= min; i++)
3875 {
3876 if (eptr >= md->end_subject)
3877 {
3878 SCHECK_PARTIAL();
3879 RRETURN(MATCH_NOMATCH);
3880 }
3881 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3882 }
3883 break;
3884
3885 case OP_NOT_WHITESPACE:
3886 for (i = 1; i <= min; i++)
3887 {
3888 if (eptr >= md->end_subject)
3889 {
3890 SCHECK_PARTIAL();
3891 RRETURN(MATCH_NOMATCH);
3892 }
3893 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3894 }
3895 break;
3896
3897 case OP_WHITESPACE:
3898 for (i = 1; i <= min; i++)
3899 {
3900 if (eptr >= md->end_subject)
3901 {
3902 SCHECK_PARTIAL();
3903 RRETURN(MATCH_NOMATCH);
3904 }
3905 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3906 }
3907 break;
3908
3909 case OP_NOT_WORDCHAR:
3910 for (i = 1; i <= min; i++)
3911 {
3912 if (eptr >= md->end_subject)
3913 {
3914 SCHECK_PARTIAL();
3915 RRETURN(MATCH_NOMATCH);
3916 }
3917 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3918 RRETURN(MATCH_NOMATCH);
3919 }
3920 break;
3921
3922 case OP_WORDCHAR:
3923 for (i = 1; i <= min; i++)
3924 {
3925 if (eptr >= md->end_subject)
3926 {
3927 SCHECK_PARTIAL();
3928 RRETURN(MATCH_NOMATCH);
3929 }
3930 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3931 RRETURN(MATCH_NOMATCH);
3932 }
3933 break;
3934
3935 default:
3936 RRETURN(PCRE_ERROR_INTERNAL);
3937 }
3938 }
3939
3940 /* If min = max, continue at the same level without recursing */
3941
3942 if (min == max) continue;
3943
3944 /* If minimizing, we have to test the rest of the pattern before each
3945 subsequent match. Again, separate the UTF-8 case for speed, and also
3946 separate the UCP cases. */
3947
3948 if (minimize)
3949 {
3950 #ifdef SUPPORT_UCP
3951 if (prop_type >= 0)
3952 {
3953 switch(prop_type)
3954 {
3955 case PT_ANY:
3956 for (fi = min;; fi++)
3957 {
3958 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3959 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3960 if (fi >= max) RRETURN(MATCH_NOMATCH);
3961 if (eptr >= md->end_subject)
3962 {
3963 SCHECK_PARTIAL();
3964 RRETURN(MATCH_NOMATCH);
3965 }
3966 GETCHARINC(c, eptr);
3967 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3968 }
3969 /* Control never gets here */
3970
3971 case PT_LAMP:
3972 for (fi = min;; fi++)
3973 {
3974 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3975 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3976 if (fi >= max) RRETURN(MATCH_NOMATCH);
3977 if (eptr >= md->end_subject)
3978 {
3979 SCHECK_PARTIAL();
3980 RRETURN(MATCH_NOMATCH);
3981 }
3982 GETCHARINC(c, eptr);
3983 prop_chartype = UCD_CHARTYPE(c);
3984 if ((prop_chartype == ucp_Lu ||
3985 prop_chartype == ucp_Ll ||
3986 prop_chartype == ucp_Lt) == prop_fail_result)
3987 RRETURN(MATCH_NOMATCH);
3988 }
3989 /* Control never gets here */
3990
3991 case PT_GC:
3992 for (fi = min;; fi++)
3993 {
3994 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3995 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3996 if (fi >= max) RRETURN(MATCH_NOMATCH);
3997 if (eptr >= md->end_subject)
3998 {
3999 SCHECK_PARTIAL();
4000 RRETURN(MATCH_NOMATCH);
4001 }
4002 GETCHARINC(c, eptr);
4003 prop_category = UCD_CATEGORY(c);
4004 if ((prop_category == prop_value) == prop_fail_result)
4005 RRETURN(MATCH_NOMATCH);
4006 }
4007 /* Control never gets here */
4008
4009 case PT_PC:
4010 for (fi = min;; fi++)
4011 {
4012 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4013 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4014 if (fi >= max) RRETURN(MATCH_NOMATCH);
4015 if (eptr >= md->end_subject)
4016 {
4017 SCHECK_PARTIAL();
4018 RRETURN(MATCH_NOMATCH);
4019 }
4020 GETCHARINC(c, eptr);
4021 prop_chartype = UCD_CHARTYPE(c);
4022 if ((prop_chartype == prop_value) == prop_fail_result)
4023 RRETURN(MATCH_NOMATCH);
4024 }
4025 /* Control never gets here */
4026
4027 case PT_SC:
4028 for (fi = min;; fi++)
4029 {
4030 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4031 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4032 if (fi >= max) RRETURN(MATCH_NOMATCH);
4033 if (eptr >= md->end_subject)
4034 {
4035 SCHECK_PARTIAL();
4036 RRETURN(MATCH_NOMATCH);
4037 }
4038 GETCHARINC(c, eptr);
4039 prop_script = UCD_SCRIPT(c);
4040 if ((prop_script == prop_value) == prop_fail_result)
4041 RRETURN(MATCH_NOMATCH);
4042 }
4043 /* Control never gets here */
4044
4045 default:
4046 RRETURN(PCRE_ERROR_INTERNAL);
4047 }
4048 }
4049
4050 /* Match extended Unicode sequences. We will get here only if the
4051 support is in the binary; otherwise a compile-time error occurs. */
4052
4053 else if (ctype == OP_EXTUNI)
4054 {
4055 for (fi = min;; fi++)
4056 {
4057 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4059 if (fi >= max) RRETURN(MATCH_NOMATCH);
4060 if (eptr >= md->end_subject)
4061 {
4062 SCHECK_PARTIAL();
4063 RRETURN(MATCH_NOMATCH);
4064 }
4065 GETCHARINCTEST(c, eptr);
4066 prop_category = UCD_CATEGORY(c);
4067 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
4068 while (eptr < md->end_subject)
4069 {
4070 int len = 1;
4071 if (!utf8) c = *eptr;
4072 else { GETCHARLEN(c, eptr, len); }
4073 prop_category = UCD_CATEGORY(c);
4074 if (prop_category != ucp_M) break;
4075 eptr += len;
4076 }
4077 }
4078 }
4079
4080 else
4081 #endif /* SUPPORT_UCP */
4082
4083 #ifdef SUPPORT_UTF8
4084 /* UTF-8 mode */
4085 if (utf8)
4086 {
4087 for (fi = min;; fi++)
4088 {
4089 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4090 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4091 if (fi >= max) RRETURN(MATCH_NOMATCH);
4092 if (eptr >= md->end_subject)
4093 {
4094 SCHECK_PARTIAL();
4095 RRETURN(MATCH_NOMATCH);
4096 }
4097 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4098 RRETURN(MATCH_NOMATCH);
4099 GETCHARINC(c, eptr);
4100 switch(ctype)
4101 {
4102 case OP_ANY: /* This is the non-NL case */
4103 case OP_ALLANY:
4104 case OP_ANYBYTE:
4105 break;
4106
4107 case OP_ANYNL:
4108 switch(c)
4109 {
4110 default: RRETURN(MATCH_NOMATCH);
4111 case 0x000d:
4112 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4113 break;
4114 case 0x000a:
4115 break;
4116
4117 case 0x000b:
4118 case 0x000c:
4119 case 0x0085:
4120 case 0x2028:
4121 case 0x2029:
4122 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4123 break;
4124 }
4125 break;
4126
4127 case OP_NOT_HSPACE:
4128 switch(c)
4129 {
4130 default: break;
4131 case 0x09: /* HT */
4132 case 0x20: /* SPACE */
4133 case 0xa0: /* NBSP */
4134 case 0x1680: /* OGHAM SPACE MARK */
4135 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4136 case 0x2000: /* EN QUAD */
4137 case 0x2001: /* EM QUAD */
4138 case 0x2002: /* EN SPACE */
4139 case 0x2003: /* EM SPACE */
4140 case 0x2004: /* THREE-PER-EM SPACE */
4141 case 0x2005: /* FOUR-PER-EM SPACE */
4142 case 0x2006: /* SIX-PER-EM SPACE */
4143 case 0x2007: /* FIGURE SPACE */
4144 case 0x2008: /* PUNCTUATION SPACE */
4145 case 0x2009: /* THIN SPACE */
4146 case 0x200A: /* HAIR SPACE */
4147 case 0x202f: /* NARROW NO-BREAK SPACE */
4148 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4149 case 0x3000: /* IDEOGRAPHIC SPACE */
4150 RRETURN(MATCH_NOMATCH);
4151 }
4152 break;
4153
4154 case OP_HSPACE:
4155 switch(c)
4156 {
4157 default: RRETURN(MATCH_NOMATCH);
4158 case 0x09: /* HT */
4159 case 0x20: /* SPACE */
4160 case 0xa0: /* NBSP */
4161 case 0x1680: /* OGHAM SPACE MARK */
4162 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4163 case 0x2000: /* EN QUAD */
4164 case 0x2001: /* EM QUAD */
4165 case 0x2002: /* EN SPACE */
4166 case 0x2003: /* EM SPACE */
4167 case 0x2004: /* THREE-PER-EM SPACE */
4168 case 0x2005: /* FOUR-PER-EM SPACE */
4169 case 0x2006: /* SIX-PER-EM SPACE */
4170 case 0x2007: /* FIGURE SPACE */
4171 case 0x2008: /* PUNCTUATION SPACE */
4172 case 0x2009: /* THIN SPACE */
4173 case 0x200A: /* HAIR SPACE */
4174 case 0x202f: /* NARROW NO-BREAK SPACE */
4175 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4176 case 0x3000: /* IDEOGRAPHIC SPACE */
4177 break;
4178 }
4179 break;
4180
4181 case OP_NOT_VSPACE:
4182 switch(c)
4183 {
4184 default: break;
4185 case 0x0a: /* LF */
4186 case 0x0b: /* VT */
4187 case 0x0c: /* FF */
4188 case 0x0d: /* CR */
4189 case 0x85: /* NEL */
4190 case 0x2028: /* LINE SEPARATOR */
4191 case 0x2029: /* PARAGRAPH SEPARATOR */
4192 RRETURN(MATCH_NOMATCH);
4193 }
4194 break;
4195
4196 case OP_VSPACE:
4197 switch(c)
4198 {
4199 default: RRETURN(MATCH_NOMATCH);
4200 case 0x0a: /* LF */
4201 case 0x0b: /* VT */
4202 case 0x0c: /* FF */
4203 case 0x0d: /* CR */
4204 case 0x85: /* NEL */
4205 case 0x2028: /* LINE SEPARATOR */
4206 case 0x2029: /* PARAGRAPH SEPARATOR */
4207 break;
4208 }
4209 break;
4210
4211 case OP_NOT_DIGIT:
4212 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
4213 RRETURN(MATCH_NOMATCH);
4214 break;
4215
4216 case OP_DIGIT:
4217 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
4218 RRETURN(MATCH_NOMATCH);
4219 break;
4220
4221 case OP_NOT_WHITESPACE:
4222 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
4223 RRETURN(MATCH_NOMATCH);
4224 break;
4225
4226 case OP_WHITESPACE:
4227 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
4228 RRETURN(MATCH_NOMATCH);
4229 break;
4230
4231 case OP_NOT_WORDCHAR:
4232 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4233 RRETURN(MATCH_NOMATCH);
4234 break;
4235
4236 case OP_WORDCHAR:
4237 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4238 RRETURN(MATCH_NOMATCH);
4239 break;
4240
4241 default:
4242 RRETURN(PCRE_ERROR_INTERNAL);
4243 }
4244 }
4245 }
4246 else
4247 #endif
4248 /* Not UTF-8 mode */
4249 {
4250 for (fi = min;; fi++)
4251 {
4252 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4254 if (fi >= max) RRETURN(MATCH_NOMATCH);
4255 if (eptr >= md->end_subject)
4256 {
4257 SCHECK_PARTIAL();
4258 RRETURN(MATCH_NOMATCH);
4259 }
4260 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4261 RRETURN(MATCH_NOMATCH);
4262 c = *eptr++;
4263 switch(ctype)
4264 {
4265 case OP_ANY: /* This is the non-NL case */
4266 case OP_ALLANY:
4267 case OP_ANYBYTE:
4268 break;
4269
4270 case OP_ANYNL:
4271 switch(c)
4272 {
4273 default: RRETURN(MATCH_NOMATCH);
4274 case 0x000d:
4275 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4276 break;
4277
4278 case 0x000a:
4279 break;
4280
4281 case 0x000b:
4282 case 0x000c:
4283 case 0x0085:
4284 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4285 break;
4286 }
4287 break;
4288
4289 case OP_NOT_HSPACE:
4290 switch(c)
4291 {
4292 default: break;
4293 case 0x09: /* HT */
4294 case 0x20: /* SPACE */
4295 case 0xa0: /* NBSP */
4296 RRETURN(MATCH_NOMATCH);
4297 }
4298 break;
4299
4300 case OP_HSPACE:
4301 switch(c)
4302 {
4303 default: RRETURN(MATCH_NOMATCH);
4304 case 0x09: /* HT */
4305 case 0x20: /* SPACE */
4306 case 0xa0: /* NBSP */
4307 break;
4308 }
4309 break;
4310
4311 case OP_NOT_VSPACE:
4312 switch(c)
4313 {
4314 default: break;
4315 case 0x0a: /* LF */
4316 case 0x0b: /* VT */
4317 case 0x0c: /* FF */
4318 case 0x0d: /* CR */
4319 case 0x85: /* NEL */
4320 RRETURN(MATCH_NOMATCH);
4321 }
4322 break;
4323
4324 case OP_VSPACE:
4325 switch(c)
4326 {
4327 default: RRETURN(MATCH_NOMATCH);
4328 case 0x0a: /* LF */
4329 case 0x0b: /* VT */
4330 case 0x0c: /* FF */
4331 case 0x0d: /* CR */
4332 case 0x85: /* NEL */
4333 break;
4334 }
4335 break;
4336
4337 case OP_NOT_DIGIT:
4338 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4339 break;
4340
4341 case OP_DIGIT:
4342 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4343 break;
4344
4345 case OP_NOT_WHITESPACE:
4346 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4347 break;
4348
4349 case OP_WHITESPACE:
4350 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4351 break;
4352
4353 case OP_NOT_WORDCHAR:
4354 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4355 break;
4356
4357 case OP_WORDCHAR:
4358 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4359 break;
4360
4361 default:
4362 RRETURN(PCRE_ERROR_INTERNAL);
4363 }
4364 }
4365 }
4366 /* Control never gets here */
4367 }
4368
4369 /* If maximizing, it is worth using inline code for speed, doing the type
4370 test once at the start (i.e. keep it out of the loop). Again, keep the
4371 UTF-8 and UCP stuff separate. */
4372
4373 else
4374 {
4375 pp = eptr; /* Remember where we started */
4376
4377 #ifdef SUPPORT_UCP
4378 if (prop_type >= 0)
4379 {
4380 switch(prop_type)
4381 {
4382 case PT_ANY:
4383 for (i = min; i < max; i++)
4384 {
4385 int len = 1;
4386 if (eptr >= md->end_subject)
4387 {
4388 SCHECK_PARTIAL();
4389 break;
4390 }
4391 GETCHARLEN(c, eptr, len);
4392 if (prop_fail_result) break;
4393 eptr+= len;
4394 }
4395 break;
4396
4397 case PT_LAMP:
4398 for (i = min; i < max; i++)
4399 {
4400 int len = 1;
4401 if (eptr >= md->end_subject)
4402 {
4403 SCHECK_PARTIAL();
4404 break;
4405 }
4406 GETCHARLEN(c, eptr, len);
4407 prop_chartype = UCD_CHARTYPE(c);
4408 if ((prop_chartype == ucp_Lu ||
4409 prop_chartype == ucp_Ll ||
4410 prop_chartype == ucp_Lt) == prop_fail_result)
4411 break;
4412 eptr+= len;
4413 }
4414 break;
4415
4416 case PT_GC:
4417 for (i = min; i < max; i++)
4418 {
4419 int len = 1;
4420 if (eptr >= md->end_subject)
4421 {
4422 SCHECK_PARTIAL();
4423 break;
4424 }
4425 GETCHARLEN(c, eptr, len);
4426 prop_category = UCD_CATEGORY(c);
4427 if ((prop_category == prop_value) == prop_fail_result)
4428 break;
4429 eptr+= len;
4430 }
4431 break;
4432
4433 case PT_PC:
4434 for (i = min; i < max; i++)
4435 {
4436 int len = 1;
4437 if (eptr >= md->end_subject)
4438 {
4439 SCHECK_PARTIAL();
4440 break;
4441 }
4442 GETCHARLEN(c, eptr, len);
4443 prop_chartype = UCD_CHARTYPE(c);
4444 if ((prop_chartype == prop_value) == prop_fail_result)
4445 break;
4446 eptr+= len;
4447 }
4448 break;
4449
4450 case PT_SC:
4451 for (i = min; i < max; i++)
4452 {
4453 int len = 1;
4454 if (eptr >= md->end_subject)
4455 {
4456 SCHECK_PARTIAL();
4457 break;
4458 }
4459 GETCHARLEN(c, eptr, len);
4460 prop_script = UCD_SCRIPT(c);
4461 if ((prop_script == prop_value) == prop_fail_result)
4462 break;
4463 eptr+= len;
4464 }
4465 break;
4466 }
4467
4468 /* eptr is now past the end of the maximum run */
4469
4470 if (possessive) continue;
4471 for(;;)
4472 {
4473 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4474 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4475 if (eptr-- == pp) break; /* Stop if tried at original pos */
4476 if (utf8) BACKCHAR(eptr);
4477 }
4478 }
4479
4480 /* Match extended Unicode sequences. We will get here only if the
4481 support is in the binary; otherwise a compile-time error occurs. */
4482
4483 else if (ctype == OP_EXTUNI)
4484 {
4485 for (i = min; i < max; i++)
4486 {
4487 if (eptr >= md->end_subject)
4488 {
4489 SCHECK_PARTIAL();
4490 break;
4491 }
4492 GETCHARINCTEST(c, eptr);
4493 prop_category = UCD_CATEGORY(c);
4494 if (prop_category == ucp_M) break;
4495 while (eptr < md->end_subject)
4496 {
4497 int len = 1;
4498 if (!utf8) c = *eptr; else
4499 {
4500 GETCHARLEN(c, eptr, len);
4501 }
4502 prop_category = UCD_CATEGORY(c);
4503 if (prop_category != ucp_M) break;
4504 eptr += len;
4505 }
4506 }
4507
4508 /* eptr is now past the end of the maximum run */
4509
4510 if (possessive) continue;
4511
4512 for(;;)
4513 {
4514 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4515 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4516 if (eptr-- == pp) break; /* Stop if tried at original pos */
4517 for (;;) /* Move back over one extended */
4518 {
4519 int len = 1;
4520 if (!utf8) c = *eptr; else
4521 {
4522 BACKCHAR(eptr);
4523 GETCHARLEN(c, eptr, len);
4524 }
4525 prop_category = UCD_CATEGORY(c);
4526 if (prop_category != ucp_M) break;
4527 eptr--;
4528 }
4529 }
4530 }
4531
4532 else
4533 #endif /* SUPPORT_UCP */
4534
4535 #ifdef SUPPORT_UTF8
4536 /* UTF-8 mode */
4537
4538 if (utf8)
4539 {
4540 switch(ctype)
4541 {
4542 case OP_ANY:
4543 if (max < INT_MAX)
4544 {
4545 for (i = min; i < max; i++)
4546 {
4547 if (eptr >= md->end_subject)
4548 {
4549 SCHECK_PARTIAL();
4550 break;
4551 }
4552 if (IS_NEWLINE(eptr)) break;
4553 eptr++;
4554 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4555 }
4556 }
4557
4558 /* Handle unlimited UTF-8 repeat */
4559
4560 else
4561 {
4562 for (i = min; i < max; i++)
4563 {
4564 if (eptr >= md->end_subject)
4565 {
4566 SCHECK_PARTIAL();
4567 break;
4568 }
4569 if (IS_NEWLINE(eptr)) break;
4570 eptr++;
4571 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4572 }
4573 }
4574 break;
4575
4576 case OP_ALLANY:
4577 if (max < INT_MAX)
4578 {
4579 for (i = min; i < max; i++)
4580 {
4581 if (eptr >= md->end_subject)
4582 {
4583 SCHECK_PARTIAL();
4584 break;
4585 }
4586 eptr++;
4587 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4588 }
4589 }
4590 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4591 break;
4592
4593 /* The byte case is the same as non-UTF8 */
4594
4595 case OP_ANYBYTE:
4596 c = max - min;
4597 if (c > (unsigned int)(md->end_subject - eptr))
4598 {
4599 eptr = md->end_subject;
4600 SCHECK_PARTIAL();
4601 }
4602 else eptr += c;
4603 break;
4604
4605 case OP_ANYNL:
4606 for (i = min; i < max; i++)
4607 {
4608 int len = 1;
4609 if (eptr >= md->end_subject)
4610 {
4611 SCHECK_PARTIAL();
4612 break;
4613 }
4614 GETCHARLEN(c, eptr, len);
4615 if (c == 0x000d)
4616 {
4617 if (++eptr >= md->end_subject) break;
4618 if (*eptr == 0x000a) eptr++;
4619 }
4620 else
4621 {
4622 if (c != 0x000a &&
4623 (md->bsr_anycrlf ||
4624 (c != 0x000b && c != 0x000c &&
4625 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4626 break;
4627 eptr += len;
4628 }
4629 }
4630 break;
4631
4632 case OP_NOT_HSPACE:
4633 case OP_HSPACE:
4634 for (i = min; i < max; i++)
4635 {
4636 BOOL gotspace;
4637 int len = 1;
4638 if (eptr >= md->end_subject)
4639 {
4640 SCHECK_PARTIAL();
4641 break;
4642 }
4643 GETCHARLEN(c, eptr, len);
4644 switch(c)
4645 {
4646 default: gotspace = FALSE; break;
4647 case 0x09: /* HT */
4648 case 0x20: /* SPACE */
4649 case 0xa0: /* NBSP */
4650 case 0x1680: /* OGHAM SPACE MARK */
4651 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4652 case 0x2000: /* EN QUAD */
4653 case 0x2001: /* EM QUAD */
4654 case 0x2002: /* EN SPACE */
4655 case 0x2003: /* EM SPACE */
4656 case 0x2004: /* THREE-PER-EM SPACE */
4657 case 0x2005: /* FOUR-PER-EM SPACE */
4658 case 0x2006: /* SIX-PER-EM SPACE */
4659 case 0x2007: /* FIGURE SPACE */
4660 case 0x2008: /* PUNCTUATION SPACE */
4661 case 0x2009: /* THIN SPACE */
4662 case 0x200A: /* HAIR SPACE */
4663 case 0x202f: /* NARROW NO-BREAK SPACE */
4664 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4665 case 0x3000: /* IDEOGRAPHIC SPACE */
4666 gotspace = TRUE;
4667 break;
4668 }
4669 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4670 eptr += len;
4671 }
4672 break;
4673
4674 case OP_NOT_VSPACE:
4675 case OP_VSPACE:
4676 for (i = min; i < max; i++)
4677 {
4678 BOOL gotspace;
4679 int len = 1;
4680 if (eptr >= md->end_subject)
4681 {
4682 SCHECK_PARTIAL();
4683 break;
4684 }
4685 GETCHARLEN(c, eptr, len);
4686 switch(c)
4687 {
4688 default: gotspace = FALSE; break;
4689 case 0x0a: /* LF */
4690 case 0x0b: /* VT */
4691 case 0x0c: /* FF */
4692 case 0x0d: /* CR */
4693 case 0x85: /* NEL */
4694 case 0x2028: /* LINE SEPARATOR */
4695 case 0x2029: /* PARAGRAPH SEPARATOR */
4696 gotspace = TRUE;
4697 break;
4698 }
4699 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4700 eptr += len;
4701 }
4702 break;
4703
4704 case OP_NOT_DIGIT:
4705 for (i = min; i < max; i++)
4706 {
4707 int len = 1;
4708 if (eptr >= md->end_subject)
4709 {
4710 SCHECK_PARTIAL();
4711 break;
4712 }
4713 GETCHARLEN(c, eptr, len);
4714 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4715 eptr+= len;
4716 }
4717 break;
4718
4719 case OP_DIGIT:
4720 for (i = min; i < max; i++)
4721 {
4722 int len = 1;
4723 if (eptr >= md->end_subject)
4724 {
4725 SCHECK_PARTIAL();
4726 break;
4727 }
4728 GETCHARLEN(c, eptr, len);
4729 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4730 eptr+= len;
4731 }
4732 break;
4733
4734 case OP_NOT_WHITESPACE:
4735 for (i = min; i < max; i++)
4736 {
4737 int len = 1;
4738 if (eptr >= md->end_subject)
4739 {
4740 SCHECK_PARTIAL();
4741 break;
4742 }
4743 GETCHARLEN(c, eptr, len);
4744 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4745 eptr+= len;
4746 }
4747 break;
4748
4749 case OP_WHITESPACE:
4750 for (i = min; i < max; i++)
4751 {
4752 int len = 1;
4753 if (eptr >= md->end_subject)
4754 {
4755 SCHECK_PARTIAL();
4756 break;
4757 }
4758 GETCHARLEN(c, eptr, len);
4759 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4760 eptr+= len;
4761 }
4762 break;
4763
4764 case OP_NOT_WORDCHAR:
4765 for (i = min; i < max; i++)
4766 {
4767 int len = 1;
4768 if (eptr >= md->end_subject)
4769 {
4770 SCHECK_PARTIAL();
4771 break;
4772 }
4773 GETCHARLEN(c, eptr, len);
4774 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4775 eptr+= len;
4776 }
4777 break;
4778
4779 case OP_WORDCHAR:
4780 for (i = min; i < max; i++)
4781 {
4782 int len = 1;
4783 if (eptr >= md->end_subject)
4784 {
4785 SCHECK_PARTIAL();
4786 break;
4787 }
4788 GETCHARLEN(c, eptr, len);
4789 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4790 eptr+= len;
4791 }
4792 break;
4793
4794 default:
4795 RRETURN(PCRE_ERROR_INTERNAL);
4796 }
4797
4798 /* eptr is now past the end of the maximum run */
4799
4800 if (possessive) continue;
4801 for(;;)
4802 {
4803 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4805 if (eptr-- == pp) break; /* Stop if tried at original pos */
4806 BACKCHAR(eptr);
4807 }
4808 }
4809 else
4810 #endif /* SUPPORT_UTF8 */
4811
4812 /* Not UTF-8 mode */
4813 {
4814 switch(ctype)
4815 {
4816 case OP_ANY:
4817 for (i = min; i < max; i++)
4818 {
4819 if (eptr >= md->end_subject)
4820 {
4821 SCHECK_PARTIAL();
4822 break;
4823 }
4824 if (IS_NEWLINE(eptr)) break;
4825 eptr++;
4826 }
4827 break;
4828
4829 case OP_ALLANY:
4830 case OP_ANYBYTE:
4831 c = max - min;
4832 if (c > (unsigned int)(md->end_subject - eptr))
4833 {
4834 eptr = md->end_subject;
4835 SCHECK_PARTIAL();
4836 }
4837 else eptr += c;
4838 break;
4839
4840 case OP_ANYNL:
4841 for (i = min; i < max; i++)
4842 {
4843 if (eptr >= md->end_subject)
4844 {
4845 SCHECK_PARTIAL();
4846 break;
4847 }
4848 c = *eptr;
4849 if (c == 0x000d)
4850 {
4851 if (++eptr >= md->end_subject) break;
4852 if (*eptr == 0x000a) eptr++;
4853 }
4854 else
4855 {
4856 if (c != 0x000a &&
4857 (md->bsr_anycrlf ||
4858 (c != 0x000b && c != 0x000c && c != 0x0085)))
4859 break;
4860 eptr++;
4861 }
4862 }
4863 break;
4864
4865 case OP_NOT_HSPACE:
4866 for (i = min; i < max; i++)
4867 {
4868 if (eptr >= md->end_subject)
4869 {
4870 SCHECK_PARTIAL();
4871 break;
4872 }
4873 c = *eptr;
4874 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4875 eptr++;
4876 }
4877 break;
4878
4879 case OP_HSPACE:
4880 for (i = min; i < max; i++)
4881 {
4882 if (eptr >= md->end_subject)
4883 {
4884 SCHECK_PARTIAL();
4885 break;
4886 }
4887 c = *eptr;
4888 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4889 eptr++;
4890 }
4891 break;
4892
4893 case OP_NOT_VSPACE:
4894 for (i = min; i < max; i++)
4895 {
4896 if (eptr >= md->end_subject)
4897 {
4898 SCHECK_PARTIAL();
4899 break;
4900 }
4901 c = *eptr;
4902 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4903 break;
4904 eptr++;
4905 }
4906 break;
4907
4908 case OP_VSPACE:
4909 for (i = min; i < max; i++)
4910 {
4911 if (eptr >= md->end_subject)
4912 {
4913 SCHECK_PARTIAL();
4914 break;
4915 }
4916 c = *eptr;
4917 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4918 break;
4919 eptr++;
4920 }
4921 break;
4922
4923 case OP_NOT_DIGIT:
4924 for (i = min; i < max; i++)
4925 {
4926 if (eptr >= md->end_subject)
4927 {
4928 SCHECK_PARTIAL();
4929 break;
4930 }
4931 if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
4932 eptr++;
4933 }
4934 break;
4935
4936 case OP_DIGIT:
4937 for (i = min; i < max; i++)
4938 {
4939 if (eptr >= md->end_subject)
4940 {
4941 SCHECK_PARTIAL();
4942 break;
4943 }
4944 if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
4945 eptr++;
4946 }
4947 break;
4948
4949 case OP_NOT_WHITESPACE:
4950 for (i = min; i < max; i++)
4951 {
4952 if (eptr >= md->end_subject)
4953 {
4954 SCHECK_PARTIAL();
4955 break;
4956 }
4957 if ((md->ctypes[*eptr] & ctype_space) != 0) break;
4958 eptr++;
4959 }
4960 break;
4961
4962 case OP_WHITESPACE:
4963 for (i = min; i < max; i++)
4964 {
4965 if (eptr >= md->end_subject)
4966 {
4967 SCHECK_PARTIAL();
4968 break;
4969 }
4970 if ((md->ctypes[*eptr] & ctype_space) == 0) break;
4971 eptr++;
4972 }
4973 break;
4974
4975 case OP_NOT_WORDCHAR:
4976 for (i = min; i < max; i++)
4977 {
4978 if (eptr >= md->end_subject)
4979 {
4980 SCHECK_PARTIAL();
4981 break;
4982 }
4983 if ((md->ctypes[*eptr] & ctype_word) != 0) break;
4984 eptr++;
4985 }
4986 break;
4987
4988 case OP_WORDCHAR:
4989 for (i = min; i < max; i++)
4990 {
4991 if (eptr >= md->end_subject)
4992 {
4993 SCHECK_PARTIAL();
4994 break;
4995 }
4996 if ((md->ctypes[*eptr] & ctype_word) == 0) break;
4997 eptr++;
4998 }
4999 break;
5000
5001 default:
5002 RRETURN(PCRE_ERROR_INTERNAL);
5003 }
5004
5005 /* eptr is now past the end of the maximum run */
5006
5007 if (possessive) continue;
5008 while (eptr >= pp)
5009 {
5010 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5011 eptr--;
5012 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5013 }
5014 }
5015
5016 /* Get here if we can't make it match with any permitted repetitions */
5017
5018 RRETURN(MATCH_NOMATCH);
5019 }
5020 /* Control never gets here */
5021
5022 /* There's been some horrible disaster. Arrival here can only mean there is
5023 something seriously wrong in the code above or the OP_xxx definitions. */
5024
5025 default:
5026 DPRINTF(("Unknown opcode %d\n", *ecode));
5027 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5028 }
5029
5030 /* Do not stick any code in here without much thought; it is assumed
5031 that "continue" in the code above comes out to here to repeat the main
5032 loop. */
5033
5034 } /* End of main loop */
5035 /* Control never reaches here */
5036
5037
5038 /* When compiling to use the heap rather than the stack for recursive calls to
5039 match(), the RRETURN() macro jumps here. The number that is saved in
5040 frame->Xwhere indicates which label we actually want to return to. */
5041
5042 #ifdef NO_RECURSE
5043 #define LBL(val) case val: goto L_RM##val;
5044 HEAP_RETURN:
5045 switch (frame->Xwhere)
5046 {
5047 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5048 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5049 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5050 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
5051 LBL(53) LBL(54)
5052 #ifdef SUPPORT_UTF8
5053 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5054 LBL(32) LBL(34) LBL(42) LBL(46)
5055 #ifdef SUPPORT_UCP
5056 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
5057 #endif /* SUPPORT_UCP */
5058 #endif /* SUPPORT_UTF8 */
5059 default:
5060 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5061 return PCRE_ERROR_INTERNAL;
5062 }
5063 #undef LBL
5064 #endif /* NO_RECURSE */
5065 }
5066
5067
5068 /***************************************************************************
5069 ****************************************************************************
5070 RECURSION IN THE match() FUNCTION
5071
5072 Undefine all the macros that were defined above to handle this. */
5073
/* Everything between this #ifdef and its #endif is removed only in the
NO_RECURSE build, i.e. when match() uses the heap rather than the stack for its
recursive calls. NOTE(review): the blank-line-separated groups presumably
mirror the grouping of the corresponding #define block earlier in the file
(match() arguments first, then its local variables) - confirm against that
block when adding or removing a name, so the two lists stay in step. */
5074 #ifdef NO_RECURSE
5075 #undef eptr
5076 #undef ecode
5077 #undef mstart
5078 #undef offset_top
5079 #undef ims
5080 #undef eptrb
5081 #undef flags
5082
5083 #undef callpat
5084 #undef charptr
5085 #undef data
5086 #undef next
5087 #undef pp
5088 #undef prev
5089 #undef saved_eptr
5090
5091 #undef new_recursive
5092
5093 #undef cur_is_word
5094 #undef condition
5095 #undef prev_is_word
5096
5097 #undef original_ims
5098
5099 #undef ctype
5100 #undef length
5101 #undef max
5102 #undef min
5103 #undef number
5104 #undef offset
5105 #undef op
5106 #undef save_capture_last
5107 #undef save_offset1
5108 #undef save_offset2
5109 #undef save_offset3
5110 #undef stacksave
5111
5112 #undef newptrb
5113
5114 #endif
5115
5116 /* These two are defined as macros in both cases */
/* Because fc and fi are macros in both builds, they are removed
unconditionally, outside the NO_RECURSE guard above. */
5117
5118 #undef fc
5119 #undef fi
5120
5121 /***************************************************************************
5122 ***************************************************************************/
5123
5124
5125
5126 /*************************************************
5127 * Execute a Regular Expression *
5128 *************************************************/
5129
5130 /* This function applies a compiled re to a subject string and picks out
5131 portions of the string if it matches. Two elements in the vector are set for
5132 each substring: the offsets to the start and end of the substring.
5133
5134 Arguments:
5135 argument_re points to the compiled expression
5136 extra_data points to extra data or is NULL
5137 subject points to the subject string
5138 length length of subject string (may contain binary zeros)
5139 start_offset where to start in the subject string
5140 options option bits
5141 offsets points to a vector of ints to be filled in with offsets
5142 offsetcount the number of elements in the vector
5143
5144 Returns: > 0 => success; value is the number of elements filled in
5145 = 0 => success, but offsets is not big enough
5146 -1 => failed to match
5147 < -1 => some kind of unexpected problem
5148 */
5149
5150 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
5151 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5152 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5153 int offsetcount)
5154 {
5155 int rc, resetcount, ocount;
5156 int first_byte = -1;
5157 int req_byte = -1;
5158 int req_byte2 = -1;
5159 int newline;
5160 unsigned long int ims;
5161 BOOL using_temporary_offsets = FALSE;
5162 BOOL anchored;
5163 BOOL startline;
5164 BOOL firstline;
5165 BOOL first_byte_caseless = FALSE;
5166 BOOL req_byte_caseless = FALSE;
5167 BOOL utf8;
5168 match_data match_block;
5169 match_data *md = &match_block;
5170 const uschar *tables;
5171 const uschar *start_bits = NULL;
5172 USPTR start_match = (USPTR)subject + start_offset;
5173 USPTR end_subject;
5174 USPTR start_partial = NULL;
5175 USPTR req_byte_ptr = start_match - 1;
5176
5177 pcre_study_data internal_study;
5178 const pcre_study_data *study;
5179
5180 real_pcre internal_re;
5181 const real_pcre *external_re = (const real_pcre *)argument_re;
5182 const real_pcre *re = external_re;
5183
5184 /* Plausibility checks */
5185
5186 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5187 if (re == NULL || subject == NULL ||
5188 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5189 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5190
5191 /* This information is for finding all the numbers associated with a given
5192 name, for condition testing. */
5193
5194 md->name_table = (uschar *)re + re->name_table_offset;
5195 md->name_count = re->name_count;
5196 md->name_entry_size = re->name_entry_size;
5197
5198 /* Fish out the optional data from the extra_data structure, first setting
5199 the default values. */
5200
5201 study = NULL;
5202 md->match_limit = MATCH_LIMIT;
5203 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5204 md->callout_data = NULL;
5205
5206 /* The table pointer is always in native byte order. */
5207
5208 tables = external_re->tables;
5209
5210 if (extra_data != NULL)
5211 {
5212 register unsigned int flags = extra_data->flags;
5213 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5214 study = (const pcre_study_data *)extra_data->study_data;
5215 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5216 md->match_limit = extra_data->match_limit;
5217 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5218 md->match_limit_recursion = extra_data->match_limit_recursion;
5219 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5220 md->callout_data = extra_data->callout_data;
5221 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5222 }
5223
5224 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5225 is a feature that makes it possible to save compiled regex and re-use them
5226 in other programs later. */
5227
5228 if (tables == NULL) tables = _pcre_default_tables;
5229
5230 /* Check that the first field in the block is the magic number. If it is not,
5231 test for a regex that was compiled on a host of opposite endianness. If this is
5232 the case, flipped values are put in internal_re and internal_study if there was
5233 study data too. */
5234
5235 if (re->magic_number != MAGIC_NUMBER)
5236 {
5237 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5238 if (re == NULL) return PCRE_ERROR_BADMAGIC;
5239 if (study != NULL) study = &internal_study;
5240 }
5241
5242 /* Set up other data */
5243
5244 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5245 startline = (re->flags & PCRE_STARTLINE) != 0;
5246 firstline = (re->options & PCRE_FIRSTLINE) != 0;
5247
5248 /* The code starts after the real_pcre block and the capture name table. */
5249
5250 md->start_code = (const uschar *)external_re + re->name_table_offset +
5251 re->name_count * re->name_entry_size;
5252
5253 md->start_subject = (USPTR)subject;
5254 md->start_offset = start_offset;
5255 md->end_subject = md->start_subject + length;
5256 end_subject = md->end_subject;
5257
5258 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5259 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
5260 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5261
5262 md->notbol = (options & PCRE_NOTBOL) != 0;
5263 md->noteol = (options & PCRE_NOTEOL) != 0;
5264 md->notempty = (options & PCRE_NOTEMPTY) != 0;
5265 md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5266 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5267 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
5268 md->hitend = FALSE;
5269
5270 md->recursive = NULL; /* No recursion at top level */
5271
5272 md->lcc = tables + lcc_offset;
5273 md->ctypes = tables + ctypes_offset;
5274
5275 /* Handle different \R options. */
5276
5277 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5278 {
5279 case 0:
5280 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5281 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5282 else
5283 #ifdef BSR_ANYCRLF
5284 md->bsr_anycrlf = TRUE;
5285 #else
5286 md->bsr_anycrlf = FALSE;
5287 #endif
5288 break;
5289
5290 case PCRE_BSR_ANYCRLF:
5291 md->bsr_anycrlf = TRUE;
5292 break;
5293
5294 case PCRE_BSR_UNICODE:
5295 md->bsr_anycrlf = FALSE;
5296 break;
5297
5298 default: return PCRE_ERROR_BADNEWLINE;
5299 }
5300
5301 /* Handle different types of newline. The three bits give eight cases. If
5302 nothing is set at run time, whatever was used at compile time applies. */
5303
5304 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5305 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5306 {
5307 case 0: newline = NEWLINE; break; /* Compile-time default */
5308 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5309 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
5310 case PCRE_NEWLINE_CR+
5311 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
5312 case PCRE_NEWLINE_ANY: newline = -1; break;
5313 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5314 default: return PCRE_ERROR_BADNEWLINE;
5315 }
5316
5317 if (newline == -2)
5318 {
5319 md->nltype = NLTYPE_ANYCRLF;
5320 }
5321 else if (newline < 0)
5322 {
5323 md->nltype = NLTYPE_ANY;
5324 }
5325 else
5326 {
5327 md->nltype = NLTYPE_FIXED;
5328 if (newline > 255)
5329 {
5330 md->nllen = 2;
5331 md->nl[0] = (newline >> 8) & 255;
5332 md->nl[1] = newline & 255;
5333 }
5334 else
5335 {
5336 md->nllen = 1;
5337 md->nl[0] = newline;
5338 }
5339 }
5340
5341 /* Partial matching was originally supported only for a restricted set of
5342 regexes; from release 8.00 there are no restrictions, but the bits are still
5343 defined (though never set). So there's no harm in leaving this code. */
5344
5345 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5346 return PCRE_ERROR_BADPARTIAL;
5347
5348 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5349 back the character offset. */
5350
5351 #ifdef SUPPORT_UTF8
5352 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5353 {
5354 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
5355 return PCRE_ERROR_BADUTF8;
5356 if (start_offset > 0 && start_offset < length)
5357 {
5358 int tb = ((USPTR)subject)[start_offset];
5359 if (tb > 127)
5360 {
5361 tb &= 0xc0;
5362 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5363 }
5364 }
5365 }
5366 #endif
5367
5368 /* The ims options can vary during the matching as a result of the presence
5369 of (?ims) items in the pattern. They are kept in a local variable so that
5370 restoring at the exit of a group is easy. */
5371
5372 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5373
5374 /* If the expression has got more back references than the offsets supplied can
5375 hold, we get a temporary chunk of working store to use during the matching.
5376 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5377 of 3. */
5378
5379 ocount = offsetcount - (offsetcount % 3);
5380
5381 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5382 {
5383 ocount = re->top_backref * 3 + 3;
5384 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5385 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5386 using_temporary_offsets = TRUE;
5387 DPRINTF(("Got memory to hold back references\n"));
5388 }
5389 else md->offset_vector = offsets;
5390
5391 md->offset_end = ocount;
5392 md->offset_max = (2*ocount)/3;
5393 md->offset_overflow = FALSE;
5394 md->capture_last = -1;
5395
5396 /* Compute the minimum number of offsets that we need to reset each time. Doing
5397 this makes a huge difference to execution time when there aren't many brackets
5398 in the pattern. */
5399
5400 resetcount = 2 + re->top_bracket * 2;
5401 if (resetcount > offsetcount) resetcount = ocount;
5402
5403 /* Reset the working variable associated with each extraction. These should
5404 never be used unless previously set, but they get saved and restored, and so we
5405 initialize them to avoid reading uninitialized locations. */
5406
5407 if (md->offset_vector != NULL)
5408 {
5409 register int *iptr = md->offset_vector + ocount;
5410 register int *iend = iptr - resetcount/2 + 1;
5411 while (--iptr >= iend) *iptr = -1;
5412 }
5413
5414 /* Set up the first character to match, if available. The first_byte value is
5415 never set for an anchored regular expression, but the anchoring may be forced
5416 at run time, so we have to test for anchoring. The first char may be unset for
5417 an unanchored pattern, of course. If there's no first char and the pattern was
5418 studied, there may be a bitmap of possible first characters. */
5419
5420 if (!anchored)
5421 {
5422 if ((re->flags & PCRE_FIRSTSET) != 0)
5423 {
5424 first_byte = re->first_byte & 255;
5425 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5426 first_byte = md->lcc[first_byte];
5427 }
5428 else
5429 if (!startline && study != NULL &&
5430 (study->flags & PCRE_STUDY_MAPPED) != 0)
5431 start_bits = study->start_bits;
5432 }
5433
5434 /* For anchored or unanchored matches, there may be a "last known required
5435 character" set. */
5436
5437 if ((re->flags & PCRE_REQCHSET) != 0)
5438 {
5439 req_byte = re->req_byte & 255;
5440 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5441 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5442 }
5443
5444
5445 /* ==========================================================================*/
5446
5447 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5448 the loop runs just once. */
5449
5450 for(;;)
5451 {
5452 USPTR save_end_subject = end_subject;
5453 USPTR new_start_match;
5454
5455 /* Reset the maximum number of extractions we might see. */
5456
5457 if (md->offset_vector != NULL)
5458 {
5459 register int *iptr = md->offset_vector;
5460 register int *iend = iptr + resetcount;
5461 while (iptr < iend) *iptr++ = -1;
5462 }
5463
5464 /* If firstline is TRUE, the start of the match is constrained to the first
5465 line of a multiline string. That is, the match must be before or at the first
5466 newline. Implement this by temporarily adjusting end_subject so that we stop
5467 scanning at a newline. If the match fails at the newline, later code breaks
5468 this loop. */
5469
5470 if (firstline)
5471 {
5472 USPTR t = start_match;
5473 #ifdef SUPPORT_UTF8
5474 if (utf8)
5475 {
5476 while (t < md->end_subject && !IS_NEWLINE(t))
5477 {
5478 t++;
5479 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5480 }
5481 }
5482 else
5483 #endif
5484 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5485 end_subject = t;
5486 }
5487
5488 /* There are some optimizations that avoid running the match if a known
5489 starting point is not found, or if a known later character is not present.
5490 However, there is an option that disables these, for testing and for ensuring
5491 that all callouts do actually occur. */
5492
5493 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5494 {
5495 /* Advance to a unique first byte if there is one. */
5496
5497 if (first_byte >= 0)
5498 {
5499 if (first_byte_caseless)
5500 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5501 start_match++;
5502 else
5503 while (start_match < end_subject && *start_match != first_byte)
5504 start_match++;
5505 }
5506
5507 /* Or to just after a linebreak for a multiline match */
5508
5509 else if (startline)
5510 {
5511 if (start_match > md->start_subject + start_offset)
5512 {
5513 #ifdef SUPPORT_UTF8
5514 if (utf8)
5515 {
5516 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5517 {
5518 start_match++;
5519 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5520 start_match++;
5521 }
5522 }
5523 else
5524 #endif
5525 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5526 start_match++;
5527
5528 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5529 and we are now at a LF, advance the match position by one more character.
5530 */
5531
5532 if (start_match[-1] == CHAR_CR &&
5533 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5534 start_match < end_subject &&
5535 *start_match == CHAR_NL)
5536 start_match++;
5537 }
5538 }
5539
5540 /* Or to a non-unique first byte after study */
5541
5542 else if (start_bits != NULL)
5543 {
5544 while (start_match < end_subject)
5545 {
5546 register unsigned int c = *start_match;
5547 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5548 else break;
5549 }
5550 }
5551 } /* Starting optimizations */
5552
5553 /* Restore fudged end_subject */
5554
5555 end_subject = save_end_subject;
5556
5557 /* The following two optimizations are disabled for partial matching or if
5558 disabling is explicitly requested. */
5559
5560 if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
5561 {
5562 /* If the pattern was studied, a minimum subject length may be set. This is
5563 a lower bound; no actual string of that length may actually match the
5564 pattern. Although the value is, strictly, in characters, we treat it as
5565 bytes to avoid spending too much time in this optimization. */
5566
5567 if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
5568 end_subject - start_match < study->minlength)
5569 {
5570 rc = MATCH_NOMATCH;
5571 break;
5572 }
5573
5574 /* If req_byte is set, we know that that character must appear in the
5575 subject for the match to succeed. If the first character is set, req_byte
5576 must be later in the subject; otherwise the test starts at the match point.
5577 This optimization can save a huge amount of backtracking in patterns with
5578 nested unlimited repeats that aren't going to match. Writing separate code
5579 for cased/caseless versions makes it go faster, as does using an
5580 autoincrement and backing off on a match.
5581
5582 HOWEVER: when the subject string is very, very long, searching to its end
5583 can take a long time, and give bad performance on quite ordinary patterns.
5584 This showed up when somebody was matching something like /^\d+C/ on a
5585 32-megabyte string... so we don't do this when the string is sufficiently
5586 long. */
5587
5588 if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
5589 {
5590 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5591
5592 /* We don't need to repeat the search if we haven't yet reached the
5593 place we found it at last time. */
5594
5595 if (p > req_byte_ptr)
5596 {
5597 if (req_byte_caseless)
5598 {
5599 while (p < end_subject)
5600 {
5601 register int pp = *p++;
5602 if (pp == req_byte || pp == req_byte2) { p--; break; }
5603 }
5604 }
5605 else
5606 {
5607 while (p < end_subject)
5608 {
5609 if (*p++ == req_byte) { p--; break; }
5610 }
5611 }
5612
5613 /* If we can't find the required character, break the matching loop,
5614 forcing a match failure. */
5615
5616 if (p >= end_subject)
5617 {
5618 rc = MATCH_NOMATCH;
5619 break;
5620 }
5621
5622 /* If we have found the required character, save the point where we
5623 found it, so that we don't search again next time round the loop if
5624 the start hasn't passed this character yet. */
5625
5626 req_byte_ptr = p;
5627 }
5628 }
5629 }
5630
5631 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5632 printf(">>>> Match against: ");
5633 pchars(start_match, end_subject - start_match, TRUE, md);
5634 printf("\n");
5635 #endif
5636
5637 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5638 first starting point for which a partial match was found. */
5639
5640 md->start_match_ptr = start_match;
5641 md->start_used_ptr = start_match;
5642 md->match_call_count = 0;
5643 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5644 if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
5645
5646 switch(rc)
5647 {
5648 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5649 exactly like PRUNE. */
5650
5651 case MATCH_NOMATCH:
5652 case MATCH_PRUNE:
5653 case MATCH_THEN:
5654 new_start_match = start_match + 1;
5655 #ifdef SUPPORT_UTF8
5656 if (utf8)
5657 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5658 new_start_match++;
5659 #endif
5660 break;
5661
5662 /* SKIP passes back the next starting point explicitly. */
5663
5664 case MATCH_SKIP:
5665 new_start_match = md->start_match_ptr;
5666 break;
5667
5668 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5669
5670 case MATCH_COMMIT:
5671 rc = MATCH_NOMATCH;
5672 goto ENDLOOP;
5673
5674 /* Any other return is either a match, or some kind of error. */
5675
5676 default:
5677 goto ENDLOOP;
5678 }
5679
5680 /* Control reaches here for the various types of "no match at this point"
5681 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5682
5683 rc = MATCH_NOMATCH;
5684
5685 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5686 newline in the subject (though it may continue over the newline). Therefore,
5687 if we have just failed to match, starting at a newline, do not continue. */
5688
5689 if (firstline && IS_NEWLINE(start_match)) break;
5690
5691 /* Advance to new matching position */
5692
5693 start_match = new_start_match;
5694
5695 /* Break the loop if the pattern is anchored or if we have passed the end of
5696 the subject. */
5697
5698 if (anchored || start_match > end_subject) break;
5699
5700 /* If we have just passed a CR and we are now at a LF, and the pattern does
5701 not contain any explicit matches for \r or \n, and the newline option is CRLF
5702 or ANY or ANYCRLF, advance the match position by one more character. */
5703
5704 if (start_match[-1] == CHAR_CR &&
5705 start_match < end_subject &&
5706 *start_match == CHAR_NL &&
5707 (re->flags & PCRE_HASCRORLF) == 0 &&
5708 (md->nltype == NLTYPE_ANY ||
5709 md->nltype == NLTYPE_ANYCRLF ||
5710 md->nllen == 2))
5711 start_match++;
5712
5713 } /* End of for(;;) "bumpalong" loop */
5714
5715 /* ==========================================================================*/
5716
5717 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5718 conditions is true:
5719
5720 (1) The pattern is anchored or the match was failed by (*COMMIT);
5721
5722 (2) We are past the end of the subject;
5723
5724 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5725 this option requests that a match occur at or before the first newline in
5726 the subject.
5727
5728 When we have a match and the offset vector is big enough to deal with any
5729 backreferences, captured substring offsets will already be set up. In the case
5730 where we had to get some local store to hold offsets for backreference
5731 processing, copy those that we can. In this case there need not be overflow if
5732 certain parts of the pattern were not used, even though there are more
5733 capturing parentheses than vector slots. */
5734
5735 ENDLOOP:
5736
5737 if (rc == MATCH_MATCH)
5738 {
5739 if (using_temporary_offsets)
5740 {
5741 if (offsetcount >= 4)
5742 {
5743 memcpy(offsets + 2, md->offset_vector + 2,
5744 (offsetcount - 2) * sizeof(int));
5745 DPRINTF(("Copied offsets from temporary memory\n"));
5746 }
5747 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5748 DPRINTF(("Freeing temporary memory\n"));
5749 (pcre_free)(md->offset_vector);
5750 }
5751
5752 /* Set the return code to the number of captured strings, or 0 if there are
5753 too many to fit into the vector. */
5754
5755 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5756
5757 /* If there is space, set up the whole thing as substring 0. The value of
5758 md->start_match_ptr might be modified if \K was encountered on the success
5759 matching path. */
5760
5761 if (offsetcount < 2) rc = 0; else
5762 {
5763 offsets[0] = md->start_match_ptr - md->start_subject;
5764 offsets[1] = md->end_match_ptr - md->start_subject;
5765 }
5766
5767 DPRINTF((">>>> returning %d\n", rc));
5768 return rc;
5769 }
5770
5771 /* Control gets here if there has been an error, or if the overall match
5772 attempt has failed at all permitted starting positions. */
5773
5774 if (using_temporary_offsets)
5775 {
5776 DPRINTF(("Freeing temporary memory\n"));
5777 (pcre_free)(md->offset_vector);
5778 }
5779
5780 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5781 {
5782 DPRINTF((">>>> error: returning %d\n", rc));
5783 return rc;
5784 }
5785 else if (start_partial != NULL)
5786 {
5787 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5788 if (offsetcount > 1)
5789 {
5790 offsets[0] = start_partial - (USPTR)subject;
5791 offsets[1] = end_subject - (USPTR)subject;
5792 }
5793 return PCRE_ERROR_PARTIAL;
5794 }
5795 else
5796 {
5797 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5798 return PCRE_ERROR_NOMATCH;
5799 }
5800 }
5801
5802 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12