/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 426 - (show annotations) (download)
Wed Aug 26 15:38:32 2009 UTC (5 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 161361 byte(s)
Remove restrictions on pcre_exec() partial matching.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
243 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 RM51, RM52, RM53, RM54 };
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
257 #ifdef DEBUG
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 { \
260 printf("match() called in line %d\n", __LINE__); \
261 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 printf("to line %d\n", __LINE__); \
263 }
264 #define RRETURN(ra) \
265 { \
266 printf("match() returned %d from line %d ", ra, __LINE__); \
267 return ra; \
268 }
269 #else
270 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 #define RRETURN(ra) return ra
273 #endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap-frame version of RMATCH. It obtains a new frame, records in the current
frame the number (rw) of the call site so that HEAP_RETURN can jump back to the
label L_<rw>, fills in the new frame's copies of the match() arguments, makes
the new frame current, and jumps to HEAP_RECURSE.

  ra  new eptr          re  new ims
  rb  new ecode         rf  new eptrb
  rc  new offset_top    rg  new flags
  rd  md (unused here)  rw  unique RMn number of this call site

If the frame cannot be allocated, the current frame is abandoned via
RRETURN(PCRE_ERROR_NOMEMORY) instead of dereferencing a NULL pointer; the
negative code then propagates outwards like any other error return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
304 #define RRETURN(ra)\
305 {\
306 heapframe *newframe = frame;\
307 frame = newframe->Xprevframe;\
308 (pcre_stack_free)(newframe);\
309 if (frame != NULL)\
310 {\
311 rrc = ra;\
312 goto HEAP_RETURN;\
313 }\
314 return ra;\
315 }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 USPTR Xeptr;
326 const uschar *Xecode;
327 USPTR Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338 USPTR Xcharptr;
339 #endif
340 USPTR Xdata;
341 USPTR Xnext;
342 USPTR Xpp;
343 USPTR Xprev;
344 USPTR Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc;
368 int Xfi;
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb;
380
381 /* Where to jump back to */
382
383 int Xwhere;
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
403 /* These macros pack up tests that are used for partial matching, and which
404 appear several times in the code. We set the "hit end" flag if the pointer is
405 at the end of the subject and also past the start of the subject (i.e.
406 something has been matched). The second one is used when we already know we are
407 past the end of the subject. */
408
409 #define CHECK_PARTIAL()\
410 if (md->partial && eptr >= md->end_subject && eptr > mstart)\
411 md->hitend = TRUE
412
413 #define SCHECK_PARTIAL()\
414 if (md->partial && eptr > mstart) md->hitend = TRUE
415
416 /* Performance note: It might be tempting to extract commonly used fields from
417 the md structure (e.g. utf8, end_subject) into individual variables to improve
418 performance. Tests using gcc on a SPARC disproved this; in the first case, it
419 made performance worse.
420
421 Arguments:
422 eptr pointer to current character in subject
423 ecode pointer to current position in compiled code
424 mstart pointer to the current match start position (can be modified
425 by encountering \K)
426 offset_top current top pointer
427 md pointer to "static" info for the match
428 ims current /i, /m, and /s options
429 eptrb pointer to chain of blocks containing eptr at start of
430 brackets - for testing for empty matches
431 flags can contain
432 match_condassert - this is an assertion condition
433 match_cbegroup - this is the start of an unlimited repeat
434 group that can match an empty string
435 rdepth the recursion depth
436
437 Returns: MATCH_MATCH if matched ) these values are >= 0
438 MATCH_NOMATCH if failed to match )
439 a negative PCRE_ERROR_xxx value if aborted by an error condition
440 (e.g. stopped by repeated call or recursion limit)
441 */
442
443 static int
444 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
445 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
446 int flags, unsigned int rdepth)
447 {
448 /* These variables do not need to be preserved over recursion in this function,
449 so they can be ordinary variables in all cases. Mark some of them with
450 "register" because they are used a lot in loops. */
451
452 register int rrc; /* Returns from recursive calls */
453 register int i; /* Used for loops not involving calls to RMATCH() */
454 register unsigned int c; /* Character values not kept over RMATCH() calls */
455 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
456
457 BOOL minimize, possessive; /* Quantifier options */
458 int condcode;
459
460 /* When recursion is not being used, all "local" variables that have to be
461 preserved over calls to RMATCH() are part of a "frame" which is obtained from
462 heap storage. Set up the top-level frame here; others are obtained from the
463 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
464
465 #ifdef NO_RECURSE
466 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
467 frame->Xprevframe = NULL; /* Marks the top level */
468
469 /* Copy in the original argument variables */
470
471 frame->Xeptr = eptr;
472 frame->Xecode = ecode;
473 frame->Xmstart = mstart;
474 frame->Xoffset_top = offset_top;
475 frame->Xims = ims;
476 frame->Xeptrb = eptrb;
477 frame->Xflags = flags;
478 frame->Xrdepth = rdepth;
479
480 /* This is where control jumps back to in order to effect "recursion" */
481
482 HEAP_RECURSE:
483
484 /* Macros make the argument variables come from the current frame */
485
486 #define eptr frame->Xeptr
487 #define ecode frame->Xecode
488 #define mstart frame->Xmstart
489 #define offset_top frame->Xoffset_top
490 #define ims frame->Xims
491 #define eptrb frame->Xeptrb
492 #define flags frame->Xflags
493 #define rdepth frame->Xrdepth
494
495 /* Ditto for the local variables */
496
497 #ifdef SUPPORT_UTF8
498 #define charptr frame->Xcharptr
499 #endif
500 #define callpat frame->Xcallpat
501 #define codelink frame->Xcodelink
502 #define data frame->Xdata
503 #define next frame->Xnext
504 #define pp frame->Xpp
505 #define prev frame->Xprev
506 #define saved_eptr frame->Xsaved_eptr
507
508 #define new_recursive frame->Xnew_recursive
509
510 #define cur_is_word frame->Xcur_is_word
511 #define condition frame->Xcondition
512 #define prev_is_word frame->Xprev_is_word
513
514 #define original_ims frame->Xoriginal_ims
515
516 #ifdef SUPPORT_UCP
517 #define prop_type frame->Xprop_type
518 #define prop_value frame->Xprop_value
519 #define prop_fail_result frame->Xprop_fail_result
520 #define prop_category frame->Xprop_category
521 #define prop_chartype frame->Xprop_chartype
522 #define prop_script frame->Xprop_script
523 #define oclength frame->Xoclength
524 #define occhars frame->Xocchars
525 #endif
526
527 #define ctype frame->Xctype
528 #define fc frame->Xfc
529 #define fi frame->Xfi
530 #define length frame->Xlength
531 #define max frame->Xmax
532 #define min frame->Xmin
533 #define number frame->Xnumber
534 #define offset frame->Xoffset
535 #define op frame->Xop
536 #define save_capture_last frame->Xsave_capture_last
537 #define save_offset1 frame->Xsave_offset1
538 #define save_offset2 frame->Xsave_offset2
539 #define save_offset3 frame->Xsave_offset3
540 #define stacksave frame->Xstacksave
541
542 #define newptrb frame->Xnewptrb
543
544 /* When recursion is being used, local variables are allocated on the stack and
545 get preserved during recursion in the normal way. In this environment, fi and
546 i, and fc and c, can be the same variables. */
547
548 #else /* NO_RECURSE not defined */
549 #define fi i
550 #define fc c
551
552
553 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
554 const uschar *charptr; /* in small blocks of the code. My normal */
555 #endif /* style of coding would have declared */
556 const uschar *callpat; /* them within each of those blocks. */
557 const uschar *data; /* However, in order to accommodate the */
558 const uschar *next; /* version of this code that uses an */
559 USPTR pp; /* external "stack" implemented on the */
560 const uschar *prev; /* heap, it is easier to declare them all */
561 USPTR saved_eptr; /* here, so the declarations can be cut */
562 /* out in a block. The only declarations */
563 recursion_info new_recursive; /* within blocks below are for variables */
564 /* that do not have to be preserved over */
565 BOOL cur_is_word; /* a recursive call to RMATCH(). */
566 BOOL condition;
567 BOOL prev_is_word;
568
569 unsigned long int original_ims;
570
571 #ifdef SUPPORT_UCP
572 int prop_type;
573 int prop_value;
574 int prop_fail_result;
575 int prop_category;
576 int prop_chartype;
577 int prop_script;
578 int oclength;
579 uschar occhars[8];
580 #endif
581
582 int codelink;
583 int ctype;
584 int length;
585 int max;
586 int min;
587 int number;
588 int offset;
589 int op;
590 int save_capture_last;
591 int save_offset1, save_offset2, save_offset3;
592 int stacksave[REC_STACK_SAVE_MAX];
593
594 eptrblock newptrb;
595 #endif /* NO_RECURSE */
596
597 /* These statements are here to stop the compiler complaining about uninitialized
598 variables. */
599
600 #ifdef SUPPORT_UCP
601 prop_value = 0;
602 prop_fail_result = 0;
603 #endif
604
605
606 /* This label is used for tail recursion, which is used in a few cases even
607 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
608 used. Thanks to Ian Taylor for noticing this possibility and sending the
609 original patch. */
610
611 TAIL_RECURSE:
612
613 /* OK, now we can get on with the real code of the function. Recursive calls
614 are specified by the macro RMATCH and RRETURN is used to return. When
615 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
616 and a "return", respectively (possibly with some debugging if DEBUG is
617 defined). However, RMATCH isn't like a function call because it's quite a
618 complicated macro. It has to be used in one particular way. This shouldn't,
619 however, impact performance when true recursion is being used. */
620
621 #ifdef SUPPORT_UTF8
622 utf8 = md->utf8; /* Local copy of the flag */
623 #else
624 utf8 = FALSE;
625 #endif
626
627 /* First check that we haven't called match() too many times, or that we
628 haven't exceeded the recursive call limit. */
629
630 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
631 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
632
633 original_ims = ims; /* Save for resetting on ')' */
634
635 /* At the start of a group with an unlimited repeat that may match an empty
636 string, the match_cbegroup flag is set. When this is the case, add the current
637 subject pointer to the chain of such remembered pointers, to be checked when we
638 hit the closing ket, in order to break infinite loops that match no characters.
639 When match() is called in other circumstances, don't add to the chain. The
640 match_cbegroup flag must NOT be used with tail recursion, because the memory
641 block that is used is on the stack, so a new one may be required for each
642 match(). */
643
644 if ((flags & match_cbegroup) != 0)
645 {
646 newptrb.epb_saved_eptr = eptr;
647 newptrb.epb_prev = eptrb;
648 eptrb = &newptrb;
649 }
650
651 /* Now start processing the opcodes. */
652
653 for (;;)
654 {
655 minimize = possessive = FALSE;
656 op = *ecode;
657
658 /* For partial matching, remember if we ever hit the end of the subject after
659 matching at least one subject character. This code is now wrapped in a macro
660 because it appears several times below. */
661
662 CHECK_PARTIAL();
663
664 switch(op)
665 {
666 case OP_FAIL:
667 RRETURN(MATCH_NOMATCH);
668
669 case OP_PRUNE:
670 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
671 ims, eptrb, flags, RM51);
672 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
673 RRETURN(MATCH_PRUNE);
674
675 case OP_COMMIT:
676 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
677 ims, eptrb, flags, RM52);
678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
679 RRETURN(MATCH_COMMIT);
680
681 case OP_SKIP:
682 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
683 ims, eptrb, flags, RM53);
684 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
685 md->start_match_ptr = eptr; /* Pass back current position */
686 RRETURN(MATCH_SKIP);
687
688 case OP_THEN:
689 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
690 ims, eptrb, flags, RM54);
691 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
692 RRETURN(MATCH_THEN);
693
694 /* Handle a capturing bracket. If there is space in the offset vector, save
695 the current subject position in the working slot at the top of the vector.
696 We mustn't change the current values of the data slot, because they may be
697 set from a previous iteration of this group, and be referred to by a
698 reference inside the group.
699
700 If the bracket fails to match, we need to restore this value and also the
701 values of the final offsets, in case they were set by a previous iteration
702 of the same bracket.
703
704 If there isn't enough space in the offset vector, treat this as if it were
705 a non-capturing bracket. Don't worry about setting the flag for the error
706 case here; that is handled in the code for KET. */
707
708 case OP_CBRA:
709 case OP_SCBRA:
710 number = GET2(ecode, 1+LINK_SIZE);
711 offset = number << 1;
712
713 #ifdef DEBUG
714 printf("start bracket %d\n", number);
715 printf("subject=");
716 pchars(eptr, 16, TRUE, md);
717 printf("\n");
718 #endif
719
720 if (offset < md->offset_max)
721 {
722 save_offset1 = md->offset_vector[offset];
723 save_offset2 = md->offset_vector[offset+1];
724 save_offset3 = md->offset_vector[md->offset_end - number];
725 save_capture_last = md->capture_last;
726
727 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
728 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
729
730 flags = (op == OP_SCBRA)? match_cbegroup : 0;
731 do
732 {
733 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
734 ims, eptrb, flags, RM1);
735 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
736 md->capture_last = save_capture_last;
737 ecode += GET(ecode, 1);
738 }
739 while (*ecode == OP_ALT);
740
741 DPRINTF(("bracket %d failed\n", number));
742
743 md->offset_vector[offset] = save_offset1;
744 md->offset_vector[offset+1] = save_offset2;
745 md->offset_vector[md->offset_end - number] = save_offset3;
746
747 RRETURN(MATCH_NOMATCH);
748 }
749
750 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
751 as a non-capturing bracket. */
752
753 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
754 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
755
756 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
757
758 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
759 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
760
761 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
762 final alternative within the brackets, we would return the result of a
763 recursive call to match() whatever happened. We can reduce stack usage by
764 turning this into a tail recursion, except in the case when match_cbegroup
765 is set.*/
766
767 case OP_BRA:
768 case OP_SBRA:
769 DPRINTF(("start non-capturing bracket\n"));
770 flags = (op >= OP_SBRA)? match_cbegroup : 0;
771 for (;;)
772 {
773 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
774 {
775 if (flags == 0) /* Not a possibly empty group */
776 {
777 ecode += _pcre_OP_lengths[*ecode];
778 DPRINTF(("bracket 0 tail recursion\n"));
779 goto TAIL_RECURSE;
780 }
781
782 /* Possibly empty group; can't use tail recursion. */
783
784 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
785 eptrb, flags, RM48);
786 RRETURN(rrc);
787 }
788
789 /* For non-final alternatives, continue the loop for a NOMATCH result;
790 otherwise return. */
791
792 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
793 eptrb, flags, RM2);
794 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
795 ecode += GET(ecode, 1);
796 }
797 /* Control never reaches here. */
798
799 /* Conditional group: compilation checked that there are no more than
800 two branches. If the condition is false, skipping the first branch takes us
801 past the end if there is only one branch, but that's OK because that is
802 exactly what going to the ket would do. As there is only one branch to be
803 obeyed, we can use tail recursion to avoid using another stack frame. */
804
805 case OP_COND:
806 case OP_SCOND:
807 codelink= GET(ecode, 1);
808
809 /* Because of the way auto-callout works during compile, a callout item is
810 inserted between OP_COND and an assertion condition. */
811
812 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
813 {
814 if (pcre_callout != NULL)
815 {
816 pcre_callout_block cb;
817 cb.version = 1; /* Version 1 of the callout block */
818 cb.callout_number = ecode[LINK_SIZE+2];
819 cb.offset_vector = md->offset_vector;
820 cb.subject = (PCRE_SPTR)md->start_subject;
821 cb.subject_length = md->end_subject - md->start_subject;
822 cb.start_match = mstart - md->start_subject;
823 cb.current_position = eptr - md->start_subject;
824 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
825 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
826 cb.capture_top = offset_top/2;
827 cb.capture_last = md->capture_last;
828 cb.callout_data = md->callout_data;
829 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
830 if (rrc < 0) RRETURN(rrc);
831 }
832 ecode += _pcre_OP_lengths[OP_CALLOUT];
833 }
834
835 condcode = ecode[LINK_SIZE+1];
836
837 /* Now see what the actual condition is */
838
839 if (condcode == OP_RREF) /* Recursion test */
840 {
841 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
842 condition = md->recursive != NULL &&
843 (offset == RREF_ANY || offset == md->recursive->group_num);
844 ecode += condition? 3 : GET(ecode, 1);
845 }
846
847 else if (condcode == OP_CREF) /* Group used test */
848 {
849 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
850 condition = offset < offset_top && md->offset_vector[offset] >= 0;
851 ecode += condition? 3 : GET(ecode, 1);
852 }
853
854 else if (condcode == OP_DEF) /* DEFINE - always false */
855 {
856 condition = FALSE;
857 ecode += GET(ecode, 1);
858 }
859
860 /* The condition is an assertion. Call match() to evaluate it - setting
861 the final argument match_condassert causes it to stop at the end of an
862 assertion. */
863
864 else
865 {
866 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
867 match_condassert, RM3);
868 if (rrc == MATCH_MATCH)
869 {
870 condition = TRUE;
871 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
872 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
873 }
874 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
875 {
876 RRETURN(rrc); /* Need braces because of following else */
877 }
878 else
879 {
880 condition = FALSE;
881 ecode += codelink;
882 }
883 }
884
885 /* We are now at the branch that is to be obeyed. As there is only one,
886 we can use tail recursion to avoid using another stack frame, except when
887 match_cbegroup is required for an unlimited repeat of a possibly empty
888 group. If the second alternative doesn't exist, we can just plough on. */
889
890 if (condition || *ecode == OP_ALT)
891 {
892 ecode += 1 + LINK_SIZE;
893 if (op == OP_SCOND) /* Possibly empty group */
894 {
895 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
896 RRETURN(rrc);
897 }
898 else /* Group must match something */
899 {
900 flags = 0;
901 goto TAIL_RECURSE;
902 }
903 }
904 else /* Condition false & no alternative */
905 {
906 ecode += 1 + LINK_SIZE;
907 }
908 break;
909
910
911 /* End of the pattern, either real or forced. If we are in a top-level
912 recursion, we should restore the offsets appropriately and continue from
913 after the call. */
914
915 case OP_ACCEPT:
916 case OP_END:
917 if (md->recursive != NULL && md->recursive->group_num == 0)
918 {
919 recursion_info *rec = md->recursive;
920 DPRINTF(("End of pattern in a (?0) recursion\n"));
921 md->recursive = rec->prevrec;
922 memmove(md->offset_vector, rec->offset_save,
923 rec->saved_max * sizeof(int));
924 mstart = rec->save_start;
925 ims = original_ims;
926 ecode = rec->after_call;
927 break;
928 }
929
930 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
931 string - backtracking will then try other alternatives, if any. */
932
933 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
934 md->end_match_ptr = eptr; /* Record where we ended */
935 md->end_offset_top = offset_top; /* and how many extracts were taken */
936 md->start_match_ptr = mstart; /* and the start (\K can modify) */
937 RRETURN(MATCH_MATCH);
938
939 /* Change option settings */
940
941 case OP_OPT:
942 ims = ecode[1];
943 ecode += 2;
944 DPRINTF(("ims set to %02lx\n", ims));
945 break;
946
947 /* Assertion brackets. Check the alternative branches in turn - the
948 matching won't pass the KET for an assertion. If any one branch matches,
949 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
950 start of each branch to move the current point backwards, so the code at
951 this level is identical to the lookahead case. */
952
953 case OP_ASSERT:
954 case OP_ASSERTBACK:
955 do
956 {
957 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
958 RM4);
959 if (rrc == MATCH_MATCH) break;
960 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
961 ecode += GET(ecode, 1);
962 }
963 while (*ecode == OP_ALT);
964 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
965
966 /* If checking an assertion for a condition, return MATCH_MATCH. */
967
968 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
969
970 /* Continue from after the assertion, updating the offsets high water
971 mark, since extracts may have been taken during the assertion. */
972
973 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
974 ecode += 1 + LINK_SIZE;
975 offset_top = md->end_offset_top;
976 continue;
977
978 /* Negative assertion: all branches must fail to match */
979
980 case OP_ASSERT_NOT:
981 case OP_ASSERTBACK_NOT:
982 do
983 {
984 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
985 RM5);
986 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
987 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
988 ecode += GET(ecode,1);
989 }
990 while (*ecode == OP_ALT);
991
992 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
993
994 ecode += 1 + LINK_SIZE;
995 continue;
996
997 /* Move the subject pointer back. This occurs only at the start of
998 each branch of a lookbehind assertion. If we are too close to the start to
999 move back, this match function fails. When working with UTF-8 we move
1000 back a number of characters, not bytes. */
1001
1002 case OP_REVERSE:
1003 #ifdef SUPPORT_UTF8
1004 if (utf8)
1005 {
1006 i = GET(ecode, 1);
1007 while (i-- > 0)
1008 {
1009 eptr--;
1010 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1011 BACKCHAR(eptr);
1012 }
1013 }
1014 else
1015 #endif
1016
1017 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1018
1019 {
1020 eptr -= GET(ecode, 1);
1021 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1022 }
1023
1024 /* Skip to next op code */
1025
1026 ecode += 1 + LINK_SIZE;
1027 break;
1028
1029 /* The callout item calls an external function, if one is provided, passing
1030 details of the match so far. This is mainly for debugging, though the
1031 function is able to force a failure. */
1032
1033 case OP_CALLOUT:
1034 if (pcre_callout != NULL)
1035 {
1036 pcre_callout_block cb;
1037 cb.version = 1; /* Version 1 of the callout block */
1038 cb.callout_number = ecode[1];
1039 cb.offset_vector = md->offset_vector;
1040 cb.subject = (PCRE_SPTR)md->start_subject;
1041 cb.subject_length = md->end_subject - md->start_subject;
1042 cb.start_match = mstart - md->start_subject;
1043 cb.current_position = eptr - md->start_subject;
1044 cb.pattern_position = GET(ecode, 2);
1045 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1046 cb.capture_top = offset_top/2;
1047 cb.capture_last = md->capture_last;
1048 cb.callout_data = md->callout_data;
1049 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1050 if (rrc < 0) RRETURN(rrc);
1051 }
1052 ecode += 2 + 2*LINK_SIZE;
1053 break;
1054
1055 /* Recursion either matches the current regex, or some subexpression. The
1056 offset data is the offset to the starting bracket from the start of the
1057 whole pattern. (This is so that it works from duplicated subpatterns.)
1058
1059 If there are any capturing brackets started but not finished, we have to
1060 save their starting points and reinstate them after the recursion. However,
1061 we don't know how many such there are (offset_top records the completed
1062 total) so we just have to save all the potential data. There may be up to
1063 65535 such values, which is too large to put on the stack, but using malloc
1064 for small numbers seems expensive. As a compromise, the stack is used when
1065 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1066 is used. If the malloc fails, the recursion is not attempted: this call of
1067 match() gives up at once and returns PCRE_ERROR_NOMEMORY instead of saving
1068 only part of the offset data.
1069
1070 There are also other values that have to be saved. We use a chained
1071 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1072 for the original version of this logic. */
1073
1074 case OP_RECURSE:
1075 {
1076 callpat = md->start_code + GET(ecode, 1);
1077 new_recursive.group_num = (callpat == md->start_code)? 0 :
1078 GET2(callpat, 1 + LINK_SIZE);
1079
1080 /* Add to "recursing stack" */
1081
1082 new_recursive.prevrec = md->recursive;
1083 md->recursive = &new_recursive;
1084
1085 /* Find where to continue from afterwards */
1086
1087 ecode += 1 + LINK_SIZE;
1088 new_recursive.after_call = ecode;
1089
1090 /* Now save the offset data. */
1091
1092 new_recursive.saved_max = md->offset_end;
1093 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1094 new_recursive.offset_save = stacksave;
1095 else
1096 {
1097 new_recursive.offset_save =
1098 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1099 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1100 }
1101
1102 memcpy(new_recursive.offset_save, md->offset_vector,
1103 new_recursive.saved_max * sizeof(int));
1104 new_recursive.save_start = mstart;
1105 mstart = eptr;
1106
1107 /* OK, now we can do the recursion. For each top-level alternative we
1108 restore the offset and recursion data. */
1109
1110 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1111 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1112 do
1113 {
1114 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1115 md, ims, eptrb, flags, RM6);
1116 if (rrc == MATCH_MATCH)
1117 {
1118 DPRINTF(("Recursion matched\n"));
1119 md->recursive = new_recursive.prevrec;
1120 if (new_recursive.offset_save != stacksave)
1121 (pcre_free)(new_recursive.offset_save);
1122 RRETURN(MATCH_MATCH);
1123 }
1124 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1125 {
1126 DPRINTF(("Recursion gave error %d\n", rrc));
1127 if (new_recursive.offset_save != stacksave)
1128 (pcre_free)(new_recursive.offset_save);
1129 RRETURN(rrc);
1130 }
1131
1132 md->recursive = &new_recursive;
1133 memcpy(md->offset_vector, new_recursive.offset_save,
1134 new_recursive.saved_max * sizeof(int));
1135 callpat += GET(callpat, 1);
1136 }
1137 while (*callpat == OP_ALT);
1138
1139 DPRINTF(("Recursion didn't match\n"));
1140 md->recursive = new_recursive.prevrec;
1141 if (new_recursive.offset_save != stacksave)
1142 (pcre_free)(new_recursive.offset_save);
1143 RRETURN(MATCH_NOMATCH);
1144 }
1145 /* Control never reaches here */
1146
1147 /* "Once" brackets are like assertion brackets except that after a match,
1148 the point in the subject string is not moved back. Thus there can never be
1149 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1150 Check the alternative branches in turn - the matching won't pass the KET
1151 for this kind of subpattern. If any one branch matches, we carry on as at
1152 the end of a normal bracket, leaving the subject pointer. */
1153
1154 case OP_ONCE:
1155 prev = ecode;
1156 saved_eptr = eptr;
1157
1158 do
1159 {
1160 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1161 if (rrc == MATCH_MATCH) break;
1162 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1163 ecode += GET(ecode,1);
1164 }
1165 while (*ecode == OP_ALT);
1166
1167 /* If hit the end of the group (which could be repeated), fail */
1168
1169 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1170
1171 /* Continue as from after the assertion, updating the offsets high water
1172 mark, since extracts may have been taken. */
1173
1174 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1175
1176 offset_top = md->end_offset_top;
1177 eptr = md->end_match_ptr;
1178
1179 /* For a non-repeating ket, just continue at this level. This also
1180 happens for a repeating ket if no characters were matched in the group.
1181 This is the forcible breaking of infinite loops as implemented in Perl
1182 5.005. If there is an options reset, it will get obeyed in the normal
1183 course of events. */
1184
1185 if (*ecode == OP_KET || eptr == saved_eptr)
1186 {
1187 ecode += 1+LINK_SIZE;
1188 break;
1189 }
1190
1191 /* The repeating kets try the rest of the pattern or restart from the
1192 preceding bracket, in the appropriate order. The second "call" of match()
1193 uses tail recursion, to avoid using another stack frame. We need to reset
1194 any options that changed within the bracket before re-running it, so
1195 check the next opcode. */
1196
1197 if (ecode[1+LINK_SIZE] == OP_OPT)
1198 {
1199 ims = (ims & ~PCRE_IMS) | ecode[4];
1200 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1201 }
1202
1203 if (*ecode == OP_KETRMIN)
1204 {
1205 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1206 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1207 ecode = prev;
1208 flags = 0;
1209 goto TAIL_RECURSE;
1210 }
1211 else /* OP_KETRMAX */
1212 {
1213 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1214 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1215 ecode += 1 + LINK_SIZE;
1216 flags = 0;
1217 goto TAIL_RECURSE;
1218 }
1219 /* Control never gets here */
1220
1221 /* An alternation is the end of a branch; scan along to find the end of the
1222 bracketed group and go to there. */
1223
1224 case OP_ALT:
1225 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1226 break;
1227
1228 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1229 indicating that it may occur zero times. It may repeat infinitely, or not
1230 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1231 with fixed upper repeat limits are compiled as a number of copies, with the
1232 optional ones preceded by BRAZERO or BRAMINZERO. */
1233
1234 case OP_BRAZERO:
1235 {
1236 next = ecode+1;
1237 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1238 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1239 do next += GET(next,1); while (*next == OP_ALT);
1240 ecode = next + 1 + LINK_SIZE;
1241 }
1242 break;
1243
1244 case OP_BRAMINZERO:
1245 {
1246 next = ecode+1;
1247 do next += GET(next, 1); while (*next == OP_ALT);
1248 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1249 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1250 ecode++;
1251 }
1252 break;
1253
1254 case OP_SKIPZERO:
1255 {
1256 next = ecode+1;
1257 do next += GET(next,1); while (*next == OP_ALT);
1258 ecode = next + 1 + LINK_SIZE;
1259 }
1260 break;
1261
1262 /* End of a group, repeated or non-repeating. */
1263
1264 case OP_KET:
1265 case OP_KETRMIN:
1266 case OP_KETRMAX:
1267 prev = ecode - GET(ecode, 1);
1268
1269 /* If this was a group that remembered the subject start, in order to break
1270 infinite repeats of empty string matches, retrieve the subject start from
1271 the chain. Otherwise, set it NULL. */
1272
1273 if (*prev >= OP_SBRA)
1274 {
1275 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1276 eptrb = eptrb->epb_prev; /* Backup to previous group */
1277 }
1278 else saved_eptr = NULL;
1279
1280 /* If we are at the end of an assertion group, stop matching and return
1281 MATCH_MATCH, but record the current high water mark for use by positive
1282 assertions. Do this also for the "once" (atomic) groups. */
1283
1284 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1285 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1286 *prev == OP_ONCE)
1287 {
1288 md->end_match_ptr = eptr; /* For ONCE */
1289 md->end_offset_top = offset_top;
1290 RRETURN(MATCH_MATCH);
1291 }
1292
1293 /* For capturing groups we have to check the group number back at the start
1294 and if necessary complete handling an extraction by setting the offsets and
1295 bumping the high water mark. Note that whole-pattern recursion is coded as
1296 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1297 when the OP_END is reached. Other recursion is handled here. */
1298
1299 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1300 {
1301 number = GET2(prev, 1+LINK_SIZE);
1302 offset = number << 1;
1303
1304 #ifdef DEBUG
1305 printf("end bracket %d", number);
1306 printf("\n");
1307 #endif
1308
1309 md->capture_last = number;
1310 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1311 {
1312 md->offset_vector[offset] =
1313 md->offset_vector[md->offset_end - number];
1314 md->offset_vector[offset+1] = eptr - md->start_subject;
1315 if (offset_top <= offset) offset_top = offset + 2;
1316 }
1317
1318 /* Handle a recursively called group. Restore the offsets
1319 appropriately and continue from after the call. */
1320
1321 if (md->recursive != NULL && md->recursive->group_num == number)
1322 {
1323 recursion_info *rec = md->recursive;
1324 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1325 md->recursive = rec->prevrec;
1326 mstart = rec->save_start;
1327 memcpy(md->offset_vector, rec->offset_save,
1328 rec->saved_max * sizeof(int));
1329 ecode = rec->after_call;
1330 ims = original_ims;
1331 break;
1332 }
1333 }
1334
1335 /* For both capturing and non-capturing groups, reset the value of the ims
1336 flags, in case they got changed during the group. */
1337
1338 ims = original_ims;
1339 DPRINTF(("ims reset to %02lx\n", ims));
1340
1341 /* For a non-repeating ket, just continue at this level. This also
1342 happens for a repeating ket if no characters were matched in the group.
1343 This is the forcible breaking of infinite loops as implemented in Perl
1344 5.005. If there is an options reset, it will get obeyed in the normal
1345 course of events. */
1346
1347 if (*ecode == OP_KET || eptr == saved_eptr)
1348 {
1349 ecode += 1 + LINK_SIZE;
1350 break;
1351 }
1352
1353 /* The repeating kets try the rest of the pattern or restart from the
1354 preceding bracket, in the appropriate order. In the second case, we can use
1355 tail recursion to avoid using another stack frame, unless we have an
1356 unlimited repeat of a group that can match an empty string. */
1357
1358 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1359
1360 if (*ecode == OP_KETRMIN)
1361 {
1362 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1363 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1364 if (flags != 0) /* Could match an empty string */
1365 {
1366 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1367 RRETURN(rrc);
1368 }
1369 ecode = prev;
1370 goto TAIL_RECURSE;
1371 }
1372 else /* OP_KETRMAX */
1373 {
1374 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1375 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1376 ecode += 1 + LINK_SIZE;
1377 flags = 0;
1378 goto TAIL_RECURSE;
1379 }
1380 /* Control never gets here */
1381
1382 /* Start of subject unless notbol, or after internal newline if multiline */
1383
1384 case OP_CIRC:
1385 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1386 if ((ims & PCRE_MULTILINE) != 0)
1387 {
1388 if (eptr != md->start_subject &&
1389 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1390 RRETURN(MATCH_NOMATCH);
1391 ecode++;
1392 break;
1393 }
1394 /* ... else fall through */
1395
1396 /* Start of subject assertion */
1397
1398 case OP_SOD:
1399 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1400 ecode++;
1401 break;
1402
1403 /* Start of match assertion */
1404
1405 case OP_SOM:
1406 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1407 ecode++;
1408 break;
1409
1410 /* Reset the start of match point */
1411
1412 case OP_SET_SOM:
1413 mstart = eptr;
1414 ecode++;
1415 break;
1416
1417 /* Assert before internal newline if multiline, or before a terminating
1418 newline unless endonly is set, else end of subject unless noteol is set. */
1419
1420 case OP_DOLL:
1421 if ((ims & PCRE_MULTILINE) != 0)
1422 {
1423 if (eptr < md->end_subject)
1424 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1425 else
1426 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1427 ecode++;
1428 break;
1429 }
1430 else
1431 {
1432 if (md->noteol) RRETURN(MATCH_NOMATCH);
1433 if (!md->endonly)
1434 {
1435 if (eptr != md->end_subject &&
1436 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1437 RRETURN(MATCH_NOMATCH);
1438 ecode++;
1439 break;
1440 }
1441 }
1442 /* ... else fall through for endonly */
1443
1444 /* End of subject assertion (\z) */
1445
1446 case OP_EOD:
1447 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1448 ecode++;
1449 break;
1450
1451 /* End of subject or ending \n assertion (\Z) */
1452
1453 case OP_EODN:
1454 if (eptr != md->end_subject &&
1455 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1456 RRETURN(MATCH_NOMATCH);
1457 ecode++;
1458 break;
1459
1460 /* Word boundary assertions */
1461
1462 case OP_NOT_WORD_BOUNDARY:
1463 case OP_WORD_BOUNDARY:
1464 {
1465
1466 /* Find out if the previous and current characters are "word" characters.
1467 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1468 be "non-word" characters. */
1469
1470 #ifdef SUPPORT_UTF8
1471 if (utf8)
1472 {
1473 if (eptr == md->start_subject) prev_is_word = FALSE; else
1474 {
1475 USPTR lastptr = eptr - 1;
1476 while((*lastptr & 0xc0) == 0x80) lastptr--;
1477 GETCHAR(c, lastptr);
1478 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1479 }
1480 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1481 {
1482 GETCHAR(c, eptr);
1483 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1484 }
1485 }
1486 else
1487 #endif
1488
1489 /* More streamlined when not in UTF-8 mode */
1490
1491 {
1492 prev_is_word = (eptr != md->start_subject) &&
1493 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1494 cur_is_word = (eptr < md->end_subject) &&
1495 ((md->ctypes[*eptr] & ctype_word) != 0);
1496 }
1497
1498 /* Now see if the situation is what we want */
1499
1500 if ((*ecode++ == OP_WORD_BOUNDARY)?
1501 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1502 RRETURN(MATCH_NOMATCH);
1503 }
1504 break;
1505
1506 /* Match a single character type; inline for speed */
1507
1508 case OP_ANY:
1509 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1510 /* Fall through */
1511
1512 case OP_ALLANY:
1513 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1514 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1515 ecode++;
1516 break;
1517
1518 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1519 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1520
1521 case OP_ANYBYTE:
1522 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1523 ecode++;
1524 break;
1525
1526 case OP_NOT_DIGIT:
1527 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1528 GETCHARINCTEST(c, eptr);
1529 if (
1530 #ifdef SUPPORT_UTF8
1531 c < 256 &&
1532 #endif
1533 (md->ctypes[c] & ctype_digit) != 0
1534 )
1535 RRETURN(MATCH_NOMATCH);
1536 ecode++;
1537 break;
1538
1539 case OP_DIGIT:
1540 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1541 GETCHARINCTEST(c, eptr);
1542 if (
1543 #ifdef SUPPORT_UTF8
1544 c >= 256 ||
1545 #endif
1546 (md->ctypes[c] & ctype_digit) == 0
1547 )
1548 RRETURN(MATCH_NOMATCH);
1549 ecode++;
1550 break;
1551
1552 case OP_NOT_WHITESPACE:
1553 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1554 GETCHARINCTEST(c, eptr);
1555 if (
1556 #ifdef SUPPORT_UTF8
1557 c < 256 &&
1558 #endif
1559 (md->ctypes[c] & ctype_space) != 0
1560 )
1561 RRETURN(MATCH_NOMATCH);
1562 ecode++;
1563 break;
1564
1565 case OP_WHITESPACE:
1566 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1567 GETCHARINCTEST(c, eptr);
1568 if (
1569 #ifdef SUPPORT_UTF8
1570 c >= 256 ||
1571 #endif
1572 (md->ctypes[c] & ctype_space) == 0
1573 )
1574 RRETURN(MATCH_NOMATCH);
1575 ecode++;
1576 break;
1577
1578 case OP_NOT_WORDCHAR:
1579 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1580 GETCHARINCTEST(c, eptr);
1581 if (
1582 #ifdef SUPPORT_UTF8
1583 c < 256 &&
1584 #endif
1585 (md->ctypes[c] & ctype_word) != 0
1586 )
1587 RRETURN(MATCH_NOMATCH);
1588 ecode++;
1589 break;
1590
1591 case OP_WORDCHAR:
1592 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1593 GETCHARINCTEST(c, eptr);
1594 if (
1595 #ifdef SUPPORT_UTF8
1596 c >= 256 ||
1597 #endif
1598 (md->ctypes[c] & ctype_word) == 0
1599 )
1600 RRETURN(MATCH_NOMATCH);
1601 ecode++;
1602 break;
1603
1604 case OP_ANYNL:
1605 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1606 GETCHARINCTEST(c, eptr);
1607 switch(c)
1608 {
1609 default: RRETURN(MATCH_NOMATCH);
1610 case 0x000d:
1611 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1612 break;
1613
1614 case 0x000a:
1615 break;
1616
1617 case 0x000b:
1618 case 0x000c:
1619 case 0x0085:
1620 case 0x2028:
1621 case 0x2029:
1622 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1623 break;
1624 }
1625 ecode++;
1626 break;
1627
1628 case OP_NOT_HSPACE:
1629 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1630 GETCHARINCTEST(c, eptr);
1631 switch(c)
1632 {
1633 default: break;
1634 case 0x09: /* HT */
1635 case 0x20: /* SPACE */
1636 case 0xa0: /* NBSP */
1637 case 0x1680: /* OGHAM SPACE MARK */
1638 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1639 case 0x2000: /* EN QUAD */
1640 case 0x2001: /* EM QUAD */
1641 case 0x2002: /* EN SPACE */
1642 case 0x2003: /* EM SPACE */
1643 case 0x2004: /* THREE-PER-EM SPACE */
1644 case 0x2005: /* FOUR-PER-EM SPACE */
1645 case 0x2006: /* SIX-PER-EM SPACE */
1646 case 0x2007: /* FIGURE SPACE */
1647 case 0x2008: /* PUNCTUATION SPACE */
1648 case 0x2009: /* THIN SPACE */
1649 case 0x200A: /* HAIR SPACE */
1650 case 0x202f: /* NARROW NO-BREAK SPACE */
1651 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1652 case 0x3000: /* IDEOGRAPHIC SPACE */
1653 RRETURN(MATCH_NOMATCH);
1654 }
1655 ecode++;
1656 break;
1657
1658 case OP_HSPACE:
1659 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1660 GETCHARINCTEST(c, eptr);
1661 switch(c)
1662 {
1663 default: RRETURN(MATCH_NOMATCH);
1664 case 0x09: /* HT */
1665 case 0x20: /* SPACE */
1666 case 0xa0: /* NBSP */
1667 case 0x1680: /* OGHAM SPACE MARK */
1668 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1669 case 0x2000: /* EN QUAD */
1670 case 0x2001: /* EM QUAD */
1671 case 0x2002: /* EN SPACE */
1672 case 0x2003: /* EM SPACE */
1673 case 0x2004: /* THREE-PER-EM SPACE */
1674 case 0x2005: /* FOUR-PER-EM SPACE */
1675 case 0x2006: /* SIX-PER-EM SPACE */
1676 case 0x2007: /* FIGURE SPACE */
1677 case 0x2008: /* PUNCTUATION SPACE */
1678 case 0x2009: /* THIN SPACE */
1679 case 0x200A: /* HAIR SPACE */
1680 case 0x202f: /* NARROW NO-BREAK SPACE */
1681 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1682 case 0x3000: /* IDEOGRAPHIC SPACE */
1683 break;
1684 }
1685 ecode++;
1686 break;
1687
1688 case OP_NOT_VSPACE:
1689 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1690 GETCHARINCTEST(c, eptr);
1691 switch(c)
1692 {
1693 default: break;
1694 case 0x0a: /* LF */
1695 case 0x0b: /* VT */
1696 case 0x0c: /* FF */
1697 case 0x0d: /* CR */
1698 case 0x85: /* NEL */
1699 case 0x2028: /* LINE SEPARATOR */
1700 case 0x2029: /* PARAGRAPH SEPARATOR */
1701 RRETURN(MATCH_NOMATCH);
1702 }
1703 ecode++;
1704 break;
1705
1706 case OP_VSPACE:
1707 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1708 GETCHARINCTEST(c, eptr);
1709 switch(c)
1710 {
1711 default: RRETURN(MATCH_NOMATCH);
1712 case 0x0a: /* LF */
1713 case 0x0b: /* VT */
1714 case 0x0c: /* FF */
1715 case 0x0d: /* CR */
1716 case 0x85: /* NEL */
1717 case 0x2028: /* LINE SEPARATOR */
1718 case 0x2029: /* PARAGRAPH SEPARATOR */
1719 break;
1720 }
1721 ecode++;
1722 break;
1723
1724 #ifdef SUPPORT_UCP
1725 /* Check the next character by Unicode property. We will get here only
1726 if the support is in the binary; otherwise a compile-time error occurs. */
1727
1728 case OP_PROP:
1729 case OP_NOTPROP:
1730 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1731 GETCHARINCTEST(c, eptr);
1732 {
1733 const ucd_record *prop = GET_UCD(c);
1734
1735 switch(ecode[1])
1736 {
1737 case PT_ANY:
1738 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1739 break;
1740
1741 case PT_LAMP:
1742 if ((prop->chartype == ucp_Lu ||
1743 prop->chartype == ucp_Ll ||
1744 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1745 RRETURN(MATCH_NOMATCH);
1746 break;
1747
1748 case PT_GC:
1749 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1750 RRETURN(MATCH_NOMATCH);
1751 break;
1752
1753 case PT_PC:
1754 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1755 RRETURN(MATCH_NOMATCH);
1756 break;
1757
1758 case PT_SC:
1759 if ((ecode[2] != prop->script) == (op == OP_PROP))
1760 RRETURN(MATCH_NOMATCH);
1761 break;
1762
1763 default:
1764 RRETURN(PCRE_ERROR_INTERNAL);
1765 }
1766
1767 ecode += 3;
1768 }
1769 break;
1770
1771 /* Match an extended Unicode sequence. We will get here only if the support
1772 is in the binary; otherwise a compile-time error occurs. */
1773
1774 case OP_EXTUNI:
1775 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1776 GETCHARINCTEST(c, eptr);
1777 {
1778 int category = UCD_CATEGORY(c);
1779 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1780 while (eptr < md->end_subject)
1781 {
1782 int len = 1;
1783 if (!utf8) c = *eptr; else
1784 {
1785 GETCHARLEN(c, eptr, len);
1786 }
1787 category = UCD_CATEGORY(c);
1788 if (category != ucp_M) break;
1789 eptr += len;
1790 }
1791 }
1792 ecode++;
1793 break;
1794 #endif
1795
1796
1797 /* Match a back reference, possibly repeatedly. Look past the end of the
1798 item to see if there is repeat information following. The code is similar
1799 to that for character classes, but repeated for efficiency. Then obey
1800 similar code to character type repeats - written out again for speed.
1801 However, if the referenced string is the empty string, always treat
1802 it as matched, any number of times (otherwise there could be infinite
1803 loops). */
1804
1805 case OP_REF:
1806 {
1807 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1808 ecode += 3;
1809
1810 /* If the reference is unset, there are two possibilities:
1811
1812 (a) In the default, Perl-compatible state, set the length to be longer
1813 than the amount of subject left; this ensures that every attempt at a
1814 match fails. We can't just fail here, because of the possibility of
1815 quantifiers with zero minima.
1816
1817 (b) If the JavaScript compatibility flag is set, set the length to zero
1818 so that the back reference matches an empty string.
1819
1820 Otherwise, set the length to the length of what was matched by the
1821 referenced subpattern. */
1822
1823 if (offset >= offset_top || md->offset_vector[offset] < 0)
1824 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1825 else
1826 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1827
1828 /* Set up for repetition, or handle the non-repeated case */
1829
1830 switch (*ecode)
1831 {
1832 case OP_CRSTAR:
1833 case OP_CRMINSTAR:
1834 case OP_CRPLUS:
1835 case OP_CRMINPLUS:
1836 case OP_CRQUERY:
1837 case OP_CRMINQUERY:
1838 c = *ecode++ - OP_CRSTAR;
1839 minimize = (c & 1) != 0;
1840 min = rep_min[c]; /* Pick up values from tables; */
1841 max = rep_max[c]; /* zero for max => infinity */
1842 if (max == 0) max = INT_MAX;
1843 break;
1844
1845 case OP_CRRANGE:
1846 case OP_CRMINRANGE:
1847 minimize = (*ecode == OP_CRMINRANGE);
1848 min = GET2(ecode, 1);
1849 max = GET2(ecode, 3);
1850 if (max == 0) max = INT_MAX;
1851 ecode += 5;
1852 break;
1853
1854 default: /* No repeat follows */
1855 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1856 eptr += length;
1857 continue; /* With the main loop */
1858 }
1859
1860 /* If the length of the reference is zero, just continue with the
1861 main loop. */
1862
1863 if (length == 0) continue;
1864
1865 /* First, ensure the minimum number of matches are present. We get back
1866 the length of the reference string explicitly rather than passing the
1867 address of eptr, so that eptr can be a register variable. */
1868
1869 for (i = 1; i <= min; i++)
1870 {
1871 if (!match_ref(offset, eptr, length, md, ims))
1872 {
1873 CHECK_PARTIAL();
1874 RRETURN(MATCH_NOMATCH);
1875 }
1876 eptr += length;
1877 }
1878
1879 /* If min = max, continue at the same level without recursion.
1880 They are not both allowed to be zero. */
1881
1882 if (min == max) continue;
1883
1884 /* If minimizing, keep trying and advancing the pointer */
1885
1886 if (minimize)
1887 {
1888 for (fi = min;; fi++)
1889 {
1890 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1891 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1892 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1893 {
1894 CHECK_PARTIAL();
1895 RRETURN(MATCH_NOMATCH);
1896 }
1897 eptr += length;
1898 }
1899 /* Control never gets here */
1900 }
1901
1902 /* If maximizing, find the longest string and work backwards */
1903
1904 else
1905 {
1906 pp = eptr;
1907 for (i = min; i < max; i++)
1908 {
1909 if (!match_ref(offset, eptr, length, md, ims)) break;
1910 eptr += length;
1911 }
1912 CHECK_PARTIAL();
1913 while (eptr >= pp)
1914 {
1915 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1916 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1917 eptr -= length;
1918 }
1919 RRETURN(MATCH_NOMATCH);
1920 }
1921 }
1922 /* Control never gets here */
1923
1924
1925
1926 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1927 used when all the characters in the class have values in the range 0-255,
1928 and either the matching is caseful, or the characters are in the range
1929 0-127 when UTF-8 processing is enabled. The only difference between
1930 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1931 encountered.
1932
1933 First, look past the end of the item to see if there is repeat information
1934 following. Then obey similar code to character type repeats - written out
1935 again for speed. */
1936
1937 case OP_NCLASS:
1938 case OP_CLASS:
1939 {
1940 data = ecode + 1; /* Save for matching */
1941 ecode += 33; /* Advance past the item */
1942
1943 switch (*ecode)
1944 {
1945 case OP_CRSTAR:
1946 case OP_CRMINSTAR:
1947 case OP_CRPLUS:
1948 case OP_CRMINPLUS:
1949 case OP_CRQUERY:
1950 case OP_CRMINQUERY:
1951 c = *ecode++ - OP_CRSTAR;
1952 minimize = (c & 1) != 0;
1953 min = rep_min[c]; /* Pick up values from tables; */
1954 max = rep_max[c]; /* zero for max => infinity */
1955 if (max == 0) max = INT_MAX;
1956 break;
1957
1958 case OP_CRRANGE:
1959 case OP_CRMINRANGE:
1960 minimize = (*ecode == OP_CRMINRANGE);
1961 min = GET2(ecode, 1);
1962 max = GET2(ecode, 3);
1963 if (max == 0) max = INT_MAX;
1964 ecode += 5;
1965 break;
1966
1967 default: /* No repeat follows */
1968 min = max = 1;
1969 break;
1970 }
1971
1972 /* First, ensure the minimum number of matches are present. */
1973
1974 #ifdef SUPPORT_UTF8
1975 /* UTF-8 mode */
1976 if (utf8)
1977 {
1978 for (i = 1; i <= min; i++)
1979 {
1980 if (eptr >= md->end_subject)
1981 {
1982 CHECK_PARTIAL();
1983 RRETURN(MATCH_NOMATCH);
1984 }
1985 GETCHARINC(c, eptr);
1986 if (c > 255)
1987 {
1988 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1989 }
1990 else
1991 {
1992 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1993 }
1994 }
1995 }
1996 else
1997 #endif
1998 /* Not UTF-8 mode */
1999 {
2000 for (i = 1; i <= min; i++)
2001 {
2002 if (eptr >= md->end_subject)
2003 {
2004 CHECK_PARTIAL();
2005 RRETURN(MATCH_NOMATCH);
2006 }
2007 c = *eptr++;
2008 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2009 }
2010 }
2011
2012 /* If max == min we can continue with the main loop without the
2013 need to recurse. */
2014
2015 if (min == max) continue;
2016
2017 /* If minimizing, keep testing the rest of the expression and advancing
2018 the pointer while it matches the class. */
2019
2020 if (minimize)
2021 {
2022 #ifdef SUPPORT_UTF8
2023 /* UTF-8 mode */
2024 if (utf8)
2025 {
2026 for (fi = min;; fi++)
2027 {
2028 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2029 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2030 if (fi >= max)
2031 {
2032 CHECK_PARTIAL();
2033 RRETURN(MATCH_NOMATCH);
2034 }
2035 if (eptr >= md->end_subject)
2036 {
2037 SCHECK_PARTIAL();
2038 RRETURN(MATCH_NOMATCH);
2039 }
2040 GETCHARINC(c, eptr);
2041 if (c > 255)
2042 {
2043 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2044 }
2045 else
2046 {
2047 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2048 }
2049 }
2050 }
2051 else
2052 #endif
2053 /* Not UTF-8 mode */
2054 {
2055 for (fi = min;; fi++)
2056 {
2057 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2059 if (fi >= max)
2060 {
2061 CHECK_PARTIAL();
2062 RRETURN(MATCH_NOMATCH);
2063 }
2064 if (eptr >= md->end_subject)
2065 {
2066 SCHECK_PARTIAL();
2067 RRETURN(MATCH_NOMATCH);
2068 }
2069 c = *eptr++;
2070 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2071 }
2072 }
2073 /* Control never gets here */
2074 }
2075
2076 /* If maximizing, find the longest possible run, then work backwards. */
2077
2078 else
2079 {
2080 pp = eptr;
2081
2082 #ifdef SUPPORT_UTF8
2083 /* UTF-8 mode */
2084 if (utf8)
2085 {
2086 for (i = min; i < max; i++)
2087 {
2088 int len = 1;
2089 if (eptr >= md->end_subject) break;
2090 GETCHARLEN(c, eptr, len);
2091 if (c > 255)
2092 {
2093 if (op == OP_CLASS) break;
2094 }
2095 else
2096 {
2097 if ((data[c/8] & (1 << (c&7))) == 0) break;
2098 }
2099 eptr += len;
2100 }
2101 CHECK_PARTIAL();
2102 for (;;)
2103 {
2104 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2105 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2106 if (eptr-- == pp) break; /* Stop if tried at original pos */
2107 BACKCHAR(eptr);
2108 }
2109 }
2110 else
2111 #endif
2112 /* Not UTF-8 mode */
2113 {
2114 for (i = min; i < max; i++)
2115 {
2116 if (eptr >= md->end_subject) break;
2117 c = *eptr;
2118 if ((data[c/8] & (1 << (c&7))) == 0) break;
2119 eptr++;
2120 }
2121 CHECK_PARTIAL();
2122 while (eptr >= pp)
2123 {
2124 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2126 eptr--;
2127 }
2128 }
2129
2130 RRETURN(MATCH_NOMATCH);
2131 }
2132 }
2133 /* Control never gets here */
2134
2135
2136 /* Match an extended character class. This opcode is encountered only
2137 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2138 mode, because Unicode properties are supported in non-UTF-8 mode. */
2139
2140 #ifdef SUPPORT_UTF8
2141 case OP_XCLASS:
2142 {
2143 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2144 ecode += GET(ecode, 1); /* Advance past the item */
2145
2146 switch (*ecode)
2147 {
2148 case OP_CRSTAR:
2149 case OP_CRMINSTAR:
2150 case OP_CRPLUS:
2151 case OP_CRMINPLUS:
2152 case OP_CRQUERY:
2153 case OP_CRMINQUERY:
2154 c = *ecode++ - OP_CRSTAR;
2155 minimize = (c & 1) != 0;
2156 min = rep_min[c]; /* Pick up values from tables; */
2157 max = rep_max[c]; /* zero for max => infinity */
2158 if (max == 0) max = INT_MAX;
2159 break;
2160
2161 case OP_CRRANGE:
2162 case OP_CRMINRANGE:
2163 minimize = (*ecode == OP_CRMINRANGE);
2164 min = GET2(ecode, 1);
2165 max = GET2(ecode, 3);
2166 if (max == 0) max = INT_MAX;
2167 ecode += 5;
2168 break;
2169
2170 default: /* No repeat follows */
2171 min = max = 1;
2172 break;
2173 }
2174
2175 /* First, ensure the minimum number of matches are present. */
2176
2177 for (i = 1; i <= min; i++)
2178 {
2179 if (eptr >= md->end_subject)
2180 {
2181 SCHECK_PARTIAL();
2182 RRETURN(MATCH_NOMATCH);
2183 }
2184 GETCHARINCTEST(c, eptr);
2185 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2186 }
2187
2188 /* If max == min we can continue with the main loop without the
2189 need to recurse. */
2190
2191 if (min == max) continue;
2192
2193 /* If minimizing, keep testing the rest of the expression and advancing
2194 the pointer while it matches the class. */
2195
2196 if (minimize)
2197 {
2198 for (fi = min;; fi++)
2199 {
2200 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2201 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2202 if (fi >= max)
2203 {
2204 CHECK_PARTIAL();
2205 RRETURN(MATCH_NOMATCH);
2206 }
2207 if (eptr >= md->end_subject)
2208 {
2209 SCHECK_PARTIAL();
2210 RRETURN(MATCH_NOMATCH);
2211 }
2212 GETCHARINCTEST(c, eptr);
2213 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2214 }
2215 /* Control never gets here */
2216 }
2217
2218 /* If maximizing, find the longest possible run, then work backwards. */
2219
2220 else
2221 {
2222 pp = eptr;
2223 for (i = min; i < max; i++)
2224 {
2225 int len = 1;
2226 if (eptr >= md->end_subject) break;
2227 GETCHARLENTEST(c, eptr, len);
2228 if (!_pcre_xclass(c, data)) break;
2229 eptr += len;
2230 }
2231 CHECK_PARTIAL();
2232 for(;;)
2233 {
2234 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2235 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2236 if (eptr-- == pp) break; /* Stop if tried at original pos */
2237 if (utf8) BACKCHAR(eptr);
2238 }
2239 RRETURN(MATCH_NOMATCH);
2240 }
2241
2242 /* Control never gets here */
2243 }
2244 #endif /* End of XCLASS */
2245
2246 /* Match a single character, casefully */
2247
2248 case OP_CHAR:
2249 #ifdef SUPPORT_UTF8
2250 if (utf8)
2251 {
2252 length = 1;
2253 ecode++;
2254 GETCHARLEN(fc, ecode, length);
2255 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2256 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2257 }
2258 else
2259 #endif
2260
2261 /* Non-UTF-8 mode */
2262 {
2263 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2264 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2265 ecode += 2;
2266 }
2267 break;
2268
2269 /* Match a single character, caselessly */
2270
2271 case OP_CHARNC:
2272 #ifdef SUPPORT_UTF8
2273 if (utf8)
2274 {
2275 length = 1;
2276 ecode++;
2277 GETCHARLEN(fc, ecode, length);
2278
2279 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2280
2281 /* If the pattern character's value is < 128, we have only one byte, and
2282 can use the fast lookup table. */
2283
2284 if (fc < 128)
2285 {
2286 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2287 }
2288
2289 /* Otherwise we must pick up the subject character */
2290
2291 else
2292 {
2293 unsigned int dc;
2294 GETCHARINC(dc, eptr);
2295 ecode += length;
2296
2297 /* If we have Unicode property support, we can use it to test the other
2298 case of the character, if there is one. */
2299
2300 if (fc != dc)
2301 {
2302 #ifdef SUPPORT_UCP
2303 if (dc != UCD_OTHERCASE(fc))
2304 #endif
2305 RRETURN(MATCH_NOMATCH);
2306 }
2307 }
2308 }
2309 else
2310 #endif /* SUPPORT_UTF8 */
2311
2312 /* Non-UTF-8 mode */
2313 {
2314 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2315 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2316 ecode += 2;
2317 }
2318 break;
2319
2320 /* Match a single character repeatedly. */
2321
2322 case OP_EXACT:
2323 min = max = GET2(ecode, 1);
2324 ecode += 3;
2325 goto REPEATCHAR;
2326
2327 case OP_POSUPTO:
2328 possessive = TRUE;
2329 /* Fall through */
2330
2331 case OP_UPTO:
2332 case OP_MINUPTO:
2333 min = 0;
2334 max = GET2(ecode, 1);
2335 minimize = *ecode == OP_MINUPTO;
2336 ecode += 3;
2337 goto REPEATCHAR;
2338
2339 case OP_POSSTAR:
2340 possessive = TRUE;
2341 min = 0;
2342 max = INT_MAX;
2343 ecode++;
2344 goto REPEATCHAR;
2345
2346 case OP_POSPLUS:
2347 possessive = TRUE;
2348 min = 1;
2349 max = INT_MAX;
2350 ecode++;
2351 goto REPEATCHAR;
2352
2353 case OP_POSQUERY:
2354 possessive = TRUE;
2355 min = 0;
2356 max = 1;
2357 ecode++;
2358 goto REPEATCHAR;
2359
2360 case OP_STAR:
2361 case OP_MINSTAR:
2362 case OP_PLUS:
2363 case OP_MINPLUS:
2364 case OP_QUERY:
2365 case OP_MINQUERY:
2366 c = *ecode++ - OP_STAR;
2367 minimize = (c & 1) != 0;
2368 min = rep_min[c]; /* Pick up values from tables; */
2369 max = rep_max[c]; /* zero for max => infinity */
2370 if (max == 0) max = INT_MAX;
2371
2372 /* Common code for all repeated single-character matches. */
2373
2374 REPEATCHAR:
2375 #ifdef SUPPORT_UTF8
2376 if (utf8)
2377 {
2378 length = 1;
2379 charptr = ecode;
2380 GETCHARLEN(fc, ecode, length);
2381 ecode += length;
2382
2383 /* Handle multibyte character matching specially here. There is
2384 support for caseless matching if UCP support is present. */
2385
2386 if (length > 1)
2387 {
2388 #ifdef SUPPORT_UCP
2389 unsigned int othercase;
2390 if ((ims & PCRE_CASELESS) != 0 &&
2391 (othercase = UCD_OTHERCASE(fc)) != fc)
2392 oclength = _pcre_ord2utf8(othercase, occhars);
2393 else oclength = 0;
2394 #endif /* SUPPORT_UCP */
2395
2396 for (i = 1; i <= min; i++)
2397 {
2398 if (eptr <= md->end_subject - length &&
2399 memcmp(eptr, charptr, length) == 0) eptr += length;
2400 #ifdef SUPPORT_UCP
2401 else if (oclength > 0 &&
2402 eptr <= md->end_subject - oclength &&
2403 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2404 #endif /* SUPPORT_UCP */
2405 else
2406 {
2407 CHECK_PARTIAL();
2408 RRETURN(MATCH_NOMATCH);
2409 }
2410 }
2411
2412 if (min == max) continue;
2413
2414 if (minimize)
2415 {
2416 for (fi = min;; fi++)
2417 {
2418 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2419 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2420 if (fi >= max)
2421 {
2422 CHECK_PARTIAL();
2423 RRETURN(MATCH_NOMATCH);
2424 }
2425 if (eptr <= md->end_subject - length &&
2426 memcmp(eptr, charptr, length) == 0) eptr += length;
2427 #ifdef SUPPORT_UCP
2428 else if (oclength > 0 &&
2429 eptr <= md->end_subject - oclength &&
2430 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2431 #endif /* SUPPORT_UCP */
2432 else
2433 {
2434 CHECK_PARTIAL();
2435 RRETURN(MATCH_NOMATCH);
2436 }
2437 }
2438 /* Control never gets here */
2439 }
2440
2441 else /* Maximize */
2442 {
2443 pp = eptr;
2444 for (i = min; i < max; i++)
2445 {
2446 if (eptr <= md->end_subject - length &&
2447 memcmp(eptr, charptr, length) == 0) eptr += length;
2448 #ifdef SUPPORT_UCP
2449 else if (oclength > 0 &&
2450 eptr <= md->end_subject - oclength &&
2451 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2452 #endif /* SUPPORT_UCP */
2453 else break;
2454 }
2455
2456 CHECK_PARTIAL();
2457 if (possessive) continue;
2458
2459 for(;;)
2460 {
2461 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2462 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2463 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2464 #ifdef SUPPORT_UCP
2465 eptr--;
2466 BACKCHAR(eptr);
2467 #else /* without SUPPORT_UCP */
2468 eptr -= length;
2469 #endif /* SUPPORT_UCP */
2470 }
2471 }
2472 /* Control never gets here */
2473 }
2474
2475 /* If the length of a UTF-8 character is 1, we fall through here, and
2476 obey the code as for non-UTF-8 characters below, though in this case the
2477 value of fc will always be < 128. */
2478 }
2479 else
2480 #endif /* SUPPORT_UTF8 */
2481
2482 /* When not in UTF-8 mode, load a single-byte character. */
2483
2484 fc = *ecode++;
2485
2486 /* The value of fc at this point is always less than 256, though we may or
2487 may not be in UTF-8 mode. The code is duplicated for the caseless and
2488 caseful cases, for speed, since matching characters is likely to be quite
2489 common. First, ensure the minimum number of matches are present. If min =
2490 max, continue at the same level without recursing. Otherwise, if
2491 minimizing, keep trying the rest of the expression and advancing one
2492 matching character if failing, up to the maximum. Alternatively, if
2493 maximizing, find the maximum number of characters and work backwards. */
2494
2495 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2496 max, eptr));
2497
2498 if ((ims & PCRE_CASELESS) != 0)
2499 {
2500 fc = md->lcc[fc];
2501 for (i = 1; i <= min; i++)
2502 {
2503 if (eptr >= md->end_subject)
2504 {
2505 SCHECK_PARTIAL();
2506 RRETURN(MATCH_NOMATCH);
2507 }
2508 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2509 }
2510 if (min == max) continue;
2511 if (minimize)
2512 {
2513 for (fi = min;; fi++)
2514 {
2515 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2516 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2517 if (fi >= max)
2518 {
2519 CHECK_PARTIAL();
2520 RRETURN(MATCH_NOMATCH);
2521 }
2522 if (eptr >= md->end_subject)
2523 {
2524 SCHECK_PARTIAL();
2525 RRETURN(MATCH_NOMATCH);
2526 }
2527 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2528 }
2529 /* Control never gets here */
2530 }
2531 else /* Maximize */
2532 {
2533 pp = eptr;
2534 for (i = min; i < max; i++)
2535 {
2536 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2537 eptr++;
2538 }
2539
2540 CHECK_PARTIAL();
2541 if (possessive) continue;
2542
2543 while (eptr >= pp)
2544 {
2545 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2546 eptr--;
2547 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2548 }
2549 RRETURN(MATCH_NOMATCH);
2550 }
2551 /* Control never gets here */
2552 }
2553
2554 /* Caseful comparisons (includes all multi-byte characters) */
2555
2556 else
2557 {
2558 for (i = 1; i <= min; i++)
2559 {
2560 if (eptr >= md->end_subject)
2561 {
2562 SCHECK_PARTIAL();
2563 RRETURN(MATCH_NOMATCH);
2564 }
2565 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2566 }
2567 if (min == max) continue;
2568 if (minimize)
2569 {
2570 for (fi = min;; fi++)
2571 {
2572 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2573 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2574 if (fi >= max)
2575 {
2576 CHECK_PARTIAL();
2577 RRETURN(MATCH_NOMATCH);
2578 }
2579 if (eptr >= md->end_subject)
2580 {
2581 SCHECK_PARTIAL();
2582 RRETURN(MATCH_NOMATCH);
2583 }
2584 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2585 }
2586 /* Control never gets here */
2587 }
2588 else /* Maximize */
2589 {
2590 pp = eptr;
2591 for (i = min; i < max; i++)
2592 {
2593 if (eptr >= md->end_subject || fc != *eptr) break;
2594 eptr++;
2595 }
2596 CHECK_PARTIAL();
2597 if (possessive) continue;
2598 while (eptr >= pp)
2599 {
2600 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2601 eptr--;
2602 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2603 }
2604 RRETURN(MATCH_NOMATCH);
2605 }
2606 }
2607 /* Control never gets here */
2608
2609 /* Match a negated single one-byte character. The subject character we are
2610 checking can be multibyte. */
2611
2612 case OP_NOT:
2613 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2614 ecode++;
2615 GETCHARINCTEST(c, eptr);
2616 if ((ims & PCRE_CASELESS) != 0)
2617 {
2618 #ifdef SUPPORT_UTF8
2619 if (c < 256)
2620 #endif
2621 c = md->lcc[c];
2622 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2623 }
2624 else
2625 {
2626 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2627 }
2628 break;
2629
2630 /* Match a negated single one-byte character repeatedly. This is almost a
2631 repeat of the code for a repeated single character, but I haven't found a
2632 nice way of commoning these up that doesn't require a test of the
2633 positive/negative option for each character match. Maybe that wouldn't add
2634 very much to the time taken, but character matching *is* what this is all
2635 about... */
2636
2637 case OP_NOTEXACT:
2638 min = max = GET2(ecode, 1);
2639 ecode += 3;
2640 goto REPEATNOTCHAR;
2641
2642 case OP_NOTUPTO:
2643 case OP_NOTMINUPTO:
2644 min = 0;
2645 max = GET2(ecode, 1);
2646 minimize = *ecode == OP_NOTMINUPTO;
2647 ecode += 3;
2648 goto REPEATNOTCHAR;
2649
2650 case OP_NOTPOSSTAR:
2651 possessive = TRUE;
2652 min = 0;
2653 max = INT_MAX;
2654 ecode++;
2655 goto REPEATNOTCHAR;
2656
2657 case OP_NOTPOSPLUS:
2658 possessive = TRUE;
2659 min = 1;
2660 max = INT_MAX;
2661 ecode++;
2662 goto REPEATNOTCHAR;
2663
2664 case OP_NOTPOSQUERY:
2665 possessive = TRUE;
2666 min = 0;
2667 max = 1;
2668 ecode++;
2669 goto REPEATNOTCHAR;
2670
2671 case OP_NOTPOSUPTO:
2672 possessive = TRUE;
2673 min = 0;
2674 max = GET2(ecode, 1);
2675 ecode += 3;
2676 goto REPEATNOTCHAR;
2677
2678 case OP_NOTSTAR:
2679 case OP_NOTMINSTAR:
2680 case OP_NOTPLUS:
2681 case OP_NOTMINPLUS:
2682 case OP_NOTQUERY:
2683 case OP_NOTMINQUERY:
2684 c = *ecode++ - OP_NOTSTAR;
2685 minimize = (c & 1) != 0;
2686 min = rep_min[c]; /* Pick up values from tables; */
2687 max = rep_max[c]; /* zero for max => infinity */
2688 if (max == 0) max = INT_MAX;
2689
2690 /* Common code for all repeated single-byte matches. */
2691
2692 REPEATNOTCHAR:
2693 fc = *ecode++;
2694
2695 /* The code is duplicated for the caseless and caseful cases, for speed,
2696 since matching characters is likely to be quite common. First, ensure the
2697 minimum number of matches are present. If min = max, continue at the same
2698 level without recursing. Otherwise, if minimizing, keep trying the rest of
2699 the expression and advancing one matching character if failing, up to the
2700 maximum. Alternatively, if maximizing, find the maximum number of
2701 characters and work backwards. */
2702
2703 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2704 max, eptr));
2705
2706 if ((ims & PCRE_CASELESS) != 0)
2707 {
2708 fc = md->lcc[fc];
2709
2710 #ifdef SUPPORT_UTF8
2711 /* UTF-8 mode */
2712 if (utf8)
2713 {
2714 register unsigned int d;
2715 for (i = 1; i <= min; i++)
2716 {
2717 if (eptr >= md->end_subject)
2718 {
2719 SCHECK_PARTIAL();
2720 RRETURN(MATCH_NOMATCH);
2721 }
2722 GETCHARINC(d, eptr);
2723 if (d < 256) d = md->lcc[d];
2724 if (fc == d) RRETURN(MATCH_NOMATCH);
2725 }
2726 }
2727 else
2728 #endif
2729
2730 /* Not UTF-8 mode */
2731 {
2732 for (i = 1; i <= min; i++)
2733 {
2734 if (eptr >= md->end_subject)
2735 {
2736 SCHECK_PARTIAL();
2737 RRETURN(MATCH_NOMATCH);
2738 }
2739 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2740 }
2741 }
2742
2743 if (min == max) continue;
2744
2745 if (minimize)
2746 {
2747 #ifdef SUPPORT_UTF8
2748 /* UTF-8 mode */
2749 if (utf8)
2750 {
2751 register unsigned int d;
2752 for (fi = min;; fi++)
2753 {
2754 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2755 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2756 if (fi >= max)
2757 {
2758 CHECK_PARTIAL();
2759 RRETURN(MATCH_NOMATCH);
2760 }
2761 if (eptr >= md->end_subject)
2762 {
2763 SCHECK_PARTIAL();
2764 RRETURN(MATCH_NOMATCH);
2765 }
2766 GETCHARINC(d, eptr);
2767 if (d < 256) d = md->lcc[d];
2768 if (fc == d) RRETURN(MATCH_NOMATCH);
2769 }
2770 }
2771 else
2772 #endif
2773 /* Not UTF-8 mode */
2774 {
2775 for (fi = min;; fi++)
2776 {
2777 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2779 if (fi >= max)
2780 {
2781 CHECK_PARTIAL();
2782 RRETURN(MATCH_NOMATCH);
2783 }
2784 if (eptr >= md->end_subject)
2785 {
2786 SCHECK_PARTIAL();
2787 RRETURN(MATCH_NOMATCH);
2788 }
2789 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2790 }
2791 }
2792 /* Control never gets here */
2793 }
2794
2795 /* Maximize case */
2796
2797 else
2798 {
2799 pp = eptr;
2800
2801 #ifdef SUPPORT_UTF8
2802 /* UTF-8 mode */
2803 if (utf8)
2804 {
2805 register unsigned int d;
2806 for (i = min; i < max; i++)
2807 {
2808 int len = 1;
2809 if (eptr >= md->end_subject) break;
2810 GETCHARLEN(d, eptr, len);
2811 if (d < 256) d = md->lcc[d];
2812 if (fc == d) break;
2813 eptr += len;
2814 }
2815 CHECK_PARTIAL();
2816 if (possessive) continue;
2817 for(;;)
2818 {
2819 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821 if (eptr-- == pp) break; /* Stop if tried at original pos */
2822 BACKCHAR(eptr);
2823 }
2824 }
2825 else
2826 #endif
2827 /* Not UTF-8 mode */
2828 {
2829 for (i = min; i < max; i++)
2830 {
2831 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2832 eptr++;
2833 }
2834 CHECK_PARTIAL();
2835 if (possessive) continue;
2836 while (eptr >= pp)
2837 {
2838 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2840 eptr--;
2841 }
2842 }
2843
2844 RRETURN(MATCH_NOMATCH);
2845 }
2846 /* Control never gets here */
2847 }
2848
2849 /* Caseful comparisons */
2850
2851 else
2852 {
2853 #ifdef SUPPORT_UTF8
2854 /* UTF-8 mode */
2855 if (utf8)
2856 {
2857 register unsigned int d;
2858 for (i = 1; i <= min; i++)
2859 {
2860 if (eptr >= md->end_subject)
2861 {
2862 SCHECK_PARTIAL();
2863 RRETURN(MATCH_NOMATCH);
2864 }
2865 GETCHARINC(d, eptr);
2866 if (fc == d) RRETURN(MATCH_NOMATCH);
2867 }
2868 }
2869 else
2870 #endif
2871 /* Not UTF-8 mode */
2872 {
2873 for (i = 1; i <= min; i++)
2874 {
2875 if (eptr >= md->end_subject)
2876 {
2877 SCHECK_PARTIAL();
2878 RRETURN(MATCH_NOMATCH);
2879 }
2880 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2881 }
2882 }
2883
2884 if (min == max) continue;
2885
2886 if (minimize)
2887 {
2888 #ifdef SUPPORT_UTF8
2889 /* UTF-8 mode */
2890 if (utf8)
2891 {
2892 register unsigned int d;
2893 for (fi = min;; fi++)
2894 {
2895 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2896 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2897 if (fi >= max)
2898 {
2899 CHECK_PARTIAL();
2900 RRETURN(MATCH_NOMATCH);
2901 }
2902 if (eptr >= md->end_subject)
2903 {
2904 SCHECK_PARTIAL();
2905 RRETURN(MATCH_NOMATCH);
2906 }
2907 GETCHARINC(d, eptr);
2908 if (fc == d) RRETURN(MATCH_NOMATCH);
2909 }
2910 }
2911 else
2912 #endif
2913 /* Not UTF-8 mode */
2914 {
2915 for (fi = min;; fi++)
2916 {
2917 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2918 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2919 if (fi >= max)
2920 {
2921 CHECK_PARTIAL();
2922 RRETURN(MATCH_NOMATCH);
2923 }
2924 if (eptr >= md->end_subject)
2925 {
2926 SCHECK_PARTIAL();
2927 RRETURN(MATCH_NOMATCH);
2928 }
2929 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2930 }
2931 }
2932 /* Control never gets here */
2933 }
2934
2935 /* Maximize case */
2936
2937 else
2938 {
2939 pp = eptr;
2940
2941 #ifdef SUPPORT_UTF8
2942 /* UTF-8 mode */
2943 if (utf8)
2944 {
2945 register unsigned int d;
2946 for (i = min; i < max; i++)
2947 {
2948 int len = 1;
2949 if (eptr >= md->end_subject) break;
2950 GETCHARLEN(d, eptr, len);
2951 if (fc == d) break;
2952 eptr += len;
2953 }
2954 CHECK_PARTIAL();
2955 if (possessive) continue;
2956 for(;;)
2957 {
2958 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2959 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2960 if (eptr-- == pp) break; /* Stop if tried at original pos */
2961 BACKCHAR(eptr);
2962 }
2963 }
2964 else
2965 #endif
2966 /* Not UTF-8 mode */
2967 {
2968 for (i = min; i < max; i++)
2969 {
2970 if (eptr >= md->end_subject || fc == *eptr) break;
2971 eptr++;
2972 }
2973 CHECK_PARTIAL();
2974 if (possessive) continue;
2975 while (eptr >= pp)
2976 {
2977 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2978 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2979 eptr--;
2980 }
2981 }
2982
2983 RRETURN(MATCH_NOMATCH);
2984 }
2985 }
2986 /* Control never gets here */
2987
2988 /* Match a single character type repeatedly; several different opcodes
2989 share code. This is very similar to the code for single characters, but we
2990 repeat it in the interests of efficiency. */
2991
2992 case OP_TYPEEXACT:
2993 min = max = GET2(ecode, 1);
2994 minimize = TRUE;
2995 ecode += 3;
2996 goto REPEATTYPE;
2997
2998 case OP_TYPEUPTO:
2999 case OP_TYPEMINUPTO:
3000 min = 0;
3001 max = GET2(ecode, 1);
3002 minimize = *ecode == OP_TYPEMINUPTO;
3003 ecode += 3;
3004 goto REPEATTYPE;
3005
3006 case OP_TYPEPOSSTAR:
3007 possessive = TRUE;
3008 min = 0;
3009 max = INT_MAX;
3010 ecode++;
3011 goto REPEATTYPE;
3012
3013 case OP_TYPEPOSPLUS:
3014 possessive = TRUE;
3015 min = 1;
3016 max = INT_MAX;
3017 ecode++;
3018 goto REPEATTYPE;
3019
3020 case OP_TYPEPOSQUERY:
3021 possessive = TRUE;
3022 min = 0;
3023 max = 1;
3024 ecode++;
3025 goto REPEATTYPE;
3026
3027 case OP_TYPEPOSUPTO:
3028 possessive = TRUE;
3029 min = 0;
3030 max = GET2(ecode, 1);
3031 ecode += 3;
3032 goto REPEATTYPE;
3033
3034 case OP_TYPESTAR:
3035 case OP_TYPEMINSTAR:
3036 case OP_TYPEPLUS:
3037 case OP_TYPEMINPLUS:
3038 case OP_TYPEQUERY:
3039 case OP_TYPEMINQUERY:
3040 c = *ecode++ - OP_TYPESTAR;
3041 minimize = (c & 1) != 0;
3042 min = rep_min[c]; /* Pick up values from tables; */
3043 max = rep_max[c]; /* zero for max => infinity */
3044 if (max == 0) max = INT_MAX;
3045
3046 /* Common code for all repeated single character type matches. Note that
3047 in UTF-8 mode, '.' matches a character of any length, but for the other
3048 character types, the valid characters are all one-byte long. */
3049
3050 REPEATTYPE:
3051 ctype = *ecode++; /* Code for the character type */
3052
3053 #ifdef SUPPORT_UCP
3054 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3055 {
3056 prop_fail_result = ctype == OP_NOTPROP;
3057 prop_type = *ecode++;
3058 prop_value = *ecode++;
3059 }
3060 else prop_type = -1;
3061 #endif
3062
3063 /* First, ensure the minimum number of matches are present. Use inline
3064 code for maximizing the speed, and do the type test once at the start
3065 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3066 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3067 and single-bytes. */
3068
3069 if (min > 0)
3070 {
3071 #ifdef SUPPORT_UCP
3072 if (prop_type >= 0)
3073 {
3074 switch(prop_type)
3075 {
3076 case PT_ANY:
3077 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3078 for (i = 1; i <= min; i++)
3079 {
3080 if (eptr >= md->end_subject)
3081 {
3082 SCHECK_PARTIAL();
3083 RRETURN(MATCH_NOMATCH);
3084 }
3085 GETCHARINCTEST(c, eptr);
3086 }
3087 break;
3088
3089 case PT_LAMP:
3090 for (i = 1; i <= min; i++)
3091 {
3092 if (eptr >= md->end_subject)
3093 {
3094 SCHECK_PARTIAL();
3095 RRETURN(MATCH_NOMATCH);
3096 }
3097 GETCHARINCTEST(c, eptr);
3098 prop_chartype = UCD_CHARTYPE(c);
3099 if ((prop_chartype == ucp_Lu ||
3100 prop_chartype == ucp_Ll ||
3101 prop_chartype == ucp_Lt) == prop_fail_result)
3102 RRETURN(MATCH_NOMATCH);
3103 }
3104 break;
3105
3106 case PT_GC:
3107 for (i = 1; i <= min; i++)
3108 {
3109 if (eptr >= md->end_subject)
3110 {
3111 SCHECK_PARTIAL();
3112 RRETURN(MATCH_NOMATCH);
3113 }
3114 GETCHARINCTEST(c, eptr);
3115 prop_category = UCD_CATEGORY(c);
3116 if ((prop_category == prop_value) == prop_fail_result)
3117 RRETURN(MATCH_NOMATCH);
3118 }
3119 break;
3120
3121 case PT_PC:
3122 for (i = 1; i <= min; i++)
3123 {
3124 if (eptr >= md->end_subject)
3125 {
3126 SCHECK_PARTIAL();
3127 RRETURN(MATCH_NOMATCH);
3128 }
3129 GETCHARINCTEST(c, eptr);
3130 prop_chartype = UCD_CHARTYPE(c);
3131 if ((prop_chartype == prop_value) == prop_fail_result)
3132 RRETURN(MATCH_NOMATCH);
3133 }
3134 break;
3135
3136 case PT_SC:
3137 for (i = 1; i <= min; i++)
3138 {
3139 if (eptr >= md->end_subject)
3140 {
3141 SCHECK_PARTIAL();
3142 RRETURN(MATCH_NOMATCH);
3143 }
3144 GETCHARINCTEST(c, eptr);
3145 prop_script = UCD_SCRIPT(c);
3146 if ((prop_script == prop_value) == prop_fail_result)
3147 RRETURN(MATCH_NOMATCH);
3148 }
3149 break;
3150
3151 default:
3152 RRETURN(PCRE_ERROR_INTERNAL);
3153 }
3154 }
3155
3156 /* Match extended Unicode sequences. We will get here only if the
3157 support is in the binary; otherwise a compile-time error occurs. */
3158
3159 else if (ctype == OP_EXTUNI)
3160 {
3161 for (i = 1; i <= min; i++)
3162 {
3163 if (eptr >= md->end_subject)
3164 {
3165 SCHECK_PARTIAL();
3166 RRETURN(MATCH_NOMATCH);
3167 }
3168 GETCHARINCTEST(c, eptr);
3169 prop_category = UCD_CATEGORY(c);
3170 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3171 while (eptr < md->end_subject)
3172 {
3173 int len = 1;
3174 if (!utf8) c = *eptr;
3175 else { GETCHARLEN(c, eptr, len); }
3176 prop_category = UCD_CATEGORY(c);
3177 if (prop_category != ucp_M) break;
3178 eptr += len;
3179 }
3180 }
3181 }
3182
3183 else
3184 #endif /* SUPPORT_UCP */
3185
3186 /* Handle all other cases when the coding is UTF-8 */
3187
3188 #ifdef SUPPORT_UTF8
3189 if (utf8) switch(ctype)
3190 {
3191 case OP_ANY:
3192 for (i = 1; i <= min; i++)
3193 {
3194 if (eptr >= md->end_subject)
3195 {
3196 SCHECK_PARTIAL();
3197 RRETURN(MATCH_NOMATCH);
3198 }
3199 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3200 eptr++;
3201 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3202 }
3203 break;
3204
3205 case OP_ALLANY:
3206 for (i = 1; i <= min; i++)
3207 {
3208 if (eptr >= md->end_subject)
3209 {
3210 SCHECK_PARTIAL();
3211 RRETURN(MATCH_NOMATCH);
3212 }
3213 eptr++;
3214 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3215 }
3216 break;
3217
3218 case OP_ANYBYTE:
3219 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3220 eptr += min;
3221 break;
3222
3223 case OP_ANYNL:
3224 for (i = 1; i <= min; i++)
3225 {
3226 if (eptr >= md->end_subject)
3227 {
3228 SCHECK_PARTIAL();
3229 RRETURN(MATCH_NOMATCH);
3230 }
3231 GETCHARINC(c, eptr);
3232 switch(c)
3233 {
3234 default: RRETURN(MATCH_NOMATCH);
3235 case 0x000d:
3236 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3237 break;
3238
3239 case 0x000a:
3240 break;
3241
3242 case 0x000b:
3243 case 0x000c:
3244 case 0x0085:
3245 case 0x2028:
3246 case 0x2029:
3247 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3248 break;
3249 }
3250 }
3251 break;
3252
3253 case OP_NOT_HSPACE:
3254 for (i = 1; i <= min; i++)
3255 {
3256 if (eptr >= md->end_subject)
3257 {
3258 SCHECK_PARTIAL();
3259 RRETURN(MATCH_NOMATCH);
3260 }
3261 GETCHARINC(c, eptr);
3262 switch(c)
3263 {
3264 default: break;
3265 case 0x09: /* HT */
3266 case 0x20: /* SPACE */
3267 case 0xa0: /* NBSP */
3268 case 0x1680: /* OGHAM SPACE MARK */
3269 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3270 case 0x2000: /* EN QUAD */
3271 case 0x2001: /* EM QUAD */
3272 case 0x2002: /* EN SPACE */
3273 case 0x2003: /* EM SPACE */
3274 case 0x2004: /* THREE-PER-EM SPACE */
3275 case 0x2005: /* FOUR-PER-EM SPACE */
3276 case 0x2006: /* SIX-PER-EM SPACE */
3277 case 0x2007: /* FIGURE SPACE */
3278 case 0x2008: /* PUNCTUATION SPACE */
3279 case 0x2009: /* THIN SPACE */
3280 case 0x200A: /* HAIR SPACE */
3281 case 0x202f: /* NARROW NO-BREAK SPACE */
3282 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3283 case 0x3000: /* IDEOGRAPHIC SPACE */
3284 RRETURN(MATCH_NOMATCH);
3285 }
3286 }
3287 break;
3288
3289 case OP_HSPACE:
3290 for (i = 1; i <= min; i++)
3291 {
3292 if (eptr >= md->end_subject)
3293 {
3294 SCHECK_PARTIAL();
3295 RRETURN(MATCH_NOMATCH);
3296 }
3297 GETCHARINC(c, eptr);
3298 switch(c)
3299 {
3300 default: RRETURN(MATCH_NOMATCH);
3301 case 0x09: /* HT */
3302 case 0x20: /* SPACE */
3303 case 0xa0: /* NBSP */
3304 case 0x1680: /* OGHAM SPACE MARK */
3305 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3306 case 0x2000: /* EN QUAD */
3307 case 0x2001: /* EM QUAD */
3308 case 0x2002: /* EN SPACE */
3309 case 0x2003: /* EM SPACE */
3310 case 0x2004: /* THREE-PER-EM SPACE */
3311 case 0x2005: /* FOUR-PER-EM SPACE */
3312 case 0x2006: /* SIX-PER-EM SPACE */
3313 case 0x2007: /* FIGURE SPACE */
3314 case 0x2008: /* PUNCTUATION SPACE */
3315 case 0x2009: /* THIN SPACE */
3316 case 0x200A: /* HAIR SPACE */
3317 case 0x202f: /* NARROW NO-BREAK SPACE */
3318 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3319 case 0x3000: /* IDEOGRAPHIC SPACE */
3320 break;
3321 }
3322 }
3323 break;
3324
3325 case OP_NOT_VSPACE:
3326 for (i = 1; i <= min; i++)
3327 {
3328 if (eptr >= md->end_subject)
3329 {
3330 SCHECK_PARTIAL();
3331 RRETURN(MATCH_NOMATCH);
3332 }
3333 GETCHARINC(c, eptr);
3334 switch(c)
3335 {
3336 default: break;
3337 case 0x0a: /* LF */
3338 case 0x0b: /* VT */
3339 case 0x0c: /* FF */
3340 case 0x0d: /* CR */
3341 case 0x85: /* NEL */
3342 case 0x2028: /* LINE SEPARATOR */
3343 case 0x2029: /* PARAGRAPH SEPARATOR */
3344 RRETURN(MATCH_NOMATCH);
3345 }
3346 }
3347 break;
3348
3349 case OP_VSPACE:
3350 for (i = 1; i <= min; i++)
3351 {
3352 if (eptr >= md->end_subject)
3353 {
3354 SCHECK_PARTIAL();
3355 RRETURN(MATCH_NOMATCH);
3356 }
3357 GETCHARINC(c, eptr);
3358 switch(c)
3359 {
3360 default: RRETURN(MATCH_NOMATCH);
3361 case 0x0a: /* LF */
3362 case 0x0b: /* VT */
3363 case 0x0c: /* FF */
3364 case 0x0d: /* CR */
3365 case 0x85: /* NEL */
3366 case 0x2028: /* LINE SEPARATOR */
3367 case 0x2029: /* PARAGRAPH SEPARATOR */
3368 break;
3369 }
3370 }
3371 break;
3372
3373 case OP_NOT_DIGIT:
3374 for (i = 1; i <= min; i++)
3375 {
3376 if (eptr >= md->end_subject)
3377 {
3378 SCHECK_PARTIAL();
3379 RRETURN(MATCH_NOMATCH);
3380 }
3381 GETCHARINC(c, eptr);
3382 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3383 RRETURN(MATCH_NOMATCH);
3384 }
3385 break;
3386
3387 case OP_DIGIT:
3388 for (i = 1; i <= min; i++)
3389 {
3390 if (eptr >= md->end_subject)
3391 {
3392 SCHECK_PARTIAL();
3393 RRETURN(MATCH_NOMATCH);
3394 }
3395 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3396 RRETURN(MATCH_NOMATCH);
3397 /* No need to skip more bytes - we know it's a 1-byte character */
3398 }
3399 break;
3400
3401 case OP_NOT_WHITESPACE:
3402 for (i = 1; i <= min; i++)
3403 {
3404 if (eptr >= md->end_subject)
3405 {
3406 SCHECK_PARTIAL();
3407 RRETURN(MATCH_NOMATCH);
3408 }
3409 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3410 RRETURN(MATCH_NOMATCH);
3411 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3412 }
3413 break;
3414
3415 case OP_WHITESPACE:
3416 for (i = 1; i <= min; i++)
3417 {
3418 if (eptr >= md->end_subject)
3419 {
3420 SCHECK_PARTIAL();
3421 RRETURN(MATCH_NOMATCH);
3422 }
3423 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3424 RRETURN(MATCH_NOMATCH);
3425 /* No need to skip more bytes - we know it's a 1-byte character */
3426 }
3427 break;
3428
3429 case OP_NOT_WORDCHAR:
3430 for (i = 1; i <= min; i++)
3431 {
3432 if (eptr >= md->end_subject ||
3433 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3434 RRETURN(MATCH_NOMATCH);
3435 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3436 }
3437 break;
3438
3439 case OP_WORDCHAR:
3440 for (i = 1; i <= min; i++)
3441 {
3442 if (eptr >= md->end_subject)
3443 {
3444 SCHECK_PARTIAL();
3445 RRETURN(MATCH_NOMATCH);
3446 }
3447 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3448 RRETURN(MATCH_NOMATCH);
3449 /* No need to skip more bytes - we know it's a 1-byte character */
3450 }
3451 break;
3452
3453 default:
3454 RRETURN(PCRE_ERROR_INTERNAL);
3455 } /* End switch(ctype) */
3456
3457 else
3458 #endif /* SUPPORT_UTF8 */
3459
3460 /* Code for the non-UTF-8 case for minimum matching of operators other
3461 than OP_PROP and OP_NOTPROP. */
3462
3463 switch(ctype)
3464 {
3465 case OP_ANY:
3466 for (i = 1; i <= min; i++)
3467 {
3468 if (eptr >= md->end_subject)
3469 {
3470 SCHECK_PARTIAL();
3471 RRETURN(MATCH_NOMATCH);
3472 }
3473 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3474 eptr++;
3475 }
3476 break;
3477
3478 case OP_ALLANY:
3479 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3480 eptr += min;
3481 break;
3482
3483 case OP_ANYBYTE:
3484 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3485 eptr += min;
3486 break;
3487
3488 case OP_ANYNL:
3489 for (i = 1; i <= min; i++)
3490 {
3491 if (eptr >= md->end_subject)
3492 {
3493 SCHECK_PARTIAL();
3494 RRETURN(MATCH_NOMATCH);
3495 }
3496 switch(*eptr++)
3497 {
3498 default: RRETURN(MATCH_NOMATCH);
3499 case 0x000d:
3500 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3501 break;
3502 case 0x000a:
3503 break;
3504
3505 case 0x000b:
3506 case 0x000c:
3507 case 0x0085:
3508 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3509 break;
3510 }
3511 }
3512 break;
3513
3514 case OP_NOT_HSPACE:
3515 for (i = 1; i <= min; i++)
3516 {
3517 if (eptr >= md->end_subject)
3518 {
3519 SCHECK_PARTIAL();
3520 RRETURN(MATCH_NOMATCH);
3521 }
3522 switch(*eptr++)
3523 {
3524 default: break;
3525 case 0x09: /* HT */
3526 case 0x20: /* SPACE */
3527 case 0xa0: /* NBSP */
3528 RRETURN(MATCH_NOMATCH);
3529 }
3530 }
3531 break;
3532
3533 case OP_HSPACE:
3534 for (i = 1; i <= min; i++)
3535 {
3536 if (eptr >= md->end_subject)
3537 {
3538 SCHECK_PARTIAL();
3539 RRETURN(MATCH_NOMATCH);
3540 }
3541 switch(*eptr++)
3542 {
3543 default: RRETURN(MATCH_NOMATCH);
3544 case 0x09: /* HT */
3545 case 0x20: /* SPACE */
3546 case 0xa0: /* NBSP */
3547 break;
3548 }
3549 }
3550 break;
3551
3552 case OP_NOT_VSPACE:
3553 for (i = 1; i <= min; i++)
3554 {
3555 if (eptr >= md->end_subject)
3556 {
3557 SCHECK_PARTIAL();
3558 RRETURN(MATCH_NOMATCH);
3559 }
3560 switch(*eptr++)
3561 {
3562 default: break;
3563 case 0x0a: /* LF */
3564 case 0x0b: /* VT */
3565 case 0x0c: /* FF */
3566 case 0x0d: /* CR */
3567 case 0x85: /* NEL */
3568 RRETURN(MATCH_NOMATCH);
3569 }
3570 }
3571 break;
3572
3573 case OP_VSPACE:
3574 for (i = 1; i <= min; i++)
3575 {
3576 if (eptr >= md->end_subject)
3577 {
3578 SCHECK_PARTIAL();
3579 RRETURN(MATCH_NOMATCH);
3580 }
3581 switch(*eptr++)
3582 {
3583 default: RRETURN(MATCH_NOMATCH);
3584 case 0x0a: /* LF */
3585 case 0x0b: /* VT */
3586 case 0x0c: /* FF */
3587 case 0x0d: /* CR */
3588 case 0x85: /* NEL */
3589 break;
3590 }
3591 }
3592 break;
3593
3594 case OP_NOT_DIGIT:
3595 for (i = 1; i <= min; i++)
3596 {
3597 if (eptr >= md->end_subject)
3598 {
3599 SCHECK_PARTIAL();
3600 RRETURN(MATCH_NOMATCH);
3601 }
3602 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3603 }
3604 break;
3605
3606 case OP_DIGIT:
3607 for (i = 1; i <= min; i++)
3608 {
3609 if (eptr >= md->end_subject)
3610 {
3611 SCHECK_PARTIAL();
3612 RRETURN(MATCH_NOMATCH);
3613 }
3614 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3615 }
3616 break;
3617
3618 case OP_NOT_WHITESPACE:
3619 for (i = 1; i <= min; i++)
3620 {
3621 if (eptr >= md->end_subject)
3622 {
3623 SCHECK_PARTIAL();
3624 RRETURN(MATCH_NOMATCH);
3625 }
3626 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3627 }
3628 break;
3629
3630 case OP_WHITESPACE:
3631 for (i = 1; i <= min; i++)
3632 {
3633 if (eptr >= md->end_subject)
3634 {
3635 SCHECK_PARTIAL();
3636 RRETURN(MATCH_NOMATCH);
3637 }
3638 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3639 }
3640 break;
3641
3642 case OP_NOT_WORDCHAR:
3643 for (i = 1; i <= min; i++)
3644 {
3645 if (eptr >= md->end_subject)
3646 {
3647 SCHECK_PARTIAL();
3648 RRETURN(MATCH_NOMATCH);
3649 }
3650 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3651 RRETURN(MATCH_NOMATCH);
3652 }
3653 break;
3654
3655 case OP_WORDCHAR:
3656 for (i = 1; i <= min; i++)
3657 {
3658 if (eptr >= md->end_subject)
3659 {
3660 SCHECK_PARTIAL();
3661 RRETURN(MATCH_NOMATCH);
3662 }
3663 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3664 RRETURN(MATCH_NOMATCH);
3665 }
3666 break;
3667
3668 default:
3669 RRETURN(PCRE_ERROR_INTERNAL);
3670 }
3671 }
3672
3673 /* If min = max, continue at the same level without recursing */
3674
3675 if (min == max) continue;
3676
3677 /* If minimizing, we have to test the rest of the pattern before each
3678 subsequent match. Again, separate the UTF-8 case for speed, and also
3679 separate the UCP cases. */
3680
3681 if (minimize)
3682 {
3683 #ifdef SUPPORT_UCP
3684 if (prop_type >= 0)
3685 {
3686 switch(prop_type)
3687 {
3688 case PT_ANY:
3689 for (fi = min;; fi++)
3690 {
3691 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3692 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3693 if (fi >= max)
3694 {
3695 CHECK_PARTIAL();
3696 RRETURN(MATCH_NOMATCH);
3697 }
3698 if (eptr >= md->end_subject)
3699 {
3700 SCHECK_PARTIAL();
3701 RRETURN(MATCH_NOMATCH);
3702 }
3703 GETCHARINC(c, eptr);
3704 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3705 }
3706 /* Control never gets here */
3707
3708 case PT_LAMP:
3709 for (fi = min;; fi++)
3710 {
3711 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3712 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3713 if (fi >= max)
3714 {
3715 CHECK_PARTIAL();
3716 RRETURN(MATCH_NOMATCH);
3717 }
3718 if (eptr >= md->end_subject)
3719 {
3720 SCHECK_PARTIAL();
3721 RRETURN(MATCH_NOMATCH);
3722 }
3723 GETCHARINC(c, eptr);
3724 prop_chartype = UCD_CHARTYPE(c);
3725 if ((prop_chartype == ucp_Lu ||
3726 prop_chartype == ucp_Ll ||
3727 prop_chartype == ucp_Lt) == prop_fail_result)
3728 RRETURN(MATCH_NOMATCH);
3729 }
3730 /* Control never gets here */
3731
3732 case PT_GC:
3733 for (fi = min;; fi++)
3734 {
3735 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3736 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3737 if (fi >= max)
3738 {
3739 CHECK_PARTIAL();
3740 RRETURN(MATCH_NOMATCH);
3741 }
3742 if (eptr >= md->end_subject)
3743 {
3744 SCHECK_PARTIAL();
3745 RRETURN(MATCH_NOMATCH);
3746 }
3747 GETCHARINC(c, eptr);
3748 prop_category = UCD_CATEGORY(c);
3749 if ((prop_category == prop_value) == prop_fail_result)
3750 RRETURN(MATCH_NOMATCH);
3751 }
3752 /* Control never gets here */
3753
3754 case PT_PC:
3755 for (fi = min;; fi++)
3756 {
3757 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3759 if (fi >= max)
3760 {
3761 CHECK_PARTIAL();
3762 RRETURN(MATCH_NOMATCH);
3763 }
3764 if (eptr >= md->end_subject)
3765 {
3766 SCHECK_PARTIAL();
3767 RRETURN(MATCH_NOMATCH);
3768 }
3769 GETCHARINC(c, eptr);
3770 prop_chartype = UCD_CHARTYPE(c);
3771 if ((prop_chartype == prop_value) == prop_fail_result)
3772 RRETURN(MATCH_NOMATCH);
3773 }
3774 /* Control never gets here */
3775
3776 case PT_SC:
3777 for (fi = min;; fi++)
3778 {
3779 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3780 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3781 if (fi >= max)
3782 {
3783 CHECK_PARTIAL();
3784 RRETURN(MATCH_NOMATCH);
3785 }
3786 if (eptr >= md->end_subject)
3787 {
3788 SCHECK_PARTIAL();
3789 RRETURN(MATCH_NOMATCH);
3790 }
3791 GETCHARINC(c, eptr);
3792 prop_script = UCD_SCRIPT(c);
3793 if ((prop_script == prop_value) == prop_fail_result)
3794 RRETURN(MATCH_NOMATCH);
3795 }
3796 /* Control never gets here */
3797
3798 default:
3799 RRETURN(PCRE_ERROR_INTERNAL);
3800 }
3801 }
3802
3803 /* Match extended Unicode sequences. We will get here only if the
3804 support is in the binary; otherwise a compile-time error occurs. */
3805
3806 else if (ctype == OP_EXTUNI)
3807 {
3808 for (fi = min;; fi++)
3809 {
3810 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3812 if (fi >= max)
3813 {
3814 CHECK_PARTIAL();
3815 RRETURN(MATCH_NOMATCH);
3816 }
3817 if (eptr >= md->end_subject)
3818 {
3819 SCHECK_PARTIAL();
3820 RRETURN(MATCH_NOMATCH);
3821 }
3822 GETCHARINCTEST(c, eptr);
3823 prop_category = UCD_CATEGORY(c);
3824 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3825 while (eptr < md->end_subject)
3826 {
3827 int len = 1;
3828 if (!utf8) c = *eptr;
3829 else { GETCHARLEN(c, eptr, len); }
3830 prop_category = UCD_CATEGORY(c);
3831 if (prop_category != ucp_M) break;
3832 eptr += len;
3833 }
3834 }
3835 }
3836
3837 else
3838 #endif /* SUPPORT_UCP */
3839
3840 #ifdef SUPPORT_UTF8
3841 /* UTF-8 mode */
3842 if (utf8)
3843 {
3844 for (fi = min;; fi++)
3845 {
3846 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3848 if (fi >= max)
3849 {
3850 CHECK_PARTIAL();
3851 RRETURN(MATCH_NOMATCH);
3852 }
3853 if (eptr >= md->end_subject)
3854 {
3855 SCHECK_PARTIAL();
3856 RRETURN(MATCH_NOMATCH);
3857 }
3858 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3859 RRETURN(MATCH_NOMATCH);
3860 GETCHARINC(c, eptr);
3861 switch(ctype)
3862 {
3863 case OP_ANY: /* This is the non-NL case */
3864 case OP_ALLANY:
3865 case OP_ANYBYTE:
3866 break;
3867
3868 case OP_ANYNL:
3869 switch(c)
3870 {
3871 default: RRETURN(MATCH_NOMATCH);
3872 case 0x000d:
3873 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3874 break;
3875 case 0x000a:
3876 break;
3877
3878 case 0x000b:
3879 case 0x000c:
3880 case 0x0085:
3881 case 0x2028:
3882 case 0x2029:
3883 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3884 break;
3885 }
3886 break;
3887
3888 case OP_NOT_HSPACE:
3889 switch(c)
3890 {
3891 default: break;
3892 case 0x09: /* HT */
3893 case 0x20: /* SPACE */
3894 case 0xa0: /* NBSP */
3895 case 0x1680: /* OGHAM SPACE MARK */
3896 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3897 case 0x2000: /* EN QUAD */
3898 case 0x2001: /* EM QUAD */
3899 case 0x2002: /* EN SPACE */
3900 case 0x2003: /* EM SPACE */
3901 case 0x2004: /* THREE-PER-EM SPACE */
3902 case 0x2005: /* FOUR-PER-EM SPACE */
3903 case 0x2006: /* SIX-PER-EM SPACE */
3904 case 0x2007: /* FIGURE SPACE */
3905 case 0x2008: /* PUNCTUATION SPACE */
3906 case 0x2009: /* THIN SPACE */
3907 case 0x200A: /* HAIR SPACE */
3908 case 0x202f: /* NARROW NO-BREAK SPACE */
3909 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3910 case 0x3000: /* IDEOGRAPHIC SPACE */
3911 RRETURN(MATCH_NOMATCH);
3912 }
3913 break;
3914
3915 case OP_HSPACE:
3916 switch(c)
3917 {
3918 default: RRETURN(MATCH_NOMATCH);
3919 case 0x09: /* HT */
3920 case 0x20: /* SPACE */
3921 case 0xa0: /* NBSP */
3922 case 0x1680: /* OGHAM SPACE MARK */
3923 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3924 case 0x2000: /* EN QUAD */
3925 case 0x2001: /* EM QUAD */
3926 case 0x2002: /* EN SPACE */
3927 case 0x2003: /* EM SPACE */
3928 case 0x2004: /* THREE-PER-EM SPACE */
3929 case 0x2005: /* FOUR-PER-EM SPACE */
3930 case 0x2006: /* SIX-PER-EM SPACE */
3931 case 0x2007: /* FIGURE SPACE */
3932 case 0x2008: /* PUNCTUATION SPACE */
3933 case 0x2009: /* THIN SPACE */
3934 case 0x200A: /* HAIR SPACE */
3935 case 0x202f: /* NARROW NO-BREAK SPACE */
3936 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3937 case 0x3000: /* IDEOGRAPHIC SPACE */
3938 break;
3939 }
3940 break;
3941
3942 case OP_NOT_VSPACE:
3943 switch(c)
3944 {
3945 default: break;
3946 case 0x0a: /* LF */
3947 case 0x0b: /* VT */
3948 case 0x0c: /* FF */
3949 case 0x0d: /* CR */
3950 case 0x85: /* NEL */
3951 case 0x2028: /* LINE SEPARATOR */
3952 case 0x2029: /* PARAGRAPH SEPARATOR */
3953 RRETURN(MATCH_NOMATCH);
3954 }
3955 break;
3956
3957 case OP_VSPACE:
3958 switch(c)
3959 {
3960 default: RRETURN(MATCH_NOMATCH);
3961 case 0x0a: /* LF */
3962 case 0x0b: /* VT */
3963 case 0x0c: /* FF */
3964 case 0x0d: /* CR */
3965 case 0x85: /* NEL */
3966 case 0x2028: /* LINE SEPARATOR */
3967 case 0x2029: /* PARAGRAPH SEPARATOR */
3968 break;
3969 }
3970 break;
3971
3972 case OP_NOT_DIGIT:
3973 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3974 RRETURN(MATCH_NOMATCH);
3975 break;
3976
3977 case OP_DIGIT:
3978 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3979 RRETURN(MATCH_NOMATCH);
3980 break;
3981
3982 case OP_NOT_WHITESPACE:
3983 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3984 RRETURN(MATCH_NOMATCH);
3985 break;
3986
3987 case OP_WHITESPACE:
3988 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3989 RRETURN(MATCH_NOMATCH);
3990 break;
3991
3992 case OP_NOT_WORDCHAR:
3993 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3994 RRETURN(MATCH_NOMATCH);
3995 break;
3996
3997 case OP_WORDCHAR:
3998 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3999 RRETURN(MATCH_NOMATCH);
4000 break;
4001
4002 default:
4003 RRETURN(PCRE_ERROR_INTERNAL);
4004 }
4005 }
4006 }
4007 else
4008 #endif
4009 /* Not UTF-8 mode */
4010 {
4011 for (fi = min;; fi++)
4012 {
4013 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4015 if (fi >= max)
4016 {
4017 CHECK_PARTIAL();
4018 RRETURN(MATCH_NOMATCH);
4019 }
4020 if (eptr >= md->end_subject)
4021 {
4022 SCHECK_PARTIAL();
4023 RRETURN(MATCH_NOMATCH);
4024 }
4025 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4026 RRETURN(MATCH_NOMATCH);
4027 c = *eptr++;
4028 switch(ctype)
4029 {
4030 case OP_ANY: /* This is the non-NL case */
4031 case OP_ALLANY:
4032 case OP_ANYBYTE:
4033 break;
4034
4035 case OP_ANYNL:
4036 switch(c)
4037 {
4038 default: RRETURN(MATCH_NOMATCH);
4039 case 0x000d:
4040 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4041 break;
4042
4043 case 0x000a:
4044 break;
4045
4046 case 0x000b:
4047 case 0x000c:
4048 case 0x0085:
4049 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4050 break;
4051 }
4052 break;
4053
4054 case OP_NOT_HSPACE:
4055 switch(c)
4056 {
4057 default: break;
4058 case 0x09: /* HT */
4059 case 0x20: /* SPACE */
4060 case 0xa0: /* NBSP */
4061 RRETURN(MATCH_NOMATCH);
4062 }
4063 break;
4064
4065 case OP_HSPACE:
4066 switch(c)
4067 {
4068 default: RRETURN(MATCH_NOMATCH);
4069 case 0x09: /* HT */
4070 case 0x20: /* SPACE */
4071 case 0xa0: /* NBSP */
4072 break;
4073 }
4074 break;
4075
4076 case OP_NOT_VSPACE:
4077 switch(c)
4078 {
4079 default: break;
4080 case 0x0a: /* LF */
4081 case 0x0b: /* VT */
4082 case 0x0c: /* FF */
4083 case 0x0d: /* CR */
4084 case 0x85: /* NEL */
4085 RRETURN(MATCH_NOMATCH);
4086 }
4087 break;
4088
4089 case OP_VSPACE:
4090 switch(c)
4091 {
4092 default: RRETURN(MATCH_NOMATCH);
4093 case 0x0a: /* LF */
4094 case 0x0b: /* VT */
4095 case 0x0c: /* FF */
4096 case 0x0d: /* CR */
4097 case 0x85: /* NEL */
4098 break;
4099 }
4100 break;
4101
4102 case OP_NOT_DIGIT:
4103 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4104 break;
4105
4106 case OP_DIGIT:
4107 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4108 break;
4109
4110 case OP_NOT_WHITESPACE:
4111 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4112 break;
4113
4114 case OP_WHITESPACE:
4115 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4116 break;
4117
4118 case OP_NOT_WORDCHAR:
4119 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4120 break;
4121
4122 case OP_WORDCHAR:
4123 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4124 break;
4125
4126 default:
4127 RRETURN(PCRE_ERROR_INTERNAL);
4128 }
4129 }
4130 }
4131 /* Control never gets here */
4132 }
4133
4134 /* If maximizing, it is worth using inline code for speed, doing the type
4135 test once at the start (i.e. keep it out of the loop). Again, keep the
4136 UTF-8 and UCP stuff separate. */
4137
4138 else
4139 {
4140 pp = eptr; /* Remember where we started */
4141
4142 #ifdef SUPPORT_UCP
4143 if (prop_type >= 0)
4144 {
4145 switch(prop_type)
4146 {
4147 case PT_ANY:
4148 for (i = min; i < max; i++)
4149 {
4150 int len = 1;
4151 if (eptr >= md->end_subject) break;
4152 GETCHARLEN(c, eptr, len);
4153 if (prop_fail_result) break;
4154 eptr+= len;
4155 }
4156 break;
4157
4158 case PT_LAMP:
4159 for (i = min; i < max; i++)
4160 {
4161 int len = 1;
4162 if (eptr >= md->end_subject) break;
4163 GETCHARLEN(c, eptr, len);
4164 prop_chartype = UCD_CHARTYPE(c);
4165 if ((prop_chartype == ucp_Lu ||
4166 prop_chartype == ucp_Ll ||
4167 prop_chartype == ucp_Lt) == prop_fail_result)
4168 break;
4169 eptr+= len;
4170 }
4171 break;
4172
4173 case PT_GC:
4174 for (i = min; i < max; i++)
4175 {
4176 int len = 1;
4177 if (eptr >= md->end_subject) break;
4178 GETCHARLEN(c, eptr, len);
4179 prop_category = UCD_CATEGORY(c);
4180 if ((prop_category == prop_value) == prop_fail_result)
4181 break;
4182 eptr+= len;
4183 }
4184 break;
4185
4186 case PT_PC:
4187 for (i = min; i < max; i++)
4188 {
4189 int len = 1;
4190 if (eptr >= md->end_subject) break;
4191 GETCHARLEN(c, eptr, len);
4192 prop_chartype = UCD_CHARTYPE(c);
4193 if ((prop_chartype == prop_value) == prop_fail_result)
4194 break;
4195 eptr+= len;
4196 }
4197 break;
4198
4199 case PT_SC:
4200 for (i = min; i < max; i++)
4201 {
4202 int len = 1;
4203 if (eptr >= md->end_subject) break;
4204 GETCHARLEN(c, eptr, len);
4205 prop_script = UCD_SCRIPT(c);
4206 if ((prop_script == prop_value) == prop_fail_result)
4207 break;
4208 eptr+= len;
4209 }
4210 break;
4211 }
4212
4213 /* eptr is now past the end of the maximum run */
4214
4215 CHECK_PARTIAL();
4216 if (possessive) continue;
4217 for(;;)
4218 {
4219 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4220 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4221 if (eptr-- == pp) break; /* Stop if tried at original pos */
4222 if (utf8) BACKCHAR(eptr);
4223 }
4224 }
4225
4226 /* Match extended Unicode sequences. We will get here only if the
4227 support is in the binary; otherwise a compile-time error occurs. */
4228
4229 else if (ctype == OP_EXTUNI)
4230 {
4231 for (i = min; i < max; i++)
4232 {
4233 if (eptr >= md->end_subject) break;
4234 GETCHARINCTEST(c, eptr);
4235 prop_category = UCD_CATEGORY(c);
4236 if (prop_category == ucp_M) break;
4237 while (eptr < md->end_subject)
4238 {
4239 int len = 1;
4240 if (!utf8) c = *eptr; else
4241 {
4242 GETCHARLEN(c, eptr, len);
4243 }
4244 prop_category = UCD_CATEGORY(c);
4245 if (prop_category != ucp_M) break;
4246 eptr += len;
4247 }
4248 }
4249
4250 /* eptr is now past the end of the maximum run */
4251
4252 CHECK_PARTIAL();
4253 if (possessive) continue;
4254 for(;;)
4255 {
4256 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4257 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4258 if (eptr-- == pp) break; /* Stop if tried at original pos */
4259 for (;;) /* Move back over one extended */
4260 {
4261 int len = 1;
4262 if (!utf8) c = *eptr; else
4263 {
4264 BACKCHAR(eptr);
4265 GETCHARLEN(c, eptr, len);
4266 }
4267 prop_category = UCD_CATEGORY(c);
4268 if (prop_category != ucp_M) break;
4269 eptr--;
4270 }
4271 }
4272 }
4273
4274 else
4275 #endif /* SUPPORT_UCP */
4276
4277 #ifdef SUPPORT_UTF8
4278 /* UTF-8 mode */
4279
4280 if (utf8)
4281 {
4282 switch(ctype)
4283 {
4284 case OP_ANY:
4285 if (max < INT_MAX)
4286 {
4287 for (i = min; i < max; i++)
4288 {
4289 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4290 eptr++;
4291 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4292 }
4293 }
4294
4295 /* Handle unlimited UTF-8 repeat */
4296
4297 else
4298 {
4299 for (i = min; i < max; i++)
4300 {
4301 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4302 eptr++;
4303 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4304 }
4305 }
4306 break;
4307
4308 case OP_ALLANY:
4309 if (max < INT_MAX)
4310 {
4311 for (i = min; i < max; i++)
4312 {
4313 if (eptr >= md->end_subject) break;
4314 eptr++;
4315 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4316 }
4317 }
4318 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4319 break;
4320
4321 /* The byte case is the same as non-UTF8 */
4322
4323 case OP_ANYBYTE:
4324 c = max - min;
4325 if (c > (unsigned int)(md->end_subject - eptr))
4326 c = md->end_subject - eptr;
4327 eptr += c;
4328 break;
4329
4330 case OP_ANYNL:
4331 for (i = min; i < max; i++)
4332 {
4333 int len = 1;
4334 if (eptr >= md->end_subject) break;
4335 GETCHARLEN(c, eptr, len);
4336 if (c == 0x000d)
4337 {
4338 if (++eptr >= md->end_subject) break;
4339 if (*eptr == 0x000a) eptr++;
4340 }
4341 else
4342 {
4343 if (c != 0x000a &&
4344 (md->bsr_anycrlf ||
4345 (c != 0x000b && c != 0x000c &&
4346 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4347 break;
4348 eptr += len;
4349 }
4350 }
4351 break;
4352
4353 case OP_NOT_HSPACE:
4354 case OP_HSPACE:
4355 for (i = min; i < max; i++)
4356 {
4357 BOOL gotspace;
4358 int len = 1;
4359 if (eptr >= md->end_subject) break;
4360 GETCHARLEN(c, eptr, len);
4361 switch(c)
4362 {
4363 default: gotspace = FALSE; break;
4364 case 0x09: /* HT */
4365 case 0x20: /* SPACE */
4366 case 0xa0: /* NBSP */
4367 case 0x1680: /* OGHAM SPACE MARK */
4368 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4369 case 0x2000: /* EN QUAD */
4370 case 0x2001: /* EM QUAD */
4371 case 0x2002: /* EN SPACE */
4372 case 0x2003: /* EM SPACE */
4373 case 0x2004: /* THREE-PER-EM SPACE */
4374 case 0x2005: /* FOUR-PER-EM SPACE */
4375 case 0x2006: /* SIX-PER-EM SPACE */
4376 case 0x2007: /* FIGURE SPACE */
4377 case 0x2008: /* PUNCTUATION SPACE */
4378 case 0x2009: /* THIN SPACE */
4379 case 0x200A: /* HAIR SPACE */
4380 case 0x202f: /* NARROW NO-BREAK SPACE */
4381 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4382 case 0x3000: /* IDEOGRAPHIC SPACE */
4383 gotspace = TRUE;
4384 break;
4385 }
4386 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4387 eptr += len;
4388 }
4389 break;
4390
4391 case OP_NOT_VSPACE:
4392 case OP_VSPACE:
4393 for (i = min; i < max; i++)
4394 {
4395 BOOL gotspace;
4396 int len = 1;
4397 if (eptr >= md->end_subject) break;
4398 GETCHARLEN(c, eptr, len);
4399 switch(c)
4400 {
4401 default: gotspace = FALSE; break;
4402 case 0x0a: /* LF */
4403 case 0x0b: /* VT */
4404 case 0x0c: /* FF */
4405 case 0x0d: /* CR */
4406 case 0x85: /* NEL */
4407 case 0x2028: /* LINE SEPARATOR */
4408 case 0x2029: /* PARAGRAPH SEPARATOR */
4409 gotspace = TRUE;
4410 break;
4411 }
4412 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4413 eptr += len;
4414 }
4415 break;
4416
4417 case OP_NOT_DIGIT:
4418 for (i = min; i < max; i++)
4419 {
4420 int len = 1;
4421 if (eptr >= md->end_subject) break;
4422 GETCHARLEN(c, eptr, len);
4423 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4424 eptr+= len;
4425 }
4426 break;
4427
4428 case OP_DIGIT:
4429 for (i = min; i < max; i++)
4430 {
4431 int len = 1;
4432 if (eptr >= md->end_subject) break;
4433 GETCHARLEN(c, eptr, len);
4434 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4435 eptr+= len;
4436 }
4437 break;
4438
4439 case OP_NOT_WHITESPACE:
4440 for (i = min; i < max; i++)
4441 {
4442 int len = 1;
4443 if (eptr >= md->end_subject) break;
4444 GETCHARLEN(c, eptr, len);
4445 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4446 eptr+= len;
4447 }
4448 break;
4449
4450 case OP_WHITESPACE:
4451 for (i = min; i < max; i++)
4452 {
4453 int len = 1;
4454 if (eptr >= md->end_subject) break;
4455 GETCHARLEN(c, eptr, len);
4456 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4457 eptr+= len;
4458 }
4459 break;
4460
4461 case OP_NOT_WORDCHAR:
4462 for (i = min; i < max; i++)
4463 {
4464 int len = 1;
4465 if (eptr >= md->end_subject) break;
4466 GETCHARLEN(c, eptr, len);
4467 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4468 eptr+= len;
4469 }
4470 break;
4471
4472 case OP_WORDCHAR:
4473 for (i = min; i < max; i++)
4474 {
4475 int len = 1;
4476 if (eptr >= md->end_subject) break;
4477 GETCHARLEN(c, eptr, len);
4478 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4479 eptr+= len;
4480 }
4481 break;
4482
4483 default:
4484 RRETURN(PCRE_ERROR_INTERNAL);
4485 }
4486
4487 /* eptr is now past the end of the maximum run */
4488
4489 CHECK_PARTIAL();
4490 if (possessive) continue;
4491 for(;;)
4492 {
4493 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4494 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4495 if (eptr-- == pp) break; /* Stop if tried at original pos */
4496 BACKCHAR(eptr);
4497 }
4498 }
4499 else
4500 #endif /* SUPPORT_UTF8 */
4501
4502 /* Not UTF-8 mode */
4503 {
4504 switch(ctype)
4505 {
4506 case OP_ANY:
4507 for (i = min; i < max; i++)
4508 {
4509 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4510 eptr++;
4511 }
4512 break;
4513
4514 case OP_ALLANY:
4515 case OP_ANYBYTE:
4516 c = max - min;
4517 if (c > (unsigned int)(md->end_subject - eptr))
4518 c = md->end_subject - eptr;
4519 eptr += c;
4520 break;
4521
4522 case OP_ANYNL:
4523 for (i = min; i < max; i++)
4524 {
4525 if (eptr >= md->end_subject) break;
4526 c = *eptr;
4527 if (c == 0x000d)
4528 {
4529 if (++eptr >= md->end_subject) break;
4530 if (*eptr == 0x000a) eptr++;
4531 }
4532 else
4533 {
4534 if (c != 0x000a &&
4535 (md->bsr_anycrlf ||
4536 (c != 0x000b && c != 0x000c && c != 0x0085)))
4537 break;
4538 eptr++;
4539 }
4540 }
4541 break;
4542
4543 case OP_NOT_HSPACE:
4544 for (i = min; i < max; i++)
4545 {
4546 if (eptr >= md->end_subject) break;
4547 c = *eptr;
4548 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4549 eptr++;
4550 }
4551 break;
4552
4553 case OP_HSPACE:
4554 for (i = min; i < max; i++)
4555 {
4556 if (eptr >= md->end_subject) break;
4557 c = *eptr;
4558 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4559 eptr++;
4560 }
4561 break;
4562
4563 case OP_NOT_VSPACE:
4564 for (i = min; i < max; i++)
4565 {
4566 if (eptr >= md->end_subject) break;
4567 c = *eptr;
4568 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4569 break;
4570 eptr++;
4571 }
4572 break;
4573
4574 case OP_VSPACE:
4575 for (i = min; i < max; i++)
4576 {
4577 if (eptr >= md->end_subject) break;
4578 c = *eptr;
4579 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4580 break;
4581 eptr++;
4582 }
4583 break;
4584
4585 case OP_NOT_DIGIT:
4586 for (i = min; i < max; i++)
4587 {
4588 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4589 break;
4590 eptr++;
4591 }
4592 break;
4593
4594 case OP_DIGIT:
4595 for (i = min; i < max; i++)
4596 {
4597 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4598 break;
4599 eptr++;
4600 }
4601 break;
4602
4603 case OP_NOT_WHITESPACE:
4604 for (i = min; i < max; i++)
4605 {
4606 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4607 break;
4608 eptr++;
4609 }
4610 break;
4611
4612 case OP_WHITESPACE:
4613 for (i = min; i < max; i++)
4614 {
4615 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4616 break;
4617 eptr++;
4618 }
4619 break;
4620
4621 case OP_NOT_WORDCHAR:
4622 for (i = min; i < max; i++)
4623 {
4624 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4625 break;
4626 eptr++;
4627 }
4628 break;
4629
4630 case OP_WORDCHAR:
4631 for (i = min; i < max; i++)
4632 {
4633 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4634 break;
4635 eptr++;
4636 }
4637 break;
4638
4639 default:
4640 RRETURN(PCRE_ERROR_INTERNAL);
4641 }
4642
4643 /* eptr is now past the end of the maximum run */
4644
4645 CHECK_PARTIAL();
4646 if (possessive) continue;
4647 while (eptr >= pp)
4648 {
4649 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4650 eptr--;
4651 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4652 }
4653 }
4654
4655 /* Get here if we can't make it match with any permitted repetitions */
4656
4657 RRETURN(MATCH_NOMATCH);
4658 }
4659 /* Control never gets here */
4660
4661 /* There's been some horrible disaster. Arrival here can only mean there is
4662 something seriously wrong in the code above or the OP_xxx definitions. */
4663
4664 default:
4665 DPRINTF(("Unknown opcode %d\n", *ecode));
4666 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4667 }
4668
4669 /* Do not stick any code in here without much thought; it is assumed
4670 that "continue" in the code above comes out to here to repeat the main
4671 loop. */
4672
4673 } /* End of main loop */
4674 /* Control never reaches here */
4675
4676
4677 /* When compiling to use the heap rather than the stack for recursive calls to
4678 match(), the RRETURN() macro jumps here. The number that is saved in
4679 frame->Xwhere indicates which label we actually want to return to. */
4680
4681 #ifdef NO_RECURSE
4682 #define LBL(val) case val: goto L_RM##val;
4683 HEAP_RETURN:
4684 switch (frame->Xwhere)
4685 {
4686 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4687 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4688 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4689 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4690 LBL(53) LBL(54)
4691 #ifdef SUPPORT_UTF8
4692 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4693 LBL(32) LBL(34) LBL(42) LBL(46)
4694 #ifdef SUPPORT_UCP
4695 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4696 #endif /* SUPPORT_UCP */
4697 #endif /* SUPPORT_UTF8 */
4698 default:
4699 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4700 return PCRE_ERROR_INTERNAL;
4701 }
4702 #undef LBL
4703 #endif /* NO_RECURSE */
4704 }
4705
4706
4707 /***************************************************************************
4708 ****************************************************************************
4709 RECURSION IN THE match() FUNCTION
4710
4711 Undefine all the macros that were defined above to handle this, so that none of these short names is still defined as a macro in the code that follows. */
4712
4713 #ifdef NO_RECURSE /* The names in this list are undefined only in the NO_RECURSE build (heap rather than stack for recursive calls to match()); their #defines are earlier in the file, outside this view - presumably aliases for saved-frame fields, TODO confirm there. */
4714 #undef eptr
4715 #undef ecode
4716 #undef mstart
4717 #undef offset_top
4718 #undef ims
4719 #undef eptrb
4720 #undef flags
4721
4722 #undef callpat
4723 #undef charptr
4724 #undef data
4725 #undef next
4726 #undef pp
4727 #undef prev
4728 #undef saved_eptr
4729
4730 #undef new_recursive
4731
4732 #undef cur_is_word
4733 #undef condition
4734 #undef prev_is_word
4735
4736 #undef original_ims
4737
4738 #undef ctype
4739 #undef length
4740 #undef max
4741 #undef min
4742 #undef number
4743 #undef offset
4744 #undef op
4745 #undef save_capture_last
4746 #undef save_offset1
4747 #undef save_offset2
4748 #undef save_offset3
4749 #undef stacksave
4750
4751 #undef newptrb
4752
4753 #endif
4754
4755 /* These two are defined as macros in both cases (with and without NO_RECURSE), which is why they are undefined here, outside the #ifdef above. */
4756
4757 #undef fc
4758 #undef fi
4759
4760 /***************************************************************************
4761 ***************************************************************************/
4762
4763
4764
4765 /*************************************************
4766 * Execute a Regular Expression *
4767 *************************************************/
4768
4769 /* This function applies a compiled re to a subject string and picks out
4770 portions of the string if it matches. Two elements in the vector are set for
4771 each substring: the offsets to the start and end of the substring.
4772
4773 Arguments:
4774 argument_re points to the compiled expression
4775 extra_data points to extra data or is NULL
4776 subject points to the subject string
4777 length length of subject string (may contain binary zeros)
4778 start_offset where to start in the subject string
4779 options option bits
4780 offsets points to a vector of ints to be filled in with offsets
4781 offsetcount the number of elements in the vector
4782
4783 Returns: > 0 => success; value is the number of elements filled in
4784 = 0 => success, but offsets is not big enough
4785 -1 => failed to match
4786 < -1 => some kind of unexpected problem
4787 */
4788
4789 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4790 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4791 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4792 int offsetcount)
4793 {
4794 int rc, resetcount, ocount;
4795 int first_byte = -1;
4796 int req_byte = -1;
4797 int req_byte2 = -1;
4798 int newline;
4799 unsigned long int ims;
4800 BOOL using_temporary_offsets = FALSE;
4801 BOOL anchored;
4802 BOOL startline;
4803 BOOL firstline;
4804 BOOL first_byte_caseless = FALSE;
4805 BOOL req_byte_caseless = FALSE;
4806 BOOL utf8;
4807 match_data match_block;
4808 match_data *md = &match_block;
4809 const uschar *tables;
4810 const uschar *start_bits = NULL;
4811 USPTR start_match = (USPTR)subject + start_offset;
4812 USPTR end_subject;
4813 USPTR start_partial = NULL;
4814 USPTR req_byte_ptr = start_match - 1;
4815
4816 pcre_study_data internal_study;
4817 const pcre_study_data *study;
4818
4819 real_pcre internal_re;
4820 const real_pcre *external_re = (const real_pcre *)argument_re;
4821 const real_pcre *re = external_re;
4822
4823 /* Plausibility checks */
4824
4825 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4826 if (re == NULL || subject == NULL ||
4827 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4828 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4829
4830 /* Fish out the optional data from the extra_data structure, first setting
4831 the default values. */
4832
4833 study = NULL;
4834 md->match_limit = MATCH_LIMIT;
4835 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4836 md->callout_data = NULL;
4837
4838 /* The table pointer is always in native byte order. */
4839
4840 tables = external_re->tables;
4841
4842 if (extra_data != NULL)
4843 {
4844 register unsigned int flags = extra_data->flags;
4845 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4846 study = (const pcre_study_data *)extra_data->study_data;
4847 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4848 md->match_limit = extra_data->match_limit;
4849 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4850 md->match_limit_recursion = extra_data->match_limit_recursion;
4851 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4852 md->callout_data = extra_data->callout_data;
4853 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4854 }
4855
4856 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4857 is a feature that makes it possible to save compiled regex and re-use them
4858 in other programs later. */
4859
4860 if (tables == NULL) tables = _pcre_default_tables;
4861
4862 /* Check that the first field in the block is the magic number. If it is not,
4863 test for a regex that was compiled on a host of opposite endianness. If this is
4864 the case, flipped values are put in internal_re and internal_study if there was
4865 study data too. */
4866
4867 if (re->magic_number != MAGIC_NUMBER)
4868 {
4869 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4870 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4871 if (study != NULL) study = &internal_study;
4872 }
4873
4874 /* Set up other data */
4875
4876 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4877 startline = (re->flags & PCRE_STARTLINE) != 0;
4878 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4879
4880 /* The code starts after the real_pcre block and the capture name table. */
4881
4882 md->start_code = (const uschar *)external_re + re->name_table_offset +
4883 re->name_count * re->name_entry_size;
4884
4885 md->start_subject = (USPTR)subject;
4886 md->start_offset = start_offset;
4887 md->end_subject = md->start_subject + length;
4888 end_subject = md->end_subject;
4889
4890 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4891 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4892 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4893
4894 md->notbol = (options & PCRE_NOTBOL) != 0;
4895 md->noteol = (options & PCRE_NOTEOL) != 0;
4896 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4897 md->partial = (options & PCRE_PARTIAL) != 0;
4898 md->hitend = FALSE;
4899
4900 md->recursive = NULL; /* No recursion at top level */
4901
4902 md->lcc = tables + lcc_offset;
4903 md->ctypes = tables + ctypes_offset;
4904
4905 /* Handle different \R options. */
4906
4907 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4908 {
4909 case 0:
4910 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4911 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4912 else
4913 #ifdef BSR_ANYCRLF
4914 md->bsr_anycrlf = TRUE;
4915 #else
4916 md->bsr_anycrlf = FALSE;
4917 #endif
4918 break;
4919
4920 case PCRE_BSR_ANYCRLF:
4921 md->bsr_anycrlf = TRUE;
4922 break;
4923
4924 case PCRE_BSR_UNICODE:
4925 md->bsr_anycrlf = FALSE;
4926 break;
4927
4928 default: return PCRE_ERROR_BADNEWLINE;
4929 }
4930
4931 /* Handle different types of newline. The three bits give eight cases. If
4932 nothing is set at run time, whatever was used at compile time applies. */
4933
4934 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4935 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4936 {
4937 case 0: newline = NEWLINE; break; /* Compile-time default */
4938 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4939 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4940 case PCRE_NEWLINE_CR+
4941 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4942 case PCRE_NEWLINE_ANY: newline = -1; break;
4943 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4944 default: return PCRE_ERROR_BADNEWLINE;
4945 }
4946
4947 if (newline == -2)
4948 {
4949 md->nltype = NLTYPE_ANYCRLF;
4950 }
4951 else if (newline < 0)
4952 {
4953 md->nltype = NLTYPE_ANY;
4954 }
4955 else
4956 {
4957 md->nltype = NLTYPE_FIXED;
4958 if (newline > 255)
4959 {
4960 md->nllen = 2;
4961 md->nl[0] = (newline >> 8) & 255;
4962 md->nl[1] = newline & 255;
4963 }
4964 else
4965 {
4966 md->nllen = 1;
4967 md->nl[0] = newline;
4968 }
4969 }
4970
4971 /* Partial matching was originally supported only for a restricted set of
4972 regexes; from release 8.00 there are no restrictions, but the bits are still
4973 defined (though never set). So there's no harm in leaving this code. */
4974
4975 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4976 return PCRE_ERROR_BADPARTIAL;
4977
4978 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4979 back the character offset. */
4980
4981 #ifdef SUPPORT_UTF8
4982 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4983 {
4984 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
4985 return PCRE_ERROR_BADUTF8;
4986 if (start_offset > 0 && start_offset < length)
4987 {
4988 int tb = ((USPTR)subject)[start_offset];
4989 if (tb > 127)
4990 {
4991 tb &= 0xc0;
4992 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4993 }
4994 }
4995 }
4996 #endif
4997
4998 /* The ims options can vary during the matching as a result of the presence
4999 of (?ims) items in the pattern. They are kept in a local variable so that
5000 restoring at the exit of a group is easy. */
5001
5002 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5003
5004 /* If the expression has got more back references than the offsets supplied can
5005 hold, we get a temporary chunk of working store to use during the matching.
5006 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5007 of 3. */
5008
5009 ocount = offsetcount - (offsetcount % 3);
5010
5011 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5012 {
5013 ocount = re->top_backref * 3 + 3;
5014 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5015 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5016 using_temporary_offsets = TRUE;
5017 DPRINTF(("Got memory to hold back references\n"));
5018 }
5019 else md->offset_vector = offsets;
5020
5021 md->offset_end = ocount;
5022 md->offset_max = (2*ocount)/3;
5023 md->offset_overflow = FALSE;
5024 md->capture_last = -1;
5025
5026 /* Compute the minimum number of offsets that we need to reset each time. Doing
5027 this makes a huge difference to execution time when there aren't many brackets
5028 in the pattern. */
5029
5030 resetcount = 2 + re->top_bracket * 2;
5031 if (resetcount > offsetcount) resetcount = ocount;
5032
5033 /* Reset the working variable associated with each extraction. These should
5034 never be used unless previously set, but they get saved and restored, and so we
5035 initialize them to avoid reading uninitialized locations. */
5036
5037 if (md->offset_vector != NULL)
5038 {
5039 register int *iptr = md->offset_vector + ocount;
5040 register int *iend = iptr - resetcount/2 + 1;
5041 while (--iptr >= iend) *iptr = -1;
5042 }
5043
5044 /* Set up the first character to match, if available. The first_byte value is
5045 never set for an anchored regular expression, but the anchoring may be forced
5046 at run time, so we have to test for anchoring. The first char may be unset for
5047 an unanchored pattern, of course. If there's no first char and the pattern was
5048 studied, there may be a bitmap of possible first characters. */
5049
5050 if (!anchored)
5051 {
5052 if ((re->flags & PCRE_FIRSTSET) != 0)
5053 {
5054 first_byte = re->first_byte & 255;
5055 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5056 first_byte = md->lcc[first_byte];
5057 }
5058 else
5059 if (!startline && study != NULL &&
5060 (study->options & PCRE_STUDY_MAPPED) != 0)
5061 start_bits = study->start_bits;
5062 }
5063
5064 /* For anchored or unanchored matches, there may be a "last known required
5065 character" set. */
5066
5067 if ((re->flags & PCRE_REQCHSET) != 0)
5068 {
5069 req_byte = re->req_byte & 255;
5070 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5071 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5072 }
5073
5074
5075 /* ==========================================================================*/
5076
5077 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5078 the loop runs just once. */
5079
5080 for(;;)
5081 {
5082 USPTR save_end_subject = end_subject;
5083 USPTR new_start_match;
5084
5085 /* Reset the maximum number of extractions we might see. */
5086
5087 if (md->offset_vector != NULL)
5088 {
5089 register int *iptr = md->offset_vector;
5090 register int *iend = iptr + resetcount;
5091 while (iptr < iend) *iptr++ = -1;
5092 }
5093
5094 /* If firstline is TRUE, the start of the match is constrained to the first
5095 line of a multiline string. That is, the match must be before or at the first
5096 newline. Implement this by temporarily adjusting end_subject so that we stop
5097 scanning at a newline. If the match fails at the newline, later code breaks
5098 this loop. */
5099
5100 if (firstline)
5101 {
5102 USPTR t = start_match;
5103 #ifdef SUPPORT_UTF8
5104 if (utf8)
5105 {
5106 while (t < md->end_subject && !IS_NEWLINE(t))
5107 {
5108 t++;
5109 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5110 }
5111 }
5112 else
5113 #endif
5114 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5115 end_subject = t;
5116 }
5117
5118 /* There are some optimizations that avoid running the match if a known
5119 starting point is not found, or if a known later character is not present.
5120 However, there is an option that disables these, for testing and for ensuring
5121 that all callouts do actually occur. */
5122
5123 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5124 {
5125 /* Advance to a unique first byte if there is one. */
5126
5127 if (first_byte >= 0)
5128 {
5129 if (first_byte_caseless)
5130 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5131 start_match++;
5132 else
5133 while (start_match < end_subject && *start_match != first_byte)
5134 start_match++;
5135 }
5136
5137 /* Or to just after a linebreak for a multiline match */
5138
5139 else if (startline)
5140 {
5141 if (start_match > md->start_subject + start_offset)
5142 {
5143 #ifdef SUPPORT_UTF8
5144 if (utf8)
5145 {
5146 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5147 {
5148 start_match++;
5149 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5150 start_match++;
5151 }
5152 }
5153 else
5154 #endif
5155 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5156 start_match++;
5157
5158 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5159 and we are now at a LF, advance the match position by one more character.
5160 */
5161
5162 if (start_match[-1] == CHAR_CR &&
5163 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5164 start_match < end_subject &&
5165 *start_match == CHAR_NL)
5166 start_match++;
5167 }
5168 }
5169
5170 /* Or to a non-unique first byte after study */
5171
5172 else if (start_bits != NULL)
5173 {
5174 while (start_match < end_subject)
5175 {
5176 register unsigned int c = *start_match;
5177 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5178 else break;
5179 }
5180 }
5181 } /* Starting optimizations */
5182
5183 /* Restore fudged end_subject */
5184
5185 end_subject = save_end_subject;
5186
5187 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5188 printf(">>>> Match against: ");
5189 pchars(start_match, end_subject - start_match, TRUE, md);
5190 printf("\n");
5191 #endif
5192
5193 /* If req_byte is set, we know that that character must appear in the
5194 subject for the match to succeed. If the first character is set, req_byte
5195 must be later in the subject; otherwise the test starts at the match point.
5196 This optimization can save a huge amount of backtracking in patterns with
5197 nested unlimited repeats that aren't going to match. Writing separate code
5198 for cased/caseless versions makes it go faster, as does using an
5199 autoincrement and backing off on a match.
5200
5201 HOWEVER: when the subject string is very, very long, searching to its end
5202 can take a long time, and give bad performance on quite ordinary patterns.
5203 This showed up when somebody was matching something like /^\d+C/ on a
5204 32-megabyte string... so we don't do this when the string is sufficiently
5205 long.
5206
5207 ALSO: this processing is disabled when partial matching is requested, or if
5208 disabling is explicitly requested. */
5209
5210 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
5211 req_byte >= 0 &&
5212 end_subject - start_match < REQ_BYTE_MAX &&
5213 !md->partial)
5214 {
5215 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5216
5217 /* We don't need to repeat the search if we haven't yet reached the
5218 place we found it at last time. */
5219
5220 if (p > req_byte_ptr)
5221 {
5222 if (req_byte_caseless)
5223 {
5224 while (p < end_subject)
5225 {
5226 register int pp = *p++;
5227 if (pp == req_byte || pp == req_byte2) { p--; break; }
5228 }
5229 }
5230 else
5231 {
5232 while (p < end_subject)
5233 {
5234 if (*p++ == req_byte) { p--; break; }
5235 }
5236 }
5237
5238 /* If we can't find the required character, break the matching loop,
5239 forcing a match failure. */
5240
5241 if (p >= end_subject)
5242 {
5243 rc = MATCH_NOMATCH;
5244 break;
5245 }
5246
5247 /* If we have found the required character, save the point where we
5248 found it, so that we don't search again next time round the loop if
5249 the start hasn't passed this character yet. */
5250
5251 req_byte_ptr = p;
5252 }
5253 }
5254
5255 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5256 first starting point for which a partial match was found. */
5257
5258 md->start_match_ptr = start_match;
5259 md->match_call_count = 0;
5260 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5261 if (md->hitend && start_partial == NULL) start_partial = start_match;
5262
5263 switch(rc)
5264 {
5265 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5266 exactly like PRUNE. */
5267
5268 case MATCH_NOMATCH:
5269 case MATCH_PRUNE:
5270 case MATCH_THEN:
5271 new_start_match = start_match + 1;
5272 #ifdef SUPPORT_UTF8
5273 if (utf8)
5274 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5275 new_start_match++;
5276 #endif
5277 break;
5278
5279 /* SKIP passes back the next starting point explicitly. */
5280
5281 case MATCH_SKIP:
5282 new_start_match = md->start_match_ptr;
5283 break;
5284
5285 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5286
5287 case MATCH_COMMIT:
5288 rc = MATCH_NOMATCH;
5289 goto ENDLOOP;
5290
5291 /* Any other return is some kind of error. */
5292
5293 default:
5294 goto ENDLOOP;
5295 }
5296
5297 /* Control reaches here for the various types of "no match at this point"
5298 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5299
5300 rc = MATCH_NOMATCH;
5301
5302 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5303 newline in the subject (though it may continue over the newline). Therefore,
5304 if we have just failed to match, starting at a newline, do not continue. */
5305
5306 if (firstline && IS_NEWLINE(start_match)) break;
5307
5308 /* Advance to new matching position */
5309
5310 start_match = new_start_match;
5311
5312 /* Break the loop if the pattern is anchored or if we have passed the end of
5313 the subject. */
5314
5315 if (anchored || start_match > end_subject) break;
5316
5317 /* If we have just passed a CR and we are now at a LF, and the pattern does
5318 not contain any explicit matches for \r or \n, and the newline option is CRLF
5319 or ANY or ANYCRLF, advance the match position by one more character. */
5320
5321 if (start_match[-1] == CHAR_CR &&
5322 start_match < end_subject &&
5323 *start_match == CHAR_NL &&
5324 (re->flags & PCRE_HASCRORLF) == 0 &&
5325 (md->nltype == NLTYPE_ANY ||
5326 md->nltype == NLTYPE_ANYCRLF ||
5327 md->nllen == 2))
5328 start_match++;
5329
5330 } /* End of for(;;) "bumpalong" loop */
5331
5332 /* ==========================================================================*/
5333
5334 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5335 conditions is true:
5336
5337 (1) The pattern is anchored or the match was failed by (*COMMIT);
5338
5339 (2) We are past the end of the subject;
5340
5341 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5342 this option requests that a match occur at or before the first newline in
5343 the subject.
5344
5345 When we have a match and the offset vector is big enough to deal with any
5346 backreferences, captured substring offsets will already be set up. In the case
5347 where we had to get some local store to hold offsets for backreference
5348 processing, copy those that we can. In this case there need not be overflow if
5349 certain parts of the pattern were not used, even though there are more
5350 capturing parentheses than vector slots. */
5351
5352 ENDLOOP:
5353
5354 if (rc == MATCH_MATCH)
5355 {
5356 if (using_temporary_offsets)
5357 {
5358 if (offsetcount >= 4)
5359 {
5360 memcpy(offsets + 2, md->offset_vector + 2,
5361 (offsetcount - 2) * sizeof(int));
5362 DPRINTF(("Copied offsets from temporary memory\n"));
5363 }
5364 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5365 DPRINTF(("Freeing temporary memory\n"));
5366 (pcre_free)(md->offset_vector);
5367 }
5368
5369 /* Set the return code to the number of captured strings, or 0 if there are
5370 too many to fit into the vector. */
5371
5372 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5373
5374 /* If there is space, set up the whole thing as substring 0. The value of
5375 md->start_match_ptr might be modified if \K was encountered on the success
5376 matching path. */
5377
5378 if (offsetcount < 2) rc = 0; else
5379 {
5380 offsets[0] = md->start_match_ptr - md->start_subject;
5381 offsets[1] = md->end_match_ptr - md->start_subject;
5382 }
5383
5384 DPRINTF((">>>> returning %d\n", rc));
5385 return rc;
5386 }
5387
5388 /* Control gets here if there has been an error, or if the overall match
5389 attempt has failed at all permitted starting positions. */
5390
5391 if (using_temporary_offsets)
5392 {
5393 DPRINTF(("Freeing temporary memory\n"));
5394 (pcre_free)(md->offset_vector);
5395 }
5396
5397 if (rc != MATCH_NOMATCH)
5398 {
5399 DPRINTF((">>>> error: returning %d\n", rc));
5400 return rc;
5401 }
5402 else if (md->partial && start_partial != NULL)
5403 {
5404 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5405 if (offsetcount > 1)
5406 {
5407 offsets[0] = start_partial - (USPTR)subject;
5408 offsets[1] = end_subject - (USPTR)subject;
5409 }
5410 return PCRE_ERROR_PARTIAL;
5411 }
5412 else
5413 {
5414 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5415 return PCRE_ERROR_NOMATCH;
5416 }
5417 }
5418
5419 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12