/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 427 - (show annotations) (download)
Fri Aug 28 09:55:54 2009 UTC (5 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 161361 byte(s)
Add new PCRE_PARTIAL_HARD option.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested. Non-printing characters are output as \xhh escapes.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p; /* Clip to end of subject */
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* Compare subject text with a captured substring. If a back reference hasn't
126 been set, the length that is passed is greater than the number of characters
127 left in the string, so the match fails.
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched, FALSE otherwise
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset]; /* Referenced text */
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length; /* Stop after "length" subject bytes */
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. When the UCP code above is compiled in, this loop is the
186 body of its "else". */
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
243 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 RM51, RM52, RM53, RM54 };
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
255 #define REGISTER register
256
257 #ifdef DEBUG
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 { \
260 printf("match() called in line %d\n", __LINE__); \
261 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 printf("to line %d\n", __LINE__); \
263 }
264 #define RRETURN(ra) \
265 { \
266 printf("match() returned %d from line %d ", ra, __LINE__); \
267 return ra; \
268 }
269 #else
270 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 #define RRETURN(ra) return ra
273 #endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. If a new frame cannot be obtained,
281 RMATCH gives up by means of RRETURN(PCRE_ERROR_NOMEMORY). */
282 #define REGISTER
283
284 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
285 {\
286 heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
287 if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY); /* Allocation failed */\
288 frame->Xwhere = rw; \
289 newframe->Xeptr = ra;\
290 newframe->Xecode = rb;\
291 newframe->Xmstart = mstart;\
292 newframe->Xoffset_top = rc;\
293 newframe->Xims = re;\
294 newframe->Xeptrb = rf;\
295 newframe->Xflags = rg;\
296 newframe->Xrdepth = frame->Xrdepth + 1;\
297 newframe->Xprevframe = frame;\
298 frame = newframe;\
299 DPRINTF(("restarting from line %d\n", __LINE__));\
300 goto HEAP_RECURSE;\
301 L_##rw:\
302 DPRINTF(("jumped back to line %d\n", __LINE__));\
303 }
304 #define RRETURN(ra)\
305 {\
306 heapframe *newframe = frame;\
307 frame = newframe->Xprevframe;\
308 (pcre_stack_free)(newframe);\
309 if (frame != NULL)\
310 {\
311 rrc = ra;\
312 goto HEAP_RETURN;\
313 }\
314 return ra;\
315 }
316
317
318 /* Structure for remembering the arguments and local variables of match() in a
319 private frame. Frames are chained through Xprevframe. */
320 typedef struct heapframe {
321 struct heapframe *Xprevframe; /* Previous frame; NULL marks the top level */
322
323 /* Function arguments that may change */
324
325 USPTR Xeptr;
326 const uschar *Xecode;
327 USPTR Xmstart;
328 int Xoffset_top;
329 unsigned long int Xims; /* Same type as the ims argument of match() */
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338 USPTR Xcharptr;
339 #endif
340 USPTR Xdata;
341 USPTR Xnext;
342 USPTR Xpp;
343 USPTR Xprev;
344 USPTR Xsaved_eptr;
345
346 recursion_info Xnew_recursive;
347
348 BOOL Xcur_is_word;
349 BOOL Xcondition;
350 BOOL Xprev_is_word;
351
352 unsigned long int Xoriginal_ims;
353
354 #ifdef SUPPORT_UCP
355 int Xprop_type;
356 int Xprop_value;
357 int Xprop_fail_result;
358 int Xprop_category;
359 int Xprop_chartype;
360 int Xprop_script;
361 int Xoclength;
362 uschar Xocchars[8];
363 #endif
364
365 int Xcodelink;
366 int Xctype;
367 unsigned int Xfc;
368 int Xfi;
369 int Xlength;
370 int Xmax;
371 int Xmin;
372 int Xnumber;
373 int Xoffset;
374 int Xop;
375 int Xsave_capture_last;
376 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377 int Xstacksave[REC_STACK_SAVE_MAX];
378
379 eptrblock Xnewptrb;
380
381 /* Where to jump back to */
382
383 int Xwhere; /* The "rw" value given to RMATCH */
384
385 } heapframe;
386
387 #endif
388
389
390 /***************************************************************************
391 ***************************************************************************/
392
393
394
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response. */
402
403 /* These macros pack up tests that are used for partial matching, and which
404 appear several times in the code. We set the "hit end" flag if the pointer is
405 at the end of the subject and also past the start of the subject (i.e.
406 something has been matched). For hard partial matching (md->partial > 1), we
407 then return immediately. The second one is used when we already know we are
408 past the end of the subject, so it omits the end-of-subject test. */
409
410 #define CHECK_PARTIAL()\
411 if (md->partial && eptr >= md->end_subject && eptr > mstart)\
412 {\
413 md->hitend = TRUE;\
414 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
415 }
416
417 #define SCHECK_PARTIAL()\
418 if (md->partial && eptr > mstart)\
419 {\
420 md->hitend = TRUE;\
421 if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);\
422 }
423
424
425
426 /* Performance note: It might be tempting to extract commonly used fields from
427 the md structure (e.g. utf8, end_subject) into individual variables to improve
428 performance. Tests using gcc on a SPARC disproved this; in the first case, it
429 made performance worse.
430
431 Arguments:
432 eptr pointer to current character in subject
433 ecode pointer to current position in compiled code
434 mstart pointer to the current match start position (can be modified
435 by encountering \K)
436 offset_top current top pointer
437 md pointer to "static" info for the match
438 ims current /i, /m, and /s options
439 eptrb pointer to chain of blocks containing eptr at start of
440 brackets - for testing for empty matches
441 flags can contain
442 match_condassert - this is an assertion condition
443 match_cbegroup - this is the start of an unlimited repeat
444 group that can match an empty string
445 rdepth the recursion depth
446
447 Returns: MATCH_MATCH if matched ) these values are >= 0
448 MATCH_NOMATCH if failed to match )
449 a negative PCRE_ERROR_xxx value if aborted by an error condition
450 (e.g. stopped by repeated call or recursion limit)
451 */
452
453 static int
454 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
455 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
456 int flags, unsigned int rdepth)
457 {
458 /* These variables do not need to be preserved over recursion in this function,
459 so they can be ordinary variables in all cases. Mark some of them with
460 "register" because they are used a lot in loops. */
461
462 register int rrc; /* Returns from recursive calls */
463 register int i; /* Used for loops not involving calls to RMATCH() */
464 register unsigned int c; /* Character values not kept over RMATCH() calls */
465 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
466
467 BOOL minimize, possessive; /* Quantifier options */
468 int condcode;
469
470 /* When recursion is not being used, all "local" variables that have to be
471 preserved over calls to RMATCH() are part of a "frame" which is obtained from
472 heap storage. Set up the top-level frame here; others are obtained from the
473 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
474
475 #ifdef NO_RECURSE
476 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
477 frame->Xprevframe = NULL; /* Marks the top level */
478
479 /* Copy in the original argument variables */
480
481 frame->Xeptr = eptr;
482 frame->Xecode = ecode;
483 frame->Xmstart = mstart;
484 frame->Xoffset_top = offset_top;
485 frame->Xims = ims;
486 frame->Xeptrb = eptrb;
487 frame->Xflags = flags;
488 frame->Xrdepth = rdepth;
489
490 /* This is where control jumps back to in order to effect "recursion" */
491
492 HEAP_RECURSE:
493
494 /* Macros make the argument variables come from the current frame */
495
496 #define eptr frame->Xeptr
497 #define ecode frame->Xecode
498 #define mstart frame->Xmstart
499 #define offset_top frame->Xoffset_top
500 #define ims frame->Xims
501 #define eptrb frame->Xeptrb
502 #define flags frame->Xflags
503 #define rdepth frame->Xrdepth
504
505 /* Ditto for the local variables */
506
507 #ifdef SUPPORT_UTF8
508 #define charptr frame->Xcharptr
509 #endif
510 #define callpat frame->Xcallpat
511 #define codelink frame->Xcodelink
512 #define data frame->Xdata
513 #define next frame->Xnext
514 #define pp frame->Xpp
515 #define prev frame->Xprev
516 #define saved_eptr frame->Xsaved_eptr
517
518 #define new_recursive frame->Xnew_recursive
519
520 #define cur_is_word frame->Xcur_is_word
521 #define condition frame->Xcondition
522 #define prev_is_word frame->Xprev_is_word
523
524 #define original_ims frame->Xoriginal_ims
525
526 #ifdef SUPPORT_UCP
527 #define prop_type frame->Xprop_type
528 #define prop_value frame->Xprop_value
529 #define prop_fail_result frame->Xprop_fail_result
530 #define prop_category frame->Xprop_category
531 #define prop_chartype frame->Xprop_chartype
532 #define prop_script frame->Xprop_script
533 #define oclength frame->Xoclength
534 #define occhars frame->Xocchars
535 #endif
536
537 #define ctype frame->Xctype
538 #define fc frame->Xfc
539 #define fi frame->Xfi
540 #define length frame->Xlength
541 #define max frame->Xmax
542 #define min frame->Xmin
543 #define number frame->Xnumber
544 #define offset frame->Xoffset
545 #define op frame->Xop
546 #define save_capture_last frame->Xsave_capture_last
547 #define save_offset1 frame->Xsave_offset1
548 #define save_offset2 frame->Xsave_offset2
549 #define save_offset3 frame->Xsave_offset3
550 #define stacksave frame->Xstacksave
551
552 #define newptrb frame->Xnewptrb
553
554 /* When recursion is being used, local variables are allocated on the stack and
555 get preserved during recursion in the normal way. In this environment, fi and
556 i, and fc and c, can be the same variables. */
557
558 #else /* NO_RECURSE not defined */
559 #define fi i
560 #define fc c
561
562
563 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
564 const uschar *charptr; /* in small blocks of the code. My normal */
565 #endif /* style of coding would have declared */
566 const uschar *callpat; /* them within each of those blocks. */
567 const uschar *data; /* However, in order to accommodate the */
568 const uschar *next; /* version of this code that uses an */
569 USPTR pp; /* external "stack" implemented on the */
570 const uschar *prev; /* heap, it is easier to declare them all */
571 USPTR saved_eptr; /* here, so the declarations can be cut */
572 /* out in a block. The only declarations */
573 recursion_info new_recursive; /* within blocks below are for variables */
574 /* that do not have to be preserved over */
575 BOOL cur_is_word; /* a recursive call to RMATCH(). */
576 BOOL condition;
577 BOOL prev_is_word;
578
579 unsigned long int original_ims;
580
581 #ifdef SUPPORT_UCP
582 int prop_type;
583 int prop_value;
584 int prop_fail_result;
585 int prop_category;
586 int prop_chartype;
587 int prop_script;
588 int oclength;
589 uschar occhars[8];
590 #endif
591
592 int codelink;
593 int ctype;
594 int length;
595 int max;
596 int min;
597 int number;
598 int offset;
599 int op;
600 int save_capture_last;
601 int save_offset1, save_offset2, save_offset3;
602 int stacksave[REC_STACK_SAVE_MAX];
603
604 eptrblock newptrb;
605 #endif /* NO_RECURSE */
606
607 /* These statements are here to stop the compiler complaining about uninitialized
608 variables. */
609
610 #ifdef SUPPORT_UCP
611 prop_value = 0;
612 prop_fail_result = 0;
613 #endif
614
615
616 /* This label is used for tail recursion, which is used in a few cases even
617 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
618 used. Thanks to Ian Taylor for noticing this possibility and sending the
619 original patch. */
620
621 TAIL_RECURSE:
622
623 /* OK, now we can get on with the real code of the function. Recursive calls
624 are specified by the macro RMATCH and RRETURN is used to return. When
625 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
626 and a "return", respectively (possibly with some debugging if DEBUG is
627 defined). However, RMATCH isn't like a function call because it's quite a
628 complicated macro. It has to be used in one particular way. This shouldn't,
629 however, impact performance when true recursion is being used. */
630
631 #ifdef SUPPORT_UTF8
632 utf8 = md->utf8; /* Local copy of the flag */
633 #else
634 utf8 = FALSE;
635 #endif
636
637 /* First check that we haven't called match() too many times, or that we
638 haven't exceeded the recursive call limit. */
639
640 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
641 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
642
643 original_ims = ims; /* Save for resetting on ')' */
644
645 /* At the start of a group with an unlimited repeat that may match an empty
646 string, the match_cbegroup flag is set. When this is the case, add the current
647 subject pointer to the chain of such remembered pointers, to be checked when we
648 hit the closing ket, in order to break infinite loops that match no characters.
649 When match() is called in other circumstances, don't add to the chain. The
650 match_cbegroup flag must NOT be used with tail recursion, because the memory
651 block that is used is on the stack, so a new one may be required for each
652 match(). */
653
654 if ((flags & match_cbegroup) != 0)
655 {
656 newptrb.epb_saved_eptr = eptr;
657 newptrb.epb_prev = eptrb;
658 eptrb = &newptrb;
659 }
660
661 /* Now start processing the opcodes. */
662
663 for (;;)
664 {
665 minimize = possessive = FALSE;
666 op = *ecode;
667
668 /* For partial matching, remember if we ever hit the end of the subject after
669 matching at least one subject character. This code is now wrapped in a macro
670 because it appears several times below. */
671
672 CHECK_PARTIAL();
673
674 switch(op)
675 {
676 case OP_FAIL:
677 RRETURN(MATCH_NOMATCH);
678
679 case OP_PRUNE:
680 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
681 ims, eptrb, flags, RM51);
682 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
683 RRETURN(MATCH_PRUNE);
684
685 case OP_COMMIT:
686 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
687 ims, eptrb, flags, RM52);
688 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
689 RRETURN(MATCH_COMMIT);
690
691 case OP_SKIP:
692 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
693 ims, eptrb, flags, RM53);
694 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
695 md->start_match_ptr = eptr; /* Pass back current position */
696 RRETURN(MATCH_SKIP);
697
698 case OP_THEN:
699 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
700 ims, eptrb, flags, RM54);
701 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
702 RRETURN(MATCH_THEN);
703
704 /* Handle a capturing bracket. If there is space in the offset vector, save
705 the current subject position in the working slot at the top of the vector.
706 We mustn't change the current values of the data slot, because they may be
707 set from a previous iteration of this group, and be referred to by a
708 reference inside the group.
709
710 If the bracket fails to match, we need to restore this value and also the
711 values of the final offsets, in case they were set by a previous iteration
712 of the same bracket.
713
714 If there isn't enough space in the offset vector, treat this as if it were
715 a non-capturing bracket. Don't worry about setting the flag for the error
716 case here; that is handled in the code for KET. */
717
718 case OP_CBRA:
719 case OP_SCBRA:
720 number = GET2(ecode, 1+LINK_SIZE);
721 offset = number << 1;
722
723 #ifdef DEBUG
724 printf("start bracket %d\n", number);
725 printf("subject=");
726 pchars(eptr, 16, TRUE, md);
727 printf("\n");
728 #endif
729
730 if (offset < md->offset_max)
731 {
732 save_offset1 = md->offset_vector[offset];
733 save_offset2 = md->offset_vector[offset+1];
734 save_offset3 = md->offset_vector[md->offset_end - number];
735 save_capture_last = md->capture_last;
736
737 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
738 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
739
740 flags = (op == OP_SCBRA)? match_cbegroup : 0;
741 do
742 {
743 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
744 ims, eptrb, flags, RM1);
745 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
746 md->capture_last = save_capture_last;
747 ecode += GET(ecode, 1);
748 }
749 while (*ecode == OP_ALT);
750
751 DPRINTF(("bracket %d failed\n", number));
752
753 md->offset_vector[offset] = save_offset1;
754 md->offset_vector[offset+1] = save_offset2;
755 md->offset_vector[md->offset_end - number] = save_offset3;
756
757 RRETURN(MATCH_NOMATCH);
758 }
759
760 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
761 as a non-capturing bracket. */
762
763 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
764 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
765
766 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
767
768 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
769 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
770
771 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
772 final alternative within the brackets, we would return the result of a
773 recursive call to match() whatever happened. We can reduce stack usage by
774 turning this into a tail recursion, except in the case when match_cbegroup
775 is set.*/
776
777 case OP_BRA:
778 case OP_SBRA:
779 DPRINTF(("start non-capturing bracket\n"));
780 flags = (op >= OP_SBRA)? match_cbegroup : 0;
781 for (;;)
782 {
783 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
784 {
785 if (flags == 0) /* Not a possibly empty group */
786 {
787 ecode += _pcre_OP_lengths[*ecode];
788 DPRINTF(("bracket 0 tail recursion\n"));
789 goto TAIL_RECURSE;
790 }
791
792 /* Possibly empty group; can't use tail recursion. */
793
794 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
795 eptrb, flags, RM48);
796 RRETURN(rrc);
797 }
798
799 /* For non-final alternatives, continue the loop for a NOMATCH result;
800 otherwise return. */
801
802 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
803 eptrb, flags, RM2);
804 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
805 ecode += GET(ecode, 1);
806 }
807 /* Control never reaches here. */
808
809 /* Conditional group: compilation checked that there are no more than
810 two branches. If the condition is false, skipping the first branch takes us
811 past the end if there is only one branch, but that's OK because that is
812 exactly what going to the ket would do. As there is only one branch to be
813 obeyed, we can use tail recursion to avoid using another stack frame. */
814
815 case OP_COND:
816 case OP_SCOND:
817 codelink= GET(ecode, 1);
818
819 /* Because of the way auto-callout works during compile, a callout item is
820 inserted between OP_COND and an assertion condition. */
821
822 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
823 {
824 if (pcre_callout != NULL)
825 {
826 pcre_callout_block cb;
827 cb.version = 1; /* Version 1 of the callout block */
828 cb.callout_number = ecode[LINK_SIZE+2];
829 cb.offset_vector = md->offset_vector;
830 cb.subject = (PCRE_SPTR)md->start_subject;
831 cb.subject_length = md->end_subject - md->start_subject;
832 cb.start_match = mstart - md->start_subject;
833 cb.current_position = eptr - md->start_subject;
834 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
835 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
836 cb.capture_top = offset_top/2;
837 cb.capture_last = md->capture_last;
838 cb.callout_data = md->callout_data;
839 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
840 if (rrc < 0) RRETURN(rrc);
841 }
842 ecode += _pcre_OP_lengths[OP_CALLOUT];
843 }
844
845 condcode = ecode[LINK_SIZE+1];
846
847 /* Now see what the actual condition is */
848
849 if (condcode == OP_RREF) /* Recursion test */
850 {
851 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
852 condition = md->recursive != NULL &&
853 (offset == RREF_ANY || offset == md->recursive->group_num);
854 ecode += condition? 3 : GET(ecode, 1);
855 }
856
857 else if (condcode == OP_CREF) /* Group used test */
858 {
859 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
860 condition = offset < offset_top && md->offset_vector[offset] >= 0;
861 ecode += condition? 3 : GET(ecode, 1);
862 }
863
864 else if (condcode == OP_DEF) /* DEFINE - always false */
865 {
866 condition = FALSE;
867 ecode += GET(ecode, 1);
868 }
869
870 /* The condition is an assertion. Call match() to evaluate it - setting
871 the final argument match_condassert causes it to stop at the end of an
872 assertion. */
873
874 else
875 {
876 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
877 match_condassert, RM3);
878 if (rrc == MATCH_MATCH)
879 {
880 condition = TRUE;
881 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
882 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
883 }
884 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
885 {
886 RRETURN(rrc); /* Need braces because of following else */
887 }
888 else
889 {
890 condition = FALSE;
891 ecode += codelink;
892 }
893 }
894
895 /* We are now at the branch that is to be obeyed. As there is only one,
896 we can use tail recursion to avoid using another stack frame, except when
897 match_cbegroup is required for an unlimited repeat of a possibly empty
898 group. If the second alternative doesn't exist, we can just plough on. */
899
900 if (condition || *ecode == OP_ALT)
901 {
902 ecode += 1 + LINK_SIZE;
903 if (op == OP_SCOND) /* Possibly empty group */
904 {
905 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
906 RRETURN(rrc);
907 }
908 else /* Group must match something */
909 {
910 flags = 0;
911 goto TAIL_RECURSE;
912 }
913 }
914 else /* Condition false & no alternative */
915 {
916 ecode += 1 + LINK_SIZE;
917 }
918 break;
919
920
921 /* End of the pattern, either real or forced. If we are in a top-level
922 recursion, we should restore the offsets appropriately and continue from
923 after the call. */
924
925 case OP_ACCEPT:
926 case OP_END:
927 if (md->recursive != NULL && md->recursive->group_num == 0)
928 {
929 recursion_info *rec = md->recursive;
930 DPRINTF(("End of pattern in a (?0) recursion\n"));
931 md->recursive = rec->prevrec;
932 memmove(md->offset_vector, rec->offset_save,
933 rec->saved_max * sizeof(int));
934 mstart = rec->save_start;
935 ims = original_ims;
936 ecode = rec->after_call;
937 break;
938 }
939
940 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
941 string - backtracking will then try other alternatives, if any. */
942
943 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
944 md->end_match_ptr = eptr; /* Record where we ended */
945 md->end_offset_top = offset_top; /* and how many extracts were taken */
946 md->start_match_ptr = mstart; /* and the start (\K can modify) */
947 RRETURN(MATCH_MATCH);
948
949 /* Change option settings */
950
951 case OP_OPT:
952 ims = ecode[1];
953 ecode += 2;
954 DPRINTF(("ims set to %02lx\n", ims));
955 break;
956
957 /* Assertion brackets. Check the alternative branches in turn - the
958 matching won't pass the KET for an assertion. If any one branch matches,
959 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
960 start of each branch to move the current point backwards, so the code at
961 this level is identical to the lookahead case. */
962
963 case OP_ASSERT:
964 case OP_ASSERTBACK:
965 do
966 {
967 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
968 RM4);
969 if (rrc == MATCH_MATCH) break;
970 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
971 ecode += GET(ecode, 1);
972 }
973 while (*ecode == OP_ALT);
974 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
975
976 /* If checking an assertion for a condition, return MATCH_MATCH. */
977
978 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
979
980 /* Continue from after the assertion, updating the offsets high water
981 mark, since extracts may have been taken during the assertion. */
982
983 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
984 ecode += 1 + LINK_SIZE;
985 offset_top = md->end_offset_top;
986 continue;
987
988 /* Negative assertion: all branches must fail to match */
989
990 case OP_ASSERT_NOT:
991 case OP_ASSERTBACK_NOT:
992 do
993 {
994 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
995 RM5);
996 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
997 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
998 ecode += GET(ecode,1);
999 }
1000 while (*ecode == OP_ALT);
1001
1002 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1003
1004 ecode += 1 + LINK_SIZE;
1005 continue;
1006
1007 /* Move the subject pointer back. This occurs only at the start of
1008 each branch of a lookbehind assertion. If we are too close to the start to
1009 move back, this match function fails. When working with UTF-8 we move
1010 back a number of characters, not bytes. */
1011
1012 case OP_REVERSE:
1013 #ifdef SUPPORT_UTF8
1014 if (utf8)
1015 {
1016 i = GET(ecode, 1);
1017 while (i-- > 0)
1018 {
1019 eptr--;
1020 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1021 BACKCHAR(eptr);
1022 }
1023 }
1024 else
1025 #endif
1026
1027 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1028
1029 {
1030 eptr -= GET(ecode, 1);
1031 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1032 }
1033
1034 /* Skip to next op code */
1035
1036 ecode += 1 + LINK_SIZE;
1037 break;
1038
1039 /* The callout item calls an external function, if one is provided, passing
1040 details of the match so far. This is mainly for debugging, though the
1041 function is able to force a failure. */
1042
1043 case OP_CALLOUT:
1044 if (pcre_callout != NULL)
1045 {
1046 pcre_callout_block cb;
1047 cb.version = 1; /* Version 1 of the callout block */
1048 cb.callout_number = ecode[1];
1049 cb.offset_vector = md->offset_vector;
1050 cb.subject = (PCRE_SPTR)md->start_subject;
1051 cb.subject_length = md->end_subject - md->start_subject;
1052 cb.start_match = mstart - md->start_subject;
1053 cb.current_position = eptr - md->start_subject;
1054 cb.pattern_position = GET(ecode, 2);
1055 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1056 cb.capture_top = offset_top/2;
1057 cb.capture_last = md->capture_last;
1058 cb.callout_data = md->callout_data;
1059 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1060 if (rrc < 0) RRETURN(rrc);
1061 }
1062 ecode += 2 + 2*LINK_SIZE;
1063 break;
1064
1065 /* Recursion either matches the current regex, or some subexpression. The
1066 offset data is the offset to the starting bracket from the start of the
1067 whole pattern. (This is so that it works from duplicated subpatterns.)
1068
1069 If there are any capturing brackets started but not finished, we have to
1070 save their starting points and reinstate them after the recursion. However,
1071 we don't know how many such there are (offset_top records the completed
1072 total) so we just have to save all the potential data. There may be up to
1073 65535 such values, which is too large to put on the stack, but using malloc
1074 for small numbers seems expensive. As a compromise, the stack is used when
1075 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1076 is used. If the malloc fails, the saved data cannot be stored, so this
1077 call of match() gives up and returns PCRE_ERROR_NOMEMORY via RRETURN()
1078 instead of attempting the recursion.
1079
1080 There are also other values that have to be saved. We use a chained
1081 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1082 for the original version of this logic. */
1083
1084 case OP_RECURSE:
1085 {
1086 callpat = md->start_code + GET(ecode, 1);
1087 new_recursive.group_num = (callpat == md->start_code)? 0 :
1088 GET2(callpat, 1 + LINK_SIZE);
1089
1090 /* Add to "recursing stack" */
1091
1092 new_recursive.prevrec = md->recursive;
1093 md->recursive = &new_recursive;
1094
1095 /* Find where to continue from afterwards */
1096
1097 ecode += 1 + LINK_SIZE;
1098 new_recursive.after_call = ecode;
1099
1100 /* Now save the offset data. */
1101
1102 new_recursive.saved_max = md->offset_end;
1103 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1104 new_recursive.offset_save = stacksave;
1105 else
1106 {
1107 new_recursive.offset_save =
1108 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1109 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1110 }
1111
1112 memcpy(new_recursive.offset_save, md->offset_vector,
1113 new_recursive.saved_max * sizeof(int));
1114 new_recursive.save_start = mstart;
1115 mstart = eptr;
1116
1117 /* OK, now we can do the recursion. For each top-level alternative we
1118 restore the offset and recursion data. */
1119
1120 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1121 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1122 do
1123 {
1124 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1125 md, ims, eptrb, flags, RM6);
1126 if (rrc == MATCH_MATCH)
1127 {
1128 DPRINTF(("Recursion matched\n"));
1129 md->recursive = new_recursive.prevrec;
1130 if (new_recursive.offset_save != stacksave)
1131 (pcre_free)(new_recursive.offset_save);
1132 RRETURN(MATCH_MATCH);
1133 }
1134 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1135 {
1136 DPRINTF(("Recursion gave error %d\n", rrc));
1137 if (new_recursive.offset_save != stacksave)
1138 (pcre_free)(new_recursive.offset_save);
1139 RRETURN(rrc);
1140 }
1141
1142 md->recursive = &new_recursive;
1143 memcpy(md->offset_vector, new_recursive.offset_save,
1144 new_recursive.saved_max * sizeof(int));
1145 callpat += GET(callpat, 1);
1146 }
1147 while (*callpat == OP_ALT);
1148
1149 DPRINTF(("Recursion didn't match\n"));
1150 md->recursive = new_recursive.prevrec;
1151 if (new_recursive.offset_save != stacksave)
1152 (pcre_free)(new_recursive.offset_save);
1153 RRETURN(MATCH_NOMATCH);
1154 }
1155 /* Control never reaches here */
1156
1157 /* "Once" brackets are like assertion brackets except that after a match,
1158 the point in the subject string is not moved back. Thus there can never be
1159 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1160 Check the alternative branches in turn - the matching won't pass the KET
1161 for this kind of subpattern. If any one branch matches, we carry on as at
1162 the end of a normal bracket, leaving the subject pointer. */
1163
1164 case OP_ONCE:
1165 prev = ecode;
1166 saved_eptr = eptr;
1167
1168 do
1169 {
1170 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1171 if (rrc == MATCH_MATCH) break;
1172 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1173 ecode += GET(ecode,1);
1174 }
1175 while (*ecode == OP_ALT);
1176
1177 /* If we hit the end of the group (which could be repeated), fail */
1178
1179 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1180
1181 /* Continue as from after the assertion, updating the offsets high water
1182 mark, since extracts may have been taken. */
1183
1184 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1185
1186 offset_top = md->end_offset_top;
1187 eptr = md->end_match_ptr;
1188
1189 /* For a non-repeating ket, just continue at this level. This also
1190 happens for a repeating ket if no characters were matched in the group.
1191 This is the forcible breaking of infinite loops as implemented in Perl
1192 5.005. If there is an options reset, it will get obeyed in the normal
1193 course of events. */
1194
1195 if (*ecode == OP_KET || eptr == saved_eptr)
1196 {
1197 ecode += 1+LINK_SIZE;
1198 break;
1199 }
1200
1201 /* The repeating kets try the rest of the pattern or restart from the
1202 preceding bracket, in the appropriate order; the second "call" of match()
1203 uses tail recursion, to avoid using another stack frame. Options changed in
1204 the bracket must be reset before re-running it, so check the next opcode.
1205 NOTE(review): OP_OPT's data byte is ecode[2+LINK_SIZE]; 4 assumes LINK_SIZE 2. */
1206
1207 if (ecode[1+LINK_SIZE] == OP_OPT)
1208 {
1209 ims = (ims & ~PCRE_IMS) | ecode[4];
1210 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1211 }
1212
1213 if (*ecode == OP_KETRMIN)
1214 {
1215 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1216 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1217 ecode = prev;
1218 flags = 0;
1219 goto TAIL_RECURSE;
1220 }
1221 else /* OP_KETRMAX */
1222 {
1223 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1224 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1225 ecode += 1 + LINK_SIZE;
1226 flags = 0;
1227 goto TAIL_RECURSE;
1228 }
1229 /* Control never gets here */
1230
1231 /* An alternation is the end of a branch; scan along to find the end of the
1232 bracketed group and go to there. */
1233
1234 case OP_ALT:
1235 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1236 break;
1237
1238 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1239 indicating that it may occur zero times. It may repeat infinitely, or not
1240 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1241 with fixed upper repeat limits are compiled as a number of copies, with the
1242 optional ones preceded by BRAZERO or BRAMINZERO. */
1243
1244 case OP_BRAZERO:
1245 {
1246 next = ecode+1;
1247 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1248 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1249 do next += GET(next,1); while (*next == OP_ALT);
1250 ecode = next + 1 + LINK_SIZE;
1251 }
1252 break;
1253
1254 case OP_BRAMINZERO:
1255 {
1256 next = ecode+1;
1257 do next += GET(next, 1); while (*next == OP_ALT);
1258 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1259 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1260 ecode++;
1261 }
1262 break;
1263
1264 case OP_SKIPZERO:
1265 {
1266 next = ecode+1;
1267 do next += GET(next,1); while (*next == OP_ALT);
1268 ecode = next + 1 + LINK_SIZE;
1269 }
1270 break;
1271
1272 /* End of a group, repeated or non-repeating. */
1273
1274 case OP_KET:
1275 case OP_KETRMIN:
1276 case OP_KETRMAX:
1277 prev = ecode - GET(ecode, 1);
1278
1279 /* If this was a group that remembered the subject start, in order to break
1280 infinite repeats of empty string matches, retrieve the subject start from
1281 the chain. Otherwise, set it NULL. */
1282
1283 if (*prev >= OP_SBRA)
1284 {
1285 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1286 eptrb = eptrb->epb_prev; /* Backup to previous group */
1287 }
1288 else saved_eptr = NULL;
1289
1290 /* If we are at the end of an assertion group, stop matching and return
1291 MATCH_MATCH, but record the current high water mark for use by positive
1292 assertions. Do this also for the "once" (atomic) groups. */
1293
1294 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1295 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1296 *prev == OP_ONCE)
1297 {
1298 md->end_match_ptr = eptr; /* For ONCE */
1299 md->end_offset_top = offset_top;
1300 RRETURN(MATCH_MATCH);
1301 }
1302
1303 /* For capturing groups we have to check the group number back at the start
1304 and if necessary complete handling an extraction by setting the offsets and
1305 bumping the high water mark. Note that whole-pattern recursion is coded as
1306 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1307 when the OP_END is reached. Other recursion is handled here. */
1308
1309 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1310 {
1311 number = GET2(prev, 1+LINK_SIZE);
1312 offset = number << 1;
1313
1314 #ifdef DEBUG
1315 printf("end bracket %d", number);
1316 printf("\n");
1317 #endif
1318
1319 md->capture_last = number;
1320 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1321 {
1322 md->offset_vector[offset] =
1323 md->offset_vector[md->offset_end - number];
1324 md->offset_vector[offset+1] = eptr - md->start_subject;
1325 if (offset_top <= offset) offset_top = offset + 2;
1326 }
1327
1328 /* Handle a recursively called group. Restore the offsets
1329 appropriately and continue from after the call. */
1330
1331 if (md->recursive != NULL && md->recursive->group_num == number)
1332 {
1333 recursion_info *rec = md->recursive;
1334 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1335 md->recursive = rec->prevrec;
1336 mstart = rec->save_start;
1337 memcpy(md->offset_vector, rec->offset_save,
1338 rec->saved_max * sizeof(int));
1339 ecode = rec->after_call;
1340 ims = original_ims;
1341 break;
1342 }
1343 }
1344
1345 /* For both capturing and non-capturing groups, reset the value of the ims
1346 flags, in case they got changed during the group. */
1347
1348 ims = original_ims;
1349 DPRINTF(("ims reset to %02lx\n", ims));
1350
1351 /* For a non-repeating ket, just continue at this level. This also
1352 happens for a repeating ket if no characters were matched in the group.
1353 This is the forcible breaking of infinite loops as implemented in Perl
1354 5.005. If there is an options reset, it will get obeyed in the normal
1355 course of events. */
1356
1357 if (*ecode == OP_KET || eptr == saved_eptr)
1358 {
1359 ecode += 1 + LINK_SIZE;
1360 break;
1361 }
1362
1363 /* The repeating kets try the rest of the pattern or restart from the
1364 preceding bracket, in the appropriate order. In the second case, we can use
1365 tail recursion to avoid using another stack frame, unless we have an
1366 unlimited repeat of a group that can match an empty string. */
1367
1368 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1369
1370 if (*ecode == OP_KETRMIN)
1371 {
1372 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1373 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1374 if (flags != 0) /* Could match an empty string */
1375 {
1376 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1377 RRETURN(rrc);
1378 }
1379 ecode = prev;
1380 goto TAIL_RECURSE;
1381 }
1382 else /* OP_KETRMAX */
1383 {
1384 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1385 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1386 ecode += 1 + LINK_SIZE;
1387 flags = 0;
1388 goto TAIL_RECURSE;
1389 }
1390 /* Control never gets here */
1391
1392 /* Start of subject unless notbol, or after internal newline if multiline */
1393
1394 case OP_CIRC:
1395 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1396 if ((ims & PCRE_MULTILINE) != 0)
1397 {
1398 if (eptr != md->start_subject &&
1399 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1400 RRETURN(MATCH_NOMATCH);
1401 ecode++;
1402 break;
1403 }
1404 /* ... else fall through */
1405
1406 /* Start of subject assertion */
1407
1408 case OP_SOD:
1409 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1410 ecode++;
1411 break;
1412
1413 /* Start of match assertion */
1414
1415 case OP_SOM:
1416 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1417 ecode++;
1418 break;
1419
1420 /* Reset the start of match point */
1421
1422 case OP_SET_SOM:
1423 mstart = eptr;
1424 ecode++;
1425 break;
1426
1427 /* Assert before internal newline if multiline, or before a terminating
1428 newline unless endonly is set, else end of subject unless noteol is set. */
1429
1430 case OP_DOLL:
1431 if ((ims & PCRE_MULTILINE) != 0)
1432 {
1433 if (eptr < md->end_subject)
1434 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1435 else
1436 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1437 ecode++;
1438 break;
1439 }
1440 else
1441 {
1442 if (md->noteol) RRETURN(MATCH_NOMATCH);
1443 if (!md->endonly)
1444 {
1445 if (eptr != md->end_subject &&
1446 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1447 RRETURN(MATCH_NOMATCH);
1448 ecode++;
1449 break;
1450 }
1451 }
1452 /* ... else fall through for endonly */
1453
1454 /* End of subject assertion (\z) */
1455
1456 case OP_EOD:
1457 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1458 ecode++;
1459 break;
1460
1461 /* End of subject or ending \n assertion (\Z) */
1462
1463 case OP_EODN:
1464 if (eptr != md->end_subject &&
1465 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1466 RRETURN(MATCH_NOMATCH);
1467 ecode++;
1468 break;
1469
1470 /* Word boundary assertions */
1471
1472 case OP_NOT_WORD_BOUNDARY:
1473 case OP_WORD_BOUNDARY:
1474 {
1475
1476 /* Find out if the previous and current characters are "word" characters.
1477 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1478 be "non-word" characters. */
1479
1480 #ifdef SUPPORT_UTF8
1481 if (utf8)
1482 {
1483 if (eptr == md->start_subject) prev_is_word = FALSE; else
1484 {
1485 USPTR lastptr = eptr - 1;
1486 while((*lastptr & 0xc0) == 0x80) lastptr--;
1487 GETCHAR(c, lastptr);
1488 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1489 }
1490 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1491 {
1492 GETCHAR(c, eptr);
1493 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1494 }
1495 }
1496 else
1497 #endif
1498
1499 /* More streamlined when not in UTF-8 mode */
1500
1501 {
1502 prev_is_word = (eptr != md->start_subject) &&
1503 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1504 cur_is_word = (eptr < md->end_subject) &&
1505 ((md->ctypes[*eptr] & ctype_word) != 0);
1506 }
1507
1508 /* Now see if the situation is what we want */
1509
1510 if ((*ecode++ == OP_WORD_BOUNDARY)?
1511 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1512 RRETURN(MATCH_NOMATCH);
1513 }
1514 break;
1515
1516 /* Match a single character type; inline for speed */
1517
1518 case OP_ANY:
1519 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1520 /* Fall through */
1521
1522 case OP_ALLANY:
1523 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1524 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1525 ecode++;
1526 break;
1527
1528 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1529 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1530
1531 case OP_ANYBYTE:
1532 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1533 ecode++;
1534 break;
1535
1536 case OP_NOT_DIGIT:
1537 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1538 GETCHARINCTEST(c, eptr);
1539 if (
1540 #ifdef SUPPORT_UTF8
1541 c < 256 &&
1542 #endif
1543 (md->ctypes[c] & ctype_digit) != 0
1544 )
1545 RRETURN(MATCH_NOMATCH);
1546 ecode++;
1547 break;
1548
1549 case OP_DIGIT:
1550 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1551 GETCHARINCTEST(c, eptr);
1552 if (
1553 #ifdef SUPPORT_UTF8
1554 c >= 256 ||
1555 #endif
1556 (md->ctypes[c] & ctype_digit) == 0
1557 )
1558 RRETURN(MATCH_NOMATCH);
1559 ecode++;
1560 break;
1561
1562 case OP_NOT_WHITESPACE:
1563 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1564 GETCHARINCTEST(c, eptr);
1565 if (
1566 #ifdef SUPPORT_UTF8
1567 c < 256 &&
1568 #endif
1569 (md->ctypes[c] & ctype_space) != 0
1570 )
1571 RRETURN(MATCH_NOMATCH);
1572 ecode++;
1573 break;
1574
1575 case OP_WHITESPACE:
1576 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1577 GETCHARINCTEST(c, eptr);
1578 if (
1579 #ifdef SUPPORT_UTF8
1580 c >= 256 ||
1581 #endif
1582 (md->ctypes[c] & ctype_space) == 0
1583 )
1584 RRETURN(MATCH_NOMATCH);
1585 ecode++;
1586 break;
1587
1588 case OP_NOT_WORDCHAR:
1589 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1590 GETCHARINCTEST(c, eptr);
1591 if (
1592 #ifdef SUPPORT_UTF8
1593 c < 256 &&
1594 #endif
1595 (md->ctypes[c] & ctype_word) != 0
1596 )
1597 RRETURN(MATCH_NOMATCH);
1598 ecode++;
1599 break;
1600
1601 case OP_WORDCHAR:
1602 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1603 GETCHARINCTEST(c, eptr);
1604 if (
1605 #ifdef SUPPORT_UTF8
1606 c >= 256 ||
1607 #endif
1608 (md->ctypes[c] & ctype_word) == 0
1609 )
1610 RRETURN(MATCH_NOMATCH);
1611 ecode++;
1612 break;
1613
1614 case OP_ANYNL:
1615 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1616 GETCHARINCTEST(c, eptr);
1617 switch(c)
1618 {
1619 default: RRETURN(MATCH_NOMATCH);
1620 case 0x000d:
1621 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1622 break;
1623
1624 case 0x000a:
1625 break;
1626
1627 case 0x000b:
1628 case 0x000c:
1629 case 0x0085:
1630 case 0x2028:
1631 case 0x2029:
1632 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1633 break;
1634 }
1635 ecode++;
1636 break;
1637
1638 case OP_NOT_HSPACE:
1639 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1640 GETCHARINCTEST(c, eptr);
1641 switch(c)
1642 {
1643 default: break;
1644 case 0x09: /* HT */
1645 case 0x20: /* SPACE */
1646 case 0xa0: /* NBSP */
1647 case 0x1680: /* OGHAM SPACE MARK */
1648 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1649 case 0x2000: /* EN QUAD */
1650 case 0x2001: /* EM QUAD */
1651 case 0x2002: /* EN SPACE */
1652 case 0x2003: /* EM SPACE */
1653 case 0x2004: /* THREE-PER-EM SPACE */
1654 case 0x2005: /* FOUR-PER-EM SPACE */
1655 case 0x2006: /* SIX-PER-EM SPACE */
1656 case 0x2007: /* FIGURE SPACE */
1657 case 0x2008: /* PUNCTUATION SPACE */
1658 case 0x2009: /* THIN SPACE */
1659 case 0x200A: /* HAIR SPACE */
1660 case 0x202f: /* NARROW NO-BREAK SPACE */
1661 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1662 case 0x3000: /* IDEOGRAPHIC SPACE */
1663 RRETURN(MATCH_NOMATCH);
1664 }
1665 ecode++;
1666 break;
1667
1668 case OP_HSPACE:
1669 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1670 GETCHARINCTEST(c, eptr);
1671 switch(c)
1672 {
1673 default: RRETURN(MATCH_NOMATCH);
1674 case 0x09: /* HT */
1675 case 0x20: /* SPACE */
1676 case 0xa0: /* NBSP */
1677 case 0x1680: /* OGHAM SPACE MARK */
1678 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1679 case 0x2000: /* EN QUAD */
1680 case 0x2001: /* EM QUAD */
1681 case 0x2002: /* EN SPACE */
1682 case 0x2003: /* EM SPACE */
1683 case 0x2004: /* THREE-PER-EM SPACE */
1684 case 0x2005: /* FOUR-PER-EM SPACE */
1685 case 0x2006: /* SIX-PER-EM SPACE */
1686 case 0x2007: /* FIGURE SPACE */
1687 case 0x2008: /* PUNCTUATION SPACE */
1688 case 0x2009: /* THIN SPACE */
1689 case 0x200A: /* HAIR SPACE */
1690 case 0x202f: /* NARROW NO-BREAK SPACE */
1691 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1692 case 0x3000: /* IDEOGRAPHIC SPACE */
1693 break;
1694 }
1695 ecode++;
1696 break;
1697
1698 case OP_NOT_VSPACE:
1699 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1700 GETCHARINCTEST(c, eptr);
1701 switch(c)
1702 {
1703 default: break;
1704 case 0x0a: /* LF */
1705 case 0x0b: /* VT */
1706 case 0x0c: /* FF */
1707 case 0x0d: /* CR */
1708 case 0x85: /* NEL */
1709 case 0x2028: /* LINE SEPARATOR */
1710 case 0x2029: /* PARAGRAPH SEPARATOR */
1711 RRETURN(MATCH_NOMATCH);
1712 }
1713 ecode++;
1714 break;
1715
1716 case OP_VSPACE:
1717 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1718 GETCHARINCTEST(c, eptr);
1719 switch(c)
1720 {
1721 default: RRETURN(MATCH_NOMATCH);
1722 case 0x0a: /* LF */
1723 case 0x0b: /* VT */
1724 case 0x0c: /* FF */
1725 case 0x0d: /* CR */
1726 case 0x85: /* NEL */
1727 case 0x2028: /* LINE SEPARATOR */
1728 case 0x2029: /* PARAGRAPH SEPARATOR */
1729 break;
1730 }
1731 ecode++;
1732 break;
1733
1734 #ifdef SUPPORT_UCP
1735 /* Check the next character by Unicode property. We will get here only
1736 if the support is in the binary; otherwise a compile-time error occurs. */
1737
1738 case OP_PROP:
1739 case OP_NOTPROP:
1740 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1741 GETCHARINCTEST(c, eptr);
1742 {
1743 const ucd_record *prop = GET_UCD(c);
1744
1745 switch(ecode[1])
1746 {
1747 case PT_ANY:
1748 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1749 break;
1750
1751 case PT_LAMP:
1752 if ((prop->chartype == ucp_Lu ||
1753 prop->chartype == ucp_Ll ||
1754 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1755 RRETURN(MATCH_NOMATCH);
1756 break;
1757
1758 case PT_GC:
1759 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1760 RRETURN(MATCH_NOMATCH);
1761 break;
1762
1763 case PT_PC:
1764 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1765 RRETURN(MATCH_NOMATCH);
1766 break;
1767
1768 case PT_SC:
1769 if ((ecode[2] != prop->script) == (op == OP_PROP))
1770 RRETURN(MATCH_NOMATCH);
1771 break;
1772
1773 default:
1774 RRETURN(PCRE_ERROR_INTERNAL);
1775 }
1776
1777 ecode += 3;
1778 }
1779 break;
1780
1781 /* Match an extended Unicode sequence. We will get here only if the support
1782 is in the binary; otherwise a compile-time error occurs. */
1783
1784 case OP_EXTUNI:
1785 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1786 GETCHARINCTEST(c, eptr);
1787 {
1788 int category = UCD_CATEGORY(c);
1789 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1790 while (eptr < md->end_subject)
1791 {
1792 int len = 1;
1793 if (!utf8) c = *eptr; else
1794 {
1795 GETCHARLEN(c, eptr, len);
1796 }
1797 category = UCD_CATEGORY(c);
1798 if (category != ucp_M) break;
1799 eptr += len;
1800 }
1801 }
1802 ecode++;
1803 break;
1804 #endif
1805
1806
1807 /* Match a back reference, possibly repeatedly. Look past the end of the
1808 item to see if there is repeat information following. The code is similar
1809 to that for character classes, but repeated for efficiency. Then obey
1810 similar code to character type repeats - written out again for speed.
1811 However, if the referenced string is the empty string, always treat
1812 it as matched, any number of times (otherwise there could be infinite
1813 loops). */
1814
1815 case OP_REF:
1816 {
1817 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1818 ecode += 3;
1819
1820 /* If the reference is unset, there are two possibilities:
1821
1822 (a) In the default, Perl-compatible state, set the length to be longer
1823 than the amount of subject left; this ensures that every attempt at a
1824 match fails. We can't just fail here, because of the possibility of
1825 quantifiers with zero minima.
1826
1827 (b) If the JavaScript compatibility flag is set, set the length to zero
1828 so that the back reference matches an empty string.
1829
1830 Otherwise, set the length to the length of what was matched by the
1831 referenced subpattern. */
1832
1833 if (offset >= offset_top || md->offset_vector[offset] < 0)
1834 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1835 else
1836 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1837
1838 /* Set up for repetition, or handle the non-repeated case */
1839
1840 switch (*ecode)
1841 {
1842 case OP_CRSTAR:
1843 case OP_CRMINSTAR:
1844 case OP_CRPLUS:
1845 case OP_CRMINPLUS:
1846 case OP_CRQUERY:
1847 case OP_CRMINQUERY:
1848 c = *ecode++ - OP_CRSTAR;
1849 minimize = (c & 1) != 0;
1850 min = rep_min[c]; /* Pick up values from tables; */
1851 max = rep_max[c]; /* zero for max => infinity */
1852 if (max == 0) max = INT_MAX;
1853 break;
1854
1855 case OP_CRRANGE:
1856 case OP_CRMINRANGE:
1857 minimize = (*ecode == OP_CRMINRANGE);
1858 min = GET2(ecode, 1);
1859 max = GET2(ecode, 3);
1860 if (max == 0) max = INT_MAX;
1861 ecode += 5;
1862 break;
1863
1864 default: /* No repeat follows */
1865 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1866 eptr += length;
1867 continue; /* With the main loop */
1868 }
1869
1870 /* If the length of the reference is zero, just continue with the
1871 main loop. */
1872
1873 if (length == 0) continue;
1874
1875 /* First, ensure the minimum number of matches are present. We get back
1876 the length of the reference string explicitly rather than passing the
1877 address of eptr, so that eptr can be a register variable. */
1878
1879 for (i = 1; i <= min; i++)
1880 {
1881 if (!match_ref(offset, eptr, length, md, ims))
1882 {
1883 CHECK_PARTIAL();
1884 RRETURN(MATCH_NOMATCH);
1885 }
1886 eptr += length;
1887 }
1888
1889 /* If min = max, continue at the same level without recursion.
1890 They are not both allowed to be zero. */
1891
1892 if (min == max) continue;
1893
1894 /* If minimizing, keep trying and advancing the pointer */
1895
1896 if (minimize)
1897 {
1898 for (fi = min;; fi++)
1899 {
1900 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1901 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1902 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1903 {
1904 CHECK_PARTIAL();
1905 RRETURN(MATCH_NOMATCH);
1906 }
1907 eptr += length;
1908 }
1909 /* Control never gets here */
1910 }
1911
1912 /* If maximizing, find the longest string and work backwards */
1913
1914 else
1915 {
1916 pp = eptr;
1917 for (i = min; i < max; i++)
1918 {
1919 if (!match_ref(offset, eptr, length, md, ims)) break;
1920 eptr += length;
1921 }
1922 CHECK_PARTIAL();
1923 while (eptr >= pp)
1924 {
1925 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1926 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1927 eptr -= length;
1928 }
1929 RRETURN(MATCH_NOMATCH);
1930 }
1931 }
1932 /* Control never gets here */
1933
1934
1935
1936 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1937 used when all the characters in the class have values in the range 0-255,
1938 and either the matching is caseful, or the characters are in the range
1939 0-127 when UTF-8 processing is enabled. The only difference between
1940 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1941 encountered.
1942
1943 First, look past the end of the item to see if there is repeat information
1944 following. Then obey similar code to character type repeats - written out
1945 again for speed. */
1946
1947 case OP_NCLASS:
1948 case OP_CLASS:
1949 {
1950 data = ecode + 1; /* Save for matching */
1951 ecode += 33; /* Advance past the item */
1952
1953 switch (*ecode)
1954 {
1955 case OP_CRSTAR:
1956 case OP_CRMINSTAR:
1957 case OP_CRPLUS:
1958 case OP_CRMINPLUS:
1959 case OP_CRQUERY:
1960 case OP_CRMINQUERY:
1961 c = *ecode++ - OP_CRSTAR;
1962 minimize = (c & 1) != 0;
1963 min = rep_min[c]; /* Pick up values from tables; */
1964 max = rep_max[c]; /* zero for max => infinity */
1965 if (max == 0) max = INT_MAX;
1966 break;
1967
1968 case OP_CRRANGE:
1969 case OP_CRMINRANGE:
1970 minimize = (*ecode == OP_CRMINRANGE);
1971 min = GET2(ecode, 1);
1972 max = GET2(ecode, 3);
1973 if (max == 0) max = INT_MAX;
1974 ecode += 5;
1975 break;
1976
1977 default: /* No repeat follows */
1978 min = max = 1;
1979 break;
1980 }
1981
1982 /* First, ensure the minimum number of matches are present. */
1983
1984 #ifdef SUPPORT_UTF8
1985 /* UTF-8 mode */
1986 if (utf8)
1987 {
1988 for (i = 1; i <= min; i++)
1989 {
1990 if (eptr >= md->end_subject)
1991 {
1992 CHECK_PARTIAL();
1993 RRETURN(MATCH_NOMATCH);
1994 }
1995 GETCHARINC(c, eptr);
1996 if (c > 255)
1997 {
1998 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1999 }
2000 else
2001 {
2002 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2003 }
2004 }
2005 }
2006 else
2007 #endif
2008 /* Not UTF-8 mode */
2009 {
2010 for (i = 1; i <= min; i++)
2011 {
2012 if (eptr >= md->end_subject)
2013 {
2014 CHECK_PARTIAL();
2015 RRETURN(MATCH_NOMATCH);
2016 }
2017 c = *eptr++;
2018 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2019 }
2020 }
2021
2022 /* If max == min we can continue with the main loop without the
2023 need to recurse. */
2024
2025 if (min == max) continue;
2026
2027 /* If minimizing, keep testing the rest of the expression and advancing
2028 the pointer while it matches the class. */
2029
2030 if (minimize)
2031 {
2032 #ifdef SUPPORT_UTF8
2033 /* UTF-8 mode */
2034 if (utf8)
2035 {
2036 for (fi = min;; fi++)
2037 {
2038 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2039 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2040 if (fi >= max)
2041 {
2042 CHECK_PARTIAL();
2043 RRETURN(MATCH_NOMATCH);
2044 }
2045 if (eptr >= md->end_subject)
2046 {
2047 SCHECK_PARTIAL();
2048 RRETURN(MATCH_NOMATCH);
2049 }
2050 GETCHARINC(c, eptr);
2051 if (c > 255)
2052 {
2053 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2054 }
2055 else
2056 {
2057 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2058 }
2059 }
2060 }
2061 else
2062 #endif
2063 /* Not UTF-8 mode */
2064 {
2065 for (fi = min;; fi++)
2066 {
2067 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2068 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2069 if (fi >= max)
2070 {
2071 CHECK_PARTIAL();
2072 RRETURN(MATCH_NOMATCH);
2073 }
2074 if (eptr >= md->end_subject)
2075 {
2076 SCHECK_PARTIAL();
2077 RRETURN(MATCH_NOMATCH);
2078 }
2079 c = *eptr++;
2080 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2081 }
2082 }
2083 /* Control never gets here */
2084 }
2085
2086 /* If maximizing, find the longest possible run, then work backwards. */
2087
2088 else
2089 {
2090 pp = eptr;
2091
2092 #ifdef SUPPORT_UTF8
2093 /* UTF-8 mode */
2094 if (utf8)
2095 {
2096 for (i = min; i < max; i++)
2097 {
2098 int len = 1;
2099 if (eptr >= md->end_subject) break;
2100 GETCHARLEN(c, eptr, len);
2101 if (c > 255)
2102 {
2103 if (op == OP_CLASS) break;
2104 }
2105 else
2106 {
2107 if ((data[c/8] & (1 << (c&7))) == 0) break;
2108 }
2109 eptr += len;
2110 }
2111 CHECK_PARTIAL();
2112 for (;;)
2113 {
2114 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2115 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2116 if (eptr-- == pp) break; /* Stop if tried at original pos */
2117 BACKCHAR(eptr);
2118 }
2119 }
2120 else
2121 #endif
2122 /* Not UTF-8 mode */
2123 {
2124 for (i = min; i < max; i++)
2125 {
2126 if (eptr >= md->end_subject) break;
2127 c = *eptr;
2128 if ((data[c/8] & (1 << (c&7))) == 0) break;
2129 eptr++;
2130 }
2131 CHECK_PARTIAL();
2132 while (eptr >= pp)
2133 {
2134 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2135 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2136 eptr--;
2137 }
2138 }
2139
2140 RRETURN(MATCH_NOMATCH);
2141 }
2142 }
2143 /* Control never gets here */
2144
2145
2146 /* Match an extended character class. This opcode is encountered only
2147 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2148 mode, because Unicode properties are supported in non-UTF-8 mode. */
2149
2150 #ifdef SUPPORT_UTF8
2151 case OP_XCLASS:
2152 {
2153 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2154 ecode += GET(ecode, 1); /* Advance past the item */
2155
2156 switch (*ecode)
2157 {
2158 case OP_CRSTAR:
2159 case OP_CRMINSTAR:
2160 case OP_CRPLUS:
2161 case OP_CRMINPLUS:
2162 case OP_CRQUERY:
2163 case OP_CRMINQUERY:
2164 c = *ecode++ - OP_CRSTAR;
2165 minimize = (c & 1) != 0;
2166 min = rep_min[c]; /* Pick up values from tables; */
2167 max = rep_max[c]; /* zero for max => infinity */
2168 if (max == 0) max = INT_MAX;
2169 break;
2170
2171 case OP_CRRANGE:
2172 case OP_CRMINRANGE:
2173 minimize = (*ecode == OP_CRMINRANGE);
2174 min = GET2(ecode, 1);
2175 max = GET2(ecode, 3);
2176 if (max == 0) max = INT_MAX;
2177 ecode += 5;
2178 break;
2179
2180 default: /* No repeat follows */
2181 min = max = 1;
2182 break;
2183 }
2184
2185 /* First, ensure the minimum number of matches are present. */
2186
2187 for (i = 1; i <= min; i++)
2188 {
2189 if (eptr >= md->end_subject)
2190 {
2191 SCHECK_PARTIAL();
2192 RRETURN(MATCH_NOMATCH);
2193 }
2194 GETCHARINCTEST(c, eptr);
2195 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2196 }
2197
2198 /* If max == min we can continue with the main loop without the
2199 need to recurse. */
2200
2201 if (min == max) continue;
2202
2203 /* If minimizing, keep testing the rest of the expression and advancing
2204 the pointer while it matches the class. */
2205
2206 if (minimize)
2207 {
2208 for (fi = min;; fi++)
2209 {
2210 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2211 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2212 if (fi >= max)
2213 {
2214 CHECK_PARTIAL();
2215 RRETURN(MATCH_NOMATCH);
2216 }
2217 if (eptr >= md->end_subject)
2218 {
2219 SCHECK_PARTIAL();
2220 RRETURN(MATCH_NOMATCH);
2221 }
2222 GETCHARINCTEST(c, eptr);
2223 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2224 }
2225 /* Control never gets here */
2226 }
2227
2228 /* If maximizing, find the longest possible run, then work backwards. */
2229
2230 else
2231 {
2232 pp = eptr;
2233 for (i = min; i < max; i++)
2234 {
2235 int len = 1;
2236 if (eptr >= md->end_subject) break;
2237 GETCHARLENTEST(c, eptr, len);
2238 if (!_pcre_xclass(c, data)) break;
2239 eptr += len;
2240 }
2241 CHECK_PARTIAL();
2242 for(;;)
2243 {
2244 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2245 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2246 if (eptr-- == pp) break; /* Stop if tried at original pos */
2247 if (utf8) BACKCHAR(eptr);
2248 }
2249 RRETURN(MATCH_NOMATCH);
2250 }
2251
2252 /* Control never gets here */
2253 }
2254 #endif /* End of XCLASS */
2255
2256 /* Match a single character, casefully */
2257
2258 case OP_CHAR:
2259 #ifdef SUPPORT_UTF8
2260 if (utf8)
2261 {
2262 length = 1;
2263 ecode++;
2264 GETCHARLEN(fc, ecode, length);
2265 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2266 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2267 }
2268 else
2269 #endif
2270
2271 /* Non-UTF-8 mode */
2272 {
2273 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2274 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2275 ecode += 2;
2276 }
2277 break;
2278
2279 /* Match a single character, caselessly */
2280
2281 case OP_CHARNC:
2282 #ifdef SUPPORT_UTF8
2283 if (utf8)
2284 {
2285 length = 1;
2286 ecode++;
2287 GETCHARLEN(fc, ecode, length);
2288
2289 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2290
2291 /* If the pattern character's value is < 128, we have only one byte, and
2292 can use the fast lookup table. */
2293
2294 if (fc < 128)
2295 {
2296 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2297 }
2298
2299 /* Otherwise we must pick up the subject character */
2300
2301 else
2302 {
2303 unsigned int dc;
2304 GETCHARINC(dc, eptr);
2305 ecode += length;
2306
2307 /* If we have Unicode property support, we can use it to test the other
2308 case of the character, if there is one. */
2309
2310 if (fc != dc)
2311 {
2312 #ifdef SUPPORT_UCP
2313 if (dc != UCD_OTHERCASE(fc))
2314 #endif
2315 RRETURN(MATCH_NOMATCH);
2316 }
2317 }
2318 }
2319 else
2320 #endif /* SUPPORT_UTF8 */
2321
2322 /* Non-UTF-8 mode */
2323 {
2324 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2325 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2326 ecode += 2;
2327 }
2328 break;
2329
2330 /* Match a single character repeatedly. */
2331
2332 case OP_EXACT:
2333 min = max = GET2(ecode, 1);
2334 ecode += 3;
2335 goto REPEATCHAR;
2336
2337 case OP_POSUPTO:
2338 possessive = TRUE;
2339 /* Fall through */
2340
2341 case OP_UPTO:
2342 case OP_MINUPTO:
2343 min = 0;
2344 max = GET2(ecode, 1);
2345 minimize = *ecode == OP_MINUPTO;
2346 ecode += 3;
2347 goto REPEATCHAR;
2348
2349 case OP_POSSTAR:
2350 possessive = TRUE;
2351 min = 0;
2352 max = INT_MAX;
2353 ecode++;
2354 goto REPEATCHAR;
2355
2356 case OP_POSPLUS:
2357 possessive = TRUE;
2358 min = 1;
2359 max = INT_MAX;
2360 ecode++;
2361 goto REPEATCHAR;
2362
2363 case OP_POSQUERY:
2364 possessive = TRUE;
2365 min = 0;
2366 max = 1;
2367 ecode++;
2368 goto REPEATCHAR;
2369
2370 case OP_STAR:
2371 case OP_MINSTAR:
2372 case OP_PLUS:
2373 case OP_MINPLUS:
2374 case OP_QUERY:
2375 case OP_MINQUERY:
2376 c = *ecode++ - OP_STAR;
2377 minimize = (c & 1) != 0;
2378 min = rep_min[c]; /* Pick up values from tables; */
2379 max = rep_max[c]; /* zero for max => infinity */
2380 if (max == 0) max = INT_MAX;
2381
2382 /* Common code for all repeated single-character matches. */
2383
2384 REPEATCHAR:
2385 #ifdef SUPPORT_UTF8
2386 if (utf8)
2387 {
2388 length = 1;
2389 charptr = ecode;
2390 GETCHARLEN(fc, ecode, length);
2391 ecode += length;
2392
2393 /* Handle multibyte character matching specially here. There is
2394 support for caseless matching if UCP support is present. */
2395
2396 if (length > 1)
2397 {
2398 #ifdef SUPPORT_UCP
2399 unsigned int othercase;
2400 if ((ims & PCRE_CASELESS) != 0 &&
2401 (othercase = UCD_OTHERCASE(fc)) != fc)
2402 oclength = _pcre_ord2utf8(othercase, occhars);
2403 else oclength = 0;
2404 #endif /* SUPPORT_UCP */
2405
2406 for (i = 1; i <= min; i++)
2407 {
2408 if (eptr <= md->end_subject - length &&
2409 memcmp(eptr, charptr, length) == 0) eptr += length;
2410 #ifdef SUPPORT_UCP
2411 else if (oclength > 0 &&
2412 eptr <= md->end_subject - oclength &&
2413 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2414 #endif /* SUPPORT_UCP */
2415 else
2416 {
2417 CHECK_PARTIAL();
2418 RRETURN(MATCH_NOMATCH);
2419 }
2420 }
2421
2422 if (min == max) continue;
2423
2424 if (minimize)
2425 {
2426 for (fi = min;; fi++)
2427 {
2428 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2429 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2430 if (fi >= max)
2431 {
2432 CHECK_PARTIAL();
2433 RRETURN(MATCH_NOMATCH);
2434 }
2435 if (eptr <= md->end_subject - length &&
2436 memcmp(eptr, charptr, length) == 0) eptr += length;
2437 #ifdef SUPPORT_UCP
2438 else if (oclength > 0 &&
2439 eptr <= md->end_subject - oclength &&
2440 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2441 #endif /* SUPPORT_UCP */
2442 else
2443 {
2444 CHECK_PARTIAL();
2445 RRETURN(MATCH_NOMATCH);
2446 }
2447 }
2448 /* Control never gets here */
2449 }
2450
2451 else /* Maximize */
2452 {
2453 pp = eptr;
2454 for (i = min; i < max; i++)
2455 {
2456 if (eptr <= md->end_subject - length &&
2457 memcmp(eptr, charptr, length) == 0) eptr += length;
2458 #ifdef SUPPORT_UCP
2459 else if (oclength > 0 &&
2460 eptr <= md->end_subject - oclength &&
2461 memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2462 #endif /* SUPPORT_UCP */
2463 else break;
2464 }
2465
2466 CHECK_PARTIAL();
2467 if (possessive) continue;
2468
2469 for(;;)
2470 {
2471 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2472 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2473 if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
2474 #ifdef SUPPORT_UCP
2475 eptr--;
2476 BACKCHAR(eptr);
2477 #else /* without SUPPORT_UCP */
2478 eptr -= length;
2479 #endif /* SUPPORT_UCP */
2480 }
2481 }
2482 /* Control never gets here */
2483 }
2484
2485 /* If the length of a UTF-8 character is 1, we fall through here, and
2486 obey the code as for non-UTF-8 characters below, though in this case the
2487 value of fc will always be < 128. */
2488 }
2489 else
2490 #endif /* SUPPORT_UTF8 */
2491
2492 /* When not in UTF-8 mode, load a single-byte character. */
2493
2494 fc = *ecode++;
2495
2496 /* The value of fc at this point is always less than 256, though we may or
2497 may not be in UTF-8 mode. The code is duplicated for the caseless and
2498 caseful cases, for speed, since matching characters is likely to be quite
2499 common. First, ensure the minimum number of matches are present. If min =
2500 max, continue at the same level without recursing. Otherwise, if
2501 minimizing, keep trying the rest of the expression and advancing one
2502 matching character if failing, up to the maximum. Alternatively, if
2503 maximizing, find the maximum number of characters and work backwards. */
2504
2505 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2506 max, eptr));
2507
2508 if ((ims & PCRE_CASELESS) != 0)
2509 {
2510 fc = md->lcc[fc];
2511 for (i = 1; i <= min; i++)
2512 {
2513 if (eptr >= md->end_subject)
2514 {
2515 SCHECK_PARTIAL();
2516 RRETURN(MATCH_NOMATCH);
2517 }
2518 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2519 }
2520 if (min == max) continue;
2521 if (minimize)
2522 {
2523 for (fi = min;; fi++)
2524 {
2525 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2526 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2527 if (fi >= max)
2528 {
2529 CHECK_PARTIAL();
2530 RRETURN(MATCH_NOMATCH);
2531 }
2532 if (eptr >= md->end_subject)
2533 {
2534 SCHECK_PARTIAL();
2535 RRETURN(MATCH_NOMATCH);
2536 }
2537 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2538 }
2539 /* Control never gets here */
2540 }
2541 else /* Maximize */
2542 {
2543 pp = eptr;
2544 for (i = min; i < max; i++)
2545 {
2546 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2547 eptr++;
2548 }
2549
2550 CHECK_PARTIAL();
2551 if (possessive) continue;
2552
2553 while (eptr >= pp)
2554 {
2555 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2556 eptr--;
2557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2558 }
2559 RRETURN(MATCH_NOMATCH);
2560 }
2561 /* Control never gets here */
2562 }
2563
2564 /* Caseful comparisons (includes all multi-byte characters) */
2565
2566 else
2567 {
2568 for (i = 1; i <= min; i++)
2569 {
2570 if (eptr >= md->end_subject)
2571 {
2572 SCHECK_PARTIAL();
2573 RRETURN(MATCH_NOMATCH);
2574 }
2575 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2576 }
2577 if (min == max) continue;
2578 if (minimize)
2579 {
2580 for (fi = min;; fi++)
2581 {
2582 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2583 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2584 if (fi >= max)
2585 {
2586 CHECK_PARTIAL();
2587 RRETURN(MATCH_NOMATCH);
2588 }
2589 if (eptr >= md->end_subject)
2590 {
2591 SCHECK_PARTIAL();
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2595 }
2596 /* Control never gets here */
2597 }
2598 else /* Maximize */
2599 {
2600 pp = eptr;
2601 for (i = min; i < max; i++)
2602 {
2603 if (eptr >= md->end_subject || fc != *eptr) break;
2604 eptr++;
2605 }
2606 CHECK_PARTIAL();
2607 if (possessive) continue;
2608 while (eptr >= pp)
2609 {
2610 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2611 eptr--;
2612 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2613 }
2614 RRETURN(MATCH_NOMATCH);
2615 }
2616 }
2617 /* Control never gets here */
2618
2619 /* Match a negated single one-byte character. The character we are
2620 checking can be multibyte. */
2621
2622 case OP_NOT:
2623 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2624 ecode++;
2625 GETCHARINCTEST(c, eptr);
2626 if ((ims & PCRE_CASELESS) != 0)
2627 {
2628 #ifdef SUPPORT_UTF8
2629 if (c < 256)
2630 #endif
2631 c = md->lcc[c];
2632 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2633 }
2634 else
2635 {
2636 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2637 }
2638 break;
2639
2640 /* Match a negated single one-byte character repeatedly. This is almost a
2641 repeat of the code for a repeated single character, but I haven't found a
2642 nice way of commoning these up that doesn't require a test of the
2643 positive/negative option for each character match. Maybe that wouldn't add
2644 very much to the time taken, but character matching *is* what this is all
2645 about... */
2646
2647 case OP_NOTEXACT:
2648 min = max = GET2(ecode, 1);
2649 ecode += 3;
2650 goto REPEATNOTCHAR;
2651
2652 case OP_NOTUPTO:
2653 case OP_NOTMINUPTO:
2654 min = 0;
2655 max = GET2(ecode, 1);
2656 minimize = *ecode == OP_NOTMINUPTO;
2657 ecode += 3;
2658 goto REPEATNOTCHAR;
2659
2660 case OP_NOTPOSSTAR:
2661 possessive = TRUE;
2662 min = 0;
2663 max = INT_MAX;
2664 ecode++;
2665 goto REPEATNOTCHAR;
2666
2667 case OP_NOTPOSPLUS:
2668 possessive = TRUE;
2669 min = 1;
2670 max = INT_MAX;
2671 ecode++;
2672 goto REPEATNOTCHAR;
2673
2674 case OP_NOTPOSQUERY:
2675 possessive = TRUE;
2676 min = 0;
2677 max = 1;
2678 ecode++;
2679 goto REPEATNOTCHAR;
2680
2681 case OP_NOTPOSUPTO:
2682 possessive = TRUE;
2683 min = 0;
2684 max = GET2(ecode, 1);
2685 ecode += 3;
2686 goto REPEATNOTCHAR;
2687
2688 case OP_NOTSTAR:
2689 case OP_NOTMINSTAR:
2690 case OP_NOTPLUS:
2691 case OP_NOTMINPLUS:
2692 case OP_NOTQUERY:
2693 case OP_NOTMINQUERY:
2694 c = *ecode++ - OP_NOTSTAR;
2695 minimize = (c & 1) != 0;
2696 min = rep_min[c]; /* Pick up values from tables; */
2697 max = rep_max[c]; /* zero for max => infinity */
2698 if (max == 0) max = INT_MAX;
2699
2700 /* Common code for all repeated single-byte matches. */
2701
2702 REPEATNOTCHAR:
2703 fc = *ecode++;
2704
2705 /* The code is duplicated for the caseless and caseful cases, for speed,
2706 since matching characters is likely to be quite common. First, ensure the
2707 minimum number of matches are present. If min = max, continue at the same
2708 level without recursing. Otherwise, if minimizing, keep trying the rest of
2709 the expression and advancing one matching character if failing, up to the
2710 maximum. Alternatively, if maximizing, find the maximum number of
2711 characters and work backwards. */
2712
2713 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2714 max, eptr));
2715
2716 if ((ims & PCRE_CASELESS) != 0)
2717 {
2718 fc = md->lcc[fc];
2719
2720 #ifdef SUPPORT_UTF8
2721 /* UTF-8 mode */
2722 if (utf8)
2723 {
2724 register unsigned int d;
2725 for (i = 1; i <= min; i++)
2726 {
2727 if (eptr >= md->end_subject)
2728 {
2729 SCHECK_PARTIAL();
2730 RRETURN(MATCH_NOMATCH);
2731 }
2732 GETCHARINC(d, eptr);
2733 if (d < 256) d = md->lcc[d];
2734 if (fc == d) RRETURN(MATCH_NOMATCH);
2735 }
2736 }
2737 else
2738 #endif
2739
2740 /* Not UTF-8 mode */
2741 {
2742 for (i = 1; i <= min; i++)
2743 {
2744 if (eptr >= md->end_subject)
2745 {
2746 SCHECK_PARTIAL();
2747 RRETURN(MATCH_NOMATCH);
2748 }
2749 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2750 }
2751 }
2752
2753 if (min == max) continue;
2754
2755 if (minimize)
2756 {
2757 #ifdef SUPPORT_UTF8
2758 /* UTF-8 mode */
2759 if (utf8)
2760 {
2761 register unsigned int d;
2762 for (fi = min;; fi++)
2763 {
2764 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2765 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2766 if (fi >= max)
2767 {
2768 CHECK_PARTIAL();
2769 RRETURN(MATCH_NOMATCH);
2770 }
2771 if (eptr >= md->end_subject)
2772 {
2773 SCHECK_PARTIAL();
2774 RRETURN(MATCH_NOMATCH);
2775 }
2776 GETCHARINC(d, eptr);
2777 if (d < 256) d = md->lcc[d];
2778 if (fc == d) RRETURN(MATCH_NOMATCH);
2779 }
2780 }
2781 else
2782 #endif
2783 /* Not UTF-8 mode */
2784 {
2785 for (fi = min;; fi++)
2786 {
2787 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2788 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2789 if (fi >= max)
2790 {
2791 CHECK_PARTIAL();
2792 RRETURN(MATCH_NOMATCH);
2793 }
2794 if (eptr >= md->end_subject)
2795 {
2796 SCHECK_PARTIAL();
2797 RRETURN(MATCH_NOMATCH);
2798 }
2799 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2800 }
2801 }
2802 /* Control never gets here */
2803 }
2804
2805 /* Maximize case */
2806
2807 else
2808 {
2809 pp = eptr;
2810
2811 #ifdef SUPPORT_UTF8
2812 /* UTF-8 mode */
2813 if (utf8)
2814 {
2815 register unsigned int d;
2816 for (i = min; i < max; i++)
2817 {
2818 int len = 1;
2819 if (eptr >= md->end_subject) break;
2820 GETCHARLEN(d, eptr, len);
2821 if (d < 256) d = md->lcc[d];
2822 if (fc == d) break;
2823 eptr += len;
2824 }
2825 CHECK_PARTIAL();
2826 if (possessive) continue;
2827 for(;;)
2828 {
2829 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2830 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2831 if (eptr-- == pp) break; /* Stop if tried at original pos */
2832 BACKCHAR(eptr);
2833 }
2834 }
2835 else
2836 #endif
2837 /* Not UTF-8 mode */
2838 {
2839 for (i = min; i < max; i++)
2840 {
2841 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2842 eptr++;
2843 }
2844 CHECK_PARTIAL();
2845 if (possessive) continue;
2846 while (eptr >= pp)
2847 {
2848 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2849 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2850 eptr--;
2851 }
2852 }
2853
2854 RRETURN(MATCH_NOMATCH);
2855 }
2856 /* Control never gets here */
2857 }
2858
2859 /* Caseful comparisons */
2860
2861 else
2862 {
2863 #ifdef SUPPORT_UTF8
2864 /* UTF-8 mode */
2865 if (utf8)
2866 {
2867 register unsigned int d;
2868 for (i = 1; i <= min; i++)
2869 {
2870 if (eptr >= md->end_subject)
2871 {
2872 SCHECK_PARTIAL();
2873 RRETURN(MATCH_NOMATCH);
2874 }
2875 GETCHARINC(d, eptr);
2876 if (fc == d) RRETURN(MATCH_NOMATCH);
2877 }
2878 }
2879 else
2880 #endif
2881 /* Not UTF-8 mode */
2882 {
2883 for (i = 1; i <= min; i++)
2884 {
2885 if (eptr >= md->end_subject)
2886 {
2887 SCHECK_PARTIAL();
2888 RRETURN(MATCH_NOMATCH);
2889 }
2890 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2891 }
2892 }
2893
2894 if (min == max) continue;
2895
2896 if (minimize)
2897 {
2898 #ifdef SUPPORT_UTF8
2899 /* UTF-8 mode */
2900 if (utf8)
2901 {
2902 register unsigned int d;
2903 for (fi = min;; fi++)
2904 {
2905 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2906 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2907 if (fi >= max)
2908 {
2909 CHECK_PARTIAL();
2910 RRETURN(MATCH_NOMATCH);
2911 }
2912 if (eptr >= md->end_subject)
2913 {
2914 SCHECK_PARTIAL();
2915 RRETURN(MATCH_NOMATCH);
2916 }
2917 GETCHARINC(d, eptr);
2918 if (fc == d) RRETURN(MATCH_NOMATCH);
2919 }
2920 }
2921 else
2922 #endif
2923 /* Not UTF-8 mode */
2924 {
2925 for (fi = min;; fi++)
2926 {
2927 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2928 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2929 if (fi >= max)
2930 {
2931 CHECK_PARTIAL();
2932 RRETURN(MATCH_NOMATCH);
2933 }
2934 if (eptr >= md->end_subject)
2935 {
2936 SCHECK_PARTIAL();
2937 RRETURN(MATCH_NOMATCH);
2938 }
2939 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2940 }
2941 }
2942 /* Control never gets here */
2943 }
2944
2945 /* Maximize case */
2946
2947 else
2948 {
2949 pp = eptr;
2950
2951 #ifdef SUPPORT_UTF8
2952 /* UTF-8 mode */
2953 if (utf8)
2954 {
2955 register unsigned int d;
2956 for (i = min; i < max; i++)
2957 {
2958 int len = 1;
2959 if (eptr >= md->end_subject) break;
2960 GETCHARLEN(d, eptr, len);
2961 if (fc == d) break;
2962 eptr += len;
2963 }
2964 CHECK_PARTIAL();
2965 if (possessive) continue;
2966 for(;;)
2967 {
2968 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2970 if (eptr-- == pp) break; /* Stop if tried at original pos */
2971 BACKCHAR(eptr);
2972 }
2973 }
2974 else
2975 #endif
2976 /* Not UTF-8 mode */
2977 {
2978 for (i = min; i < max; i++)
2979 {
2980 if (eptr >= md->end_subject || fc == *eptr) break;
2981 eptr++;
2982 }
2983 CHECK_PARTIAL();
2984 if (possessive) continue;
2985 while (eptr >= pp)
2986 {
2987 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2989 eptr--;
2990 }
2991 }
2992
2993 RRETURN(MATCH_NOMATCH);
2994 }
2995 }
2996 /* Control never gets here */
2997
2998 /* Match a single character type repeatedly; several different opcodes
2999 share code. This is very similar to the code for single characters, but we
3000 repeat it in the interests of efficiency. */
3001
3002 case OP_TYPEEXACT:
3003 min = max = GET2(ecode, 1);
3004 minimize = TRUE;
3005 ecode += 3;
3006 goto REPEATTYPE;
3007
3008 case OP_TYPEUPTO:
3009 case OP_TYPEMINUPTO:
3010 min = 0;
3011 max = GET2(ecode, 1);
3012 minimize = *ecode == OP_TYPEMINUPTO;
3013 ecode += 3;
3014 goto REPEATTYPE;
3015
3016 case OP_TYPEPOSSTAR:
3017 possessive = TRUE;
3018 min = 0;
3019 max = INT_MAX;
3020 ecode++;
3021 goto REPEATTYPE;
3022
3023 case OP_TYPEPOSPLUS:
3024 possessive = TRUE;
3025 min = 1;
3026 max = INT_MAX;
3027 ecode++;
3028 goto REPEATTYPE;
3029
3030 case OP_TYPEPOSQUERY:
3031 possessive = TRUE;
3032 min = 0;
3033 max = 1;
3034 ecode++;
3035 goto REPEATTYPE;
3036
3037 case OP_TYPEPOSUPTO:
3038 possessive = TRUE;
3039 min = 0;
3040 max = GET2(ecode, 1);
3041 ecode += 3;
3042 goto REPEATTYPE;
3043
3044 case OP_TYPESTAR:
3045 case OP_TYPEMINSTAR:
3046 case OP_TYPEPLUS:
3047 case OP_TYPEMINPLUS:
3048 case OP_TYPEQUERY:
3049 case OP_TYPEMINQUERY:
3050 c = *ecode++ - OP_TYPESTAR;
3051 minimize = (c & 1) != 0;
3052 min = rep_min[c]; /* Pick up values from tables; */
3053 max = rep_max[c]; /* zero for max => infinity */
3054 if (max == 0) max = INT_MAX;
3055
3056 /* Common code for all repeated single character type matches. Note that
3057 in UTF-8 mode, '.' matches a character of any length, but for the other
3058 character types, the valid characters are all one-byte long. */
3059
3060 REPEATTYPE:
3061 ctype = *ecode++; /* Code for the character type */
3062
3063 #ifdef SUPPORT_UCP
3064 if (ctype == OP_PROP || ctype == OP_NOTPROP)
3065 {
3066 prop_fail_result = ctype == OP_NOTPROP;
3067 prop_type = *ecode++;
3068 prop_value = *ecode++;
3069 }
3070 else prop_type = -1;
3071 #endif
3072
3073 /* First, ensure the minimum number of matches are present. Use inline
3074 code for maximizing the speed, and do the type test once at the start
3075 (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
3076 is tidier. Also separate the UCP code, which can be the same for both UTF-8
3077 and single-bytes. */
3078
3079 if (min > 0)
3080 {
3081 #ifdef SUPPORT_UCP
3082 if (prop_type >= 0)
3083 {
3084 switch(prop_type)
3085 {
3086 case PT_ANY:
3087 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3088 for (i = 1; i <= min; i++)
3089 {
3090 if (eptr >= md->end_subject)
3091 {
3092 SCHECK_PARTIAL();
3093 RRETURN(MATCH_NOMATCH);
3094 }
3095 GETCHARINCTEST(c, eptr);
3096 }
3097 break;
3098
3099 case PT_LAMP:
3100 for (i = 1; i <= min; i++)
3101 {
3102 if (eptr >= md->end_subject)
3103 {
3104 SCHECK_PARTIAL();
3105 RRETURN(MATCH_NOMATCH);
3106 }
3107 GETCHARINCTEST(c, eptr);
3108 prop_chartype = UCD_CHARTYPE(c);
3109 if ((prop_chartype == ucp_Lu ||
3110 prop_chartype == ucp_Ll ||
3111 prop_chartype == ucp_Lt) == prop_fail_result)
3112 RRETURN(MATCH_NOMATCH);
3113 }
3114 break;
3115
3116 case PT_GC:
3117 for (i = 1; i <= min; i++)
3118 {
3119 if (eptr >= md->end_subject)
3120 {
3121 SCHECK_PARTIAL();
3122 RRETURN(MATCH_NOMATCH);
3123 }
3124 GETCHARINCTEST(c, eptr);
3125 prop_category = UCD_CATEGORY(c);
3126 if ((prop_category == prop_value) == prop_fail_result)
3127 RRETURN(MATCH_NOMATCH);
3128 }
3129 break;
3130
3131 case PT_PC:
3132 for (i = 1; i <= min; i++)
3133 {
3134 if (eptr >= md->end_subject)
3135 {
3136 SCHECK_PARTIAL();
3137 RRETURN(MATCH_NOMATCH);
3138 }
3139 GETCHARINCTEST(c, eptr);
3140 prop_chartype = UCD_CHARTYPE(c);
3141 if ((prop_chartype == prop_value) == prop_fail_result)
3142 RRETURN(MATCH_NOMATCH);
3143 }
3144 break;
3145
3146 case PT_SC:
3147 for (i = 1; i <= min; i++)
3148 {
3149 if (eptr >= md->end_subject)
3150 {
3151 SCHECK_PARTIAL();
3152 RRETURN(MATCH_NOMATCH);
3153 }
3154 GETCHARINCTEST(c, eptr);
3155 prop_script = UCD_SCRIPT(c);
3156 if ((prop_script == prop_value) == prop_fail_result)
3157 RRETURN(MATCH_NOMATCH);
3158 }
3159 break;
3160
3161 default:
3162 RRETURN(PCRE_ERROR_INTERNAL);
3163 }
3164 }
3165
3166 /* Match extended Unicode sequences. We will get here only if the
3167 support is in the binary; otherwise a compile-time error occurs. */
3168
3169 else if (ctype == OP_EXTUNI)
3170 {
3171 for (i = 1; i <= min; i++)
3172 {
3173 if (eptr >= md->end_subject)
3174 {
3175 SCHECK_PARTIAL();
3176 RRETURN(MATCH_NOMATCH);
3177 }
3178 GETCHARINCTEST(c, eptr);
3179 prop_category = UCD_CATEGORY(c);
3180 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3181 while (eptr < md->end_subject)
3182 {
3183 int len = 1;
3184 if (!utf8) c = *eptr;
3185 else { GETCHARLEN(c, eptr, len); }
3186 prop_category = UCD_CATEGORY(c);
3187 if (prop_category != ucp_M) break;
3188 eptr += len;
3189 }
3190 }
3191 }
3192
3193 else
3194 #endif /* SUPPORT_UCP */
3195
3196 /* Handle all other cases when the coding is UTF-8 */
3197
3198 #ifdef SUPPORT_UTF8
3199 if (utf8) switch(ctype)
3200 {
3201 case OP_ANY:
3202 for (i = 1; i <= min; i++)
3203 {
3204 if (eptr >= md->end_subject)
3205 {
3206 SCHECK_PARTIAL();
3207 RRETURN(MATCH_NOMATCH);
3208 }
3209 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3210 eptr++;
3211 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3212 }
3213 break;
3214
3215 case OP_ALLANY:
3216 for (i = 1; i <= min; i++)
3217 {
3218 if (eptr >= md->end_subject)
3219 {
3220 SCHECK_PARTIAL();
3221 RRETURN(MATCH_NOMATCH);
3222 }
3223 eptr++;
3224 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3225 }
3226 break;
3227
3228 case OP_ANYBYTE:
3229 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3230 eptr += min;
3231 break;
3232
3233 case OP_ANYNL:
3234 for (i = 1; i <= min; i++)
3235 {
3236 if (eptr >= md->end_subject)
3237 {
3238 SCHECK_PARTIAL();
3239 RRETURN(MATCH_NOMATCH);
3240 }
3241 GETCHARINC(c, eptr);
3242 switch(c)
3243 {
3244 default: RRETURN(MATCH_NOMATCH);
3245 case 0x000d:
3246 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3247 break;
3248
3249 case 0x000a:
3250 break;
3251
3252 case 0x000b:
3253 case 0x000c:
3254 case 0x0085:
3255 case 0x2028:
3256 case 0x2029:
3257 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3258 break;
3259 }
3260 }
3261 break;
3262
3263 case OP_NOT_HSPACE:
3264 for (i = 1; i <= min; i++)
3265 {
3266 if (eptr >= md->end_subject)
3267 {
3268 SCHECK_PARTIAL();
3269 RRETURN(MATCH_NOMATCH);
3270 }
3271 GETCHARINC(c, eptr);
3272 switch(c)
3273 {
3274 default: break;
3275 case 0x09: /* HT */
3276 case 0x20: /* SPACE */
3277 case 0xa0: /* NBSP */
3278 case 0x1680: /* OGHAM SPACE MARK */
3279 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3280 case 0x2000: /* EN QUAD */
3281 case 0x2001: /* EM QUAD */
3282 case 0x2002: /* EN SPACE */
3283 case 0x2003: /* EM SPACE */
3284 case 0x2004: /* THREE-PER-EM SPACE */
3285 case 0x2005: /* FOUR-PER-EM SPACE */
3286 case 0x2006: /* SIX-PER-EM SPACE */
3287 case 0x2007: /* FIGURE SPACE */
3288 case 0x2008: /* PUNCTUATION SPACE */
3289 case 0x2009: /* THIN SPACE */
3290 case 0x200A: /* HAIR SPACE */
3291 case 0x202f: /* NARROW NO-BREAK SPACE */
3292 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3293 case 0x3000: /* IDEOGRAPHIC SPACE */
3294 RRETURN(MATCH_NOMATCH);
3295 }
3296 }
3297 break;
3298
3299 case OP_HSPACE:
3300 for (i = 1; i <= min; i++)
3301 {
3302 if (eptr >= md->end_subject)
3303 {
3304 SCHECK_PARTIAL();
3305 RRETURN(MATCH_NOMATCH);
3306 }
3307 GETCHARINC(c, eptr);
3308 switch(c)
3309 {
3310 default: RRETURN(MATCH_NOMATCH);
3311 case 0x09: /* HT */
3312 case 0x20: /* SPACE */
3313 case 0xa0: /* NBSP */
3314 case 0x1680: /* OGHAM SPACE MARK */
3315 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3316 case 0x2000: /* EN QUAD */
3317 case 0x2001: /* EM QUAD */
3318 case 0x2002: /* EN SPACE */
3319 case 0x2003: /* EM SPACE */
3320 case 0x2004: /* THREE-PER-EM SPACE */
3321 case 0x2005: /* FOUR-PER-EM SPACE */
3322 case 0x2006: /* SIX-PER-EM SPACE */
3323 case 0x2007: /* FIGURE SPACE */
3324 case 0x2008: /* PUNCTUATION SPACE */
3325 case 0x2009: /* THIN SPACE */
3326 case 0x200A: /* HAIR SPACE */
3327 case 0x202f: /* NARROW NO-BREAK SPACE */
3328 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3329 case 0x3000: /* IDEOGRAPHIC SPACE */
3330 break;
3331 }
3332 }
3333 break;
3334
3335 case OP_NOT_VSPACE:
3336 for (i = 1; i <= min; i++)
3337 {
3338 if (eptr >= md->end_subject)
3339 {
3340 SCHECK_PARTIAL();
3341 RRETURN(MATCH_NOMATCH);
3342 }
3343 GETCHARINC(c, eptr);
3344 switch(c)
3345 {
3346 default: break;
3347 case 0x0a: /* LF */
3348 case 0x0b: /* VT */
3349 case 0x0c: /* FF */
3350 case 0x0d: /* CR */
3351 case 0x85: /* NEL */
3352 case 0x2028: /* LINE SEPARATOR */
3353 case 0x2029: /* PARAGRAPH SEPARATOR */
3354 RRETURN(MATCH_NOMATCH);
3355 }
3356 }
3357 break;
3358
3359 case OP_VSPACE:
3360 for (i = 1; i <= min; i++)
3361 {
3362 if (eptr >= md->end_subject)
3363 {
3364 SCHECK_PARTIAL();
3365 RRETURN(MATCH_NOMATCH);
3366 }
3367 GETCHARINC(c, eptr);
3368 switch(c)
3369 {
3370 default: RRETURN(MATCH_NOMATCH);
3371 case 0x0a: /* LF */
3372 case 0x0b: /* VT */
3373 case 0x0c: /* FF */
3374 case 0x0d: /* CR */
3375 case 0x85: /* NEL */
3376 case 0x2028: /* LINE SEPARATOR */
3377 case 0x2029: /* PARAGRAPH SEPARATOR */
3378 break;
3379 }
3380 }
3381 break;
3382
3383 case OP_NOT_DIGIT:
3384 for (i = 1; i <= min; i++)
3385 {
3386 if (eptr >= md->end_subject)
3387 {
3388 SCHECK_PARTIAL();
3389 RRETURN(MATCH_NOMATCH);
3390 }
3391 GETCHARINC(c, eptr);
3392 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3393 RRETURN(MATCH_NOMATCH);
3394 }
3395 break;
3396
3397 case OP_DIGIT:
3398 for (i = 1; i <= min; i++)
3399 {
3400 if (eptr >= md->end_subject)
3401 {
3402 SCHECK_PARTIAL();
3403 RRETURN(MATCH_NOMATCH);
3404 }
3405 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3406 RRETURN(MATCH_NOMATCH);
3407 /* No need to skip more bytes - we know it's a 1-byte character */
3408 }
3409 break;
3410
3411 case OP_NOT_WHITESPACE:
3412 for (i = 1; i <= min; i++)
3413 {
3414 if (eptr >= md->end_subject)
3415 {
3416 SCHECK_PARTIAL();
3417 RRETURN(MATCH_NOMATCH);
3418 }
3419 if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3420 RRETURN(MATCH_NOMATCH);
3421 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3422 }
3423 break;
3424
3425 case OP_WHITESPACE:
3426 for (i = 1; i <= min; i++)
3427 {
3428 if (eptr >= md->end_subject)
3429 {
3430 SCHECK_PARTIAL();
3431 RRETURN(MATCH_NOMATCH);
3432 }
3433 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3434 RRETURN(MATCH_NOMATCH);
3435 /* No need to skip more bytes - we know it's a 1-byte character */
3436 }
3437 break;
3438
3439 case OP_NOT_WORDCHAR:
3440 for (i = 1; i <= min; i++)
3441 {
3442 if (eptr >= md->end_subject ||
3443 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3444 RRETURN(MATCH_NOMATCH);
3445 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3446 }
3447 break;
3448
3449 case OP_WORDCHAR:
3450 for (i = 1; i <= min; i++)
3451 {
3452 if (eptr >= md->end_subject)
3453 {
3454 SCHECK_PARTIAL();
3455 RRETURN(MATCH_NOMATCH);
3456 }
3457 if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3458 RRETURN(MATCH_NOMATCH);
3459 /* No need to skip more bytes - we know it's a 1-byte character */
3460 }
3461 break;
3462
3463 default:
3464 RRETURN(PCRE_ERROR_INTERNAL);
3465 } /* End switch(ctype) */
3466
3467 else
3468 #endif /* SUPPORT_UTF8 */
3469
3470 /* Code for the non-UTF-8 case for minimum matching of operators other
3471 than OP_PROP and OP_NOTPROP. */
3472
3473 switch(ctype)
3474 {
3475 case OP_ANY:
3476 for (i = 1; i <= min; i++)
3477 {
3478 if (eptr >= md->end_subject)
3479 {
3480 SCHECK_PARTIAL();
3481 RRETURN(MATCH_NOMATCH);
3482 }
3483 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3484 eptr++;
3485 }
3486 break;
3487
3488 case OP_ALLANY:
3489 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3490 eptr += min;
3491 break;
3492
3493 case OP_ANYBYTE:
3494 if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
3495 eptr += min;
3496 break;
3497
3498 case OP_ANYNL:
3499 for (i = 1; i <= min; i++)
3500 {
3501 if (eptr >= md->end_subject)
3502 {
3503 SCHECK_PARTIAL();
3504 RRETURN(MATCH_NOMATCH);
3505 }
3506 switch(*eptr++)
3507 {
3508 default: RRETURN(MATCH_NOMATCH);
3509 case 0x000d:
3510 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3511 break;
3512 case 0x000a:
3513 break;
3514
3515 case 0x000b:
3516 case 0x000c:
3517 case 0x0085:
3518 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3519 break;
3520 }
3521 }
3522 break;
3523
3524 case OP_NOT_HSPACE:
3525 for (i = 1; i <= min; i++)
3526 {
3527 if (eptr >= md->end_subject)
3528 {
3529 SCHECK_PARTIAL();
3530 RRETURN(MATCH_NOMATCH);
3531 }
3532 switch(*eptr++)
3533 {
3534 default: break;
3535 case 0x09: /* HT */
3536 case 0x20: /* SPACE */
3537 case 0xa0: /* NBSP */
3538 RRETURN(MATCH_NOMATCH);
3539 }
3540 }
3541 break;
3542
3543 case OP_HSPACE:
3544 for (i = 1; i <= min; i++)
3545 {
3546 if (eptr >= md->end_subject)
3547 {
3548 SCHECK_PARTIAL();
3549 RRETURN(MATCH_NOMATCH);
3550 }
3551 switch(*eptr++)
3552 {
3553 default: RRETURN(MATCH_NOMATCH);
3554 case 0x09: /* HT */
3555 case 0x20: /* SPACE */
3556 case 0xa0: /* NBSP */
3557 break;
3558 }
3559 }
3560 break;
3561
3562 case OP_NOT_VSPACE:
3563 for (i = 1; i <= min; i++)
3564 {
3565 if (eptr >= md->end_subject)
3566 {
3567 SCHECK_PARTIAL();
3568 RRETURN(MATCH_NOMATCH);
3569 }
3570 switch(*eptr++)
3571 {
3572 default: break;
3573 case 0x0a: /* LF */
3574 case 0x0b: /* VT */
3575 case 0x0c: /* FF */
3576 case 0x0d: /* CR */
3577 case 0x85: /* NEL */
3578 RRETURN(MATCH_NOMATCH);
3579 }
3580 }
3581 break;
3582
3583 case OP_VSPACE:
3584 for (i = 1; i <= min; i++)
3585 {
3586 if (eptr >= md->end_subject)
3587 {
3588 SCHECK_PARTIAL();
3589 RRETURN(MATCH_NOMATCH);
3590 }
3591 switch(*eptr++)
3592 {
3593 default: RRETURN(MATCH_NOMATCH);
3594 case 0x0a: /* LF */
3595 case 0x0b: /* VT */
3596 case 0x0c: /* FF */
3597 case 0x0d: /* CR */
3598 case 0x85: /* NEL */
3599 break;
3600 }
3601 }
3602 break;
3603
3604 case OP_NOT_DIGIT:
3605 for (i = 1; i <= min; i++)
3606 {
3607 if (eptr >= md->end_subject)
3608 {
3609 SCHECK_PARTIAL();
3610 RRETURN(MATCH_NOMATCH);
3611 }
3612 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3613 }
3614 break;
3615
3616 case OP_DIGIT:
3617 for (i = 1; i <= min; i++)
3618 {
3619 if (eptr >= md->end_subject)
3620 {
3621 SCHECK_PARTIAL();
3622 RRETURN(MATCH_NOMATCH);
3623 }
3624 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3625 }
3626 break;
3627
3628 case OP_NOT_WHITESPACE:
3629 for (i = 1; i <= min; i++)
3630 {
3631 if (eptr >= md->end_subject)
3632 {
3633 SCHECK_PARTIAL();
3634 RRETURN(MATCH_NOMATCH);
3635 }
3636 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3637 }
3638 break;
3639
3640 case OP_WHITESPACE:
3641 for (i = 1; i <= min; i++)
3642 {
3643 if (eptr >= md->end_subject)
3644 {
3645 SCHECK_PARTIAL();
3646 RRETURN(MATCH_NOMATCH);
3647 }
3648 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3649 }
3650 break;
3651
3652 case OP_NOT_WORDCHAR:
3653 for (i = 1; i <= min; i++)
3654 {
3655 if (eptr >= md->end_subject)
3656 {
3657 SCHECK_PARTIAL();
3658 RRETURN(MATCH_NOMATCH);
3659 }
3660 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3661 RRETURN(MATCH_NOMATCH);
3662 }
3663 break;
3664
3665 case OP_WORDCHAR:
3666 for (i = 1; i <= min; i++)
3667 {
3668 if (eptr >= md->end_subject)
3669 {
3670 SCHECK_PARTIAL();
3671 RRETURN(MATCH_NOMATCH);
3672 }
3673 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3674 RRETURN(MATCH_NOMATCH);
3675 }
3676 break;
3677
3678 default:
3679 RRETURN(PCRE_ERROR_INTERNAL);
3680 }
3681 }
3682
3683 /* If min = max, continue at the same level without recursing */
3684
3685 if (min == max) continue;
3686
3687 /* If minimizing, we have to test the rest of the pattern before each
3688 subsequent match. Again, separate the UTF-8 case for speed, and also
3689 separate the UCP cases. */
3690
3691 if (minimize)
3692 {
3693 #ifdef SUPPORT_UCP
3694 if (prop_type >= 0)
3695 {
3696 switch(prop_type)
3697 {
3698 case PT_ANY:
3699 for (fi = min;; fi++)
3700 {
3701 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3702 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3703 if (fi >= max)
3704 {
3705 CHECK_PARTIAL();
3706 RRETURN(MATCH_NOMATCH);
3707 }
3708 if (eptr >= md->end_subject)
3709 {
3710 SCHECK_PARTIAL();
3711 RRETURN(MATCH_NOMATCH);
3712 }
3713 GETCHARINC(c, eptr);
3714 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3715 }
3716 /* Control never gets here */
3717
3718 case PT_LAMP:
3719 for (fi = min;; fi++)
3720 {
3721 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3722 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3723 if (fi >= max)
3724 {
3725 CHECK_PARTIAL();
3726 RRETURN(MATCH_NOMATCH);
3727 }
3728 if (eptr >= md->end_subject)
3729 {
3730 SCHECK_PARTIAL();
3731 RRETURN(MATCH_NOMATCH);
3732 }
3733 GETCHARINC(c, eptr);
3734 prop_chartype = UCD_CHARTYPE(c);
3735 if ((prop_chartype == ucp_Lu ||
3736 prop_chartype == ucp_Ll ||
3737 prop_chartype == ucp_Lt) == prop_fail_result)
3738 RRETURN(MATCH_NOMATCH);
3739 }
3740 /* Control never gets here */
3741
3742 case PT_GC:
3743 for (fi = min;; fi++)
3744 {
3745 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3746 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3747 if (fi >= max)
3748 {
3749 CHECK_PARTIAL();
3750 RRETURN(MATCH_NOMATCH);
3751 }
3752 if (eptr >= md->end_subject)
3753 {
3754 SCHECK_PARTIAL();
3755 RRETURN(MATCH_NOMATCH);
3756 }
3757 GETCHARINC(c, eptr);
3758 prop_category = UCD_CATEGORY(c);
3759 if ((prop_category == prop_value) == prop_fail_result)
3760 RRETURN(MATCH_NOMATCH);
3761 }
3762 /* Control never gets here */
3763
3764 case PT_PC:
3765 for (fi = min;; fi++)
3766 {
3767 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3768 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3769 if (fi >= max)
3770 {
3771 CHECK_PARTIAL();
3772 RRETURN(MATCH_NOMATCH);
3773 }
3774 if (eptr >= md->end_subject)
3775 {
3776 SCHECK_PARTIAL();
3777 RRETURN(MATCH_NOMATCH);
3778 }
3779 GETCHARINC(c, eptr);
3780 prop_chartype = UCD_CHARTYPE(c);
3781 if ((prop_chartype == prop_value) == prop_fail_result)
3782 RRETURN(MATCH_NOMATCH);
3783 }
3784 /* Control never gets here */
3785
3786 case PT_SC:
3787 for (fi = min;; fi++)
3788 {
3789 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3790 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3791 if (fi >= max)
3792 {
3793 CHECK_PARTIAL();
3794 RRETURN(MATCH_NOMATCH);
3795 }
3796 if (eptr >= md->end_subject)
3797 {
3798 SCHECK_PARTIAL();
3799 RRETURN(MATCH_NOMATCH);
3800 }
3801 GETCHARINC(c, eptr);
3802 prop_script = UCD_SCRIPT(c);
3803 if ((prop_script == prop_value) == prop_fail_result)
3804 RRETURN(MATCH_NOMATCH);
3805 }
3806 /* Control never gets here */
3807
3808 default:
3809 RRETURN(PCRE_ERROR_INTERNAL);
3810 }
3811 }
3812
3813 /* Match extended Unicode sequences. We will get here only if the
3814 support is in the binary; otherwise a compile-time error occurs. */
3815
3816 else if (ctype == OP_EXTUNI)
3817 {
3818 for (fi = min;; fi++)
3819 {
3820 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3821 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3822 if (fi >= max)
3823 {
3824 CHECK_PARTIAL();
3825 RRETURN(MATCH_NOMATCH);
3826 }
3827 if (eptr >= md->end_subject)
3828 {
3829 SCHECK_PARTIAL();
3830 RRETURN(MATCH_NOMATCH);
3831 }
3832 GETCHARINCTEST(c, eptr);
3833 prop_category = UCD_CATEGORY(c);
3834 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3835 while (eptr < md->end_subject)
3836 {
3837 int len = 1;
3838 if (!utf8) c = *eptr;
3839 else { GETCHARLEN(c, eptr, len); }
3840 prop_category = UCD_CATEGORY(c);
3841 if (prop_category != ucp_M) break;
3842 eptr += len;
3843 }
3844 }
3845 }
3846
3847 else
3848 #endif /* SUPPORT_UCP */
3849
3850 #ifdef SUPPORT_UTF8
3851 /* UTF-8 mode */
3852 if (utf8)
3853 {
3854 for (fi = min;; fi++)
3855 {
3856 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3857 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3858 if (fi >= max)
3859 {
3860 CHECK_PARTIAL();
3861 RRETURN(MATCH_NOMATCH);
3862 }
3863 if (eptr >= md->end_subject)
3864 {
3865 SCHECK_PARTIAL();
3866 RRETURN(MATCH_NOMATCH);
3867 }
3868 if (ctype == OP_ANY && IS_NEWLINE(eptr))
3869 RRETURN(MATCH_NOMATCH);
3870 GETCHARINC(c, eptr);
3871 switch(ctype)
3872 {
3873 case OP_ANY: /* This is the non-NL case */
3874 case OP_ALLANY:
3875 case OP_ANYBYTE:
3876 break;
3877
3878 case OP_ANYNL:
3879 switch(c)
3880 {
3881 default: RRETURN(MATCH_NOMATCH);
3882 case 0x000d:
3883 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3884 break;
3885 case 0x000a:
3886 break;
3887
3888 case 0x000b:
3889 case 0x000c:
3890 case 0x0085:
3891 case 0x2028:
3892 case 0x2029:
3893 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3894 break;
3895 }
3896 break;
3897
3898 case OP_NOT_HSPACE:
3899 switch(c)
3900 {
3901 default: break;
3902 case 0x09: /* HT */
3903 case 0x20: /* SPACE */
3904 case 0xa0: /* NBSP */
3905 case 0x1680: /* OGHAM SPACE MARK */
3906 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3907 case 0x2000: /* EN QUAD */
3908 case 0x2001: /* EM QUAD */
3909 case 0x2002: /* EN SPACE */
3910 case 0x2003: /* EM SPACE */
3911 case 0x2004: /* THREE-PER-EM SPACE */
3912 case 0x2005: /* FOUR-PER-EM SPACE */
3913 case 0x2006: /* SIX-PER-EM SPACE */
3914 case 0x2007: /* FIGURE SPACE */
3915 case 0x2008: /* PUNCTUATION SPACE */
3916 case 0x2009: /* THIN SPACE */
3917 case 0x200A: /* HAIR SPACE */
3918 case 0x202f: /* NARROW NO-BREAK SPACE */
3919 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3920 case 0x3000: /* IDEOGRAPHIC SPACE */
3921 RRETURN(MATCH_NOMATCH);
3922 }
3923 break;
3924
3925 case OP_HSPACE:
3926 switch(c)
3927 {
3928 default: RRETURN(MATCH_NOMATCH);
3929 case 0x09: /* HT */
3930 case 0x20: /* SPACE */
3931 case 0xa0: /* NBSP */
3932 case 0x1680: /* OGHAM SPACE MARK */
3933 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3934 case 0x2000: /* EN QUAD */
3935 case 0x2001: /* EM QUAD */
3936 case 0x2002: /* EN SPACE */
3937 case 0x2003: /* EM SPACE */
3938 case 0x2004: /* THREE-PER-EM SPACE */
3939 case 0x2005: /* FOUR-PER-EM SPACE */
3940 case 0x2006: /* SIX-PER-EM SPACE */
3941 case 0x2007: /* FIGURE SPACE */
3942 case 0x2008: /* PUNCTUATION SPACE */
3943 case 0x2009: /* THIN SPACE */
3944 case 0x200A: /* HAIR SPACE */
3945 case 0x202f: /* NARROW NO-BREAK SPACE */
3946 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3947 case 0x3000: /* IDEOGRAPHIC SPACE */
3948 break;
3949 }
3950 break;
3951
3952 case OP_NOT_VSPACE:
3953 switch(c)
3954 {
3955 default: break;
3956 case 0x0a: /* LF */
3957 case 0x0b: /* VT */
3958 case 0x0c: /* FF */
3959 case 0x0d: /* CR */
3960 case 0x85: /* NEL */
3961 case 0x2028: /* LINE SEPARATOR */
3962 case 0x2029: /* PARAGRAPH SEPARATOR */
3963 RRETURN(MATCH_NOMATCH);
3964 }
3965 break;
3966
3967 case OP_VSPACE:
3968 switch(c)
3969 {
3970 default: RRETURN(MATCH_NOMATCH);
3971 case 0x0a: /* LF */
3972 case 0x0b: /* VT */
3973 case 0x0c: /* FF */
3974 case 0x0d: /* CR */
3975 case 0x85: /* NEL */
3976 case 0x2028: /* LINE SEPARATOR */
3977 case 0x2029: /* PARAGRAPH SEPARATOR */
3978 break;
3979 }
3980 break;
3981
3982 case OP_NOT_DIGIT:
3983 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3984 RRETURN(MATCH_NOMATCH);
3985 break;
3986
3987 case OP_DIGIT:
3988 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3989 RRETURN(MATCH_NOMATCH);
3990 break;
3991
3992 case OP_NOT_WHITESPACE:
3993 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3994 RRETURN(MATCH_NOMATCH);
3995 break;
3996
3997 case OP_WHITESPACE:
3998 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3999 RRETURN(MATCH_NOMATCH);
4000 break;
4001
4002 case OP_NOT_WORDCHAR:
4003 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
4004 RRETURN(MATCH_NOMATCH);
4005 break;
4006
4007 case OP_WORDCHAR:
4008 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
4009 RRETURN(MATCH_NOMATCH);
4010 break;
4011
4012 default:
4013 RRETURN(PCRE_ERROR_INTERNAL);
4014 }
4015 }
4016 }
4017 else
4018 #endif
4019 /* Not UTF-8 mode */
4020 {
4021 for (fi = min;; fi++)
4022 {
4023 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4025 if (fi >= max)
4026 {
4027 CHECK_PARTIAL();
4028 RRETURN(MATCH_NOMATCH);
4029 }
4030 if (eptr >= md->end_subject)
4031 {
4032 SCHECK_PARTIAL();
4033 RRETURN(MATCH_NOMATCH);
4034 }
4035 if (ctype == OP_ANY && IS_NEWLINE(eptr))
4036 RRETURN(MATCH_NOMATCH);
4037 c = *eptr++;
4038 switch(ctype)
4039 {
4040 case OP_ANY: /* This is the non-NL case */
4041 case OP_ALLANY:
4042 case OP_ANYBYTE:
4043 break;
4044
4045 case OP_ANYNL:
4046 switch(c)
4047 {
4048 default: RRETURN(MATCH_NOMATCH);
4049 case 0x000d:
4050 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4051 break;
4052
4053 case 0x000a:
4054 break;
4055
4056 case 0x000b:
4057 case 0x000c:
4058 case 0x0085:
4059 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
4060 break;
4061 }
4062 break;
4063
4064 case OP_NOT_HSPACE:
4065 switch(c)
4066 {
4067 default: break;
4068 case 0x09: /* HT */
4069 case 0x20: /* SPACE */
4070 case 0xa0: /* NBSP */
4071 RRETURN(MATCH_NOMATCH);
4072 }
4073 break;
4074
4075 case OP_HSPACE:
4076 switch(c)
4077 {
4078 default: RRETURN(MATCH_NOMATCH);
4079 case 0x09: /* HT */
4080 case 0x20: /* SPACE */
4081 case 0xa0: /* NBSP */
4082 break;
4083 }
4084 break;
4085
4086 case OP_NOT_VSPACE:
4087 switch(c)
4088 {
4089 default: break;
4090 case 0x0a: /* LF */
4091 case 0x0b: /* VT */
4092 case 0x0c: /* FF */
4093 case 0x0d: /* CR */
4094 case 0x85: /* NEL */
4095 RRETURN(MATCH_NOMATCH);
4096 }
4097 break;
4098
4099 case OP_VSPACE:
4100 switch(c)
4101 {
4102 default: RRETURN(MATCH_NOMATCH);
4103 case 0x0a: /* LF */
4104 case 0x0b: /* VT */
4105 case 0x0c: /* FF */
4106 case 0x0d: /* CR */
4107 case 0x85: /* NEL */
4108 break;
4109 }
4110 break;
4111
4112 case OP_NOT_DIGIT:
4113 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
4114 break;
4115
4116 case OP_DIGIT:
4117 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
4118 break;
4119
4120 case OP_NOT_WHITESPACE:
4121 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
4122 break;
4123
4124 case OP_WHITESPACE:
4125 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
4126 break;
4127
4128 case OP_NOT_WORDCHAR:
4129 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
4130 break;
4131
4132 case OP_WORDCHAR:
4133 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
4134 break;
4135
4136 default:
4137 RRETURN(PCRE_ERROR_INTERNAL);
4138 }
4139 }
4140 }
4141 /* Control never gets here */
4142 }
4143
4144 /* If maximizing, it is worth using inline code for speed, doing the type
4145 test once at the start (i.e. keep it out of the loop). Again, keep the
4146 UTF-8 and UCP stuff separate. */
4147
4148 else
4149 {
4150 pp = eptr; /* Remember where we started */
4151
4152 #ifdef SUPPORT_UCP
4153 if (prop_type >= 0)
4154 {
4155 switch(prop_type)
4156 {
4157 case PT_ANY:
4158 for (i = min; i < max; i++)
4159 {
4160 int len = 1;
4161 if (eptr >= md->end_subject) break;
4162 GETCHARLEN(c, eptr, len);
4163 if (prop_fail_result) break;
4164 eptr+= len;
4165 }
4166 break;
4167
4168 case PT_LAMP:
4169 for (i = min; i < max; i++)
4170 {
4171 int len = 1;
4172 if (eptr >= md->end_subject) break;
4173 GETCHARLEN(c, eptr, len);
4174 prop_chartype = UCD_CHARTYPE(c);
4175 if ((prop_chartype == ucp_Lu ||
4176 prop_chartype == ucp_Ll ||
4177 prop_chartype == ucp_Lt) == prop_fail_result)
4178 break;
4179 eptr+= len;
4180 }
4181 break;
4182
4183 case PT_GC:
4184 for (i = min; i < max; i++)
4185 {
4186 int len = 1;
4187 if (eptr >= md->end_subject) break;
4188 GETCHARLEN(c, eptr, len);
4189 prop_category = UCD_CATEGORY(c);
4190 if ((prop_category == prop_value) == prop_fail_result)
4191 break;
4192 eptr+= len;
4193 }
4194 break;
4195
4196 case PT_PC:
4197 for (i = min; i < max; i++)
4198 {
4199 int len = 1;
4200 if (eptr >= md->end_subject) break;
4201 GETCHARLEN(c, eptr, len);
4202 prop_chartype = UCD_CHARTYPE(c);
4203 if ((prop_chartype == prop_value) == prop_fail_result)
4204 break;
4205 eptr+= len;
4206 }
4207 break;
4208
4209 case PT_SC:
4210 for (i = min; i < max; i++)
4211 {
4212 int len = 1;
4213 if (eptr >= md->end_subject) break;
4214 GETCHARLEN(c, eptr, len);
4215 prop_script = UCD_SCRIPT(c);
4216 if ((prop_script == prop_value) == prop_fail_result)
4217 break;
4218 eptr+= len;
4219 }
4220 break;
4221 }
4222
4223 /* eptr is now past the end of the maximum run */
4224
4225 CHECK_PARTIAL();
4226 if (possessive) continue;
4227 for(;;)
4228 {
4229 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4230 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4231 if (eptr-- == pp) break; /* Stop if tried at original pos */
4232 if (utf8) BACKCHAR(eptr);
4233 }
4234 }
4235
4236 /* Match extended Unicode sequences. We will get here only if the
4237 support is in the binary; otherwise a compile-time error occurs. */
4238
4239 else if (ctype == OP_EXTUNI)
4240 {
4241 for (i = min; i < max; i++)
4242 {
4243 if (eptr >= md->end_subject) break;
4244 GETCHARINCTEST(c, eptr);
4245 prop_category = UCD_CATEGORY(c);
4246 if (prop_category == ucp_M) break;
4247 while (eptr < md->end_subject)
4248 {
4249 int len = 1;
4250 if (!utf8) c = *eptr; else
4251 {
4252 GETCHARLEN(c, eptr, len);
4253 }
4254 prop_category = UCD_CATEGORY(c);
4255 if (prop_category != ucp_M) break;
4256 eptr += len;
4257 }
4258 }
4259
4260 /* eptr is now past the end of the maximum run */
4261
4262 CHECK_PARTIAL();
4263 if (possessive) continue;
4264 for(;;)
4265 {
4266 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4267 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4268 if (eptr-- == pp) break; /* Stop if tried at original pos */
4269 for (;;) /* Move back over one extended */
4270 {
4271 int len = 1;
4272 if (!utf8) c = *eptr; else
4273 {
4274 BACKCHAR(eptr);
4275 GETCHARLEN(c, eptr, len);
4276 }
4277 prop_category = UCD_CATEGORY(c);
4278 if (prop_category != ucp_M) break;
4279 eptr--;
4280 }
4281 }
4282 }
4283
4284 else
4285 #endif /* SUPPORT_UCP */
4286
4287 #ifdef SUPPORT_UTF8
4288 /* UTF-8 mode */
4289
4290 if (utf8)
4291 {
4292 switch(ctype)
4293 {
4294 case OP_ANY:
4295 if (max < INT_MAX)
4296 {
4297 for (i = min; i < max; i++)
4298 {
4299 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4300 eptr++;
4301 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4302 }
4303 }
4304
4305 /* Handle unlimited UTF-8 repeat */
4306
4307 else
4308 {
4309 for (i = min; i < max; i++)
4310 {
4311 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4312 eptr++;
4313 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4314 }
4315 }
4316 break;
4317
4318 case OP_ALLANY:
4319 if (max < INT_MAX)
4320 {
4321 for (i = min; i < max; i++)
4322 {
4323 if (eptr >= md->end_subject) break;
4324 eptr++;
4325 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4326 }
4327 }
4328 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
4329 break;
4330
4331 /* The byte case is the same as non-UTF8 */
4332
4333 case OP_ANYBYTE:
4334 c = max - min;
4335 if (c > (unsigned int)(md->end_subject - eptr))
4336 c = md->end_subject - eptr;
4337 eptr += c;
4338 break;
4339
4340 case OP_ANYNL:
4341 for (i = min; i < max; i++)
4342 {
4343 int len = 1;
4344 if (eptr >= md->end_subject) break;
4345 GETCHARLEN(c, eptr, len);
4346 if (c == 0x000d)
4347 {
4348 if (++eptr >= md->end_subject) break;
4349 if (*eptr == 0x000a) eptr++;
4350 }
4351 else
4352 {
4353 if (c != 0x000a &&
4354 (md->bsr_anycrlf ||
4355 (c != 0x000b && c != 0x000c &&
4356 c != 0x0085 && c != 0x2028 && c != 0x2029)))
4357 break;
4358 eptr += len;
4359 }
4360 }
4361 break;
4362
4363 case OP_NOT_HSPACE:
4364 case OP_HSPACE:
4365 for (i = min; i < max; i++)
4366 {
4367 BOOL gotspace;
4368 int len = 1;
4369 if (eptr >= md->end_subject) break;
4370 GETCHARLEN(c, eptr, len);
4371 switch(c)
4372 {
4373 default: gotspace = FALSE; break;
4374 case 0x09: /* HT */
4375 case 0x20: /* SPACE */
4376 case 0xa0: /* NBSP */
4377 case 0x1680: /* OGHAM SPACE MARK */
4378 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4379 case 0x2000: /* EN QUAD */
4380 case 0x2001: /* EM QUAD */
4381 case 0x2002: /* EN SPACE */
4382 case 0x2003: /* EM SPACE */
4383 case 0x2004: /* THREE-PER-EM SPACE */
4384 case 0x2005: /* FOUR-PER-EM SPACE */
4385 case 0x2006: /* SIX-PER-EM SPACE */
4386 case 0x2007: /* FIGURE SPACE */
4387 case 0x2008: /* PUNCTUATION SPACE */
4388 case 0x2009: /* THIN SPACE */
4389 case 0x200A: /* HAIR SPACE */
4390 case 0x202f: /* NARROW NO-BREAK SPACE */
4391 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4392 case 0x3000: /* IDEOGRAPHIC SPACE */
4393 gotspace = TRUE;
4394 break;
4395 }
4396 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4397 eptr += len;
4398 }
4399 break;
4400
4401 case OP_NOT_VSPACE:
4402 case OP_VSPACE:
4403 for (i = min; i < max; i++)
4404 {
4405 BOOL gotspace;
4406 int len = 1;
4407 if (eptr >= md->end_subject) break;
4408 GETCHARLEN(c, eptr, len);
4409 switch(c)
4410 {
4411 default: gotspace = FALSE; break;
4412 case 0x0a: /* LF */
4413 case 0x0b: /* VT */
4414 case 0x0c: /* FF */
4415 case 0x0d: /* CR */
4416 case 0x85: /* NEL */
4417 case 0x2028: /* LINE SEPARATOR */
4418 case 0x2029: /* PARAGRAPH SEPARATOR */
4419 gotspace = TRUE;
4420 break;
4421 }
4422 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4423 eptr += len;
4424 }
4425 break;
4426
4427 case OP_NOT_DIGIT:
4428 for (i = min; i < max; i++)
4429 {
4430 int len = 1;
4431 if (eptr >= md->end_subject) break;
4432 GETCHARLEN(c, eptr, len);
4433 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4434 eptr+= len;
4435 }
4436 break;
4437
4438 case OP_DIGIT:
4439 for (i = min; i < max; i++)
4440 {
4441 int len = 1;
4442 if (eptr >= md->end_subject) break;
4443 GETCHARLEN(c, eptr, len);
4444 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4445 eptr+= len;
4446 }
4447 break;
4448
4449 case OP_NOT_WHITESPACE:
4450 for (i = min; i < max; i++)
4451 {
4452 int len = 1;
4453 if (eptr >= md->end_subject) break;
4454 GETCHARLEN(c, eptr, len);
4455 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4456 eptr+= len;
4457 }
4458 break;
4459
4460 case OP_WHITESPACE:
4461 for (i = min; i < max; i++)
4462 {
4463 int len = 1;
4464 if (eptr >= md->end_subject) break;
4465 GETCHARLEN(c, eptr, len);
4466 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4467 eptr+= len;
4468 }
4469 break;
4470
4471 case OP_NOT_WORDCHAR:
4472 for (i = min; i < max; i++)
4473 {
4474 int len = 1;
4475 if (eptr >= md->end_subject) break;
4476 GETCHARLEN(c, eptr, len);
4477 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4478 eptr+= len;
4479 }
4480 break;
4481
4482 case OP_WORDCHAR:
4483 for (i = min; i < max; i++)
4484 {
4485 int len = 1;
4486 if (eptr >= md->end_subject) break;
4487 GETCHARLEN(c, eptr, len);
4488 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4489 eptr+= len;
4490 }
4491 break;
4492
4493 default:
4494 RRETURN(PCRE_ERROR_INTERNAL);
4495 }
4496
4497 /* eptr is now past the end of the maximum run */
4498
4499 CHECK_PARTIAL();
4500 if (possessive) continue;
4501 for(;;)
4502 {
4503 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4504 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4505 if (eptr-- == pp) break; /* Stop if tried at original pos */
4506 BACKCHAR(eptr);
4507 }
4508 }
4509 else
4510 #endif /* SUPPORT_UTF8 */
4511
4512 /* Not UTF-8 mode */
4513 {
4514 switch(ctype)
4515 {
4516 case OP_ANY:
4517 for (i = min; i < max; i++)
4518 {
4519 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4520 eptr++;
4521 }
4522 break;
4523
4524 case OP_ALLANY:
4525 case OP_ANYBYTE:
4526 c = max - min;
4527 if (c > (unsigned int)(md->end_subject - eptr))
4528 c = md->end_subject - eptr;
4529 eptr += c;
4530 break;
4531
4532 case OP_ANYNL:
4533 for (i = min; i < max; i++)
4534 {
4535 if (eptr >= md->end_subject) break;
4536 c = *eptr;
4537 if (c == 0x000d)
4538 {
4539 if (++eptr >= md->end_subject) break;
4540 if (*eptr == 0x000a) eptr++;
4541 }
4542 else
4543 {
4544 if (c != 0x000a &&
4545 (md->bsr_anycrlf ||
4546 (c != 0x000b && c != 0x000c && c != 0x0085)))
4547 break;
4548 eptr++;
4549 }
4550 }
4551 break;
4552
4553 case OP_NOT_HSPACE:
4554 for (i = min; i < max; i++)
4555 {
4556 if (eptr >= md->end_subject) break;
4557 c = *eptr;
4558 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4559 eptr++;
4560 }
4561 break;
4562
4563 case OP_HSPACE:
4564 for (i = min; i < max; i++)
4565 {
4566 if (eptr >= md->end_subject) break;
4567 c = *eptr;
4568 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4569 eptr++;
4570 }
4571 break;
4572
4573 case OP_NOT_VSPACE:
4574 for (i = min; i < max; i++)
4575 {
4576 if (eptr >= md->end_subject) break;
4577 c = *eptr;
4578 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4579 break;
4580 eptr++;
4581 }
4582 break;
4583
4584 case OP_VSPACE:
4585 for (i = min; i < max; i++)
4586 {
4587 if (eptr >= md->end_subject) break;
4588 c = *eptr;
4589 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4590 break;
4591 eptr++;
4592 }
4593 break;
4594
4595 case OP_NOT_DIGIT:
4596 for (i = min; i < max; i++)
4597 {
4598 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4599 break;
4600 eptr++;
4601 }
4602 break;
4603
4604 case OP_DIGIT:
4605 for (i = min; i < max; i++)
4606 {
4607 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4608 break;
4609 eptr++;
4610 }
4611 break;
4612
4613 case OP_NOT_WHITESPACE:
4614 for (i = min; i < max; i++)
4615 {
4616 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4617 break;
4618 eptr++;
4619 }
4620 break;
4621
4622 case OP_WHITESPACE:
4623 for (i = min; i < max; i++)
4624 {
4625 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4626 break;
4627 eptr++;
4628 }
4629 break;
4630
4631 case OP_NOT_WORDCHAR:
4632 for (i = min; i < max; i++)
4633 {
4634 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4635 break;
4636 eptr++;
4637 }
4638 break;
4639
4640 case OP_WORDCHAR:
4641 for (i = min; i < max; i++)
4642 {
4643 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4644 break;
4645 eptr++;
4646 }
4647 break;
4648
4649 default:
4650 RRETURN(PCRE_ERROR_INTERNAL);
4651 }
4652
4653 /* eptr is now past the end of the maximum run */
4654
4655 CHECK_PARTIAL();
4656 if (possessive) continue;
4657 while (eptr >= pp)
4658 {
4659 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4660 eptr--;
4661 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4662 }
4663 }
4664
4665 /* Get here if we can't make it match with any permitted repetitions */
4666
4667 RRETURN(MATCH_NOMATCH);
4668 }
4669 /* Control never gets here */
4670
4671 /* There's been some horrible disaster. Arrival here can only mean there is
4672 something seriously wrong in the code above or the OP_xxx definitions. */
4673
4674 default:
4675 DPRINTF(("Unknown opcode %d\n", *ecode));
4676 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4677 }
4678
4679 /* Do not stick any code in here without much thought; it is assumed
4680 that "continue" in the code above comes out to here to repeat the main
4681 loop. */
4682
4683 } /* End of main loop */
4684 /* Control never reaches here */
4685
4686
4687 /* When compiling to use the heap rather than the stack for recursive calls to
4688 match(), the RRETURN() macro jumps here. The number that is saved in
4689 frame->Xwhere indicates which label we actually want to return to. */
4690
4691 #ifdef NO_RECURSE
4692 #define LBL(val) case val: goto L_RM##val;
4693 HEAP_RETURN:
4694 switch (frame->Xwhere)
4695 {
4696 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4697 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4698 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4699 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4700 LBL(53) LBL(54)
4701 #ifdef SUPPORT_UTF8
4702 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4703 LBL(32) LBL(34) LBL(42) LBL(46)
4704 #ifdef SUPPORT_UCP
4705 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4706 #endif /* SUPPORT_UCP */
4707 #endif /* SUPPORT_UTF8 */
4708 default:
4709 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4710 return PCRE_ERROR_INTERNAL;
4711 }
4712 #undef LBL
4713 #endif /* NO_RECURSE */
4714 }
4715
4716
/***************************************************************************
****************************************************************************
                   RECURSION IN THE match() FUNCTION

The macros set up earlier in this file to support match() are removed again
here, so that the names are available as ordinary identifiers in the code that
follows. The order of the #undef directives is immaterial; they are listed
alphabetically. */

#ifdef NO_RECURSE

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef mstart
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* fc and fi exist as macros whether or not NO_RECURSE is set, so they are
removed unconditionally. */

#undef fi
#undef fc

/***************************************************************************
***************************************************************************/
4772
4773
4774
4775 /*************************************************
4776 * Execute a Regular Expression *
4777 *************************************************/
4778
4779 /* This function applies a compiled re to a subject string and picks out
4780 portions of the string if it matches. Two elements in the vector are set for
4781 each substring: the offsets to the start and end of the substring.
4782
4783 Arguments:
4784 argument_re points to the compiled expression
4785 extra_data points to extra data or is NULL
4786 subject points to the subject string
4787 length length of subject string (may contain binary zeros)
4788 start_offset where to start in the subject string
4789 options option bits
4790 offsets points to a vector of ints to be filled in with offsets
4791 offsetcount the number of elements in the vector
4792
4793 Returns: > 0 => success; value is the number of elements filled in
4794 = 0 => success, but offsets is not big enough
4795 -1 => failed to match
4796 < -1 => some kind of unexpected problem
4797 */
4798
4799 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4800 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4801 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4802 int offsetcount)
4803 {
4804 int rc, resetcount, ocount;
4805 int first_byte = -1;
4806 int req_byte = -1;
4807 int req_byte2 = -1;
4808 int newline;
4809 unsigned long int ims;
4810 BOOL using_temporary_offsets = FALSE;
4811 BOOL anchored;
4812 BOOL startline;
4813 BOOL firstline;
4814 BOOL first_byte_caseless = FALSE;
4815 BOOL req_byte_caseless = FALSE;
4816 BOOL utf8;
4817 match_data match_block;
4818 match_data *md = &match_block;
4819 const uschar *tables;
4820 const uschar *start_bits = NULL;
4821 USPTR start_match = (USPTR)subject + start_offset;
4822 USPTR end_subject;
4823 USPTR start_partial = NULL;
4824 USPTR req_byte_ptr = start_match - 1;
4825
4826 pcre_study_data internal_study;
4827 const pcre_study_data *study;
4828
4829 real_pcre internal_re;
4830 const real_pcre *external_re = (const real_pcre *)argument_re;
4831 const real_pcre *re = external_re;
4832
4833 /* Plausibility checks */
4834
4835 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4836 if (re == NULL || subject == NULL ||
4837 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4838 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4839
4840 /* Fish out the optional data from the extra_data structure, first setting
4841 the default values. */
4842
4843 study = NULL;
4844 md->match_limit = MATCH_LIMIT;
4845 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4846 md->callout_data = NULL;
4847
4848 /* The table pointer is always in native byte order. */
4849
4850 tables = external_re->tables;
4851
4852 if (extra_data != NULL)
4853 {
4854 register unsigned int flags = extra_data->flags;
4855 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4856 study = (const pcre_study_data *)extra_data->study_data;
4857 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4858 md->match_limit = extra_data->match_limit;
4859 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4860 md->match_limit_recursion = extra_data->match_limit_recursion;
4861 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4862 md->callout_data = extra_data->callout_data;
4863 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4864 }
4865
4866 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4867 is a feature that makes it possible to save compiled regex and re-use them
4868 in other programs later. */
4869
4870 if (tables == NULL) tables = _pcre_default_tables;
4871
4872 /* Check that the first field in the block is the magic number. If it is not,
4873 test for a regex that was compiled on a host of opposite endianness. If this is
4874 the case, flipped values are put in internal_re and internal_study if there was
4875 study data too. */
4876
4877 if (re->magic_number != MAGIC_NUMBER)
4878 {
4879 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4880 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4881 if (study != NULL) study = &internal_study;
4882 }
4883
4884 /* Set up other data */
4885
4886 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4887 startline = (re->flags & PCRE_STARTLINE) != 0;
4888 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4889
4890 /* The code starts after the real_pcre block and the capture name table. */
4891
4892 md->start_code = (const uschar *)external_re + re->name_table_offset +
4893 re->name_count * re->name_entry_size;
4894
4895 md->start_subject = (USPTR)subject;
4896 md->start_offset = start_offset;
4897 md->end_subject = md->start_subject + length;
4898 end_subject = md->end_subject;
4899
4900 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4901 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4902 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4903
4904 md->notbol = (options & PCRE_NOTBOL) != 0;
4905 md->noteol = (options & PCRE_NOTEOL) != 0;
4906 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4907 md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
4908 ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
4909 md->hitend = FALSE;
4910
4911 md->recursive = NULL; /* No recursion at top level */
4912
4913 md->lcc = tables + lcc_offset;
4914 md->ctypes = tables + ctypes_offset;
4915
4916 /* Handle different \R options. */
4917
4918 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4919 {
4920 case 0:
4921 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4922 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4923 else
4924 #ifdef BSR_ANYCRLF
4925 md->bsr_anycrlf = TRUE;
4926 #else
4927 md->bsr_anycrlf = FALSE;
4928 #endif
4929 break;
4930
4931 case PCRE_BSR_ANYCRLF:
4932 md->bsr_anycrlf = TRUE;
4933 break;
4934
4935 case PCRE_BSR_UNICODE:
4936 md->bsr_anycrlf = FALSE;
4937 break;
4938
4939 default: return PCRE_ERROR_BADNEWLINE;
4940 }
4941
4942 /* Handle different types of newline. The three bits give eight cases. If
4943 nothing is set at run time, whatever was used at compile time applies. */
4944
4945 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4946 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4947 {
4948 case 0: newline = NEWLINE; break; /* Compile-time default */
4949 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4950 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4951 case PCRE_NEWLINE_CR+
4952 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4953 case PCRE_NEWLINE_ANY: newline = -1; break;
4954 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4955 default: return PCRE_ERROR_BADNEWLINE;
4956 }
4957
4958 if (newline == -2)
4959 {
4960 md->nltype = NLTYPE_ANYCRLF;
4961 }
4962 else if (newline < 0)
4963 {
4964 md->nltype = NLTYPE_ANY;
4965 }
4966 else
4967 {
4968 md->nltype = NLTYPE_FIXED;
4969 if (newline > 255)
4970 {
4971 md->nllen = 2;
4972 md->nl[0] = (newline >> 8) & 255;
4973 md->nl[1] = newline & 255;
4974 }
4975 else
4976 {
4977 md->nllen = 1;
4978 md->nl[0] = newline;
4979 }
4980 }
4981
4982 /* Partial matching was originally supported only for a restricted set of
4983 regexes; from release 8.00 there are no restrictions, but the bits are still
4984 defined (though never set). So there's no harm in leaving this code. */
4985
4986 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4987 return PCRE_ERROR_BADPARTIAL;
4988
4989 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4990 back the character offset. */
4991
4992 #ifdef SUPPORT_UTF8
4993 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4994 {
4995 if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
4996 return PCRE_ERROR_BADUTF8;
4997 if (start_offset > 0 && start_offset < length)
4998 {
4999 int tb = ((USPTR)subject)[start_offset];
5000 if (tb > 127)
5001 {
5002 tb &= 0xc0;
5003 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5004 }
5005 }
5006 }
5007 #endif
5008
5009 /* The ims options can vary during the matching as a result of the presence
5010 of (?ims) items in the pattern. They are kept in a local variable so that
5011 restoring at the exit of a group is easy. */
5012
5013 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5014
5015 /* If the expression has got more back references than the offsets supplied can
5016 hold, we get a temporary chunk of working store to use during the matching.
5017 Otherwise, we can use the vector supplied, rounding down its size to a multiple
5018 of 3. */
5019
5020 ocount = offsetcount - (offsetcount % 3);
5021
5022 if (re->top_backref > 0 && re->top_backref >= ocount/3)
5023 {
5024 ocount = re->top_backref * 3 + 3;
5025 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5026 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5027 using_temporary_offsets = TRUE;
5028 DPRINTF(("Got memory to hold back references\n"));
5029 }
5030 else md->offset_vector = offsets;
5031
5032 md->offset_end = ocount;
5033 md->offset_max = (2*ocount)/3;
5034 md->offset_overflow = FALSE;
5035 md->capture_last = -1;
5036
5037 /* Compute the minimum number of offsets that we need to reset each time. Doing
5038 this makes a huge difference to execution time when there aren't many brackets
5039 in the pattern. */
5040
5041 resetcount = 2 + re->top_bracket * 2;
5042 if (resetcount > offsetcount) resetcount = ocount;
5043
5044 /* Reset the working variable associated with each extraction. These should
5045 never be used unless previously set, but they get saved and restored, and so we
5046 initialize them to avoid reading uninitialized locations. */
5047
5048 if (md->offset_vector != NULL)
5049 {
5050 register int *iptr = md->offset_vector + ocount;
5051 register int *iend = iptr - resetcount/2 + 1;
5052 while (--iptr >= iend) *iptr = -1;
5053 }
5054
5055 /* Set up the first character to match, if available. The first_byte value is
5056 never set for an anchored regular expression, but the anchoring may be forced
5057 at run time, so we have to test for anchoring. The first char may be unset for
5058 an unanchored pattern, of course. If there's no first char and the pattern was
5059 studied, there may be a bitmap of possible first characters. */
5060
5061 if (!anchored)
5062 {
5063 if ((re->flags & PCRE_FIRSTSET) != 0)
5064 {
5065 first_byte = re->first_byte & 255;
5066 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5067 first_byte = md->lcc[first_byte];
5068 }
5069 else
5070 if (!startline && study != NULL &&
5071 (study->options & PCRE_STUDY_MAPPED) != 0)
5072 start_bits = study->start_bits;
5073 }
5074
5075 /* For anchored or unanchored matches, there may be a "last known required
5076 character" set. */
5077
5078 if ((re->flags & PCRE_REQCHSET) != 0)
5079 {
5080 req_byte = re->req_byte & 255;
5081 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5082 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5083 }
5084
5085
5086 /* ==========================================================================*/
5087
5088 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5089 the loop runs just once. */
5090
5091 for(;;)
5092 {
5093 USPTR save_end_subject = end_subject;
5094 USPTR new_start_match;
5095
5096 /* Reset the maximum number of extractions we might see. */
5097
5098 if (md->offset_vector != NULL)
5099 {
5100 register int *iptr = md->offset_vector;
5101 register int *iend = iptr + resetcount;
5102 while (iptr < iend) *iptr++ = -1;
5103 }
5104
5105 /* If firstline is TRUE, the start of the match is constrained to the first
5106 line of a multiline string. That is, the match must be before or at the first
5107 newline. Implement this by temporarily adjusting end_subject so that we stop
5108 scanning at a newline. If the match fails at the newline, later code breaks
5109 this loop. */
5110
5111 if (firstline)
5112 {
5113 USPTR t = start_match;
5114 #ifdef SUPPORT_UTF8
5115 if (utf8)
5116 {
5117 while (t < md->end_subject && !IS_NEWLINE(t))
5118 {
5119 t++;
5120 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5121 }
5122 }
5123 else
5124 #endif
5125 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5126 end_subject = t;
5127 }
5128
5129 /* There are some optimizations that avoid running the match if a known
5130 starting point is not found, or if a known later character is not present.
5131 However, there is an option that disables these, for testing and for ensuring
5132 that all callouts do actually occur. */
5133
5134 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
5135 {
5136 /* Advance to a unique first byte if there is one. */
5137
5138 if (first_byte >= 0)
5139 {
5140 if (first_byte_caseless)
5141 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5142 start_match++;
5143 else
5144 while (start_match < end_subject && *start_match != first_byte)
5145 start_match++;
5146 }
5147
5148 /* Or to just after a linebreak for a multiline match */
5149
5150 else if (startline)
5151 {
5152 if (start_match > md->start_subject + start_offset)
5153 {
5154 #ifdef SUPPORT_UTF8
5155 if (utf8)
5156 {
5157 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5158 {
5159 start_match++;
5160 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5161 start_match++;
5162 }
5163 }
5164 else
5165 #endif
5166 while (start_match < end_subject && !WAS_NEWLINE(start_match))
5167 start_match++;
5168
5169 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5170 and we are now at a LF, advance the match position by one more character.
5171 */
5172
5173 if (start_match[-1] == CHAR_CR &&
5174 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5175 start_match < end_subject &&
5176 *start_match == CHAR_NL)
5177 start_match++;
5178 }
5179 }
5180
5181 /* Or to a non-unique first byte after study */
5182
5183 else if (start_bits != NULL)
5184 {
5185 while (start_match < end_subject)
5186 {
5187 register unsigned int c = *start_match;
5188 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
5189 else break;
5190 }
5191 }
5192 } /* Starting optimizations */
5193
5194 /* Restore fudged end_subject */
5195
5196 end_subject = save_end_subject;
5197
5198 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5199 printf(">>>> Match against: ");
5200 pchars(start_match, end_subject - start_match, TRUE, md);
5201 printf("\n");
5202 #endif
5203
5204 /* If req_byte is set, we know that that character must appear in the
5205 subject for the match to succeed. If the first character is set, req_byte
5206 must be later in the subject; otherwise the test starts at the match point.
5207 This optimization can save a huge amount of backtracking in patterns with
5208 nested unlimited repeats that aren't going to match. Writing separate code
5209 for cased/caseless versions makes it go faster, as does using an
5210 autoincrement and backing off on a match.
5211
5212 HOWEVER: when the subject string is very, very long, searching to its end
5213 can take a long time, and give bad performance on quite ordinary patterns.
5214 This showed up when somebody was matching something like /^\d+C/ on a
5215 32-megabyte string... so we don't do this when the string is sufficiently
5216 long.
5217
5218 ALSO: this processing is disabled when partial matching is requested, or if
5219 disabling is explicitly requested. */
5220
5221 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
5222 req_byte >= 0 &&
5223 end_subject - start_match < REQ_BYTE_MAX &&
5224 !md->partial)
5225 {
5226 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
5227
5228 /* We don't need to repeat the search if we haven't yet reached the
5229 place we found it at last time. */
5230
5231 if (p > req_byte_ptr)
5232 {
5233 if (req_byte_caseless)
5234 {
5235 while (p < end_subject)
5236 {
5237 register int pp = *p++;
5238 if (pp == req_byte || pp == req_byte2) { p--; break; }
5239 }
5240 }
5241 else
5242 {
5243 while (p < end_subject)
5244 {
5245 if (*p++ == req_byte) { p--; break; }
5246 }
5247 }
5248
5249 /* If we can't find the required character, break the matching loop,
5250 forcing a match failure. */
5251
5252 if (p >= end_subject)
5253 {
5254 rc = MATCH_NOMATCH;
5255 break;
5256 }
5257
5258 /* If we have found the required character, save the point where we
5259 found it, so that we don't search again next time round the loop if
5260 the start hasn't passed this character yet. */
5261
5262 req_byte_ptr = p;
5263 }
5264 }
5265
5266 /* OK, we can now run the match. If "hitend" is set afterwards, remember the
5267 first starting point for which a partial match was found. */
5268
5269 md->start_match_ptr = start_match;
5270 md->match_call_count = 0;
5271 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
5272 if (md->hitend && start_partial == NULL) start_partial = start_match;
5273
5274 switch(rc)
5275 {
5276 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
5277 exactly like PRUNE. */
5278
5279 case MATCH_NOMATCH:
5280 case MATCH_PRUNE:
5281 case MATCH_THEN:
5282 new_start_match = start_match + 1;
5283 #ifdef SUPPORT_UTF8
5284 if (utf8)
5285 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
5286 new_start_match++;
5287 #endif
5288 break;
5289
5290 /* SKIP passes back the next starting point explicitly. */
5291
5292 case MATCH_SKIP:
5293 new_start_match = md->start_match_ptr;
5294 break;
5295
5296 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
5297
5298 case MATCH_COMMIT:
5299 rc = MATCH_NOMATCH;
5300 goto ENDLOOP;
5301
5302 /* Any other return is some kind of error. */
5303
5304 default:
5305 goto ENDLOOP;
5306 }
5307
5308 /* Control reaches here for the various types of "no match at this point"
5309 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
5310
5311 rc = MATCH_NOMATCH;
5312
5313 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
5314 newline in the subject (though it may continue over the newline). Therefore,
5315 if we have just failed to match, starting at a newline, do not continue. */
5316
5317 if (firstline && IS_NEWLINE(start_match)) break;
5318
5319 /* Advance to new matching position */
5320
5321 start_match = new_start_match;
5322
5323 /* Break the loop if the pattern is anchored or if we have passed the end of
5324 the subject. */
5325
5326 if (anchored || start_match > end_subject) break;
5327
5328 /* If we have just passed a CR and we are now at a LF, and the pattern does
5329 not contain any explicit matches for \r or \n, and the newline option is CRLF
5330 or ANY or ANYCRLF, advance the match position by one more character. */
5331
5332 if (start_match[-1] == CHAR_CR &&
5333 start_match < end_subject &&
5334 *start_match == CHAR_NL &&
5335 (re->flags & PCRE_HASCRORLF) == 0 &&
5336 (md->nltype == NLTYPE_ANY ||
5337 md->nltype == NLTYPE_ANYCRLF ||
5338 md->nllen == 2))
5339 start_match++;
5340
5341 } /* End of for(;;) "bumpalong" loop */
5342
5343 /* ==========================================================================*/
5344
5345 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
5346 conditions is true:
5347
5348 (1) The pattern is anchored or the match was failed by (*COMMIT);
5349
5350 (2) We are past the end of the subject;
5351
5352 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
5353 this option requests that a match occur at or before the first newline in
5354 the subject.
5355
5356 When we have a match and the offset vector is big enough to deal with any
5357 backreferences, captured substring offsets will already be set up. In the case
5358 where we had to get some local store to hold offsets for backreference
5359 processing, copy those that we can. In this case there need not be overflow if
5360 certain parts of the pattern were not used, even though there are more
5361 capturing parentheses than vector slots. */
5362
5363 ENDLOOP:
5364
5365 if (rc == MATCH_MATCH)
5366 {
5367 if (using_temporary_offsets)
5368 {
5369 if (offsetcount >= 4)
5370 {
5371 memcpy(offsets + 2, md->offset_vector + 2,
5372 (offsetcount - 2) * sizeof(int));
5373 DPRINTF(("Copied offsets from temporary memory\n"));
5374 }
5375 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5376 DPRINTF(("Freeing temporary memory\n"));
5377 (pcre_free)(md->offset_vector);
5378 }
5379
5380 /* Set the return code to the number of captured strings, or 0 if there are
5381 too many to fit into the vector. */
5382
5383 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5384
5385 /* If there is space, set up the whole thing as substring 0. The value of
5386 md->start_match_ptr might be modified if \K was encountered on the success
5387 matching path. */
5388
5389 if (offsetcount < 2) rc = 0; else
5390 {
5391 offsets[0] = md->start_match_ptr - md->start_subject;
5392 offsets[1] = md->end_match_ptr - md->start_subject;
5393 }
5394
5395 DPRINTF((">>>> returning %d\n", rc));
5396 return rc;
5397 }
5398
5399 /* Control gets here if there has been an error, or if the overall match
5400 attempt has failed at all permitted starting positions. */
5401
5402 if (using_temporary_offsets)
5403 {
5404 DPRINTF(("Freeing temporary memory\n"));
5405 (pcre_free)(md->offset_vector);
5406 }
5407
5408 if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
5409 {
5410 DPRINTF((">>>> error: returning %d\n", rc));
5411 return rc;
5412 }
5413 else if (start_partial != NULL)
5414 {
5415 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5416 if (offsetcount > 1)
5417 {
5418 offsets[0] = start_partial - (USPTR)subject;
5419 offsets[1] = end_subject - (USPTR)subject;
5420 }
5421 return PCRE_ERROR_PARTIAL;
5422 }
5423 else
5424 {
5425 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5426 return PCRE_ERROR_NOMATCH;
5427 }
5428 }
5429
5430 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12