/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 400 - (show annotations) (download)
Sat Mar 21 16:59:40 2009 UTC (5 years, 1 month ago) by ph10
File MIME type: text/plain
File size: 153031 byte(s)
Fix memory leak for -8 error during recursion.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2009 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48
/* Names that tell shared macros where this module keeps its data. They are
defined before pcre_internal.h is included; presumably macros in that header
expand in terms of them -- TODO confirm against pcre_internal.h. */
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52
53 #include "pcre_internal.h"
54
55 /* Undefine some potentially clashing cpp symbols */
56
57 #undef min
58 #undef max
59
60 /* Flag bits for the match() function */
61
/* match_condassert: tested by the assertion opcodes, which return MATCH_MATCH
straight after a successful assertion instead of carrying on with the pattern.
match_cbegroup: tested on entry to match(), which then records the current
subject pointer in a new eptrblock chained onto eptrb. */
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67
/* Ordinary outcomes of match(): both are >= 0, so they can never be mistaken
for a PCRE_ERROR_xxx value. */
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73
/* Each of these is returned by the opcode of the same name (OP_COMMIT,
OP_PRUNE, OP_SKIP, OP_THEN) when the rest of the pattern, tried from that
point, gave MATCH_NOMATCH. The bracket and assertion code treats MATCH_THEN
like MATCH_NOMATCH when deciding whether to try the next alternative. */
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82
/* Dimension of the stacksave[] array in match() and of Xstacksave[] in the
heapframe structure. */
83 #define REC_STACK_SAVE_MAX 30
84
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86
/* Six entries each, read in parallel: entry n of rep_min and entry n of
rep_max describe the same repeat. NOTE(review): presumably indexed by a repeat
opcode minus the first opcode of its group (star, minstar, plus, minplus,
query, minquery) -- the indexing code is not in this part of the file; confirm
there. */
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89
90
91
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99
100 Arguments:
101 p points to characters
102 length number to print
103 is_subject TRUE if printing from within md->start_subject
104 md pointer to matching data block, if is_subject is TRUE
105
106 Returns: nothing
107 */
108
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118
119
120
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127
128 Arguments:
129 offset index into the offset vector
130 eptr points into the subject
131 length length to be matched
132 md points to match data block
133 ims the ims flags
134
135 Returns: TRUE if matched
136 */
137
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146 printf("matching subject <null>");
147 else
148 {
149 printf("matching subject ");
150 pchars(eptr, length, TRUE, md);
151 }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156
157 /* Always fail if not enough characters left */
158
159 if (length > md->end_subject - eptr) return FALSE;
160
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164
165 if ((ims & PCRE_CASELESS) != 0)
166 {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169 if (md->utf8)
170 {
171 USPTR endptr = eptr + length;
172 while (eptr < endptr)
173 {
174 int c, d;
175 GETCHARINC(c, eptr);
176 GETCHARINC(d, p);
177 if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 }
179 }
180 else
181 #endif
182 #endif
183
184 /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185 is no UCP support. */
186
187 while (length-- > 0)
188 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 }
190
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193
194 else
195 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196
197 return TRUE;
198 }
199
200
201
202 /***************************************************************************
203 ****************************************************************************
204 RECURSION IN THE match() FUNCTION
205
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242
/* One identifier per RMATCH call site, numbered upwards from 1. The value is
passed as the "rw" argument of RMATCH; the heap-based RMATCH stores it in
frame->Xwhere and pastes it into the resume label L_##rw. */
243 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 RM51, RM52, RM53, RM54 };
249
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253
254 #ifndef NO_RECURSE
/* With true recursion, REGISTER expands to the "register" storage class; it is
applied to the eptr and ecode parameters of match(). */
255 #define REGISTER register
256
257 #ifdef DEBUG
/* Debugging RMATCH: makes a real recursive call to match(), leaving the result
in rrc, and traces the call and the return with printf(). mstart and rdepth are
picked up from the caller's scope rather than passed as macro arguments. */
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259 { \
260 printf("match() called in line %d\n", __LINE__); \
261 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262 printf("to line %d\n", __LINE__); \
263 }
/* Debugging RRETURN: prints the value being returned together with the line
number, then returns it. */
264 #define RRETURN(ra) \
265 { \
266 printf("match() returned %d from line %d ", ra, __LINE__); \
267 return ra; \
268 }
269 #else
/* Production versions: a plain recursive call whose result lands in rrc, and a
plain return statement. */
270 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 #define RRETURN(ra) return ra
273 #endif
274
275 #else
276
277
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281
282 #define REGISTER
283
/* Heap-based RMATCH: simulates a recursive call to match() by pushing a new
heapframe instead of using the C stack.

  ra  eptr for the new frame        rb  ecode for the new frame
  rc  offset_top for the new frame  rd  md (not used here; it never changes)
  re  ims for the new frame         rf  eptrb for the new frame
  rg  flags for the new frame       rw  RMn identifier unique to this call

The resume point rw is recorded in the current frame's Xwhere, the new frame is
filled in and linked back to the current one through Xprevframe, and control
jumps to HEAP_RECURSE. The label L_##rw marks where execution continues once
the simulated call has returned.

If pcre_stack_malloc() cannot supply a frame, the current simulated call gives
up with PCRE_ERROR_NOMEMORY via RRETURN rather than writing through a NULL
pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303
/* Heap-based RRETURN: pops the current frame off the chain and releases it
with pcre_stack_free. If a previous frame exists, the result is placed in rrc
and control jumps to HEAP_RETURN; otherwise this was the frame whose Xprevframe
is NULL, and the value is returned from match() itself. */
304 #define RRETURN(ra)\
305 {\
306 heapframe *newframe = frame;\
307 frame = newframe->Xprevframe;\
308 (pcre_stack_free)(newframe);\
309 if (frame != NULL)\
310 {\
311 rrc = ra;\
312 goto HEAP_RETURN;\
313 }\
314 return ra;\
315 }
316
317
318 /* Structure for remembering the local variables in a private frame */
319
/* One heapframe holds everything a single simulated invocation of match() has
to keep across an RMATCH: the arguments that may change, the locals that must
survive a "recursive call", and the resume point. Frames are chained through
Xprevframe; the top-level frame has Xprevframe == NULL. Inside match() each
X-member is reached through a #define of the same name without the X.

NOTE(review): match() also uses the locals codelink and condcode, which are
declared only in the non-NO_RECURSE branch and have no member here. codelink is
read after an RMATCH in the OP_COND code, so it looks as if it needs a member
and a matching #define -- confirm by building with NO_RECURSE defined.

NOTE(review): Xims is "long int" whereas the ims argument of match() and
Xoriginal_ims are "unsigned long int" -- confirm the difference is intended. */
320 typedef struct heapframe {
321 struct heapframe *Xprevframe;
322
323 /* Function arguments that may change */
324
325 const uschar *Xeptr;
326 const uschar *Xecode;
327 const uschar *Xmstart;
328 int Xoffset_top;
329 long int Xims;
330 eptrblock *Xeptrb;
331 int Xflags;
332 unsigned int Xrdepth;
333
334 /* Function local variables */
335
336 const uschar *Xcallpat;
337 const uschar *Xcharptr;
338 const uschar *Xdata;
339 const uschar *Xnext;
340 const uschar *Xpp;
341 const uschar *Xprev;
342 const uschar *Xsaved_eptr;
343
344 recursion_info Xnew_recursive;
345
346 BOOL Xcur_is_word;
347 BOOL Xcondition;
348 BOOL Xprev_is_word;
349
350 unsigned long int Xoriginal_ims;
351
352 #ifdef SUPPORT_UCP
353 int Xprop_type;
354 int Xprop_value;
355 int Xprop_fail_result;
356 int Xprop_category;
357 int Xprop_chartype;
358 int Xprop_script;
359 int Xoclength;
360 uschar Xocchars[8];
361 #endif
362
363 int Xctype;
364 unsigned int Xfc;
365 int Xfi;
366 int Xlength;
367 int Xmax;
368 int Xmin;
369 int Xnumber;
370 int Xoffset;
371 int Xop;
372 int Xsave_capture_last;
373 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374 int Xstacksave[REC_STACK_SAVE_MAX];
375
376 eptrblock Xnewptrb;
377
378 /* Where to jump back to */
379
/* Set by the heap-based RMATCH to its "rw" argument just before the jump to
HEAP_RECURSE. */
380 int Xwhere;
381
382 } heapframe;
383
384 #endif
385
386
387 /***************************************************************************
388 ***************************************************************************/
389
390
391
392 /*************************************************
393 * Match from current position *
394 *************************************************/
395
396 /* This function is called recursively in many circumstances. Whenever it
397 returns a negative (error) response, the outer incarnation must also return the
398 same response.
399
400 Performance note: It might be tempting to extract commonly used fields from the
401 md structure (e.g. utf8, end_subject) into individual variables to improve
402 performance. Tests using gcc on a SPARC disproved this; in the first case, it
403 made performance worse.
404
405 Arguments:
406 eptr pointer to current character in subject
407 ecode pointer to current position in compiled code
408 mstart pointer to the current match start position (can be modified
409 by encountering \K)
410 offset_top current top pointer
411 md pointer to "static" info for the match
412 ims current /i, /m, and /s options
413 eptrb pointer to chain of blocks containing eptr at start of
414 brackets - for testing for empty matches
415 flags can contain
416 match_condassert - this is an assertion condition
417 match_cbegroup - this is the start of an unlimited repeat
418 group that can match an empty string
419 rdepth the recursion depth
420
421 Returns: MATCH_MATCH if matched ) these values are >= 0
422 MATCH_NOMATCH if failed to match )
423 a negative PCRE_ERROR_xxx value if aborted by an error condition
424 (e.g. stopped by repeated call or recursion limit)
425 */
426
427 static int
428 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 int flags, unsigned int rdepth)
431 {
432 /* These variables do not need to be preserved over recursion in this function,
433 so they can be ordinary variables in all cases. Mark some of them with
434 "register" because they are used a lot in loops. */
435
436 register int rrc; /* Returns from recursive calls */
437 register int i; /* Used for loops not involving calls to RMATCH() */
438 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440
441 BOOL minimize, possessive; /* Quantifier options */
442
443 /* When recursion is not being used, all "local" variables that have to be
444 preserved over calls to RMATCH() are part of a "frame" which is obtained from
445 heap storage. Set up the top-level frame here; others are obtained from the
446 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447
448 #ifdef NO_RECURSE
449 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450 frame->Xprevframe = NULL; /* Marks the top level */
451
452 /* Copy in the original argument variables */
453
454 frame->Xeptr = eptr;
455 frame->Xecode = ecode;
456 frame->Xmstart = mstart;
457 frame->Xoffset_top = offset_top;
458 frame->Xims = ims;
459 frame->Xeptrb = eptrb;
460 frame->Xflags = flags;
461 frame->Xrdepth = rdepth;
462
463 /* This is where control jumps back to to effect "recursion" */
464
465 HEAP_RECURSE:
466
467 /* Macros make the argument variables come from the current frame */
468
469 #define eptr frame->Xeptr
470 #define ecode frame->Xecode
471 #define mstart frame->Xmstart
472 #define offset_top frame->Xoffset_top
473 #define ims frame->Xims
474 #define eptrb frame->Xeptrb
475 #define flags frame->Xflags
476 #define rdepth frame->Xrdepth
477
478 /* Ditto for the local variables */
479
480 #ifdef SUPPORT_UTF8
481 #define charptr frame->Xcharptr
482 #endif
483 #define callpat frame->Xcallpat
484 #define data frame->Xdata
485 #define next frame->Xnext
486 #define pp frame->Xpp
487 #define prev frame->Xprev
488 #define saved_eptr frame->Xsaved_eptr
489
490 #define new_recursive frame->Xnew_recursive
491
492 #define cur_is_word frame->Xcur_is_word
493 #define condition frame->Xcondition
494 #define prev_is_word frame->Xprev_is_word
495
496 #define original_ims frame->Xoriginal_ims
497
498 #ifdef SUPPORT_UCP
499 #define prop_type frame->Xprop_type
500 #define prop_value frame->Xprop_value
501 #define prop_fail_result frame->Xprop_fail_result
502 #define prop_category frame->Xprop_category
503 #define prop_chartype frame->Xprop_chartype
504 #define prop_script frame->Xprop_script
505 #define oclength frame->Xoclength
506 #define occhars frame->Xocchars
507 #endif
508
509 #define ctype frame->Xctype
510 #define fc frame->Xfc
511 #define fi frame->Xfi
512 #define length frame->Xlength
513 #define max frame->Xmax
514 #define min frame->Xmin
515 #define number frame->Xnumber
516 #define offset frame->Xoffset
517 #define op frame->Xop
518 #define save_capture_last frame->Xsave_capture_last
519 #define save_offset1 frame->Xsave_offset1
520 #define save_offset2 frame->Xsave_offset2
521 #define save_offset3 frame->Xsave_offset3
522 #define stacksave frame->Xstacksave
523
524 #define newptrb frame->Xnewptrb
525
526 /* When recursion is being used, local variables are allocated on the stack and
527 get preserved during recursion in the normal way. In this environment, fi and
528 i, and fc and c, can be the same variables. */
529
530 #else /* NO_RECURSE not defined */
531 #define fi i
532 #define fc c
533
534
535 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536 const uschar *charptr; /* in small blocks of the code. My normal */
537 #endif /* style of coding would have declared */
538 const uschar *callpat; /* them within each of those blocks. */
539 const uschar *data; /* However, in order to accommodate the */
540 const uschar *next; /* version of this code that uses an */
541 USPTR pp; /* external "stack" implemented on the */
542 const uschar *prev; /* heap, it is easier to declare them all */
543 USPTR saved_eptr; /* here, so the declarations can be cut */
544 /* out in a block. The only declarations */
545 recursion_info new_recursive; /* within blocks below are for variables */
546 /* that do not have to be preserved over */
547 BOOL cur_is_word; /* a recursive call to RMATCH(). */
548 BOOL condition;
549 BOOL prev_is_word;
550
551 unsigned long int original_ims;
552
553 #ifdef SUPPORT_UCP
554 int prop_type;
555 int prop_value;
556 int prop_fail_result;
557 int prop_category;
558 int prop_chartype;
559 int prop_script;
560 int oclength;
561 uschar occhars[8];
562 #endif
563
564 int codelink;
565 int condcode;
566 int ctype;
567 int length;
568 int max;
569 int min;
570 int number;
571 int offset;
572 int op;
573 int save_capture_last;
574 int save_offset1, save_offset2, save_offset3;
575 int stacksave[REC_STACK_SAVE_MAX];
576
577 eptrblock newptrb;
578 #endif /* NO_RECURSE */
579
580 /* These statements are here to stop the compiler complaining about uninitialized
581 variables. */
582
583 #ifdef SUPPORT_UCP
584 prop_value = 0;
585 prop_fail_result = 0;
586 #endif
587
588
589 /* This label is used for tail recursion, which is used in a few cases even
590 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
591 used. Thanks to Ian Taylor for noticing this possibility and sending the
592 original patch. */
593
594 TAIL_RECURSE:
595
596 /* OK, now we can get on with the real code of the function. Recursive calls
597 are specified by the macro RMATCH and RRETURN is used to return. When
598 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
599 and a "return", respectively (possibly with some debugging if DEBUG is
600 defined). However, RMATCH isn't like a function call because it's quite a
601 complicated macro. It has to be used in one particular way. This shouldn't,
602 however, impact performance when true recursion is being used. */
603
604 #ifdef SUPPORT_UTF8
605 utf8 = md->utf8; /* Local copy of the flag */
606 #else
607 utf8 = FALSE;
608 #endif
609
610 /* First check that we haven't called match() too many times, or that we
611 haven't exceeded the recursive call limit. */
612
613 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
614 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
615
616 original_ims = ims; /* Save for resetting on ')' */
617
618 /* At the start of a group with an unlimited repeat that may match an empty
619 string, the match_cbegroup flag is set. When this is the case, add the current
620 subject pointer to the chain of such remembered pointers, to be checked when we
621 hit the closing ket, in order to break infinite loops that match no characters.
622 When match() is called in other circumstances, don't add to the chain. The
623 match_cbegroup flag must NOT be used with tail recursion, because the memory
624 block that is used is on the stack, so a new one may be required for each
625 match(). */
626
627 if ((flags & match_cbegroup) != 0)
628 {
629 newptrb.epb_saved_eptr = eptr;
630 newptrb.epb_prev = eptrb;
631 eptrb = &newptrb;
632 }
633
634 /* Now start processing the opcodes. */
635
636 for (;;)
637 {
638 minimize = possessive = FALSE;
639 op = *ecode;
640
641 /* For partial matching, remember if we ever hit the end of the subject after
642 matching at least one subject character. */
643
644 if (md->partial &&
645 eptr >= md->end_subject &&
646 eptr > mstart)
647 md->hitend = TRUE;
648
649 switch(op)
650 {
651 case OP_FAIL:
652 RRETURN(MATCH_NOMATCH);
653
654 case OP_PRUNE:
655 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
656 ims, eptrb, flags, RM51);
657 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
658 RRETURN(MATCH_PRUNE);
659
660 case OP_COMMIT:
661 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
662 ims, eptrb, flags, RM52);
663 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
664 RRETURN(MATCH_COMMIT);
665
666 case OP_SKIP:
667 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
668 ims, eptrb, flags, RM53);
669 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
670 md->start_match_ptr = eptr; /* Pass back current position */
671 RRETURN(MATCH_SKIP);
672
673 case OP_THEN:
674 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
675 ims, eptrb, flags, RM54);
676 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
677 RRETURN(MATCH_THEN);
678
679 /* Handle a capturing bracket. If there is space in the offset vector, save
680 the current subject position in the working slot at the top of the vector.
681 We mustn't change the current values of the data slot, because they may be
682 set from a previous iteration of this group, and be referred to by a
683 reference inside the group.
684
685 If the bracket fails to match, we need to restore this value and also the
686 values of the final offsets, in case they were set by a previous iteration
687 of the same bracket.
688
689 If there isn't enough space in the offset vector, treat this as if it were
690 a non-capturing bracket. Don't worry about setting the flag for the error
691 case here; that is handled in the code for KET. */
692
693 case OP_CBRA:
694 case OP_SCBRA:
695 number = GET2(ecode, 1+LINK_SIZE);
696 offset = number << 1;
697
698 #ifdef DEBUG
699 printf("start bracket %d\n", number);
700 printf("subject=");
701 pchars(eptr, 16, TRUE, md);
702 printf("\n");
703 #endif
704
705 if (offset < md->offset_max)
706 {
707 save_offset1 = md->offset_vector[offset];
708 save_offset2 = md->offset_vector[offset+1];
709 save_offset3 = md->offset_vector[md->offset_end - number];
710 save_capture_last = md->capture_last;
711
712 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
713 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
714
715 flags = (op == OP_SCBRA)? match_cbegroup : 0;
716 do
717 {
718 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
719 ims, eptrb, flags, RM1);
720 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
721 md->capture_last = save_capture_last;
722 ecode += GET(ecode, 1);
723 }
724 while (*ecode == OP_ALT);
725
726 DPRINTF(("bracket %d failed\n", number));
727
728 md->offset_vector[offset] = save_offset1;
729 md->offset_vector[offset+1] = save_offset2;
730 md->offset_vector[md->offset_end - number] = save_offset3;
731
732 RRETURN(MATCH_NOMATCH);
733 }
734
735 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
736 as a non-capturing bracket. */
737
738 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
739 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
740
741 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
742
743 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
744 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
745
746 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
747 final alternative within the brackets, we would return the result of a
748 recursive call to match() whatever happened. We can reduce stack usage by
749 turning this into a tail recursion, except in the case when match_cbegroup
750 is set.*/
751
752 case OP_BRA:
753 case OP_SBRA:
754 DPRINTF(("start non-capturing bracket\n"));
755 flags = (op >= OP_SBRA)? match_cbegroup : 0;
756 for (;;)
757 {
758 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
759 {
760 if (flags == 0) /* Not a possibly empty group */
761 {
762 ecode += _pcre_OP_lengths[*ecode];
763 DPRINTF(("bracket 0 tail recursion\n"));
764 goto TAIL_RECURSE;
765 }
766
767 /* Possibly empty group; can't use tail recursion. */
768
769 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
770 eptrb, flags, RM48);
771 RRETURN(rrc);
772 }
773
774 /* For non-final alternatives, continue the loop for a NOMATCH result;
775 otherwise return. */
776
777 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
778 eptrb, flags, RM2);
779 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
780 ecode += GET(ecode, 1);
781 }
782 /* Control never reaches here. */
783
784 /* Conditional group: compilation checked that there are no more than
785 two branches. If the condition is false, skipping the first branch takes us
786 past the end if there is only one branch, but that's OK because that is
787 exactly what going to the ket would do. As there is only one branch to be
788 obeyed, we can use tail recursion to avoid using another stack frame. */
789
790 case OP_COND:
791 case OP_SCOND:
792 codelink= GET(ecode, 1);
793
794 /* Because of the way auto-callout works during compile, a callout item is
795 inserted between OP_COND and an assertion condition. */
796
797 if (ecode[LINK_SIZE+1] == OP_CALLOUT)
798 {
799 if (pcre_callout != NULL)
800 {
801 pcre_callout_block cb;
802 cb.version = 1; /* Version 1 of the callout block */
803 cb.callout_number = ecode[LINK_SIZE+2];
804 cb.offset_vector = md->offset_vector;
805 cb.subject = (PCRE_SPTR)md->start_subject;
806 cb.subject_length = md->end_subject - md->start_subject;
807 cb.start_match = mstart - md->start_subject;
808 cb.current_position = eptr - md->start_subject;
809 cb.pattern_position = GET(ecode, LINK_SIZE + 3);
810 cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
811 cb.capture_top = offset_top/2;
812 cb.capture_last = md->capture_last;
813 cb.callout_data = md->callout_data;
814 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
815 if (rrc < 0) RRETURN(rrc);
816 }
817 ecode += _pcre_OP_lengths[OP_CALLOUT];
818 }
819
820 condcode = ecode[LINK_SIZE+1];
821
822 /* Now see what the actual condition is */
823
824 if (condcode == OP_RREF) /* Recursion test */
825 {
826 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
827 condition = md->recursive != NULL &&
828 (offset == RREF_ANY || offset == md->recursive->group_num);
829 ecode += condition? 3 : GET(ecode, 1);
830 }
831
832 else if (condcode == OP_CREF) /* Group used test */
833 {
834 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
835 condition = offset < offset_top && md->offset_vector[offset] >= 0;
836 ecode += condition? 3 : GET(ecode, 1);
837 }
838
839 else if (condcode == OP_DEF) /* DEFINE - always false */
840 {
841 condition = FALSE;
842 ecode += GET(ecode, 1);
843 }
844
845 /* The condition is an assertion. Call match() to evaluate it - setting
846 the final argument match_condassert causes it to stop at the end of an
847 assertion. */
848
849 else
850 {
851 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
852 match_condassert, RM3);
853 if (rrc == MATCH_MATCH)
854 {
855 condition = TRUE;
856 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
857 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
858 }
859 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
860 {
861 RRETURN(rrc); /* Need braces because of following else */
862 }
863 else
864 {
865 condition = FALSE;
866 ecode += codelink;
867 }
868 }
869
870 /* We are now at the branch that is to be obeyed. As there is only one,
871 we can use tail recursion to avoid using another stack frame, except when
872 match_cbegroup is required for an unlimited repeat of a possibly empty
873 group. If the second alternative doesn't exist, we can just plough on. */
874
875 if (condition || *ecode == OP_ALT)
876 {
877 ecode += 1 + LINK_SIZE;
878 if (op == OP_SCOND) /* Possibly empty group */
879 {
880 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
881 RRETURN(rrc);
882 }
883 else /* Group must match something */
884 {
885 flags = 0;
886 goto TAIL_RECURSE;
887 }
888 }
889 else /* Condition false & no alternative */
890 {
891 ecode += 1 + LINK_SIZE;
892 }
893 break;
894
895
896 /* End of the pattern, either real or forced. If we are in a top-level
897 recursion, we should restore the offsets appropriately and continue from
898 after the call. */
899
900 case OP_ACCEPT:
901 case OP_END:
902 if (md->recursive != NULL && md->recursive->group_num == 0)
903 {
904 recursion_info *rec = md->recursive;
905 DPRINTF(("End of pattern in a (?0) recursion\n"));
906 md->recursive = rec->prevrec;
907 memmove(md->offset_vector, rec->offset_save,
908 rec->saved_max * sizeof(int));
909 mstart = rec->save_start;
910 ims = original_ims;
911 ecode = rec->after_call;
912 break;
913 }
914
915 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
916 string - backtracking will then try other alternatives, if any. */
917
918 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
919 md->end_match_ptr = eptr; /* Record where we ended */
920 md->end_offset_top = offset_top; /* and how many extracts were taken */
921 md->start_match_ptr = mstart; /* and the start (\K can modify) */
922 RRETURN(MATCH_MATCH);
923
924 /* Change option settings */
925
926 case OP_OPT:
927 ims = ecode[1];
928 ecode += 2;
929 DPRINTF(("ims set to %02lx\n", ims));
930 break;
931
932 /* Assertion brackets. Check the alternative branches in turn - the
933 matching won't pass the KET for an assertion. If any one branch matches,
934 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
935 start of each branch to move the current point backwards, so the code at
936 this level is identical to the lookahead case. */
937
938 case OP_ASSERT:
939 case OP_ASSERTBACK:
940 do
941 {
942 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
943 RM4);
944 if (rrc == MATCH_MATCH) break;
945 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
946 ecode += GET(ecode, 1);
947 }
948 while (*ecode == OP_ALT);
949 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
950
951 /* If checking an assertion for a condition, return MATCH_MATCH. */
952
953 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
954
955 /* Continue from after the assertion, updating the offsets high water
956 mark, since extracts may have been taken during the assertion. */
957
958 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
959 ecode += 1 + LINK_SIZE;
960 offset_top = md->end_offset_top;
961 continue;
962
963 /* Negative assertion: all branches must fail to match */
964
965 case OP_ASSERT_NOT:
966 case OP_ASSERTBACK_NOT:
967 do
968 {
969 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
970 RM5);
971 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
972 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
973 ecode += GET(ecode,1);
974 }
975 while (*ecode == OP_ALT);
976
977 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
978
979 ecode += 1 + LINK_SIZE;
980 continue;
981
982 /* Move the subject pointer back. This occurs only at the start of
983 each branch of a lookbehind assertion. If we are too close to the start to
984 move back, this match function fails. When working with UTF-8 we move
985 back a number of characters, not bytes. */
986
987 case OP_REVERSE:
988 #ifdef SUPPORT_UTF8
989 if (utf8)
990 {
991 i = GET(ecode, 1);
992 while (i-- > 0)
993 {
994 eptr--;
995 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
996 BACKCHAR(eptr);
997 }
998 }
999 else
1000 #endif
1001
1002 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1003
1004 {
1005 eptr -= GET(ecode, 1);
1006 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1007 }
1008
1009 /* Skip to next op code */
1010
1011 ecode += 1 + LINK_SIZE;
1012 break;
1013
1014 /* The callout item calls an external function, if one is provided, passing
1015 details of the match so far. This is mainly for debugging, though the
1016 function is able to force a failure. */
1017
1018 case OP_CALLOUT:
1019 if (pcre_callout != NULL)
1020 {
1021 pcre_callout_block cb;
1022 cb.version = 1; /* Version 1 of the callout block */
1023 cb.callout_number = ecode[1];
1024 cb.offset_vector = md->offset_vector;
1025 cb.subject = (PCRE_SPTR)md->start_subject;
1026 cb.subject_length = md->end_subject - md->start_subject;
1027 cb.start_match = mstart - md->start_subject;
1028 cb.current_position = eptr - md->start_subject;
1029 cb.pattern_position = GET(ecode, 2);
1030 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1031 cb.capture_top = offset_top/2;
1032 cb.capture_last = md->capture_last;
1033 cb.callout_data = md->callout_data;
1034 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1035 if (rrc < 0) RRETURN(rrc);
1036 }
1037 ecode += 2 + 2*LINK_SIZE;
1038 break;
1039
1040 /* Recursion either matches the current regex, or some subexpression. The
1041 offset data is the offset to the starting bracket from the start of the
1042 whole pattern. (This is so that it works from duplicated subpatterns.)
1043
1044 If there are any capturing brackets started but not finished, we have to
1045 save their starting points and reinstate them after the recursion. However,
1046 we don't know how many such there are (offset_top records the completed
1047 total) so we just have to save all the potential data. There may be up to
1048 65535 such values, which is too large to put on the stack, but using malloc
1049 for small numbers seems expensive. As a compromise, the stack is used when
1050 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1051 is used. If the malloc fails, the match is abandoned and the error
1052 PCRE_ERROR_NOMEMORY is returned; no attempt is made to carry on with a
1053 partial save of the offsets.
1054
1055 There are also other values that have to be saved. We use a chained
1056 sequence of blocks that actually live on the stack. Thanks to Robin Houston
1057 for the original version of this logic. */
1058
1059 case OP_RECURSE:
1060 {
1061 callpat = md->start_code + GET(ecode, 1);
1062 new_recursive.group_num = (callpat == md->start_code)? 0 :
1063 GET2(callpat, 1 + LINK_SIZE);
1064
1065 /* Add to "recursing stack" */
1066
1067 new_recursive.prevrec = md->recursive;
1068 md->recursive = &new_recursive;
1069
1070 /* Find where to continue from afterwards */
1071
1072 ecode += 1 + LINK_SIZE;
1073 new_recursive.after_call = ecode;
1074
1075 /* Now save the offset data. */
1076
1077 new_recursive.saved_max = md->offset_end;
1078 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1079 new_recursive.offset_save = stacksave;
1080 else
1081 {
1082 new_recursive.offset_save =
1083 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1084 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1085 }
1086
1087 memcpy(new_recursive.offset_save, md->offset_vector,
1088 new_recursive.saved_max * sizeof(int));
1089 new_recursive.save_start = mstart;
1090 mstart = eptr;
1091
1092 /* OK, now we can do the recursion. For each top-level alternative we
1093 restore the offset and recursion data. */
1094
1095 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1096 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1097 do
1098 {
1099 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1100 md, ims, eptrb, flags, RM6);
1101 if (rrc == MATCH_MATCH)
1102 {
1103 DPRINTF(("Recursion matched\n"));
1104 md->recursive = new_recursive.prevrec;
1105 if (new_recursive.offset_save != stacksave)
1106 (pcre_free)(new_recursive.offset_save);
1107 RRETURN(MATCH_MATCH);
1108 }
1109 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1110 {
1111 DPRINTF(("Recursion gave error %d\n", rrc));
1112 if (new_recursive.offset_save != stacksave)
1113 (pcre_free)(new_recursive.offset_save);
1114 RRETURN(rrc);
1115 }
1116
1117 md->recursive = &new_recursive;
1118 memcpy(md->offset_vector, new_recursive.offset_save,
1119 new_recursive.saved_max * sizeof(int));
1120 callpat += GET(callpat, 1);
1121 }
1122 while (*callpat == OP_ALT);
1123
1124 DPRINTF(("Recursion didn't match\n"));
1125 md->recursive = new_recursive.prevrec;
1126 if (new_recursive.offset_save != stacksave)
1127 (pcre_free)(new_recursive.offset_save);
1128 RRETURN(MATCH_NOMATCH);
1129 }
1130 /* Control never reaches here */
1131
1132 /* "Once" brackets are like assertion brackets except that after a match,
1133 the point in the subject string is not moved back. Thus there can never be
1134 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1135 Check the alternative branches in turn - the matching won't pass the KET
1136 for this kind of subpattern. If any one branch matches, we carry on as at
1137 the end of a normal bracket, leaving the subject pointer. */
1138
1139 case OP_ONCE:
1140 prev = ecode;
1141 saved_eptr = eptr;
1142
1143 do
1144 {
1145 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1146 if (rrc == MATCH_MATCH) break;
1147 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1148 ecode += GET(ecode,1);
1149 }
1150 while (*ecode == OP_ALT);
1151
1152 /* If we hit the end of the group (which could be repeated), fail */
1153
1154 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1155
1156 /* Continue as from after the assertion, updating the offsets high water
1157 mark, since extracts may have been taken. */
1158
1159 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1160
1161 offset_top = md->end_offset_top;
1162 eptr = md->end_match_ptr;
1163
1164 /* For a non-repeating ket, just continue at this level. This also
1165 happens for a repeating ket if no characters were matched in the group.
1166 This is the forcible breaking of infinite loops as implemented in Perl
1167 5.005. If there is an options reset, it will get obeyed in the normal
1168 course of events. */
1169
1170 if (*ecode == OP_KET || eptr == saved_eptr)
1171 {
1172 ecode += 1+LINK_SIZE;
1173 break;
1174 }
1175
1176 /* The repeating kets try the rest of the pattern or restart from the
1177 preceding bracket, in the appropriate order. The second "call" of match()
1178 uses tail recursion, to avoid using another stack frame. We need to reset
1179 any options that changed within the bracket before re-running it, so
1180 check the next opcode. */
1181
1182 if (ecode[1+LINK_SIZE] == OP_OPT)
1183 {
1184 ims = (ims & ~PCRE_IMS) | ecode[4];
1185 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1186 }
1187
1188 if (*ecode == OP_KETRMIN)
1189 {
1190 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1191 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192 ecode = prev;
1193 flags = 0;
1194 goto TAIL_RECURSE;
1195 }
1196 else /* OP_KETRMAX */
1197 {
1198 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1199 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1200 ecode += 1 + LINK_SIZE;
1201 flags = 0;
1202 goto TAIL_RECURSE;
1203 }
1204 /* Control never gets here */
1205
1206 /* An alternation is the end of a branch; scan along to find the end of the
1207 bracketed group and go to there. */
1208
1209 case OP_ALT:
1210 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1211 break;
1212
1213 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1214 indicating that it may occur zero times. It may repeat infinitely, or not
1215 at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1216 with fixed upper repeat limits are compiled as a number of copies, with the
1217 optional ones preceded by BRAZERO or BRAMINZERO. */
1218
1219 case OP_BRAZERO:
1220 {
1221 next = ecode+1;
1222 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1223 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1224 do next += GET(next,1); while (*next == OP_ALT);
1225 ecode = next + 1 + LINK_SIZE;
1226 }
1227 break;
1228
1229 case OP_BRAMINZERO:
1230 {
1231 next = ecode+1;
1232 do next += GET(next, 1); while (*next == OP_ALT);
1233 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1234 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1235 ecode++;
1236 }
1237 break;
1238
1239 case OP_SKIPZERO:
1240 {
1241 next = ecode+1;
1242 do next += GET(next,1); while (*next == OP_ALT);
1243 ecode = next + 1 + LINK_SIZE;
1244 }
1245 break;
1246
1247 /* End of a group, repeated or non-repeating. */
1248
1249 case OP_KET:
1250 case OP_KETRMIN:
1251 case OP_KETRMAX:
1252 prev = ecode - GET(ecode, 1);
1253
1254 /* If this was a group that remembered the subject start, in order to break
1255 infinite repeats of empty string matches, retrieve the subject start from
1256 the chain. Otherwise, set it NULL. */
1257
1258 if (*prev >= OP_SBRA)
1259 {
1260 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1261 eptrb = eptrb->epb_prev; /* Backup to previous group */
1262 }
1263 else saved_eptr = NULL;
1264
1265 /* If we are at the end of an assertion group, stop matching and return
1266 MATCH_MATCH, but record the current high water mark for use by positive
1267 assertions. Do this also for the "once" (atomic) groups. */
1268
1269 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1270 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1271 *prev == OP_ONCE)
1272 {
1273 md->end_match_ptr = eptr; /* For ONCE */
1274 md->end_offset_top = offset_top;
1275 RRETURN(MATCH_MATCH);
1276 }
1277
1278 /* For capturing groups we have to check the group number back at the start
1279 and if necessary complete handling an extraction by setting the offsets and
1280 bumping the high water mark. Note that whole-pattern recursion is coded as
1281 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1282 when the OP_END is reached. Other recursion is handled here. */
1283
1284 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1285 {
1286 number = GET2(prev, 1+LINK_SIZE);
1287 offset = number << 1;
1288
1289 #ifdef DEBUG
1290 printf("end bracket %d", number);
1291 printf("\n");
1292 #endif
1293
1294 md->capture_last = number;
1295 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1296 {
1297 md->offset_vector[offset] =
1298 md->offset_vector[md->offset_end - number];
1299 md->offset_vector[offset+1] = eptr - md->start_subject;
1300 if (offset_top <= offset) offset_top = offset + 2;
1301 }
1302
1303 /* Handle a recursively called group. Restore the offsets
1304 appropriately and continue from after the call. */
1305
1306 if (md->recursive != NULL && md->recursive->group_num == number)
1307 {
1308 recursion_info *rec = md->recursive;
1309 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1310 md->recursive = rec->prevrec;
1311 mstart = rec->save_start;
1312 memcpy(md->offset_vector, rec->offset_save,
1313 rec->saved_max * sizeof(int));
1314 ecode = rec->after_call;
1315 ims = original_ims;
1316 break;
1317 }
1318 }
1319
1320 /* For both capturing and non-capturing groups, reset the value of the ims
1321 flags, in case they got changed during the group. */
1322
1323 ims = original_ims;
1324 DPRINTF(("ims reset to %02lx\n", ims));
1325
1326 /* For a non-repeating ket, just continue at this level. This also
1327 happens for a repeating ket if no characters were matched in the group.
1328 This is the forcible breaking of infinite loops as implemented in Perl
1329 5.005. If there is an options reset, it will get obeyed in the normal
1330 course of events. */
1331
1332 if (*ecode == OP_KET || eptr == saved_eptr)
1333 {
1334 ecode += 1 + LINK_SIZE;
1335 break;
1336 }
1337
1338 /* The repeating kets try the rest of the pattern or restart from the
1339 preceding bracket, in the appropriate order. In the second case, we can use
1340 tail recursion to avoid using another stack frame, unless we have an
1341 unlimited repeat of a group that can match an empty string. */
1342
1343 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1344
1345 if (*ecode == OP_KETRMIN)
1346 {
1347 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1348 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1349 if (flags != 0) /* Could match an empty string */
1350 {
1351 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1352 RRETURN(rrc);
1353 }
1354 ecode = prev;
1355 goto TAIL_RECURSE;
1356 }
1357 else /* OP_KETRMAX */
1358 {
1359 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1360 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1361 ecode += 1 + LINK_SIZE;
1362 flags = 0;
1363 goto TAIL_RECURSE;
1364 }
1365 /* Control never gets here */
1366
1367 /* Start of subject unless notbol, or after internal newline if multiline */
1368
1369 case OP_CIRC:
1370 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1371 if ((ims & PCRE_MULTILINE) != 0)
1372 {
1373 if (eptr != md->start_subject &&
1374 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1375 RRETURN(MATCH_NOMATCH);
1376 ecode++;
1377 break;
1378 }
1379 /* ... else fall through */
1380
1381 /* Start of subject assertion */
1382
1383 case OP_SOD:
1384 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1385 ecode++;
1386 break;
1387
1388 /* Start of match assertion */
1389
1390 case OP_SOM:
1391 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1392 ecode++;
1393 break;
1394
1395 /* Reset the start of match point */
1396
1397 case OP_SET_SOM:
1398 mstart = eptr;
1399 ecode++;
1400 break;
1401
1402 /* Assert before internal newline if multiline, or before a terminating
1403 newline unless endonly is set, else end of subject unless noteol is set. */
1404
1405 case OP_DOLL:
1406 if ((ims & PCRE_MULTILINE) != 0)
1407 {
1408 if (eptr < md->end_subject)
1409 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1410 else
1411 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1412 ecode++;
1413 break;
1414 }
1415 else
1416 {
1417 if (md->noteol) RRETURN(MATCH_NOMATCH);
1418 if (!md->endonly)
1419 {
1420 if (eptr != md->end_subject &&
1421 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1422 RRETURN(MATCH_NOMATCH);
1423 ecode++;
1424 break;
1425 }
1426 }
1427 /* ... else fall through for endonly */
1428
1429 /* End of subject assertion (\z) */
1430
1431 case OP_EOD:
1432 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1433 ecode++;
1434 break;
1435
1436 /* End of subject or ending \n assertion (\Z) */
1437
1438 case OP_EODN:
1439 if (eptr != md->end_subject &&
1440 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1441 RRETURN(MATCH_NOMATCH);
1442 ecode++;
1443 break;
1444
1445 /* Word boundary assertions */
1446
1447 case OP_NOT_WORD_BOUNDARY:
1448 case OP_WORD_BOUNDARY:
1449 {
1450
1451 /* Find out if the previous and current characters are "word" characters.
1452 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1453 be "non-word" characters. */
1454
1455 #ifdef SUPPORT_UTF8
1456 if (utf8)
1457 {
1458 if (eptr == md->start_subject) prev_is_word = FALSE; else
1459 {
1460 const uschar *lastptr = eptr - 1;
1461 while((*lastptr & 0xc0) == 0x80) lastptr--;
1462 GETCHAR(c, lastptr);
1463 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1464 }
1465 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1466 {
1467 GETCHAR(c, eptr);
1468 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1469 }
1470 }
1471 else
1472 #endif
1473
1474 /* More streamlined when not in UTF-8 mode */
1475
1476 {
1477 prev_is_word = (eptr != md->start_subject) &&
1478 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1479 cur_is_word = (eptr < md->end_subject) &&
1480 ((md->ctypes[*eptr] & ctype_word) != 0);
1481 }
1482
1483 /* Now see if the situation is what we want */
1484
1485 if ((*ecode++ == OP_WORD_BOUNDARY)?
1486 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1487 RRETURN(MATCH_NOMATCH);
1488 }
1489 break;
1490
1491 /* Match a single character type; inline for speed */
1492
1493 case OP_ANY:
1494 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1495 /* Fall through */
1496
1497 case OP_ALLANY:
1498 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1499 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1500 ecode++;
1501 break;
1502
1503 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1504 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1505
1506 case OP_ANYBYTE:
1507 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1508 ecode++;
1509 break;
1510
1511 case OP_NOT_DIGIT:
1512 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1513 GETCHARINCTEST(c, eptr);
1514 if (
1515 #ifdef SUPPORT_UTF8
1516 c < 256 &&
1517 #endif
1518 (md->ctypes[c] & ctype_digit) != 0
1519 )
1520 RRETURN(MATCH_NOMATCH);
1521 ecode++;
1522 break;
1523
1524 case OP_DIGIT:
1525 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1526 GETCHARINCTEST(c, eptr);
1527 if (
1528 #ifdef SUPPORT_UTF8
1529 c >= 256 ||
1530 #endif
1531 (md->ctypes[c] & ctype_digit) == 0
1532 )
1533 RRETURN(MATCH_NOMATCH);
1534 ecode++;
1535 break;
1536
1537 case OP_NOT_WHITESPACE:
1538 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1539 GETCHARINCTEST(c, eptr);
1540 if (
1541 #ifdef SUPPORT_UTF8
1542 c < 256 &&
1543 #endif
1544 (md->ctypes[c] & ctype_space) != 0
1545 )
1546 RRETURN(MATCH_NOMATCH);
1547 ecode++;
1548 break;
1549
1550 case OP_WHITESPACE:
1551 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1552 GETCHARINCTEST(c, eptr);
1553 if (
1554 #ifdef SUPPORT_UTF8
1555 c >= 256 ||
1556 #endif
1557 (md->ctypes[c] & ctype_space) == 0
1558 )
1559 RRETURN(MATCH_NOMATCH);
1560 ecode++;
1561 break;
1562
1563 case OP_NOT_WORDCHAR:
1564 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1565 GETCHARINCTEST(c, eptr);
1566 if (
1567 #ifdef SUPPORT_UTF8
1568 c < 256 &&
1569 #endif
1570 (md->ctypes[c] & ctype_word) != 0
1571 )
1572 RRETURN(MATCH_NOMATCH);
1573 ecode++;
1574 break;
1575
1576 case OP_WORDCHAR:
1577 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1578 GETCHARINCTEST(c, eptr);
1579 if (
1580 #ifdef SUPPORT_UTF8
1581 c >= 256 ||
1582 #endif
1583 (md->ctypes[c] & ctype_word) == 0
1584 )
1585 RRETURN(MATCH_NOMATCH);
1586 ecode++;
1587 break;
1588
1589 case OP_ANYNL:
1590 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1591 GETCHARINCTEST(c, eptr);
1592 switch(c)
1593 {
1594 default: RRETURN(MATCH_NOMATCH);
1595 case 0x000d:
1596 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1597 break;
1598
1599 case 0x000a:
1600 break;
1601
1602 case 0x000b:
1603 case 0x000c:
1604 case 0x0085:
1605 case 0x2028:
1606 case 0x2029:
1607 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1608 break;
1609 }
1610 ecode++;
1611 break;
1612
1613 case OP_NOT_HSPACE:
1614 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1615 GETCHARINCTEST(c, eptr);
1616 switch(c)
1617 {
1618 default: break;
1619 case 0x09: /* HT */
1620 case 0x20: /* SPACE */
1621 case 0xa0: /* NBSP */
1622 case 0x1680: /* OGHAM SPACE MARK */
1623 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1624 case 0x2000: /* EN QUAD */
1625 case 0x2001: /* EM QUAD */
1626 case 0x2002: /* EN SPACE */
1627 case 0x2003: /* EM SPACE */
1628 case 0x2004: /* THREE-PER-EM SPACE */
1629 case 0x2005: /* FOUR-PER-EM SPACE */
1630 case 0x2006: /* SIX-PER-EM SPACE */
1631 case 0x2007: /* FIGURE SPACE */
1632 case 0x2008: /* PUNCTUATION SPACE */
1633 case 0x2009: /* THIN SPACE */
1634 case 0x200A: /* HAIR SPACE */
1635 case 0x202f: /* NARROW NO-BREAK SPACE */
1636 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1637 case 0x3000: /* IDEOGRAPHIC SPACE */
1638 RRETURN(MATCH_NOMATCH);
1639 }
1640 ecode++;
1641 break;
1642
1643 case OP_HSPACE:
1644 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1645 GETCHARINCTEST(c, eptr);
1646 switch(c)
1647 {
1648 default: RRETURN(MATCH_NOMATCH);
1649 case 0x09: /* HT */
1650 case 0x20: /* SPACE */
1651 case 0xa0: /* NBSP */
1652 case 0x1680: /* OGHAM SPACE MARK */
1653 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1654 case 0x2000: /* EN QUAD */
1655 case 0x2001: /* EM QUAD */
1656 case 0x2002: /* EN SPACE */
1657 case 0x2003: /* EM SPACE */
1658 case 0x2004: /* THREE-PER-EM SPACE */
1659 case 0x2005: /* FOUR-PER-EM SPACE */
1660 case 0x2006: /* SIX-PER-EM SPACE */
1661 case 0x2007: /* FIGURE SPACE */
1662 case 0x2008: /* PUNCTUATION SPACE */
1663 case 0x2009: /* THIN SPACE */
1664 case 0x200A: /* HAIR SPACE */
1665 case 0x202f: /* NARROW NO-BREAK SPACE */
1666 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1667 case 0x3000: /* IDEOGRAPHIC SPACE */
1668 break;
1669 }
1670 ecode++;
1671 break;
1672
1673 case OP_NOT_VSPACE:
1674 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1675 GETCHARINCTEST(c, eptr);
1676 switch(c)
1677 {
1678 default: break;
1679 case 0x0a: /* LF */
1680 case 0x0b: /* VT */
1681 case 0x0c: /* FF */
1682 case 0x0d: /* CR */
1683 case 0x85: /* NEL */
1684 case 0x2028: /* LINE SEPARATOR */
1685 case 0x2029: /* PARAGRAPH SEPARATOR */
1686 RRETURN(MATCH_NOMATCH);
1687 }
1688 ecode++;
1689 break;
1690
1691 case OP_VSPACE:
1692 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1693 GETCHARINCTEST(c, eptr);
1694 switch(c)
1695 {
1696 default: RRETURN(MATCH_NOMATCH);
1697 case 0x0a: /* LF */
1698 case 0x0b: /* VT */
1699 case 0x0c: /* FF */
1700 case 0x0d: /* CR */
1701 case 0x85: /* NEL */
1702 case 0x2028: /* LINE SEPARATOR */
1703 case 0x2029: /* PARAGRAPH SEPARATOR */
1704 break;
1705 }
1706 ecode++;
1707 break;
1708
1709 #ifdef SUPPORT_UCP
1710 /* Check the next character by Unicode property. We will get here only
1711 if the support is in the binary; otherwise a compile-time error occurs. */
1712
1713 case OP_PROP:
1714 case OP_NOTPROP:
1715 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1716 GETCHARINCTEST(c, eptr);
1717 {
1718 const ucd_record *prop = GET_UCD(c);
1719
1720 switch(ecode[1])
1721 {
1722 case PT_ANY:
1723 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1724 break;
1725
1726 case PT_LAMP:
1727 if ((prop->chartype == ucp_Lu ||
1728 prop->chartype == ucp_Ll ||
1729 prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1730 RRETURN(MATCH_NOMATCH);
1731 break;
1732
1733 case PT_GC:
1734 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1735 RRETURN(MATCH_NOMATCH);
1736 break;
1737
1738 case PT_PC:
1739 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1740 RRETURN(MATCH_NOMATCH);
1741 break;
1742
1743 case PT_SC:
1744 if ((ecode[2] != prop->script) == (op == OP_PROP))
1745 RRETURN(MATCH_NOMATCH);
1746 break;
1747
1748 default:
1749 RRETURN(PCRE_ERROR_INTERNAL);
1750 }
1751
1752 ecode += 3;
1753 }
1754 break;
1755
1756 /* Match an extended Unicode sequence. We will get here only if the support
1757 is in the binary; otherwise a compile-time error occurs. */
1758
1759 case OP_EXTUNI:
1760 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1761 GETCHARINCTEST(c, eptr);
1762 {
1763 int category = UCD_CATEGORY(c);
1764 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1765 while (eptr < md->end_subject)
1766 {
1767 int len = 1;
1768 if (!utf8) c = *eptr; else
1769 {
1770 GETCHARLEN(c, eptr, len);
1771 }
1772 category = UCD_CATEGORY(c);
1773 if (category != ucp_M) break;
1774 eptr += len;
1775 }
1776 }
1777 ecode++;
1778 break;
1779 #endif
1780
1781
1782 /* Match a back reference, possibly repeatedly. Look past the end of the
1783 item to see if there is repeat information following. The code is similar
1784 to that for character classes, but repeated for efficiency. Then obey
1785 similar code to character type repeats - written out again for speed.
1786 However, if the referenced string is the empty string, always treat
1787 it as matched, any number of times (otherwise there could be infinite
1788 loops). */
1789
1790 case OP_REF:
1791 {
1792 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1793 ecode += 3;
1794
1795 /* If the reference is unset, there are two possibilities:
1796
1797 (a) In the default, Perl-compatible state, set the length to be longer
1798 than the amount of subject left; this ensures that every attempt at a
1799 match fails. We can't just fail here, because of the possibility of
1800 quantifiers with zero minima.
1801
1802 (b) If the JavaScript compatibility flag is set, set the length to zero
1803 so that the back reference matches an empty string.
1804
1805 Otherwise, set the length to the length of what was matched by the
1806 referenced subpattern. */
1807
1808 if (offset >= offset_top || md->offset_vector[offset] < 0)
1809 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1810 else
1811 length = md->offset_vector[offset+1] - md->offset_vector[offset];
1812
1813 /* Set up for repetition, or handle the non-repeated case */
1814
1815 switch (*ecode)
1816 {
1817 case OP_CRSTAR:
1818 case OP_CRMINSTAR:
1819 case OP_CRPLUS:
1820 case OP_CRMINPLUS:
1821 case OP_CRQUERY:
1822 case OP_CRMINQUERY:
1823 c = *ecode++ - OP_CRSTAR;
1824 minimize = (c & 1) != 0;
1825 min = rep_min[c]; /* Pick up values from tables; */
1826 max = rep_max[c]; /* zero for max => infinity */
1827 if (max == 0) max = INT_MAX;
1828 break;
1829
1830 case OP_CRRANGE:
1831 case OP_CRMINRANGE:
1832 minimize = (*ecode == OP_CRMINRANGE);
1833 min = GET2(ecode, 1);
1834 max = GET2(ecode, 3);
1835 if (max == 0) max = INT_MAX;
1836 ecode += 5;
1837 break;
1838
1839 default: /* No repeat follows */
1840 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1841 eptr += length;
1842 continue; /* With the main loop */
1843 }
1844
1845 /* If the length of the reference is zero, just continue with the
1846 main loop. */
1847
1848 if (length == 0) continue;
1849
1850 /* First, ensure the minimum number of matches are present. We get back
1851 the length of the reference string explicitly rather than passing the
1852 address of eptr, so that eptr can be a register variable. */
1853
1854 for (i = 1; i <= min; i++)
1855 {
1856 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1857 eptr += length;
1858 }
1859
1860 /* If min = max, continue at the same level without recursion.
1861 They are not both allowed to be zero. */
1862
1863 if (min == max) continue;
1864
1865 /* If minimizing, keep trying and advancing the pointer */
1866
1867 if (minimize)
1868 {
1869 for (fi = min;; fi++)
1870 {
1871 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1872 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1873 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1874 RRETURN(MATCH_NOMATCH);
1875 eptr += length;
1876 }
1877 /* Control never gets here */
1878 }
1879
1880 /* If maximizing, find the longest string and work backwards */
1881
1882 else
1883 {
1884 pp = eptr;
1885 for (i = min; i < max; i++)
1886 {
1887 if (!match_ref(offset, eptr, length, md, ims)) break;
1888 eptr += length;
1889 }
1890 while (eptr >= pp)
1891 {
1892 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1893 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1894 eptr -= length;
1895 }
1896 RRETURN(MATCH_NOMATCH);
1897 }
1898 }
1899 /* Control never gets here */
1900
1901
1902
1903 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1904 used when all the characters in the class have values in the range 0-255,
1905 and either the matching is caseful, or the characters are in the range
1906 0-127 when UTF-8 processing is enabled. The only difference between
1907 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1908 encountered.
1909
1910 First, look past the end of the item to see if there is repeat information
1911 following. Then obey similar code to character type repeats - written out
1912 again for speed. */
1913
1914 case OP_NCLASS:
1915 case OP_CLASS:
1916 {
1917 data = ecode + 1; /* Save for matching */
1918 ecode += 33; /* Advance past the item */
1919
1920 switch (*ecode)
1921 {
1922 case OP_CRSTAR:
1923 case OP_CRMINSTAR:
1924 case OP_CRPLUS:
1925 case OP_CRMINPLUS:
1926 case OP_CRQUERY:
1927 case OP_CRMINQUERY:
1928 c = *ecode++ - OP_CRSTAR;
1929 minimize = (c & 1) != 0;
1930 min = rep_min[c]; /* Pick up values from tables; */
1931 max = rep_max[c]; /* zero for max => infinity */
1932 if (max == 0) max = INT_MAX;
1933 break;
1934
1935 case OP_CRRANGE:
1936 case OP_CRMINRANGE:
1937 minimize = (*ecode == OP_CRMINRANGE);
1938 min = GET2(ecode, 1);
1939 max = GET2(ecode, 3);
1940 if (max == 0) max = INT_MAX;
1941 ecode += 5;
1942 break;
1943
1944 default: /* No repeat follows */
1945 min = max = 1;
1946 break;
1947 }
1948
1949 /* First, ensure the minimum number of matches are present. */
1950
1951 #ifdef SUPPORT_UTF8
1952 /* UTF-8 mode */
1953 if (utf8)
1954 {
1955 for (i = 1; i <= min; i++)
1956 {
1957 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1958 GETCHARINC(c, eptr);
1959 if (c > 255)
1960 {
1961 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1962 }
1963 else
1964 {
1965 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1966 }
1967 }
1968 }
1969 else
1970 #endif
1971 /* Not UTF-8 mode */
1972 {
1973 for (i = 1; i <= min; i++)
1974 {
1975 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1976 c = *eptr++;
1977 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1978 }
1979 }
1980
1981 /* If max == min we can continue with the main loop without the
1982 need to recurse. */
1983
1984 if (min == max) continue;
1985
1986 /* If minimizing, keep testing the rest of the expression and advancing
1987 the pointer while it matches the class. */
1988
1989 if (minimize)
1990 {
1991 #ifdef SUPPORT_UTF8
1992 /* UTF-8 mode */
1993 if (utf8)
1994 {
1995 for (fi = min;; fi++)
1996 {
1997 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1999 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2000 GETCHARINC(c, eptr);
2001 if (c > 255)
2002 {
2003 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2004 }
2005 else
2006 {
2007 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2008 }
2009 }
2010 }
2011 else
2012 #endif
2013 /* Not UTF-8 mode */
2014 {
2015 for (fi = min;; fi++)
2016 {
2017 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2018 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2020 c = *eptr++;
2021 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2022 }
2023 }
2024 /* Control never gets here */
2025 }
2026
2027 /* If maximizing, find the longest possible run, then work backwards. */
2028
2029 else
2030 {
2031 pp = eptr;
2032
2033 #ifdef SUPPORT_UTF8
2034 /* UTF-8 mode */
2035 if (utf8)
2036 {
2037 for (i = min; i < max; i++)
2038 {
2039 int len = 1;
2040 if (eptr >= md->end_subject) break;
2041 GETCHARLEN(c, eptr, len);
2042 if (c > 255)
2043 {
2044 if (op == OP_CLASS) break;
2045 }
2046 else
2047 {
2048 if ((data[c/8] & (1 << (c&7))) == 0) break;
2049 }
2050 eptr += len;
2051 }
2052 for (;;)
2053 {
2054 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2055 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2056 if (eptr-- == pp) break; /* Stop if tried at original pos */
2057 BACKCHAR(eptr);
2058 }
2059 }
2060 else
2061 #endif
2062 /* Not UTF-8 mode */
2063 {
2064 for (i = min; i < max; i++)
2065 {
2066 if (eptr >= md->end_subject) break;
2067 c = *eptr;
2068 if ((data[c/8] & (1 << (c&7))) == 0) break;
2069 eptr++;
2070 }
2071 while (eptr >= pp)
2072 {
2073 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2074 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2075 eptr--;
2076 }
2077 }
2078
2079 RRETURN(MATCH_NOMATCH);
2080 }
2081 }
2082 /* Control never gets here */
2083
2084
2085 /* Match an extended character class. This opcode is encountered only
2086 when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2087 mode, because Unicode properties are supported in non-UTF-8 mode. */
2088
2089 #ifdef SUPPORT_UTF8
2090 case OP_XCLASS:
2091 {
2092 data = ecode + 1 + LINK_SIZE; /* Save for matching */
2093 ecode += GET(ecode, 1); /* Advance past the item */
2094
2095 switch (*ecode)
2096 {
2097 case OP_CRSTAR:
2098 case OP_CRMINSTAR:
2099 case OP_CRPLUS:
2100 case OP_CRMINPLUS:
2101 case OP_CRQUERY:
2102 case OP_CRMINQUERY:
2103 c = *ecode++ - OP_CRSTAR;
2104 minimize = (c & 1) != 0;
2105 min = rep_min[c]; /* Pick up values from tables; */
2106 max = rep_max[c]; /* zero for max => infinity */
2107 if (max == 0) max = INT_MAX;
2108 break;
2109
2110 case OP_CRRANGE:
2111 case OP_CRMINRANGE:
2112 minimize = (*ecode == OP_CRMINRANGE);
2113 min = GET2(ecode, 1);
2114 max = GET2(ecode, 3);
2115 if (max == 0) max = INT_MAX;
2116 ecode += 5;
2117 break;
2118
2119 default: /* No repeat follows */
2120 min = max = 1;
2121 break;
2122 }
2123
2124 /* First, ensure the minimum number of matches are present. */
2125
2126 for (i = 1; i <= min; i++)
2127 {
2128 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2129 GETCHARINCTEST(c, eptr);
2130 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2131 }
2132
2133 /* If max == min we can continue with the main loop without the
2134 need to recurse. */
2135
2136 if (min == max) continue;
2137
2138 /* If minimizing, keep testing the rest of the expression and advancing
2139 the pointer while it matches the class. */
2140
2141 if (minimize)
2142 {
2143 for (fi = min;; fi++)
2144 {
2145 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2146 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2147 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2148 GETCHARINCTEST(c, eptr);
2149 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2150 }
2151 /* Control never gets here */
2152 }
2153
2154 /* If maximizing, find the longest possible run, then work backwards. */
2155
2156 else
2157 {
2158 pp = eptr;
2159 for (i = min; i < max; i++)
2160 {
2161 int len = 1;
2162 if (eptr >= md->end_subject) break;
2163 GETCHARLENTEST(c, eptr, len);
2164 if (!_pcre_xclass(c, data)) break;
2165 eptr += len;
2166 }
2167 for(;;)
2168 {
2169 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2170 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2171 if (eptr-- == pp) break; /* Stop if tried at original pos */
2172 if (utf8) BACKCHAR(eptr);
2173 }
2174 RRETURN(MATCH_NOMATCH);
2175 }
2176
2177 /* Control never gets here */
2178 }
2179 #endif /* End of XCLASS */
2180
2181 /* Match a single character, casefully */
2182
2183 case OP_CHAR:
2184 #ifdef SUPPORT_UTF8
2185 if (utf8)
2186 {
2187 length = 1;
2188 ecode++;
2189 GETCHARLEN(fc, ecode, length);
2190 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2191 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2192 }
2193 else
2194 #endif
2195
2196 /* Non-UTF-8 mode */
2197 {
2198 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2199 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2200 ecode += 2;
2201 }
2202 break;
2203
2204 /* Match a single character, caselessly */
2205
2206 case OP_CHARNC:
2207 #ifdef SUPPORT_UTF8
2208 if (utf8)
2209 {
2210 length = 1;
2211 ecode++;
2212 GETCHARLEN(fc, ecode, length);
2213
2214 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2215
2216 /* If the pattern character's value is < 128, we have only one byte, and
2217 can use the fast lookup table. */
2218
2219 if (fc < 128)
2220 {
2221 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2222 }
2223
2224 /* Otherwise we must pick up the subject character */
2225
2226 else
2227 {
2228 unsigned int dc;
2229 GETCHARINC(dc, eptr);
2230 ecode += length;
2231
2232 /* If we have Unicode property support, we can use it to test the other
2233 case of the character, if there is one. */
2234
2235 if (fc != dc)
2236 {
2237 #ifdef SUPPORT_UCP
2238 if (dc != UCD_OTHERCASE(fc))
2239 #endif
2240 RRETURN(MATCH_NOMATCH);
2241 }
2242 }
2243 }
2244 else
2245 #endif /* SUPPORT_UTF8 */
2246
2247 /* Non-UTF-8 mode */
2248 {
2249 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2250 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2251 ecode += 2;
2252 }
2253 break;
2254
2255 /* Match a single character repeatedly. */
2256
2257 case OP_EXACT:
2258 min = max = GET2(ecode, 1);
2259 ecode += 3;
2260 goto REPEATCHAR;
2261
2262 case OP_POSUPTO:
2263 possessive = TRUE;
2264 /* Fall through */
2265
2266 case OP_UPTO:
2267 case OP_MINUPTO:
2268 min = 0;
2269 max = GET2(ecode, 1);
2270 minimize = *ecode == OP_MINUPTO;
2271 ecode += 3;
2272 goto REPEATCHAR;
2273
2274 case OP_POSSTAR:
2275 possessive = TRUE;
2276 min = 0;
2277 max = INT_MAX;
2278 ecode++;
2279 goto REPEATCHAR;
2280
2281 case OP_POSPLUS:
2282 possessive = TRUE;
2283 min = 1;
2284 max = INT_MAX;
2285 ecode++;
2286 goto REPEATCHAR;
2287
2288 case OP_POSQUERY:
2289 possessive = TRUE;
2290 min = 0;
2291 max = 1;
2292 ecode++;
2293 goto REPEATCHAR;
2294
2295 case OP_STAR:
2296 case OP_MINSTAR:
2297 case OP_PLUS:
2298 case OP_MINPLUS:
2299 case OP_QUERY:
2300 case OP_MINQUERY:
2301 c = *ecode++ - OP_STAR;
2302 minimize = (c & 1) != 0;
2303 min = rep_min[c]; /* Pick up values from tables; */
2304 max = rep_max[c]; /* zero for max => infinity */
2305 if (max == 0) max = INT_MAX;
2306
2307 /* Common code for all repeated single-character matches. We can give
2308 up quickly if there are fewer than the minimum number of characters left in
2309 the subject. */
2310
2311 REPEATCHAR:
2312 #ifdef SUPPORT_UTF8
2313 if (utf8)
2314 {
2315 length = 1;
2316 charptr = ecode;
2317 GETCHARLEN(fc, ecode, length);
2318 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2319 ecode += length;
2320
2321 /* Handle multibyte character matching specially here. There is
2322 support for caseless matching if UCP support is present. */
2323
2324 if (length > 1)
2325 {
2326 #ifdef SUPPORT_UCP
2327 unsigned int othercase;
2328 if ((ims & PCRE_CASELESS) != 0 &&
2329 (othercase = UCD_OTHERCASE(fc)) != fc)
2330 oclength = _pcre_ord2utf8(othercase, occhars);
2331 else oclength = 0;
2332 #endif /* SUPPORT_UCP */
2333
2334 for (i = 1; i <= min; i++)
2335 {
2336 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2337 #ifdef SUPPORT_UCP
2338 /* Need braces because of following else */
2339 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2340 else
2341 {
2342 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2343 eptr += oclength;
2344 }
2345 #else /* without SUPPORT_UCP */
2346 else { RRETURN(MATCH_NOMATCH); }
2347 #endif /* SUPPORT_UCP */
2348 }
2349
2350 if (min == max) continue;
2351
2352 if (minimize)
2353 {
2354 for (fi = min;; fi++)
2355 {
2356 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2357 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2358 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2359 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2360 #ifdef SUPPORT_UCP
2361 /* Need braces because of following else */
2362 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2363 else
2364 {
2365 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2366 eptr += oclength;
2367 }
2368 #else /* without SUPPORT_UCP */
2369 else { RRETURN (MATCH_NOMATCH); }
2370 #endif /* SUPPORT_UCP */
2371 }
2372 /* Control never gets here */
2373 }
2374
2375 else /* Maximize */
2376 {
2377 pp = eptr;
2378 for (i = min; i < max; i++)
2379 {
2380 if (eptr > md->end_subject - length) break;
2381 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2382 #ifdef SUPPORT_UCP
2383 else if (oclength == 0) break;
2384 else
2385 {
2386 if (memcmp(eptr, occhars, oclength) != 0) break;
2387 eptr += oclength;
2388 }
2389 #else /* without SUPPORT_UCP */
2390 else break;
2391 #endif /* SUPPORT_UCP */
2392 }
2393
2394 if (possessive) continue;
2395 for(;;)
2396 {
2397 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2398 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2399 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2400 #ifdef SUPPORT_UCP
2401 eptr--;
2402 BACKCHAR(eptr);
2403 #else /* without SUPPORT_UCP */
2404 eptr -= length;
2405 #endif /* SUPPORT_UCP */
2406 }
2407 }
2408 /* Control never gets here */
2409 }
2410
2411 /* If the length of a UTF-8 character is 1, we fall through here, and
2412 obey the code as for non-UTF-8 characters below, though in this case the
2413 value of fc will always be < 128. */
2414 }
2415 else
2416 #endif /* SUPPORT_UTF8 */
2417
2418 /* When not in UTF-8 mode, load a single-byte character. */
2419 {
2420 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2421 fc = *ecode++;
2422 }
2423
2424 /* The value of fc at this point is always less than 256, though we may or
2425 may not be in UTF-8 mode. The code is duplicated for the caseless and
2426 caseful cases, for speed, since matching characters is likely to be quite
2427 common. First, ensure the minimum number of matches are present. If min =
2428 max, continue at the same level without recursing. Otherwise, if
2429 minimizing, keep trying the rest of the expression and advancing one
2430 matching character if failing, up to the maximum. Alternatively, if
2431 maximizing, find the maximum number of characters and work backwards. */
2432
2433 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2434 max, eptr));
2435
2436 if ((ims & PCRE_CASELESS) != 0)
2437 {
2438 fc = md->lcc[fc];
2439 for (i = 1; i <= min; i++)
2440 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2441 if (min == max) continue;
2442 if (minimize)
2443 {
2444 for (fi = min;; fi++)
2445 {
2446 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2447 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2448 if (fi >= max || eptr >= md->end_subject ||
2449 fc != md->lcc[*eptr++])
2450 RRETURN(MATCH_NOMATCH);
2451 }
2452 /* Control never gets here */
2453 }
2454 else /* Maximize */
2455 {
2456 pp = eptr;
2457 for (i = min; i < max; i++)
2458 {
2459 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2460 eptr++;
2461 }
2462 if (possessive) continue;
2463 while (eptr >= pp)
2464 {
2465 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2466 eptr--;
2467 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2468 }
2469 RRETURN(MATCH_NOMATCH);
2470 }
2471 /* Control never gets here */
2472 }
2473
2474 /* Caseful comparisons (includes all multi-byte characters) */
2475
2476 else
2477 {
2478 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2479 if (min == max) continue;
2480 if (minimize)
2481 {
2482 for (fi = min;; fi++)
2483 {
2484 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2485 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2486 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2487 RRETURN(MATCH_NOMATCH);
2488 }
2489 /* Control never gets here */
2490 }
2491 else /* Maximize */
2492 {
2493 pp = eptr;
2494 for (i = min; i < max; i++)
2495 {
2496 if (eptr >= md->end_subject || fc != *eptr) break;
2497 eptr++;
2498 }
2499 if (possessive) continue;
2500 while (eptr >= pp)
2501 {
2502 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2503 eptr--;
2504 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2505 }
2506 RRETURN(MATCH_NOMATCH);
2507 }
2508 }
2509 /* Control never gets here */
2510
2511 /* Match a negated single one-byte character. The character we are
2512 checking can be multibyte. */
2513
2514 case OP_NOT:
2515 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2516 ecode++;
2517 GETCHARINCTEST(c, eptr);
2518 if ((ims & PCRE_CASELESS) != 0)
2519 {
2520 #ifdef SUPPORT_UTF8
2521 if (c < 256)
2522 #endif
2523 c = md->lcc[c];
2524 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2525 }
2526 else
2527 {
2528 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2529 }
2530 break;
2531
2532 /* Match a negated single one-byte character repeatedly. This is almost a
2533 repeat of the code for a repeated single character, but I haven't found a
2534 nice way of commoning these up that doesn't require a test of the
2535 positive/negative option for each character match. Maybe that wouldn't add
2536 very much to the time taken, but character matching *is* what this is all
2537 about... */
2538
2539 case OP_NOTEXACT:
2540 min = max = GET2(ecode, 1);
2541 ecode += 3;
2542 goto REPEATNOTCHAR;
2543
2544 case OP_NOTUPTO:
2545 case OP_NOTMINUPTO:
2546 min = 0;
2547 max = GET2(ecode, 1);
2548 minimize = *ecode == OP_NOTMINUPTO;
2549 ecode += 3;
2550 goto REPEATNOTCHAR;
2551
2552 case OP_NOTPOSSTAR:
2553 possessive = TRUE;
2554 min = 0;
2555 max = INT_MAX;
2556 ecode++;
2557 goto REPEATNOTCHAR;
2558
2559 case OP_NOTPOSPLUS:
2560 possessive = TRUE;
2561 min = 1;
2562 max = INT_MAX;
2563 ecode++;
2564 goto REPEATNOTCHAR;
2565
2566 case OP_NOTPOSQUERY:
2567 possessive = TRUE;
2568 min = 0;
2569 max = 1;
2570 ecode++;
2571 goto REPEATNOTCHAR;
2572
2573 case OP_NOTPOSUPTO:
2574 possessive = TRUE;
2575 min = 0;
2576 max = GET2(ecode, 1);
2577 ecode += 3;
2578 goto REPEATNOTCHAR;
2579
2580 case OP_NOTSTAR:
2581 case OP_NOTMINSTAR:
2582 case OP_NOTPLUS:
2583 case OP_NOTMINPLUS:
2584 case OP_NOTQUERY:
2585 case OP_NOTMINQUERY:
2586 c = *ecode++ - OP_NOTSTAR;
2587 minimize = (c & 1) != 0;
2588 min = rep_min[c]; /* Pick up values from tables; */
2589 max = rep_max[c]; /* zero for max => infinity */
2590 if (max == 0) max = INT_MAX;
2591
2592 /* Common code for all repeated single-byte matches. We can give up quickly
2593 if there are fewer than the minimum number of bytes left in the
2594 subject. */
2595
2596 REPEATNOTCHAR:
2597 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2598 fc = *ecode++;
2599
2600 /* The code is duplicated for the caseless and caseful cases, for speed,
2601 since matching characters is likely to be quite common. First, ensure the
2602 minimum number of matches are present. If min = max, continue at the same
2603 level without recursing. Otherwise, if minimizing, keep trying the rest of
2604 the expression and advancing one matching character if failing, up to the
2605 maximum. Alternatively, if maximizing, find the maximum number of
2606 characters and work backwards. */
2607
2608 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2609 max, eptr));
2610
2611 if ((ims & PCRE_CASELESS) != 0)
2612 {
2613 fc = md->lcc[fc];
2614
2615 #ifdef SUPPORT_UTF8
2616 /* UTF-8 mode */
2617 if (utf8)
2618 {
2619 register unsigned int d;
2620 for (i = 1; i <= min; i++)
2621 {
2622 GETCHARINC(d, eptr);
2623 if (d < 256) d = md->lcc[d];
2624 if (fc == d) RRETURN(MATCH_NOMATCH);
2625 }
2626 }
2627 else
2628 #endif
2629
2630 /* Not UTF-8 mode */
2631 {
2632 for (i = 1; i <= min; i++)
2633 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2634 }
2635
2636 if (min == max) continue;
2637
2638 if (minimize)
2639 {
2640 #ifdef SUPPORT_UTF8
2641 /* UTF-8 mode */
2642 if (utf8)
2643 {
2644 register unsigned int d;
2645 for (fi = min;; fi++)
2646 {
2647 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2649 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2650 GETCHARINC(d, eptr);
2651 if (d < 256) d = md->lcc[d];
2652 if (fc == d) RRETURN(MATCH_NOMATCH);
2653
2654 }
2655 }
2656 else
2657 #endif
2658 /* Not UTF-8 mode */
2659 {
2660 for (fi = min;; fi++)
2661 {
2662 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2663 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2664 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2665 RRETURN(MATCH_NOMATCH);
2666 }
2667 }
2668 /* Control never gets here */
2669 }
2670
2671 /* Maximize case */
2672
2673 else
2674 {
2675 pp = eptr;
2676
2677 #ifdef SUPPORT_UTF8
2678 /* UTF-8 mode */
2679 if (utf8)
2680 {
2681 register unsigned int d;
2682 for (i = min; i < max; i++)
2683 {
2684 int len = 1;
2685 if (eptr >= md->end_subject) break;
2686 GETCHARLEN(d, eptr, len);
2687 if (d < 256) d = md->lcc[d];
2688 if (fc == d) break;
2689 eptr += len;
2690 }
2691 if (possessive) continue;
2692 for(;;)
2693 {
2694 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2695 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2696 if (eptr-- == pp) break; /* Stop if tried at original pos */
2697 BACKCHAR(eptr);
2698 }
2699 }
2700 else
2701 #endif
2702 /* Not UTF-8 mode */
2703 {
2704 for (i = min; i < max; i++)
2705 {
2706 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2707 eptr++;
2708 }
2709 if (possessive) continue;
2710 while (eptr >= pp)
2711 {
2712 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2713 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2714 eptr--;
2715 }
2716 }
2717
2718 RRETURN(MATCH_NOMATCH);
2719 }
2720 /* Control never gets here */
2721 }
2722
2723 /* Caseful comparisons */
2724
2725 else
2726 {
2727 #ifdef SUPPORT_UTF8
2728 /* UTF-8 mode */
2729 if (utf8)
2730 {
2731 register unsigned int d;
2732 for (i = 1; i <= min; i++)
2733 {
2734 GETCHARINC(d, eptr);
2735 if (fc == d) RRETURN(MATCH_NOMATCH);
2736 }
2737 }
2738 else
2739 #endif
2740 /* Not UTF-8 mode */
2741 {
2742 for (i = 1; i <= min; i++)
2743 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2744 }
2745
2746 if (min == max) continue;
2747
2748 if (minimize)
2749 {
2750 #ifdef SUPPORT_UTF8
2751 /* UTF-8 mode */
2752 if (utf8)
2753 {
2754 register unsigned int d;
2755 for (fi = min;; fi++)
2756 {
2757 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2758 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2759 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2760 GETCHARINC(d, eptr);
2761 if (fc == d) RRETURN(MATCH_NOMATCH);
2762 }
2763 }
2764 else
2765 #endif
2766 /* Not UTF-8 mode */
2767 {
2768 for (fi = min;; fi++)
2769 {
2770 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2771 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2772 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2773 RRETURN(MATCH_NOMATCH);
2774 }
2775 }
2776 /* Control never gets here */
2777 }
2778
2779 /* Maximize case */
2780
2781 else
2782 {
2783 pp = eptr;
2784
2785 #ifdef SUPPORT_UTF8
2786 /* UTF-8 mode */
2787 if (utf8)
2788 {
2789 register unsigned int d;
2790 for (i = min; i < max; i++)
2791 {
2792 int len = 1;
2793 if (eptr >= md->end_subject) break;
2794 GETCHARLEN(d, eptr, len);
2795 if (fc == d) break;
2796 eptr += len;
2797 }
2798 if (possessive) continue;
2799 for(;;)
2800 {
2801 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2803 if (eptr-- == pp) break; /* Stop if tried at original pos */
2804 BACKCHAR(eptr);
2805 }
2806 }
2807 else
2808 #endif
2809 /* Not UTF-8 mode */
2810 {
2811 for (i = min; i < max; i++)
2812 {
2813 if (eptr >= md->end_subject || fc == *eptr) break;
2814 eptr++;
2815 }
2816 if (possessive) continue;
2817 while (eptr >= pp)
2818 {
2819 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2820 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2821 eptr--;
2822 }
2823 }
2824
2825 RRETURN(MATCH_NOMATCH);
2826 }
2827 }
2828 /* Control never gets here */
2829
2830 /* Match a single character type repeatedly; several different opcodes
2831 share code. This is very similar to the code for single characters, but we
2832 repeat it in the interests of efficiency. */
2833
2834 case OP_TYPEEXACT:
2835 min = max = GET2(ecode, 1);
2836 minimize = TRUE;
2837 ecode += 3;
2838 goto REPEATTYPE;
2839
2840 case OP_TYPEUPTO:
2841 case OP_TYPEMINUPTO:
2842 min = 0;
2843 max = GET2(ecode, 1);
2844 minimize = *ecode == OP_TYPEMINUPTO;
2845 ecode += 3;
2846 goto REPEATTYPE;
2847
2848 case OP_TYPEPOSSTAR:
2849 possessive = TRUE;
2850 min = 0;
2851 max = INT_MAX;
2852 ecode++;
2853 goto REPEATTYPE;
2854
2855 case OP_TYPEPOSPLUS:
2856 possessive = TRUE;
2857 min = 1;
2858 max = INT_MAX;
2859 ecode++;
2860 goto REPEATTYPE;
2861
2862 case OP_TYPEPOSQUERY:
2863 possessive = TRUE;
2864 min = 0;
2865 max = 1;
2866 ecode++;
2867 goto REPEATTYPE;
2868
2869 case OP_TYPEPOSUPTO:
2870 possessive = TRUE;
2871 min = 0;
2872 max = GET2(ecode, 1);
2873 ecode += 3;
2874 goto REPEATTYPE;
2875
2876 case OP_TYPESTAR:
2877 case OP_TYPEMINSTAR:
2878 case OP_TYPEPLUS:
2879 case OP_TYPEMINPLUS:
2880 case OP_TYPEQUERY:
2881 case OP_TYPEMINQUERY:
2882 c = *ecode++ - OP_TYPESTAR;
2883 minimize = (c & 1) != 0;
2884 min = rep_min[c]; /* Pick up values from tables; */
2885 max = rep_max[c]; /* zero for max => infinity */
2886 if (max == 0) max = INT_MAX;
2887
2888 /* Common code for all repeated single character type matches. Note that
2889 in UTF-8 mode, '.' matches a character of any length, but for the other
2890 character types, the valid characters are all one-byte long. */
2891
2892 REPEATTYPE:
2893 ctype = *ecode++; /* Code for the character type */
2894
2895 #ifdef SUPPORT_UCP
2896 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2897 {
2898 prop_fail_result = ctype == OP_NOTPROP;
2899 prop_type = *ecode++;
2900 prop_value = *ecode++;
2901 }
2902 else prop_type = -1;
2903 #endif
2904
2905 /* First, ensure the minimum number of matches are present. Use inline
2906 code for maximizing the speed, and do the type test once at the start
2907 (i.e. keep it out of the loop). Also we can test that there are at least
2908 the minimum number of bytes before we start. This isn't as effective in
2909 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2910 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2911 and single-bytes. */
2912
2913 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2914 if (min > 0)
2915 {
2916 #ifdef SUPPORT_UCP
2917 if (prop_type >= 0)
2918 {
2919 switch(prop_type)
2920 {
2921 case PT_ANY:
2922 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2923 for (i = 1; i <= min; i++)
2924 {
2925 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2926 GETCHARINCTEST(c, eptr);
2927 }
2928 break;
2929
2930 case PT_LAMP:
2931 for (i = 1; i <= min; i++)
2932 {
2933 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2934 GETCHARINCTEST(c, eptr);
2935 prop_chartype = UCD_CHARTYPE(c);
2936 if ((prop_chartype == ucp_Lu ||
2937 prop_chartype == ucp_Ll ||
2938 prop_chartype == ucp_Lt) == prop_fail_result)
2939 RRETURN(MATCH_NOMATCH);
2940 }
2941 break;
2942
2943 case PT_GC:
2944 for (i = 1; i <= min; i++)
2945 {
2946 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2947 GETCHARINCTEST(c, eptr);
2948 prop_category = UCD_CATEGORY(c);
2949 if ((prop_category == prop_value) == prop_fail_result)
2950 RRETURN(MATCH_NOMATCH);
2951 }
2952 break;
2953
2954 case PT_PC:
2955 for (i = 1; i <= min; i++)
2956 {
2957 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2958 GETCHARINCTEST(c, eptr);
2959 prop_chartype = UCD_CHARTYPE(c);
2960 if ((prop_chartype == prop_value) == prop_fail_result)
2961 RRETURN(MATCH_NOMATCH);
2962 }
2963 break;
2964
2965 case PT_SC:
2966 for (i = 1; i <= min; i++)
2967 {
2968 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2969 GETCHARINCTEST(c, eptr);
2970 prop_script = UCD_SCRIPT(c);
2971 if ((prop_script == prop_value) == prop_fail_result)
2972 RRETURN(MATCH_NOMATCH);
2973 }
2974 break;
2975
2976 default:
2977 RRETURN(PCRE_ERROR_INTERNAL);
2978 }
2979 }
2980
2981 /* Match extended Unicode sequences. We will get here only if the
2982 support is in the binary; otherwise a compile-time error occurs. */
2983
2984 else if (ctype == OP_EXTUNI)
2985 {
2986 for (i = 1; i <= min; i++)
2987 {
2988 GETCHARINCTEST(c, eptr);
2989 prop_category = UCD_CATEGORY(c);
2990 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2991 while (eptr < md->end_subject)
2992 {
2993 int len = 1;
2994 if (!utf8) c = *eptr; else
2995 {
2996 GETCHARLEN(c, eptr, len);
2997 }
2998 prop_category = UCD_CATEGORY(c);
2999 if (prop_category != ucp_M) break;
3000 eptr += len;
3001 }
3002 }
3003 }
3004
3005 else
3006 #endif /* SUPPORT_UCP */
3007
3008 /* Handle all other cases when the coding is UTF-8 */
3009
3010 #ifdef SUPPORT_UTF8
3011 if (utf8) switch(ctype)
3012 {
3013 case OP_ANY:
3014 for (i = 1; i <= min; i++)
3015 {
3016 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3017 RRETURN(MATCH_NOMATCH);
3018 eptr++;
3019 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3020 }
3021 break;
3022
3023 case OP_ALLANY:
3024 for (i = 1; i <= min; i++)
3025 {
3026 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3027 eptr++;
3028 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3029 }
3030 break;
3031
3032 case OP_ANYBYTE:
3033 eptr += min;
3034 break;
3035
3036 case OP_ANYNL:
3037 for (i = 1; i <= min; i++)
3038 {
3039 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3040 GETCHARINC(c, eptr);
3041 switch(c)
3042 {
3043 default: RRETURN(MATCH_NOMATCH);
3044 case 0x000d:
3045 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3046 break;
3047
3048 case 0x000a:
3049 break;
3050
3051 case 0x000b:
3052 case 0x000c:
3053 case 0x0085:
3054 case 0x2028:
3055 case 0x2029:
3056 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3057 break;
3058 }
3059 }
3060 break;
3061
3062 case OP_NOT_HSPACE:
3063 for (i = 1; i <= min; i++)
3064 {
3065 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3066 GETCHARINC(c, eptr);
3067 switch(c)
3068 {
3069 default: break;
3070 case 0x09: /* HT */
3071 case 0x20: /* SPACE */
3072 case 0xa0: /* NBSP */
3073 case 0x1680: /* OGHAM SPACE MARK */
3074 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3075 case 0x2000: /* EN QUAD */
3076 case 0x2001: /* EM QUAD */
3077 case 0x2002: /* EN SPACE */
3078 case 0x2003: /* EM SPACE */
3079 case 0x2004: /* THREE-PER-EM SPACE */
3080 case 0x2005: /* FOUR-PER-EM SPACE */
3081 case 0x2006: /* SIX-PER-EM SPACE */
3082 case 0x2007: /* FIGURE SPACE */
3083 case 0x2008: /* PUNCTUATION SPACE */
3084 case 0x2009: /* THIN SPACE */
3085 case 0x200A: /* HAIR SPACE */
3086 case 0x202f: /* NARROW NO-BREAK SPACE */
3087 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3088 case 0x3000: /* IDEOGRAPHIC SPACE */
3089 RRETURN(MATCH_NOMATCH);
3090 }
3091 }
3092 break;
3093
3094 case OP_HSPACE:
3095 for (i = 1; i <= min; i++)
3096 {
3097 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3098 GETCHARINC(c, eptr);
3099 switch(c)
3100 {
3101 default: RRETURN(MATCH_NOMATCH);
3102 case 0x09: /* HT */
3103 case 0x20: /* SPACE */
3104 case 0xa0: /* NBSP */
3105 case 0x1680: /* OGHAM SPACE MARK */
3106 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3107 case 0x2000: /* EN QUAD */
3108 case 0x2001: /* EM QUAD */
3109 case 0x2002: /* EN SPACE */
3110 case 0x2003: /* EM SPACE */
3111 case 0x2004: /* THREE-PER-EM SPACE */
3112 case 0x2005: /* FOUR-PER-EM SPACE */
3113 case 0x2006: /* SIX-PER-EM SPACE */
3114 case 0x2007: /* FIGURE SPACE */
3115 case 0x2008: /* PUNCTUATION SPACE */
3116 case 0x2009: /* THIN SPACE */
3117 case 0x200A: /* HAIR SPACE */
3118 case 0x202f: /* NARROW NO-BREAK SPACE */
3119 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3120 case 0x3000: /* IDEOGRAPHIC SPACE */
3121 break;
3122 }
3123 }
3124 break;
3125
3126 case OP_NOT_VSPACE:
3127 for (i = 1; i <= min; i++)
3128 {
3129 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3130 GETCHARINC(c, eptr);
3131 switch(c)
3132 {
3133 default: break;
3134 case 0x0a: /* LF */
3135 case 0x0b: /* VT */
3136 case 0x0c: /* FF */
3137 case 0x0d: /* CR */
3138 case 0x85: /* NEL */
3139 case 0x2028: /* LINE SEPARATOR */
3140 case 0x2029: /* PARAGRAPH SEPARATOR */
3141 RRETURN(MATCH_NOMATCH);
3142 }
3143 }
3144 break;
3145
3146 case OP_VSPACE:
3147 for (i = 1; i <= min; i++)
3148 {
3149 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3150 GETCHARINC(c, eptr);
3151 switch(c)
3152 {
3153 default: RRETURN(MATCH_NOMATCH);
3154 case 0x0a: /* LF */
3155 case 0x0b: /* VT */
3156 case 0x0c: /* FF */
3157 case 0x0d: /* CR */
3158 case 0x85: /* NEL */
3159 case 0x2028: /* LINE SEPARATOR */
3160 case 0x2029: /* PARAGRAPH SEPARATOR */
3161 break;
3162 }
3163 }
3164 break;
3165
3166 case OP_NOT_DIGIT:
3167 for (i = 1; i <= min; i++)
3168 {
3169 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3170 GETCHARINC(c, eptr);
3171 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3172 RRETURN(MATCH_NOMATCH);
3173 }
3174 break;
3175
3176 case OP_DIGIT:
3177 for (i = 1; i <= min; i++)
3178 {
3179 if (eptr >= md->end_subject ||
3180 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3181 RRETURN(MATCH_NOMATCH);
3182 /* No need to skip more bytes - we know it's a 1-byte character */
3183 }
3184 break;
3185
3186 case OP_NOT_WHITESPACE:
3187 for (i = 1; i <= min; i++)
3188 {
3189 if (eptr >= md->end_subject ||
3190 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3191 RRETURN(MATCH_NOMATCH);
3192 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3193 }
3194 break;
3195
3196 case OP_WHITESPACE:
3197 for (i = 1; i <= min; i++)
3198 {
3199 if (eptr >= md->end_subject ||
3200 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3201 RRETURN(MATCH_NOMATCH);
3202 /* No need to skip more bytes - we know it's a 1-byte character */
3203 }
3204 break;
3205
3206 case OP_NOT_WORDCHAR:
3207 for (i = 1; i <= min; i++)
3208 {
3209 if (eptr >= md->end_subject ||
3210 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3211 RRETURN(MATCH_NOMATCH);
3212 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3213 }
3214 break;
3215
3216 case OP_WORDCHAR:
3217 for (i = 1; i <= min; i++)
3218 {
3219 if (eptr >= md->end_subject ||
3220 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3221 RRETURN(MATCH_NOMATCH);
3222 /* No need to skip more bytes - we know it's a 1-byte character */
3223 }
3224 break;
3225
3226 default:
3227 RRETURN(PCRE_ERROR_INTERNAL);
3228 } /* End switch(ctype) */
3229
3230 else
3231 #endif /* SUPPORT_UTF8 */
3232
3233 /* Code for the non-UTF-8 case for minimum matching of operators other
3234 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3235 number of bytes present, as this was tested above. */
3236
3237 switch(ctype)
3238 {
3239 case OP_ANY:
3240 for (i = 1; i <= min; i++)
3241 {
3242 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3243 eptr++;
3244 }
3245 break;
3246
3247 case OP_ALLANY:
3248 eptr += min;
3249 break;
3250
3251 case OP_ANYBYTE:
3252 eptr += min;
3253 break;
3254
3255 /* Because of the CRLF case, we can't assume the minimum number of
3256 bytes are present in this case. */
3257
3258 case OP_ANYNL:
3259 for (i = 1; i <= min; i++)
3260 {
3261 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3262 switch(*eptr++)
3263 {
3264 default: RRETURN(MATCH_NOMATCH);
3265 case 0x000d:
3266 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3267 break;
3268 case 0x000a:
3269 break;
3270
3271 case 0x000b:
3272 case 0x000c:
3273 case 0x0085:
3274 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3275 break;
3276 }
3277 }
3278 break;
3279
3280 case OP_NOT_HSPACE:
3281 for (i = 1; i <= min; i++)
3282 {
3283 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3284 switch(*eptr++)
3285 {
3286 default: break;
3287 case 0x09: /* HT */
3288 case 0x20: /* SPACE */
3289 case 0xa0: /* NBSP */
3290 RRETURN(MATCH_NOMATCH);
3291 }
3292 }
3293 break;
3294
3295 case OP_HSPACE:
3296 for (i = 1; i <= min; i++)
3297 {
3298 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3299 switch(*eptr++)
3300 {
3301 default: RRETURN(MATCH_NOMATCH);
3302 case 0x09: /* HT */
3303 case 0x20: /* SPACE */
3304 case 0xa0: /* NBSP */
3305 break;
3306 }
3307 }
3308 break;
3309
3310 case OP_NOT_VSPACE:
3311 for (i = 1; i <= min; i++)
3312 {
3313 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3314 switch(*eptr++)
3315 {
3316 default: break;
3317 case 0x0a: /* LF */
3318 case 0x0b: /* VT */
3319 case 0x0c: /* FF */
3320 case 0x0d: /* CR */
3321 case 0x85: /* NEL */
3322 RRETURN(MATCH_NOMATCH);
3323 }
3324 }
3325 break;
3326
3327 case OP_VSPACE:
3328 for (i = 1; i <= min; i++)
3329 {
3330 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3331 switch(*eptr++)
3332 {
3333 default: RRETURN(MATCH_NOMATCH);
3334 case 0x0a: /* LF */
3335 case 0x0b: /* VT */
3336 case 0x0c: /* FF */
3337 case 0x0d: /* CR */
3338 case 0x85: /* NEL */
3339 break;
3340 }
3341 }
3342 break;
3343
3344 case OP_NOT_DIGIT:
3345 for (i = 1; i <= min; i++)
3346 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3347 break;
3348
3349 case OP_DIGIT:
3350 for (i = 1; i <= min; i++)
3351 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3352 break;
3353
3354 case OP_NOT_WHITESPACE:
3355 for (i = 1; i <= min; i++)
3356 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3357 break;
3358
3359 case OP_WHITESPACE:
3360 for (i = 1; i <= min; i++)
3361 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3362 break;
3363
3364 case OP_NOT_WORDCHAR:
3365 for (i = 1; i <= min; i++)
3366 if ((md->ctypes[*eptr++] & ctype_word) != 0)
3367 RRETURN(MATCH_NOMATCH);
3368 break;
3369
3370 case OP_WORDCHAR:
3371 for (i = 1; i <= min; i++)
3372 if ((md->ctypes[*eptr++] & ctype_word) == 0)
3373 RRETURN(MATCH_NOMATCH);
3374 break;
3375
3376 default:
3377 RRETURN(PCRE_ERROR_INTERNAL);
3378 }
3379 }
3380
3381 /* If min = max, continue at the same level without recursing */
3382
3383 if (min == max) continue;
3384
3385 /* If minimizing, we have to test the rest of the pattern before each
3386 subsequent match. Again, separate the UTF-8 case for speed, and also
3387 separate the UCP cases. */
3388
3389 if (minimize)
3390 {
3391 #ifdef SUPPORT_UCP
3392 if (prop_type >= 0)
3393 {
3394 switch(prop_type)
3395 {
3396 case PT_ANY:
3397 for (fi = min;; fi++)
3398 {
3399 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3400 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3401 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3402 GETCHARINC(c, eptr);
3403 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3404 }
3405 /* Control never gets here */
3406
3407 case PT_LAMP:
3408 for (fi = min;; fi++)
3409 {
3410 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3411 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3412 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3413 GETCHARINC(c, eptr);
3414 prop_chartype = UCD_CHARTYPE(c);
3415 if ((prop_chartype == ucp_Lu ||
3416 prop_chartype == ucp_Ll ||
3417 prop_chartype == ucp_Lt) == prop_fail_result)
3418 RRETURN(MATCH_NOMATCH);
3419 }
3420 /* Control never gets here */
3421
3422 case PT_GC:
3423 for (fi = min;; fi++)
3424 {
3425 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3426 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3427 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3428 GETCHARINC(c, eptr);
3429 prop_category = UCD_CATEGORY(c);
3430 if ((prop_category == prop_value) == prop_fail_result)
3431 RRETURN(MATCH_NOMATCH);
3432 }
3433 /* Control never gets here */
3434
3435 case PT_PC:
3436 for (fi = min;; fi++)
3437 {
3438 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3439 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3440 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3441 GETCHARINC(c, eptr);
3442 prop_chartype = UCD_CHARTYPE(c);
3443 if ((prop_chartype == prop_value) == prop_fail_result)
3444 RRETURN(MATCH_NOMATCH);
3445 }
3446 /* Control never gets here */
3447
3448 case PT_SC:
3449 for (fi = min;; fi++)
3450 {
3451 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3452 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3453 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3454 GETCHARINC(c, eptr);
3455 prop_script = UCD_SCRIPT(c);
3456 if ((prop_script == prop_value) == prop_fail_result)
3457 RRETURN(MATCH_NOMATCH);
3458 }
3459 /* Control never gets here */
3460
3461 default:
3462 RRETURN(PCRE_ERROR_INTERNAL);
3463 }
3464 }
3465
3466 /* Match extended Unicode sequences. We will get here only if the
3467 support is in the binary; otherwise a compile-time error occurs. */
3468
3469 else if (ctype == OP_EXTUNI)
3470 {
3471 for (fi = min;; fi++)
3472 {
3473 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3474 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3475 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3476 GETCHARINCTEST(c, eptr);
3477 prop_category = UCD_CATEGORY(c);
3478 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3479 while (eptr < md->end_subject)
3480 {
3481 int len = 1;
3482 if (!utf8) c = *eptr; else
3483 {
3484 GETCHARLEN(c, eptr, len);
3485 }
3486 prop_category = UCD_CATEGORY(c);
3487 if (prop_category != ucp_M) break;
3488 eptr += len;
3489 }
3490 }
3491 }
3492
3493 else
3494 #endif /* SUPPORT_UCP */
3495
3496 #ifdef SUPPORT_UTF8
3497 /* UTF-8 mode */
3498 if (utf8)
3499 {
3500 for (fi = min;; fi++)
3501 {
3502 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3503 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3504 if (fi >= max || eptr >= md->end_subject ||
3505 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3506 RRETURN(MATCH_NOMATCH);
3507
3508 GETCHARINC(c, eptr);
3509 switch(ctype)
3510 {
3511 case OP_ANY: /* This is the non-NL case */
3512 case OP_ALLANY:
3513 case OP_ANYBYTE:
3514 break;
3515
3516 case OP_ANYNL:
3517 switch(c)
3518 {
3519 default: RRETURN(MATCH_NOMATCH);
3520 case 0x000d:
3521 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3522 break;
3523 case 0x000a:
3524 break;
3525
3526 case 0x000b:
3527 case 0x000c:
3528 case 0x0085:
3529 case 0x2028:
3530 case 0x2029:
3531 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3532 break;
3533 }
3534 break;
3535
3536 case OP_NOT_HSPACE:
3537 switch(c)
3538 {
3539 default: break;
3540 case 0x09: /* HT */
3541 case 0x20: /* SPACE */
3542 case 0xa0: /* NBSP */
3543 case 0x1680: /* OGHAM SPACE MARK */
3544 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3545 case 0x2000: /* EN QUAD */
3546 case 0x2001: /* EM QUAD */
3547 case 0x2002: /* EN SPACE */
3548 case 0x2003: /* EM SPACE */
3549 case 0x2004: /* THREE-PER-EM SPACE */
3550 case 0x2005: /* FOUR-PER-EM SPACE */
3551 case 0x2006: /* SIX-PER-EM SPACE */
3552 case 0x2007: /* FIGURE SPACE */
3553 case 0x2008: /* PUNCTUATION SPACE */
3554 case 0x2009: /* THIN SPACE */
3555 case 0x200A: /* HAIR SPACE */
3556 case 0x202f: /* NARROW NO-BREAK SPACE */
3557 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3558 case 0x3000: /* IDEOGRAPHIC SPACE */
3559 RRETURN(MATCH_NOMATCH);
3560 }
3561 break;
3562
3563 case OP_HSPACE:
3564 switch(c)
3565 {
3566 default: RRETURN(MATCH_NOMATCH);
3567 case 0x09: /* HT */
3568 case 0x20: /* SPACE */
3569 case 0xa0: /* NBSP */
3570 case 0x1680: /* OGHAM SPACE MARK */
3571 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3572 case 0x2000: /* EN QUAD */
3573 case 0x2001: /* EM QUAD */
3574 case 0x2002: /* EN SPACE */
3575 case 0x2003: /* EM SPACE */
3576 case 0x2004: /* THREE-PER-EM SPACE */
3577 case 0x2005: /* FOUR-PER-EM SPACE */
3578 case 0x2006: /* SIX-PER-EM SPACE */
3579 case 0x2007: /* FIGURE SPACE */
3580 case 0x2008: /* PUNCTUATION SPACE */
3581 case 0x2009: /* THIN SPACE */
3582 case 0x200A: /* HAIR SPACE */
3583 case 0x202f: /* NARROW NO-BREAK SPACE */
3584 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3585 case 0x3000: /* IDEOGRAPHIC SPACE */
3586 break;
3587 }
3588 break;
3589
3590 case OP_NOT_VSPACE:
3591 switch(c)
3592 {
3593 default: break;
3594 case 0x0a: /* LF */
3595 case 0x0b: /* VT */
3596 case 0x0c: /* FF */
3597 case 0x0d: /* CR */
3598 case 0x85: /* NEL */
3599 case 0x2028: /* LINE SEPARATOR */
3600 case 0x2029: /* PARAGRAPH SEPARATOR */
3601 RRETURN(MATCH_NOMATCH);
3602 }
3603 break;
3604
3605 case OP_VSPACE:
3606 switch(c)
3607 {
3608 default: RRETURN(MATCH_NOMATCH);
3609 case 0x0a: /* LF */
3610 case 0x0b: /* VT */
3611 case 0x0c: /* FF */
3612 case 0x0d: /* CR */
3613 case 0x85: /* NEL */
3614 case 0x2028: /* LINE SEPARATOR */
3615 case 0x2029: /* PARAGRAPH SEPARATOR */
3616 break;
3617 }
3618 break;
3619
3620 case OP_NOT_DIGIT:
3621 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3622 RRETURN(MATCH_NOMATCH);
3623 break;
3624
3625 case OP_DIGIT:
3626 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3627 RRETURN(MATCH_NOMATCH);
3628 break;
3629
3630 case OP_NOT_WHITESPACE:
3631 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3632 RRETURN(MATCH_NOMATCH);
3633 break;
3634
3635 case OP_WHITESPACE:
3636 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3637 RRETURN(MATCH_NOMATCH);
3638 break;
3639
3640 case OP_NOT_WORDCHAR:
3641 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3642 RRETURN(MATCH_NOMATCH);
3643 break;
3644
3645 case OP_WORDCHAR:
3646 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3647 RRETURN(MATCH_NOMATCH);
3648 break;
3649
3650 default:
3651 RRETURN(PCRE_ERROR_INTERNAL);
3652 }
3653 }
3654 }
3655 else
3656 #endif
3657 /* Not UTF-8 mode */
3658 {
3659 for (fi = min;; fi++)
3660 {
3661 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3663 if (fi >= max || eptr >= md->end_subject ||
3664 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3665 RRETURN(MATCH_NOMATCH);
3666
3667 c = *eptr++;
3668 switch(ctype)
3669 {
3670 case OP_ANY: /* This is the non-NL case */
3671 case OP_ALLANY:
3672 case OP_ANYBYTE:
3673 break;
3674
3675 case OP_ANYNL:
3676 switch(c)
3677 {
3678 default: RRETURN(MATCH_NOMATCH);
3679 case 0x000d:
3680 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3681 break;
3682
3683 case 0x000a:
3684 break;
3685
3686 case 0x000b:
3687 case 0x000c:
3688 case 0x0085:
3689 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3690 break;
3691 }
3692 break;
3693
3694 case OP_NOT_HSPACE:
3695 switch(c)
3696 {
3697 default: break;
3698 case 0x09: /* HT */
3699 case 0x20: /* SPACE */
3700 case 0xa0: /* NBSP */
3701 RRETURN(MATCH_NOMATCH);
3702 }
3703 break;
3704
3705 case OP_HSPACE:
3706 switch(c)
3707 {
3708 default: RRETURN(MATCH_NOMATCH);
3709 case 0x09: /* HT */
3710 case 0x20: /* SPACE */
3711 case 0xa0: /* NBSP */
3712 break;
3713 }
3714 break;
3715
3716 case OP_NOT_VSPACE:
3717 switch(c)
3718 {
3719 default: break;
3720 case 0x0a: /* LF */
3721 case 0x0b: /* VT */
3722 case 0x0c: /* FF */
3723 case 0x0d: /* CR */
3724 case 0x85: /* NEL */
3725 RRETURN(MATCH_NOMATCH);
3726 }
3727 break;
3728
3729 case OP_VSPACE:
3730 switch(c)
3731 {
3732 default: RRETURN(MATCH_NOMATCH);
3733 case 0x0a: /* LF */
3734 case 0x0b: /* VT */
3735 case 0x0c: /* FF */
3736 case 0x0d: /* CR */
3737 case 0x85: /* NEL */
3738 break;
3739 }
3740 break;
3741
3742 case OP_NOT_DIGIT:
3743 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3744 break;
3745
3746 case OP_DIGIT:
3747 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3748 break;
3749
3750 case OP_NOT_WHITESPACE:
3751 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3752 break;
3753
3754 case OP_WHITESPACE:
3755 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3756 break;
3757
3758 case OP_NOT_WORDCHAR:
3759 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3760 break;
3761
3762 case OP_WORDCHAR:
3763 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3764 break;
3765
3766 default:
3767 RRETURN(PCRE_ERROR_INTERNAL);
3768 }
3769 }
3770 }
3771 /* Control never gets here */
3772 }
3773
3774 /* If maximizing, it is worth using inline code for speed, doing the type
3775 test once at the start (i.e. keep it out of the loop). Again, keep the
3776 UTF-8 and UCP stuff separate. */
3777
3778 else
3779 {
3780 pp = eptr; /* Remember where we started */
3781
3782 #ifdef SUPPORT_UCP
3783 if (prop_type >= 0)
3784 {
3785 switch(prop_type)
3786 {
3787 case PT_ANY:
3788 for (i = min; i < max; i++)
3789 {
3790 int len = 1;
3791 if (eptr >= md->end_subject) break;
3792 GETCHARLEN(c, eptr, len);
3793 if (prop_fail_result) break;
3794 eptr+= len;
3795 }
3796 break;
3797
3798 case PT_LAMP:
3799 for (i = min; i < max; i++)
3800 {
3801 int len = 1;
3802 if (eptr >= md->end_subject) break;
3803 GETCHARLEN(c, eptr, len);
3804 prop_chartype = UCD_CHARTYPE(c);
3805 if ((prop_chartype == ucp_Lu ||
3806 prop_chartype == ucp_Ll ||
3807 prop_chartype == ucp_Lt) == prop_fail_result)
3808 break;
3809 eptr+= len;
3810 }
3811 break;
3812
3813 case PT_GC:
3814 for (i = min; i < max; i++)
3815 {
3816 int len = 1;
3817 if (eptr >= md->end_subject) break;
3818 GETCHARLEN(c, eptr, len);
3819 prop_category = UCD_CATEGORY(c);
3820 if ((prop_category == prop_value) == prop_fail_result)
3821 break;
3822 eptr+= len;
3823 }
3824 break;
3825
3826 case PT_PC:
3827 for (i = min; i < max; i++)
3828 {
3829 int len = 1;
3830 if (eptr >= md->end_subject) break;
3831 GETCHARLEN(c, eptr, len);
3832 prop_chartype = UCD_CHARTYPE(c);
3833 if ((prop_chartype == prop_value) == prop_fail_result)
3834 break;
3835 eptr+= len;
3836 }
3837 break;
3838
3839 case PT_SC:
3840 for (i = min; i < max; i++)
3841 {
3842 int len = 1;
3843 if (eptr >= md->end_subject) break;
3844 GETCHARLEN(c, eptr, len);
3845 prop_script = UCD_SCRIPT(c);
3846 if ((prop_script == prop_value) == prop_fail_result)
3847 break;
3848 eptr+= len;
3849 }
3850 break;
3851 }
3852
3853 /* eptr is now past the end of the maximum run */
3854
3855 if (possessive) continue;
3856 for(;;)
3857 {
3858 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3859 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3860 if (eptr-- == pp) break; /* Stop if tried at original pos */
3861 if (utf8) BACKCHAR(eptr);
3862 }
3863 }
3864
3865 /* Match extended Unicode sequences. We will get here only if the
3866 support is in the binary; otherwise a compile-time error occurs. */
3867
3868 else if (ctype == OP_EXTUNI)
3869 {
3870 for (i = min; i < max; i++)
3871 {
3872 if (eptr >= md->end_subject) break;
3873 GETCHARINCTEST(c, eptr);
3874 prop_category = UCD_CATEGORY(c);
3875 if (prop_category == ucp_M) break;
3876 while (eptr < md->end_subject)
3877 {
3878 int len = 1;
3879 if (!utf8) c = *eptr; else
3880 {
3881 GETCHARLEN(c, eptr, len);
3882 }
3883 prop_category = UCD_CATEGORY(c);
3884 if (prop_category != ucp_M) break;
3885 eptr += len;
3886 }
3887 }
3888
3889 /* eptr is now past the end of the maximum run */
3890
3891 if (possessive) continue;
3892 for(;;)
3893 {
3894 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3895 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3896 if (eptr-- == pp) break; /* Stop if tried at original pos */
3897 for (;;) /* Move back over one extended */
3898 {
3899 int len = 1;
3900 if (!utf8) c = *eptr; else
3901 {
3902 BACKCHAR(eptr);
3903 GETCHARLEN(c, eptr, len);
3904 }
3905 prop_category = UCD_CATEGORY(c);
3906 if (prop_category != ucp_M) break;
3907 eptr--;
3908 }
3909 }
3910 }
3911
3912 else
3913 #endif /* SUPPORT_UCP */
3914
3915 #ifdef SUPPORT_UTF8
3916 /* UTF-8 mode */
3917
3918 if (utf8)
3919 {
3920 switch(ctype)
3921 {
3922 case OP_ANY:
3923 if (max < INT_MAX)
3924 {
3925 for (i = min; i < max; i++)
3926 {
3927 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3928 eptr++;
3929 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3930 }
3931 }
3932
3933 /* Handle unlimited UTF-8 repeat */
3934
3935 else
3936 {
3937 for (i = min; i < max; i++)
3938 {
3939 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3940 eptr++;
3941 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3942 }
3943 }
3944 break;
3945
3946 case OP_ALLANY:
3947 if (max < INT_MAX)
3948 {
3949 for (i = min; i < max; i++)
3950 {
3951 if (eptr >= md->end_subject) break;
3952 eptr++;
3953 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3954 }
3955 }
3956 else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3957 break;
3958
3959 /* The byte case is the same as non-UTF8 */
3960
3961 case OP_ANYBYTE:
3962 c = max - min;
3963 if (c > (unsigned int)(md->end_subject - eptr))
3964 c = md->end_subject - eptr;
3965 eptr += c;
3966 break;
3967
3968 case OP_ANYNL:
3969 for (i = min; i < max; i++)
3970 {
3971 int len = 1;
3972 if (eptr >= md->end_subject) break;
3973 GETCHARLEN(c, eptr, len);
3974 if (c == 0x000d)
3975 {
3976 if (++eptr >= md->end_subject) break;
3977 if (*eptr == 0x000a) eptr++;
3978 }
3979 else
3980 {
3981 if (c != 0x000a &&
3982 (md->bsr_anycrlf ||
3983 (c != 0x000b && c != 0x000c &&
3984 c != 0x0085 && c != 0x2028 && c != 0x2029)))
3985 break;
3986 eptr += len;
3987 }
3988 }
3989 break;
3990
3991 case OP_NOT_HSPACE:
3992 case OP_HSPACE:
3993 for (i = min; i < max; i++)
3994 {
3995 BOOL gotspace;
3996 int len = 1;
3997 if (eptr >= md->end_subject) break;
3998 GETCHARLEN(c, eptr, len);
3999 switch(c)
4000 {
4001 default: gotspace = FALSE; break;
4002 case 0x09: /* HT */
4003 case 0x20: /* SPACE */
4004 case 0xa0: /* NBSP */
4005 case 0x1680: /* OGHAM SPACE MARK */
4006 case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4007 case 0x2000: /* EN QUAD */
4008 case 0x2001: /* EM QUAD */
4009 case 0x2002: /* EN SPACE */
4010 case 0x2003: /* EM SPACE */
4011 case 0x2004: /* THREE-PER-EM SPACE */
4012 case 0x2005: /* FOUR-PER-EM SPACE */
4013 case 0x2006: /* SIX-PER-EM SPACE */
4014 case 0x2007: /* FIGURE SPACE */
4015 case 0x2008: /* PUNCTUATION SPACE */
4016 case 0x2009: /* THIN SPACE */
4017 case 0x200A: /* HAIR SPACE */
4018 case 0x202f: /* NARROW NO-BREAK SPACE */
4019 case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4020 case 0x3000: /* IDEOGRAPHIC SPACE */
4021 gotspace = TRUE;
4022 break;
4023 }
4024 if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4025 eptr += len;
4026 }
4027 break;
4028
4029 case OP_NOT_VSPACE:
4030 case OP_VSPACE:
4031 for (i = min; i < max; i++)
4032 {
4033 BOOL gotspace;
4034 int len = 1;
4035 if (eptr >= md->end_subject) break;
4036 GETCHARLEN(c, eptr, len);
4037 switch(c)
4038 {
4039 default: gotspace = FALSE; break;
4040 case 0x0a: /* LF */
4041 case 0x0b: /* VT */
4042 case 0x0c: /* FF */
4043 case 0x0d: /* CR */
4044 case 0x85: /* NEL */
4045 case 0x2028: /* LINE SEPARATOR */
4046 case 0x2029: /* PARAGRAPH SEPARATOR */
4047 gotspace = TRUE;
4048 break;
4049 }
4050 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4051 eptr += len;
4052 }
4053 break;
4054
4055 case OP_NOT_DIGIT:
4056 for (i = min; i < max; i++)
4057 {
4058 int len = 1;
4059 if (eptr >= md->end_subject) break;
4060 GETCHARLEN(c, eptr, len);
4061 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4062 eptr+= len;
4063 }
4064 break;
4065
4066 case OP_DIGIT:
4067 for (i = min; i < max; i++)
4068 {
4069 int len = 1;
4070 if (eptr >= md->end_subject) break;
4071 GETCHARLEN(c, eptr, len);
4072 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4073 eptr+= len;
4074 }
4075 break;
4076
4077 case OP_NOT_WHITESPACE:
4078 for (i = min; i < max; i++)
4079 {
4080 int len = 1;
4081 if (eptr >= md->end_subject) break;
4082 GETCHARLEN(c, eptr, len);
4083 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4084 eptr+= len;
4085 }
4086 break;
4087
4088 case OP_WHITESPACE:
4089 for (i = min; i < max; i++)
4090 {
4091 int len = 1;
4092 if (eptr >= md->end_subject) break;
4093 GETCHARLEN(c, eptr, len);
4094 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4095 eptr+= len;
4096 }
4097 break;
4098
4099 case OP_NOT_WORDCHAR:
4100 for (i = min; i < max; i++)
4101 {
4102 int len = 1;
4103 if (eptr >= md->end_subject) break;
4104 GETCHARLEN(c, eptr, len);
4105 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4106 eptr+= len;
4107 }
4108 break;
4109
4110 case OP_WORDCHAR:
4111 for (i = min; i < max; i++)
4112 {
4113 int len = 1;
4114 if (eptr >= md->end_subject) break;
4115 GETCHARLEN(c, eptr, len);
4116 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4117 eptr+= len;
4118 }
4119 break;
4120
4121 default:
4122 RRETURN(PCRE_ERROR_INTERNAL);
4123 }
4124
4125 /* eptr is now past the end of the maximum run */
4126
4127 if (possessive) continue;
4128 for(;;)
4129 {
4130 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4131 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4132 if (eptr-- == pp) break; /* Stop if tried at original pos */
4133 BACKCHAR(eptr);
4134 }
4135 }
4136 else
4137 #endif /* SUPPORT_UTF8 */
4138
4139 /* Not UTF-8 mode */
4140 {
4141 switch(ctype)
4142 {
4143 case OP_ANY:
4144 for (i = min; i < max; i++)
4145 {
4146 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4147 eptr++;
4148 }
4149 break;
4150
4151 case OP_ALLANY:
4152 case OP_ANYBYTE:
4153 c = max - min;
4154 if (c > (unsigned int)(md->end_subject - eptr))
4155 c = md->end_subject - eptr;
4156 eptr += c;
4157 break;
4158
4159 case OP_ANYNL:
4160 for (i = min; i < max; i++)
4161 {
4162 if (eptr >= md->end_subject) break;
4163 c = *eptr;
4164 if (c == 0x000d)
4165 {
4166 if (++eptr >= md->end_subject) break;
4167 if (*eptr == 0x000a) eptr++;
4168 }
4169 else
4170 {
4171 if (c != 0x000a &&
4172 (md->bsr_anycrlf ||
4173 (c != 0x000b && c != 0x000c && c != 0x0085)))
4174 break;
4175 eptr++;
4176 }
4177 }
4178 break;
4179
4180 case OP_NOT_HSPACE:
4181 for (i = min; i < max; i++)
4182 {
4183 if (eptr >= md->end_subject) break;
4184 c = *eptr;
4185 if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4186 eptr++;
4187 }
4188 break;
4189
4190 case OP_HSPACE:
4191 for (i = min; i < max; i++)
4192 {
4193 if (eptr >= md->end_subject) break;
4194 c = *eptr;
4195 if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4196 eptr++;
4197 }
4198 break;
4199
4200 case OP_NOT_VSPACE:
4201 for (i = min; i < max; i++)
4202 {
4203 if (eptr >= md->end_subject) break;
4204 c = *eptr;
4205 if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4206 break;
4207 eptr++;
4208 }
4209 break;
4210
4211 case OP_VSPACE:
4212 for (i = min; i < max; i++)
4213 {
4214 if (eptr >= md->end_subject) break;
4215 c = *eptr;
4216 if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4217 break;
4218 eptr++;
4219 }
4220 break;
4221
4222 case OP_NOT_DIGIT:
4223 for (i = min; i < max; i++)
4224 {
4225 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4226 break;
4227 eptr++;
4228 }
4229 break;
4230
4231 case OP_DIGIT:
4232 for (i = min; i < max; i++)
4233 {
4234 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4235 break;
4236 eptr++;
4237 }
4238 break;
4239
4240 case OP_NOT_WHITESPACE:
4241 for (i = min; i < max; i++)
4242 {
4243 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4244 break;
4245 eptr++;
4246 }
4247 break;
4248
4249 case OP_WHITESPACE:
4250 for (i = min; i < max; i++)
4251 {
4252 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4253 break;
4254 eptr++;
4255 }
4256 break;
4257
4258 case OP_NOT_WORDCHAR:
4259 for (i = min; i < max; i++)
4260 {
4261 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4262 break;
4263 eptr++;
4264 }
4265 break;
4266
4267 case OP_WORDCHAR:
4268 for (i = min; i < max; i++)
4269 {
4270 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4271 break;
4272 eptr++;
4273 }
4274 break;
4275
4276 default:
4277 RRETURN(PCRE_ERROR_INTERNAL);
4278 }
4279
4280 /* eptr is now past the end of the maximum run */
4281
4282 if (possessive) continue;
4283 while (eptr >= pp)
4284 {
4285 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4286 eptr--;
4287 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4288 }
4289 }
4290
4291 /* Get here if we can't make it match with any permitted repetitions */
4292
4293 RRETURN(MATCH_NOMATCH);
4294 }
4295 /* Control never gets here */
4296
4297 /* There's been some horrible disaster. Arrival here can only mean there is
4298 something seriously wrong in the code above or the OP_xxx definitions. */
4299
4300 default:
4301 DPRINTF(("Unknown opcode %d\n", *ecode));
4302 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4303 }
4304
4305 /* Do not stick any code in here without much thought; it is assumed
4306 that "continue" in the code above comes out to here to repeat the main
4307 loop. */
4308
4309 } /* End of main loop */
4310 /* Control never reaches here */
4311
4312
4313 /* When compiling to use the heap rather than the stack for recursive calls to
4314 match(), the RRETURN() macro jumps here. The number that is saved in
4315 frame->Xwhere indicates which label we actually want to return to. */
4316
4317 #ifdef NO_RECURSE
4318 #define LBL(val) case val: goto L_RM##val;
4319 HEAP_RETURN:
4320 switch (frame->Xwhere)
4321 {
4322 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4323 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4324 LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4325 LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4326 LBL(53) LBL(54)
4327 #ifdef SUPPORT_UTF8
4328 LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4329 LBL(32) LBL(34) LBL(42) LBL(46)
4330 #ifdef SUPPORT_UCP
4331 LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4332 #endif /* SUPPORT_UCP */
4333 #endif /* SUPPORT_UTF8 */
4334 default:
4335 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4336 return PCRE_ERROR_INTERNAL;
4337 }
4338 #undef LBL
4339 #endif /* NO_RECURSE */
4340 }
4341
4342
4343 /***************************************************************************
4344 ****************************************************************************
4345 RECURSION IN THE match() FUNCTION
4346
4347 Undefine all the macros that were defined above to handle this. */
4348
4349 #ifdef NO_RECURSE /* build that uses the heap, not the stack, for match() calls */
4350 #undef eptr /* NOTE(review): first group looks like match()'s arguments - confirm */
4351 #undef ecode
4352 #undef mstart
4353 #undef offset_top
4354 #undef ims
4355 #undef eptrb
4356 #undef flags
4357 /* NOTE(review): remaining groups look like match()'s local variables - confirm */
4358 #undef callpat
4359 #undef charptr
4360 #undef data
4361 #undef next
4362 #undef pp
4363 #undef prev
4364 #undef saved_eptr
4365
4366 #undef new_recursive
4367
4368 #undef cur_is_word
4369 #undef condition
4370 #undef prev_is_word
4371
4372 #undef original_ims
4373
4374 #undef ctype
4375 #undef length
4376 #undef max
4377 #undef min
4378 #undef number
4379 #undef offset
4380 #undef op
4381 #undef save_capture_last
4382 #undef save_offset1
4383 #undef save_offset2
4384 #undef save_offset3
4385 #undef stacksave
4386
4387 #undef newptrb
4388
4389 #endif /* NO_RECURSE */
4390
4391 /* fc and fi are macros with and without NO_RECURSE, so undefine them unconditionally */
4392
4393 #undef fc
4394 #undef fi
4395
4396 /***************************************************************************
4397 ***************************************************************************/
4398
4399
4400
4401 /*************************************************
4402 * Execute a Regular Expression *
4403 *************************************************/
4404
4405 /* This function applies a compiled re to a subject string and picks out
4406 portions of the string if it matches. Two elements in the vector are set for
4407 each substring: the offsets to the start and end of the substring.
4408
4409 Arguments:
4410 argument_re points to the compiled expression
4411 extra_data points to extra data or is NULL
4412 subject points to the subject string
4413 length length of subject string (may contain binary zeros)
4414 start_offset where to start in the subject string
4415 options option bits
4416 offsets points to a vector of ints to be filled in with offsets
4417 offsetcount the number of elements in the vector
4418
4419 Returns: > 0 => success; value is the number of elements filled in
4420 = 0 => success, but offsets is not big enough
4421 -1 => failed to match
4422 < -1 => some kind of unexpected problem
4423 */
4424
4425 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
4426 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4427 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4428 int offsetcount)
4429 {
4430 int rc, resetcount, ocount;
4431 int first_byte = -1;
4432 int req_byte = -1;
4433 int req_byte2 = -1;
4434 int newline;
4435 unsigned long int ims;
4436 BOOL using_temporary_offsets = FALSE;
4437 BOOL anchored;
4438 BOOL startline;
4439 BOOL firstline;
4440 BOOL first_byte_caseless = FALSE;
4441 BOOL req_byte_caseless = FALSE;
4442 BOOL utf8;
4443 match_data match_block;
4444 match_data *md = &match_block;
4445 const uschar *tables;
4446 const uschar *start_bits = NULL;
4447 USPTR start_match = (USPTR)subject + start_offset;
4448 USPTR end_subject;
4449 USPTR req_byte_ptr = start_match - 1;
4450
4451 pcre_study_data internal_study;
4452 const pcre_study_data *study;
4453
4454 real_pcre internal_re;
4455 const real_pcre *external_re = (const real_pcre *)argument_re;
4456 const real_pcre *re = external_re;
4457
4458 /* Plausibility checks */
4459
4460 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4461 if (re == NULL || subject == NULL ||
4462 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4463 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4464
4465 /* Fish out the optional data from the extra_data structure, first setting
4466 the default values. */
4467
4468 study = NULL;
4469 md->match_limit = MATCH_LIMIT;
4470 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4471 md->callout_data = NULL;
4472
4473 /* The table pointer is always in native byte order. */
4474
4475 tables = external_re->tables;
4476
4477 if (extra_data != NULL)
4478 {
4479 register unsigned int flags = extra_data->flags;
4480 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4481 study = (const pcre_study_data *)extra_data->study_data;
4482 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4483 md->match_limit = extra_data->match_limit;
4484 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4485 md->match_limit_recursion = extra_data->match_limit_recursion;
4486 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4487 md->callout_data = extra_data->callout_data;
4488 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4489 }
4490
4491 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4492 is a feature that makes it possible to save compiled regex and re-use them
4493 in other programs later. */
4494
4495 if (tables == NULL) tables = _pcre_default_tables;
4496
4497 /* Check that the first field in the block is the magic number. If it is not,
4498 test for a regex that was compiled on a host of opposite endianness. If this is
4499 the case, flipped values are put in internal_re and internal_study if there was
4500 study data too. */
4501
4502 if (re->magic_number != MAGIC_NUMBER)
4503 {
4504 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4505 if (re == NULL) return PCRE_ERROR_BADMAGIC;
4506 if (study != NULL) study = &internal_study;
4507 }
4508
4509 /* Set up other data */
4510
4511 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4512 startline = (re->flags & PCRE_STARTLINE) != 0;
4513 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4514
4515 /* The code starts after the real_pcre block and the capture name table. */
4516
4517 md->start_code = (const uschar *)external_re + re->name_table_offset +
4518 re->name_count * re->name_entry_size;
4519
4520 md->start_subject = (USPTR)subject;
4521 md->start_offset = start_offset;
4522 md->end_subject = md->start_subject + length;
4523 end_subject = md->end_subject;
4524
4525 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4526 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4527 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4528
4529 md->notbol = (options & PCRE_NOTBOL) != 0;
4530 md->noteol = (options & PCRE_NOTEOL) != 0;
4531 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4532 md->partial = (options & PCRE_PARTIAL) != 0;
4533 md->hitend = FALSE;
4534
4535 md->recursive = NULL; /* No recursion at top level */
4536
4537 md->lcc = tables + lcc_offset;
4538 md->ctypes = tables + ctypes_offset;
4539
4540 /* Handle different \R options. */
4541
4542 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4543 {
4544 case 0:
4545 if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4546 md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4547 else
4548 #ifdef BSR_ANYCRLF
4549 md->bsr_anycrlf = TRUE;
4550 #else
4551 md->bsr_anycrlf = FALSE;
4552 #endif
4553 break;
4554
4555 case PCRE_BSR_ANYCRLF:
4556 md->bsr_anycrlf = TRUE;
4557 break;
4558
4559 case PCRE_BSR_UNICODE:
4560 md->bsr_anycrlf = FALSE;
4561 break;
4562
4563 default: return PCRE_ERROR_BADNEWLINE;
4564 }
4565
4566 /* Handle different types of newline. The three bits give eight cases. If
4567 nothing is set at run time, whatever was used at compile time applies. */
4568
4569 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4570 (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4571 {
4572 case 0: newline = NEWLINE; break; /* Compile-time default */
4573 case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4574 case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4575 case PCRE_NEWLINE_CR+
4576 PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4577 case PCRE_NEWLINE_ANY: newline = -1; break;
4578 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4579 default: return PCRE_ERROR_BADNEWLINE;
4580 }
4581
4582 if (newline == -2)
4583 {
4584 md->nltype = NLTYPE_ANYCRLF;
4585 }
4586 else if (newline < 0)
4587 {
4588 md->nltype = NLTYPE_ANY;
4589 }
4590 else
4591 {
4592 md->nltype = NLTYPE_FIXED;
4593 if (newline > 255)
4594 {
4595 md->nllen = 2;
4596 md->nl[0] = (newline >> 8) & 255;
4597 md->nl[1] = newline & 255;
4598 }
4599 else
4600 {
4601 md->nllen = 1;
4602 md->nl[0] = newline;
4603 }
4604 }
4605
4606 /* Partial matching is supported only for a restricted set of regexes at the
4607 moment. */
4608
4609 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4610 return PCRE_ERROR_BADPARTIAL;
4611
4612 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4613 back the character offset. */
4614
4615 #ifdef SUPPORT_UTF8
4616 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4617 {
4618 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
4619 return PCRE_ERROR_BADUTF8;
4620 if (start_offset > 0 && start_offset < length)
4621 {
4622 int tb = ((uschar *)subject)[start_offset];
4623 if (tb > 127)
4624 {
4625 tb &= 0xc0;
4626 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4627 }
4628 }
4629 }
4630 #endif
4631
4632 /* The ims options can vary during the matching as a result of the presence
4633 of (?ims) items in the pattern. They are kept in a local variable so that
4634 restoring at the exit of a group is easy. */
4635
4636 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4637
4638 /* If the expression has got more back references than the offsets supplied can
4639 hold, we get a temporary chunk of working store to use during the matching.
4640 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4641 of 3. */
4642
4643 ocount = offsetcount - (offsetcount % 3);
4644
4645 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4646 {
4647 ocount = re->top_backref * 3 + 3;
4648 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4649 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4650 using_temporary_offsets = TRUE;
4651 DPRINTF(("Got memory to hold back references\n"));
4652 }
4653 else md->offset_vector = offsets;
4654
4655 md->offset_end = ocount;
4656 md->offset_max = (2*ocount)/3;
4657 md->offset_overflow = FALSE;
4658 md->capture_last = -1;
4659
4660 /* Compute the minimum number of offsets that we need to reset each time. Doing
4661 this makes a huge difference to execution time when there aren't many brackets
4662 in the pattern. */
4663
4664 resetcount = 2 + re->top_bracket * 2;
4665 if (resetcount > offsetcount) resetcount = ocount;
4666
4667 /* Reset the working variable associated with each extraction. These should
4668 never be used unless previously set, but they get saved and restored, and so we
4669 initialize them to avoid reading uninitialized locations. */
4670
4671 if (md->offset_vector != NULL)
4672 {
4673 register int *iptr = md->offset_vector + ocount;
4674 register int *iend = iptr - resetcount/2 + 1;
4675 while (--iptr >= iend) *iptr = -1;
4676 }
4677
4678 /* Set up the first character to match, if available. The first_byte value is
4679 never set for an anchored regular expression, but the anchoring may be forced
4680 at run time, so we have to test for anchoring. The first char may be unset for
4681 an unanchored pattern, of course. If there's no first char and the pattern was
4682 studied, there may be a bitmap of possible first characters. */
4683
4684 if (!anchored)
4685 {
4686 if ((re->flags & PCRE_FIRSTSET) != 0)
4687 {
4688 first_byte = re->first_byte & 255;
4689 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4690 first_byte = md->lcc[first_byte];
4691 }
4692 else
4693 if (!startline && study != NULL &&
4694 (study->options & PCRE_STUDY_MAPPED) != 0)
4695 start_bits = study->start_bits;
4696 }
4697
4698 /* For anchored or unanchored matches, there may be a "last known required
4699 character" set. */
4700
4701 if ((re->flags & PCRE_REQCHSET) != 0)
4702 {
4703 req_byte = re->req_byte & 255;
4704 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4705 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4706 }
4707
4708
4709 /* ==========================================================================*/
4710
4711 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4712 the loop runs just once. */
4713
4714 for(;;)
4715 {
4716 USPTR save_end_subject = end_subject;
4717 USPTR new_start_match;
4718
4719 /* Reset the maximum number of extractions we might see. */
4720
4721 if (md->offset_vector != NULL)
4722 {
4723 register int *iptr = md->offset_vector;
4724 register int *iend = iptr + resetcount;
4725 while (iptr < iend) *iptr++ = -1;
4726 }
4727
4728 /* If firstline is TRUE, the start of the match is constrained to the first
4729 line of a multiline string. That is, the match must be before or at the first
4730 newline. Implement this by temporarily adjusting end_subject so that we stop
4731 scanning at a newline. If the match fails at the newline, later code breaks
4732 this loop. */
4733
4734 if (firstline)
4735 {
4736 USPTR t = start_match;
4737 #ifdef SUPPORT_UTF8
4738 if (utf8)
4739 {
4740 while (t < md->end_subject && !IS_NEWLINE(t))
4741 {
4742 t++;
4743 while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4744 }
4745 }
4746 else
4747 #endif
4748 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4749 end_subject = t;
4750 }
4751
4752 /* There are some optimizations that avoid running the match if a known
4753 starting point is not found, or if a known later character is not present.
4754 However, there is an option that disables these, for testing and for ensuring
4755 that all callouts do actually occur. */
4756
4757 if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4758 {
4759 /* Advance to a unique first byte if there is one. */
4760
4761 if (first_byte >= 0)
4762 {
4763 if (first_byte_caseless)
4764 while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4765 start_match++;
4766 else
4767 while (start_match < end_subject && *start_match != first_byte)
4768 start_match++;
4769 }
4770
4771 /* Or to just after a linebreak for a multiline match */
4772
4773 else if (startline)
4774 {
4775 if (start_match > md->start_subject + start_offset)
4776 {
4777 #ifdef SUPPORT_UTF8
4778 if (utf8)
4779 {
4780 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4781 {
4782 start_match++;
4783 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4784 start_match++;
4785 }
4786 }
4787 else
4788 #endif
4789 while (start_match < end_subject && !WAS_NEWLINE(start_match))
4790 start_match++;
4791
4792 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4793 and we are now at a LF, advance the match position by one more character.
4794 */
4795
4796 if (start_match[-1] == CHAR_CR &&
4797 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4798 start_match < end_subject &&
4799 *start_match == CHAR_NL)
4800 start_match++;
4801 }
4802 }
4803
4804 /* Or to a non-unique first byte after study */
4805
4806 else if (start_bits != NULL)
4807 {
4808 while (start_match < end_subject)
4809 {
4810 register unsigned int c = *start_match;
4811 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4812 else break;
4813 }
4814 }
4815 } /* Starting optimizations */
4816
4817 /* Restore fudged end_subject */
4818
4819 end_subject = save_end_subject;
4820
4821 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4822 printf(">>>> Match against: ");
4823 pchars(start_match, end_subject - start_match, TRUE, md);
4824 printf("\n");
4825 #endif
4826
4827 /* If req_byte is set, we know that that character must appear in the
4828 subject for the match to succeed. If the first character is set, req_byte
4829 must be later in the subject; otherwise the test starts at the match point.
4830 This optimization can save a huge amount of backtracking in patterns with
4831 nested unlimited repeats that aren't going to match. Writing separate code
4832 for cased/caseless versions makes it go faster, as does using an
4833 autoincrement and backing off on a match.
4834
4835 HOWEVER: when the subject string is very, very long, searching to its end
4836 can take a long time, and give bad performance on quite ordinary patterns.
4837 This showed up when somebody was matching something like /^\d+C/ on a
4838 32-megabyte string... so we don't do this when the string is sufficiently
4839 long.
4840
4841 ALSO: this processing is disabled when partial matching is requested, or if
4842 disabling is explicitly requested. */
4843
4844 if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4845 req_byte >= 0 &&
4846 end_subject - start_match < REQ_BYTE_MAX &&
4847 !md->partial)
4848 {
4849 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4850
4851 /* We don't need to repeat the search if we haven't yet reached the
4852 place we found it at last time. */
4853
4854 if (p > req_byte_ptr)
4855 {
4856 if (req_byte_caseless)
4857 {
4858 while (p < end_subject)
4859 {
4860 register int pp = *p++;
4861 if (pp == req_byte || pp == req_byte2) { p--; break; }
4862 }
4863 }
4864 else
4865 {
4866 while (p < end_subject)
4867 {
4868 if (*p++ == req_byte) { p--; break; }
4869 }
4870 }
4871
4872 /* If we can't find the required character, break the matching loop,
4873 forcing a match failure. */
4874
4875 if (p >= end_subject)
4876 {
4877 rc = MATCH_NOMATCH;
4878 break;
4879 }
4880
4881 /* If we have found the required character, save the point where we
4882 found it, so that we don't search again next time round the loop if
4883 the start hasn't passed this character yet. */
4884
4885 req_byte_ptr = p;
4886 }
4887 }
4888
4889 /* OK, we can now run the match. */
4890
4891 md->start_match_ptr = start_match;
4892 md->match_call_count = 0;
4893 rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4894
4895 switch(rc)
4896 {
4897 /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4898 exactly like PRUNE. */
4899
4900 case MATCH_NOMATCH:
4901 case MATCH_PRUNE:
4902 case MATCH_THEN:
4903 new_start_match = start_match + 1;
4904 #ifdef SUPPORT_UTF8
4905 if (utf8)
4906 while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4907 new_start_match++;
4908 #endif
4909 break;
4910
4911 /* SKIP passes back the next starting point explicitly. */
4912
4913 case MATCH_SKIP:
4914 new_start_match = md->start_match_ptr;
4915 break;
4916
4917 /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4918
4919 case MATCH_COMMIT:
4920 rc = MATCH_NOMATCH;
4921 goto ENDLOOP;
4922
4923 /* Any other return is some kind of error. */
4924
4925 default:
4926 goto ENDLOOP;
4927 }
4928
4929 /* Control reaches here for the various types of "no match at this point"
4930 result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4931
4932 rc = MATCH_NOMATCH;
4933
4934 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4935 newline in the subject (though it may continue over the newline). Therefore,
4936 if we have just failed to match, starting at a newline, do not continue. */
4937
4938 if (firstline && IS_NEWLINE(start_match)) break;
4939
4940 /* Advance to new matching position */
4941
4942 start_match = new_start_match;
4943
4944 /* Break the loop if the pattern is anchored or if we have passed the end of
4945 the subject. */
4946
4947 if (anchored || start_match > end_subject) break;
4948
4949 /* If we have just passed a CR and we are now at a LF, and the pattern does
4950 not contain any explicit matches for \r or \n, and the newline option is CRLF
4951 or ANY or ANYCRLF, advance the match position by one more character. */
4952
4953 if (start_match[-1] == CHAR_CR &&
4954 start_match < end_subject &&
4955 *start_match == CHAR_NL &&
4956 (re->flags & PCRE_HASCRORLF) == 0 &&
4957 (md->nltype == NLTYPE_ANY ||
4958 md->nltype == NLTYPE_ANYCRLF ||
4959 md->nllen == 2))
4960 start_match++;
4961
4962 } /* End of for(;;) "bumpalong" loop */
4963
4964 /* ==========================================================================*/
4965
4966 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4967 conditions is true:
4968
4969 (1) The pattern is anchored or the match was failed by (*COMMIT);
4970
4971 (2) We are past the end of the subject;
4972
4973 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4974 this option requests that a match occur at or before the first newline in
4975 the subject.
4976
4977 When we have a match and the offset vector is big enough to deal with any
4978 backreferences, captured substring offsets will already be set up. In the case
4979 where we had to get some local store to hold offsets for backreference
4980 processing, copy those that we can. In this case there need not be overflow if
4981 certain parts of the pattern were not used, even though there are more
4982 capturing parentheses than vector slots. */
4983
4984 ENDLOOP:
4985
4986 if (rc == MATCH_MATCH)
4987 {
4988 if (using_temporary_offsets)
4989 {
4990 if (offsetcount >= 4)
4991 {
4992 memcpy(offsets + 2, md->offset_vector + 2,
4993 (offsetcount - 2) * sizeof(int));
4994 DPRINTF(("Copied offsets from temporary memory\n"));
4995 }
4996 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4997 DPRINTF(("Freeing temporary memory\n"));
4998 (pcre_free)(md->offset_vector);
4999 }
5000
5001 /* Set the return code to the number of captured strings, or 0 if there are
5002 too many to fit into the vector. */
5003
5004 rc = md->offset_overflow? 0 : md->end_offset_top/2;
5005
5006 /* If there is space, set up the whole thing as substring 0. The value of
5007 md->start_match_ptr might be modified if \K was encountered on the successful
5008 matching path. */
5009
5010 if (offsetcount < 2) rc = 0; else
5011 {
5012 offsets[0] = md->start_match_ptr - md->start_subject;
5013 offsets[1] = md->end_match_ptr - md->start_subject;
5014 }
5015
5016 DPRINTF((">>>> returning %d\n", rc));
5017 return rc;
5018 }
5019
5020 /* Control gets here if there has been an error, or if the overall match
5021 attempt has failed at all permitted starting positions. */
5022
5023 if (using_temporary_offsets)
5024 {
5025 DPRINTF(("Freeing temporary memory\n"));
5026 (pcre_free)(md->offset_vector);
5027 }
5028
5029 if (rc != MATCH_NOMATCH)
5030 {
5031 DPRINTF((">>>> error: returning %d\n", rc));
5032 return rc;
5033 }
5034 else if (md->partial && md->hitend)
5035 {
5036 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5037 return PCRE_ERROR_PARTIAL;
5038 }
5039 else
5040 {
5041 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5042 return PCRE_ERROR_NOMATCH;
5043 }
5044 }
5045
5046 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12