/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 165 - (show annotations) (download)
Wed May 9 10:50:57 2007 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 129791 byte(s)
Non-longjmp heap recursion.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #define NLBLOCK md /* Block containing newline information */
46 #define PSSTART start_subject /* Field containing processed string start */
47 #define PSEND end_subject /* Field containing processed string end */
48
49 #include "pcre_internal.h"
50
51 /* Undefine some potentially clashing cpp symbols */
52
53 #undef min
54 #undef max
55
56 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57 obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
58
59 #define EPTR_WORK_SIZE (1000)
60
61 /* Flag bits for the match() function */
62
63 #define match_condassert 0x01 /* Called to check a condition assertion */
64 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65 #define match_tail_recursed 0x04 /* Tail recursive call */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Maximum number of ints of offset to save on the stack for recursive calls.
74 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75 because the offset vector is always a multiple of 3 long. */
76
77 #define REC_STACK_SAVE_MAX 30
78
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83
84
85
86 #ifdef DEBUG
87 /*************************************************
88 * Debugging function to print chars *
89 *************************************************/
90
91 /* Print a sequence of chars in printable format, stopping at the end of the
92 subject if requested.
93
94 Arguments:
95 p points to characters
96 length number to print
97 is_subject TRUE if printing from within md->start_subject
98 md pointer to matching data block, if is_subject is TRUE
99
100 Returns: nothing
101 */
102
103 static void
104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105 {
106 unsigned int c;
107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108 while (length-- > 0)
109 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110 }
111 #endif
112
113
114
115 /*************************************************
116 * Match a back-reference *
117 *************************************************/
118
119 /* If a back reference hasn't been set, the length that is passed is greater
120 than the number of characters left in the string, so the match fails.
121
122 Arguments:
123 offset index into the offset vector
124 eptr points into the subject
125 length length to be matched
126 md points to match data block
127 ims the ims flags
128
129 Returns: TRUE if matched
130 */
131
132 static BOOL
133 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 unsigned long int ims)
135 {
136 USPTR p = md->start_subject + md->offset_vector[offset];
137
138 #ifdef DEBUG
139 if (eptr >= md->end_subject)
140 printf("matching subject <null>");
141 else
142 {
143 printf("matching subject ");
144 pchars(eptr, length, TRUE, md);
145 }
146 printf(" against backref ");
147 pchars(p, length, FALSE, md);
148 printf("\n");
149 #endif
150
151 /* Always fail if not enough characters left */
152
153 if (length > md->end_subject - eptr) return FALSE;
154
155 /* Separate the caselesss case for speed */
156
157 if ((ims & PCRE_CASELESS) != 0)
158 {
159 while (length-- > 0)
160 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161 }
162 else
163 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164
165 return TRUE;
166 }
167
168
169
170 /***************************************************************************
171 ****************************************************************************
172 RECURSION IN THE match() FUNCTION
173
174 The match() function is highly recursive, though not every recursive call
175 increases the recursive depth. Nevertheless, some regular expressions can cause
176 it to recurse to a great depth. I was writing for Unix, so I just let it call
177 itself recursively. This uses the stack for saving everything that has to be
178 saved for a recursive call. On Unix, the stack can be large, and this works
179 fine.
180
181 It turns out that on some non-Unix-like systems there are problems with
182 programs that use a lot of stack. (This despite the fact that every last chip
183 has oodles of memory these days, and techniques for extending the stack have
184 been known for decades.) So....
185
186 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187 calls by keeping local variables that need to be preserved in blocks of memory
188 obtained from malloc() instead of on the stack. Macros are used to
189 achieve this so that the actual code doesn't look very different to what it
190 always used to.
191
192 The original heap-recursive code used longjmp(). However, it seems that this
193 can be very slow on some operating systems. Following a suggestion from Stan
194 Switzer, the use of longjmp() has been abolished, at the cost of having to
195 provide a unique number for each call to RMATCH. There is no way of generating
196 a sequence of numbers at compile time in C. I have given them names, to make
197 them stand out more clearly.
198
199 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
200 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
201 tests. Furthermore, not using longjmp() means that local dynamic variables
202 don't have indeterminate values; this has meant that the frame size can be
203 reduced because the result can be "passed back" by straight setting of the
204 variable instead of being passed in the frame.
205 ****************************************************************************
206 ***************************************************************************/
207
208
209 /* Numbers for RMATCH calls */
210
211 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
212 RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
213 RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
214 RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
215 RM41, RM42, RM43, RM44, RM45, RM46, RM47 };
216
217
218 /* These versions of the macros use the stack, as normal. There are debugging
219 versions and production versions. Note that the "rw" argument of RMATCH isn't
220 actually used in this definition. */
221
222 #ifndef NO_RECURSE
#define REGISTER register

/* Stack-based forms. RMATCH stores the result of a genuinely recursive call
to match() in the caller's local "rrc", passing the caller's "rdepth" plus one
as the final argument; its last parameter (the RMATCH call number) is accepted
but not used. RRETURN is a plain return. When DEBUG is defined, both also
trace the call and the return with printf(). */

#ifndef DEBUG

#define RMATCH(subj,code,otop,mdata,opts,chain,fl,where) \
  rrc = match(subj,code,otop,mdata,opts,chain,fl,rdepth+1)
#define RRETURN(rval) return rval

#else

#define RMATCH(subj,code,otop,mdata,opts,chain,fl,where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(subj,code,otop,mdata,opts,chain,fl,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(rval) \
  { \
  printf("match() returned %d from line %d ", rval, __LINE__); \
  return rval; \
  }

#endif
242
243 #else
244
245
246 /* These versions of the macros manage a private stack on the heap. Note that
247 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
248 argument of match(), which never changes. */
249
250 #define REGISTER
251
/* Heap-based "recursive call". A new heapframe is obtained from
pcre_stack_malloc; if that fails, the current invocation gives up with
PCRE_ERROR_NOMEMORY via RRETURN instead of dereferencing a NULL frame.
Otherwise the call number rw is recorded in the current frame's Xwhere, the
arguments are copied into the new frame (with Xrdepth one greater than the
current frame's), the new frame is linked to the current one through
Xprevframe and made current, and control jumps to HEAP_RECURSE. The label
L_<rw> generated here is the point at which execution resumes afterwards
(the jump back is made from HEAP_RETURN, which is outside this definition).

The argument assignments must all precede "frame = newframe", because at the
call sites the arguments are themselves macros that refer to the current
frame. The "rd" argument (md) is not used: it never changes. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
270
/* Heap-based "return". The result is stored in rrc BEFORE the current frame
is popped and freed: at the call sites many identifiers are macros that
expand to frame->X..., so evaluating the argument after "frame" has been
switched would read the parent frame (or dereference NULL at the top level).
After the pop, a non-NULL frame means there is an outer "invocation" to resume,
which is done by jumping to HEAP_RETURN with the result in rrc; a NULL frame
means this was the top-level frame, so match() really returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  rrc = ra;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL) goto HEAP_RETURN;\
  return rrc;\
  }
283
284
285 /* Structure for remembering the local variables in a private frame */
286
287 typedef struct heapframe {
288 struct heapframe *Xprevframe;
289
290 /* Function arguments that may change */
291
292 const uschar *Xeptr;
293 const uschar *Xecode;
294 int Xoffset_top;
295 long int Xims;
296 eptrblock *Xeptrb;
297 int Xflags;
298 unsigned int Xrdepth;
299
300 /* Function local variables */
301
302 const uschar *Xcallpat;
303 const uschar *Xcharptr;
304 const uschar *Xdata;
305 const uschar *Xnext;
306 const uschar *Xpp;
307 const uschar *Xprev;
308 const uschar *Xsaved_eptr;
309
310 recursion_info Xnew_recursive;
311
312 BOOL Xcur_is_word;
313 BOOL Xcondition;
314 BOOL Xprev_is_word;
315
316 unsigned long int Xoriginal_ims;
317
318 #ifdef SUPPORT_UCP
319 int Xprop_type;
320 int Xprop_value;
321 int Xprop_fail_result;
322 int Xprop_category;
323 int Xprop_chartype;
324 int Xprop_script;
325 int Xoclength;
326 uschar Xocchars[8];
327 #endif
328
329 int Xctype;
330 unsigned int Xfc;
331 int Xfi;
332 int Xlength;
333 int Xmax;
334 int Xmin;
335 int Xnumber;
336 int Xoffset;
337 int Xop;
338 int Xsave_capture_last;
339 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
340 int Xstacksave[REC_STACK_SAVE_MAX];
341
342 eptrblock Xnewptrb;
343
344 /* Where to jump back to */
345
346 int Xwhere;
347
348 } heapframe;
349
350 #endif
351
352
353 /***************************************************************************
354 ***************************************************************************/
355
356
357
358 /*************************************************
359 * Match from current position *
360 *************************************************/
361
362 /* This function is called recursively in many circumstances. Whenever it
363 returns a negative (error) response, the outer incarnation must also return the
364 same response.
365
366 Performance note: It might be tempting to extract commonly used fields from the
367 md structure (e.g. utf8, end_subject) into individual variables to improve
368 performance. Tests using gcc on a SPARC disproved this; in the first case, it
369 made performance worse.
370
371 Arguments:
372 eptr pointer to current character in subject
373 ecode pointer to current position in compiled code
374 offset_top current top pointer
375 md pointer to "static" info for the match
376 ims current /i, /m, and /s options
377 eptrb pointer to chain of blocks containing eptr at start of
378 brackets - for testing for empty matches
379 flags can contain
380 match_condassert - this is an assertion condition
381 match_cbegroup - this is the start of an unlimited repeat
382 group that can match an empty string
383 match_tail_recursed - this is a tail_recursed group
384 rdepth the recursion depth
385
386 Returns: MATCH_MATCH if matched ) these values are >= 0
387 MATCH_NOMATCH if failed to match )
388 a negative PCRE_ERROR_xxx value if aborted by an error condition
389 (e.g. stopped by repeated call or recursion limit)
390 */
391
392 static int
393 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
394 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
395 int flags, unsigned int rdepth)
396 {
397 /* These variables do not need to be preserved over recursion in this function,
398 so they can be ordinary variables in all cases. Mark some of them with
399 "register" because they are used a lot in loops. */
400
401 register int rrc; /* Returns from recursive calls */
402 register int i; /* Used for loops not involving calls to RMATCH() */
403 register unsigned int c; /* Character values not kept over RMATCH() calls */
404 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
405
406 BOOL minimize, possessive; /* Quantifier options */
407
408 /* When recursion is not being used, all "local" variables that have to be
409 preserved over calls to RMATCH() are part of a "frame" which is obtained from
410 heap storage. Set up the top-level frame here; others are obtained from the
411 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
412
413 #ifdef NO_RECURSE
414 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
415 frame->Xprevframe = NULL; /* Marks the top level */
416
417 /* Copy in the original argument variables */
418
419 frame->Xeptr = eptr;
420 frame->Xecode = ecode;
421 frame->Xoffset_top = offset_top;
422 frame->Xims = ims;
423 frame->Xeptrb = eptrb;
424 frame->Xflags = flags;
425 frame->Xrdepth = rdepth;
426
427 /* This is where control jumps back to in order to effect "recursion" */
428
429 HEAP_RECURSE:
430
431 /* Macros make the argument variables come from the current frame */
432
433 #define eptr frame->Xeptr
434 #define ecode frame->Xecode
435 #define offset_top frame->Xoffset_top
436 #define ims frame->Xims
437 #define eptrb frame->Xeptrb
438 #define flags frame->Xflags
439 #define rdepth frame->Xrdepth
440
441 /* Ditto for the local variables */
442
443 #ifdef SUPPORT_UTF8
444 #define charptr frame->Xcharptr
445 #endif
446 #define callpat frame->Xcallpat
447 #define data frame->Xdata
448 #define next frame->Xnext
449 #define pp frame->Xpp
450 #define prev frame->Xprev
451 #define saved_eptr frame->Xsaved_eptr
452
453 #define new_recursive frame->Xnew_recursive
454
455 #define cur_is_word frame->Xcur_is_word
456 #define condition frame->Xcondition
457 #define prev_is_word frame->Xprev_is_word
458
459 #define original_ims frame->Xoriginal_ims
460
461 #ifdef SUPPORT_UCP
462 #define prop_type frame->Xprop_type
463 #define prop_value frame->Xprop_value
464 #define prop_fail_result frame->Xprop_fail_result
465 #define prop_category frame->Xprop_category
466 #define prop_chartype frame->Xprop_chartype
467 #define prop_script frame->Xprop_script
468 #define oclength frame->Xoclength
469 #define occhars frame->Xocchars
470 #endif
471
472 #define ctype frame->Xctype
473 #define fc frame->Xfc
474 #define fi frame->Xfi
475 #define length frame->Xlength
476 #define max frame->Xmax
477 #define min frame->Xmin
478 #define number frame->Xnumber
479 #define offset frame->Xoffset
480 #define op frame->Xop
481 #define save_capture_last frame->Xsave_capture_last
482 #define save_offset1 frame->Xsave_offset1
483 #define save_offset2 frame->Xsave_offset2
484 #define save_offset3 frame->Xsave_offset3
485 #define stacksave frame->Xstacksave
486
487 #define newptrb frame->Xnewptrb
488
489 /* When recursion is being used, local variables are allocated on the stack and
490 get preserved during recursion in the normal way. In this environment, fi and
491 i, and fc and c, can be the same variables. */
492
493 #else /* NO_RECURSE not defined */
494 #define fi i
495 #define fc c
496
497
498 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
499 const uschar *charptr; /* in small blocks of the code. My normal */
500 #endif /* style of coding would have declared */
501 const uschar *callpat; /* them within each of those blocks. */
502 const uschar *data; /* However, in order to accommodate the */
503 const uschar *next; /* version of this code that uses an */
504 USPTR pp; /* external "stack" implemented on the */
505 const uschar *prev; /* heap, it is easier to declare them all */
506 USPTR saved_eptr; /* here, so the declarations can be cut */
507 /* out in a block. The only declarations */
508 recursion_info new_recursive; /* within blocks below are for variables */
509 /* that do not have to be preserved over */
510 BOOL cur_is_word; /* a recursive call to RMATCH(). */
511 BOOL condition;
512 BOOL prev_is_word;
513
514 unsigned long int original_ims;
515
516 #ifdef SUPPORT_UCP
517 int prop_type;
518 int prop_value;
519 int prop_fail_result;
520 int prop_category;
521 int prop_chartype;
522 int prop_script;
523 int oclength;
524 uschar occhars[8];
525 #endif
526
527 int ctype;
528 int length;
529 int max;
530 int min;
531 int number;
532 int offset;
533 int op;
534 int save_capture_last;
535 int save_offset1, save_offset2, save_offset3;
536 int stacksave[REC_STACK_SAVE_MAX];
537
538 eptrblock newptrb;
539 #endif /* NO_RECURSE */
540
541 /* These statements are here to stop the compiler complaining about uninitialized
542 variables. */
543
544 #ifdef SUPPORT_UCP
545 prop_value = 0;
546 prop_fail_result = 0;
547 #endif
548
549
550 /* This label is used for tail recursion, which is used in a few cases even
551 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
552 used. Thanks to Ian Taylor for noticing this possibility and sending the
553 original patch. */
554
555 TAIL_RECURSE:
556
557 /* OK, now we can get on with the real code of the function. Recursive calls
558 are specified by the macro RMATCH and RRETURN is used to return. When
559 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
560 and a "return", respectively (possibly with some debugging if DEBUG is
561 defined). However, RMATCH isn't like a function call because it's quite a
562 complicated macro. It has to be used in one particular way. This shouldn't,
563 however, impact performance when true recursion is being used. */
564
565 #ifdef SUPPORT_UTF8
566 utf8 = md->utf8; /* Local copy of the flag */
567 #else
568 utf8 = FALSE;
569 #endif
570
571 /* First check that we haven't called match() too many times, or that we
572 haven't exceeded the recursive call limit. */
573
574 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
575 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
576
577 original_ims = ims; /* Save for resetting on ')' */
578
579 /* At the start of a group with an unlimited repeat that may match an empty
580 string, the match_cbegroup flag is set. When this is the case, add the current
581 subject pointer to the chain of such remembered pointers, to be checked when we
582 hit the closing ket, in order to break infinite loops that match no characters.
583 When match() is called in other circumstances, don't add to the chain. If this
584 is a tail recursion, use a block from the workspace, as the one on the stack is
585 already used. */
586
587 if ((flags & match_cbegroup) != 0)
588 {
589 eptrblock *p;
590 if ((flags & match_tail_recursed) != 0)
591 {
592 if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
593 p = md->eptrchain + md->eptrn++;
594 }
595 else p = &newptrb;
596 p->epb_saved_eptr = eptr;
597 p->epb_prev = eptrb;
598 eptrb = p;
599 }
600
601 /* Now start processing the opcodes. */
602
603 for (;;)
604 {
605 minimize = possessive = FALSE;
606 op = *ecode;
607
608 /* For partial matching, remember if we ever hit the end of the subject after
609 matching at least one subject character. */
610
611 if (md->partial &&
612 eptr >= md->end_subject &&
613 eptr > md->start_match)
614 md->hitend = TRUE;
615
616 switch(op)
617 {
618 /* Handle a capturing bracket. If there is space in the offset vector, save
619 the current subject position in the working slot at the top of the vector.
620 We mustn't change the current values of the data slot, because they may be
621 set from a previous iteration of this group, and be referred to by a
622 reference inside the group.
623
624 If the bracket fails to match, we need to restore this value and also the
625 values of the final offsets, in case they were set by a previous iteration
626 of the same bracket.
627
628 If there isn't enough space in the offset vector, treat this as if it were
629 a non-capturing bracket. Don't worry about setting the flag for the error
630 case here; that is handled in the code for KET. */
631
632 case OP_CBRA:
633 case OP_SCBRA:
634 number = GET2(ecode, 1+LINK_SIZE);
635 offset = number << 1;
636
637 #ifdef DEBUG
638 printf("start bracket %d\n", number);
639 printf("subject=");
640 pchars(eptr, 16, TRUE, md);
641 printf("\n");
642 #endif
643
644 if (offset < md->offset_max)
645 {
646 save_offset1 = md->offset_vector[offset];
647 save_offset2 = md->offset_vector[offset+1];
648 save_offset3 = md->offset_vector[md->offset_end - number];
649 save_capture_last = md->capture_last;
650
651 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
652 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
653
654 flags = (op == OP_SCBRA)? match_cbegroup : 0;
655 do
656 {
657 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
658 ims, eptrb, flags, RM1);
659 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
660 md->capture_last = save_capture_last;
661 ecode += GET(ecode, 1);
662 }
663 while (*ecode == OP_ALT);
664
665 DPRINTF(("bracket %d failed\n", number));
666
667 md->offset_vector[offset] = save_offset1;
668 md->offset_vector[offset+1] = save_offset2;
669 md->offset_vector[md->offset_end - number] = save_offset3;
670
671 RRETURN(MATCH_NOMATCH);
672 }
673
674 /* Insufficient room for saving captured contents. Treat as a non-capturing
675 bracket. */
676
677 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
678
679 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
680 final alternative within the brackets, we would return the result of a
681 recursive call to match() whatever happened. We can reduce stack usage by
682 turning this into a tail recursion. */
683
684 case OP_BRA:
685 case OP_SBRA:
686 DPRINTF(("start non-capturing bracket\n"));
687 flags = (op >= OP_SBRA)? match_cbegroup : 0;
688 for (;;)
689 {
690 if (ecode[GET(ecode, 1)] != OP_ALT)
691 {
692 ecode += _pcre_OP_lengths[*ecode];
693 flags |= match_tail_recursed;
694 DPRINTF(("bracket 0 tail recursion\n"));
695 goto TAIL_RECURSE;
696 }
697
698 /* For non-final alternatives, continue the loop for a NOMATCH result;
699 otherwise return. */
700
701 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
702 eptrb, flags, RM2);
703 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
704 ecode += GET(ecode, 1);
705 }
706 /* Control never reaches here. */
707
708 /* Conditional group: compilation checked that there are no more than
709 two branches. If the condition is false, skipping the first branch takes us
710 past the end if there is only one branch, but that's OK because that is
711 exactly what going to the ket would do. As there is only one branch to be
712 obeyed, we can use tail recursion to avoid using another stack frame. */
713
714 case OP_COND:
715 case OP_SCOND:
716 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
717 {
718 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
719 condition = md->recursive != NULL &&
720 (offset == RREF_ANY || offset == md->recursive->group_num);
721 ecode += condition? 3 : GET(ecode, 1);
722 }
723
724 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
725 {
726 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
727 condition = offset < offset_top && md->offset_vector[offset] >= 0;
728 ecode += condition? 3 : GET(ecode, 1);
729 }
730
731 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
732 {
733 condition = FALSE;
734 ecode += GET(ecode, 1);
735 }
736
737 /* The condition is an assertion. Call match() to evaluate it - setting
738 the final argument match_condassert causes it to stop at the end of an
739 assertion. */
740
741 else
742 {
743 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
744 match_condassert, RM3);
745 if (rrc == MATCH_MATCH)
746 {
747 condition = TRUE;
748 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
749 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
750 }
751 else if (rrc != MATCH_NOMATCH)
752 {
753 RRETURN(rrc); /* Need braces because of following else */
754 }
755 else
756 {
757 condition = FALSE;
758 ecode += GET(ecode, 1);
759 }
760 }
761
762 /* We are now at the branch that is to be obeyed. As there is only one,
763 we can use tail recursion to avoid using another stack frame. If the second
764 alternative doesn't exist, we can just plough on. */
765
766 if (condition || *ecode == OP_ALT)
767 {
768 ecode += 1 + LINK_SIZE;
769 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
770 goto TAIL_RECURSE;
771 }
772 else
773 {
774 ecode += 1 + LINK_SIZE;
775 }
776 break;
777
778
779 /* End of the pattern. If we are in a top-level recursion, we should
780 restore the offsets appropriately and continue from after the call. */
781
782 case OP_END:
783 if (md->recursive != NULL && md->recursive->group_num == 0)
784 {
785 recursion_info *rec = md->recursive;
786 DPRINTF(("End of pattern in a (?0) recursion\n"));
787 md->recursive = rec->prevrec;
788 memmove(md->offset_vector, rec->offset_save,
789 rec->saved_max * sizeof(int));
790 md->start_match = rec->save_start;
791 ims = original_ims;
792 ecode = rec->after_call;
793 break;
794 }
795
796 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
797 string - backtracking will then try other alternatives, if any. */
798
799 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
800 md->end_match_ptr = eptr; /* Record where we ended */
801 md->end_offset_top = offset_top; /* and how many extracts were taken */
802 RRETURN(MATCH_MATCH);
803
804 /* Change option settings */
805
806 case OP_OPT:
807 ims = ecode[1];
808 ecode += 2;
809 DPRINTF(("ims set to %02lx\n", ims));
810 break;
811
812 /* Assertion brackets. Check the alternative branches in turn - the
813 matching won't pass the KET for an assertion. If any one branch matches,
814 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
815 start of each branch to move the current point backwards, so the code at
816 this level is identical to the lookahead case. */
817
818 case OP_ASSERT:
819 case OP_ASSERTBACK:
820 do
821 {
822 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
823 RM4);
824 if (rrc == MATCH_MATCH) break;
825 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
826 ecode += GET(ecode, 1);
827 }
828 while (*ecode == OP_ALT);
829 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
830
831 /* If checking an assertion for a condition, return MATCH_MATCH. */
832
833 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
834
835 /* Continue from after the assertion, updating the offsets high water
836 mark, since extracts may have been taken during the assertion. */
837
838 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
839 ecode += 1 + LINK_SIZE;
840 offset_top = md->end_offset_top;
841 continue;
842
843 /* Negative assertion: all branches must fail to match */
844
845 case OP_ASSERT_NOT:
846 case OP_ASSERTBACK_NOT:
847 do
848 {
849 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
850 RM5);
851 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
852 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
853 ecode += GET(ecode,1);
854 }
855 while (*ecode == OP_ALT);
856
857 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
858
859 ecode += 1 + LINK_SIZE;
860 continue;
861
862 /* Move the subject pointer back. This occurs only at the start of
863 each branch of a lookbehind assertion. If we are too close to the start to
864 move back, this match function fails. When working with UTF-8 we move
865 back a number of characters, not bytes. */
866
867 case OP_REVERSE:
868 #ifdef SUPPORT_UTF8
869 if (utf8)
870 {
871 i = GET(ecode, 1);
872 while (i-- > 0)
873 {
874 eptr--;
875 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
876 BACKCHAR(eptr)
877 }
878 }
879 else
880 #endif
881
882 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
883
884 {
885 eptr -= GET(ecode, 1);
886 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
887 }
888
889 /* Skip to next op code */
890
891 ecode += 1 + LINK_SIZE;
892 break;
893
894 /* The callout item calls an external function, if one is provided, passing
895 details of the match so far. This is mainly for debugging, though the
896 function is able to force a failure. */
897
898 case OP_CALLOUT:
899 if (pcre_callout != NULL)
900 {
901 pcre_callout_block cb;
902 cb.version = 1; /* Version 1 of the callout block */
903 cb.callout_number = ecode[1];
904 cb.offset_vector = md->offset_vector;
905 cb.subject = (PCRE_SPTR)md->start_subject;
906 cb.subject_length = md->end_subject - md->start_subject;
907 cb.start_match = md->start_match - md->start_subject;
908 cb.current_position = eptr - md->start_subject;
909 cb.pattern_position = GET(ecode, 2);
910 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
911 cb.capture_top = offset_top/2;
912 cb.capture_last = md->capture_last;
913 cb.callout_data = md->callout_data;
914 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
915 if (rrc < 0) RRETURN(rrc);
916 }
917 ecode += 2 + 2*LINK_SIZE;
918 break;
919
920 /* Recursion either matches the current regex, or some subexpression. The
921 offset data is the offset to the starting bracket from the start of the
922 whole pattern. (This is so that it works from duplicated subpatterns.)
923
924 If there are any capturing brackets started but not finished, we have to
925 save their starting points and reinstate them after the recursion. However,
926 we don't know how many such there are (offset_top records the completed
927 total) so we just have to save all the potential data. There may be up to
928 65535 such values, which is too large to put on the stack, but using malloc
929 for small numbers seems expensive. As a compromise, the stack is used when
930 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
931 is used. If the malloc fails, no partial save is attempted: matching stops
932 at this point and PCRE_ERROR_NOMEMORY is returned via RRETURN, in the same
933 way as any other error code.
934
935 There are also other values that have to be saved. We use a chained
936 sequence of blocks that actually live on the stack. Thanks to Robin Houston
937 for the original version of this logic. */
938
939 case OP_RECURSE:
940 {
941 callpat = md->start_code + GET(ecode, 1);
942 new_recursive.group_num = (callpat == md->start_code)? 0 :
943 GET2(callpat, 1 + LINK_SIZE);
944
945 /* Add to "recursing stack" */
946
947 new_recursive.prevrec = md->recursive;
948 md->recursive = &new_recursive;
949
950 /* Find where to continue from afterwards */
951
952 ecode += 1 + LINK_SIZE;
953 new_recursive.after_call = ecode;
954
955 /* Now save the offset data. */
956
957 new_recursive.saved_max = md->offset_end;
958 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
959 new_recursive.offset_save = stacksave;
960 else
961 {
962 new_recursive.offset_save =
963 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
964 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
965 }
966
967 memcpy(new_recursive.offset_save, md->offset_vector,
968 new_recursive.saved_max * sizeof(int));
969 new_recursive.save_start = md->start_match;
970 md->start_match = eptr;
971
972 /* OK, now we can do the recursion. For each top-level alternative we
973 restore the offset and recursion data. */
974
975 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
976 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
977 do
978 {
979 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
980 md, ims, eptrb, flags, RM6);
981 if (rrc == MATCH_MATCH)
982 {
983 DPRINTF(("Recursion matched\n"));
984 md->recursive = new_recursive.prevrec;
985 if (new_recursive.offset_save != stacksave)
986 (pcre_free)(new_recursive.offset_save);
987 RRETURN(MATCH_MATCH);
988 }
989 else if (rrc != MATCH_NOMATCH)
990 {
991 DPRINTF(("Recursion gave error %d\n", rrc));
992 RRETURN(rrc);
993 }
994
995 md->recursive = &new_recursive;
996 memcpy(md->offset_vector, new_recursive.offset_save,
997 new_recursive.saved_max * sizeof(int));
998 callpat += GET(callpat, 1);
999 }
1000 while (*callpat == OP_ALT);
1001
1002 DPRINTF(("Recursion didn't match\n"));
1003 md->recursive = new_recursive.prevrec;
1004 if (new_recursive.offset_save != stacksave)
1005 (pcre_free)(new_recursive.offset_save);
1006 RRETURN(MATCH_NOMATCH);
1007 }
1008 /* Control never reaches here */
1009
1010 /* "Once" brackets are like assertion brackets except that after a match,
1011 the point in the subject string is not moved back. Thus there can never be
1012 a move back into the brackets. Friedl calls these "atomic" subpatterns.
1013 Check the alternative branches in turn - the matching won't pass the KET
1014 for this kind of subpattern. If any one branch matches, we carry on as at
1015 the end of a normal bracket, leaving the subject pointer. */
1016
1017 case OP_ONCE:
1018 prev = ecode;
1019 saved_eptr = eptr;
1020
1021 do
1022 {
1023 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1024 eptrb, 0, RM7);
1025 if (rrc == MATCH_MATCH) break;
1026 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1027 ecode += GET(ecode,1);
1028 }
1029 while (*ecode == OP_ALT);
1030
1031 /* If we hit the end of the group (which could be repeated) without any branch matching, fail */
1032
1033 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1034
1035 /* Continue as from after the assertion, updating the offsets high water
1036 mark, since extracts may have been taken. */
1037
1038 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1039
1040 offset_top = md->end_offset_top;
1041 eptr = md->end_match_ptr;
1042
1043 /* For a non-repeating ket, just continue at this level. This also
1044 happens for a repeating ket if no characters were matched in the group.
1045 This is the forcible breaking of infinite loops as implemented in Perl
1046 5.005. If there is an options reset, it will get obeyed in the normal
1047 course of events. */
1048
1049 if (*ecode == OP_KET || eptr == saved_eptr)
1050 {
1051 ecode += 1+LINK_SIZE;
1052 break;
1053 }
1054
1055 /* The repeating kets try the rest of the pattern or restart from the
1056 preceding bracket, in the appropriate order. The second "call" of match()
1057 uses tail recursion, to avoid using another stack frame. We need to reset
1058 any options that changed within the bracket before re-running it, so
1059 check the next opcode. */
1060
1061 if (ecode[1+LINK_SIZE] == OP_OPT)
1062 {
1063 ims = (ims & ~PCRE_IMS) | ecode[4];
1064 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1065 }
1066
1067 if (*ecode == OP_KETRMIN)
1068 {
1069 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,
1070 RM8);
1071 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1072 ecode = prev;
1073 flags = match_tail_recursed;
1074 goto TAIL_RECURSE;
1075 }
1076 else /* OP_KETRMAX */
1077 {
1078 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1079 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1080 ecode += 1 + LINK_SIZE;
1081 flags = match_tail_recursed;
1082 goto TAIL_RECURSE;
1083 }
1084 /* Control never gets here */
1085
1086 /* An alternation is the end of a branch; scan along to find the end of the
1087 bracketed group and go to there. */
1088
1089 case OP_ALT:
1090 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1091 break;
1092
1093 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1094 that it may occur zero times. It may repeat infinitely, or not at all -
1095 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1096 repeat limits are compiled as a number of copies, with the optional ones
1097 preceded by BRAZERO or BRAMINZERO. */
1098
1099 case OP_BRAZERO:
1100 {
1101 next = ecode+1;
1102 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1103 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1104 do next += GET(next,1); while (*next == OP_ALT);
1105 ecode = next + 1 + LINK_SIZE;
1106 }
1107 break;
1108
1109 case OP_BRAMINZERO:
1110 {
1111 next = ecode+1;
1112 do next += GET(next, 1); while (*next == OP_ALT);
1113 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1114 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1115 ecode++;
1116 }
1117 break;
1118
1119 /* End of a group, repeated or non-repeating. */
1120
1121 case OP_KET:
1122 case OP_KETRMIN:
1123 case OP_KETRMAX:
1124 prev = ecode - GET(ecode, 1);
1125
1126 /* If this was a group that remembered the subject start, in order to break
1127 infinite repeats of empty string matches, retrieve the subject start from
1128 the chain. Otherwise, set it NULL. */
1129
1130 if (*prev >= OP_SBRA)
1131 {
1132 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1133 eptrb = eptrb->epb_prev; /* Backup to previous group */
1134 }
1135 else saved_eptr = NULL;
1136
1137 /* If we are at the end of an assertion group, stop matching and return
1138 MATCH_MATCH, but record the current high water mark for use by positive
1139 assertions. Do this also for the "once" (atomic) groups. */
1140
1141 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1142 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1143 *prev == OP_ONCE)
1144 {
1145 md->end_match_ptr = eptr; /* For ONCE */
1146 md->end_offset_top = offset_top;
1147 RRETURN(MATCH_MATCH);
1148 }
1149
1150 /* For capturing groups we have to check the group number back at the start
1151 and if necessary complete handling an extraction by setting the offsets and
1152 bumping the high water mark. Note that whole-pattern recursion is coded as
1153 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1154 when the OP_END is reached. Other recursion is handled here. */
1155
1156 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1157 {
1158 number = GET2(prev, 1+LINK_SIZE);
1159 offset = number << 1;
1160
1161 #ifdef DEBUG
1162 printf("end bracket %d", number);
1163 printf("\n");
1164 #endif
1165
1166 md->capture_last = number;
1167 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1168 {
1169 md->offset_vector[offset] =
1170 md->offset_vector[md->offset_end - number];
1171 md->offset_vector[offset+1] = eptr - md->start_subject;
1172 if (offset_top <= offset) offset_top = offset + 2;
1173 }
1174
1175 /* Handle a recursively called group. Restore the offsets
1176 appropriately and continue from after the call. */
1177
1178 if (md->recursive != NULL && md->recursive->group_num == number)
1179 {
1180 recursion_info *rec = md->recursive;
1181 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1182 md->recursive = rec->prevrec;
1183 md->start_match = rec->save_start;
1184 memcpy(md->offset_vector, rec->offset_save,
1185 rec->saved_max * sizeof(int));
1186 ecode = rec->after_call;
1187 ims = original_ims;
1188 break;
1189 }
1190 }
1191
1192 /* For both capturing and non-capturing groups, reset the value of the ims
1193 flags, in case they got changed during the group. */
1194
1195 ims = original_ims;
1196 DPRINTF(("ims reset to %02lx\n", ims));
1197
1198 /* For a non-repeating ket, just continue at this level. This also
1199 happens for a repeating ket if no characters were matched in the group.
1200 This is the forcible breaking of infinite loops as implemented in Perl
1201 5.005. If there is an options reset, it will get obeyed in the normal
1202 course of events. */
1203
1204 if (*ecode == OP_KET || eptr == saved_eptr)
1205 {
1206 ecode += 1 + LINK_SIZE;
1207 break;
1208 }
1209
1210 /* The repeating kets try the rest of the pattern or restart from the
1211 preceding bracket, in the appropriate order. In the second case, we can use
1212 tail recursion to avoid using another stack frame. */
1213
1214 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1215
1216 if (*ecode == OP_KETRMIN)
1217 {
1218 RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,
1219 RM12);
1220 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1221 ecode = prev;
1222 flags |= match_tail_recursed;
1223 goto TAIL_RECURSE;
1224 }
1225 else /* OP_KETRMAX */
1226 {
1227 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1228 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1229 ecode += 1 + LINK_SIZE;
1230 flags = match_tail_recursed;
1231 goto TAIL_RECURSE;
1232 }
1233 /* Control never gets here */
1234
1235 /* Start of subject unless notbol, or after internal newline if multiline */
1236
1237 case OP_CIRC:
1238 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1239 if ((ims & PCRE_MULTILINE) != 0)
1240 {
1241 if (eptr != md->start_subject &&
1242 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1243 RRETURN(MATCH_NOMATCH);
1244 ecode++;
1245 break;
1246 }
1247 /* ... else fall through */
1248
1249 /* Start of subject assertion */
1250
1251 case OP_SOD:
1252 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1253 ecode++;
1254 break;
1255
1256 /* Start of match assertion */
1257
1258 case OP_SOM:
1259 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1260 ecode++;
1261 break;
1262
1263 /* Assert before internal newline if multiline, or before a terminating
1264 newline unless endonly is set, else end of subject unless noteol is set. */
1265
1266 case OP_DOLL:
1267 if ((ims & PCRE_MULTILINE) != 0)
1268 {
1269 if (eptr < md->end_subject)
1270 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1271 else
1272 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1273 ecode++;
1274 break;
1275 }
1276 else
1277 {
1278 if (md->noteol) RRETURN(MATCH_NOMATCH);
1279 if (!md->endonly)
1280 {
1281 if (eptr != md->end_subject &&
1282 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1283 RRETURN(MATCH_NOMATCH);
1284 ecode++;
1285 break;
1286 }
1287 }
1288 /* ... else fall through for endonly */
1289
1290 /* End of subject assertion (\z) */
1291
1292 case OP_EOD:
1293 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1294 ecode++;
1295 break;
1296
1297 /* End of subject or ending \n assertion (\Z) */
1298
1299 case OP_EODN:
1300 if (eptr != md->end_subject &&
1301 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1302 RRETURN(MATCH_NOMATCH);
1303 ecode++;
1304 break;
1305
1306 /* Word boundary assertions */
1307
1308 case OP_NOT_WORD_BOUNDARY:
1309 case OP_WORD_BOUNDARY:
1310 {
1311
1312 /* Find out if the previous and current characters are "word" characters.
1313 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1314 be "non-word" characters. */
1315
1316 #ifdef SUPPORT_UTF8
1317 if (utf8)
1318 {
1319 if (eptr == md->start_subject) prev_is_word = FALSE; else
1320 {
1321 const uschar *lastptr = eptr - 1;
1322 while((*lastptr & 0xc0) == 0x80) lastptr--;
1323 GETCHAR(c, lastptr);
1324 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1325 }
1326 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1327 {
1328 GETCHAR(c, eptr);
1329 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1330 }
1331 }
1332 else
1333 #endif
1334
1335 /* More streamlined when not in UTF-8 mode */
1336
1337 {
1338 prev_is_word = (eptr != md->start_subject) &&
1339 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1340 cur_is_word = (eptr < md->end_subject) &&
1341 ((md->ctypes[*eptr] & ctype_word) != 0);
1342 }
1343
1344 /* Now see if the situation is what we want */
1345
1346 if ((*ecode++ == OP_WORD_BOUNDARY)?
1347 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1348 RRETURN(MATCH_NOMATCH);
1349 }
1350 break;
1351
1352 /* Match a single character type; inline for speed */
1353
1354 case OP_ANY:
1355 if ((ims & PCRE_DOTALL) == 0)
1356 {
1357 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1358 }
1359 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1360 if (utf8)
1361 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1362 ecode++;
1363 break;
1364
1365 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1366 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1367
1368 case OP_ANYBYTE:
1369 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1370 ecode++;
1371 break;
1372
1373 case OP_NOT_DIGIT:
1374 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1375 GETCHARINCTEST(c, eptr);
1376 if (
1377 #ifdef SUPPORT_UTF8
1378 c < 256 &&
1379 #endif
1380 (md->ctypes[c] & ctype_digit) != 0
1381 )
1382 RRETURN(MATCH_NOMATCH);
1383 ecode++;
1384 break;
1385
1386 case OP_DIGIT:
1387 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1388 GETCHARINCTEST(c, eptr);
1389 if (
1390 #ifdef SUPPORT_UTF8
1391 c >= 256 ||
1392 #endif
1393 (md->ctypes[c] & ctype_digit) == 0
1394 )
1395 RRETURN(MATCH_NOMATCH);
1396 ecode++;
1397 break;
1398
1399 case OP_NOT_WHITESPACE:
1400 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1401 GETCHARINCTEST(c, eptr);
1402 if (
1403 #ifdef SUPPORT_UTF8
1404 c < 256 &&
1405 #endif
1406 (md->ctypes[c] & ctype_space) != 0
1407 )
1408 RRETURN(MATCH_NOMATCH);
1409 ecode++;
1410 break;
1411
1412 case OP_WHITESPACE:
1413 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1414 GETCHARINCTEST(c, eptr);
1415 if (
1416 #ifdef SUPPORT_UTF8
1417 c >= 256 ||
1418 #endif
1419 (md->ctypes[c] & ctype_space) == 0
1420 )
1421 RRETURN(MATCH_NOMATCH);
1422 ecode++;
1423 break;
1424
1425 case OP_NOT_WORDCHAR:
1426 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1427 GETCHARINCTEST(c, eptr);
1428 if (
1429 #ifdef SUPPORT_UTF8
1430 c < 256 &&
1431 #endif
1432 (md->ctypes[c] & ctype_word) != 0
1433 )
1434 RRETURN(MATCH_NOMATCH);
1435 ecode++;
1436 break;
1437
1438 case OP_WORDCHAR:
1439 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1440 GETCHARINCTEST(c, eptr);
1441 if (
1442 #ifdef SUPPORT_UTF8
1443 c >= 256 ||
1444 #endif
1445 (md->ctypes[c] & ctype_word) == 0
1446 )
1447 RRETURN(MATCH_NOMATCH);
1448 ecode++;
1449 break;
1450
1451 case OP_ANYNL:
1452 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1453 GETCHARINCTEST(c, eptr);
1454 switch(c)
1455 {
1456 default: RRETURN(MATCH_NOMATCH);
1457 case 0x000d:
1458 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1459 break;
1460 case 0x000a:
1461 case 0x000b:
1462 case 0x000c:
1463 case 0x0085:
1464 case 0x2028:
1465 case 0x2029:
1466 break;
1467 }
1468 ecode++;
1469 break;
1470
1471 #ifdef SUPPORT_UCP
1472 /* Check the next character by Unicode property. We will get here only
1473 if the support is in the binary; otherwise a compile-time error occurs. */
1474
1475 case OP_PROP:
1476 case OP_NOTPROP:
1477 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1478 GETCHARINCTEST(c, eptr);
1479 {
1480 int chartype, script;
1481 int category = _pcre_ucp_findprop(c, &chartype, &script);
1482
1483 switch(ecode[1])
1484 {
1485 case PT_ANY:
1486 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1487 break;
1488
1489 case PT_LAMP:
1490 if ((chartype == ucp_Lu ||
1491 chartype == ucp_Ll ||
1492 chartype == ucp_Lt) == (op == OP_NOTPROP))
1493 RRETURN(MATCH_NOMATCH);
1494 break;
1495
1496 case PT_GC:
1497 if ((ecode[2] != category) == (op == OP_PROP))
1498 RRETURN(MATCH_NOMATCH);
1499 break;
1500
1501 case PT_PC:
1502 if ((ecode[2] != chartype) == (op == OP_PROP))
1503 RRETURN(MATCH_NOMATCH);
1504 break;
1505
1506 case PT_SC:
1507 if ((ecode[2] != script) == (op == OP_PROP))
1508 RRETURN(MATCH_NOMATCH);
1509 break;
1510
1511 default:
1512 RRETURN(PCRE_ERROR_INTERNAL);
1513 }
1514
1515 ecode += 3;
1516 }
1517 break;
1518
1519 /* Match an extended Unicode sequence. We will get here only if the support
1520 is in the binary; otherwise a compile-time error occurs. */
1521
1522 case OP_EXTUNI:
1523 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1524 GETCHARINCTEST(c, eptr);
1525 {
1526 int chartype, script;
1527 int category = _pcre_ucp_findprop(c, &chartype, &script);
1528 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1529 while (eptr < md->end_subject)
1530 {
1531 int len = 1;
1532 if (!utf8) c = *eptr; else
1533 {
1534 GETCHARLEN(c, eptr, len);
1535 }
1536 category = _pcre_ucp_findprop(c, &chartype, &script);
1537 if (category != ucp_M) break;
1538 eptr += len;
1539 }
1540 }
1541 ecode++;
1542 break;
1543 #endif
1544
1545
1546 /* Match a back reference, possibly repeatedly. Look past the end of the
1547 item to see if there is repeat information following. The code is similar
1548 to that for character classes, but repeated for efficiency. Then obey
1549 similar code to character type repeats - written out again for speed.
1550 However, if the referenced string is the empty string, always treat
1551 it as matched, any number of times (otherwise there could be infinite
1552 loops). */
1553
1554 case OP_REF:
1555 {
1556 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1557 ecode += 3; /* Advance past item */
1558
1559 /* If the reference is unset, set the length to be longer than the amount
1560 of subject left; this ensures that every attempt at a match fails. We
1561 can't just fail here, because of the possibility of quantifiers with zero
1562 minima. */
1563
1564 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1565 md->end_subject - eptr + 1 :
1566 md->offset_vector[offset+1] - md->offset_vector[offset];
1567
1568 /* Set up for repetition, or handle the non-repeated case */
1569
1570 switch (*ecode)
1571 {
1572 case OP_CRSTAR:
1573 case OP_CRMINSTAR:
1574 case OP_CRPLUS:
1575 case OP_CRMINPLUS:
1576 case OP_CRQUERY:
1577 case OP_CRMINQUERY:
1578 c = *ecode++ - OP_CRSTAR;
1579 minimize = (c & 1) != 0;
1580 min = rep_min[c]; /* Pick up values from tables; */
1581 max = rep_max[c]; /* zero for max => infinity */
1582 if (max == 0) max = INT_MAX;
1583 break;
1584
1585 case OP_CRRANGE:
1586 case OP_CRMINRANGE:
1587 minimize = (*ecode == OP_CRMINRANGE);
1588 min = GET2(ecode, 1);
1589 max = GET2(ecode, 3);
1590 if (max == 0) max = INT_MAX;
1591 ecode += 5;
1592 break;
1593
1594 default: /* No repeat follows */
1595 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1596 eptr += length;
1597 continue; /* With the main loop */
1598 }
1599
1600 /* If the length of the reference is zero, just continue with the
1601 main loop. */
1602
1603 if (length == 0) continue;
1604
1605 /* First, ensure the minimum number of matches are present. We get back
1606 the length of the reference string explicitly rather than passing the
1607 address of eptr, so that eptr can be a register variable. */
1608
1609 for (i = 1; i <= min; i++)
1610 {
1611 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1612 eptr += length;
1613 }
1614
1615 /* If min = max, continue at the same level without recursion.
1616 They are not both allowed to be zero. */
1617
1618 if (min == max) continue;
1619
1620 /* If minimizing, keep trying and advancing the pointer */
1621
1622 if (minimize)
1623 {
1624 for (fi = min;; fi++)
1625 {
1626 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1628 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1629 RRETURN(MATCH_NOMATCH);
1630 eptr += length;
1631 }
1632 /* Control never gets here */
1633 }
1634
1635 /* If maximizing, find the longest string and work backwards */
1636
1637 else
1638 {
1639 pp = eptr;
1640 for (i = min; i < max; i++)
1641 {
1642 if (!match_ref(offset, eptr, length, md, ims)) break;
1643 eptr += length;
1644 }
1645 while (eptr >= pp)
1646 {
1647 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1648 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1649 eptr -= length;
1650 }
1651 RRETURN(MATCH_NOMATCH);
1652 }
1653 }
1654 /* Control never gets here */
1655
1656
1657
1658 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1659 used when all the characters in the class have values in the range 0-255,
1660 and either the matching is caseful, or the characters are in the range
1661 0-127 when UTF-8 processing is enabled. The only difference between
1662 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1663 encountered.
1664
1665 First, look past the end of the item to see if there is repeat information
1666 following. Then obey similar code to character type repeats - written out
1667 again for speed. */
1668
1669 case OP_NCLASS:
1670 case OP_CLASS:
1671 {
1672 data = ecode + 1; /* Save for matching */
1673 ecode += 33; /* Advance past the item */
1674
1675 switch (*ecode)
1676 {
1677 case OP_CRSTAR:
1678 case OP_CRMINSTAR:
1679 case OP_CRPLUS:
1680 case OP_CRMINPLUS:
1681 case OP_CRQUERY:
1682 case OP_CRMINQUERY:
1683 c = *ecode++ - OP_CRSTAR;
1684 minimize = (c & 1) != 0;
1685 min = rep_min[c]; /* Pick up values from tables; */
1686 max = rep_max[c]; /* zero for max => infinity */
1687 if (max == 0) max = INT_MAX;
1688 break;
1689
1690 case OP_CRRANGE:
1691 case OP_CRMINRANGE:
1692 minimize = (*ecode == OP_CRMINRANGE);
1693 min = GET2(ecode, 1);
1694 max = GET2(ecode, 3);
1695 if (max == 0) max = INT_MAX;
1696 ecode += 5;
1697 break;
1698
1699 default: /* No repeat follows */
1700 min = max = 1;
1701 break;
1702 }
1703
1704 /* First, ensure the minimum number of matches are present. */
1705
1706 #ifdef SUPPORT_UTF8
1707 /* UTF-8 mode */
1708 if (utf8)
1709 {
1710 for (i = 1; i <= min; i++)
1711 {
1712 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1713 GETCHARINC(c, eptr);
1714 if (c > 255)
1715 {
1716 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1717 }
1718 else
1719 {
1720 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1721 }
1722 }
1723 }
1724 else
1725 #endif
1726 /* Not UTF-8 mode */
1727 {
1728 for (i = 1; i <= min; i++)
1729 {
1730 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1731 c = *eptr++;
1732 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1733 }
1734 }
1735
1736 /* If max == min we can continue with the main loop without the
1737 need to recurse. */
1738
1739 if (min == max) continue;
1740
1741 /* If minimizing, keep testing the rest of the expression and advancing
1742 the pointer while it matches the class. */
1743
1744 if (minimize)
1745 {
1746 #ifdef SUPPORT_UTF8
1747 /* UTF-8 mode */
1748 if (utf8)
1749 {
1750 for (fi = min;; fi++)
1751 {
1752 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1754 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1755 GETCHARINC(c, eptr);
1756 if (c > 255)
1757 {
1758 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1759 }
1760 else
1761 {
1762 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1763 }
1764 }
1765 }
1766 else
1767 #endif
1768 /* Not UTF-8 mode */
1769 {
1770 for (fi = min;; fi++)
1771 {
1772 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1773 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1774 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1775 c = *eptr++;
1776 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1777 }
1778 }
1779 /* Control never gets here */
1780 }
1781
1782 /* If maximizing, find the longest possible run, then work backwards. */
1783
1784 else
1785 {
1786 pp = eptr;
1787
1788 #ifdef SUPPORT_UTF8
1789 /* UTF-8 mode */
1790 if (utf8)
1791 {
1792 for (i = min; i < max; i++)
1793 {
1794 int len = 1;
1795 if (eptr >= md->end_subject) break;
1796 GETCHARLEN(c, eptr, len);
1797 if (c > 255)
1798 {
1799 if (op == OP_CLASS) break;
1800 }
1801 else
1802 {
1803 if ((data[c/8] & (1 << (c&7))) == 0) break;
1804 }
1805 eptr += len;
1806 }
1807 for (;;)
1808 {
1809 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1811 if (eptr-- == pp) break; /* Stop if tried at original pos */
1812 BACKCHAR(eptr);
1813 }
1814 }
1815 else
1816 #endif
1817 /* Not UTF-8 mode */
1818 {
1819 for (i = min; i < max; i++)
1820 {
1821 if (eptr >= md->end_subject) break;
1822 c = *eptr;
1823 if ((data[c/8] & (1 << (c&7))) == 0) break;
1824 eptr++;
1825 }
1826 while (eptr >= pp)
1827 {
1828 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1829 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1830 eptr--;
1831 }
1832 }
1833
1834 RRETURN(MATCH_NOMATCH);
1835 }
1836 }
1837 /* Control never gets here */
1838
1839
1840 /* Match an extended character class. This opcode is encountered only
1841 in UTF-8 mode, because that's the only time it is compiled. */
1842
1843 #ifdef SUPPORT_UTF8
1844 case OP_XCLASS:
1845 {
1846 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1847 ecode += GET(ecode, 1); /* Advance past the item */
1848
1849 switch (*ecode)
1850 {
1851 case OP_CRSTAR:
1852 case OP_CRMINSTAR:
1853 case OP_CRPLUS:
1854 case OP_CRMINPLUS:
1855 case OP_CRQUERY:
1856 case OP_CRMINQUERY:
1857 c = *ecode++ - OP_CRSTAR;
1858 minimize = (c & 1) != 0;
1859 min = rep_min[c]; /* Pick up values from tables; */
1860 max = rep_max[c]; /* zero for max => infinity */
1861 if (max == 0) max = INT_MAX;
1862 break;
1863
1864 case OP_CRRANGE:
1865 case OP_CRMINRANGE:
1866 minimize = (*ecode == OP_CRMINRANGE);
1867 min = GET2(ecode, 1);
1868 max = GET2(ecode, 3);
1869 if (max == 0) max = INT_MAX;
1870 ecode += 5;
1871 break;
1872
1873 default: /* No repeat follows */
1874 min = max = 1;
1875 break;
1876 }
1877
1878 /* First, ensure the minimum number of matches are present. */
1879
1880 for (i = 1; i <= min; i++)
1881 {
1882 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1883 GETCHARINC(c, eptr);
1884 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1885 }
1886
1887 /* If max == min we can continue with the main loop without the
1888 need to recurse. */
1889
1890 if (min == max) continue;
1891
1892 /* If minimizing, keep testing the rest of the expression and advancing
1893 the pointer while it matches the class. */
1894
1895 if (minimize)
1896 {
1897 for (fi = min;; fi++)
1898 {
1899 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
1900 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1901 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1902 GETCHARINC(c, eptr);
1903 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1904 }
1905 /* Control never gets here */
1906 }
1907
1908 /* If maximizing, find the longest possible run, then work backwards. */
1909
1910 else
1911 {
1912 pp = eptr;
1913 for (i = min; i < max; i++)
1914 {
1915 int len = 1;
1916 if (eptr >= md->end_subject) break;
1917 GETCHARLEN(c, eptr, len);
1918 if (!_pcre_xclass(c, data)) break;
1919 eptr += len;
1920 }
1921 for(;;)
1922 {
1923 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
1924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1925 if (eptr-- == pp) break; /* Stop if tried at original pos */
1926 BACKCHAR(eptr)
1927 }
1928 RRETURN(MATCH_NOMATCH);
1929 }
1930
1931 /* Control never gets here */
1932 }
1933 #endif /* End of XCLASS */
1934
1935 /* Match a single character, casefully */
1936
1937 case OP_CHAR:
1938 #ifdef SUPPORT_UTF8
1939 if (utf8)
1940 {
1941 length = 1;
1942 ecode++;
1943 GETCHARLEN(fc, ecode, length);
1944 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1945 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1946 }
1947 else
1948 #endif
1949
1950 /* Non-UTF-8 mode */
1951 {
1952 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1953 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1954 ecode += 2;
1955 }
1956 break;
1957
1958 /* Match a single character, caselessly */
1959
1960 case OP_CHARNC:
1961 #ifdef SUPPORT_UTF8
1962 if (utf8)
1963 {
1964 length = 1;
1965 ecode++;
1966 GETCHARLEN(fc, ecode, length);
1967
1968 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1969
1970 /* If the pattern character's value is < 128, we have only one byte, and
1971 can use the fast lookup table. */
1972
1973 if (fc < 128)
1974 {
1975 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1976 }
1977
1978 /* Otherwise we must pick up the subject character */
1979
1980 else
1981 {
1982 unsigned int dc;
1983 GETCHARINC(dc, eptr);
1984 ecode += length;
1985
1986 /* If we have Unicode property support, we can use it to test the other
1987 case of the character, if there is one. */
1988
1989 if (fc != dc)
1990 {
1991 #ifdef SUPPORT_UCP
1992 if (dc != _pcre_ucp_othercase(fc))
1993 #endif
1994 RRETURN(MATCH_NOMATCH);
1995 }
1996 }
1997 }
1998 else
1999 #endif /* SUPPORT_UTF8 */
2000
2001 /* Non-UTF-8 mode */
2002 {
2003 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2004 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2005 ecode += 2;
2006 }
2007 break;
2008
2009 /* Match a single character repeatedly. */
2010
2011 case OP_EXACT:
2012 min = max = GET2(ecode, 1);
2013 ecode += 3;
2014 goto REPEATCHAR;
2015
2016 case OP_POSUPTO:
2017 possessive = TRUE;
2018 /* Fall through */
2019
2020 case OP_UPTO:
2021 case OP_MINUPTO:
2022 min = 0;
2023 max = GET2(ecode, 1);
2024 minimize = *ecode == OP_MINUPTO;
2025 ecode += 3;
2026 goto REPEATCHAR;
2027
2028 case OP_POSSTAR:
2029 possessive = TRUE;
2030 min = 0;
2031 max = INT_MAX;
2032 ecode++;
2033 goto REPEATCHAR;
2034
2035 case OP_POSPLUS:
2036 possessive = TRUE;
2037 min = 1;
2038 max = INT_MAX;
2039 ecode++;
2040 goto REPEATCHAR;
2041
2042 case OP_POSQUERY:
2043 possessive = TRUE;
2044 min = 0;
2045 max = 1;
2046 ecode++;
2047 goto REPEATCHAR;
2048
2049 case OP_STAR:
2050 case OP_MINSTAR:
2051 case OP_PLUS:
2052 case OP_MINPLUS:
2053 case OP_QUERY:
2054 case OP_MINQUERY:
2055 c = *ecode++ - OP_STAR;
2056 minimize = (c & 1) != 0;
2057 min = rep_min[c]; /* Pick up values from tables; */
2058 max = rep_max[c]; /* zero for max => infinity */
2059 if (max == 0) max = INT_MAX;
2060
2061 /* Common code for all repeated single-character matches. We can give
2062 up quickly if there are fewer than the minimum number of characters left in
2063 the subject. */
2064
2065 REPEATCHAR:
2066 #ifdef SUPPORT_UTF8
2067 if (utf8)
2068 {
2069 length = 1;
2070 charptr = ecode;
2071 GETCHARLEN(fc, ecode, length);
2072 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2073 ecode += length;
2074
2075 /* Handle multibyte character matching specially here. There is
2076 support for caseless matching if UCP support is present. */
2077
2078 if (length > 1)
2079 {
2080 #ifdef SUPPORT_UCP
2081 unsigned int othercase;
2082 if ((ims & PCRE_CASELESS) != 0 &&
2083 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2084 oclength = _pcre_ord2utf8(othercase, occhars);
2085 else oclength = 0;
2086 #endif /* SUPPORT_UCP */
2087
2088 for (i = 1; i <= min; i++)
2089 {
2090 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2091 #ifdef SUPPORT_UCP
2092 /* Need braces because of following else */
2093 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2094 else
2095 {
2096 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2097 eptr += oclength;
2098 }
2099 #else /* without SUPPORT_UCP */
2100 else { RRETURN(MATCH_NOMATCH); }
2101 #endif /* SUPPORT_UCP */
2102 }
2103
2104 if (min == max) continue;
2105
2106 if (minimize)
2107 {
2108 for (fi = min;; fi++)
2109 {
2110 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2111 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2112 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2113 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2114 #ifdef SUPPORT_UCP
2115 /* Need braces because of following else */
2116 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2117 else
2118 {
2119 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2120 eptr += oclength;
2121 }
2122 #else /* without SUPPORT_UCP */
2123 else { RRETURN (MATCH_NOMATCH); }
2124 #endif /* SUPPORT_UCP */
2125 }
2126 /* Control never gets here */
2127 }
2128
2129 else /* Maximize */
2130 {
2131 pp = eptr;
2132 for (i = min; i < max; i++)
2133 {
2134 if (eptr > md->end_subject - length) break;
2135 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2136 #ifdef SUPPORT_UCP
2137 else if (oclength == 0) break;
2138 else
2139 {
2140 if (memcmp(eptr, occhars, oclength) != 0) break;
2141 eptr += oclength;
2142 }
2143 #else /* without SUPPORT_UCP */
2144 else break;
2145 #endif /* SUPPORT_UCP */
2146 }
2147
2148 if (possessive) continue;
2149 for(;;)
2150 {
2151 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2152 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2153 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2154 #ifdef SUPPORT_UCP
2155 eptr--;
2156 BACKCHAR(eptr);
2157 #else /* without SUPPORT_UCP */
2158 eptr -= length;
2159 #endif /* SUPPORT_UCP */
2160 }
2161 }
2162 /* Control never gets here */
2163 }
2164
2165 /* If the length of a UTF-8 character is 1, we fall through here, and
2166 obey the code as for non-UTF-8 characters below, though in this case the
2167 value of fc will always be < 128. */
2168 }
2169 else
2170 #endif /* SUPPORT_UTF8 */
2171
2172 /* When not in UTF-8 mode, load a single-byte character. */
2173 {
2174 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2175 fc = *ecode++;
2176 }
2177
2178 /* The value of fc at this point is always less than 256, though we may or
2179 may not be in UTF-8 mode. The code is duplicated for the caseless and
2180 caseful cases, for speed, since matching characters is likely to be quite
2181 common. First, ensure the minimum number of matches are present. If min =
2182 max, continue at the same level without recursing. Otherwise, if
2183 minimizing, keep trying the rest of the expression and advancing one
2184 matching character if failing, up to the maximum. Alternatively, if
2185 maximizing, find the maximum number of characters and work backwards. */
2186
2187 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2188 max, eptr));
2189
2190 if ((ims & PCRE_CASELESS) != 0)
2191 {
2192 fc = md->lcc[fc];
2193 for (i = 1; i <= min; i++)
2194 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2195 if (min == max) continue;
2196 if (minimize)
2197 {
2198 for (fi = min;; fi++)
2199 {
2200 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2201 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2202 if (fi >= max || eptr >= md->end_subject ||
2203 fc != md->lcc[*eptr++])
2204 RRETURN(MATCH_NOMATCH);
2205 }
2206 /* Control never gets here */
2207 }
2208 else /* Maximize */
2209 {
2210 pp = eptr;
2211 for (i = min; i < max; i++)
2212 {
2213 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2214 eptr++;
2215 }
2216 if (possessive) continue;
2217 while (eptr >= pp)
2218 {
2219 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2220 eptr--;
2221 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2222 }
2223 RRETURN(MATCH_NOMATCH);
2224 }
2225 /* Control never gets here */
2226 }
2227
2228 /* Caseful comparisons. Multi-byte UTF-8 characters never reach here: the length > 1 code above always returns or continues. */
2229
2230 else
2231 {
2232 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2233 if (min == max) continue;
2234 if (minimize)
2235 {
2236 for (fi = min;; fi++)
2237 {
2238 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2239 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2240 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2241 RRETURN(MATCH_NOMATCH);
2242 }
2243 /* Control never gets here */
2244 }
2245 else /* Maximize */
2246 {
2247 pp = eptr;
2248 for (i = min; i < max; i++)
2249 {
2250 if (eptr >= md->end_subject || fc != *eptr) break;
2251 eptr++;
2252 }
2253 if (possessive) continue;
2254 while (eptr >= pp)
2255 {
2256 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2257 eptr--;
2258 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2259 }
2260 RRETURN(MATCH_NOMATCH);
2261 }
2262 }
2263 /* Control never gets here */
2264
2265 /* Match a negated single character. The pattern character is always one
2266 byte, but the subject character being checked can be multibyte. */
2267
2268 case OP_NOT:
2269 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2270 ecode++;
2271 GETCHARINCTEST(c, eptr);
2272 if ((ims & PCRE_CASELESS) != 0)
2273 {
2274 #ifdef SUPPORT_UTF8
2275 if (c < 256)
2276 #endif
2277 c = md->lcc[c];
2278 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2279 }
2280 else
2281 {
2282 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2283 }
2284 break;
2285
2286 /* Match a negated single one-byte character repeatedly. This is almost a
2287 repeat of the code for a repeated single character, but I haven't found a
2288 nice way of commoning these up that doesn't require a test of the
2289 positive/negative option for each character match. Maybe that wouldn't add
2290 very much to the time taken, but character matching *is* what this is all
2291 about... */
2292
2293 case OP_NOTEXACT:
2294 min = max = GET2(ecode, 1);
2295 ecode += 3;
2296 goto REPEATNOTCHAR;
2297
2298 case OP_NOTUPTO:
2299 case OP_NOTMINUPTO:
2300 min = 0;
2301 max = GET2(ecode, 1);
2302 minimize = *ecode == OP_NOTMINUPTO;
2303 ecode += 3;
2304 goto REPEATNOTCHAR;
2305
2306 case OP_NOTPOSSTAR:
2307 possessive = TRUE;
2308 min = 0;
2309 max = INT_MAX;
2310 ecode++;
2311 goto REPEATNOTCHAR;
2312
2313 case OP_NOTPOSPLUS:
2314 possessive = TRUE;
2315 min = 1;
2316 max = INT_MAX;
2317 ecode++;
2318 goto REPEATNOTCHAR;
2319
2320 case OP_NOTPOSQUERY:
2321 possessive = TRUE;
2322 min = 0;
2323 max = 1;
2324 ecode++;
2325 goto REPEATNOTCHAR;
2326
2327 case OP_NOTPOSUPTO:
2328 possessive = TRUE;
2329 min = 0;
2330 max = GET2(ecode, 1);
2331 ecode += 3;
2332 goto REPEATNOTCHAR;
2333
2334 case OP_NOTSTAR:
2335 case OP_NOTMINSTAR:
2336 case OP_NOTPLUS:
2337 case OP_NOTMINPLUS:
2338 case OP_NOTQUERY:
2339 case OP_NOTMINQUERY:
2340 c = *ecode++ - OP_NOTSTAR;
2341 minimize = (c & 1) != 0;
2342 min = rep_min[c]; /* Pick up values from tables; */
2343 max = rep_max[c]; /* zero for max => infinity */
2344 if (max == 0) max = INT_MAX;
2345
2346 /* Common code for all repeated single-byte matches. We can give up quickly
2347 if there are fewer than the minimum number of bytes left in the
2348 subject. */
2349
2350 REPEATNOTCHAR:
2351 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2352 fc = *ecode++;
2353
2354 /* The code is duplicated for the caseless and caseful cases, for speed,
2355 since matching characters is likely to be quite common. First, ensure the
2356 minimum number of matches are present. If min = max, continue at the same
2357 level without recursing. Otherwise, if minimizing, keep trying the rest of
2358 the expression and advancing one matching character if failing, up to the
2359 maximum. Alternatively, if maximizing, find the maximum number of
2360 characters and work backwards. */
2361
2362 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2363 max, eptr));
2364
2365 if ((ims & PCRE_CASELESS) != 0)
2366 {
2367 fc = md->lcc[fc];
2368
2369 #ifdef SUPPORT_UTF8
2370 /* UTF-8 mode */
2371 if (utf8)
2372 {
2373 register unsigned int d;
2374 for (i = 1; i <= min; i++)
2375 {
2376 GETCHARINC(d, eptr);
2377 if (d < 256) d = md->lcc[d];
2378 if (fc == d) RRETURN(MATCH_NOMATCH);
2379 }
2380 }
2381 else
2382 #endif
2383
2384 /* Not UTF-8 mode */
2385 {
2386 for (i = 1; i <= min; i++)
2387 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2388 }
2389
2390 if (min == max) continue;
2391
2392 if (minimize)
2393 {
2394 #ifdef SUPPORT_UTF8
2395 /* UTF-8 mode */
2396 if (utf8)
2397 {
2398 register unsigned int d;
2399 for (fi = min;; fi++)
2400 {
2401 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2402 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2403 GETCHARINC(d, eptr);
2404 if (d < 256) d = md->lcc[d];
2405 if (fi >= max || eptr >= md->end_subject || fc == d)
2406 RRETURN(MATCH_NOMATCH);
2407 }
2408 }
2409 else
2410 #endif
2411 /* Not UTF-8 mode */
2412 {
2413 for (fi = min;; fi++)
2414 {
2415 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2416 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2417 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2418 RRETURN(MATCH_NOMATCH);
2419 }
2420 }
2421 /* Control never gets here */
2422 }
2423
2424 /* Maximize case */
2425
2426 else
2427 {
2428 pp = eptr;
2429
2430 #ifdef SUPPORT_UTF8
2431 /* UTF-8 mode */
2432 if (utf8)
2433 {
2434 register unsigned int d;
2435 for (i = min; i < max; i++)
2436 {
2437 int len = 1;
2438 if (eptr >= md->end_subject) break;
2439 GETCHARLEN(d, eptr, len);
2440 if (d < 256) d = md->lcc[d];
2441 if (fc == d) break;
2442 eptr += len;
2443 }
2444 if (possessive) continue;
2445 for(;;)
2446 {
2447 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2448 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2449 if (eptr-- == pp) break; /* Stop if tried at original pos */
2450 BACKCHAR(eptr);
2451 }
2452 }
2453 else
2454 #endif
2455 /* Not UTF-8 mode */
2456 {
2457 for (i = min; i < max; i++)
2458 {
2459 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2460 eptr++;
2461 }
2462 if (possessive) continue;
2463 while (eptr >= pp)
2464 {
2465 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2466 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2467 eptr--;
2468 }
2469 }
2470
2471 RRETURN(MATCH_NOMATCH);
2472 }
2473 /* Control never gets here */
2474 }
2475
2476 /* Caseful comparisons */
2477
2478 else
2479 {
2480 #ifdef SUPPORT_UTF8
2481 /* UTF-8 mode */
2482 if (utf8)
2483 {
2484 register unsigned int d;
2485 for (i = 1; i <= min; i++)
2486 {
2487 GETCHARINC(d, eptr);
2488 if (fc == d) RRETURN(MATCH_NOMATCH);
2489 }
2490 }
2491 else
2492 #endif
2493 /* Not UTF-8 mode */
2494 {
2495 for (i = 1; i <= min; i++)
2496 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2497 }
2498
2499 if (min == max) continue;
2500
2501 if (minimize)
2502 {
2503 #ifdef SUPPORT_UTF8
2504 /* UTF-8 mode */
2505 if (utf8)
2506 {
2507 register unsigned int d;
2508 for (fi = min;; fi++)
2509 {
2510 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2511 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2512 GETCHARINC(d, eptr);
2513 if (fi >= max || eptr >= md->end_subject || fc == d)
2514 RRETURN(MATCH_NOMATCH);
2515 }
2516 }
2517 else
2518 #endif
2519 /* Not UTF-8 mode */
2520 {
2521 for (fi = min;; fi++)
2522 {
2523 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2524 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2525 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2526 RRETURN(MATCH_NOMATCH);
2527 }
2528 }
2529 /* Control never gets here */
2530 }
2531
2532 /* Maximize case */
2533
2534 else
2535 {
2536 pp = eptr;
2537
2538 #ifdef SUPPORT_UTF8
2539 /* UTF-8 mode */
2540 if (utf8)
2541 {
2542 register unsigned int d;
2543 for (i = min; i < max; i++)
2544 {
2545 int len = 1;
2546 if (eptr >= md->end_subject) break;
2547 GETCHARLEN(d, eptr, len);
2548 if (fc == d) break;
2549 eptr += len;
2550 }
2551 if (possessive) continue;
2552 for(;;)
2553 {
2554 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2555 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2556 if (eptr-- == pp) break; /* Stop if tried at original pos */
2557 BACKCHAR(eptr);
2558 }
2559 }
2560 else
2561 #endif
2562 /* Not UTF-8 mode */
2563 {
2564 for (i = min; i < max; i++)
2565 {
2566 if (eptr >= md->end_subject || fc == *eptr) break;
2567 eptr++;
2568 }
2569 if (possessive) continue;
2570 while (eptr >= pp)
2571 {
2572 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2573 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2574 eptr--;
2575 }
2576 }
2577
2578 RRETURN(MATCH_NOMATCH);
2579 }
2580 }
2581 /* Control never gets here */
2582
2583 /* Match a single character type repeatedly; several different opcodes
2584 share code. This is very similar to the code for single characters, but we
2585 repeat it in the interests of efficiency. */
2586
2587 case OP_TYPEEXACT:
2588 min = max = GET2(ecode, 1);
2589 minimize = TRUE;
2590 ecode += 3;
2591 goto REPEATTYPE;
2592
2593 case OP_TYPEUPTO:
2594 case OP_TYPEMINUPTO:
2595 min = 0;
2596 max = GET2(ecode, 1);
2597 minimize = *ecode == OP_TYPEMINUPTO;
2598 ecode += 3;
2599 goto REPEATTYPE;
2600
2601 case OP_TYPEPOSSTAR:
2602 possessive = TRUE;
2603 min = 0;
2604 max = INT_MAX;
2605 ecode++;
2606 goto REPEATTYPE;
2607
2608 case OP_TYPEPOSPLUS:
2609 possessive = TRUE;
2610 min = 1;
2611 max = INT_MAX;
2612 ecode++;
2613 goto REPEATTYPE;
2614
2615 case OP_TYPEPOSQUERY:
2616 possessive = TRUE;
2617 min = 0;
2618 max = 1;
2619 ecode++;
2620 goto REPEATTYPE;
2621
2622 case OP_TYPEPOSUPTO:
2623 possessive = TRUE;
2624 min = 0;
2625 max = GET2(ecode, 1);
2626 ecode += 3;
2627 goto REPEATTYPE;
2628
2629 case OP_TYPESTAR:
2630 case OP_TYPEMINSTAR:
2631 case OP_TYPEPLUS:
2632 case OP_TYPEMINPLUS:
2633 case OP_TYPEQUERY:
2634 case OP_TYPEMINQUERY:
2635 c = *ecode++ - OP_TYPESTAR;
2636 minimize = (c & 1) != 0;
2637 min = rep_min[c]; /* Pick up values from tables; */
2638 max = rep_max[c]; /* zero for max => infinity */
2639 if (max == 0) max = INT_MAX;
2640
2641 /* Common code for all repeated single character type matches. Note that
2642 in UTF-8 mode, '.' matches a character of any length, but for the other
2643 character types, the valid characters are all one-byte long. */
2644
2645 REPEATTYPE:
2646 ctype = *ecode++; /* Code for the character type */
2647
2648 #ifdef SUPPORT_UCP
2649 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2650 {
2651 prop_fail_result = ctype == OP_NOTPROP;
2652 prop_type = *ecode++;
2653 prop_value = *ecode++;
2654 }
2655 else prop_type = -1;
2656 #endif
2657
2658 /* First, ensure the minimum number of matches are present. Use inline
2659 code for maximizing the speed, and do the type test once at the start
2660 (i.e. keep it out of the loop). Also we can test that there are at least
2661 the minimum number of bytes before we start. This isn't as effective in
2662 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2663 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2664 and single-bytes. */
2665
2666 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2667 if (min > 0)
2668 {
2669 #ifdef SUPPORT_UCP
2670 if (prop_type >= 0)
2671 {
2672 switch(prop_type)
2673 {
2674 case PT_ANY:
2675 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2676 for (i = 1; i <= min; i++)
2677 {
2678 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2679 GETCHARINC(c, eptr);
2680 }
2681 break;
2682
2683 case PT_LAMP:
2684 for (i = 1; i <= min; i++)
2685 {
2686 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2687 GETCHARINC(c, eptr);
2688 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2689 if ((prop_chartype == ucp_Lu ||
2690 prop_chartype == ucp_Ll ||
2691 prop_chartype == ucp_Lt) == prop_fail_result)
2692 RRETURN(MATCH_NOMATCH);
2693 }
2694 break;
2695
2696 case PT_GC:
2697 for (i = 1; i <= min; i++)
2698 {
2699 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2700 GETCHARINC(c, eptr);
2701 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2702 if ((prop_category == prop_value) == prop_fail_result)
2703 RRETURN(MATCH_NOMATCH);
2704 }
2705 break;
2706
2707 case PT_PC:
2708 for (i = 1; i <= min; i++)
2709 {
2710 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2711 GETCHARINC(c, eptr);
2712 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2713 if ((prop_chartype == prop_value) == prop_fail_result)
2714 RRETURN(MATCH_NOMATCH);
2715 }
2716 break;
2717
2718 case PT_SC:
2719 for (i = 1; i <= min; i++)
2720 {
2721 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2722 GETCHARINC(c, eptr);
2723 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2724 if ((prop_script == prop_value) == prop_fail_result)
2725 RRETURN(MATCH_NOMATCH);
2726 }
2727 break;
2728
2729 default:
2730 RRETURN(PCRE_ERROR_INTERNAL);
2731 }
2732 }
2733
2734 /* Match extended Unicode sequences. We will get here only if the
2735 support is in the binary; otherwise a compile-time error occurs. */
2736
2737 else if (ctype == OP_EXTUNI)
2738 {
2739 for (i = 1; i <= min; i++)
2740 {
2741 GETCHARINCTEST(c, eptr);
2742 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2743 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2744 while (eptr < md->end_subject)
2745 {
2746 int len = 1;
2747 if (!utf8) c = *eptr; else
2748 {
2749 GETCHARLEN(c, eptr, len);
2750 }
2751 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2752 if (prop_category != ucp_M) break;
2753 eptr += len;
2754 }
2755 }
2756 }
2757
2758 else
2759 #endif /* SUPPORT_UCP */
2760
2761 /* Handle all other cases when the coding is UTF-8 */
2762
2763 #ifdef SUPPORT_UTF8
2764 if (utf8) switch(ctype)
2765 {
2766 case OP_ANY:
2767 for (i = 1; i <= min; i++)
2768 {
2769 if (eptr >= md->end_subject ||
2770 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2771 RRETURN(MATCH_NOMATCH);
2772 eptr++;
2773 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2774 }
2775 break;
2776
2777 case OP_ANYBYTE:
2778 eptr += min;
2779 break;
2780
2781 case OP_ANYNL:
2782 for (i = 1; i <= min; i++)
2783 {
2784 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2785 GETCHARINC(c, eptr);
2786 switch(c)
2787 {
2788 default: RRETURN(MATCH_NOMATCH);
2789 case 0x000d:
2790 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2791 break;
2792 case 0x000a:
2793 case 0x000b:
2794 case 0x000c:
2795 case 0x0085:
2796 case 0x2028:
2797 case 0x2029:
2798 break;
2799 }
2800 }
2801 break;
2802
2803 case OP_NOT_DIGIT:
2804 for (i = 1; i <= min; i++)
2805 {
2806 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2807 GETCHARINC(c, eptr);
2808 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2809 RRETURN(MATCH_NOMATCH);
2810 }
2811 break;
2812
2813 case OP_DIGIT:
2814 for (i = 1; i <= min; i++)
2815 {
2816 if (eptr >= md->end_subject ||
2817 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2818 RRETURN(MATCH_NOMATCH);
2819 /* No need to skip more bytes - we know it's a 1-byte character */
2820 }
2821 break;
2822
2823 case OP_NOT_WHITESPACE:
2824 for (i = 1; i <= min; i++)
2825 {
2826 if (eptr >= md->end_subject ||
2827 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2828 RRETURN(MATCH_NOMATCH);
2829 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2830 }
2831 break;
2832
2833 case OP_WHITESPACE:
2834 for (i = 1; i <= min; i++)
2835 {
2836 if (eptr >= md->end_subject ||
2837 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2838 RRETURN(MATCH_NOMATCH);
2839 /* No need to skip more bytes - we know it's a 1-byte character */
2840 }
2841 break;
2842
2843 case OP_NOT_WORDCHAR:
2844 for (i = 1; i <= min; i++)
2845 {
2846 if (eptr >= md->end_subject ||
2847 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2848 RRETURN(MATCH_NOMATCH);
2849 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2850 }
2851 break;
2852
2853 case OP_WORDCHAR:
2854 for (i = 1; i <= min; i++)
2855 {
2856 if (eptr >= md->end_subject ||
2857 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2858 RRETURN(MATCH_NOMATCH);
2859 /* No need to skip more bytes - we know it's a 1-byte character */
2860 }
2861 break;
2862
2863 default:
2864 RRETURN(PCRE_ERROR_INTERNAL);
2865 } /* End switch(ctype) */
2866
2867 else
2868 #endif /* SUPPORT_UTF8 */
2869
2870 /* Code for the non-UTF-8 case for minimum matching of operators other
2871 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2872 number of bytes present, as this was tested above. */
2873
2874 switch(ctype)
2875 {
2876 case OP_ANY:
2877 if ((ims & PCRE_DOTALL) == 0)
2878 {
2879 for (i = 1; i <= min; i++)
2880 {
2881 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2882 eptr++;
2883 }
2884 }
2885 else eptr += min;
2886 break;
2887
2888 case OP_ANYBYTE:
2889 eptr += min;
2890 break;
2891
2892 /* Because of the CRLF case, we can't assume the minimum number of
2893 bytes are present in this case. */
2894
2895 case OP_ANYNL:
2896 for (i = 1; i <= min; i++)
2897 {
2898 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2899 switch(*eptr++)
2900 {
2901 default: RRETURN(MATCH_NOMATCH);
2902 case 0x000d:
2903 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2904 break;
2905 case 0x000a:
2906 case 0x000b:
2907 case 0x000c:
2908 case 0x0085:
2909 break;
2910 }
2911 }
2912 break;
2913
2914 case OP_NOT_DIGIT:
2915 for (i = 1; i <= min; i++)
2916 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2917 break;
2918
2919 case OP_DIGIT:
2920 for (i = 1; i <= min; i++)
2921 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2922 break;
2923
2924 case OP_NOT_WHITESPACE:
2925 for (i = 1; i <= min; i++)
2926 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2927 break;
2928
2929 case OP_WHITESPACE:
2930 for (i = 1; i <= min; i++)
2931 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2932 break;
2933
2934 case OP_NOT_WORDCHAR:
2935 for (i = 1; i <= min; i++)
2936 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2937 RRETURN(MATCH_NOMATCH);
2938 break;
2939
2940 case OP_WORDCHAR:
2941 for (i = 1; i <= min; i++)
2942 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2943 RRETURN(MATCH_NOMATCH);
2944 break;
2945
2946 default:
2947 RRETURN(PCRE_ERROR_INTERNAL);
2948 }
2949 }
2950
2951 /* If min = max, continue at the same level without recursing */
2952
2953 if (min == max) continue;
2954
2955 /* If minimizing, we have to test the rest of the pattern before each
2956 subsequent match. Again, separate the UTF-8 case for speed, and also
2957 separate the UCP cases. */
2958
2959 if (minimize)
2960 {
2961 #ifdef SUPPORT_UCP
2962 if (prop_type >= 0)
2963 {
2964 switch(prop_type)
2965 {
2966 case PT_ANY:
2967 for (fi = min;; fi++)
2968 {
2969 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
2970 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2971 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2972 GETCHARINC(c, eptr);
2973 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2974 }
2975 /* Control never gets here */
2976
2977 case PT_LAMP:
2978 for (fi = min;; fi++)
2979 {
2980 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
2981 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2982 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2983 GETCHARINC(c, eptr);
2984 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2985 if ((prop_chartype == ucp_Lu ||
2986 prop_chartype == ucp_Ll ||
2987 prop_chartype == ucp_Lt) == prop_fail_result)
2988 RRETURN(MATCH_NOMATCH);
2989 }
2990 /* Control never gets here */
2991
2992 case PT_GC:
2993 for (fi = min;; fi++)
2994 {
2995 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
2996 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2997 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2998 GETCHARINC(c, eptr);
2999 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3000 if ((prop_category == prop_value) == prop_fail_result)
3001 RRETURN(MATCH_NOMATCH);
3002 }
3003 /* Control never gets here */
3004
3005 case PT_PC:
3006 for (fi = min;; fi++)
3007 {
3008 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3010 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3011 GETCHARINC(c, eptr);
3012 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3013 if ((prop_chartype == prop_value) == prop_fail_result)
3014 RRETURN(MATCH_NOMATCH);
3015 }
3016 /* Control never gets here */
3017
3018 case PT_SC:
3019 for (fi = min;; fi++)
3020 {
3021 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3022 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3023 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3024 GETCHARINC(c, eptr);
3025 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3026 if ((prop_script == prop_value) == prop_fail_result)
3027 RRETURN(MATCH_NOMATCH);
3028 }
3029 /* Control never gets here */
3030
3031 default:
3032 RRETURN(PCRE_ERROR_INTERNAL);
3033 }
3034 }
3035
3036 /* Match extended Unicode sequences. We will get here only if the
3037 support is in the binary; otherwise a compile-time error occurs. */
3038
3039 else if (ctype == OP_EXTUNI)
3040 {
3041 for (fi = min;; fi++)
3042 {
3043 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3044 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3045 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3046 GETCHARINCTEST(c, eptr);
3047 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3048 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3049 while (eptr < md->end_subject)
3050 {
3051 int len = 1;
3052 if (!utf8) c = *eptr; else
3053 {
3054 GETCHARLEN(c, eptr, len);
3055 }
3056 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3057 if (prop_category != ucp_M) break;
3058 eptr += len;
3059 }
3060 }
3061 }
3062
3063 else
3064 #endif /* SUPPORT_UCP */
3065
3066 #ifdef SUPPORT_UTF8
3067 /* UTF-8 mode */
3068 if (utf8)
3069 {
3070 for (fi = min;; fi++)
3071 {
3072 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3073 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3074 if (fi >= max || eptr >= md->end_subject ||
3075 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3076 IS_NEWLINE(eptr)))
3077 RRETURN(MATCH_NOMATCH);
3078
3079 GETCHARINC(c, eptr);
3080 switch(ctype)
3081 {
3082 case OP_ANY: /* This is the DOTALL case */
3083 break;
3084
3085 case OP_ANYBYTE:
3086 break;
3087
3088 case OP_ANYNL:
3089 switch(c)
3090 {
3091 default: RRETURN(MATCH_NOMATCH);
3092 case 0x000d:
3093 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3094 break;
3095 case 0x000a:
3096 case 0x000b:
3097 case 0x000c:
3098 case 0x0085:
3099 case 0x2028:
3100 case 0x2029:
3101 break;
3102 }
3103 break;
3104
3105 case OP_NOT_DIGIT:
3106 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3107 RRETURN(MATCH_NOMATCH);
3108 break;
3109
3110 case OP_DIGIT:
3111 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3112 RRETURN(MATCH_NOMATCH);
3113 break;
3114
3115 case OP_NOT_WHITESPACE:
3116 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3117 RRETURN(MATCH_NOMATCH);
3118 break;
3119
3120 case OP_WHITESPACE:
3121 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3122 RRETURN(MATCH_NOMATCH);
3123 break;
3124
3125 case OP_NOT_WORDCHAR:
3126 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3127 RRETURN(MATCH_NOMATCH);
3128 break;
3129
3130 case OP_WORDCHAR:
3131 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3132 RRETURN(MATCH_NOMATCH);
3133 break;
3134
3135 default:
3136 RRETURN(PCRE_ERROR_INTERNAL);
3137 }
3138 }
3139 }
3140 else
3141 #endif
3142 /* Not UTF-8 mode */
3143 {
3144 for (fi = min;; fi++)
3145 {
3146 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3147 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3148 if (fi >= max || eptr >= md->end_subject ||
3149 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3150 RRETURN(MATCH_NOMATCH);
3151
3152 c = *eptr++;
3153 switch(ctype)
3154 {
3155 case OP_ANY: /* This is the DOTALL case */
3156 break;
3157
3158 case OP_ANYBYTE:
3159 break;
3160
3161 case OP_ANYNL:
3162 switch(c)
3163 {
3164 default: RRETURN(MATCH_NOMATCH);
3165 case 0x000d:
3166 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3167 break;
3168 case 0x000a:
3169 case 0x000b:
3170 case 0x000c:
3171 case 0x0085:
3172 break;
3173 }
3174 break;
3175
3176 case OP_NOT_DIGIT:
3177 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3178 break;
3179
3180 case OP_DIGIT:
3181 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3182 break;
3183
3184 case OP_NOT_WHITESPACE:
3185 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3186 break;
3187
3188 case OP_WHITESPACE:
3189 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3190 break;
3191
3192 case OP_NOT_WORDCHAR:
3193 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3194 break;
3195
3196 case OP_WORDCHAR:
3197 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3198 break;
3199
3200 default:
3201 RRETURN(PCRE_ERROR_INTERNAL);
3202 }
3203 }
3204 }
3205 /* Control never gets here */
3206 }
3207
3208 /* If maximizing, it is worth using inline code for speed, doing the type
3209 test once at the start (i.e. keep it out of the loop). Again, keep the
3210 UTF-8 and UCP stuff separate. */
3211
3212 else
3213 {
3214 pp = eptr; /* Remember where we started */
3215
3216 #ifdef SUPPORT_UCP
3217 if (prop_type >= 0)
3218 {
3219 switch(prop_type)
3220 {
3221 case PT_ANY:
3222 for (i = min; i < max; i++)
3223 {
3224 int len = 1;
3225 if (eptr >= md->end_subject) break;
3226 GETCHARLEN(c, eptr, len);
3227 if (prop_fail_result) break;
3228 eptr+= len;
3229 }
3230 break;
3231
3232 case PT_LAMP:
3233 for (i = min; i < max; i++)
3234 {
3235 int len = 1;
3236 if (eptr >= md->end_subject) break;
3237 GETCHARLEN(c, eptr, len);
3238 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3239 if ((prop_chartype == ucp_Lu ||
3240 prop_chartype == ucp_Ll ||
3241 prop_chartype == ucp_Lt) == prop_fail_result)
3242 break;
3243 eptr+= len;
3244 }
3245 break;
3246
3247 case PT_GC:
3248 for (i = min; i < max; i++)
3249 {
3250 int len = 1;
3251 if (eptr >= md->end_subject) break;
3252 GETCHARLEN(c, eptr, len);
3253 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3254 if ((prop_category == prop_value) == prop_fail_result)
3255 break;
3256 eptr+= len;
3257 }
3258 break;
3259
3260 case PT_PC:
3261 for (i = min; i < max; i++)
3262 {
3263 int len = 1;
3264 if (eptr >= md->end_subject) break;
3265 GETCHARLEN(c, eptr, len);
3266 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3267 if ((prop_chartype == prop_value) == prop_fail_result)
3268 break;
3269 eptr+= len;
3270 }
3271 break;
3272
3273 case PT_SC:
3274 for (i = min; i < max; i++)
3275 {
3276 int len = 1;
3277 if (eptr >= md->end_subject) break;
3278 GETCHARLEN(c, eptr, len);
3279 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3280 if ((prop_script == prop_value) == prop_fail_result)
3281 break;
3282 eptr+= len;
3283 }
3284 break;
3285 }
3286
3287 /* eptr is now past the end of the maximum run */
3288
3289 if (possessive) continue;
3290 for(;;)
3291 {
3292 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3293 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3294 if (eptr-- == pp) break; /* Stop if tried at original pos */
3295 BACKCHAR(eptr);
3296 }
3297 }
3298
3299 /* Match extended Unicode sequences. We will get here only if the
3300 support is in the binary; otherwise a compile-time error occurs. */
3301
3302 else if (ctype == OP_EXTUNI)
3303 {
3304 for (i = min; i < max; i++)
3305 {
3306 if (eptr >= md->end_subject) break;
3307 GETCHARINCTEST(c, eptr);
3308 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3309 if (prop_category == ucp_M) break;
3310 while (eptr < md->end_subject)
3311 {
3312 int len = 1;
3313 if (!utf8) c = *eptr; else
3314 {
3315 GETCHARLEN(c, eptr, len);
3316 }
3317 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3318 if (prop_category != ucp_M) break;
3319 eptr += len;
3320 }
3321 }
3322
3323 /* eptr is now past the end of the maximum run */
3324
3325 if (possessive) continue;
3326 for(;;)
3327 {
3328 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3329 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3330 if (eptr-- == pp) break; /* Stop if tried at original pos */
3331 for (;;) /* Move back over one extended */
3332 {
3333 int len = 1;
3334 BACKCHAR(eptr);
3335 if (!utf8) c = *eptr; else
3336 {
3337 GETCHARLEN(c, eptr, len);
3338 }
3339 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3340 if (prop_category != ucp_M) break;
3341 eptr--;
3342 }
3343 }
3344 }
3345
3346 else
3347 #endif /* SUPPORT_UCP */
3348
3349 #ifdef SUPPORT_UTF8
3350 /* UTF-8 mode */
3351
3352 if (utf8)
3353 {
3354 switch(ctype)
3355 {
3356 case OP_ANY:
3357
3358 /* Special code is required for UTF8, but when the maximum is
3359 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3360 probably worth it, because .* is quite a common idiom. */
3361
3362 if (max < INT_MAX)
3363 {
3364 if ((ims & PCRE_DOTALL) == 0)
3365 {
3366 for (i = min; i < max; i++)
3367 {
3368 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3369 eptr++;
3370 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3371 }
3372 }
3373 else
3374 {
3375 for (i = min; i < max; i++)
3376 {
3377 if (eptr >= md->end_subject) break;
3378 eptr++;
3379 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3380 }
3381 }
3382 }
3383
3384 /* Handle unlimited UTF-8 repeat */
3385
3386 else
3387 {
3388 if ((ims & PCRE_DOTALL) == 0)
3389 {
3390 for (i = min; i < max; i++)
3391 {
3392 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3393 eptr++;
3394 }
3395 break;
3396 }
3397 else
3398 {
3399 c = max - min;
3400 if (c > (unsigned int)(md->end_subject - eptr))
3401 c = md->end_subject - eptr;
3402 eptr += c;
3403 }
3404 }
3405 break;
3406
3407 /* The byte case is the same as non-UTF8 */
3408
3409 case OP_ANYBYTE:
3410 c = max - min;
3411 if (c > (unsigned int)(md->end_subject - eptr))
3412 c = md->end_subject - eptr;
3413 eptr += c;
3414 break;
3415
3416 case OP_ANYNL:
3417 for (i = min; i < max; i++)
3418 {
3419 int len = 1;
3420 if (eptr >= md->end_subject) break;
3421 GETCHARLEN(c, eptr, len);
3422 if (c == 0x000d)
3423 {
3424 if (++eptr >= md->end_subject) break;
3425 if (*eptr == 0x000a) eptr++;
3426 }
3427 else
3428 {
3429 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3430 c != 0x0085 && c != 0x2028 && c != 0x2029)
3431 break;
3432 eptr += len;
3433 }
3434 }
3435 break;
3436
3437 case OP_NOT_DIGIT:
3438 for (i = min; i < max; i++)
3439 {
3440 int len = 1;
3441 if (eptr >= md->end_subject) break;
3442 GETCHARLEN(c, eptr, len);
3443 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3444 eptr+= len;
3445 }
3446 break;
3447
3448 case OP_DIGIT:
3449 for (i = min; i < max; i++)
3450 {
3451 int len = 1;
3452 if (eptr >= md->end_subject) break;
3453 GETCHARLEN(c, eptr, len);
3454 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3455 eptr+= len;
3456 }
3457 break;
3458
3459 case OP_NOT_WHITESPACE:
3460 for (i = min; i < max; i++)
3461 {
3462 int len = 1;
3463 if (eptr >= md->end_subject) break;
3464 GETCHARLEN(c, eptr, len);
3465 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3466 eptr+= len;
3467 }
3468 break;
3469
3470 case OP_WHITESPACE:
3471 for (i = min; i < max; i++)
3472 {
3473 int len = 1;
3474 if (eptr >= md->end_subject) break;
3475 GETCHARLEN(c, eptr, len);
3476 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3477 eptr+= len;
3478 }
3479 break;
3480
3481 case OP_NOT_WORDCHAR:
3482 for (i = min; i < max; i++)
3483 {
3484 int len = 1;
3485 if (eptr >= md->end_subject) break;
3486 GETCHARLEN(c, eptr, len);
3487 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3488 eptr+= len;
3489 }
3490 break;
3491
3492 case OP_WORDCHAR:
3493 for (i = min; i < max; i++)
3494 {
3495 int len = 1;
3496 if (eptr >= md->end_subject) break;
3497 GETCHARLEN(c, eptr, len);
3498 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3499 eptr+= len;
3500 }
3501 break;
3502
3503 default:
3504 RRETURN(PCRE_ERROR_INTERNAL);
3505 }
3506
3507 /* eptr is now past the end of the maximum run */
3508
3509 if (possessive) continue;
3510 for(;;)
3511 {
3512 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
3513 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3514 if (eptr-- == pp) break; /* Stop if tried at original pos */
3515 BACKCHAR(eptr);
3516 }
3517 }
3518 else
3519 #endif
3520
3521 /* Not UTF-8 mode */
3522 {
3523 switch(ctype)
3524 {
3525 case OP_ANY:
3526 if ((ims & PCRE_DOTALL) == 0)
3527 {
3528 for (i = min; i < max; i++)
3529 {
3530 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3531 eptr++;
3532 }
3533 break;
3534 }
3535 /* For DOTALL case, fall through and treat as \C */
3536
3537 case OP_ANYBYTE:
3538 c = max - min;
3539 if (c > (unsigned int)(md->end_subject - eptr))
3540 c = md->end_subject - eptr;
3541 eptr += c;
3542 break;
3543
3544 case OP_ANYNL:
3545 for (i = min; i < max; i++)
3546 {
3547 if (eptr >= md->end_subject) break;
3548 c = *eptr;
3549 if (c == 0x000d)
3550 {
3551 if (++eptr >= md->end_subject) break;
3552 if (*eptr == 0x000a) eptr++;
3553 }
3554 else
3555 {
3556 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3557 break;
3558 eptr++;
3559 }
3560 }
3561 break;
3562
3563 case OP_NOT_DIGIT:
3564 for (i = min; i < max; i++)
3565 {
3566 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3567 break;
3568 eptr++;
3569 }
3570 break;
3571
3572 case OP_DIGIT:
3573 for (i = min; i < max; i++)
3574 {
3575 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3576 break;
3577 eptr++;
3578 }
3579 break;
3580
3581 case OP_NOT_WHITESPACE:
3582 for (i = min; i < max; i++)
3583 {
3584 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3585 break;
3586 eptr++;
3587 }
3588 break;
3589
3590 case OP_WHITESPACE:
3591 for (i = min; i < max; i++)
3592 {
3593 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3594 break;
3595 eptr++;
3596 }
3597 break;
3598
3599 case OP_NOT_WORDCHAR:
3600 for (i = min; i < max; i++)
3601 {
3602 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3603 break;
3604 eptr++;
3605 }
3606 break;
3607
3608 case OP_WORDCHAR:
3609 for (i = min; i < max; i++)
3610 {
3611 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3612 break;
3613 eptr++;
3614 }
3615 break;
3616
3617 default:
3618 RRETURN(PCRE_ERROR_INTERNAL);
3619 }
3620
3621 /* eptr is now past the end of the maximum run */
3622
3623 if (possessive) continue;
3624 while (eptr >= pp)
3625 {
3626 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
3627 eptr--;
3628 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3629 }
3630 }
3631
3632 /* Get here if we can't make it match with any permitted repetitions */
3633
3634 RRETURN(MATCH_NOMATCH);
3635 }
3636 /* Control never gets here */
3637
3638 /* There's been some horrible disaster. Arrival here can only mean there is
3639 something seriously wrong in the code above or the OP_xxx definitions. */
3640
3641 default:
3642 DPRINTF(("Unknown opcode %d\n", *ecode));
3643 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3644 }
3645
3646 /* Do not stick any code in here without much thought; it is assumed
3647 that "continue" in the code above comes out to here to repeat the main
3648 loop. */
3649
3650 } /* End of main loop */
3651 /* Control never reaches here */
3652
3653
3654 /* When compiling to use the heap rather than the stack for recursive calls to
3655 match(), the RRETURN() macro jumps here. The number that is saved in
3656 frame->Xwhere indicates which label we actually want to return to. */
3657
3658 #ifdef NO_RECURSE
3659 #define LBL(val) case val: goto L_RM##val;
3660 HEAP_RETURN:
3661 switch (frame->Xwhere)
3662 {
3663 LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
3664 LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
3665 LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
3666 LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
3667 LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)
3668 LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47)
3669 default:
3670 DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
3671 return PCRE_ERROR_INTERNAL;
3672 }
3673 #undef LBL
3674 #endif /* NO_RECURSE */
3675 }
3676
3677
3678 /***************************************************************************
3679 ****************************************************************************
3680 RECURSION IN THE match() FUNCTION
3681
3682 Undefine all the macros that were defined above to handle this.
This matters because several of these names (for example length, ims, flags
and pp) are used again as ordinary parameters or local variables in
pcre_exec() further down this file.

The first group of #undef lines is compiled only when NO_RECURSE is defined.
According to the comment at the HEAP_RETURN label in match(), that is the
build in which the heap rather than the stack is used for recursive calls to
match(). The macro definitions themselves are outside this part of the file. */
3683
3684 #ifdef NO_RECURSE
3685 #undef eptr
3686 #undef ecode
3687 #undef offset_top
3688 #undef ims
3689 #undef eptrb
3690 #undef flags
3691
3692 #undef callpat
3693 #undef charptr
3694 #undef data
3695 #undef next
3696 #undef pp
3697 #undef prev
3698 #undef saved_eptr
3699
3700 #undef new_recursive
3701
3702 #undef cur_is_word
3703 #undef condition
3704 #undef prev_is_word
3705
3706 #undef original_ims
3707
3708 #undef ctype
3709 #undef length
3710 #undef max
3711 #undef min
3712 #undef number
3713 #undef offset
3714 #undef op
3715 #undef save_capture_last
3716 #undef save_offset1
3717 #undef save_offset2
3718 #undef save_offset3
3719 #undef stacksave
3720
3721 #undef newptrb
3722
3723 #endif
3724
3725 /* These two are defined as macros in both cases, that is, whether or not
NO_RECURSE is defined, so they are removed unconditionally. */
3726
3727 #undef fc
3728 #undef fi
3729
3730 /***************************************************************************
3731 ***************************************************************************/
3732
3733
3734
3735 /*************************************************
3736 * Execute a Regular Expression *
3737 *************************************************/
3738
3739 /* This function applies a compiled re to a subject string and picks out
3740 portions of the string if it matches. Two elements in the vector are set for
3741 each substring: the offsets to the start and end of the substring.
3742
3743 Arguments:
3744 argument_re points to the compiled expression
3745 extra_data points to extra data or is NULL
3746 subject points to the subject string
3747 length length of subject string (may contain binary zeros)
3748 start_offset where to start in the subject string
3749 options option bits
3750 offsets points to a vector of ints to be filled in with offsets
3751 offsetcount the number of elements in the vector
3752
3753 Returns: > 0 => success; value is the number of elements filled in
3754 = 0 => success, but offsets is not big enough
3755 -1 => failed to match
3756 < -1 => some kind of unexpected problem

Error returns that are generated directly in this function:
PCRE_ERROR_BADOPTION options has bits outside PUBLIC_EXEC_OPTIONS
PCRE_ERROR_NULL argument_re or subject is NULL, or offsets is NULL
while offsetcount is greater than zero
PCRE_ERROR_BADCOUNT offsetcount is negative
PCRE_ERROR_BADMAGIC the magic number is wrong and _pcre_try_flipped()
returned NULL
PCRE_ERROR_BADNEWLINE the newline option bits are not one of the
combinations handled by the switch below
PCRE_ERROR_BADPARTIAL PCRE_PARTIAL was requested but the compiled pattern
has PCRE_NOPARTIAL set
PCRE_ERROR_BADUTF8 (SUPPORT_UTF8 builds, UTF-8 pattern, check not
disabled by PCRE_NO_UTF8_CHECK) _pcre_valid_utf8()
returned a non-negative value
PCRE_ERROR_BADUTF8_OFFSET (same conditions) start_offset is strictly between 0
and length and addresses a byte of the form 10xxxxxx
PCRE_ERROR_NOMEMORY pcre_malloc() failed for the temporary offsets vector
PCRE_ERROR_PARTIAL no match, and both md->partial and md->hitend are set
PCRE_ERROR_NOMATCH no match otherwise
Any other value that match() returns is passed back unchanged.

Outline of the processing, in the order it appears below:
(1) plausibility checks on the arguments;
(2) optional data is taken from extra_data and the character tables chosen;
(3) the magic number is checked, trying a byte-flipped copy if it is wrong;
(4) the match_data block is filled in, including the newline convention;
(5) partial-matching and UTF-8 checks;
(6) the offsets vector is set up, using temporary memory if the one supplied
cannot hold all the back references;
(7) the first-byte, start-bits and required-byte optimizations are set up;
(8) the "bumpalong" loop calls match() at successive start positions;
(9) the result is converted into the return value.

NOTE(review): start_offset is not range-checked against length anywhere in
this function; it is used as given to form start_match. Presumably the caller
must keep it within 0..length - confirm.
3757 */
3758
3759 PCRE_EXP_DEFN int
3760 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3761 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3762 int offsetcount)
3763 {
3764 int rc, resetcount, ocount;
3765 int first_byte = -1;
3766 int req_byte = -1;
3767 int req_byte2 = -1;
3768 int newline;
3769 unsigned long int ims;
3770 BOOL using_temporary_offsets = FALSE;
3771 BOOL anchored;
3772 BOOL startline;
3773 BOOL firstline;
3774 BOOL first_byte_caseless = FALSE;
3775 BOOL req_byte_caseless = FALSE;
3776 BOOL utf8;
3777 match_data match_block;
3778 match_data *md = &match_block;
3779 const uschar *tables;
3780 const uschar *start_bits = NULL;
3781 USPTR start_match = (USPTR)subject + start_offset;
3782 USPTR end_subject;
3783 USPTR req_byte_ptr = start_match - 1; /* Lower than any search start, so the first required-byte search is never skipped */
3784 eptrblock eptrchain[EPTR_WORK_SIZE];
3785
3786 pcre_study_data internal_study;
3787 const pcre_study_data *study;
3788
3789 real_pcre internal_re;
3790 const real_pcre *external_re = (const real_pcre *)argument_re;
3791 const real_pcre *re = external_re;
3792
3793 /* Plausibility checks */
3794
3795 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3796 if (re == NULL || subject == NULL ||
3797 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3798 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3799
3800 /* Fish out the optional data from the extra_data structure, first setting
3801 the default values. */
3802
3803 study = NULL;
3804 md->match_limit = MATCH_LIMIT;
3805 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3806 md->callout_data = NULL;
3807
3808 /* The table pointer is always in native byte order. */
3809
3810 tables = external_re->tables;
3811
3812 if (extra_data != NULL)
3813 {
3814 register unsigned int flags = extra_data->flags;
3815 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3816 study = (const pcre_study_data *)extra_data->study_data;
3817 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3818 md->match_limit = extra_data->match_limit;
3819 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3820 md->match_limit_recursion = extra_data->match_limit_recursion;
3821 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3822 md->callout_data = extra_data->callout_data;
3823 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3824 }
3825
3826 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3827 is a feature that makes it possible to save compiled regex and re-use them
3828 in other programs later. */
3829
3830 if (tables == NULL) tables = _pcre_default_tables;
3831
3832 /* Check that the first field in the block is the magic number. If it is not,
3833 test for a regex that was compiled on a host of opposite endianness. If this is
3834 the case, flipped values are put in internal_re and internal_study if there was
3835 study data too. */
3836
3837 if (re->magic_number != MAGIC_NUMBER)
3838 {
3839 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3840 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3841 if (study != NULL) study = &internal_study;
3842 }
3843
3844 /* Set up other data */
3845
3846 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3847 startline = (re->options & PCRE_STARTLINE) != 0;
3848 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3849
3850 /* The code starts after the real_pcre block and the capture name table.
Note that the base address is external_re (the block the caller passed in),
while the offset and sizes are read through re, which after the magic number
test above may point at the flipped copy in internal_re. */
3851
3852 md->start_code = (const uschar *)external_re + re->name_table_offset +
3853 re->name_count * re->name_entry_size;
3854
3855 md->start_subject = (USPTR)subject;
3856 md->start_offset = start_offset;
3857 md->end_subject = md->start_subject + length;
3858 end_subject = md->end_subject;
3859
3860 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3861 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3862
3863 md->notbol = (options & PCRE_NOTBOL) != 0;
3864 md->noteol = (options & PCRE_NOTEOL) != 0;
3865 md->notempty = (options & PCRE_NOTEMPTY) != 0;
3866 md->partial = (options & PCRE_PARTIAL) != 0;
3867 md->hitend = FALSE;
3868
3869 md->recursive = NULL; /* No recursion at top level */
3870 md->eptrchain = eptrchain; /* Make workspace generally available */
3871
3872 md->lcc = tables + lcc_offset;
3873 md->ctypes = tables + ctypes_offset;
3874
3875 /* Handle different types of newline. The three bits give eight cases. If
3876 nothing is set at run time, whatever was used at compile time applies.
The variable newline receives a single character value, a two-character value
with the first character in the upper byte, or the negative markers -1 (any)
and -2 (anycrlf), which are decoded into md->nltype just below. */
3877
3878 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3879 PCRE_NEWLINE_BITS)
3880 {
3881 case 0: newline = NEWLINE; break; /* Compile-time default */
3882 case PCRE_NEWLINE_CR: newline = '\r'; break;
3883 case PCRE_NEWLINE_LF: newline = '\n'; break;
3884 case PCRE_NEWLINE_CR+
3885 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3886 case PCRE_NEWLINE_ANY: newline = -1; break;
3887 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3888 default: return PCRE_ERROR_BADNEWLINE;
3889 }
3890
3891 if (newline == -2)
3892 {
3893 md->nltype = NLTYPE_ANYCRLF;
3894 }
3895 else if (newline < 0)
3896 {
3897 md->nltype = NLTYPE_ANY;
3898 }
3899 else
3900 {
3901 md->nltype = NLTYPE_FIXED;
3902 if (newline > 255)
3903 {
3904 md->nllen = 2;
3905 md->nl[0] = (newline >> 8) & 255;
3906 md->nl[1] = newline & 255;
3907 }
3908 else
3909 {
3910 md->nllen = 1;
3911 md->nl[0] = newline;
3912 }
3913 }
3914
3915 /* Partial matching is supported only for a restricted set of regexes at the
3916 moment. */
3917
3918 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3919 return PCRE_ERROR_BADPARTIAL;
3920
3921 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3922 back the character offset. The start offset is also checked: a byte above 127
whose top two bits are 10 is rejected, which is the same bit pattern that the
bumpalong loop below skips over when advancing in UTF-8 mode. */
3923
3924 #ifdef SUPPORT_UTF8
3925 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3926 {
3927 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3928 return PCRE_ERROR_BADUTF8;
3929 if (start_offset > 0 && start_offset < length)
3930 {
3931 int tb = ((uschar *)subject)[start_offset];
3932 if (tb > 127)
3933 {
3934 tb &= 0xc0;
3935 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3936 }
3937 }
3938 }
3939 #endif
3940
3941 /* The ims options can vary during the matching as a result of the presence
3942 of (?ims) items in the pattern. They are kept in a local variable so that
3943 restoring at the exit of a group is easy. */
3944
3945 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3946
3947 /* If the expression has got more back references than the offsets supplied can
3948 hold, we get a temporary chunk of working store to use during the matching.
3949 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3950 of 3. From this point on, every exit path must free the temporary vector when
using_temporary_offsets is set; both exits after the main loop do so. */
3951
3952 ocount = offsetcount - (offsetcount % 3);
3953
3954 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3955 {
3956 ocount = re->top_backref * 3 + 3;
3957 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3958 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3959 using_temporary_offsets = TRUE;
3960 DPRINTF(("Got memory to hold back references\n"));
3961 }
3962 else md->offset_vector = offsets;
3963
3964 md->offset_end = ocount;
3965 md->offset_max = (2*ocount)/3; /* Two-thirds of the (rounded) vector size */
3966 md->offset_overflow = FALSE;
3967 md->capture_last = -1;
3968
3969 /* Compute the minimum number of offsets that we need to reset each time. Doing
3970 this makes a huge difference to execution time when there aren't many brackets
3971 in the pattern. */
3972
3973 resetcount = 2 + re->top_bracket * 2;
/* NOTE(review): the test is against offsetcount (the size the caller gave) but
the clamp is to ocount, which by now may be the size of the temporary vector -
confirm that this asymmetry is intended. */
3974 if (resetcount > offsetcount) resetcount = ocount;
3975
3976 /* Reset the working variable associated with each extraction. These should
3977 never be used unless previously set, but they get saved and restored, and so we
3978 initialize them to avoid reading uninitialized locations. The entries that are
written here are at the top end of the vector, working downwards from
offset_vector + ocount. */
3979
3980 if (md->offset_vector != NULL)
3981 {
3982 register int *iptr = md->offset_vector + ocount;
3983 register int *iend = iptr - resetcount/2 + 1;
3984 while (--iptr >= iend) *iptr = -1;
3985 }
3986
3987 /* Set up the first character to match, if available. The first_byte value is
3988 never set for an anchored regular expression, but the anchoring may be forced
3989 at run time, so we have to test for anchoring. The first char may be unset for
3990 an unanchored pattern, of course. If there's no first char and the pattern was
3991 studied, there may be a bitmap of possible first characters. */
3992
3993 if (!anchored)
3994 {
3995 if ((re->options & PCRE_FIRSTSET) != 0)
3996 {
3997 first_byte = re->first_byte & 255;
3998 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3999 first_byte = md->lcc[first_byte];
4000 }
4001 else
4002 if (!startline && study != NULL &&
4003 (study->options & PCRE_STUDY_MAPPED) != 0)
4004 start_bits = study->start_bits;
4005 }
4006
4007 /* For anchored or unanchored matches, there may be a "last known required
4008 character" set. */
4009
4010 if ((re->options & PCRE_REQCHSET) != 0)
4011 {
4012 req_byte = re->req_byte & 255;
4013 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4014 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4015 }
4016
4017
4018 /* ==========================================================================*/
4019
4020 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4021 the loop runs just once. */
4022
4023 for(;;)
4024 {
4025 USPTR save_end_subject = end_subject;
4026
4027 /* Reset the maximum number of extractions we might see. */
4028
4029 if (md->offset_vector != NULL)
4030 {
4031 register int *iptr = md->offset_vector;
4032 register int *iend = iptr + resetcount;
4033 while (iptr < iend) *iptr++ = -1;
4034 }
4035
4036 /* Advance to a unique first char if possible. If firstline is TRUE, the
4037 start of the match is constrained to the first line of a multiline string.
4038 That is, the match must be before or at the first newline. Implement this by
4039 temporarily adjusting end_subject so that we stop scanning at a newline. If
4040 the match fails at the newline, later code breaks this loop. */
4041
4042 if (firstline)
4043 {
4044 USPTR t = start_match;
4045 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4046 end_subject = t;
4047 }
4048
4049 /* Now test for a unique first byte */
4050
4051 if (first_byte >= 0)
4052 {
4053 if (first_byte_caseless)
4054 while (start_match < end_subject &&
4055 md->lcc[*start_match] != first_byte)
4056 start_match++;
4057 else
4058 while (start_match < end_subject && *start_match != first_byte)
4059 start_match++;
4060 }
4061
4062 /* Or to just after a linebreak for a multiline match if possible. The search
is skipped while start_match is still at the original starting offset, so the
first attempt is always made at that position. */
4063
4064 else if (startline)
4065 {
4066 if (start_match > md->start_subject + start_offset)
4067 {
4068 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4069 start_match++;
4070
4071 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4072 and we are now at a LF, advance the match position by one more character.
4073 */
4074
4075 if (start_match[-1] == '\r' &&
4076 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4077 start_match < end_subject &&
4078 *start_match == '\n')
4079 start_match++;
4080 }
4081 }
4082
4083 /* Or to a non-unique first char after study. Each subject byte is looked up
as one bit in the start_bits map: byte c/8, bit c&7. */
4084
4085 else if (start_bits != NULL)
4086 {
4087 while (start_match < end_subject)
4088 {
4089 register unsigned int c = *start_match;
4090 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4091 }
4092 }
4093
4094 /* Restore fudged end_subject */
4095
4096 end_subject = save_end_subject;
4097
4098 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4099 printf(">>>> Match against: ");
4100 pchars(start_match, end_subject - start_match, TRUE, md);
4101 printf("\n");
4102 #endif
4103
4104 /* If req_byte is set, we know that that character must appear in the subject
4105 for the match to succeed. If the first character is set, req_byte must be
4106 later in the subject; otherwise the test starts at the match point. This
4107 optimization can save a huge amount of backtracking in patterns with nested
4108 unlimited repeats that aren't going to match. Writing separate code for
4109 cased/caseless versions makes it go faster, as does using an autoincrement
4110 and backing off on a match.
4111
4112 HOWEVER: when the subject string is very, very long, searching to its end can
4113 take a long time, and give bad performance on quite ordinary patterns. This
4114 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4115 string... so we don't do this when the string is sufficiently long.
4116
4117 ALSO: this processing is disabled when partial matching is requested.
4118 */
4119
4120 if (req_byte >= 0 &&
4121 end_subject - start_match < REQ_BYTE_MAX &&
4122 !md->partial)
4123 {
4124 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4125
4126 /* We don't need to repeat the search if we haven't yet reached the
4127 place we found it at last time. */
4128
4129 if (p > req_byte_ptr)
4130 {
4131 if (req_byte_caseless)
4132 {
4133 while (p < end_subject)
4134 {
4135 register int pp = *p++;
4136 if (pp == req_byte || pp == req_byte2) { p--; break; }
4137 }
4138 }
4139 else
4140 {
4141 while (p < end_subject)
4142 {
4143 if (*p++ == req_byte) { p--; break; }
4144 }
4145 }
4146
4147 /* If we can't find the required character, break the matching loop,
4148 forcing a match failure. */
4149
4150 if (p >= end_subject)
4151 {
4152 rc = MATCH_NOMATCH;
4153 break;
4154 }
4155
4156 /* If we have found the required character, save the point where we
4157 found it, so that we don't search again next time round the loop if
4158 the start hasn't passed this character yet. */
4159
4160 req_byte_ptr = p;
4161 }
4162 }
4163
4164 /* OK, we can now run the match. The per-attempt counters in the match block
are reset first. */
4165
4166 md->start_match = start_match;
4167 md->match_call_count = 0;
4168 md->eptrn = 0; /* Next free eptrchain slot */
4169 rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4170
4171 /* Any return other than MATCH_NOMATCH breaks the loop. */
4172
4173 if (rc != MATCH_NOMATCH) break;
4174
4175 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4176 newline in the subject (though it may continue over the newline). Therefore,
4177 if we have just failed to match, starting at a newline, do not continue. */
4178
4179 if (firstline && IS_NEWLINE(start_match)) break;
4180
4181 /* Advance the match position by one character. */
4182
4183 start_match++;
4184 #ifdef SUPPORT_UTF8
4185 if (utf8)
4186 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4187 start_match++;
4188 #endif
4189
4190 /* Break the loop if the pattern is anchored or if we have passed the end of
4191 the subject. Note that a start position exactly at end_subject is still tried,
because the test is for strictly greater than. */
4192
4193 if (anchored || start_match > end_subject) break;
4194
4195 /* If we have just passed a CR and the newline option is CRLF or ANY or
4196 ANYCRLF, and we are now at a LF, advance the match position by one more
4197 character. */
4198
4199 if (start_match[-1] == '\r' &&
4200 (md->nltype == NLTYPE_ANY ||
4201 md->nltype == NLTYPE_ANYCRLF ||
4202 md->nllen == 2) &&
4203 start_match < end_subject &&
4204 *start_match == '\n')
4205 start_match++;
4206
4207 } /* End of for(;;) "bumpalong" loop */
4208
4209 /* ==========================================================================*/
4210
4211 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4212 conditions is true:
4213
4214 (1) The pattern is anchored;
4215
4216 (2) We are past the end of the subject;
4217
4218 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4219 this option requests that a match occur at or before the first newline in
4220 the subject.
4221
4222 When we have a match and the offset vector is big enough to deal with any
4223 backreferences, captured substring offsets will already be set up. In the case
4224 where we had to get some local store to hold offsets for backreference
4225 processing, copy those that we can. In this case there need not be overflow if
4226 certain parts of the pattern were not used, even though there are more
4227 capturing parentheses than vector slots. The copy starts at element 2 because
elements 0 and 1 (the whole match) are filled in separately further down. */
4228
4229 if (rc == MATCH_MATCH)
4230 {
4231 if (using_temporary_offsets)
4232 {
4233 if (offsetcount >= 4)
4234 {
4235 memcpy(offsets + 2, md->offset_vector + 2,
4236 (offsetcount - 2) * sizeof(int));
4237 DPRINTF(("Copied offsets from temporary memory\n"));
4238 }
4239 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4240 DPRINTF(("Freeing temporary memory\n"));
4241 (pcre_free)(md->offset_vector);
4242 }
4243
4244 /* Set the return code to the number of captured strings, or 0 if there are
4245 too many to fit into the vector. */
4246
4247 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4248
4249 /* If there is space, set up the whole thing as substring 0. With fewer than
two slots nothing at all is stored and zero is returned. */
4250
4251 if (offsetcount < 2) rc = 0; else
4252 {
4253 offsets[0] = start_match - md->start_subject;
4254 offsets[1] = md->end_match_ptr - md->start_subject;
4255 }
4256
4257 DPRINTF((">>>> returning %d\n", rc));
4258 return rc;
4259 }
4260
4261 /* Control gets here if there has been an error, or if the overall match
4262 attempt has failed at all permitted starting positions. */
4263
4264 if (using_temporary_offsets)
4265 {
4266 DPRINTF(("Freeing temporary memory\n"));
4267 (pcre_free)(md->offset_vector);
4268 }
4269
/* Any value from match() other than MATCH_NOMATCH is returned as it stands.
Otherwise a partial match is reported only if it was requested and md->hitend
is set; hitend was initialized to FALSE above and is not set anywhere in this
function. */
4270 if (rc != MATCH_NOMATCH)
4271 {
4272 DPRINTF((">>>> error: returning %d\n", rc));
4273 return rc;
4274 }
4275 else if (md->partial && md->hitend)
4276 {
4277 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4278 return PCRE_ERROR_PARTIAL;
4279 }
4280 else
4281 {
4282 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4283 return PCRE_ERROR_NOMATCH;
4284 }
4285 }
4286
4287 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12