/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 149 - (show annotations) (download)
Mon Apr 16 15:28:08 2007 UTC (7 years, 4 months ago) by ph10
File MIME type: text/plain
File size: 127769 byte(s)
Add PCRE_NEWLINE_ANYCRLF.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2007 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #define NLBLOCK md /* Block containing newline information */
46 #define PSSTART start_subject /* Field containing processed string start */
47 #define PSEND end_subject /* Field containing processed string end */
48
49 #include "pcre_internal.h"
50
51 /* Undefine some potentially clashing cpp symbols */
52
53 #undef min
54 #undef max
55
56 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57 obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
58
59 #define EPTR_WORK_SIZE (1000)
60
61 /* Flag bits for the match() function */
62
63 #define match_condassert 0x01 /* Called to check a condition assertion */
64 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65 #define match_tail_recursed 0x04 /* Tail recursive call */
66
67 /* Non-error returns from the match() function. Error returns are externally
68 defined PCRE_ERROR_xxx codes, which are all negative. */
69
70 #define MATCH_MATCH 1
71 #define MATCH_NOMATCH 0
72
73 /* Maximum number of ints of offset to save on the stack for recursive calls.
74 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75 because the offset vector is always a multiple of 3 long. */
76
77 #define REC_STACK_SAVE_MAX 30
78
79 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80
81 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83
84
85
86 #ifdef DEBUG
87 /*************************************************
88 * Debugging function to print chars *
89 *************************************************/
90
91 /* Print a sequence of chars in printable format, stopping at the end of the
92 subject if requested.
93
94 Arguments:
95 p points to characters
96 length number to print
97 is_subject TRUE if printing from within md->start_subject
98 md pointer to matching data block, if is_subject is TRUE
99
100 Returns: nothing
101 */
102
103 static void
104 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105 {
106 unsigned int c;
107 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108 while (length-- > 0)
109 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110 }
111 #endif
112
113
114
115 /*************************************************
116 * Match a back-reference *
117 *************************************************/
118
119 /* If a back reference hasn't been set, the length that is passed is greater
120 than the number of characters left in the string, so the match fails.
121
122 Arguments:
123 offset index into the offset vector
124 eptr points into the subject
125 length length to be matched
126 md points to match data block
127 ims the ims flags
128
129 Returns: TRUE if matched
130 */
131
132 static BOOL
133 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 unsigned long int ims)
135 {
136 USPTR p = md->start_subject + md->offset_vector[offset];
137
138 #ifdef DEBUG
139 if (eptr >= md->end_subject)
140 printf("matching subject <null>");
141 else
142 {
143 printf("matching subject ");
144 pchars(eptr, length, TRUE, md);
145 }
146 printf(" against backref ");
147 pchars(p, length, FALSE, md);
148 printf("\n");
149 #endif
150
151 /* Always fail if not enough characters left */
152
153 if (length > md->end_subject - eptr) return FALSE;
154
155 /* Separate the caselesss case for speed */
156
157 if ((ims & PCRE_CASELESS) != 0)
158 {
159 while (length-- > 0)
160 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161 }
162 else
163 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164
165 return TRUE;
166 }
167
168
169
170 /***************************************************************************
171 ****************************************************************************
172 RECURSION IN THE match() FUNCTION
173
174 The match() function is highly recursive, though not every recursive call
175 increases the recursive depth. Nevertheless, some regular expressions can cause
176 it to recurse to a great depth. I was writing for Unix, so I just let it call
177 itself recursively. This uses the stack for saving everything that has to be
178 saved for a recursive call. On Unix, the stack can be large, and this works
179 fine.
180
181 It turns out that on some non-Unix-like systems there are problems with
182 programs that use a lot of stack. (This despite the fact that every last chip
183 has oodles of memory these days, and techniques for extending the stack have
184 been known for decades.) So....
185
186 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187 calls by keeping local variables that need to be preserved in blocks of memory
188 obtained from malloc() instead of on the stack. Macros are used to
189 achieve this so that the actual code doesn't look very different to what it
190 always used to.
191 ****************************************************************************
192 ***************************************************************************/
193
194
195 /* These versions of the macros use the stack, as normal. There are debugging
196 versions and production versions. */
197
198 #ifndef NO_RECURSE
/* Stack-based versions. REGISTER expands to the "register" keyword. RMATCH is
a genuine recursive call to match() at depth rdepth+1 whose result is stored
in rx; RRETURN is an ordinary return. When DEBUG is defined, both also trace
the source line they were used on. */

#define REGISTER register

#ifndef DEBUG

/* Production versions: no tracing. */

#define RRETURN(ra) return ra
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
  rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)

#else

/* Debugging versions: report each call and each return on stdout. */

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#endif
217
218 #else
219
220
221 /* These versions of the macros manage a private stack on the heap. Note
222 that the rd argument of RMATCH isn't actually used. It's the md argument of
223 match(), which never changes. */
224
225 #define REGISTER
226
/* Heap-based "recursive call". A new heapframe is obtained from
pcre_stack_malloc, the return point is recorded in the current frame with
setjmp(), the callee's arguments are copied into the new frame, and control
jumps to HEAP_RECURSE with the new frame current. When the callee finishes,
RRETURN longjmps back here; the current frame is then taken from md->thisframe
and the callee's result is copied from it into rx.

Arguments:
  rx   variable that receives the result of the call
  ra   eptr for the callee
  rb   ecode for the callee
  rc   offset_top for the callee
  rd   not used (it is match()'s md argument, which never changes)
  re   ims for the callee
  rf   eptrb for the callee
  rg   flags for the callee

If the new frame cannot be allocated, the current incarnation gives up with
PCRE_ERROR_NOMEMORY instead of dereferencing a NULL pointer. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL)\
    {\
    RRETURN(PCRE_ERROR_NOMEMORY);\
    }\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }
251
/* Heap-based "return". The current frame is unlinked and handed back to
pcre_stack_free. If that was the top-level frame (no previous frame), the
value is returned from match() for real. Otherwise the value is stored in the
caller's frame, that frame is published in md->thisframe, and control
longjmps to the point the caller recorded with setjmp() in RMATCH. Note that
ra is evaluated after the frame has been popped. */

#define RRETURN(ra)\
  {\
  heapframe *finished = frame;\
  frame = finished->Xprevframe;\
  (pcre_stack_free)(finished);\
  if (frame == NULL) return ra;\
  frame->Xresult = ra;\
  md->thisframe = frame;\
  longjmp(frame->Xwhere, 1);\
  }
265
266
267 /* Structure for remembering the local variables in a private frame */
268
269 typedef struct heapframe {
270 struct heapframe *Xprevframe;
271
272 /* Function arguments that may change */
273
274 const uschar *Xeptr;
275 const uschar *Xecode;
276 int Xoffset_top;
277 long int Xims;
278 eptrblock *Xeptrb;
279 int Xflags;
280 unsigned int Xrdepth;
281
282 /* Function local variables */
283
284 const uschar *Xcallpat;
285 const uschar *Xcharptr;
286 const uschar *Xdata;
287 const uschar *Xnext;
288 const uschar *Xpp;
289 const uschar *Xprev;
290 const uschar *Xsaved_eptr;
291
292 recursion_info Xnew_recursive;
293
294 BOOL Xcur_is_word;
295 BOOL Xcondition;
296 BOOL Xprev_is_word;
297
298 unsigned long int Xoriginal_ims;
299
300 #ifdef SUPPORT_UCP
301 int Xprop_type;
302 int Xprop_value;
303 int Xprop_fail_result;
304 int Xprop_category;
305 int Xprop_chartype;
306 int Xprop_script;
307 int Xoclength;
308 uschar Xocchars[8];
309 #endif
310
311 int Xctype;
312 unsigned int Xfc;
313 int Xfi;
314 int Xlength;
315 int Xmax;
316 int Xmin;
317 int Xnumber;
318 int Xoffset;
319 int Xop;
320 int Xsave_capture_last;
321 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
322 int Xstacksave[REC_STACK_SAVE_MAX];
323
324 eptrblock Xnewptrb;
325
326 /* Place to pass back result, and where to jump back to */
327
328 int Xresult;
329 jmp_buf Xwhere;
330
331 } heapframe;
332
333 #endif
334
335
336 /***************************************************************************
337 ***************************************************************************/
338
339
340
341 /*************************************************
342 * Match from current position *
343 *************************************************/
344
345 /* This function is called recursively in many circumstances. Whenever it
346 returns a negative (error) response, the outer incarnation must also return the
347 same response.
348
349 Performance note: It might be tempting to extract commonly used fields from the
350 md structure (e.g. utf8, end_subject) into individual variables to improve
351 performance. Tests using gcc on a SPARC disproved this; in the first case, it
352 made performance worse.
353
354 Arguments:
355 eptr pointer to current character in subject
356 ecode pointer to current position in compiled code
357 offset_top current top pointer
358 md pointer to "static" info for the match
359 ims current /i, /m, and /s options
360 eptrb pointer to chain of blocks containing eptr at start of
361 brackets - for testing for empty matches
362 flags can contain
363 match_condassert - this is an assertion condition
364 match_cbegroup - this is the start of an unlimited repeat
365 group that can match an empty string
366 match_tail_recursed - this is a tail_recursed group
367 rdepth the recursion depth
368
369 Returns: MATCH_MATCH if matched ) these values are >= 0
370 MATCH_NOMATCH if failed to match )
371 a negative PCRE_ERROR_xxx value if aborted by an error condition
372 (e.g. stopped by repeated call or recursion limit)
373 */
374
375 static int
376 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
377 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
378 int flags, unsigned int rdepth)
379 {
380 /* These variables do not need to be preserved over recursion in this function,
381 so they can be ordinary variables in all cases. Mark some of them with
382 "register" because they are used a lot in loops. */
383
384 register int rrc; /* Returns from recursive calls */
385 register int i; /* Used for loops not involving calls to RMATCH() */
386 register unsigned int c; /* Character values not kept over RMATCH() calls */
387 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
388
389 BOOL minimize, possessive; /* Quantifier options */
390
391 /* When recursion is not being used, all "local" variables that have to be
392 preserved over calls to RMATCH() are part of a "frame" which is obtained from
393 heap storage. Set up the top-level frame here; others are obtained from the
394 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
395
396 #ifdef NO_RECURSE
397 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
398 frame->Xprevframe = NULL; /* Marks the top level */
399
400 /* Copy in the original argument variables */
401
402 frame->Xeptr = eptr;
403 frame->Xecode = ecode;
404 frame->Xoffset_top = offset_top;
405 frame->Xims = ims;
406 frame->Xeptrb = eptrb;
407 frame->Xflags = flags;
408 frame->Xrdepth = rdepth;
409
410 /* This is where control jumps back to effect "recursion" */
411
412 HEAP_RECURSE:
413
414 /* Macros make the argument variables come from the current frame */
415
416 #define eptr frame->Xeptr
417 #define ecode frame->Xecode
418 #define offset_top frame->Xoffset_top
419 #define ims frame->Xims
420 #define eptrb frame->Xeptrb
421 #define flags frame->Xflags
422 #define rdepth frame->Xrdepth
423
424 /* Ditto for the local variables */
425
426 #ifdef SUPPORT_UTF8
427 #define charptr frame->Xcharptr
428 #endif
429 #define callpat frame->Xcallpat
430 #define data frame->Xdata
431 #define next frame->Xnext
432 #define pp frame->Xpp
433 #define prev frame->Xprev
434 #define saved_eptr frame->Xsaved_eptr
435
436 #define new_recursive frame->Xnew_recursive
437
438 #define cur_is_word frame->Xcur_is_word
439 #define condition frame->Xcondition
440 #define prev_is_word frame->Xprev_is_word
441
442 #define original_ims frame->Xoriginal_ims
443
444 #ifdef SUPPORT_UCP
445 #define prop_type frame->Xprop_type
446 #define prop_value frame->Xprop_value
447 #define prop_fail_result frame->Xprop_fail_result
448 #define prop_category frame->Xprop_category
449 #define prop_chartype frame->Xprop_chartype
450 #define prop_script frame->Xprop_script
451 #define oclength frame->Xoclength
452 #define occhars frame->Xocchars
453 #endif
454
455 #define ctype frame->Xctype
456 #define fc frame->Xfc
457 #define fi frame->Xfi
458 #define length frame->Xlength
459 #define max frame->Xmax
460 #define min frame->Xmin
461 #define number frame->Xnumber
462 #define offset frame->Xoffset
463 #define op frame->Xop
464 #define save_capture_last frame->Xsave_capture_last
465 #define save_offset1 frame->Xsave_offset1
466 #define save_offset2 frame->Xsave_offset2
467 #define save_offset3 frame->Xsave_offset3
468 #define stacksave frame->Xstacksave
469
470 #define newptrb frame->Xnewptrb
471
472 /* When recursion is being used, local variables are allocated on the stack and
473 get preserved during recursion in the normal way. In this environment, fi and
474 i, and fc and c, can be the same variables. */
475
476 #else /* NO_RECURSE not defined */
477 #define fi i
478 #define fc c
479
480
481 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
482 const uschar *charptr; /* in small blocks of the code. My normal */
483 #endif /* style of coding would have declared */
484 const uschar *callpat; /* them within each of those blocks. */
485 const uschar *data; /* However, in order to accommodate the */
486 const uschar *next; /* version of this code that uses an */
487 USPTR pp; /* external "stack" implemented on the */
488 const uschar *prev; /* heap, it is easier to declare them all */
489 USPTR saved_eptr; /* here, so the declarations can be cut */
490 /* out in a block. The only declarations */
491 recursion_info new_recursive; /* within blocks below are for variables */
492 /* that do not have to be preserved over */
493 BOOL cur_is_word; /* a recursive call to RMATCH(). */
494 BOOL condition;
495 BOOL prev_is_word;
496
497 unsigned long int original_ims;
498
499 #ifdef SUPPORT_UCP
500 int prop_type;
501 int prop_value;
502 int prop_fail_result;
503 int prop_category;
504 int prop_chartype;
505 int prop_script;
506 int oclength;
507 uschar occhars[8];
508 #endif
509
510 int ctype;
511 int length;
512 int max;
513 int min;
514 int number;
515 int offset;
516 int op;
517 int save_capture_last;
518 int save_offset1, save_offset2, save_offset3;
519 int stacksave[REC_STACK_SAVE_MAX];
520
521 eptrblock newptrb;
522 #endif /* NO_RECURSE */
523
524 /* These statements are here to stop the compiler complaining about uninitialized
525 variables. */
526
527 #ifdef SUPPORT_UCP
528 prop_value = 0;
529 prop_fail_result = 0;
530 #endif
531
532
533 /* This label is used for tail recursion, which is used in a few cases even
534 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
535 used. Thanks to Ian Taylor for noticing this possibility and sending the
536 original patch. */
537
538 TAIL_RECURSE:
539
540 /* OK, now we can get on with the real code of the function. Recursive calls
541 are specified by the macro RMATCH and RRETURN is used to return. When
542 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
543 and a "return", respectively (possibly with some debugging if DEBUG is
544 defined). However, RMATCH isn't like a function call because it's quite a
545 complicated macro. It has to be used in one particular way. This shouldn't,
546 however, impact performance when true recursion is being used. */
547
548 /* First check that we haven't called match() too many times, or that we
549 haven't exceeded the recursive call limit. */
550
551 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
552 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
553
554 original_ims = ims; /* Save for resetting on ')' */
555
556 #ifdef SUPPORT_UTF8
557 utf8 = md->utf8; /* Local copy of the flag */
558 #else
559 utf8 = FALSE;
560 #endif
561
562 /* At the start of a group with an unlimited repeat that may match an empty
563 string, the match_cbegroup flag is set. When this is the case, add the current
564 subject pointer to the chain of such remembered pointers, to be checked when we
565 hit the closing ket, in order to break infinite loops that match no characters.
566 When match() is called in other circumstances, don't add to the chain. If this
567 is a tail recursion, use a block from the workspace, as the one on the stack is
568 already used. */
569
570 if ((flags & match_cbegroup) != 0)
571 {
572 eptrblock *p;
573 if ((flags & match_tail_recursed) != 0)
574 {
575 if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
576 p = md->eptrchain + md->eptrn++;
577 }
578 else p = &newptrb;
579 p->epb_saved_eptr = eptr;
580 p->epb_prev = eptrb;
581 eptrb = p;
582 }
583
584 /* Now start processing the opcodes. */
585
586 for (;;)
587 {
588 minimize = possessive = FALSE;
589 op = *ecode;
590
591 /* For partial matching, remember if we ever hit the end of the subject after
592 matching at least one subject character. */
593
594 if (md->partial &&
595 eptr >= md->end_subject &&
596 eptr > md->start_match)
597 md->hitend = TRUE;
598
599 switch(op)
600 {
601 /* Handle a capturing bracket. If there is space in the offset vector, save
602 the current subject position in the working slot at the top of the vector.
603 We mustn't change the current values of the data slot, because they may be
604 set from a previous iteration of this group, and be referred to by a
605 reference inside the group.
606
607 If the bracket fails to match, we need to restore this value and also the
608 values of the final offsets, in case they were set by a previous iteration
609 of the same bracket.
610
611 If there isn't enough space in the offset vector, treat this as if it were
612 a non-capturing bracket. Don't worry about setting the flag for the error
613 case here; that is handled in the code for KET. */
614
615 case OP_CBRA:
616 case OP_SCBRA:
617 number = GET2(ecode, 1+LINK_SIZE);
618 offset = number << 1;
619
620 #ifdef DEBUG
621 printf("start bracket %d\n", number);
622 printf("subject=");
623 pchars(eptr, 16, TRUE, md);
624 printf("\n");
625 #endif
626
627 if (offset < md->offset_max)
628 {
629 save_offset1 = md->offset_vector[offset];
630 save_offset2 = md->offset_vector[offset+1];
631 save_offset3 = md->offset_vector[md->offset_end - number];
632 save_capture_last = md->capture_last;
633
634 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
635 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
636
637 flags = (op == OP_SCBRA)? match_cbegroup : 0;
638 do
639 {
640 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
641 ims, eptrb, flags);
642 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
643 md->capture_last = save_capture_last;
644 ecode += GET(ecode, 1);
645 }
646 while (*ecode == OP_ALT);
647
648 DPRINTF(("bracket %d failed\n", number));
649
650 md->offset_vector[offset] = save_offset1;
651 md->offset_vector[offset+1] = save_offset2;
652 md->offset_vector[md->offset_end - number] = save_offset3;
653
654 RRETURN(MATCH_NOMATCH);
655 }
656
657 /* Insufficient room for saving captured contents. Treat as a non-capturing
658 bracket. */
659
660 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
661
662 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
663 final alternative within the brackets, we would return the result of a
664 recursive call to match() whatever happened. We can reduce stack usage by
665 turning this into a tail recursion. */
666
667 case OP_BRA:
668 case OP_SBRA:
669 DPRINTF(("start non-capturing bracket\n"));
670 flags = (op >= OP_SBRA)? match_cbegroup : 0;
671 for (;;)
672 {
673 if (ecode[GET(ecode, 1)] != OP_ALT)
674 {
675 ecode += _pcre_OP_lengths[*ecode];
676 flags |= match_tail_recursed;
677 DPRINTF(("bracket 0 tail recursion\n"));
678 goto TAIL_RECURSE;
679 }
680
681 /* For non-final alternatives, continue the loop for a NOMATCH result;
682 otherwise return. */
683
684 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
685 eptrb, flags);
686 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
687 ecode += GET(ecode, 1);
688 }
689 /* Control never reaches here. */
690
691 /* Conditional group: compilation checked that there are no more than
692 two branches. If the condition is false, skipping the first branch takes us
693 past the end if there is only one branch, but that's OK because that is
694 exactly what going to the ket would do. As there is only one branch to be
695 obeyed, we can use tail recursion to avoid using another stack frame. */
696
697 case OP_COND:
698 case OP_SCOND:
699 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
700 {
701 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
702 condition = md->recursive != NULL &&
703 (offset == RREF_ANY || offset == md->recursive->group_num);
704 ecode += condition? 3 : GET(ecode, 1);
705 }
706
707 else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
708 {
709 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
710 condition = offset < offset_top && md->offset_vector[offset] >= 0;
711 ecode += condition? 3 : GET(ecode, 1);
712 }
713
714 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
715 {
716 condition = FALSE;
717 ecode += GET(ecode, 1);
718 }
719
720 /* The condition is an assertion. Call match() to evaluate it - setting
721 the final argument match_condassert causes it to stop at the end of an
722 assertion. */
723
724 else
725 {
726 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
727 match_condassert);
728 if (rrc == MATCH_MATCH)
729 {
730 condition = TRUE;
731 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
732 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
733 }
734 else if (rrc != MATCH_NOMATCH)
735 {
736 RRETURN(rrc); /* Need braces because of following else */
737 }
738 else
739 {
740 condition = FALSE;
741 ecode += GET(ecode, 1);
742 }
743 }
744
745 /* We are now at the branch that is to be obeyed. As there is only one,
746 we can use tail recursion to avoid using another stack frame. If the second
747 alternative doesn't exist, we can just plough on. */
748
749 if (condition || *ecode == OP_ALT)
750 {
751 ecode += 1 + LINK_SIZE;
752 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
753 goto TAIL_RECURSE;
754 }
755 else
756 {
757 ecode += 1 + LINK_SIZE;
758 }
759 break;
760
761
762 /* End of the pattern. If we are in a top-level recursion, we should
763 restore the offsets appropriately and continue from after the call. */
764
765 case OP_END:
766 if (md->recursive != NULL && md->recursive->group_num == 0)
767 {
768 recursion_info *rec = md->recursive;
769 DPRINTF(("End of pattern in a (?0) recursion\n"));
770 md->recursive = rec->prevrec;
771 memmove(md->offset_vector, rec->offset_save,
772 rec->saved_max * sizeof(int));
773 md->start_match = rec->save_start;
774 ims = original_ims;
775 ecode = rec->after_call;
776 break;
777 }
778
779 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
780 string - backtracking will then try other alternatives, if any. */
781
782 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
783 md->end_match_ptr = eptr; /* Record where we ended */
784 md->end_offset_top = offset_top; /* and how many extracts were taken */
785 RRETURN(MATCH_MATCH);
786
787 /* Change option settings */
788
789 case OP_OPT:
790 ims = ecode[1];
791 ecode += 2;
792 DPRINTF(("ims set to %02lx\n", ims));
793 break;
794
795 /* Assertion brackets. Check the alternative branches in turn - the
796 matching won't pass the KET for an assertion. If any one branch matches,
797 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
798 start of each branch to move the current point backwards, so the code at
799 this level is identical to the lookahead case. */
800
801 case OP_ASSERT:
802 case OP_ASSERTBACK:
803 do
804 {
805 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
806 if (rrc == MATCH_MATCH) break;
807 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808 ecode += GET(ecode, 1);
809 }
810 while (*ecode == OP_ALT);
811 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
812
813 /* If checking an assertion for a condition, return MATCH_MATCH. */
814
815 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
816
817 /* Continue from after the assertion, updating the offsets high water
818 mark, since extracts may have been taken during the assertion. */
819
820 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
821 ecode += 1 + LINK_SIZE;
822 offset_top = md->end_offset_top;
823 continue;
824
825 /* Negative assertion: all branches must fail to match */
826
827 case OP_ASSERT_NOT:
828 case OP_ASSERTBACK_NOT:
829 do
830 {
831 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
832 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
834 ecode += GET(ecode,1);
835 }
836 while (*ecode == OP_ALT);
837
838 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
839
840 ecode += 1 + LINK_SIZE;
841 continue;
842
843 /* Move the subject pointer back. This occurs only at the start of
844 each branch of a lookbehind assertion. If we are too close to the start to
845 move back, this match function fails. When working with UTF-8 we move
846 back a number of characters, not bytes. */
847
848 case OP_REVERSE:
849 #ifdef SUPPORT_UTF8
850 if (utf8)
851 {
852 i = GET(ecode, 1);
853 while (i-- > 0)
854 {
855 eptr--;
856 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
857 BACKCHAR(eptr)
858 }
859 }
860 else
861 #endif
862
863 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
864
865 {
866 eptr -= GET(ecode, 1);
867 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
868 }
869
870 /* Skip to next op code */
871
872 ecode += 1 + LINK_SIZE;
873 break;
874
875 /* The callout item calls an external function, if one is provided, passing
876 details of the match so far. This is mainly for debugging, though the
877 function is able to force a failure. */
878
879 case OP_CALLOUT:
880 if (pcre_callout != NULL)
881 {
882 pcre_callout_block cb;
883 cb.version = 1; /* Version 1 of the callout block */
884 cb.callout_number = ecode[1];
885 cb.offset_vector = md->offset_vector;
886 cb.subject = (PCRE_SPTR)md->start_subject;
887 cb.subject_length = md->end_subject - md->start_subject;
888 cb.start_match = md->start_match - md->start_subject;
889 cb.current_position = eptr - md->start_subject;
890 cb.pattern_position = GET(ecode, 2);
891 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
892 cb.capture_top = offset_top/2;
893 cb.capture_last = md->capture_last;
894 cb.callout_data = md->callout_data;
895 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
896 if (rrc < 0) RRETURN(rrc);
897 }
898 ecode += 2 + 2*LINK_SIZE;
899 break;
900
901 /* Recursion either matches the current regex, or some subexpression. The
902 offset data is the offset to the starting bracket from the start of the
903 whole pattern. (This is so that it works from duplicated subpatterns.)
904
905 If there are any capturing brackets started but not finished, we have to
906 save their starting points and reinstate them after the recursion. However,
907 we don't know how many such there are (offset_top records the completed
908 total) so we just have to save all the potential data. There may be up to
909 65535 such values, which is too large to put on the stack, but using malloc
910 for small numbers seems expensive. As a compromise, the stack is used when
911 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
912 is used. A problem is what to do if the malloc fails ... there is no way of
913 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
914 values on the stack, and accept that the rest may be wrong.
915
916 There are also other values that have to be saved. We use a chained
917 sequence of blocks that actually live on the stack. Thanks to Robin Houston
918 for the original version of this logic. */
919
920 case OP_RECURSE:
921 {
922 callpat = md->start_code + GET(ecode, 1);
923 new_recursive.group_num = (callpat == md->start_code)? 0 :
924 GET2(callpat, 1 + LINK_SIZE);
925
926 /* Add to "recursing stack" */
927
928 new_recursive.prevrec = md->recursive;
929 md->recursive = &new_recursive;
930
931 /* Find where to continue from afterwards */
932
933 ecode += 1 + LINK_SIZE;
934 new_recursive.after_call = ecode;
935
936 /* Now save the offset data. */
937
938 new_recursive.saved_max = md->offset_end;
939 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
940 new_recursive.offset_save = stacksave;
941 else
942 {
943 new_recursive.offset_save =
944 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
945 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
946 }
947
948 memcpy(new_recursive.offset_save, md->offset_vector,
949 new_recursive.saved_max * sizeof(int));
950 new_recursive.save_start = md->start_match;
951 md->start_match = eptr;
952
953 /* OK, now we can do the recursion. For each top-level alternative we
954 restore the offset and recursion data. */
955
956 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
957 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
958 do
959 {
960 RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
961 md, ims, eptrb, flags);
962 if (rrc == MATCH_MATCH)
963 {
964 DPRINTF(("Recursion matched\n"));
965 md->recursive = new_recursive.prevrec;
966 if (new_recursive.offset_save != stacksave)
967 (pcre_free)(new_recursive.offset_save);
968 RRETURN(MATCH_MATCH);
969 }
970 else if (rrc != MATCH_NOMATCH)
971 {
972 DPRINTF(("Recursion gave error %d\n", rrc));
973 RRETURN(rrc);
974 }
975
976 md->recursive = &new_recursive;
977 memcpy(md->offset_vector, new_recursive.offset_save,
978 new_recursive.saved_max * sizeof(int));
979 callpat += GET(callpat, 1);
980 }
981 while (*callpat == OP_ALT);
982
983 DPRINTF(("Recursion didn't match\n"));
984 md->recursive = new_recursive.prevrec;
985 if (new_recursive.offset_save != stacksave)
986 (pcre_free)(new_recursive.offset_save);
987 RRETURN(MATCH_NOMATCH);
988 }
989 /* Control never reaches here */
990
991 /* "Once" brackets are like assertion brackets except that after a match,
992 the point in the subject string is not moved back. Thus there can never be
993 a move back into the brackets. Friedl calls these "atomic" subpatterns.
994 Check the alternative branches in turn - the matching won't pass the KET
995 for this kind of subpattern. If any one branch matches, we carry on as at
996 the end of a normal bracket, leaving the subject pointer. */
997
998 case OP_ONCE:
999 prev = ecode;
1000 saved_eptr = eptr;
1001
1002 do
1003 {
1004 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1005 eptrb, 0);
1006 if (rrc == MATCH_MATCH) break;
1007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1008 ecode += GET(ecode,1);
1009 }
1010 while (*ecode == OP_ALT);
1011
1012 /* If we hit the end of the group (which could be repeated), fail */
1013
1014 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1015
1016 /* Continue as from after the assertion, updating the offsets high water
1017 mark, since extracts may have been taken. */
1018
1019 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1020
1021 offset_top = md->end_offset_top;
1022 eptr = md->end_match_ptr;
1023
1024 /* For a non-repeating ket, just continue at this level. This also
1025 happens for a repeating ket if no characters were matched in the group.
1026 This is the forcible breaking of infinite loops as implemented in Perl
1027 5.005. If there is an options reset, it will get obeyed in the normal
1028 course of events. */
1029
1030 if (*ecode == OP_KET || eptr == saved_eptr)
1031 {
1032 ecode += 1+LINK_SIZE;
1033 break;
1034 }
1035
1036 /* The repeating kets try the rest of the pattern or restart from the
1037 preceding bracket, in the appropriate order. The second "call" of match()
1038 uses tail recursion, to avoid using another stack frame. We need to reset
1039 any options that changed within the bracket before re-running it, so
1040 check the next opcode. */
1041
1042 if (ecode[1+LINK_SIZE] == OP_OPT)
1043 {
1044 ims = (ims & ~PCRE_IMS) | ecode[4];
1045 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1046 }
1047
1048 if (*ecode == OP_KETRMIN)
1049 {
1050 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1051 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1052 ecode = prev;
1053 flags = match_tail_recursed;
1054 goto TAIL_RECURSE;
1055 }
1056 else /* OP_KETRMAX */
1057 {
1058 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
1059 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1060 ecode += 1 + LINK_SIZE;
1061 flags = match_tail_recursed;
1062 goto TAIL_RECURSE;
1063 }
1064 /* Control never gets here */
1065
1066 /* An alternation is the end of a branch; scan along to find the end of the
1067 bracketed group and go to there. */
1068
1069 case OP_ALT:
1070 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1071 break;
1072
1073 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1074 that it may occur zero times. It may repeat infinitely, or not at all -
1075 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1076 repeat limits are compiled as a number of copies, with the optional ones
1077 preceded by BRAZERO or BRAMINZERO. */
1078
1079 case OP_BRAZERO:
1080 {
1081 next = ecode+1;
1082 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
1083 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1084 do next += GET(next,1); while (*next == OP_ALT);
1085 ecode = next + 1 + LINK_SIZE;
1086 }
1087 break;
1088
1089 case OP_BRAMINZERO:
1090 {
1091 next = ecode+1;
1092 do next += GET(next, 1); while (*next == OP_ALT);
1093 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1094 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1095 ecode++;
1096 }
1097 break;
1098
1099 /* End of a group, repeated or non-repeating. */
1100
1101 case OP_KET:
1102 case OP_KETRMIN:
1103 case OP_KETRMAX:
1104 prev = ecode - GET(ecode, 1);
1105
1106 /* If this was a group that remembered the subject start, in order to break
1107 infinite repeats of empty string matches, retrieve the subject start from
1108 the chain. Otherwise, set it NULL. */
1109
1110 if (*prev >= OP_SBRA)
1111 {
1112 saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1113 eptrb = eptrb->epb_prev; /* Backup to previous group */
1114 }
1115 else saved_eptr = NULL;
1116
1117 /* If we are at the end of an assertion group, stop matching and return
1118 MATCH_MATCH, but record the current high water mark for use by positive
1119 assertions. Do this also for the "once" (atomic) groups. */
1120
1121 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1122 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1123 *prev == OP_ONCE)
1124 {
1125 md->end_match_ptr = eptr; /* For ONCE */
1126 md->end_offset_top = offset_top;
1127 RRETURN(MATCH_MATCH);
1128 }
1129
1130 /* For capturing groups we have to check the group number back at the start
1131 and if necessary complete handling an extraction by setting the offsets and
1132 bumping the high water mark. Note that whole-pattern recursion is coded as
1133 a recurse into group 0, so it won't be picked up here. Instead, we catch it
1134 when the OP_END is reached. Other recursion is handled here. */
1135
1136 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1137 {
1138 number = GET2(prev, 1+LINK_SIZE);
1139 offset = number << 1;
1140
1141 #ifdef DEBUG
1142 printf("end bracket %d", number);
1143 printf("\n");
1144 #endif
1145
1146 md->capture_last = number;
1147 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1148 {
1149 md->offset_vector[offset] =
1150 md->offset_vector[md->offset_end - number];
1151 md->offset_vector[offset+1] = eptr - md->start_subject;
1152 if (offset_top <= offset) offset_top = offset + 2;
1153 }
1154
1155 /* Handle a recursively called group. Restore the offsets
1156 appropriately and continue from after the call. */
1157
1158 if (md->recursive != NULL && md->recursive->group_num == number)
1159 {
1160 recursion_info *rec = md->recursive;
1161 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1162 md->recursive = rec->prevrec;
1163 md->start_match = rec->save_start;
1164 memcpy(md->offset_vector, rec->offset_save,
1165 rec->saved_max * sizeof(int));
1166 ecode = rec->after_call;
1167 ims = original_ims;
1168 break;
1169 }
1170 }
1171
1172 /* For both capturing and non-capturing groups, reset the value of the ims
1173 flags, in case they got changed during the group. */
1174
1175 ims = original_ims;
1176 DPRINTF(("ims reset to %02lx\n", ims));
1177
1178 /* For a non-repeating ket, just continue at this level. This also
1179 happens for a repeating ket if no characters were matched in the group.
1180 This is the forcible breaking of infinite loops as implemented in Perl
1181 5.005. If there is an options reset, it will get obeyed in the normal
1182 course of events. */
1183
1184 if (*ecode == OP_KET || eptr == saved_eptr)
1185 {
1186 ecode += 1 + LINK_SIZE;
1187 break;
1188 }
1189
1190 /* The repeating kets try the rest of the pattern or restart from the
1191 preceding bracket, in the appropriate order. In the second case, we can use
1192 tail recursion to avoid using another stack frame. */
1193
1194 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1195
1196 if (*ecode == OP_KETRMIN)
1197 {
1198 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1199 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1200 ecode = prev;
1201 flags |= match_tail_recursed;
1202 goto TAIL_RECURSE;
1203 }
1204 else /* OP_KETRMAX */
1205 {
1206 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
1207 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1208 ecode += 1 + LINK_SIZE;
1209 flags = match_tail_recursed;
1210 goto TAIL_RECURSE;
1211 }
1212 /* Control never gets here */
1213
1214 /* Start of subject unless notbol, or after internal newline if multiline */
1215
1216 case OP_CIRC:
1217 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1218 if ((ims & PCRE_MULTILINE) != 0)
1219 {
1220 if (eptr != md->start_subject &&
1221 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1222 RRETURN(MATCH_NOMATCH);
1223 ecode++;
1224 break;
1225 }
1226 /* ... else fall through */
1227
1228 /* Start of subject assertion */
1229
1230 case OP_SOD:
1231 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1232 ecode++;
1233 break;
1234
1235 /* Start of match assertion */
1236
1237 case OP_SOM:
1238 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1239 ecode++;
1240 break;
1241
1242 /* Assert before internal newline if multiline, or before a terminating
1243 newline unless endonly is set, else end of subject unless noteol is set. */
1244
1245 case OP_DOLL:
1246 if ((ims & PCRE_MULTILINE) != 0)
1247 {
1248 if (eptr < md->end_subject)
1249 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1250 else
1251 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1252 ecode++;
1253 break;
1254 }
1255 else
1256 {
1257 if (md->noteol) RRETURN(MATCH_NOMATCH);
1258 if (!md->endonly)
1259 {
1260 if (eptr != md->end_subject &&
1261 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1262 RRETURN(MATCH_NOMATCH);
1263 ecode++;
1264 break;
1265 }
1266 }
1267 /* ... else fall through for endonly */
1268
1269 /* End of subject assertion (\z) */
1270
1271 case OP_EOD:
1272 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1273 ecode++;
1274 break;
1275
1276 /* End of subject or ending \n assertion (\Z) */
1277
1278 case OP_EODN:
1279 if (eptr != md->end_subject &&
1280 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1281 RRETURN(MATCH_NOMATCH);
1282 ecode++;
1283 break;
1284
1285 /* Word boundary assertions */
1286
1287 case OP_NOT_WORD_BOUNDARY:
1288 case OP_WORD_BOUNDARY:
1289 {
1290
1291 /* Find out if the previous and current characters are "word" characters.
1292 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1293 be "non-word" characters. */
1294
1295 #ifdef SUPPORT_UTF8
1296 if (utf8)
1297 {
1298 if (eptr == md->start_subject) prev_is_word = FALSE; else
1299 {
1300 const uschar *lastptr = eptr - 1;
1301 while((*lastptr & 0xc0) == 0x80) lastptr--;
1302 GETCHAR(c, lastptr);
1303 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1304 }
1305 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1306 {
1307 GETCHAR(c, eptr);
1308 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1309 }
1310 }
1311 else
1312 #endif
1313
1314 /* More streamlined when not in UTF-8 mode */
1315
1316 {
1317 prev_is_word = (eptr != md->start_subject) &&
1318 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1319 cur_is_word = (eptr < md->end_subject) &&
1320 ((md->ctypes[*eptr] & ctype_word) != 0);
1321 }
1322
1323 /* Now see if the situation is what we want */
1324
1325 if ((*ecode++ == OP_WORD_BOUNDARY)?
1326 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1327 RRETURN(MATCH_NOMATCH);
1328 }
1329 break;
1330
1331 /* Match a single character type; inline for speed */
1332
1333 case OP_ANY:
1334 if ((ims & PCRE_DOTALL) == 0)
1335 {
1336 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1337 }
1338 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1339 if (utf8)
1340 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1341 ecode++;
1342 break;
1343
1344 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1345 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1346
1347 case OP_ANYBYTE:
1348 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1349 ecode++;
1350 break;
1351
1352 case OP_NOT_DIGIT:
1353 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1354 GETCHARINCTEST(c, eptr);
1355 if (
1356 #ifdef SUPPORT_UTF8
1357 c < 256 &&
1358 #endif
1359 (md->ctypes[c] & ctype_digit) != 0
1360 )
1361 RRETURN(MATCH_NOMATCH);
1362 ecode++;
1363 break;
1364
1365 case OP_DIGIT:
1366 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1367 GETCHARINCTEST(c, eptr);
1368 if (
1369 #ifdef SUPPORT_UTF8
1370 c >= 256 ||
1371 #endif
1372 (md->ctypes[c] & ctype_digit) == 0
1373 )
1374 RRETURN(MATCH_NOMATCH);
1375 ecode++;
1376 break;
1377
1378 case OP_NOT_WHITESPACE:
1379 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1380 GETCHARINCTEST(c, eptr);
1381 if (
1382 #ifdef SUPPORT_UTF8
1383 c < 256 &&
1384 #endif
1385 (md->ctypes[c] & ctype_space) != 0
1386 )
1387 RRETURN(MATCH_NOMATCH);
1388 ecode++;
1389 break;
1390
1391 case OP_WHITESPACE:
1392 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1393 GETCHARINCTEST(c, eptr);
1394 if (
1395 #ifdef SUPPORT_UTF8
1396 c >= 256 ||
1397 #endif
1398 (md->ctypes[c] & ctype_space) == 0
1399 )
1400 RRETURN(MATCH_NOMATCH);
1401 ecode++;
1402 break;
1403
1404 case OP_NOT_WORDCHAR:
1405 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1406 GETCHARINCTEST(c, eptr);
1407 if (
1408 #ifdef SUPPORT_UTF8
1409 c < 256 &&
1410 #endif
1411 (md->ctypes[c] & ctype_word) != 0
1412 )
1413 RRETURN(MATCH_NOMATCH);
1414 ecode++;
1415 break;
1416
1417 case OP_WORDCHAR:
1418 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1419 GETCHARINCTEST(c, eptr);
1420 if (
1421 #ifdef SUPPORT_UTF8
1422 c >= 256 ||
1423 #endif
1424 (md->ctypes[c] & ctype_word) == 0
1425 )
1426 RRETURN(MATCH_NOMATCH);
1427 ecode++;
1428 break;
1429
1430 case OP_ANYNL:
1431 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1432 GETCHARINCTEST(c, eptr);
1433 switch(c)
1434 {
1435 default: RRETURN(MATCH_NOMATCH);
1436 case 0x000d:
1437 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1438 break;
1439 case 0x000a:
1440 case 0x000b:
1441 case 0x000c:
1442 case 0x0085:
1443 case 0x2028:
1444 case 0x2029:
1445 break;
1446 }
1447 ecode++;
1448 break;
1449
1450 #ifdef SUPPORT_UCP
1451 /* Check the next character by Unicode property. We will get here only
1452 if the support is in the binary; otherwise a compile-time error occurs. */
1453
1454 case OP_PROP:
1455 case OP_NOTPROP:
1456 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1457 GETCHARINCTEST(c, eptr);
1458 {
1459 int chartype, script;
1460 int category = _pcre_ucp_findprop(c, &chartype, &script);
1461
1462 switch(ecode[1])
1463 {
1464 case PT_ANY:
1465 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1466 break;
1467
1468 case PT_LAMP:
1469 if ((chartype == ucp_Lu ||
1470 chartype == ucp_Ll ||
1471 chartype == ucp_Lt) == (op == OP_NOTPROP))
1472 RRETURN(MATCH_NOMATCH);
1473 break;
1474
1475 case PT_GC:
1476 if ((ecode[2] != category) == (op == OP_PROP))
1477 RRETURN(MATCH_NOMATCH);
1478 break;
1479
1480 case PT_PC:
1481 if ((ecode[2] != chartype) == (op == OP_PROP))
1482 RRETURN(MATCH_NOMATCH);
1483 break;
1484
1485 case PT_SC:
1486 if ((ecode[2] != script) == (op == OP_PROP))
1487 RRETURN(MATCH_NOMATCH);
1488 break;
1489
1490 default:
1491 RRETURN(PCRE_ERROR_INTERNAL);
1492 }
1493
1494 ecode += 3;
1495 }
1496 break;
1497
1498 /* Match an extended Unicode sequence. We will get here only if the support
1499 is in the binary; otherwise a compile-time error occurs. */
1500
1501 case OP_EXTUNI:
1502 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503 GETCHARINCTEST(c, eptr);
1504 {
1505 int chartype, script;
1506 int category = _pcre_ucp_findprop(c, &chartype, &script);
1507 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1508 while (eptr < md->end_subject)
1509 {
1510 int len = 1;
1511 if (!utf8) c = *eptr; else
1512 {
1513 GETCHARLEN(c, eptr, len);
1514 }
1515 category = _pcre_ucp_findprop(c, &chartype, &script);
1516 if (category != ucp_M) break;
1517 eptr += len;
1518 }
1519 }
1520 ecode++;
1521 break;
1522 #endif
1523
1524
1525 /* Match a back reference, possibly repeatedly. Look past the end of the
1526 item to see if there is repeat information following. The code is similar
1527 to that for character classes, but repeated for efficiency. Then obey
1528 similar code to character type repeats - written out again for speed.
1529 However, if the referenced string is the empty string, always treat
1530 it as matched, any number of times (otherwise there could be infinite
1531 loops). */
1532
1533 case OP_REF:
1534 {
1535 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1536 ecode += 3; /* Advance past item */
1537
1538 /* If the reference is unset, set the length to be longer than the amount
1539 of subject left; this ensures that every attempt at a match fails. We
1540 can't just fail here, because of the possibility of quantifiers with zero
1541 minima. */
1542
1543 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1544 md->end_subject - eptr + 1 :
1545 md->offset_vector[offset+1] - md->offset_vector[offset];
1546
1547 /* Set up for repetition, or handle the non-repeated case */
1548
1549 switch (*ecode)
1550 {
1551 case OP_CRSTAR:
1552 case OP_CRMINSTAR:
1553 case OP_CRPLUS:
1554 case OP_CRMINPLUS:
1555 case OP_CRQUERY:
1556 case OP_CRMINQUERY:
1557 c = *ecode++ - OP_CRSTAR;
1558 minimize = (c & 1) != 0;
1559 min = rep_min[c]; /* Pick up values from tables; */
1560 max = rep_max[c]; /* zero for max => infinity */
1561 if (max == 0) max = INT_MAX;
1562 break;
1563
1564 case OP_CRRANGE:
1565 case OP_CRMINRANGE:
1566 minimize = (*ecode == OP_CRMINRANGE);
1567 min = GET2(ecode, 1);
1568 max = GET2(ecode, 3);
1569 if (max == 0) max = INT_MAX;
1570 ecode += 5;
1571 break;
1572
1573 default: /* No repeat follows */
1574 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1575 eptr += length;
1576 continue; /* With the main loop */
1577 }
1578
1579 /* If the length of the reference is zero, just continue with the
1580 main loop. */
1581
1582 if (length == 0) continue;
1583
1584 /* First, ensure the minimum number of matches are present. We get back
1585 the length of the reference string explicitly rather than passing the
1586 address of eptr, so that eptr can be a register variable. */
1587
1588 for (i = 1; i <= min; i++)
1589 {
1590 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1591 eptr += length;
1592 }
1593
1594 /* If min = max, continue at the same level without recursion.
1595 They are not both allowed to be zero. */
1596
1597 if (min == max) continue;
1598
1599 /* If minimizing, keep trying and advancing the pointer */
1600
1601 if (minimize)
1602 {
1603 for (fi = min;; fi++)
1604 {
1605 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1606 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1607 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1608 RRETURN(MATCH_NOMATCH);
1609 eptr += length;
1610 }
1611 /* Control never gets here */
1612 }
1613
1614 /* If maximizing, find the longest string and work backwards */
1615
1616 else
1617 {
1618 pp = eptr;
1619 for (i = min; i < max; i++)
1620 {
1621 if (!match_ref(offset, eptr, length, md, ims)) break;
1622 eptr += length;
1623 }
1624 while (eptr >= pp)
1625 {
1626 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1627 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1628 eptr -= length;
1629 }
1630 RRETURN(MATCH_NOMATCH);
1631 }
1632 }
1633 /* Control never gets here */
1634
1635
1636
1637 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1638 used when all the characters in the class have values in the range 0-255,
1639 and either the matching is caseful, or the characters are in the range
1640 0-127 when UTF-8 processing is enabled. The only difference between
1641 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1642 encountered.
1643
1644 First, look past the end of the item to see if there is repeat information
1645 following. Then obey similar code to character type repeats - written out
1646 again for speed. */
1647
1648 case OP_NCLASS:
1649 case OP_CLASS:
1650 {
1651 data = ecode + 1; /* Save for matching */
1652 ecode += 33; /* Advance past the item */
1653
1654 switch (*ecode)
1655 {
1656 case OP_CRSTAR:
1657 case OP_CRMINSTAR:
1658 case OP_CRPLUS:
1659 case OP_CRMINPLUS:
1660 case OP_CRQUERY:
1661 case OP_CRMINQUERY:
1662 c = *ecode++ - OP_CRSTAR;
1663 minimize = (c & 1) != 0;
1664 min = rep_min[c]; /* Pick up values from tables; */
1665 max = rep_max[c]; /* zero for max => infinity */
1666 if (max == 0) max = INT_MAX;
1667 break;
1668
1669 case OP_CRRANGE:
1670 case OP_CRMINRANGE:
1671 minimize = (*ecode == OP_CRMINRANGE);
1672 min = GET2(ecode, 1);
1673 max = GET2(ecode, 3);
1674 if (max == 0) max = INT_MAX;
1675 ecode += 5;
1676 break;
1677
1678 default: /* No repeat follows */
1679 min = max = 1;
1680 break;
1681 }
1682
1683 /* First, ensure the minimum number of matches are present. */
1684
1685 #ifdef SUPPORT_UTF8
1686 /* UTF-8 mode */
1687 if (utf8)
1688 {
1689 for (i = 1; i <= min; i++)
1690 {
1691 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1692 GETCHARINC(c, eptr);
1693 if (c > 255)
1694 {
1695 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1696 }
1697 else
1698 {
1699 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1700 }
1701 }
1702 }
1703 else
1704 #endif
1705 /* Not UTF-8 mode */
1706 {
1707 for (i = 1; i <= min; i++)
1708 {
1709 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1710 c = *eptr++;
1711 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1712 }
1713 }
1714
1715 /* If max == min we can continue with the main loop without the
1716 need to recurse. */
1717
1718 if (min == max) continue;
1719
1720 /* If minimizing, keep testing the rest of the expression and advancing
1721 the pointer while it matches the class. */
1722
1723 if (minimize)
1724 {
1725 #ifdef SUPPORT_UTF8
1726 /* UTF-8 mode */
1727 if (utf8)
1728 {
1729 for (fi = min;; fi++)
1730 {
1731 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1732 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1733 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1734 GETCHARINC(c, eptr);
1735 if (c > 255)
1736 {
1737 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1738 }
1739 else
1740 {
1741 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1742 }
1743 }
1744 }
1745 else
1746 #endif
1747 /* Not UTF-8 mode */
1748 {
1749 for (fi = min;; fi++)
1750 {
1751 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1752 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1753 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1754 c = *eptr++;
1755 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1756 }
1757 }
1758 /* Control never gets here */
1759 }
1760
1761 /* If maximizing, find the longest possible run, then work backwards. */
1762
1763 else
1764 {
1765 pp = eptr;
1766
1767 #ifdef SUPPORT_UTF8
1768 /* UTF-8 mode */
1769 if (utf8)
1770 {
1771 for (i = min; i < max; i++)
1772 {
1773 int len = 1;
1774 if (eptr >= md->end_subject) break;
1775 GETCHARLEN(c, eptr, len);
1776 if (c > 255)
1777 {
1778 if (op == OP_CLASS) break;
1779 }
1780 else
1781 {
1782 if ((data[c/8] & (1 << (c&7))) == 0) break;
1783 }
1784 eptr += len;
1785 }
1786 for (;;)
1787 {
1788 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1789 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1790 if (eptr-- == pp) break; /* Stop if tried at original pos */
1791 BACKCHAR(eptr);
1792 }
1793 }
1794 else
1795 #endif
1796 /* Not UTF-8 mode */
1797 {
1798 for (i = min; i < max; i++)
1799 {
1800 if (eptr >= md->end_subject) break;
1801 c = *eptr;
1802 if ((data[c/8] & (1 << (c&7))) == 0) break;
1803 eptr++;
1804 }
1805 while (eptr >= pp)
1806 {
1807 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1808 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1809 eptr--;
1810 }
1811 }
1812
1813 RRETURN(MATCH_NOMATCH);
1814 }
1815 }
1816 /* Control never gets here */
1817
1818
1819 /* Match an extended character class. This opcode is encountered only
1820 in UTF-8 mode, because that's the only time it is compiled. */
1821
1822 #ifdef SUPPORT_UTF8
1823 case OP_XCLASS:
1824 {
1825 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1826 ecode += GET(ecode, 1); /* Advance past the item */
1827
1828 switch (*ecode)
1829 {
1830 case OP_CRSTAR:
1831 case OP_CRMINSTAR:
1832 case OP_CRPLUS:
1833 case OP_CRMINPLUS:
1834 case OP_CRQUERY:
1835 case OP_CRMINQUERY:
1836 c = *ecode++ - OP_CRSTAR;
1837 minimize = (c & 1) != 0;
1838 min = rep_min[c]; /* Pick up values from tables; */
1839 max = rep_max[c]; /* zero for max => infinity */
1840 if (max == 0) max = INT_MAX;
1841 break;
1842
1843 case OP_CRRANGE:
1844 case OP_CRMINRANGE:
1845 minimize = (*ecode == OP_CRMINRANGE);
1846 min = GET2(ecode, 1);
1847 max = GET2(ecode, 3);
1848 if (max == 0) max = INT_MAX;
1849 ecode += 5;
1850 break;
1851
1852 default: /* No repeat follows */
1853 min = max = 1;
1854 break;
1855 }
1856
1857 /* First, ensure the minimum number of matches are present. */
1858
1859 for (i = 1; i <= min; i++)
1860 {
1861 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1862 GETCHARINC(c, eptr);
1863 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1864 }
1865
1866 /* If max == min we can continue with the main loop without the
1867 need to recurse. */
1868
1869 if (min == max) continue;
1870
1871 /* If minimizing, keep testing the rest of the expression and advancing
1872 the pointer while it matches the class. */
1873
1874 if (minimize)
1875 {
1876 for (fi = min;; fi++)
1877 {
1878 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1879 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1880 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1881 GETCHARINC(c, eptr);
1882 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1883 }
1884 /* Control never gets here */
1885 }
1886
1887 /* If maximizing, find the longest possible run, then work backwards. */
1888
1889 else
1890 {
1891 pp = eptr;
1892 for (i = min; i < max; i++)
1893 {
1894 int len = 1;
1895 if (eptr >= md->end_subject) break;
1896 GETCHARLEN(c, eptr, len);
1897 if (!_pcre_xclass(c, data)) break;
1898 eptr += len;
1899 }
1900 for(;;)
1901 {
1902 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1903 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1904 if (eptr-- == pp) break; /* Stop if tried at original pos */
1905 BACKCHAR(eptr)
1906 }
1907 RRETURN(MATCH_NOMATCH);
1908 }
1909
1910 /* Control never gets here */
1911 }
1912 #endif /* End of XCLASS */
1913
1914 /* Match a single character, casefully */
1915
1916 case OP_CHAR:
1917 #ifdef SUPPORT_UTF8
1918 if (utf8)
1919 {
1920 length = 1;
1921 ecode++;
1922 GETCHARLEN(fc, ecode, length);
1923 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1924 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1925 }
1926 else
1927 #endif
1928
1929 /* Non-UTF-8 mode */
1930 {
1931 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1932 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1933 ecode += 2;
1934 }
1935 break;
1936
1937 /* Match a single character, caselessly */
1938
1939 case OP_CHARNC:
1940 #ifdef SUPPORT_UTF8
1941 if (utf8)
1942 {
1943 length = 1;
1944 ecode++;
1945 GETCHARLEN(fc, ecode, length);
1946
1947 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1948
1949 /* If the pattern character's value is < 128, we have only one byte, and
1950 can use the fast lookup table. */
1951
1952 if (fc < 128)
1953 {
1954 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1955 }
1956
1957 /* Otherwise we must pick up the subject character */
1958
1959 else
1960 {
1961 unsigned int dc;
1962 GETCHARINC(dc, eptr);
1963 ecode += length;
1964
1965 /* If we have Unicode property support, we can use it to test the other
1966 case of the character, if there is one. */
1967
1968 if (fc != dc)
1969 {
1970 #ifdef SUPPORT_UCP
1971 if (dc != _pcre_ucp_othercase(fc))
1972 #endif
1973 RRETURN(MATCH_NOMATCH);
1974 }
1975 }
1976 }
1977 else
1978 #endif /* SUPPORT_UTF8 */
1979
1980 /* Non-UTF-8 mode */
1981 {
1982 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1983 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1984 ecode += 2;
1985 }
1986 break;
1987
1988 /* Match a single character repeatedly. */
1989
1990 case OP_EXACT:
1991 min = max = GET2(ecode, 1);
1992 ecode += 3;
1993 goto REPEATCHAR;
1994
1995 case OP_POSUPTO:
1996 possessive = TRUE;
1997 /* Fall through */
1998
1999 case OP_UPTO:
2000 case OP_MINUPTO:
2001 min = 0;
2002 max = GET2(ecode, 1);
2003 minimize = *ecode == OP_MINUPTO;
2004 ecode += 3;
2005 goto REPEATCHAR;
2006
2007 case OP_POSSTAR:
2008 possessive = TRUE;
2009 min = 0;
2010 max = INT_MAX;
2011 ecode++;
2012 goto REPEATCHAR;
2013
2014 case OP_POSPLUS:
2015 possessive = TRUE;
2016 min = 1;
2017 max = INT_MAX;
2018 ecode++;
2019 goto REPEATCHAR;
2020
2021 case OP_POSQUERY:
2022 possessive = TRUE;
2023 min = 0;
2024 max = 1;
2025 ecode++;
2026 goto REPEATCHAR;
2027
2028 case OP_STAR:
2029 case OP_MINSTAR:
2030 case OP_PLUS:
2031 case OP_MINPLUS:
2032 case OP_QUERY:
2033 case OP_MINQUERY:
2034 c = *ecode++ - OP_STAR;
2035 minimize = (c & 1) != 0;
2036 min = rep_min[c]; /* Pick up values from tables; */
2037 max = rep_max[c]; /* zero for max => infinity */
2038 if (max == 0) max = INT_MAX;
2039
2040 /* Common code for all repeated single-character matches. We can give
2041 up quickly if there are fewer than the minimum number of characters left in
2042 the subject. */
2043
2044 REPEATCHAR:
2045 #ifdef SUPPORT_UTF8
2046 if (utf8)
2047 {
2048 length = 1;
2049 charptr = ecode;
2050 GETCHARLEN(fc, ecode, length);
2051 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2052 ecode += length;
2053
2054 /* Handle multibyte character matching specially here. There is
2055 support for caseless matching if UCP support is present. */
2056
2057 if (length > 1)
2058 {
2059 #ifdef SUPPORT_UCP
2060 unsigned int othercase;
2061 if ((ims & PCRE_CASELESS) != 0 &&
2062 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2063 oclength = _pcre_ord2utf8(othercase, occhars);
2064 else oclength = 0;
2065 #endif /* SUPPORT_UCP */
2066
2067 for (i = 1; i <= min; i++)
2068 {
2069 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2070 #ifdef SUPPORT_UCP
2071 /* Need braces because of following else */
2072 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2073 else
2074 {
2075 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2076 eptr += oclength;
2077 }
2078 #else /* without SUPPORT_UCP */
2079 else { RRETURN(MATCH_NOMATCH); }
2080 #endif /* SUPPORT_UCP */
2081 }
2082
2083 if (min == max) continue;
2084
2085 if (minimize)
2086 {
2087 for (fi = min;; fi++)
2088 {
2089 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2090 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2091 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2092 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2093 #ifdef SUPPORT_UCP
2094 /* Need braces because of following else */
2095 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2096 else
2097 {
2098 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2099 eptr += oclength;
2100 }
2101 #else /* without SUPPORT_UCP */
2102 else { RRETURN (MATCH_NOMATCH); }
2103 #endif /* SUPPORT_UCP */
2104 }
2105 /* Control never gets here */
2106 }
2107
2108 else /* Maximize */
2109 {
2110 pp = eptr;
2111 for (i = min; i < max; i++)
2112 {
2113 if (eptr > md->end_subject - length) break;
2114 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2115 #ifdef SUPPORT_UCP
2116 else if (oclength == 0) break;
2117 else
2118 {
2119 if (memcmp(eptr, occhars, oclength) != 0) break;
2120 eptr += oclength;
2121 }
2122 #else /* without SUPPORT_UCP */
2123 else break;
2124 #endif /* SUPPORT_UCP */
2125 }
2126
2127 if (possessive) continue;
2128 for(;;)
2129 {
2130 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2131 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2132 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2133 #ifdef SUPPORT_UCP
2134 eptr--;
2135 BACKCHAR(eptr);
2136 #else /* without SUPPORT_UCP */
2137 eptr -= length;
2138 #endif /* SUPPORT_UCP */
2139 }
2140 }
2141 /* Control never gets here */
2142 }
2143
2144 /* If the length of a UTF-8 character is 1, we fall through here, and
2145 obey the code as for non-UTF-8 characters below, though in this case the
2146 value of fc will always be < 128. */
2147 }
2148 else
2149 #endif /* SUPPORT_UTF8 */
2150
2151 /* When not in UTF-8 mode, load a single-byte character. */
2152 {
2153 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2154 fc = *ecode++;
2155 }
2156
2157 /* The value of fc at this point is always less than 256, though we may or
2158 may not be in UTF-8 mode. The code is duplicated for the caseless and
2159 caseful cases, for speed, since matching characters is likely to be quite
2160 common. First, ensure the minimum number of matches are present. If min =
2161 max, continue at the same level without recursing. Otherwise, if
2162 minimizing, keep trying the rest of the expression and advancing one
2163 matching character if failing, up to the maximum. Alternatively, if
2164 maximizing, find the maximum number of characters and work backwards. */
2165
2166 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2167 max, eptr));
2168
2169 if ((ims & PCRE_CASELESS) != 0)
2170 {
2171 fc = md->lcc[fc];
2172 for (i = 1; i <= min; i++)
2173 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2174 if (min == max) continue;
2175 if (minimize)
2176 {
2177 for (fi = min;; fi++)
2178 {
2179 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2180 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2181 if (fi >= max || eptr >= md->end_subject ||
2182 fc != md->lcc[*eptr++])
2183 RRETURN(MATCH_NOMATCH);
2184 }
2185 /* Control never gets here */
2186 }
2187 else /* Maximize */
2188 {
2189 pp = eptr;
2190 for (i = min; i < max; i++)
2191 {
2192 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2193 eptr++;
2194 }
2195 if (possessive) continue;
2196 while (eptr >= pp)
2197 {
2198 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2199 eptr--;
2200 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2201 }
2202 RRETURN(MATCH_NOMATCH);
2203 }
2204 /* Control never gets here */
2205 }
2206
2207 /* Caseful comparisons (single-byte only; multi-byte was handled above) */
2208
2209 else
2210 {
2211 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2212 if (min == max) continue;
2213 if (minimize)
2214 {
2215 for (fi = min;; fi++)
2216 {
2217 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2218 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2219 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2220 RRETURN(MATCH_NOMATCH);
2221 }
2222 /* Control never gets here */
2223 }
2224 else /* Maximize */
2225 {
2226 pp = eptr;
2227 for (i = min; i < max; i++)
2228 {
2229 if (eptr >= md->end_subject || fc != *eptr) break;
2230 eptr++;
2231 }
2232 if (possessive) continue;
2233 while (eptr >= pp)
2234 {
2235 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2236 eptr--;
2237 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2238 }
2239 RRETURN(MATCH_NOMATCH);
2240 }
2241 }
2242 /* Control never gets here */
2243
2244 /* Match a negated single one-byte pattern character. The subject
2245 character being tested against it can be multibyte. */
2246
2247 case OP_NOT:
2248 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2249 ecode++;
2250 GETCHARINCTEST(c, eptr);
2251 if ((ims & PCRE_CASELESS) != 0)
2252 {
2253 #ifdef SUPPORT_UTF8
2254 if (c < 256)
2255 #endif
2256 c = md->lcc[c];
2257 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2258 }
2259 else
2260 {
2261 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2262 }
2263 break;
2264
2265 /* Match a negated single one-byte character repeatedly. This is almost a
2266 repeat of the code for a repeated single character, but I haven't found a
2267 nice way of commoning these up that doesn't require a test of the
2268 positive/negative option for each character match. Maybe that wouldn't add
2269 very much to the time taken, but character matching *is* what this is all
2270 about... */
2271
2272 case OP_NOTEXACT:
2273 min = max = GET2(ecode, 1);
2274 ecode += 3;
2275 goto REPEATNOTCHAR;
2276
2277 case OP_NOTUPTO:
2278 case OP_NOTMINUPTO:
2279 min = 0;
2280 max = GET2(ecode, 1);
2281 minimize = *ecode == OP_NOTMINUPTO;
2282 ecode += 3;
2283 goto REPEATNOTCHAR;
2284
2285 case OP_NOTPOSSTAR:
2286 possessive = TRUE;
2287 min = 0;
2288 max = INT_MAX;
2289 ecode++;
2290 goto REPEATNOTCHAR;
2291
2292 case OP_NOTPOSPLUS:
2293 possessive = TRUE;
2294 min = 1;
2295 max = INT_MAX;
2296 ecode++;
2297 goto REPEATNOTCHAR;
2298
2299 case OP_NOTPOSQUERY:
2300 possessive = TRUE;
2301 min = 0;
2302 max = 1;
2303 ecode++;
2304 goto REPEATNOTCHAR;
2305
2306 case OP_NOTPOSUPTO:
2307 possessive = TRUE;
2308 min = 0;
2309 max = GET2(ecode, 1);
2310 ecode += 3;
2311 goto REPEATNOTCHAR;
2312
2313 case OP_NOTSTAR:
2314 case OP_NOTMINSTAR:
2315 case OP_NOTPLUS:
2316 case OP_NOTMINPLUS:
2317 case OP_NOTQUERY:
2318 case OP_NOTMINQUERY:
2319 c = *ecode++ - OP_NOTSTAR;
2320 minimize = (c & 1) != 0;
2321 min = rep_min[c]; /* Pick up values from tables; */
2322 max = rep_max[c]; /* zero for max => infinity */
2323 if (max == 0) max = INT_MAX;
2324
2325 /* Common code for all repeated single-byte matches. We can give up quickly
2326 if there are fewer than the minimum number of bytes left in the
2327 subject. */
2328
2329 REPEATNOTCHAR:
2330 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2331 fc = *ecode++;
2332
2333 /* The code is duplicated for the caseless and caseful cases, for speed,
2334 since matching characters is likely to be quite common. First, ensure the
2335 minimum number of matches are present. If min = max, continue at the same
2336 level without recursing. Otherwise, if minimizing, keep trying the rest of
2337 the expression and advancing one matching character if failing, up to the
2338 maximum. Alternatively, if maximizing, find the maximum number of
2339 characters and work backwards. */
2340
2341 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2342 max, eptr));
2343
2344 if ((ims & PCRE_CASELESS) != 0)
2345 {
2346 fc = md->lcc[fc];
2347
2348 #ifdef SUPPORT_UTF8
2349 /* UTF-8 mode */
2350 if (utf8)
2351 {
2352 register unsigned int d;
2353 for (i = 1; i <= min; i++)
2354 {
2355 GETCHARINC(d, eptr);
2356 if (d < 256) d = md->lcc[d];
2357 if (fc == d) RRETURN(MATCH_NOMATCH);
2358 }
2359 }
2360 else
2361 #endif
2362
2363 /* Not UTF-8 mode */
2364 {
2365 for (i = 1; i <= min; i++)
2366 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2367 }
2368
2369 if (min == max) continue;
2370
2371 if (minimize)
2372 {
2373 #ifdef SUPPORT_UTF8
2374 /* UTF-8 mode */
2375 if (utf8)
2376 {
2377 register unsigned int d;
2378 for (fi = min;; fi++)
2379 {
2380 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2381 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2382 GETCHARINC(d, eptr);
2383 if (d < 256) d = md->lcc[d];
2384 if (fi >= max || eptr >= md->end_subject || fc == d)
2385 RRETURN(MATCH_NOMATCH);
2386 }
2387 }
2388 else
2389 #endif
2390 /* Not UTF-8 mode */
2391 {
2392 for (fi = min;; fi++)
2393 {
2394 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2395 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2396 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2397 RRETURN(MATCH_NOMATCH);
2398 }
2399 }
2400 /* Control never gets here */
2401 }
2402
2403 /* Maximize case */
2404
2405 else
2406 {
2407 pp = eptr;
2408
2409 #ifdef SUPPORT_UTF8
2410 /* UTF-8 mode */
2411 if (utf8)
2412 {
2413 register unsigned int d;
2414 for (i = min; i < max; i++)
2415 {
2416 int len = 1;
2417 if (eptr >= md->end_subject) break;
2418 GETCHARLEN(d, eptr, len);
2419 if (d < 256) d = md->lcc[d];
2420 if (fc == d) break;
2421 eptr += len;
2422 }
2423 if (possessive) continue;
2424 for(;;)
2425 {
2426 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2427 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2428 if (eptr-- == pp) break; /* Stop if tried at original pos */
2429 BACKCHAR(eptr);
2430 }
2431 }
2432 else
2433 #endif
2434 /* Not UTF-8 mode */
2435 {
2436 for (i = min; i < max; i++)
2437 {
2438 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2439 eptr++;
2440 }
2441 if (possessive) continue;
2442 while (eptr >= pp)
2443 {
2444 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2445 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2446 eptr--;
2447 }
2448 }
2449
2450 RRETURN(MATCH_NOMATCH);
2451 }
2452 /* Control never gets here */
2453 }
2454
2455 /* Caseful comparisons */
2456
2457 else
2458 {
2459 #ifdef SUPPORT_UTF8
2460 /* UTF-8 mode */
2461 if (utf8)
2462 {
2463 register unsigned int d;
2464 for (i = 1; i <= min; i++)
2465 {
2466 GETCHARINC(d, eptr);
2467 if (fc == d) RRETURN(MATCH_NOMATCH);
2468 }
2469 }
2470 else
2471 #endif
2472 /* Not UTF-8 mode */
2473 {
2474 for (i = 1; i <= min; i++)
2475 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2476 }
2477
2478 if (min == max) continue;
2479
2480 if (minimize)
2481 {
2482 #ifdef SUPPORT_UTF8
2483 /* UTF-8 mode */
2484 if (utf8)
2485 {
2486 register unsigned int d;
2487 for (fi = min;; fi++)
2488 {
2489 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2490 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2491 GETCHARINC(d, eptr);
2492 if (fi >= max || eptr >= md->end_subject || fc == d)
2493 RRETURN(MATCH_NOMATCH);
2494 }
2495 }
2496 else
2497 #endif
2498 /* Not UTF-8 mode */
2499 {
2500 for (fi = min;; fi++)
2501 {
2502 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2503 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2504 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2505 RRETURN(MATCH_NOMATCH);
2506 }
2507 }
2508 /* Control never gets here */
2509 }
2510
2511 /* Maximize case */
2512
2513 else
2514 {
2515 pp = eptr;
2516
2517 #ifdef SUPPORT_UTF8
2518 /* UTF-8 mode */
2519 if (utf8)
2520 {
2521 register unsigned int d;
2522 for (i = min; i < max; i++)
2523 {
2524 int len = 1;
2525 if (eptr >= md->end_subject) break;
2526 GETCHARLEN(d, eptr, len);
2527 if (fc == d) break;
2528 eptr += len;
2529 }
2530 if (possessive) continue;
2531 for(;;)
2532 {
2533 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2534 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2535 if (eptr-- == pp) break; /* Stop if tried at original pos */
2536 BACKCHAR(eptr);
2537 }
2538 }
2539 else
2540 #endif
2541 /* Not UTF-8 mode */
2542 {
2543 for (i = min; i < max; i++)
2544 {
2545 if (eptr >= md->end_subject || fc == *eptr) break;
2546 eptr++;
2547 }
2548 if (possessive) continue;
2549 while (eptr >= pp)
2550 {
2551 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2552 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2553 eptr--;
2554 }
2555 }
2556
2557 RRETURN(MATCH_NOMATCH);
2558 }
2559 }
2560 /* Control never gets here */
2561
2562 /* Match a single character type repeatedly; several different opcodes
2563 share code. This is very similar to the code for single characters, but we
2564 repeat it in the interests of efficiency. */
2565
2566 case OP_TYPEEXACT:
2567 min = max = GET2(ecode, 1);
2568 minimize = TRUE;
2569 ecode += 3;
2570 goto REPEATTYPE;
2571
2572 case OP_TYPEUPTO:
2573 case OP_TYPEMINUPTO:
2574 min = 0;
2575 max = GET2(ecode, 1);
2576 minimize = *ecode == OP_TYPEMINUPTO;
2577 ecode += 3;
2578 goto REPEATTYPE;
2579
2580 case OP_TYPEPOSSTAR:
2581 possessive = TRUE;
2582 min = 0;
2583 max = INT_MAX;
2584 ecode++;
2585 goto REPEATTYPE;
2586
2587 case OP_TYPEPOSPLUS:
2588 possessive = TRUE;
2589 min = 1;
2590 max = INT_MAX;
2591 ecode++;
2592 goto REPEATTYPE;
2593
2594 case OP_TYPEPOSQUERY:
2595 possessive = TRUE;
2596 min = 0;
2597 max = 1;
2598 ecode++;
2599 goto REPEATTYPE;
2600
2601 case OP_TYPEPOSUPTO:
2602 possessive = TRUE;
2603 min = 0;
2604 max = GET2(ecode, 1);
2605 ecode += 3;
2606 goto REPEATTYPE;
2607
2608 case OP_TYPESTAR:
2609 case OP_TYPEMINSTAR:
2610 case OP_TYPEPLUS:
2611 case OP_TYPEMINPLUS:
2612 case OP_TYPEQUERY:
2613 case OP_TYPEMINQUERY:
2614 c = *ecode++ - OP_TYPESTAR;
2615 minimize = (c & 1) != 0;
2616 min = rep_min[c]; /* Pick up values from tables; */
2617 max = rep_max[c]; /* zero for max => infinity */
2618 if (max == 0) max = INT_MAX;
2619
2620 /* Common code for all repeated single character type matches. Note that
2621 in UTF-8 mode, '.' matches a character of any length, but for the other
2622 character types, the valid characters are all one-byte long. */
2623
2624 REPEATTYPE:
2625 ctype = *ecode++; /* Code for the character type */
2626
2627 #ifdef SUPPORT_UCP
2628 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2629 {
2630 prop_fail_result = ctype == OP_NOTPROP;
2631 prop_type = *ecode++;
2632 prop_value = *ecode++;
2633 }
2634 else prop_type = -1;
2635 #endif
2636
2637 /* First, ensure the minimum number of matches are present. Use inline
2638 code for maximizing the speed, and do the type test once at the start
2639 (i.e. keep it out of the loop). Also we can test that there are at least
2640 the minimum number of bytes before we start. This isn't as effective in
2641 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2642 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2643 and single-bytes. */
2644
2645 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2646 if (min > 0)
2647 {
2648 #ifdef SUPPORT_UCP
2649 if (prop_type >= 0)
2650 {
2651 switch(prop_type)
2652 {
2653 case PT_ANY:
2654 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2655 for (i = 1; i <= min; i++)
2656 {
2657 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2658 GETCHARINC(c, eptr);
2659 }
2660 break;
2661
2662 case PT_LAMP:
2663 for (i = 1; i <= min; i++)
2664 {
2665 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2666 GETCHARINC(c, eptr);
2667 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2668 if ((prop_chartype == ucp_Lu ||
2669 prop_chartype == ucp_Ll ||
2670 prop_chartype == ucp_Lt) == prop_fail_result)
2671 RRETURN(MATCH_NOMATCH);
2672 }
2673 break;
2674
2675 case PT_GC:
2676 for (i = 1; i <= min; i++)
2677 {
2678 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2679 GETCHARINC(c, eptr);
2680 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2681 if ((prop_category == prop_value) == prop_fail_result)
2682 RRETURN(MATCH_NOMATCH);
2683 }
2684 break;
2685
2686 case PT_PC:
2687 for (i = 1; i <= min; i++)
2688 {
2689 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2690 GETCHARINC(c, eptr);
2691 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2692 if ((prop_chartype == prop_value) == prop_fail_result)
2693 RRETURN(MATCH_NOMATCH);
2694 }
2695 break;
2696
2697 case PT_SC:
2698 for (i = 1; i <= min; i++)
2699 {
2700 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2701 GETCHARINC(c, eptr);
2702 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2703 if ((prop_script == prop_value) == prop_fail_result)
2704 RRETURN(MATCH_NOMATCH);
2705 }
2706 break;
2707
2708 default:
2709 RRETURN(PCRE_ERROR_INTERNAL);
2710 }
2711 }
2712
2713 /* Match extended Unicode sequences. We will get here only if the
2714 support is in the binary; otherwise a compile-time error occurs. */
2715
2716 else if (ctype == OP_EXTUNI)
2717 {
2718 for (i = 1; i <= min; i++)
2719 {
2720 GETCHARINCTEST(c, eptr);
2721 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2722 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2723 while (eptr < md->end_subject)
2724 {
2725 int len = 1;
2726 if (!utf8) c = *eptr; else
2727 {
2728 GETCHARLEN(c, eptr, len);
2729 }
2730 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2731 if (prop_category != ucp_M) break;
2732 eptr += len;
2733 }
2734 }
2735 }
2736
2737 else
2738 #endif /* SUPPORT_UCP */
2739
2740 /* Handle all other cases when the coding is UTF-8 */
2741
2742 #ifdef SUPPORT_UTF8
2743 if (utf8) switch(ctype)
2744 {
2745 case OP_ANY:
2746 for (i = 1; i <= min; i++)
2747 {
2748 if (eptr >= md->end_subject ||
2749 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2750 RRETURN(MATCH_NOMATCH);
2751 eptr++;
2752 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2753 }
2754 break;
2755
2756 case OP_ANYBYTE:
2757 eptr += min;
2758 break;
2759
2760 case OP_ANYNL:
2761 for (i = 1; i <= min; i++)
2762 {
2763 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2764 GETCHARINC(c, eptr);
2765 switch(c)
2766 {
2767 default: RRETURN(MATCH_NOMATCH);
2768 case 0x000d:
2769 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2770 break;
2771 case 0x000a:
2772 case 0x000b:
2773 case 0x000c:
2774 case 0x0085:
2775 case 0x2028:
2776 case 0x2029:
2777 break;
2778 }
2779 }
2780 break;
2781
2782 case OP_NOT_DIGIT:
2783 for (i = 1; i <= min; i++)
2784 {
2785 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2786 GETCHARINC(c, eptr);
2787 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2788 RRETURN(MATCH_NOMATCH);
2789 }
2790 break;
2791
2792 case OP_DIGIT:
2793 for (i = 1; i <= min; i++)
2794 {
2795 if (eptr >= md->end_subject ||
2796 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2797 RRETURN(MATCH_NOMATCH);
2798 /* No need to skip more bytes - we know it's a 1-byte character */
2799 }
2800 break;
2801
2802 case OP_NOT_WHITESPACE:
2803 for (i = 1; i <= min; i++)
2804 {
2805 if (eptr >= md->end_subject ||
2806 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2807 RRETURN(MATCH_NOMATCH);
2808 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2809 }
2810 break;
2811
2812 case OP_WHITESPACE:
2813 for (i = 1; i <= min; i++)
2814 {
2815 if (eptr >= md->end_subject ||
2816 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2817 RRETURN(MATCH_NOMATCH);
2818 /* No need to skip more bytes - we know it's a 1-byte character */
2819 }
2820 break;
2821
2822 case OP_NOT_WORDCHAR:
2823 for (i = 1; i <= min; i++)
2824 {
2825 if (eptr >= md->end_subject ||
2826 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2827 RRETURN(MATCH_NOMATCH);
2828 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2829 }
2830 break;
2831
2832 case OP_WORDCHAR:
2833 for (i = 1; i <= min; i++)
2834 {
2835 if (eptr >= md->end_subject ||
2836 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2837 RRETURN(MATCH_NOMATCH);
2838 /* No need to skip more bytes - we know it's a 1-byte character */
2839 }
2840 break;
2841
2842 default:
2843 RRETURN(PCRE_ERROR_INTERNAL);
2844 } /* End switch(ctype) */
2845
2846 else
2847 #endif /* SUPPORT_UTF8 */
2848
2849 /* Code for the non-UTF-8 case for minimum matching of operators other
2850 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2851 number of bytes present, as this was tested above. */
2852
2853 switch(ctype)
2854 {
2855 case OP_ANY:
2856 if ((ims & PCRE_DOTALL) == 0)
2857 {
2858 for (i = 1; i <= min; i++)
2859 {
2860 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2861 eptr++;
2862 }
2863 }
2864 else eptr += min;
2865 break;
2866
2867 case OP_ANYBYTE:
2868 eptr += min;
2869 break;
2870
2871 /* Because of the CRLF case, we can't assume the minimum number of
2872 bytes are present in this case. */
2873
2874 case OP_ANYNL:
2875 for (i = 1; i <= min; i++)
2876 {
2877 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2878 switch(*eptr++)
2879 {
2880 default: RRETURN(MATCH_NOMATCH);
2881 case 0x000d:
2882 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2883 break;
2884 case 0x000a:
2885 case 0x000b:
2886 case 0x000c:
2887 case 0x0085:
2888 break;
2889 }
2890 }
2891 break;
2892
2893 case OP_NOT_DIGIT:
2894 for (i = 1; i <= min; i++)
2895 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2896 break;
2897
2898 case OP_DIGIT:
2899 for (i = 1; i <= min; i++)
2900 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2901 break;
2902
2903 case OP_NOT_WHITESPACE:
2904 for (i = 1; i <= min; i++)
2905 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2906 break;
2907
2908 case OP_WHITESPACE:
2909 for (i = 1; i <= min; i++)
2910 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2911 break;
2912
2913 case OP_NOT_WORDCHAR:
2914 for (i = 1; i <= min; i++)
2915 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2916 RRETURN(MATCH_NOMATCH);
2917 break;
2918
2919 case OP_WORDCHAR:
2920 for (i = 1; i <= min; i++)
2921 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2922 RRETURN(MATCH_NOMATCH);
2923 break;
2924
2925 default:
2926 RRETURN(PCRE_ERROR_INTERNAL);
2927 }
2928 }
2929
2930 /* If min = max, continue at the same level without recursing */
2931
2932 if (min == max) continue;
2933
2934 /* If minimizing, we have to test the rest of the pattern before each
2935 subsequent match. Again, separate the UTF-8 case for speed, and also
2936 separate the UCP cases. */
2937
2938 if (minimize)
2939 {
2940 #ifdef SUPPORT_UCP
2941 if (prop_type >= 0)
2942 {
2943 switch(prop_type)
2944 {
2945 case PT_ANY:
2946 for (fi = min;; fi++)
2947 {
2948 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2949 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2950 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951 GETCHARINC(c, eptr);
2952 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2953 }
2954 /* Control never gets here */
2955
2956 case PT_LAMP:
2957 for (fi = min;; fi++)
2958 {
2959 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2960 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2961 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962 GETCHARINC(c, eptr);
2963 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2964 if ((prop_chartype == ucp_Lu ||
2965 prop_chartype == ucp_Ll ||
2966 prop_chartype == ucp_Lt) == prop_fail_result)
2967 RRETURN(MATCH_NOMATCH);
2968 }
2969 /* Control never gets here */
2970
2971 case PT_GC:
2972 for (fi = min;; fi++)
2973 {
2974 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2975 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2976 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2977 GETCHARINC(c, eptr);
2978 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2979 if ((prop_category == prop_value) == prop_fail_result)
2980 RRETURN(MATCH_NOMATCH);
2981 }
2982 /* Control never gets here */
2983
2984 case PT_PC:
2985 for (fi = min;; fi++)
2986 {
2987 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2989 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2990 GETCHARINC(c, eptr);
2991 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2992 if ((prop_chartype == prop_value) == prop_fail_result)
2993 RRETURN(MATCH_NOMATCH);
2994 }
2995 /* Control never gets here */
2996
2997 case PT_SC:
2998 for (fi = min;; fi++)
2999 {
3000 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3001 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3002 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3003 GETCHARINC(c, eptr);
3004 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3005 if ((prop_script == prop_value) == prop_fail_result)
3006 RRETURN(MATCH_NOMATCH);
3007 }
3008 /* Control never gets here */
3009
3010 default:
3011 RRETURN(PCRE_ERROR_INTERNAL);
3012 }
3013 }
3014
3015 /* Match extended Unicode sequences. We will get here only if the
3016 support is in the binary; otherwise a compile-time error occurs. */
3017
3018 else if (ctype == OP_EXTUNI)
3019 {
3020 for (fi = min;; fi++)
3021 {
3022 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3023 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3025 GETCHARINCTEST(c, eptr);
3026 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3027 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3028 while (eptr < md->end_subject)
3029 {
3030 int len = 1;
3031 if (!utf8) c = *eptr; else
3032 {
3033 GETCHARLEN(c, eptr, len);
3034 }
3035 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3036 if (prop_category != ucp_M) break;
3037 eptr += len;
3038 }
3039 }
3040 }
3041
3042 else
3043 #endif /* SUPPORT_UCP */
3044
3045 #ifdef SUPPORT_UTF8
3046 /* UTF-8 mode */
3047 if (utf8)
3048 {
3049 for (fi = min;; fi++)
3050 {
3051 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3052 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3053 if (fi >= max || eptr >= md->end_subject ||
3054 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3055 IS_NEWLINE(eptr)))
3056 RRETURN(MATCH_NOMATCH);
3057
3058 GETCHARINC(c, eptr);
3059 switch(ctype)
3060 {
3061 case OP_ANY: /* This is the DOTALL case */
3062 break;
3063
3064 case OP_ANYBYTE:
3065 break;
3066
3067 case OP_ANYNL:
3068 switch(c)
3069 {
3070 default: RRETURN(MATCH_NOMATCH);
3071 case 0x000d:
3072 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3073 break;
3074 case 0x000a:
3075 case 0x000b:
3076 case 0x000c:
3077 case 0x0085:
3078 case 0x2028:
3079 case 0x2029:
3080 break;
3081 }
3082 break;
3083
3084 case OP_NOT_DIGIT:
3085 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3086 RRETURN(MATCH_NOMATCH);
3087 break;
3088
3089 case OP_DIGIT:
3090 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3091 RRETURN(MATCH_NOMATCH);
3092 break;
3093
3094 case OP_NOT_WHITESPACE:
3095 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3096 RRETURN(MATCH_NOMATCH);
3097 break;
3098
3099 case OP_WHITESPACE:
3100 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3101 RRETURN(MATCH_NOMATCH);
3102 break;
3103
3104 case OP_NOT_WORDCHAR:
3105 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3106 RRETURN(MATCH_NOMATCH);
3107 break;
3108
3109 case OP_WORDCHAR:
3110 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3111 RRETURN(MATCH_NOMATCH);
3112 break;
3113
3114 default:
3115 RRETURN(PCRE_ERROR_INTERNAL);
3116 }
3117 }
3118 }
3119 else
3120 #endif
3121 /* Not UTF-8 mode */
3122 {
3123 for (fi = min;; fi++)
3124 {
3125 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3127 if (fi >= max || eptr >= md->end_subject ||
3128 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3129 RRETURN(MATCH_NOMATCH);
3130
3131 c = *eptr++;
3132 switch(ctype)
3133 {
3134 case OP_ANY: /* This is the DOTALL case */
3135 break;
3136
3137 case OP_ANYBYTE:
3138 break;
3139
3140 case OP_ANYNL:
3141 switch(c)
3142 {
3143 default: RRETURN(MATCH_NOMATCH);
3144 case 0x000d:
3145 if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3146 break;
3147 case 0x000a:
3148 case 0x000b:
3149 case 0x000c:
3150 case 0x0085:
3151 break;
3152 }
3153 break;
3154
3155 case OP_NOT_DIGIT:
3156 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3157 break;
3158
3159 case OP_DIGIT:
3160 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3161 break;
3162
3163 case OP_NOT_WHITESPACE:
3164 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3165 break;
3166
3167 case OP_WHITESPACE:
3168 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3169 break;
3170
3171 case OP_NOT_WORDCHAR:
3172 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3173 break;
3174
3175 case OP_WORDCHAR:
3176 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3177 break;
3178
3179 default:
3180 RRETURN(PCRE_ERROR_INTERNAL);
3181 }
3182 }
3183 }
3184 /* Control never gets here */
3185 }
3186
3187 /* If maximizing, it is worth using inline code for speed, doing the type
3188 test once at the start (i.e. keep it out of the loop). Again, keep the
3189 UTF-8 and UCP stuff separate. */
3190
3191 else
3192 {
3193 pp = eptr; /* Remember where we started */
3194
3195 #ifdef SUPPORT_UCP
3196 if (prop_type >= 0)
3197 {
3198 switch(prop_type)
3199 {
3200 case PT_ANY:
3201 for (i = min; i < max; i++)
3202 {
3203 int len = 1;
3204 if (eptr >= md->end_subject) break;
3205 GETCHARLEN(c, eptr, len);
3206 if (prop_fail_result) break;
3207 eptr+= len;
3208 }
3209 break;
3210
3211 case PT_LAMP:
3212 for (i = min; i < max; i++)
3213 {
3214 int len = 1;
3215 if (eptr >= md->end_subject) break;
3216 GETCHARLEN(c, eptr, len);
3217 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3218 if ((prop_chartype == ucp_Lu ||
3219 prop_chartype == ucp_Ll ||
3220 prop_chartype == ucp_Lt) == prop_fail_result)
3221 break;
3222 eptr+= len;
3223 }
3224 break;
3225
3226 case PT_GC:
3227 for (i = min; i < max; i++)
3228 {
3229 int len = 1;
3230 if (eptr >= md->end_subject) break;
3231 GETCHARLEN(c, eptr, len);
3232 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3233 if ((prop_category == prop_value) == prop_fail_result)
3234 break;
3235 eptr+= len;
3236 }
3237 break;
3238
3239 case PT_PC:
3240 for (i = min; i < max; i++)
3241 {
3242 int len = 1;
3243 if (eptr >= md->end_subject) break;
3244 GETCHARLEN(c, eptr, len);
3245 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3246 if ((prop_chartype == prop_value) == prop_fail_result)
3247 break;
3248 eptr+= len;
3249 }
3250 break;
3251
3252 case PT_SC:
3253 for (i = min; i < max; i++)
3254 {
3255 int len = 1;
3256 if (eptr >= md->end_subject) break;
3257 GETCHARLEN(c, eptr, len);
3258 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3259 if ((prop_script == prop_value) == prop_fail_result)
3260 break;
3261 eptr+= len;
3262 }
3263 break;
3264 }
3265
3266 /* eptr is now past the end of the maximum run */
3267
3268 if (possessive) continue;
3269 for(;;)
3270 {
3271 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3272 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3273 if (eptr-- == pp) break; /* Stop if tried at original pos */
3274 BACKCHAR(eptr);
3275 }
3276 }
3277
3278 /* Match extended Unicode sequences. We will get here only if the
3279 support is in the binary; otherwise a compile-time error occurs. */
3280
3281 else if (ctype == OP_EXTUNI)
3282 {
3283 for (i = min; i < max; i++)
3284 {
3285 if (eptr >= md->end_subject) break;
3286 GETCHARINCTEST(c, eptr);
3287 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3288 if (prop_category == ucp_M) break;
3289 while (eptr < md->end_subject)
3290 {
3291 int len = 1;
3292 if (!utf8) c = *eptr; else
3293 {
3294 GETCHARLEN(c, eptr, len);
3295 }
3296 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3297 if (prop_category != ucp_M) break;
3298 eptr += len;
3299 }
3300 }
3301
3302 /* eptr is now past the end of the maximum run */
3303
3304 if (possessive) continue;
3305 for(;;)
3306 {
3307 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3308 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3309 if (eptr-- == pp) break; /* Stop if tried at original pos */
3310 for (;;) /* Move back over one extended */
3311 {
3312 int len = 1;
3313 BACKCHAR(eptr);
3314 if (!utf8) c = *eptr; else
3315 {
3316 GETCHARLEN(c, eptr, len);
3317 }
3318 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3319 if (prop_category != ucp_M) break;
3320 eptr--;
3321 }
3322 }
3323 }
3324
3325 else
3326 #endif /* SUPPORT_UCP */
3327
3328 #ifdef SUPPORT_UTF8
3329 /* UTF-8 mode */
3330
3331 if (utf8)
3332 {
3333 switch(ctype)
3334 {
3335 case OP_ANY:
3336
3337 /* Special code is required for UTF8, but when the maximum is
3338 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3339 probably worth it, because .* is quite a common idiom. */
3340
3341 if (max < INT_MAX)
3342 {
3343 if ((ims & PCRE_DOTALL) == 0)
3344 {
3345 for (i = min; i < max; i++)
3346 {
3347 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3348 eptr++;
3349 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3350 }
3351 }
3352 else
3353 {
3354 for (i = min; i < max; i++)
3355 {
3356 if (eptr >= md->end_subject) break;
3357 eptr++;
3358 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3359 }
3360 }
3361 }
3362
3363 /* Handle unlimited UTF-8 repeat */
3364
3365 else
3366 {
3367 if ((ims & PCRE_DOTALL) == 0)
3368 {
3369 for (i = min; i < max; i++)
3370 {
3371 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3372 eptr++;
3373 }
3374 break;
3375 }
3376 else
3377 {
3378 c = max - min;
3379 if (c > (unsigned int)(md->end_subject - eptr))
3380 c = md->end_subject - eptr;
3381 eptr += c;
3382 }
3383 }
3384 break;
3385
3386 /* The byte case is the same as non-UTF8 */
3387
3388 case OP_ANYBYTE:
3389 c = max - min;
3390 if (c > (unsigned int)(md->end_subject - eptr))
3391 c = md->end_subject - eptr;
3392 eptr += c;
3393 break;
3394
3395 case OP_ANYNL:
3396 for (i = min; i < max; i++)
3397 {
3398 int len = 1;
3399 if (eptr >= md->end_subject) break;
3400 GETCHARLEN(c, eptr, len);
3401 if (c == 0x000d)
3402 {
3403 if (++eptr >= md->end_subject) break;
3404 if (*eptr == 0x000a) eptr++;
3405 }
3406 else
3407 {
3408 if (c != 0x000a && c != 0x000b && c != 0x000c &&
3409 c != 0x0085 && c != 0x2028 && c != 0x2029)
3410 break;
3411 eptr += len;
3412 }
3413 }
3414 break;
3415
3416 case OP_NOT_DIGIT:
3417 for (i = min; i < max; i++)
3418 {
3419 int len = 1;
3420 if (eptr >= md->end_subject) break;
3421 GETCHARLEN(c, eptr, len);
3422 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3423 eptr+= len;
3424 }
3425 break;
3426
3427 case OP_DIGIT:
3428 for (i = min; i < max; i++)
3429 {
3430 int len = 1;
3431 if (eptr >= md->end_subject) break;
3432 GETCHARLEN(c, eptr, len);
3433 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3434 eptr+= len;
3435 }
3436 break;
3437
3438 case OP_NOT_WHITESPACE:
3439 for (i = min; i < max; i++)
3440 {
3441 int len = 1;
3442 if (eptr >= md->end_subject) break;
3443 GETCHARLEN(c, eptr, len);
3444 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3445 eptr+= len;
3446 }
3447 break;
3448
3449 case OP_WHITESPACE:
3450 for (i = min; i < max; i++)
3451 {
3452 int len = 1;
3453 if (eptr >= md->end_subject) break;
3454 GETCHARLEN(c, eptr, len);
3455 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3456 eptr+= len;
3457 }
3458 break;
3459
3460 case OP_NOT_WORDCHAR:
3461 for (i = min; i < max; i++)
3462 {
3463 int len = 1;
3464 if (eptr >= md->end_subject) break;
3465 GETCHARLEN(c, eptr, len);
3466 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3467 eptr+= len;
3468 }
3469 break;
3470
3471 case OP_WORDCHAR:
3472 for (i = min; i < max; i++)
3473 {
3474 int len = 1;
3475 if (eptr >= md->end_subject) break;
3476 GETCHARLEN(c, eptr, len);
3477 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3478 eptr+= len;
3479 }
3480 break;
3481
3482 default:
3483 RRETURN(PCRE_ERROR_INTERNAL);
3484 }
3485
3486 /* eptr is now past the end of the maximum run */
3487
3488 if (possessive) continue;
3489 for(;;)
3490 {
3491 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3492 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3493 if (eptr-- == pp) break; /* Stop if tried at original pos */
3494 BACKCHAR(eptr);
3495 }
3496 }
3497 else
3498 #endif
3499
3500 /* Not UTF-8 mode */
3501 {
3502 switch(ctype)
3503 {
3504 case OP_ANY:
3505 if ((ims & PCRE_DOTALL) == 0)
3506 {
3507 for (i = min; i < max; i++)
3508 {
3509 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3510 eptr++;
3511 }
3512 break;
3513 }
3514 /* For DOTALL case, fall through and treat as \C */
3515
3516 case OP_ANYBYTE:
3517 c = max - min;
3518 if (c > (unsigned int)(md->end_subject - eptr))
3519 c = md->end_subject - eptr;
3520 eptr += c;
3521 break;
3522
3523 case OP_ANYNL:
3524 for (i = min; i < max; i++)
3525 {
3526 if (eptr >= md->end_subject) break;
3527 c = *eptr;
3528 if (c == 0x000d)
3529 {
3530 if (++eptr >= md->end_subject) break;
3531 if (*eptr == 0x000a) eptr++;
3532 }
3533 else
3534 {
3535 if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3536 break;
3537 eptr++;
3538 }
3539 }
3540 break;
3541
3542 case OP_NOT_DIGIT:
3543 for (i = min; i < max; i++)
3544 {
3545 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3546 break;
3547 eptr++;
3548 }
3549 break;
3550
3551 case OP_DIGIT:
3552 for (i = min; i < max; i++)
3553 {
3554 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3555 break;
3556 eptr++;
3557 }
3558 break;
3559
3560 case OP_NOT_WHITESPACE:
3561 for (i = min; i < max; i++)
3562 {
3563 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3564 break;
3565 eptr++;
3566 }
3567 break;
3568
3569 case OP_WHITESPACE:
3570 for (i = min; i < max; i++)
3571 {
3572 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3573 break;
3574 eptr++;
3575 }
3576 break;
3577
3578 case OP_NOT_WORDCHAR:
3579 for (i = min; i < max; i++)
3580 {
3581 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3582 break;
3583 eptr++;
3584 }
3585 break;
3586
3587 case OP_WORDCHAR:
3588 for (i = min; i < max; i++)
3589 {
3590 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3591 break;
3592 eptr++;
3593 }
3594 break;
3595
3596 default:
3597 RRETURN(PCRE_ERROR_INTERNAL);
3598 }
3599
3600 /* eptr is now past the end of the maximum run */
3601
3602 if (possessive) continue;
3603 while (eptr >= pp)
3604 {
3605 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3606 eptr--;
3607 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3608 }
3609 }
3610
3611 /* Get here if we can't make it match with any permitted repetitions */
3612
3613 RRETURN(MATCH_NOMATCH);
3614 }
3615 /* Control never gets here */
3616
3617 /* There's been some horrible disaster. Arrival here can only mean there is
3618 something seriously wrong in the code above or the OP_xxx definitions. */
3619
3620 default:
3621 DPRINTF(("Unknown opcode %d\n", *ecode));
3622 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3623 }
3624
3625 /* Do not stick any code in here without much thought; it is assumed
3626 that "continue" in the code above comes out to here to repeat the main
3627 loop. */
3628
3629 } /* End of main loop */
3630 /* Control never reaches here */
3631 }
3632
3633
3634 /***************************************************************************
3635 ****************************************************************************
3636 RECURSION IN THE match() FUNCTION
3637
3638 Undefine all the macros that were defined above to handle this. */
3639
/* When match() is built without true recursion (NO_RECURSE), its parameters
and local variables are accessed through macros. Remove every one of those
macro names here so that they cannot leak into the code that follows. The
order of the #undef directives is immaterial. */

#ifdef NO_RECURSE

/* Names standing in for the arguments of match() */

#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef offset_top

/* Names standing in for pointer-valued working variables */

#undef callpat
#undef charptr
#undef data
#undef newptrb
#undef next
#undef pp
#undef prev
#undef saved_eptr

/* Names standing in for the remaining working variables */

#undef condition
#undef ctype
#undef cur_is_word
#undef length
#undef max
#undef min
#undef new_recursive
#undef number
#undef offset
#undef op
#undef original_ims
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef stacksave

#endif  /* NO_RECURSE */

/* fc and fi exist as macros whether or not NO_RECURSE is defined, so they
are always removed. */

#undef fi
#undef fc
3685
3686 /***************************************************************************
3687 ***************************************************************************/
3688
3689
3690
3691 /*************************************************
3692 * Execute a Regular Expression *
3693 *************************************************/
3694
3695 /* This function applies a compiled re to a subject string and picks out
3696 portions of the string if it matches. Two elements in the vector are set for
3697 each substring: the offsets to the start and end of the substring.
3698
3699 Arguments:
3700 argument_re points to the compiled expression
3701 extra_data points to extra data or is NULL
3702 subject points to the subject string
3703 length length of subject string (may contain binary zeros)
3704 start_offset where to start in the subject string
3705 options option bits
3706 offsets points to a vector of ints to be filled in with offsets
3707 offsetcount the number of elements in the vector
3708
3709 Returns: > 0 => success; value is the number of elements filled in
3710 = 0 => success, but offsets is not big enough
3711 -1 => failed to match
3712 < -1 => some kind of unexpected problem
3713 */
3714
3715 PCRE_EXP_DEFN int
3716 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3717 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3718 int offsetcount)
3719 {
3720 int rc, resetcount, ocount;
3721 int first_byte = -1;
3722 int req_byte = -1;
3723 int req_byte2 = -1;
3724 int newline;
3725 unsigned long int ims;
3726 BOOL using_temporary_offsets = FALSE;
3727 BOOL anchored;
3728 BOOL startline;
3729 BOOL firstline;
3730 BOOL first_byte_caseless = FALSE;
3731 BOOL req_byte_caseless = FALSE;
3732 BOOL utf8;
3733 match_data match_block;
3734 match_data *md = &match_block;
3735 const uschar *tables;
3736 const uschar *start_bits = NULL;
3737 USPTR start_match = (USPTR)subject + start_offset;
3738 USPTR end_subject;
3739 USPTR req_byte_ptr = start_match - 1;
3740 eptrblock eptrchain[EPTR_WORK_SIZE];
3741
3742 pcre_study_data internal_study;
3743 const pcre_study_data *study;
3744
3745 real_pcre internal_re;
3746 const real_pcre *external_re = (const real_pcre *)argument_re;
3747 const real_pcre *re = external_re;
3748
3749 /* Plausibility checks */
3750
3751 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3752 if (re == NULL || subject == NULL ||
3753 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3754 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3755
3756 /* Fish out the optional data from the extra_data structure, first setting
3757 the default values. */
3758
3759 study = NULL;
3760 md->match_limit = MATCH_LIMIT;
3761 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3762 md->callout_data = NULL;
3763
3764 /* The table pointer is always in native byte order. */
3765
3766 tables = external_re->tables;
3767
3768 if (extra_data != NULL)
3769 {
3770 register unsigned int flags = extra_data->flags;
3771 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3772 study = (const pcre_study_data *)extra_data->study_data;
3773 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3774 md->match_limit = extra_data->match_limit;
3775 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3776 md->match_limit_recursion = extra_data->match_limit_recursion;
3777 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3778 md->callout_data = extra_data->callout_data;
3779 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3780 }
3781
3782 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3783 is a feature that makes it possible to save compiled regex and re-use them
3784 in other programs later. */
3785
3786 if (tables == NULL) tables = _pcre_default_tables;
3787
3788 /* Check that the first field in the block is the magic number. If it is not,
3789 test for a regex that was compiled on a host of opposite endianness. If this is
3790 the case, flipped values are put in internal_re and internal_study if there was
3791 study data too. */
3792
3793 if (re->magic_number != MAGIC_NUMBER)
3794 {
3795 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3796 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3797 if (study != NULL) study = &internal_study;
3798 }
3799
3800 /* Set up other data */
3801
3802 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3803 startline = (re->options & PCRE_STARTLINE) != 0;
3804 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3805
3806 /* The code starts after the real_pcre block and the capture name table. */
3807
3808 md->start_code = (const uschar *)external_re + re->name_table_offset +
3809 re->name_count * re->name_entry_size;
3810
3811 md->start_subject = (USPTR)subject;
3812 md->start_offset = start_offset;
3813 md->end_subject = md->start_subject + length;
3814 end_subject = md->end_subject;
3815
3816 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3817 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3818
3819 md->notbol = (options & PCRE_NOTBOL) != 0;
3820 md->noteol = (options & PCRE_NOTEOL) != 0;
3821 md->notempty = (options & PCRE_NOTEMPTY) != 0;
3822 md->partial = (options & PCRE_PARTIAL) != 0;
3823 md->hitend = FALSE;
3824
3825 md->recursive = NULL; /* No recursion at top level */
3826 md->eptrchain = eptrchain; /* Make workspace generally available */
3827
3828 md->lcc = tables + lcc_offset;
3829 md->ctypes = tables + ctypes_offset;
3830
3831 /* Handle different types of newline. The three bits give eight cases. If
3832 nothing is set at run time, whatever was used at compile time applies. */
3833
3834 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3835 PCRE_NEWLINE_BITS)
3836 {
3837 case 0: newline = NEWLINE; break; /* Compile-time default */
3838 case PCRE_NEWLINE_CR: newline = '\r'; break;
3839 case PCRE_NEWLINE_LF: newline = '\n'; break;
3840 case PCRE_NEWLINE_CR+
3841 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3842 case PCRE_NEWLINE_ANY: newline = -1; break;
3843 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3844 default: return PCRE_ERROR_BADNEWLINE;
3845 }
3846
3847 if (newline == -2)
3848 {
3849 md->nltype = NLTYPE_ANYCRLF;
3850 }
3851 else if (newline < 0)
3852 {
3853 md->nltype = NLTYPE_ANY;
3854 }
3855 else
3856 {
3857 md->nltype = NLTYPE_FIXED;
3858 if (newline > 255)
3859 {
3860 md->nllen = 2;
3861 md->nl[0] = (newline >> 8) & 255;
3862 md->nl[1] = newline & 255;
3863 }
3864 else
3865 {
3866 md->nllen = 1;
3867 md->nl[0] = newline;
3868 }
3869 }
3870
3871 /* Partial matching is supported only for a restricted set of regexes at the
3872 moment. */
3873
3874 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3875 return PCRE_ERROR_BADPARTIAL;
3876
3877 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3878 back the character offset. */
3879
3880 #ifdef SUPPORT_UTF8
3881 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3882 {
3883 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3884 return PCRE_ERROR_BADUTF8;
3885 if (start_offset > 0 && start_offset < length)
3886 {
3887 int tb = ((uschar *)subject)[start_offset];
3888 if (tb > 127)
3889 {
3890 tb &= 0xc0;
3891 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3892 }
3893 }
3894 }
3895 #endif
3896
3897 /* The ims options can vary during the matching as a result of the presence
3898 of (?ims) items in the pattern. They are kept in a local variable so that
3899 restoring at the exit of a group is easy. */
3900
3901 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3902
3903 /* If the expression has got more back references than the offsets supplied can
3904 hold, we get a temporary chunk of working store to use during the matching.
3905 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3906 of 3. */
3907
3908 ocount = offsetcount - (offsetcount % 3);
3909
3910 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3911 {
3912 ocount = re->top_backref * 3 + 3;
3913 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3914 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3915 using_temporary_offsets = TRUE;
3916 DPRINTF(("Got memory to hold back references\n"));
3917 }
3918 else md->offset_vector = offsets;
3919
3920 md->offset_end = ocount;
3921 md->offset_max = (2*ocount)/3;
3922 md->offset_overflow = FALSE;
3923 md->capture_last = -1;
3924
3925 /* Compute the minimum number of offsets that we need to reset each time. Doing
3926 this makes a huge difference to execution time when there aren't many brackets
3927 in the pattern. */
3928
3929 resetcount = 2 + re->top_bracket * 2;
3930 if (resetcount > offsetcount) resetcount = ocount;
3931
3932 /* Reset the working variable associated with each extraction. These should
3933 never be used unless previously set, but they get saved and restored, and so we
3934 initialize them to avoid reading uninitialized locations. */
3935
3936 if (md->offset_vector != NULL)
3937 {
3938 register int *iptr = md->offset_vector + ocount;
3939 register int *iend = iptr - resetcount/2 + 1;
3940 while (--iptr >= iend) *iptr = -1;
3941 }
3942
3943 /* Set up the first character to match, if available. The first_byte value is
3944 never set for an anchored regular expression, but the anchoring may be forced
3945 at run time, so we have to test for anchoring. The first char may be unset for
3946 an unanchored pattern, of course. If there's no first char and the pattern was
3947 studied, there may be a bitmap of possible first characters. */
3948
3949 if (!anchored)
3950 {
3951 if ((re->options & PCRE_FIRSTSET) != 0)
3952 {
3953 first_byte = re->first_byte & 255;
3954 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3955 first_byte = md->lcc[first_byte];
3956 }
3957 else
3958 if (!startline && study != NULL &&
3959 (study->options & PCRE_STUDY_MAPPED) != 0)
3960 start_bits = study->start_bits;
3961 }
3962
3963 /* For anchored or unanchored matches, there may be a "last known required
3964 character" set. */
3965
3966 if ((re->options & PCRE_REQCHSET) != 0)
3967 {
3968 req_byte = re->req_byte & 255;
3969 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3970 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3971 }
3972
3973
3974 /* ==========================================================================*/
3975
3976 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3977 the loop runs just once. */
3978
3979 for(;;)
3980 {
3981 USPTR save_end_subject = end_subject;
3982
3983 /* Reset the maximum number of extractions we might see. */
3984
3985 if (md->offset_vector != NULL)
3986 {
3987 register int *iptr = md->offset_vector;
3988 register int *iend = iptr + resetcount;
3989 while (iptr < iend) *iptr++ = -1;
3990 }
3991
3992 /* Advance to a unique first char if possible. If firstline is TRUE, the
3993 start of the match is constrained to the first line of a multiline string.
3994 That is, the match must be before or at the first newline. Implement this by
3995 temporarily adjusting end_subject so that we stop scanning at a newline. If
3996 the match fails at the newline, later code breaks this loop. */
3997
3998 if (firstline)
3999 {
4000 USPTR t = start_match;
4001 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4002 end_subject = t;
4003 }
4004
4005 /* Now test for a unique first byte */
4006
4007 if (first_byte >= 0)
4008 {
4009 if (first_byte_caseless)
4010 while (start_match < end_subject &&
4011 md->lcc[*start_match] != first_byte)
4012 start_match++;
4013 else
4014 while (start_match < end_subject && *start_match != first_byte)
4015 start_match++;
4016 }
4017
4018 /* Or to just after a linebreak for a multiline match if possible */
4019
4020 else if (startline)
4021 {
4022 if (start_match > md->start_subject + start_offset)
4023 {
4024 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4025 start_match++;
4026
4027 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4028 and we are now at a LF, advance the match position by one more character.
4029 */
4030
4031 if (start_match[-1] == '\r' &&
4032 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4033 start_match < end_subject &&
4034 *start_match == '\n')
4035 start_match++;
4036 }
4037 }
4038
4039 /* Or to a non-unique first char after study */
4040
4041 else if (start_bits != NULL)
4042 {
4043 while (start_match < end_subject)
4044 {
4045 register unsigned int c = *start_match;
4046 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4047 }
4048 }
4049
4050 /* Restore fudged end_subject */
4051
4052 end_subject = save_end_subject;
4053
4054 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4055 printf(">>>> Match against: ");
4056 pchars(start_match, end_subject - start_match, TRUE, md);
4057 printf("\n");
4058 #endif
4059
4060 /* If req_byte is set, we know that that character must appear in the subject
4061 for the match to succeed. If the first character is set, req_byte must be
4062 later in the subject; otherwise the test starts at the match point. This
4063 optimization can save a huge amount of backtracking in patterns with nested
4064 unlimited repeats that aren't going to match. Writing separate code for
4065 cased/caseless versions makes it go faster, as does using an autoincrement
4066 and backing off on a match.
4067
4068 HOWEVER: when the subject string is very, very long, searching to its end can
4069 take a long time, and give bad performance on quite ordinary patterns. This
4070 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4071 string... so we don't do this when the string is sufficiently long.
4072
4073 ALSO: this processing is disabled when partial matching is requested.
4074 */
4075
4076 if (req_byte >= 0 &&
4077 end_subject - start_match < REQ_BYTE_MAX &&
4078 !md->partial)
4079 {
4080 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4081
4082 /* We don't need to repeat the search if we haven't yet reached the
4083 place we found it at last time. */
4084
4085 if (p > req_byte_ptr)
4086 {
4087 if (req_byte_caseless)
4088 {
4089 while (p < end_subject)
4090 {
4091 register int pp = *p++;
4092 if (pp == req_byte || pp == req_byte2) { p--; break; }
4093 }
4094 }
4095 else
4096 {
4097 while (p < end_subject)
4098 {
4099 if (*p++ == req_byte) { p--; break; }
4100 }
4101 }
4102
4103 /* If we can't find the required character, break the matching loop,
4104 forcing a match failure. */
4105
4106 if (p >= end_subject)
4107 {
4108 rc = MATCH_NOMATCH;
4109 break;
4110 }
4111
4112 /* If we have found the required character, save the point where we
4113 found it, so that we don't search again next time round the loop if
4114 the start hasn't passed this character yet. */
4115
4116 req_byte_ptr = p;
4117 }
4118 }
4119
4120 /* OK, we can now run the match. */
4121
4122 md->start_match = start_match;
4123 md->match_call_count = 0;
4124 md->eptrn = 0; /* Next free eptrchain slot */
4125 rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4126
4127 /* Any return other than MATCH_NOMATCH breaks the loop. */
4128
4129 if (rc != MATCH_NOMATCH) break;
4130
4131 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4132 newline in the subject (though it may continue over the newline). Therefore,
4133 if we have just failed to match, starting at a newline, do not continue. */
4134
4135 if (firstline && IS_NEWLINE(start_match)) break;
4136
4137 /* Advance the match position by one character. */
4138
4139 start_match++;
4140 #ifdef SUPPORT_UTF8
4141 if (utf8)
4142 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4143 start_match++;
4144 #endif
4145
4146 /* Break the loop if the pattern is anchored or if we have passed the end of
4147 the subject. */
4148
4149 if (anchored || start_match > end_subject) break;
4150
4151 /* If we have just passed a CR and the newline option is CRLF or ANY or
4152 ANYCRLF, and we are now at a LF, advance the match position by one more
4153 character. */
4154
4155 if (start_match[-1] == '\r' &&
4156 (md->nltype == NLTYPE_ANY ||
4157 md->nltype == NLTYPE_ANYCRLF ||
4158 md->nllen == 2) &&
4159 start_match < end_subject &&
4160 *start_match == '\n')
4161 start_match++;
4162
4163 } /* End of for(;;) "bumpalong" loop */
4164
4165 /* ==========================================================================*/
4166
4167 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4168 conditions is true:
4169
4170 (1) The pattern is anchored;
4171
4172 (2) We are past the end of the subject;
4173
4174 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4175 this option requests that a match occur at or before the first newline in
4176 the subject.
4177
4178 When we have a match and the offset vector is big enough to deal with any
4179 backreferences, captured substring offsets will already be set up. In the case
4180 where we had to get some local store to hold offsets for backreference
4181 processing, copy those that we can. In this case there need not be overflow if
4182 certain parts of the pattern were not used, even though there are more
4183 capturing parentheses than vector slots. */
4184
4185 if (rc == MATCH_MATCH)
4186 {
4187 if (using_temporary_offsets)
4188 {
4189 if (offsetcount >= 4)
4190 {
4191 memcpy(offsets + 2, md->offset_vector + 2,
4192 (offsetcount - 2) * sizeof(int));
4193 DPRINTF(("Copied offsets from temporary memory\n"));
4194 }
4195 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4196 DPRINTF(("Freeing temporary memory\n"));
4197 (pcre_free)(md->offset_vector);
4198 }
4199
4200 /* Set the return code to the number of captured strings, or 0 if there are
4201 too many to fit into the vector. */
4202
4203 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4204
4205 /* If there is space, set up the whole thing as substring 0. */
4206
4207 if (offsetcount < 2) rc = 0; else
4208 {
4209 offsets[0] = start_match - md->start_subject;
4210 offsets[1] = md->end_match_ptr - md->start_subject;
4211 }
4212
4213 DPRINTF((">>>> returning %d\n", rc));
4214 return rc;
4215 }
4216
4217 /* Control gets here if there has been an error, or if the overall match
4218 attempt has failed at all permitted starting positions. */
4219
4220 if (using_temporary_offsets)
4221 {
4222 DPRINTF(("Freeing temporary memory\n"));
4223 (pcre_free)(md->offset_vector);
4224 }
4225
4226 if (rc != MATCH_NOMATCH)
4227 {
4228 DPRINTF((">>>> error: returning %d\n", rc));
4229 return rc;
4230 }
4231 else if (md->partial && md->hitend)
4232 {
4233 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4234 return PCRE_ERROR_PARTIAL;
4235 }
4236 else
4237 {
4238 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4239 return PCRE_ERROR_NOMATCH;
4240 }
4241 }
4242
4243 /* End of pcre_exec.c */

Properties

Name Value
svn:eol-style native
svn:keywords "Author Date Id Revision Url"

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12