/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 85 - (show annotations) (download)
Sat Feb 24 21:41:13 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 110857 byte(s)
Load pcre-6.4 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2005 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45
46 #include "pcre_internal.h"
47
48
49 /* Structure for building a chain of data that actually lives on the
50 stack, for holding the values of the subject pointer at the start of each
51 subpattern, so as to detect when an empty string has been matched by a
52 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
53 are on the heap, not on the stack. */
54
55 typedef struct eptrblock {
56 struct eptrblock *epb_prev;
57 const uschar *epb_saved_eptr;
58 } eptrblock;
59
/* Bits that may be set in the "flags" argument of match() */

#define match_condassert   (1 << 0)  /* Called to check a condition assertion */
#define match_isgroup      (1 << 1)  /* Set if start of bracketed group */

/* Results that match() returns when no error occurred. Errors are reported
with the externally defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_NOMATCH      0
#define MATCH_MATCH        1

/* Largest number of offset-vector ints that a recursive call saves on the
stack; bigger vectors are saved in memory obtained from malloc. The offset
vector is always a multiple of 3 long, so keep this a multiple of 3 too. */

#define REC_STACK_SAVE_MAX (3 * 10)

/* Minimum and maximum counts for the common repeats. A maximum of 0 stands
for "no upper limit". NOTE(review): the six entries look like the pairs
star/minstar, plus/minplus, query/minquery - confirm against the opcode
table, which is not visible here. */

static const char rep_min[] = {
  0, 0,     /* min 0, max unlimited */
  1, 1,     /* min 1, max unlimited */
  0, 0 };   /* min 0, max 1 */

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1 };
81
82
83
84 #ifdef DEBUG
85 /*************************************************
86 * Debugging function to print chars *
87 *************************************************/
88
/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/
100
101 static void
102 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
103 {
104 int c;
105 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
106 while (length-- > 0)
107 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
108 }
109 #endif
110
111
112
113 /*************************************************
114 * Match a back-reference *
115 *************************************************/
116
117 /* If a back reference hasn't been set, the length that is passed is greater
118 than the number of characters left in the string, so the match fails.
119
120 Arguments:
121 offset index into the offset vector
122 eptr points into the subject
123 length length to be matched
124 md points to match data block
125 ims the ims flags
126
127 Returns: TRUE if matched
128 */
129
130 static BOOL
131 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
132 unsigned long int ims)
133 {
134 const uschar *p = md->start_subject + md->offset_vector[offset];
135
136 #ifdef DEBUG
137 if (eptr >= md->end_subject)
138 printf("matching subject <null>");
139 else
140 {
141 printf("matching subject ");
142 pchars(eptr, length, TRUE, md);
143 }
144 printf(" against backref ");
145 pchars(p, length, FALSE, md);
146 printf("\n");
147 #endif
148
149 /* Always fail if not enough characters left */
150
151 if (length > md->end_subject - eptr) return FALSE;
152
153 /* Separate the caselesss case for speed */
154
155 if ((ims & PCRE_CASELESS) != 0)
156 {
157 while (length-- > 0)
158 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
159 }
160 else
161 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
162
163 return TRUE;
164 }
165
166
167
/***************************************************************************
****************************************************************************
                   RECURSION IN THE match() FUNCTION

The match() function is highly recursive. Some regular expressions can cause
it to recurse thousands of times. I was writing for Unix, so I just let it
call itself recursively. This uses the stack for saving everything that has
to be saved for a recursive call. On Unix, the stack can be large, and this
works fine.

It turns out that on non-Unix systems there are problems with programs that
use a lot of stack. (This despite the fact that every last chip has oodles
of memory these days, and techniques for extending the stack have been known
for decades.) So....

There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc instead of on the stack. Macros are used to achieve this
so that the actual code doesn't look very different to what it always used to.
****************************************************************************
***************************************************************************/
190
191
192 /* These versions of the macros use the stack, as normal */
193
194 #ifndef NO_RECURSE
/* With real recursion the marked variables can be register variables */

#define REGISTER register

/* A "recursive call" is literally a call to match(), with its value stored
in the first macro argument. */

#define RMATCH(result,subj,code,top,mdblock,opts,ptrchain,flagbits)\
  result = match(subj,code,top,mdblock,opts,ptrchain,flagbits)

/* Returning from a level is an ordinary return statement */

#define RRETURN(value) return value
198 #else
199
200
201 /* These versions of the macros manage a private stack on the heap. Note
202 that the rd argument of RMATCH isn't actually used. It's the md argument of
203 match(), which never changes. */
204
205 #define REGISTER
206
/* Heap-frame replacement for a recursive call to match(). A new frame is
obtained from pcre_stack_malloc, the place to resume is recorded with setjmp()
in the current frame, the "arguments" are copied into the new frame, and
control jumps to HEAP_RECURSE with the new frame made current. When RRETURN
later longjmps back here, the current frame is picked up again from
md->thisframe and the result is copied from it into rx. The rd argument is
not used.

If the allocator returns NULL, the "call" is abandoned by returning
PCRE_ERROR_NOMEMORY through RRETURN, rather than writing through a null
frame pointer. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }
230
/* Heap-frame replacement for "return" from match(). The current frame is
unlinked and freed. If it was the top-level frame (no previous frame), this
is a real return from match(). Otherwise the result is left in the previous
frame, that frame is published in md->thisframe, and control goes back to
the setjmp() recorded in it. */

#define RRETURN(ra)\
  {\
  heapframe *done_frame = frame;\
  frame = done_frame->Xprevframe;\
  (pcre_stack_free)(done_frame);\
  if (frame == NULL) return ra;\
  frame->Xresult = ra;\
  md->thisframe = frame;\
  longjmp(frame->Xwhere, 1);\
  }
244
245
246 /* Structure for remembering the local variables in a private frame */
247
248 typedef struct heapframe {
249 struct heapframe *Xprevframe;
250
251 /* Function arguments that may change */
252
253 const uschar *Xeptr;
254 const uschar *Xecode;
255 int Xoffset_top;
256 long int Xims;
257 eptrblock *Xeptrb;
258 int Xflags;
259
260 /* Function local variables */
261
262 const uschar *Xcallpat;
263 const uschar *Xcharptr;
264 const uschar *Xdata;
265 const uschar *Xnext;
266 const uschar *Xpp;
267 const uschar *Xprev;
268 const uschar *Xsaved_eptr;
269
270 recursion_info Xnew_recursive;
271
272 BOOL Xcur_is_word;
273 BOOL Xcondition;
274 BOOL Xminimize;
275 BOOL Xprev_is_word;
276
277 unsigned long int Xoriginal_ims;
278
279 #ifdef SUPPORT_UCP
280 int Xprop_type;
281 int Xprop_fail_result;
282 int Xprop_category;
283 int Xprop_chartype;
284 int Xprop_othercase;
285 int Xprop_test_against;
286 int *Xprop_test_variable;
287 #endif
288
289 int Xctype;
290 int Xfc;
291 int Xfi;
292 int Xlength;
293 int Xmax;
294 int Xmin;
295 int Xnumber;
296 int Xoffset;
297 int Xop;
298 int Xsave_capture_last;
299 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
300 int Xstacksave[REC_STACK_SAVE_MAX];
301
302 eptrblock Xnewptrb;
303
304 /* Place to pass back result, and where to jump back to */
305
306 int Xresult;
307 jmp_buf Xwhere;
308
309 } heapframe;
310
311 #endif
312
313
314 /***************************************************************************
315 ***************************************************************************/
316
317
318
319 /*************************************************
320 * Match from current position *
321 *************************************************/
322
323 /* On entry ecode points to the first opcode, and eptr to the first character
324 in the subject string, while eptrb holds the value of eptr at the start of the
325 last bracketed group - used for breaking infinite loops matching zero-length
326 strings. This function is called recursively in many circumstances. Whenever it
327 returns a negative (error) response, the outer incarnation must also return the
328 same response.
329
330 Performance note: It might be tempting to extract commonly used fields from the
331 md structure (e.g. utf8, end_subject) into individual variables to improve
332 performance. Tests using gcc on a SPARC disproved this; in the first case, it
333 made performance worse.
334
335 Arguments:
336 eptr pointer in subject
337 ecode position in code
338 offset_top current top pointer
339 md pointer to "static" info for the match
340 ims current /i, /m, and /s options
341 eptrb pointer to chain of blocks containing eptr at start of
342 brackets - for testing for empty matches
343 flags can contain
344 match_condassert - this is an assertion condition
345 match_isgroup - this is the start of a bracketed group
346
347 Returns: MATCH_MATCH if matched ) these values are >= 0
348 MATCH_NOMATCH if failed to match )
349 a negative PCRE_ERROR_xxx value if aborted by an error condition
350 (e.g. stopped by recursion limit)
351 */
352
353 static int
354 match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
355 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
356 int flags)
357 {
358 /* These variables do not need to be preserved over recursion in this function,
359 so they can be ordinary variables in all cases. Mark them with "register"
360 because they are used a lot in loops. */
361
362 register int rrc; /* Returns from recursive calls */
363 register int i; /* Used for loops not involving calls to RMATCH() */
364 register int c; /* Character values not kept over RMATCH() calls */
365 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
366
367 /* When recursion is not being used, all "local" variables that have to be
368 preserved over calls to RMATCH() are part of a "frame" which is obtained from
369 heap storage. Set up the top-level frame here; others are obtained from the
370 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
371
372 #ifdef NO_RECURSE
373 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
374 frame->Xprevframe = NULL; /* Marks the top level */
375
376 /* Copy in the original argument variables */
377
378 frame->Xeptr = eptr;
379 frame->Xecode = ecode;
380 frame->Xoffset_top = offset_top;
381 frame->Xims = ims;
382 frame->Xeptrb = eptrb;
383 frame->Xflags = flags;
384
385 /* This is where control jumps back to to effect "recursion" */
386
387 HEAP_RECURSE:
388
389 /* Macros make the argument variables come from the current frame */
390
391 #define eptr frame->Xeptr
392 #define ecode frame->Xecode
393 #define offset_top frame->Xoffset_top
394 #define ims frame->Xims
395 #define eptrb frame->Xeptrb
396 #define flags frame->Xflags
397
398 /* Ditto for the local variables */
399
400 #ifdef SUPPORT_UTF8
401 #define charptr frame->Xcharptr
402 #endif
403 #define callpat frame->Xcallpat
404 #define data frame->Xdata
405 #define next frame->Xnext
406 #define pp frame->Xpp
407 #define prev frame->Xprev
408 #define saved_eptr frame->Xsaved_eptr
409
410 #define new_recursive frame->Xnew_recursive
411
412 #define cur_is_word frame->Xcur_is_word
413 #define condition frame->Xcondition
414 #define minimize frame->Xminimize
415 #define prev_is_word frame->Xprev_is_word
416
417 #define original_ims frame->Xoriginal_ims
418
419 #ifdef SUPPORT_UCP
420 #define prop_type frame->Xprop_type
421 #define prop_fail_result frame->Xprop_fail_result
422 #define prop_category frame->Xprop_category
423 #define prop_chartype frame->Xprop_chartype
424 #define prop_othercase frame->Xprop_othercase
425 #define prop_test_against frame->Xprop_test_against
426 #define prop_test_variable frame->Xprop_test_variable
427 #endif
428
429 #define ctype frame->Xctype
430 #define fc frame->Xfc
431 #define fi frame->Xfi
432 #define length frame->Xlength
433 #define max frame->Xmax
434 #define min frame->Xmin
435 #define number frame->Xnumber
436 #define offset frame->Xoffset
437 #define op frame->Xop
438 #define save_capture_last frame->Xsave_capture_last
439 #define save_offset1 frame->Xsave_offset1
440 #define save_offset2 frame->Xsave_offset2
441 #define save_offset3 frame->Xsave_offset3
442 #define stacksave frame->Xstacksave
443
444 #define newptrb frame->Xnewptrb
445
446 /* When recursion is being used, local variables are allocated on the stack and
447 get preserved during recursion in the normal way. In this environment, fi and
448 i, and fc and c, can be the same variables. */
449
450 #else
451 #define fi i
452 #define fc c
453
454
455 #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
456 const uschar *charptr; /* small blocks of the code. My normal */
457 #endif /* style of coding would have declared */
458 const uschar *callpat; /* them within each of those blocks. */
459 const uschar *data; /* However, in order to accommodate the */
460 const uschar *next; /* version of this code that uses an */
461 const uschar *pp; /* external "stack" implemented on the */
462 const uschar *prev; /* heap, it is easier to declare them */
463 const uschar *saved_eptr; /* all here, so the declarations can */
464 /* be cut out in a block. The only */
465 recursion_info new_recursive; /* declarations within blocks below are */
466 /* for variables that do not have to */
467 BOOL cur_is_word; /* be preserved over a recursive call */
468 BOOL condition; /* to RMATCH(). */
469 BOOL minimize;
470 BOOL prev_is_word;
471
472 unsigned long int original_ims;
473
474 #ifdef SUPPORT_UCP
475 int prop_type;
476 int prop_fail_result;
477 int prop_category;
478 int prop_chartype;
479 int prop_othercase;
480 int prop_test_against;
481 int *prop_test_variable;
482 #endif
483
484 int ctype;
485 int length;
486 int max;
487 int min;
488 int number;
489 int offset;
490 int op;
491 int save_capture_last;
492 int save_offset1, save_offset2, save_offset3;
493 int stacksave[REC_STACK_SAVE_MAX];
494
495 eptrblock newptrb;
496 #endif
497
498 /* These statements are here to stop the compiler complaining about unitialized
499 variables. */
500
501 #ifdef SUPPORT_UCP
502 prop_fail_result = 0;
503 prop_test_against = 0;
504 prop_test_variable = NULL;
505 #endif
506
507 /* OK, now we can get on with the real code of the function. Recursion is
508 specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
509 these just turn into a recursive call to match() and a "return", respectively.
510 However, RMATCH isn't like a function call because it's quite a complicated
511 macro. It has to be used in one particular way. This shouldn't, however, impact
512 performance when true recursion is being used. */
513
514 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
515
516 original_ims = ims; /* Save for resetting on ')' */
517 utf8 = md->utf8; /* Local copy of the flag */
518
519 /* At the start of a bracketed group, add the current subject pointer to the
520 stack of such pointers, to be re-instated at the end of the group when we hit
521 the closing ket. When match() is called in other circumstances, we don't add to
522 this stack. */
523
524 if ((flags & match_isgroup) != 0)
525 {
526 newptrb.epb_prev = eptrb;
527 newptrb.epb_saved_eptr = eptr;
528 eptrb = &newptrb;
529 }
530
531 /* Now start processing the operations. */
532
533 for (;;)
534 {
535 op = *ecode;
536 minimize = FALSE;
537
538 /* For partial matching, remember if we ever hit the end of the subject after
539 matching at least one subject character. */
540
541 if (md->partial &&
542 eptr >= md->end_subject &&
543 eptr > md->start_match)
544 md->hitend = TRUE;
545
546 /* Opening capturing bracket. If there is space in the offset vector, save
547 the current subject position in the working slot at the top of the vector. We
548 mustn't change the current values of the data slot, because they may be set
549 from a previous iteration of this group, and be referred to by a reference
550 inside the group.
551
552 If the bracket fails to match, we need to restore this value and also the
553 values of the final offsets, in case they were set by a previous iteration of
554 the same bracket.
555
556 If there isn't enough space in the offset vector, treat this as if it were a
557 non-capturing bracket. Don't worry about setting the flag for the error case
558 here; that is handled in the code for KET. */
559
560 if (op > OP_BRA)
561 {
562 number = op - OP_BRA;
563
564 /* For extended extraction brackets (large number), we have to fish out the
565 number from a dummy opcode at the start. */
566
567 if (number > EXTRACT_BASIC_MAX)
568 number = GET2(ecode, 2+LINK_SIZE);
569 offset = number << 1;
570
571 #ifdef DEBUG
572 printf("start bracket %d subject=", number);
573 pchars(eptr, 16, TRUE, md);
574 printf("\n");
575 #endif
576
577 if (offset < md->offset_max)
578 {
579 save_offset1 = md->offset_vector[offset];
580 save_offset2 = md->offset_vector[offset+1];
581 save_offset3 = md->offset_vector[md->offset_end - number];
582 save_capture_last = md->capture_last;
583
584 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
585 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
586
587 do
588 {
589 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
590 match_isgroup);
591 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
592 md->capture_last = save_capture_last;
593 ecode += GET(ecode, 1);
594 }
595 while (*ecode == OP_ALT);
596
597 DPRINTF(("bracket %d failed\n", number));
598
599 md->offset_vector[offset] = save_offset1;
600 md->offset_vector[offset+1] = save_offset2;
601 md->offset_vector[md->offset_end - number] = save_offset3;
602
603 RRETURN(MATCH_NOMATCH);
604 }
605
606 /* Insufficient room for saving captured contents */
607
608 else op = OP_BRA;
609 }
610
611 /* Other types of node can be handled by a switch */
612
613 switch(op)
614 {
615 case OP_BRA: /* Non-capturing bracket: optimized */
616 DPRINTF(("start bracket 0\n"));
617 do
618 {
619 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
620 match_isgroup);
621 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
622 ecode += GET(ecode, 1);
623 }
624 while (*ecode == OP_ALT);
625 DPRINTF(("bracket 0 failed\n"));
626 RRETURN(MATCH_NOMATCH);
627
628 /* Conditional group: compilation checked that there are no more than
629 two branches. If the condition is false, skipping the first branch takes us
630 past the end if there is only one branch, but that's OK because that is
631 exactly what going to the ket would do. */
632
633 case OP_COND:
634 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
635 {
636 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
637 condition = (offset == CREF_RECURSE * 2)?
638 (md->recursive != NULL) :
639 (offset < offset_top && md->offset_vector[offset] >= 0);
640 RMATCH(rrc, eptr, ecode + (condition?
641 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
642 offset_top, md, ims, eptrb, match_isgroup);
643 RRETURN(rrc);
644 }
645
646 /* The condition is an assertion. Call match() to evaluate it - setting
647 the final argument TRUE causes it to stop at the end of an assertion. */
648
649 else
650 {
651 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
652 match_condassert | match_isgroup);
653 if (rrc == MATCH_MATCH)
654 {
655 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
656 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
657 }
658 else if (rrc != MATCH_NOMATCH)
659 {
660 RRETURN(rrc); /* Need braces because of following else */
661 }
662 else ecode += GET(ecode, 1);
663 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
664 match_isgroup);
665 RRETURN(rrc);
666 }
667 /* Control never reaches here */
668
669 /* Skip over conditional reference or large extraction number data if
670 encountered. */
671
672 case OP_CREF:
673 case OP_BRANUMBER:
674 ecode += 3;
675 break;
676
677 /* End of the pattern. If we are in a recursion, we should restore the
678 offsets appropriately and continue from after the call. */
679
680 case OP_END:
681 if (md->recursive != NULL && md->recursive->group_num == 0)
682 {
683 recursion_info *rec = md->recursive;
684 DPRINTF(("Hit the end in a (?0) recursion\n"));
685 md->recursive = rec->prevrec;
686 memmove(md->offset_vector, rec->offset_save,
687 rec->saved_max * sizeof(int));
688 md->start_match = rec->save_start;
689 ims = original_ims;
690 ecode = rec->after_call;
691 break;
692 }
693
694 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
695 string - backtracking will then try other alternatives, if any. */
696
697 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
698 md->end_match_ptr = eptr; /* Record where we ended */
699 md->end_offset_top = offset_top; /* and how many extracts were taken */
700 RRETURN(MATCH_MATCH);
701
702 /* Change option settings */
703
704 case OP_OPT:
705 ims = ecode[1];
706 ecode += 2;
707 DPRINTF(("ims set to %02lx\n", ims));
708 break;
709
710 /* Assertion brackets. Check the alternative branches in turn - the
711 matching won't pass the KET for an assertion. If any one branch matches,
712 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
713 start of each branch to move the current point backwards, so the code at
714 this level is identical to the lookahead case. */
715
716 case OP_ASSERT:
717 case OP_ASSERTBACK:
718 do
719 {
720 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
721 match_isgroup);
722 if (rrc == MATCH_MATCH) break;
723 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
724 ecode += GET(ecode, 1);
725 }
726 while (*ecode == OP_ALT);
727 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
728
729 /* If checking an assertion for a condition, return MATCH_MATCH. */
730
731 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
732
733 /* Continue from after the assertion, updating the offsets high water
734 mark, since extracts may have been taken during the assertion. */
735
736 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
737 ecode += 1 + LINK_SIZE;
738 offset_top = md->end_offset_top;
739 continue;
740
741 /* Negative assertion: all branches must fail to match */
742
743 case OP_ASSERT_NOT:
744 case OP_ASSERTBACK_NOT:
745 do
746 {
747 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
748 match_isgroup);
749 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
750 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
751 ecode += GET(ecode,1);
752 }
753 while (*ecode == OP_ALT);
754
755 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
756
757 ecode += 1 + LINK_SIZE;
758 continue;
759
760 /* Move the subject pointer back. This occurs only at the start of
761 each branch of a lookbehind assertion. If we are too close to the start to
762 move back, this match function fails. When working with UTF-8 we move
763 back a number of characters, not bytes. */
764
765 case OP_REVERSE:
766 #ifdef SUPPORT_UTF8
767 if (utf8)
768 {
769 c = GET(ecode,1);
770 for (i = 0; i < c; i++)
771 {
772 eptr--;
773 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
774 BACKCHAR(eptr)
775 }
776 }
777 else
778 #endif
779
780 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
781
782 {
783 eptr -= GET(ecode,1);
784 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
785 }
786
787 /* Skip to next op code */
788
789 ecode += 1 + LINK_SIZE;
790 break;
791
792 /* The callout item calls an external function, if one is provided, passing
793 details of the match so far. This is mainly for debugging, though the
794 function is able to force a failure. */
795
796 case OP_CALLOUT:
797 if (pcre_callout != NULL)
798 {
799 pcre_callout_block cb;
800 cb.version = 1; /* Version 1 of the callout block */
801 cb.callout_number = ecode[1];
802 cb.offset_vector = md->offset_vector;
803 cb.subject = (const char *)md->start_subject;
804 cb.subject_length = md->end_subject - md->start_subject;
805 cb.start_match = md->start_match - md->start_subject;
806 cb.current_position = eptr - md->start_subject;
807 cb.pattern_position = GET(ecode, 2);
808 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
809 cb.capture_top = offset_top/2;
810 cb.capture_last = md->capture_last;
811 cb.callout_data = md->callout_data;
812 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
813 if (rrc < 0) RRETURN(rrc);
814 }
815 ecode += 2 + 2*LINK_SIZE;
816 break;
817
818 /* Recursion either matches the current regex, or some subexpression. The
819 offset data is the offset to the starting bracket from the start of the
820 whole pattern. (This is so that it works from duplicated subpatterns.)
821
822 If there are any capturing brackets started but not finished, we have to
823 save their starting points and reinstate them after the recursion. However,
824 we don't know how many such there are (offset_top records the completed
825 total) so we just have to save all the potential data. There may be up to
826 65535 such values, which is too large to put on the stack, but using malloc
827 for small numbers seems expensive. As a compromise, the stack is used when
828 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
829 is used. A problem is what to do if the malloc fails ... there is no way of
830 returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
831 values on the stack, and accept that the rest may be wrong.
832
833 There are also other values that have to be saved. We use a chained
834 sequence of blocks that actually live on the stack. Thanks to Robin Houston
835 for the original version of this logic. */
836
837 case OP_RECURSE:
838 {
839 callpat = md->start_code + GET(ecode, 1);
840 new_recursive.group_num = *callpat - OP_BRA;
841
842 /* For extended extraction brackets (large number), we have to fish out
843 the number from a dummy opcode at the start. */
844
845 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
846 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
847
848 /* Add to "recursing stack" */
849
850 new_recursive.prevrec = md->recursive;
851 md->recursive = &new_recursive;
852
853 /* Find where to continue from afterwards */
854
855 ecode += 1 + LINK_SIZE;
856 new_recursive.after_call = ecode;
857
858 /* Now save the offset data. */
859
860 new_recursive.saved_max = md->offset_end;
861 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
862 new_recursive.offset_save = stacksave;
863 else
864 {
865 new_recursive.offset_save =
866 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
867 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
868 }
869
870 memcpy(new_recursive.offset_save, md->offset_vector,
871 new_recursive.saved_max * sizeof(int));
872 new_recursive.save_start = md->start_match;
873 md->start_match = eptr;
874
875 /* OK, now we can do the recursion. For each top-level alternative we
876 restore the offset and recursion data. */
877
878 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
879 do
880 {
881 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
882 eptrb, match_isgroup);
883 if (rrc == MATCH_MATCH)
884 {
885 md->recursive = new_recursive.prevrec;
886 if (new_recursive.offset_save != stacksave)
887 (pcre_free)(new_recursive.offset_save);
888 RRETURN(MATCH_MATCH);
889 }
890 else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
891
892 md->recursive = &new_recursive;
893 memcpy(md->offset_vector, new_recursive.offset_save,
894 new_recursive.saved_max * sizeof(int));
895 callpat += GET(callpat, 1);
896 }
897 while (*callpat == OP_ALT);
898
899 DPRINTF(("Recursion didn't match\n"));
900 md->recursive = new_recursive.prevrec;
901 if (new_recursive.offset_save != stacksave)
902 (pcre_free)(new_recursive.offset_save);
903 RRETURN(MATCH_NOMATCH);
904 }
905 /* Control never reaches here */
906
907 /* "Once" brackets are like assertion brackets except that after a match,
908 the point in the subject string is not moved back. Thus there can never be
909 a move back into the brackets. Friedl calls these "atomic" subpatterns.
910 Check the alternative branches in turn - the matching won't pass the KET
911 for this kind of subpattern. If any one branch matches, we carry on as at
912 the end of a normal bracket, leaving the subject pointer. */
913
914 case OP_ONCE:
915 {
916 prev = ecode;
917 saved_eptr = eptr;
918
919 do
920 {
921 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
922 eptrb, match_isgroup);
923 if (rrc == MATCH_MATCH) break;
924 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
925 ecode += GET(ecode,1);
926 }
927 while (*ecode == OP_ALT);
928
929 /* If hit the end of the group (which could be repeated), fail */
930
931 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
932
933 /* Continue as from after the assertion, updating the offsets high water
934 mark, since extracts may have been taken. */
935
936 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
937
938 offset_top = md->end_offset_top;
939 eptr = md->end_match_ptr;
940
941 /* For a non-repeating ket, just continue at this level. This also
942 happens for a repeating ket if no characters were matched in the group.
943 This is the forcible breaking of infinite loops as implemented in Perl
944 5.005. If there is an options reset, it will get obeyed in the normal
945 course of events. */
946
947 if (*ecode == OP_KET || eptr == saved_eptr)
948 {
949 ecode += 1+LINK_SIZE;
950 break;
951 }
952
953 /* The repeating kets try the rest of the pattern or restart from the
954 preceding bracket, in the appropriate order. We need to reset any options
955 that changed within the bracket before re-running it, so check the next
956 opcode. */
957
958 if (ecode[1+LINK_SIZE] == OP_OPT)
959 {
960 ims = (ims & ~PCRE_IMS) | ecode[4];
961 DPRINTF(("ims set to %02lx at group repeat\n", ims));
962 }
963
964 if (*ecode == OP_KETRMIN)
965 {
966 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
967 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
968 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
969 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
970 }
971 else /* OP_KETRMAX */
972 {
973 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
974 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
975 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
976 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
977 }
978 }
979 RRETURN(MATCH_NOMATCH);
980
981 /* An alternation is the end of a branch; scan along to find the end of the
982 bracketed group and go to there. */
983
984 case OP_ALT:
985 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
986 break;
987
988 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
989 that it may occur zero times. It may repeat infinitely, or not at all -
990 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
991 repeat limits are compiled as a number of copies, with the optional ones
992 preceded by BRAZERO or BRAMINZERO. */
993
994 case OP_BRAZERO:
995 {
996 next = ecode+1;
997 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
999 do next += GET(next,1); while (*next == OP_ALT);
1000 ecode = next + 1+LINK_SIZE;
1001 }
1002 break;
1003
1004 case OP_BRAMINZERO:
1005 {
1006 next = ecode+1;
1007 do next += GET(next,1); while (*next == OP_ALT);
1008 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1009 match_isgroup);
1010 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011 ecode++;
1012 }
1013 break;
1014
1015 /* End of a group, repeated or non-repeating. If we are at the end of
1016 an assertion "group", stop matching and return MATCH_MATCH, but record the
1017 current high water mark for use by positive assertions. Do this also
1018 for the "once" (atomic, non-backtracking) groups. */
1019
1020 case OP_KET:
1021 case OP_KETRMIN:
1022 case OP_KETRMAX:
1023 {
1024 prev = ecode - GET(ecode, 1);
1025 saved_eptr = eptrb->epb_saved_eptr;
1026
1027 /* Back up the stack of bracket start pointers. */
1028
1029 eptrb = eptrb->epb_prev;
1030
1031 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1032 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1033 *prev == OP_ONCE)
1034 {
1035 md->end_match_ptr = eptr; /* For ONCE */
1036 md->end_offset_top = offset_top;
1037 RRETURN(MATCH_MATCH);
1038 }
1039
1040 /* In all other cases except a conditional group we have to check the
1041 group number back at the start and if necessary complete handling an
1042 extraction by setting the offsets and bumping the high water mark. */
1043
1044 if (*prev != OP_COND)
1045 {
1046 number = *prev - OP_BRA;
1047
1048 /* For extended extraction brackets (large number), we have to fish out
1049 the number from a dummy opcode at the start. */
1050
1051 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1052 offset = number << 1;
1053
1054 #ifdef DEBUG
1055 printf("end bracket %d", number);
1056 printf("\n");
1057 #endif
1058
1059 /* Test for a numbered group. This includes groups called as a result
1060 of recursion. Note that whole-pattern recursion is coded as a recurse
1061 into group 0, so it won't be picked up here. Instead, we catch it when
1062 the OP_END is reached. */
1063
1064 if (number > 0)
1065 {
1066 md->capture_last = number;
1067 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1068 {
1069 md->offset_vector[offset] =
1070 md->offset_vector[md->offset_end - number];
1071 md->offset_vector[offset+1] = eptr - md->start_subject;
1072 if (offset_top <= offset) offset_top = offset + 2;
1073 }
1074
1075 /* Handle a recursively called group. Restore the offsets
1076 appropriately and continue from after the call. */
1077
1078 if (md->recursive != NULL && md->recursive->group_num == number)
1079 {
1080 recursion_info *rec = md->recursive;
1081 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1082 md->recursive = rec->prevrec;
1083 md->start_match = rec->save_start;
1084 memcpy(md->offset_vector, rec->offset_save,
1085 rec->saved_max * sizeof(int));
1086 ecode = rec->after_call;
1087 ims = original_ims;
1088 break;
1089 }
1090 }
1091 }
1092
1093 /* Reset the value of the ims flags, in case they got changed during
1094 the group. */
1095
1096 ims = original_ims;
1097 DPRINTF(("ims reset to %02lx\n", ims));
1098
1099 /* For a non-repeating ket, just continue at this level. This also
1100 happens for a repeating ket if no characters were matched in the group.
1101 This is the forcible breaking of infinite loops as implemented in Perl
1102 5.005. If there is an options reset, it will get obeyed in the normal
1103 course of events. */
1104
1105 if (*ecode == OP_KET || eptr == saved_eptr)
1106 {
1107 ecode += 1 + LINK_SIZE;
1108 break;
1109 }
1110
1111 /* The repeating kets try the rest of the pattern or restart from the
1112 preceding bracket, in the appropriate order. */
1113
1114 if (*ecode == OP_KETRMIN)
1115 {
1116 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1117 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1118 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1119 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120 }
1121 else /* OP_KETRMAX */
1122 {
1123 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1124 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1125 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1126 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1127 }
1128 }
1129
1130 RRETURN(MATCH_NOMATCH);
1131
1132 /* Start of subject unless notbol, or after internal newline if multiline */
1133
1134 case OP_CIRC:
1135 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1136 if ((ims & PCRE_MULTILINE) != 0)
1137 {
1138 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1139 RRETURN(MATCH_NOMATCH);
1140 ecode++;
1141 break;
1142 }
1143 /* ... else fall through */
1144
1145 /* Start of subject assertion */
1146
1147 case OP_SOD:
1148 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1149 ecode++;
1150 break;
1151
1152 /* Start of match assertion */
1153
1154 case OP_SOM:
1155 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1156 ecode++;
1157 break;
1158
1159 /* Assert before internal newline if multiline, or before a terminating
1160 newline unless endonly is set, else end of subject unless noteol is set. */
1161
1162 case OP_DOLL:
1163 if ((ims & PCRE_MULTILINE) != 0)
1164 {
1165 if (eptr < md->end_subject)
1166 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1167 else
1168 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1169 ecode++;
1170 break;
1171 }
1172 else
1173 {
1174 if (md->noteol) RRETURN(MATCH_NOMATCH);
1175 if (!md->endonly)
1176 {
1177 if (eptr < md->end_subject - 1 ||
1178 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1179 RRETURN(MATCH_NOMATCH);
1180 ecode++;
1181 break;
1182 }
1183 }
1184 /* ... else fall through */
1185
1186 /* End of subject assertion (\z) */
1187
1188 case OP_EOD:
1189 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1190 ecode++;
1191 break;
1192
1193 /* End of subject or ending \n assertion (\Z) */
1194
1195 case OP_EODN:
1196 if (eptr < md->end_subject - 1 ||
1197 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1198 ecode++;
1199 break;
1200
1201 /* Word boundary assertions */
1202
1203 case OP_NOT_WORD_BOUNDARY:
1204 case OP_WORD_BOUNDARY:
1205 {
1206
1207 /* Find out if the previous and current characters are "word" characters.
1208 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1209 be "non-word" characters. */
1210
1211 #ifdef SUPPORT_UTF8
1212 if (utf8)
1213 {
1214 if (eptr == md->start_subject) prev_is_word = FALSE; else
1215 {
1216 const uschar *lastptr = eptr - 1;
1217 while((*lastptr & 0xc0) == 0x80) lastptr--;
1218 GETCHAR(c, lastptr);
1219 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1220 }
1221 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1222 {
1223 GETCHAR(c, eptr);
1224 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1225 }
1226 }
1227 else
1228 #endif
1229
1230 /* More streamlined when not in UTF-8 mode */
1231
1232 {
1233 prev_is_word = (eptr != md->start_subject) &&
1234 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1235 cur_is_word = (eptr < md->end_subject) &&
1236 ((md->ctypes[*eptr] & ctype_word) != 0);
1237 }
1238
1239 /* Now see if the situation is what we want */
1240
1241 if ((*ecode++ == OP_WORD_BOUNDARY)?
1242 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1243 RRETURN(MATCH_NOMATCH);
1244 }
1245 break;
1246
1247 /* Match a single character type; inline for speed */
1248
1249 case OP_ANY:
1250 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1251 RRETURN(MATCH_NOMATCH);
1252 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1253 #ifdef SUPPORT_UTF8
1254 if (utf8)
1255 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1256 #endif
1257 ecode++;
1258 break;
1259
1260 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1261 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1262
1263 case OP_ANYBYTE:
1264 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1265 ecode++;
1266 break;
1267
1268 case OP_NOT_DIGIT:
1269 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1270 GETCHARINCTEST(c, eptr);
1271 if (
1272 #ifdef SUPPORT_UTF8
1273 c < 256 &&
1274 #endif
1275 (md->ctypes[c] & ctype_digit) != 0
1276 )
1277 RRETURN(MATCH_NOMATCH);
1278 ecode++;
1279 break;
1280
1281 case OP_DIGIT:
1282 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1283 GETCHARINCTEST(c, eptr);
1284 if (
1285 #ifdef SUPPORT_UTF8
1286 c >= 256 ||
1287 #endif
1288 (md->ctypes[c] & ctype_digit) == 0
1289 )
1290 RRETURN(MATCH_NOMATCH);
1291 ecode++;
1292 break;
1293
1294 case OP_NOT_WHITESPACE:
1295 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1296 GETCHARINCTEST(c, eptr);
1297 if (
1298 #ifdef SUPPORT_UTF8
1299 c < 256 &&
1300 #endif
1301 (md->ctypes[c] & ctype_space) != 0
1302 )
1303 RRETURN(MATCH_NOMATCH);
1304 ecode++;
1305 break;
1306
1307 case OP_WHITESPACE:
1308 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1309 GETCHARINCTEST(c, eptr);
1310 if (
1311 #ifdef SUPPORT_UTF8
1312 c >= 256 ||
1313 #endif
1314 (md->ctypes[c] & ctype_space) == 0
1315 )
1316 RRETURN(MATCH_NOMATCH);
1317 ecode++;
1318 break;
1319
1320 case OP_NOT_WORDCHAR:
1321 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1322 GETCHARINCTEST(c, eptr);
1323 if (
1324 #ifdef SUPPORT_UTF8
1325 c < 256 &&
1326 #endif
1327 (md->ctypes[c] & ctype_word) != 0
1328 )
1329 RRETURN(MATCH_NOMATCH);
1330 ecode++;
1331 break;
1332
1333 case OP_WORDCHAR:
1334 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1335 GETCHARINCTEST(c, eptr);
1336 if (
1337 #ifdef SUPPORT_UTF8
1338 c >= 256 ||
1339 #endif
1340 (md->ctypes[c] & ctype_word) == 0
1341 )
1342 RRETURN(MATCH_NOMATCH);
1343 ecode++;
1344 break;
1345
1346 #ifdef SUPPORT_UCP
1347 /* Check the next character by Unicode property. We will get here only
1348 if the support is in the binary; otherwise a compile-time error occurs. */
1349
1350 case OP_PROP:
1351 case OP_NOTPROP:
1352 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1353 GETCHARINCTEST(c, eptr);
1354 {
1355 int chartype, rqdtype;
1356 int othercase;
1357 int category = _pcre_ucp_findchar(c, &chartype, &othercase);
1358
1359 rqdtype = *(++ecode);
1360 ecode++;
1361
1362 if (rqdtype >= 128)
1363 {
1364 if ((rqdtype - 128 != category) == (op == OP_PROP))
1365 RRETURN(MATCH_NOMATCH);
1366 }
1367 else
1368 {
1369 if ((rqdtype != chartype) == (op == OP_PROP))
1370 RRETURN(MATCH_NOMATCH);
1371 }
1372 }
1373 break;
1374
1375 /* Match an extended Unicode sequence. We will get here only if the support
1376 is in the binary; otherwise a compile-time error occurs. */
1377
1378 case OP_EXTUNI:
1379 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1380 GETCHARINCTEST(c, eptr);
1381 {
1382 int chartype;
1383 int othercase;
1384 int category = _pcre_ucp_findchar(c, &chartype, &othercase);
1385 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1386 while (eptr < md->end_subject)
1387 {
1388 int len = 1;
1389 if (!utf8) c = *eptr; else
1390 {
1391 GETCHARLEN(c, eptr, len);
1392 }
1393 category = _pcre_ucp_findchar(c, &chartype, &othercase);
1394 if (category != ucp_M) break;
1395 eptr += len;
1396 }
1397 }
1398 ecode++;
1399 break;
1400 #endif
1401
1402
1403 /* Match a back reference, possibly repeatedly. Look past the end of the
1404 item to see if there is repeat information following. The code is similar
1405 to that for character classes, but repeated for efficiency. Then obey
1406 similar code to character type repeats - written out again for speed.
1407 However, if the referenced string is the empty string, always treat
1408 it as matched, any number of times (otherwise there could be infinite
1409 loops). */
1410
1411 case OP_REF:
1412 {
1413 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1414 ecode += 3; /* Advance past item */
1415
1416 /* If the reference is unset, set the length to be longer than the amount
1417 of subject left; this ensures that every attempt at a match fails. We
1418 can't just fail here, because of the possibility of quantifiers with zero
1419 minima. */
1420
1421 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1422 md->end_subject - eptr + 1 :
1423 md->offset_vector[offset+1] - md->offset_vector[offset];
1424
1425 /* Set up for repetition, or handle the non-repeated case */
1426
1427 switch (*ecode)
1428 {
1429 case OP_CRSTAR:
1430 case OP_CRMINSTAR:
1431 case OP_CRPLUS:
1432 case OP_CRMINPLUS:
1433 case OP_CRQUERY:
1434 case OP_CRMINQUERY:
1435 c = *ecode++ - OP_CRSTAR;
1436 minimize = (c & 1) != 0;
1437 min = rep_min[c]; /* Pick up values from tables; */
1438 max = rep_max[c]; /* zero for max => infinity */
1439 if (max == 0) max = INT_MAX;
1440 break;
1441
1442 case OP_CRRANGE:
1443 case OP_CRMINRANGE:
1444 minimize = (*ecode == OP_CRMINRANGE);
1445 min = GET2(ecode, 1);
1446 max = GET2(ecode, 3);
1447 if (max == 0) max = INT_MAX;
1448 ecode += 5;
1449 break;
1450
1451 default: /* No repeat follows */
1452 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1453 eptr += length;
1454 continue; /* With the main loop */
1455 }
1456
1457 /* If the length of the reference is zero, just continue with the
1458 main loop. */
1459
1460 if (length == 0) continue;
1461
1462 /* First, ensure the minimum number of matches are present. We get back
1463 the length of the reference string explicitly rather than passing the
1464 address of eptr, so that eptr can be a register variable. */
1465
1466 for (i = 1; i <= min; i++)
1467 {
1468 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1469 eptr += length;
1470 }
1471
1472 /* If min = max, continue at the same level without recursion.
1473 They are not both allowed to be zero. */
1474
1475 if (min == max) continue;
1476
1477 /* If minimizing, keep trying and advancing the pointer */
1478
1479 if (minimize)
1480 {
1481 for (fi = min;; fi++)
1482 {
1483 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1484 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1485 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1486 RRETURN(MATCH_NOMATCH);
1487 eptr += length;
1488 }
1489 /* Control never gets here */
1490 }
1491
1492 /* If maximizing, find the longest string and work backwards */
1493
1494 else
1495 {
1496 pp = eptr;
1497 for (i = min; i < max; i++)
1498 {
1499 if (!match_ref(offset, eptr, length, md, ims)) break;
1500 eptr += length;
1501 }
1502 while (eptr >= pp)
1503 {
1504 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1505 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1506 eptr -= length;
1507 }
1508 RRETURN(MATCH_NOMATCH);
1509 }
1510 }
1511 /* Control never gets here */
1512
1513
1514
1515 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1516 used when all the characters in the class have values in the range 0-255,
1517 and either the matching is caseful, or the characters are in the range
1518 0-127 when UTF-8 processing is enabled. The only difference between
1519 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1520 encountered.
1521
1522 First, look past the end of the item to see if there is repeat information
1523 following. Then obey similar code to character type repeats - written out
1524 again for speed. */
1525
1526 case OP_NCLASS:
1527 case OP_CLASS:
1528 {
1529 data = ecode + 1; /* Save for matching */
1530 ecode += 33; /* Advance past the item */
1531
1532 switch (*ecode)
1533 {
1534 case OP_CRSTAR:
1535 case OP_CRMINSTAR:
1536 case OP_CRPLUS:
1537 case OP_CRMINPLUS:
1538 case OP_CRQUERY:
1539 case OP_CRMINQUERY:
1540 c = *ecode++ - OP_CRSTAR;
1541 minimize = (c & 1) != 0;
1542 min = rep_min[c]; /* Pick up values from tables; */
1543 max = rep_max[c]; /* zero for max => infinity */
1544 if (max == 0) max = INT_MAX;
1545 break;
1546
1547 case OP_CRRANGE:
1548 case OP_CRMINRANGE:
1549 minimize = (*ecode == OP_CRMINRANGE);
1550 min = GET2(ecode, 1);
1551 max = GET2(ecode, 3);
1552 if (max == 0) max = INT_MAX;
1553 ecode += 5;
1554 break;
1555
1556 default: /* No repeat follows */
1557 min = max = 1;
1558 break;
1559 }
1560
1561 /* First, ensure the minimum number of matches are present. */
1562
1563 #ifdef SUPPORT_UTF8
1564 /* UTF-8 mode */
1565 if (utf8)
1566 {
1567 for (i = 1; i <= min; i++)
1568 {
1569 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1570 GETCHARINC(c, eptr);
1571 if (c > 255)
1572 {
1573 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1574 }
1575 else
1576 {
1577 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1578 }
1579 }
1580 }
1581 else
1582 #endif
1583 /* Not UTF-8 mode */
1584 {
1585 for (i = 1; i <= min; i++)
1586 {
1587 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1588 c = *eptr++;
1589 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1590 }
1591 }
1592
1593 /* If max == min we can continue with the main loop without the
1594 need to recurse. */
1595
1596 if (min == max) continue;
1597
1598 /* If minimizing, keep testing the rest of the expression and advancing
1599 the pointer while it matches the class. */
1600
1601 if (minimize)
1602 {
1603 #ifdef SUPPORT_UTF8
1604 /* UTF-8 mode */
1605 if (utf8)
1606 {
1607 for (fi = min;; fi++)
1608 {
1609 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1610 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1611 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1612 GETCHARINC(c, eptr);
1613 if (c > 255)
1614 {
1615 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1616 }
1617 else
1618 {
1619 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1620 }
1621 }
1622 }
1623 else
1624 #endif
1625 /* Not UTF-8 mode */
1626 {
1627 for (fi = min;; fi++)
1628 {
1629 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1630 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1631 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1632 c = *eptr++;
1633 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1634 }
1635 }
1636 /* Control never gets here */
1637 }
1638
1639 /* If maximizing, find the longest possible run, then work backwards. */
1640
1641 else
1642 {
1643 pp = eptr;
1644
1645 #ifdef SUPPORT_UTF8
1646 /* UTF-8 mode */
1647 if (utf8)
1648 {
1649 for (i = min; i < max; i++)
1650 {
1651 int len = 1;
1652 if (eptr >= md->end_subject) break;
1653 GETCHARLEN(c, eptr, len);
1654 if (c > 255)
1655 {
1656 if (op == OP_CLASS) break;
1657 }
1658 else
1659 {
1660 if ((data[c/8] & (1 << (c&7))) == 0) break;
1661 }
1662 eptr += len;
1663 }
1664 for (;;)
1665 {
1666 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1667 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1668 if (eptr-- == pp) break; /* Stop if tried at original pos */
1669 BACKCHAR(eptr);
1670 }
1671 }
1672 else
1673 #endif
1674 /* Not UTF-8 mode */
1675 {
1676 for (i = min; i < max; i++)
1677 {
1678 if (eptr >= md->end_subject) break;
1679 c = *eptr;
1680 if ((data[c/8] & (1 << (c&7))) == 0) break;
1681 eptr++;
1682 }
1683 while (eptr >= pp)
1684 {
1685 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1686 eptr--;
1687 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1688 }
1689 }
1690
1691 RRETURN(MATCH_NOMATCH);
1692 }
1693 }
1694 /* Control never gets here */
1695
1696
1697 /* Match an extended character class. This opcode is encountered only
1698 in UTF-8 mode, because that's the only time it is compiled. */
1699
1700 #ifdef SUPPORT_UTF8
1701 case OP_XCLASS:
1702 {
1703 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1704 ecode += GET(ecode, 1); /* Advance past the item */
1705
1706 switch (*ecode)
1707 {
1708 case OP_CRSTAR:
1709 case OP_CRMINSTAR:
1710 case OP_CRPLUS:
1711 case OP_CRMINPLUS:
1712 case OP_CRQUERY:
1713 case OP_CRMINQUERY:
1714 c = *ecode++ - OP_CRSTAR;
1715 minimize = (c & 1) != 0;
1716 min = rep_min[c]; /* Pick up values from tables; */
1717 max = rep_max[c]; /* zero for max => infinity */
1718 if (max == 0) max = INT_MAX;
1719 break;
1720
1721 case OP_CRRANGE:
1722 case OP_CRMINRANGE:
1723 minimize = (*ecode == OP_CRMINRANGE);
1724 min = GET2(ecode, 1);
1725 max = GET2(ecode, 3);
1726 if (max == 0) max = INT_MAX;
1727 ecode += 5;
1728 break;
1729
1730 default: /* No repeat follows */
1731 min = max = 1;
1732 break;
1733 }
1734
1735 /* First, ensure the minimum number of matches are present. */
1736
1737 for (i = 1; i <= min; i++)
1738 {
1739 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1740 GETCHARINC(c, eptr);
1741 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1742 }
1743
1744 /* If max == min we can continue with the main loop without the
1745 need to recurse. */
1746
1747 if (min == max) continue;
1748
1749 /* If minimizing, keep testing the rest of the expression and advancing
1750 the pointer while it matches the class. */
1751
1752 if (minimize)
1753 {
1754 for (fi = min;; fi++)
1755 {
1756 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1757 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1758 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1759 GETCHARINC(c, eptr);
1760 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1761 }
1762 /* Control never gets here */
1763 }
1764
1765 /* If maximizing, find the longest possible run, then work backwards. */
1766
1767 else
1768 {
1769 pp = eptr;
1770 for (i = min; i < max; i++)
1771 {
1772 int len = 1;
1773 if (eptr >= md->end_subject) break;
1774 GETCHARLEN(c, eptr, len);
1775 if (!_pcre_xclass(c, data)) break;
1776 eptr += len;
1777 }
1778 for(;;)
1779 {
1780 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1781 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1782 if (eptr-- == pp) break; /* Stop if tried at original pos */
1783 BACKCHAR(eptr)
1784 }
1785 RRETURN(MATCH_NOMATCH);
1786 }
1787
1788 /* Control never gets here */
1789 }
1790 #endif /* End of XCLASS */
1791
1792 /* Match a single character, casefully */
1793
1794 case OP_CHAR:
1795 #ifdef SUPPORT_UTF8
1796 if (utf8)
1797 {
1798 length = 1;
1799 ecode++;
1800 GETCHARLEN(fc, ecode, length);
1801 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1802 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1803 }
1804 else
1805 #endif
1806
1807 /* Non-UTF-8 mode */
1808 {
1809 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1810 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1811 ecode += 2;
1812 }
1813 break;
1814
1815 /* Match a single character, caselessly */
1816
1817 case OP_CHARNC:
1818 #ifdef SUPPORT_UTF8
1819 if (utf8)
1820 {
1821 length = 1;
1822 ecode++;
1823 GETCHARLEN(fc, ecode, length);
1824
1825 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1826
1827 /* If the pattern character's value is < 128, we have only one byte, and
1828 can use the fast lookup table. */
1829
1830 if (fc < 128)
1831 {
1832 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1833 }
1834
1835 /* Otherwise we must pick up the subject character */
1836
1837 else
1838 {
1839 int dc;
1840 GETCHARINC(dc, eptr);
1841 ecode += length;
1842
1843 /* If we have Unicode property support, we can use it to test the other
1844 case of the character, if there is one. The result of _pcre_ucp_findchar() is
1845 < 0 if the char isn't found, and othercase is returned as zero if there
1846 isn't one. */
1847
1848 if (fc != dc)
1849 {
1850 #ifdef SUPPORT_UCP
1851 int chartype;
1852 int othercase;
1853 if (_pcre_ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
1854 #endif
1855 RRETURN(MATCH_NOMATCH);
1856 }
1857 }
1858 }
1859 else
1860 #endif /* SUPPORT_UTF8 */
1861
1862 /* Non-UTF-8 mode */
1863 {
1864 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1865 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1866 ecode += 2;
1867 }
1868 break;
1869
1870 /* Match a single character repeatedly; different opcodes share code. */
1871
1872 case OP_EXACT:
1873 min = max = GET2(ecode, 1);
1874 ecode += 3;
1875 goto REPEATCHAR;
1876
1877 case OP_UPTO:
1878 case OP_MINUPTO:
1879 min = 0;
1880 max = GET2(ecode, 1);
1881 minimize = *ecode == OP_MINUPTO;
1882 ecode += 3;
1883 goto REPEATCHAR;
1884
1885 case OP_STAR:
1886 case OP_MINSTAR:
1887 case OP_PLUS:
1888 case OP_MINPLUS:
1889 case OP_QUERY:
1890 case OP_MINQUERY:
1891 c = *ecode++ - OP_STAR;
1892 minimize = (c & 1) != 0;
1893 min = rep_min[c]; /* Pick up values from tables; */
1894 max = rep_max[c]; /* zero for max => infinity */
1895 if (max == 0) max = INT_MAX;
1896
1897 /* Common code for all repeated single-character matches. We can give
1898 up quickly if there are fewer than the minimum number of characters left in
1899 the subject. */
1900
1901 REPEATCHAR:
1902 #ifdef SUPPORT_UTF8
1903 if (utf8)
1904 {
1905 length = 1;
1906 charptr = ecode;
1907 GETCHARLEN(fc, ecode, length);
1908 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1909 ecode += length;
1910
1911 /* Handle multibyte character matching specially here. There is
1912 support for caseless matching if UCP support is present. */
1913
1914 if (length > 1)
1915 {
1916 int oclength = 0;
1917 uschar occhars[8];
1918
1919 #ifdef SUPPORT_UCP
1920 int othercase;
1921 int chartype;
1922 if ((ims & PCRE_CASELESS) != 0 &&
1923 _pcre_ucp_findchar(fc, &chartype, &othercase) >= 0 &&
1924 othercase > 0)
1925 oclength = _pcre_ord2utf8(othercase, occhars);
1926 #endif /* SUPPORT_UCP */
1927
1928 for (i = 1; i <= min; i++)
1929 {
1930 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1931 /* Need braces because of following else */
1932 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1933 else
1934 {
1935 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1936 eptr += oclength;
1937 }
1938 }
1939
1940 if (min == max) continue;
1941
1942 if (minimize)
1943 {
1944 for (fi = min;; fi++)
1945 {
1946 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1947 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1948 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1949 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1950 /* Need braces because of following else */
1951 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1952 else
1953 {
1954 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1955 eptr += oclength;
1956 }
1957 }
1958 /* Control never gets here */
1959 }
1960 else
1961 {
1962 pp = eptr;
1963 for (i = min; i < max; i++)
1964 {
1965 if (eptr > md->end_subject - length) break;
1966 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1967 else if (oclength == 0) break;
1968 else
1969 {
1970 if (memcmp(eptr, occhars, oclength) != 0) break;
1971 eptr += oclength;
1972 }
1973 }
1974 while (eptr >= pp)
1975 {
1976 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1977 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1978 eptr -= length;
1979 }
1980 RRETURN(MATCH_NOMATCH);
1981 }
1982 /* Control never gets here */
1983 }
1984
1985 /* If the length of a UTF-8 character is 1, we fall through here, and
1986 obey the code as for non-UTF-8 characters below, though in this case the
1987 value of fc will always be < 128. */
1988 }
1989 else
1990 #endif /* SUPPORT_UTF8 */
1991
1992 /* When not in UTF-8 mode, load a single-byte character. */
1993 {
1994 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1995 fc = *ecode++;
1996 }
1997
1998 /* The value of fc at this point is always less than 256, though we may or
1999 may not be in UTF-8 mode. The code is duplicated for the caseless and
2000 caseful cases, for speed, since matching characters is likely to be quite
2001 common. First, ensure the minimum number of matches are present. If min =
2002 max, continue at the same level without recursing. Otherwise, if
2003 minimizing, keep trying the rest of the expression and advancing one
2004 matching character if failing, up to the maximum. Alternatively, if
2005 maximizing, find the maximum number of characters and work backwards. */
2006
2007 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2008 max, eptr));
2009
2010 if ((ims & PCRE_CASELESS) != 0)
2011 {
2012 fc = md->lcc[fc];
2013 for (i = 1; i <= min; i++)
2014 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2015 if (min == max) continue;
2016 if (minimize)
2017 {
2018 for (fi = min;; fi++)
2019 {
2020 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2021 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022 if (fi >= max || eptr >= md->end_subject ||
2023 fc != md->lcc[*eptr++])
2024 RRETURN(MATCH_NOMATCH);
2025 }
2026 /* Control never gets here */
2027 }
2028 else
2029 {
2030 pp = eptr;
2031 for (i = min; i < max; i++)
2032 {
2033 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2034 eptr++;
2035 }
2036 while (eptr >= pp)
2037 {
2038 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2039 eptr--;
2040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2041 }
2042 RRETURN(MATCH_NOMATCH);
2043 }
2044 /* Control never gets here */
2045 }
2046
2047 /* Caseful comparisons (includes all multi-byte characters) */
2048
2049 else
2050 {
2051 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2052 if (min == max) continue;
2053 if (minimize)
2054 {
2055 for (fi = min;; fi++)
2056 {
2057 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2058 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2059 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2060 RRETURN(MATCH_NOMATCH);
2061 }
2062 /* Control never gets here */
2063 }
2064 else
2065 {
2066 pp = eptr;
2067 for (i = min; i < max; i++)
2068 {
2069 if (eptr >= md->end_subject || fc != *eptr) break;
2070 eptr++;
2071 }
2072 while (eptr >= pp)
2073 {
2074 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2075 eptr--;
2076 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2077 }
2078 RRETURN(MATCH_NOMATCH);
2079 }
2080 }
2081 /* Control never gets here */
2082
2083 /* Match a negated single one-byte character. The subject character we are
2084 checking against it can be multibyte. */
2085
2086 case OP_NOT:
2087 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2088 ecode++;
2089 GETCHARINCTEST(c, eptr);
2090 if ((ims & PCRE_CASELESS) != 0)
2091 {
2092 #ifdef SUPPORT_UTF8
2093 if (c < 256)
2094 #endif
2095 c = md->lcc[c];
2096 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2097 }
2098 else
2099 {
2100 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2101 }
2102 break;
2103
2104 /* Match a negated single one-byte character repeatedly. This is almost a
2105 repeat of the code for a repeated single character, but I haven't found a
2106 nice way of commoning these up that doesn't require a test of the
2107 positive/negative option for each character match. Maybe that wouldn't add
2108 very much to the time taken, but character matching *is* what this is all
2109 about... */
2110
2111 case OP_NOTEXACT:
2112 min = max = GET2(ecode, 1);
2113 ecode += 3;
2114 goto REPEATNOTCHAR;
2115
2116 case OP_NOTUPTO:
2117 case OP_NOTMINUPTO:
2118 min = 0;
2119 max = GET2(ecode, 1);
2120 minimize = *ecode == OP_NOTMINUPTO;
2121 ecode += 3;
2122 goto REPEATNOTCHAR;
2123
2124 case OP_NOTSTAR:
2125 case OP_NOTMINSTAR:
2126 case OP_NOTPLUS:
2127 case OP_NOTMINPLUS:
2128 case OP_NOTQUERY:
2129 case OP_NOTMINQUERY:
2130 c = *ecode++ - OP_NOTSTAR;
2131 minimize = (c & 1) != 0;
2132 min = rep_min[c]; /* Pick up values from tables; */
2133 max = rep_max[c]; /* zero for max => infinity */
2134 if (max == 0) max = INT_MAX;
2135
2136 /* Common code for all repeated single-byte matches. We can give up quickly
2137 if there are fewer than the minimum number of bytes left in the
2138 subject. */
2139
2140 REPEATNOTCHAR:
2141 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2142 fc = *ecode++;
2143
2144 /* The code is duplicated for the caseless and caseful cases, for speed,
2145 since matching characters is likely to be quite common. First, ensure the
2146 minimum number of matches are present. If min = max, continue at the same
2147 level without recursing. Otherwise, if minimizing, keep trying the rest of
2148 the expression and advancing one matching character if failing, up to the
2149 maximum. Alternatively, if maximizing, find the maximum number of
2150 characters and work backwards. */
2151
2152 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2153 max, eptr));
2154
2155 if ((ims & PCRE_CASELESS) != 0)
2156 {
2157 fc = md->lcc[fc];
2158
2159 #ifdef SUPPORT_UTF8
2160 /* UTF-8 mode */
2161 if (utf8)
2162 {
2163 register int d;
2164 for (i = 1; i <= min; i++)
2165 {
2166 GETCHARINC(d, eptr);
2167 if (d < 256) d = md->lcc[d];
2168 if (fc == d) RRETURN(MATCH_NOMATCH);
2169 }
2170 }
2171 else
2172 #endif
2173
2174 /* Not UTF-8 mode */
2175 {
2176 for (i = 1; i <= min; i++)
2177 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2178 }
2179
2180 if (min == max) continue;
2181
2182 if (minimize)
2183 {
2184 #ifdef SUPPORT_UTF8
2185 /* UTF-8 mode */
2186 if (utf8)
2187 {
2188 register int d;
2189 for (fi = min;; fi++)
2190 {
2191 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2192 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2193 GETCHARINC(d, eptr);
2194 if (d < 256) d = md->lcc[d];
2195 if (fi >= max || eptr >= md->end_subject || fc == d)
2196 RRETURN(MATCH_NOMATCH);
2197 }
2198 }
2199 else
2200 #endif
2201 /* Not UTF-8 mode */
2202 {
2203 for (fi = min;; fi++)
2204 {
2205 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2206 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2207 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2208 RRETURN(MATCH_NOMATCH);
2209 }
2210 }
2211 /* Control never gets here */
2212 }
2213
2214 /* Maximize case */
2215
2216 else
2217 {
2218 pp = eptr;
2219
2220 #ifdef SUPPORT_UTF8
2221 /* UTF-8 mode */
2222 if (utf8)
2223 {
2224 register int d;
2225 for (i = min; i < max; i++)
2226 {
2227 int len = 1;
2228 if (eptr >= md->end_subject) break;
2229 GETCHARLEN(d, eptr, len);
2230 if (d < 256) d = md->lcc[d];
2231 if (fc == d) break;
2232 eptr += len;
2233 }
2234 for(;;)
2235 {
2236 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2237 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2238 if (eptr-- == pp) break; /* Stop if tried at original pos */
2239 BACKCHAR(eptr);
2240 }
2241 }
2242 else
2243 #endif
2244 /* Not UTF-8 mode */
2245 {
2246 for (i = min; i < max; i++)
2247 {
2248 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2249 eptr++;
2250 }
2251 while (eptr >= pp)
2252 {
2253 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2254 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2255 eptr--;
2256 }
2257 }
2258
2259 RRETURN(MATCH_NOMATCH);
2260 }
2261 /* Control never gets here */
2262 }
2263
2264 /* Caseful comparisons */
2265
2266 else
2267 {
2268 #ifdef SUPPORT_UTF8
2269 /* UTF-8 mode */
2270 if (utf8)
2271 {
2272 register int d;
2273 for (i = 1; i <= min; i++)
2274 {
2275 GETCHARINC(d, eptr);
2276 if (fc == d) RRETURN(MATCH_NOMATCH);
2277 }
2278 }
2279 else
2280 #endif
2281 /* Not UTF-8 mode */
2282 {
2283 for (i = 1; i <= min; i++)
2284 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2285 }
2286
2287 if (min == max) continue;
2288
2289 if (minimize)
2290 {
2291 #ifdef SUPPORT_UTF8
2292 /* UTF-8 mode */
2293 if (utf8)
2294 {
2295 register int d;
2296 for (fi = min;; fi++)
2297 {
2298 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2299 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2300 GETCHARINC(d, eptr);
2301 if (fi >= max || eptr >= md->end_subject || fc == d)
2302 RRETURN(MATCH_NOMATCH);
2303 }
2304 }
2305 else
2306 #endif
2307 /* Not UTF-8 mode */
2308 {
2309 for (fi = min;; fi++)
2310 {
2311 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2312 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2313 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2314 RRETURN(MATCH_NOMATCH);
2315 }
2316 }
2317 /* Control never gets here */
2318 }
2319
2320 /* Maximize case */
2321
2322 else
2323 {
2324 pp = eptr;
2325
2326 #ifdef SUPPORT_UTF8
2327 /* UTF-8 mode */
2328 if (utf8)
2329 {
2330 register int d;
2331 for (i = min; i < max; i++)
2332 {
2333 int len = 1;
2334 if (eptr >= md->end_subject) break;
2335 GETCHARLEN(d, eptr, len);
2336 if (fc == d) break;
2337 eptr += len;
2338 }
2339 for(;;)
2340 {
2341 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2342 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2343 if (eptr-- == pp) break; /* Stop if tried at original pos */
2344 BACKCHAR(eptr);
2345 }
2346 }
2347 else
2348 #endif
2349 /* Not UTF-8 mode */
2350 {
2351 for (i = min; i < max; i++)
2352 {
2353 if (eptr >= md->end_subject || fc == *eptr) break;
2354 eptr++;
2355 }
2356 while (eptr >= pp)
2357 {
2358 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2360 eptr--;
2361 }
2362 }
2363
2364 RRETURN(MATCH_NOMATCH);
2365 }
2366 }
2367 /* Control never gets here */
2368
2369 /* Match a single character type repeatedly; several different opcodes
2370 share code. This is very similar to the code for single characters, but we
2371 repeat it in the interests of efficiency. */
2372
2373 case OP_TYPEEXACT:
2374 min = max = GET2(ecode, 1);
2375 minimize = TRUE;
2376 ecode += 3;
2377 goto REPEATTYPE;
2378
2379 case OP_TYPEUPTO:
2380 case OP_TYPEMINUPTO:
2381 min = 0;
2382 max = GET2(ecode, 1);
2383 minimize = *ecode == OP_TYPEMINUPTO;
2384 ecode += 3;
2385 goto REPEATTYPE;
2386
2387 case OP_TYPESTAR:
2388 case OP_TYPEMINSTAR:
2389 case OP_TYPEPLUS:
2390 case OP_TYPEMINPLUS:
2391 case OP_TYPEQUERY:
2392 case OP_TYPEMINQUERY:
2393 c = *ecode++ - OP_TYPESTAR;
2394 minimize = (c & 1) != 0;
2395 min = rep_min[c]; /* Pick up values from tables; */
2396 max = rep_max[c]; /* zero for max => infinity */
2397 if (max == 0) max = INT_MAX;
2398
2399 /* Common code for all repeated single character type matches. Note that
2400 in UTF-8 mode, '.' matches a character of any length, but for the other
2401 character types, the valid characters are all one-byte long. */
2402
2403 REPEATTYPE:
2404 ctype = *ecode++; /* Code for the character type */
2405
2406 #ifdef SUPPORT_UCP
2407 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2408 {
2409 prop_fail_result = ctype == OP_NOTPROP;
2410 prop_type = *ecode++;
2411 if (prop_type >= 128)
2412 {
2413 prop_test_against = prop_type - 128;
2414 prop_test_variable = &prop_category;
2415 }
2416 else
2417 {
2418 prop_test_against = prop_type;
2419 prop_test_variable = &prop_chartype;
2420 }
2421 }
2422 else prop_type = -1;
2423 #endif
2424
2425 /* First, ensure the minimum number of matches are present. Use inline
2426 code for maximizing the speed, and do the type test once at the start
2427 (i.e. keep it out of the loop). Also we can test that there are at least
2428 the minimum number of bytes before we start. This isn't as effective in
2429 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2430 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2431 and single-bytes. */
2432
2433 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2434 if (min > 0)
2435 {
2436 #ifdef SUPPORT_UCP
2437 if (prop_type > 0)
2438 {
2439 for (i = 1; i <= min; i++)
2440 {
2441 GETCHARINC(c, eptr);
2442 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2443 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2444 RRETURN(MATCH_NOMATCH);
2445 }
2446 }
2447
2448 /* Match extended Unicode sequences. We will get here only if the
2449 support is in the binary; otherwise a compile-time error occurs. */
2450
2451 else if (ctype == OP_EXTUNI)
2452 {
2453 for (i = 1; i <= min; i++)
2454 {
2455 GETCHARINCTEST(c, eptr);
2456 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2457 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2458 while (eptr < md->end_subject)
2459 {
2460 int len = 1;
2461 if (!utf8) c = *eptr; else
2462 {
2463 GETCHARLEN(c, eptr, len);
2464 }
2465 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2466 if (prop_category != ucp_M) break;
2467 eptr += len;
2468 }
2469 }
2470 }
2471
2472 else
2473 #endif /* SUPPORT_UCP */
2474
2475 /* Handle all other cases when the coding is UTF-8 */
2476
2477 #ifdef SUPPORT_UTF8
2478 if (utf8) switch(ctype)
2479 {
2480 case OP_ANY:
2481 for (i = 1; i <= min; i++)
2482 {
2483 if (eptr >= md->end_subject ||
2484 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
2485 RRETURN(MATCH_NOMATCH);
2486 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2487 }
2488 break;
2489
2490 case OP_ANYBYTE:
2491 eptr += min;
2492 break;
2493
2494 case OP_NOT_DIGIT:
2495 for (i = 1; i <= min; i++)
2496 {
2497 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2498 GETCHARINC(c, eptr);
2499 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2500 RRETURN(MATCH_NOMATCH);
2501 }
2502 break;
2503
2504 case OP_DIGIT:
2505 for (i = 1; i <= min; i++)
2506 {
2507 if (eptr >= md->end_subject ||
2508 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2509 RRETURN(MATCH_NOMATCH);
2510 /* No need to skip more bytes - we know it's a 1-byte character */
2511 }
2512 break;
2513
2514 case OP_NOT_WHITESPACE:
2515 for (i = 1; i <= min; i++)
2516 {
2517 if (eptr >= md->end_subject ||
2518 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2519 RRETURN(MATCH_NOMATCH);
2520 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2521 }
2522 break;
2523
2524 case OP_WHITESPACE:
2525 for (i = 1; i <= min; i++)
2526 {
2527 if (eptr >= md->end_subject ||
2528 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2529 RRETURN(MATCH_NOMATCH);
2530 /* No need to skip more bytes - we know it's a 1-byte character */
2531 }
2532 break;
2533
2534 case OP_NOT_WORDCHAR:
2535 for (i = 1; i <= min; i++)
2536 {
2537 if (eptr >= md->end_subject ||
2538 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2539 RRETURN(MATCH_NOMATCH);
2540 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2541 }
2542 break;
2543
2544 case OP_WORDCHAR:
2545 for (i = 1; i <= min; i++)
2546 {
2547 if (eptr >= md->end_subject ||
2548 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2549 RRETURN(MATCH_NOMATCH);
2550 /* No need to skip more bytes - we know it's a 1-byte character */
2551 }
2552 break;
2553
2554 default:
2555 RRETURN(PCRE_ERROR_INTERNAL);
2556 } /* End switch(ctype) */
2557
2558 else
2559 #endif /* SUPPORT_UTF8 */
2560
2561 /* Code for the non-UTF-8 case for minimum matching of operators other
2562 than OP_PROP and OP_NOTPROP. */
2563
2564 switch(ctype)
2565 {
2566 case OP_ANY:
2567 if ((ims & PCRE_DOTALL) == 0)
2568 {
2569 for (i = 1; i <= min; i++)
2570 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
2571 }
2572 else eptr += min;
2573 break;
2574
2575 case OP_ANYBYTE:
2576 eptr += min;
2577 break;
2578
2579 case OP_NOT_DIGIT:
2580 for (i = 1; i <= min; i++)
2581 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2582 break;
2583
2584 case OP_DIGIT:
2585 for (i = 1; i <= min; i++)
2586 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2587 break;
2588
2589 case OP_NOT_WHITESPACE:
2590 for (i = 1; i <= min; i++)
2591 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2592 break;
2593
2594 case OP_WHITESPACE:
2595 for (i = 1; i <= min; i++)
2596 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2597 break;
2598
2599 case OP_NOT_WORDCHAR:
2600 for (i = 1; i <= min; i++)
2601 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2602 RRETURN(MATCH_NOMATCH);
2603 break;
2604
2605 case OP_WORDCHAR:
2606 for (i = 1; i <= min; i++)
2607 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2608 RRETURN(MATCH_NOMATCH);
2609 break;
2610
2611 default:
2612 RRETURN(PCRE_ERROR_INTERNAL);
2613 }
2614 }
2615
2616 /* If min = max, continue at the same level without recursing */
2617
2618 if (min == max) continue;
2619
2620 /* If minimizing, we have to test the rest of the pattern before each
2621 subsequent match. Again, separate the UTF-8 case for speed, and also
2622 separate the UCP cases. */
2623
2624 if (minimize)
2625 {
2626 #ifdef SUPPORT_UCP
2627 if (prop_type > 0)
2628 {
2629 for (fi = min;; fi++)
2630 {
2631 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2632 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2633 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2634 GETCHARINC(c, eptr);
2635 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2636 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2637 RRETURN(MATCH_NOMATCH);
2638 }
2639 }
2640
2641 /* Match extended Unicode sequences. We will get here only if the
2642 support is in the binary; otherwise a compile-time error occurs. */
2643
2644 else if (ctype == OP_EXTUNI)
2645 {
2646 for (fi = min;; fi++)
2647 {
2648 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2650 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2651 GETCHARINCTEST(c, eptr);
2652 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2653 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2654 while (eptr < md->end_subject)
2655 {
2656 int len = 1;
2657 if (!utf8) c = *eptr; else
2658 {
2659 GETCHARLEN(c, eptr, len);
2660 }
2661 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2662 if (prop_category != ucp_M) break;
2663 eptr += len;
2664 }
2665 }
2666 }
2667
2668 else
2669 #endif /* SUPPORT_UCP */
2670
2671 #ifdef SUPPORT_UTF8
2672 /* UTF-8 mode */
2673 if (utf8)
2674 {
2675 for (fi = min;; fi++)
2676 {
2677 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2678 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2679 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2680
2681 GETCHARINC(c, eptr);
2682 switch(ctype)
2683 {
2684 case OP_ANY:
2685 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2686 break;
2687
2688 case OP_ANYBYTE:
2689 break;
2690
2691 case OP_NOT_DIGIT:
2692 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2693 RRETURN(MATCH_NOMATCH);
2694 break;
2695
2696 case OP_DIGIT:
2697 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2698 RRETURN(MATCH_NOMATCH);
2699 break;
2700
2701 case OP_NOT_WHITESPACE:
2702 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2703 RRETURN(MATCH_NOMATCH);
2704 break;
2705
2706 case OP_WHITESPACE:
2707 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2708 RRETURN(MATCH_NOMATCH);
2709 break;
2710
2711 case OP_NOT_WORDCHAR:
2712 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2713 RRETURN(MATCH_NOMATCH);
2714 break;
2715
2716 case OP_WORDCHAR:
2717 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2718 RRETURN(MATCH_NOMATCH);
2719 break;
2720
2721 default:
2722 RRETURN(PCRE_ERROR_INTERNAL);
2723 }
2724 }
2725 }
2726 else
2727 #endif
2728 /* Not UTF-8 mode */
2729 {
2730 for (fi = min;; fi++)
2731 {
2732 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2733 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2734 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2735 c = *eptr++;
2736 switch(ctype)
2737 {
2738 case OP_ANY:
2739 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2740 break;
2741
2742 case OP_ANYBYTE:
2743 break;
2744
2745 case OP_NOT_DIGIT:
2746 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2747 break;
2748
2749 case OP_DIGIT:
2750 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2751 break;
2752
2753 case OP_NOT_WHITESPACE:
2754 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2755 break;
2756
2757 case OP_WHITESPACE:
2758 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2759 break;
2760
2761 case OP_NOT_WORDCHAR:
2762 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2763 break;
2764
2765 case OP_WORDCHAR:
2766 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2767 break;
2768
2769 default:
2770 RRETURN(PCRE_ERROR_INTERNAL);
2771 }
2772 }
2773 }
2774 /* Control never gets here */
2775 }
2776
2777 /* If maximizing it is worth using inline code for speed, doing the type
2778 test once at the start (i.e. keep it out of the loop). Again, keep the
2779 UTF-8 and UCP stuff separate. */
2780
2781 else
2782 {
2783 pp = eptr; /* Remember where we started */
2784
2785 #ifdef SUPPORT_UCP
2786 if (prop_type > 0)
2787 {
2788 for (i = min; i < max; i++)
2789 {
2790 int len = 1;
2791 if (eptr >= md->end_subject) break;
2792 GETCHARLEN(c, eptr, len);
2793 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2794 if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2795 break;
2796 eptr+= len;
2797 }
2798
2799 /* eptr is now past the end of the maximum run */
2800
2801 for(;;)
2802 {
2803 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2804 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2805 if (eptr-- == pp) break; /* Stop if tried at original pos */
2806 BACKCHAR(eptr);
2807 }
2808 }
2809
2810 /* Match extended Unicode sequences. We will get here only if the
2811 support is in the binary; otherwise a compile-time error occurs. */
2812
2813 else if (ctype == OP_EXTUNI)
2814 {
2815 for (i = min; i < max; i++)
2816 {
2817 if (eptr >= md->end_subject) break;
2818 GETCHARINCTEST(c, eptr);
2819 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2820 if (prop_category == ucp_M) break;
2821 while (eptr < md->end_subject)
2822 {
2823 int len = 1;
2824 if (!utf8) c = *eptr; else
2825 {
2826 GETCHARLEN(c, eptr, len);
2827 }
2828 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2829 if (prop_category != ucp_M) break;
2830 eptr += len;
2831 }
2832 }
2833
2834 /* eptr is now past the end of the maximum run */
2835
2836 for(;;)
2837 {
2838 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2839 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2840 if (eptr-- == pp) break; /* Stop if tried at original pos */
2841 for (;;) /* Move back over one extended */
2842 {
2843 int len = 1;
2844 BACKCHAR(eptr);
2845 if (!utf8) c = *eptr; else
2846 {
2847 GETCHARLEN(c, eptr, len);
2848 }
2849 prop_category = _pcre_ucp_findchar(c, &prop_chartype, &prop_othercase);
2850 if (prop_category != ucp_M) break;
2851 eptr--;
2852 }
2853 }
2854 }
2855
2856 else
2857 #endif /* SUPPORT_UCP */
2858
2859 #ifdef SUPPORT_UTF8
2860 /* UTF-8 mode */
2861
2862 if (utf8)
2863 {
2864 switch(ctype)
2865 {
2866 case OP_ANY:
2867
2868 /* Special code is required for UTF8, but when the maximum is unlimited
2869 we don't need it, so we repeat the non-UTF8 code. This is probably
2870 worth it, because .* is quite a common idiom. */
2871
2872 if (max < INT_MAX)
2873 {
2874 if ((ims & PCRE_DOTALL) == 0)
2875 {
2876 for (i = min; i < max; i++)
2877 {
2878 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2879 eptr++;
2880 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2881 }
2882 }
2883 else
2884 {
2885 for (i = min; i < max; i++)
2886 {
2887 eptr++;
2888 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2889 }
2890 }
2891 }
2892
2893 /* Handle unlimited UTF-8 repeat */
2894
2895 else
2896 {
2897 if ((ims & PCRE_DOTALL) == 0)
2898 {
2899 for (i = min; i < max; i++)
2900 {
2901 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2902 eptr++;
2903 }
2904 break;
2905 }
2906 else
2907 {
2908 c = max - min;
2909 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2910 eptr += c;
2911 }
2912 }
2913 break;
2914
2915 /* The byte case is the same as non-UTF8 */
2916
2917 case OP_ANYBYTE:
2918 c = max - min;
2919 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2920 eptr += c;
2921 break;
2922
2923 case OP_NOT_DIGIT:
2924 for (i = min; i < max; i++)
2925 {
2926 int len = 1;
2927 if (eptr >= md->end_subject) break;
2928 GETCHARLEN(c, eptr, len);
2929 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
2930 eptr+= len;
2931 }
2932 break;
2933
2934 case OP_DIGIT:
2935 for (i = min; i < max; i++)
2936 {
2937 int len = 1;
2938 if (eptr >= md->end_subject) break;
2939 GETCHARLEN(c, eptr, len);
2940 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
2941 eptr+= len;
2942 }
2943 break;
2944
2945 case OP_NOT_WHITESPACE:
2946 for (i = min; i < max; i++)
2947 {
2948 int len = 1;
2949 if (eptr >= md->end_subject) break;
2950 GETCHARLEN(c, eptr, len);
2951 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
2952 eptr+= len;
2953 }
2954 break;
2955
2956 case OP_WHITESPACE:
2957 for (i = min; i < max; i++)
2958 {
2959 int len = 1;
2960 if (eptr >= md->end_subject) break;
2961 GETCHARLEN(c, eptr, len);
2962 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
2963 eptr+= len;
2964 }
2965 break;
2966
2967 case OP_NOT_WORDCHAR:
2968 for (i = min; i < max; i++)
2969 {
2970 int len = 1;
2971 if (eptr >= md->end_subject) break;
2972 GETCHARLEN(c, eptr, len);
2973 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
2974 eptr+= len;
2975 }
2976 break;
2977
2978 case OP_WORDCHAR:
2979 for (i = min; i < max; i++)
2980 {
2981 int len = 1;
2982 if (eptr >= md->end_subject) break;
2983 GETCHARLEN(c, eptr, len);
2984 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
2985 eptr+= len;
2986 }
2987 break;
2988
2989 default:
2990 RRETURN(PCRE_ERROR_INTERNAL);
2991 }
2992
2993 /* eptr is now past the end of the maximum run */
2994
2995 for(;;)
2996 {
2997 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2998 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2999 if (eptr-- == pp) break; /* Stop if tried at original pos */
3000 BACKCHAR(eptr);
3001 }
3002 }
3003 else
3004 #endif
3005
3006 /* Not UTF-8 mode */
3007 {
3008 switch(ctype)
3009 {
3010 case OP_ANY:
3011 if ((ims & PCRE_DOTALL) == 0)
3012 {
3013 for (i = min; i < max; i++)
3014 {
3015 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3016 eptr++;
3017 }
3018 break;
3019 }
3020 /* For DOTALL case, fall through and treat as \C */
3021
3022 case OP_ANYBYTE:
3023 c = max - min;
3024 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3025 eptr += c;
3026 break;
3027
3028 case OP_NOT_DIGIT:
3029 for (i = min; i < max; i++)
3030 {
3031 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3032 break;
3033 eptr++;
3034 }
3035 break;
3036
3037 case OP_DIGIT:
3038 for (i = min; i < max; i++)
3039 {
3040 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3041 break;
3042 eptr++;
3043 }
3044 break;
3045
3046 case OP_NOT_WHITESPACE:
3047 for (i = min; i < max; i++)
3048 {
3049 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3050 break;
3051 eptr++;
3052 }
3053 break;
3054
3055 case OP_WHITESPACE:
3056 for (i = min; i < max; i++)
3057 {
3058 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3059 break;
3060 eptr++;
3061 }
3062 break;
3063
3064 case OP_NOT_WORDCHAR:
3065 for (i = min; i < max; i++)
3066 {
3067 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3068 break;
3069 eptr++;
3070 }
3071 break;
3072
3073 case OP_WORDCHAR:
3074 for (i = min; i < max; i++)
3075 {
3076 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3077 break;
3078 eptr++;
3079 }
3080 break;
3081
3082 default:
3083 RRETURN(PCRE_ERROR_INTERNAL);
3084 }
3085
3086 /* eptr is now past the end of the maximum run */
3087
3088 while (eptr >= pp)
3089 {
3090 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3091 eptr--;
3092 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3093 }
3094 }
3095
3096 /* Get here if we can't make it match with any permitted repetitions */
3097
3098 RRETURN(MATCH_NOMATCH);
3099 }
3100 /* Control never gets here */
3101
3102 /* There's been some horrible disaster. Since all codes > OP_BRA are
3103 for capturing brackets, and there shouldn't be any gaps between 0 and
3104 OP_BRA, arrival here can only mean there is something seriously wrong
3105 in the code above or the OP_xxx definitions. */
3106
3107 default:
3108 DPRINTF(("Unknown opcode %d\n", *ecode));
3109 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3110 }
3111
3112 /* Do not stick any code in here without much thought; it is assumed
3113 that "continue" in the code above comes out to here to repeat the main
3114 loop. */
3115
3116 } /* End of main loop */
3117 /* Control never reaches here */
3118 }
3119
3120
3121 /***************************************************************************
3122 ****************************************************************************
3123 RECURSION IN THE match() FUNCTION
3124
3125 Undefine all the macros that were defined above to handle this. Some of these names (for example length, ims and flags) are used as ordinary identifiers in pcre_exec() below. */
3126
3127 #ifdef NO_RECURSE /* The names in this group are undefined only when NO_RECURSE is defined */
3128 #undef eptr
3129 #undef ecode
3130 #undef offset_top
3131 #undef ims /* pcre_exec() below declares a local variable called ims */
3132 #undef eptrb
3133 #undef flags /* pcre_exec() below declares a local variable called flags */
3134
3135 #undef callpat
3136 #undef charptr
3137 #undef data
3138 #undef next
3139 #undef pp
3140 #undef prev
3141 #undef saved_eptr
3142
3143 #undef new_recursive
3144
3145 #undef cur_is_word
3146 #undef condition
3147 #undef minimize
3148 #undef prev_is_word
3149
3150 #undef original_ims
3151
3152 #undef ctype
3153 #undef length /* length is the name of a pcre_exec() parameter below */
3154 #undef max
3155 #undef min
3156 #undef number
3157 #undef offset
3158 #undef op
3159 #undef save_capture_last
3160 #undef save_offset1
3161 #undef save_offset2
3162 #undef save_offset3
3163 #undef stacksave
3164
3165 #undef newptrb
3166
3167 #endif /* NO_RECURSE */
3168
3169 /* These two are defined as macros in both cases (with and without NO_RECURSE), so they are undefined unconditionally */
3170
3171 #undef fc
3172 #undef fi
3173
3174 /***************************************************************************
3175 ***************************************************************************/
3176
3177
3178
3179 /*************************************************
3180 * Execute a Regular Expression *
3181 *************************************************/
3182
3183 /* This function applies a compiled re to a subject string and picks out
3184 portions of the string if it matches. Two elements in the vector are set for
3185 each substring: the offsets to the start and end of the substring.
3186
3187 Arguments:
3188 argument_re points to the compiled expression
3189 extra_data points to extra data or is NULL
3190 subject points to the subject string
3191 length length of subject string in bytes (the subject may contain binary zeros)
3192 start_offset where to start in the subject string
3193 options option bits
3194 offsets points to a vector of ints to be filled in with offsets
3195 offsetcount the number of elements in the vector
3196
3197 Returns: > 0 => success; value is the number of elements filled in
3198 = 0 => success, but offsets is not big enough
3199 -1 => failed to match
3200 < -1 => some kind of unexpected problem
3201 */
3202
3203 PCRE_EXPORT int
3204 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3205 const char *subject, int length, int start_offset, int options, int *offsets,
3206 int offsetcount)
3207 {
3208 int rc, resetcount, ocount;
3209 int first_byte = -1;
3210 int req_byte = -1;
3211 int req_byte2 = -1;
3212 unsigned long int ims = 0;
3213 BOOL using_temporary_offsets = FALSE;
3214 BOOL anchored;
3215 BOOL startline;
3216 BOOL firstline;
3217 BOOL first_byte_caseless = FALSE;
3218 BOOL req_byte_caseless = FALSE;
3219 match_data match_block;
3220 const uschar *tables;
3221 const uschar *start_bits = NULL;
3222 const uschar *start_match = (const uschar *)subject + start_offset;
3223 const uschar *end_subject;
3224 const uschar *req_byte_ptr = start_match - 1;
3225
3226 pcre_study_data internal_study;
3227 const pcre_study_data *study;
3228
3229 real_pcre internal_re;
3230 const real_pcre *external_re = (const real_pcre *)argument_re;
3231 const real_pcre *re = external_re;
3232
3233 /* Plausibility checks */
3234
3235 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3236 if (re == NULL || subject == NULL ||
3237 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3238 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3239
3240 /* Fish out the optional data from the extra_data structure, first setting
3241 the default values. */
3242
3243 study = NULL;
3244 match_block.match_limit = MATCH_LIMIT;
3245 match_block.callout_data = NULL;
3246
3247 /* The table pointer is always in native byte order. */
3248
3249 tables = external_re->tables;
3250
3251 if (extra_data != NULL)
3252 {
3253 register unsigned int flags = extra_data->flags;
3254 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3255 study = (const pcre_study_data *)extra_data->study_data;
3256 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3257 match_block.match_limit = extra_data->match_limit;
3258 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3259 match_block.callout_data = extra_data->callout_data;
3260 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3261 }
3262
3263 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3264 is a feature that makes it possible to save compiled regex and re-use them
3265 in other programs later. */
3266
3267 if (tables == NULL) tables = _pcre_default_tables;
3268
3269 /* Check that the first field in the block is the magic number. If it is not,
3270 test for a regex that was compiled on a host of opposite endianness. If this is
3271 the case, flipped values are put in internal_re and internal_study if there was
3272 study data too. */
3273
3274 if (re->magic_number != MAGIC_NUMBER)
3275 {
3276 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3277 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3278 if (study != NULL) study = &internal_study;
3279 }
3280
3281 /* Set up other data */
3282
3283 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3284 startline = (re->options & PCRE_STARTLINE) != 0;
3285 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3286
3287 /* The code starts after the real_pcre block and the capture name table. */
3288
3289 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
3290 re->name_count * re->name_entry_size;
3291
3292 match_block.start_subject = (const uschar *)subject;
3293 match_block.start_offset = start_offset;
3294 match_block.end_subject = match_block.start_subject + length;
3295 end_subject = match_block.end_subject;
3296
3297 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3298 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
3299
3300 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3301 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3302 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
3303 match_block.partial = (options & PCRE_PARTIAL) != 0;
3304 match_block.hitend = FALSE;
3305
3306 match_block.recursive = NULL; /* No recursion at top level */
3307
3308 match_block.lcc = tables + lcc_offset;
3309 match_block.ctypes = tables + ctypes_offset;
3310
3311 /* Partial matching is supported only for a restricted set of regexes at the
3312 moment. */
3313
3314 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
3315 return PCRE_ERROR_BADPARTIAL;
3316
3317 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3318 back the character offset. */
3319
3320 #ifdef SUPPORT_UTF8
3321 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3322 {
3323 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3324 return PCRE_ERROR_BADUTF8;
3325 if (start_offset > 0 && start_offset < length)
3326 {
3327 int tb = ((uschar *)subject)[start_offset];
3328 if (tb > 127)
3329 {
3330 tb &= 0xc0;
3331 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3332 }
3333 }
3334 }
3335 #endif
3336
3337 /* The ims options can vary during the matching as a result of the presence
3338 of (?ims) items in the pattern. They are kept in a local variable so that
3339 restoring at the exit of a group is easy. */
3340
3341 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3342
3343 /* If the expression has got more back references than the offsets supplied can
3344 hold, we get a temporary chunk of working store to use during the matching.
3345 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3346 of 3. */
3347
3348 ocount = offsetcount - (offsetcount % 3);
3349
3350 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3351 {
3352 ocount = re->top_backref * 3 + 3;
3353 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3354 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3355 using_temporary_offsets = TRUE;
3356 DPRINTF(("Got memory to hold back references\n"));
3357 }
3358 else match_block.offset_vector = offsets;
3359
3360 match_block.offset_end = ocount;
3361 match_block.offset_max = (2*ocount)/3;
3362 match_block.offset_overflow = FALSE;
3363 match_block.capture_last = -1;
3364
3365 /* Compute the minimum number of offsets that we need to reset each time. Doing
3366 this makes a huge difference to execution time when there aren't many brackets
3367 in the pattern. */
3368
3369 resetcount = 2 + re->top_bracket * 2;
3370 if (resetcount > offsetcount) resetcount = ocount;
3371
3372 /* Reset the working variable associated with each extraction. These should
3373 never be used unless previously set, but they get saved and restored, and so we
3374 initialize them to avoid reading uninitialized locations. */
3375
3376 if (match_block.offset_vector != NULL)
3377 {
3378 register int *iptr = match_block.offset_vector + ocount;
3379 register int *iend = iptr - resetcount/2 + 1;
3380 while (--iptr >= iend) *iptr = -1;
3381 }
3382
3383 /* Set up the first character to match, if available. The first_byte value is
3384 never set for an anchored regular expression, but the anchoring may be forced
3385 at run time, so we have to test for anchoring. The first char may be unset for
3386 an unanchored pattern, of course. If there's no first char and the pattern was
3387 studied, there may be a bitmap of possible first characters. */
3388
3389 if (!anchored)
3390 {
3391 if ((re->options & PCRE_FIRSTSET) != 0)
3392 {
3393 first_byte = re->first_byte & 255;
3394 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3395 first_byte = match_block.lcc[first_byte];
3396 }
3397 else
3398 if (!startline && study != NULL &&
3399 (study->options & PCRE_STUDY_MAPPED) != 0)
3400 start_bits = study->start_bits;
3401 }
3402
3403 /* For anchored or unanchored matches, there may be a "last known required
3404 character" set. */
3405
3406 if ((re->options & PCRE_REQCHSET) != 0)
3407 {
3408 req_byte = re->req_byte & 255;
3409 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3410 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3411 }
3412
3413 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
3414 the loop runs just once. */
3415
3416 do
3417 {
3418 const uschar *save_end_subject = end_subject;
3419
3420 /* Reset the maximum number of extractions we might see. */
3421
3422 if (match_block.offset_vector != NULL)
3423 {
3424 register int *iptr = match_block.offset_vector;
3425 register int *iend = iptr + resetcount;
3426 while (iptr < iend) *iptr++ = -1;
3427 }
3428
3429 /* Advance to a unique first char if possible. If firstline is TRUE, the
3430 start of the match is constrained to the first line of a multiline string.
3431 Implement this by temporarily adjusting end_subject so that we stop scanning
3432 at a newline. If the match fails at the newline, later code breaks this loop.
3433 */
3434
3435 if (firstline)
3436 {
3437 const uschar *t = start_match;
3438 while (t < save_end_subject && *t != '\n') t++;
3439 end_subject = t;
3440 }
3441
3442 /* Now test for a unique first byte */
3443
3444 if (first_byte >= 0)
3445 {
3446 if (first_byte_caseless)
3447 while (start_match < end_subject &&
3448 match_block.lcc[*start_match] != first_byte)
3449 start_match++;
3450 else
3451 while (start_match < end_subject && *start_match != first_byte)
3452 start_match++;
3453 }
3454
3455 /* Or to just after \n for a multiline match if possible */
3456
3457 else if (startline)
3458 {
3459 if (start_match > match_block.start_subject + start_offset)
3460 {
3461 while (start_match < end_subject && start_match[-1] != NEWLINE)
3462 start_match++;
3463 }
3464 }
3465
3466 /* Or to a non-unique first char after study */
3467
3468 else if (start_bits != NULL)
3469 {
3470 while (start_match < end_subject)
3471 {
3472 register unsigned int c = *start_match;
3473 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3474 }
3475 }
3476
3477 /* Restore fudged end_subject */
3478
3479 end_subject = save_end_subject;
3480
3481 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3482 printf(">>>> Match against: ");
3483 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3484 printf("\n");
3485 #endif
3486
3487 /* If req_byte is set, we know that that character must appear in the subject
3488 for the match to succeed. If the first character is set, req_byte must be
3489 later in the subject; otherwise the test starts at the match point. This
3490 optimization can save a huge amount of backtracking in patterns with nested
3491 unlimited repeats that aren't going to match. Writing separate code for
3492 cased/caseless versions makes it go faster, as does using an autoincrement
3493 and backing off on a match.
3494
3495 HOWEVER: when the subject string is very, very long, searching to its end can
3496 take a long time, and give bad performance on quite ordinary patterns. This
3497 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3498 don't do this when the string is sufficiently long.
3499
3500 ALSO: this processing is disabled when partial matching is requested.
3501 */
3502
3503 if (req_byte >= 0 &&
3504 end_subject - start_match < REQ_BYTE_MAX &&
3505 !match_block.partial)
3506 {
3507 register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
3508
3509 /* We don't need to repeat the search if we haven't yet reached the
3510 place we found it at last time. */
3511
3512 if (p > req_byte_ptr)
3513 {
3514 if (req_byte_caseless)
3515 {
3516 while (p < end_subject)
3517 {
3518 register int pp = *p++;
3519 if (pp == req_byte || pp == req_byte2) { p--; break; }
3520 }
3521 }
3522 else
3523 {
3524 while (p < end_subject)
3525 {
3526 if (*p++ == req_byte) { p--; break; }
3527 }
3528 }
3529
3530 /* If we can't find the required character, break the matching loop */
3531
3532 if (p >= end_subject) break;
3533
3534 /* If we have found the required character, save the point where we
3535 found it, so that we don't search again next time round the loop if
3536 the start hasn't passed this character yet. */
3537
3538 req_byte_ptr = p;
3539 }
3540 }
3541
3542 /* When a match occurs, substrings will be set for all internal extractions;
3543 we just need to set up the whole thing as substring 0 before returning. If
3544 there were too many extractions, set the return code to zero. In the case
3545 where we had to get some local store to hold offsets for backreferences, copy
3546 those back references that we can. In this case there need not be overflow
3547 if certain parts of the pattern were not used. */
3548
3549 match_block.start_match = start_match;
3550 match_block.match_call_count = 0;
3551
3552 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
3553 match_isgroup);
3554
3555 /* When the result is no match, if the subject's first character was a
3556 newline and the PCRE_FIRSTLINE option is set, break (which will return
3557 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3558 newline in the subject. Otherwise, advance the pointer to the next character
3559 and continue - but the continuation will actually happen only when the
3560 pattern is not anchored. */
3561
3562 if (rc == MATCH_NOMATCH)
3563 {
3564 if (firstline && *start_match == NEWLINE) break;
3565 start_match++;
3566 #ifdef SUPPORT_UTF8
3567 if (match_block.utf8)
3568 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3569 start_match++;
3570 #endif
3571 continue;
3572 }
3573
3574 if (rc != MATCH_MATCH)
3575 {
3576 DPRINTF((">>>> error: returning %d\n", rc));
3577 return rc;
3578 }
3579
3580 /* We have a match! Copy the offset information from temporary store if
3581 necessary */
3582
3583 if (using_temporary_offsets)
3584 {
3585 if (offsetcount >= 4)
3586 {
3587 memcpy(offsets + 2, match_block.offset_vector + 2,
3588 (offsetcount - 2) * sizeof(int));
3589 DPRINTF(("Copied offsets from temporary memory\n"));
3590 }
3591 if (match_block.end_offset_top > offsetcount)
3592 match_block.offset_overflow = TRUE;
3593
3594 DPRINTF(("Freeing temporary memory\n"));
3595 (pcre_free)(match_block.offset_vector);
3596 }
3597
3598 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3599
3600 if (offsetcount < 2) rc = 0; else
3601 {
3602 offsets[0] = start_match - match_block.start_subject;
3603 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3604 }
3605
3606 DPRINTF((">>>> returning %d\n", rc));
3607 return rc;
3608 }
3609
3610 /* This "while" is the end of the "do" above */
3611
3612 while (!anchored && start_match <= end_subject);
3613
3614 if (using_temporary_offsets)
3615 {
3616 DPRINTF(("Freeing temporary memory\n"));
3617 (pcre_free)(match_block.offset_vector);
3618 }
3619
3620 if (match_block.partial && match_block.hitend)
3621 {
3622 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3623 return PCRE_ERROR_PARTIAL;
3624 }
3625 else
3626 {
3627 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3628 return PCRE_ERROR_NOMATCH;
3629 }
3630 }
3631
3632 /* End of pcre_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12