/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 87 - (show annotations) (download)
Sat Feb 24 21:41:21 2007 UTC (7 years, 8 months ago) by nigel
File MIME type: text/plain
File size: 117538 byte(s)
Load pcre-6.5 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2006 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45
46 #include "pcre_internal.h"
47
48
49 /* Structure for building a chain of data that actually lives on the
50 stack, for holding the values of the subject pointer at the start of each
51 subpattern, so as to detect when an empty string has been matched by a
52 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
53 are on the heap, not on the stack. */
54
55 typedef struct eptrblock {
56 struct eptrblock *epb_prev;
57 USPTR epb_saved_eptr;
58 } eptrblock;
59
/* Bits that may be set in the "flags" argument of match() */

#define match_condassert   (1 << 0)  /* Called to check a condition assertion */
#define match_isgroup      (1 << 1)  /* Set if start of bracketed group */

/* The two non-error results of match(). Errors are reported with the
externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_NOMATCH      0
#define MATCH_MATCH        1

/* Largest number of offset-vector ints that a recursive call saves on the
stack; a bigger offset vector is saved in memory obtained from malloc instead.
Keep this a multiple of 3, because the offset vector's length always is. */

#define REC_STACK_SAVE_MAX 30

/* Lower and upper limits for the common repeats. In rep_max a value of zero
stands for "no upper limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
81
82
83
84 #ifdef DEBUG
85 /*************************************************
86 * Debugging function to print chars *
87 *************************************************/
88
89 /* Print a sequence of chars in printable format, stopping at the end of the
90 subject if the requested.
91
92 Arguments:
93 p points to characters
94 length number to print
95 is_subject TRUE if printing from within md->start_subject
96 md pointer to matching data block, if is_subject is TRUE
97
98 Returns: nothing
99 */
100
101 static void
102 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
103 {
104 int c;
105 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
106 while (length-- > 0)
107 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
108 }
109 #endif
110
111
112
113 /*************************************************
114 * Match a back-reference *
115 *************************************************/
116
117 /* If a back reference hasn't been set, the length that is passed is greater
118 than the number of characters left in the string, so the match fails.
119
120 Arguments:
121 offset index into the offset vector
122 eptr points into the subject
123 length length to be matched
124 md points to match data block
125 ims the ims flags
126
127 Returns: TRUE if matched
128 */
129
130 static BOOL
131 match_ref(int offset, register USPTR eptr, int length, match_data *md,
132 unsigned long int ims)
133 {
134 USPTR p = md->start_subject + md->offset_vector[offset];
135
136 #ifdef DEBUG
137 if (eptr >= md->end_subject)
138 printf("matching subject <null>");
139 else
140 {
141 printf("matching subject ");
142 pchars(eptr, length, TRUE, md);
143 }
144 printf(" against backref ");
145 pchars(p, length, FALSE, md);
146 printf("\n");
147 #endif
148
149 /* Always fail if not enough characters left */
150
151 if (length > md->end_subject - eptr) return FALSE;
152
153 /* Separate the caselesss case for speed */
154
155 if ((ims & PCRE_CASELESS) != 0)
156 {
157 while (length-- > 0)
158 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
159 }
160 else
161 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
162
163 return TRUE;
164 }
165
166
167
168 /***************************************************************************
169 ****************************************************************************
170 RECURSION IN THE match() FUNCTION
171
172 The match() function is highly recursive, though not every recursive call
173 increases the recursive depth. Nevertheless, some regular expressions can cause
174 it to recurse to a great depth. I was writing for Unix, so I just let it call
175 itself recursively. This uses the stack for saving everything that has to be
176 saved for a recursive call. On Unix, the stack can be large, and this works
177 fine.
178
179 It turns out that on some non-Unix-like systems there are problems with
180 programs that use a lot of stack. (This despite the fact that every last chip
181 has oodles of memory these days, and techniques for extending the stack have
182 been known for decades.) So....
183
184 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
185 calls by keeping local variables that need to be preserved in blocks of memory
186 obtained from malloc() instead of on the stack. Macros are used to
187 achieve this so that the actual code doesn't look very different to what it
188 always used to.
189 ****************************************************************************
190 ***************************************************************************/
191
192
193 /* These versions of the macros use the stack, as normal. There are debugging
194 versions and production versions. */
195
196 #ifndef NO_RECURSE
/* With true recursion the hot arguments of match() can live in registers. */

#define REGISTER register

#ifdef DEBUG

/* Debugging form of a recursive call: report the source line of the call,
call match() one level deeper, store its result in the first argument, then
report the line again once control is back. */

#define RMATCH(result,subj,code,top,mdata,opts,chain,mflags) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  result = match(subj,code,top,mdata,opts,chain,mflags,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

/* Debugging form of a return: report the value and the source line before
actually returning. */

#define RRETURN(value) \
  { \
  printf("match() returned %d from line %d ", value, __LINE__); \
  return value; \
  }

#else

/* Production forms: a plain recursive call of match() one level deeper, and
a plain return. */

#define RMATCH(result,subj,code,top,mdata,opts,chain,mflags) \
  result = match(subj,code,top,mdata,opts,chain,mflags,rdepth+1)

#define RRETURN(value) return value

#endif
215
216 #else
217
218
/* These versions of the macros manage a private stack on the heap. Note
that the rd argument of RMATCH isn't actually used. It's the md argument of
match(), which never changes. */

#define REGISTER

/* RMATCH simulates a recursive call of match(). It obtains a new frame from
pcre_stack_malloc, records in the current frame (with setjmp) where to resume,
copies the "arguments" into the new frame, makes that the current frame, and
jumps to HEAP_RECURSE. When RRETURN later longjmps back, the current frame is
recovered from md->thisframe and the result is taken from its Xresult field.

If the new frame cannot be obtained, the current "call" is abandoned with
PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer. RRETURN is used
for this so that the current frame is released and the error is passed back
to the calling frame in the usual way. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }

/* RRETURN simulates a return from match(). The current frame is freed and
the previous one becomes current. If there is a previous frame, the result is
stored in it, it is published in md->thisframe, and control goes back to the
point saved by its RMATCH. Otherwise this is the top level, and a genuine
return from the function is made. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }
263
264
265 /* Structure for remembering the local variables in a private frame */
266
267 typedef struct heapframe {
268 struct heapframe *Xprevframe;
269
270 /* Function arguments that may change */
271
272 const uschar *Xeptr;
273 const uschar *Xecode;
274 int Xoffset_top;
275 long int Xims;
276 eptrblock *Xeptrb;
277 int Xflags;
278 int Xrdepth;
279
280 /* Function local variables */
281
282 const uschar *Xcallpat;
283 const uschar *Xcharptr;
284 const uschar *Xdata;
285 const uschar *Xnext;
286 const uschar *Xpp;
287 const uschar *Xprev;
288 const uschar *Xsaved_eptr;
289
290 recursion_info Xnew_recursive;
291
292 BOOL Xcur_is_word;
293 BOOL Xcondition;
294 BOOL Xminimize;
295 BOOL Xprev_is_word;
296
297 unsigned long int Xoriginal_ims;
298
299 #ifdef SUPPORT_UCP
300 int Xprop_type;
301 int Xprop_value;
302 int Xprop_fail_result;
303 int Xprop_category;
304 int Xprop_chartype;
305 int Xprop_script;
306 int *Xprop_test_variable;
307 #endif
308
309 int Xctype;
310 int Xfc;
311 int Xfi;
312 int Xlength;
313 int Xmax;
314 int Xmin;
315 int Xnumber;
316 int Xoffset;
317 int Xop;
318 int Xsave_capture_last;
319 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
320 int Xstacksave[REC_STACK_SAVE_MAX];
321
322 eptrblock Xnewptrb;
323
324 /* Place to pass back result, and where to jump back to */
325
326 int Xresult;
327 jmp_buf Xwhere;
328
329 } heapframe;
330
331 #endif
332
333
334 /***************************************************************************
335 ***************************************************************************/
336
337
338
339 /*************************************************
340 * Match from current position *
341 *************************************************/
342
343 /* On entry ecode points to the first opcode, and eptr to the first character
344 in the subject string, while eptrb holds the value of eptr at the start of the
345 last bracketed group - used for breaking infinite loops matching zero-length
346 strings. This function is called recursively in many circumstances. Whenever it
347 returns a negative (error) response, the outer incarnation must also return the
348 same response.
349
350 Performance note: It might be tempting to extract commonly used fields from the
351 md structure (e.g. utf8, end_subject) into individual variables to improve
352 performance. Tests using gcc on a SPARC disproved this; in the first case, it
353 made performance worse.
354
355 Arguments:
356 eptr pointer in subject
357 ecode position in code
358 offset_top current top pointer
359 md pointer to "static" info for the match
360 ims current /i, /m, and /s options
361 eptrb pointer to chain of blocks containing eptr at start of
362 brackets - for testing for empty matches
363 flags can contain
364 match_condassert - this is an assertion condition
365 match_isgroup - this is the start of a bracketed group
366 rdepth the recursion depth
367
368 Returns: MATCH_MATCH if matched ) these values are >= 0
369 MATCH_NOMATCH if failed to match )
370 a negative PCRE_ERROR_xxx value if aborted by an error condition
371 (e.g. stopped by repeated call or recursion limit)
372 */
373
374 static int
375 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
376 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
377 int flags, int rdepth)
378 {
379 /* These variables do not need to be preserved over recursion in this function,
380 so they can be ordinary variables in all cases. Mark them with "register"
381 because they are used a lot in loops. */
382
383 register int rrc; /* Returns from recursive calls */
384 register int i; /* Used for loops not involving calls to RMATCH() */
385 register int c; /* Character values not kept over RMATCH() calls */
386 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
387
388 /* When recursion is not being used, all "local" variables that have to be
389 preserved over calls to RMATCH() are part of a "frame" which is obtained from
390 heap storage. Set up the top-level frame here; others are obtained from the
391 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
392
393 #ifdef NO_RECURSE
394 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
395 frame->Xprevframe = NULL; /* Marks the top level */
396
397 /* Copy in the original argument variables */
398
399 frame->Xeptr = eptr;
400 frame->Xecode = ecode;
401 frame->Xoffset_top = offset_top;
402 frame->Xims = ims;
403 frame->Xeptrb = eptrb;
404 frame->Xflags = flags;
405 frame->Xrdepth = rdepth;
406
407 /* This is where control jumps back to to effect "recursion" */
408
409 HEAP_RECURSE:
410
411 /* Macros make the argument variables come from the current frame */
412
413 #define eptr frame->Xeptr
414 #define ecode frame->Xecode
415 #define offset_top frame->Xoffset_top
416 #define ims frame->Xims
417 #define eptrb frame->Xeptrb
418 #define flags frame->Xflags
419 #define rdepth frame->Xrdepth
420
421 /* Ditto for the local variables */
422
423 #ifdef SUPPORT_UTF8
424 #define charptr frame->Xcharptr
425 #endif
426 #define callpat frame->Xcallpat
427 #define data frame->Xdata
428 #define next frame->Xnext
429 #define pp frame->Xpp
430 #define prev frame->Xprev
431 #define saved_eptr frame->Xsaved_eptr
432
433 #define new_recursive frame->Xnew_recursive
434
435 #define cur_is_word frame->Xcur_is_word
436 #define condition frame->Xcondition
437 #define minimize frame->Xminimize
438 #define prev_is_word frame->Xprev_is_word
439
440 #define original_ims frame->Xoriginal_ims
441
442 #ifdef SUPPORT_UCP
443 #define prop_type frame->Xprop_type
444 #define prop_value frame->Xprop_value
445 #define prop_fail_result frame->Xprop_fail_result
446 #define prop_category frame->Xprop_category
447 #define prop_chartype frame->Xprop_chartype
448 #define prop_script frame->Xprop_script
449 #define prop_test_variable frame->Xprop_test_variable
450 #endif
451
452 #define ctype frame->Xctype
453 #define fc frame->Xfc
454 #define fi frame->Xfi
455 #define length frame->Xlength
456 #define max frame->Xmax
457 #define min frame->Xmin
458 #define number frame->Xnumber
459 #define offset frame->Xoffset
460 #define op frame->Xop
461 #define save_capture_last frame->Xsave_capture_last
462 #define save_offset1 frame->Xsave_offset1
463 #define save_offset2 frame->Xsave_offset2
464 #define save_offset3 frame->Xsave_offset3
465 #define stacksave frame->Xstacksave
466
467 #define newptrb frame->Xnewptrb
468
469 /* When recursion is being used, local variables are allocated on the stack and
470 get preserved during recursion in the normal way. In this environment, fi and
471 i, and fc and c, can be the same variables. */
472
473 #else
474 #define fi i
475 #define fc c
476
477
478 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
479 const uschar *charptr; /* in small blocks of the code. My normal */
480 #endif /* style of coding would have declared */
481 const uschar *callpat; /* them within each of those blocks. */
482 const uschar *data; /* However, in order to accommodate the */
483 const uschar *next; /* version of this code that uses an */
484 USPTR pp; /* external "stack" implemented on the */
485 const uschar *prev; /* heap, it is easier to declare them all */
486 USPTR saved_eptr; /* here, so the declarations can be cut */
487 /* out in a block. The only declarations */
488 recursion_info new_recursive; /* within blocks below are for variables */
489 /* that do not have to be preserved over */
490 BOOL cur_is_word; /* a recursive call to RMATCH(). */
491 BOOL condition;
492 BOOL minimize;
493 BOOL prev_is_word;
494
495 unsigned long int original_ims;
496
497 #ifdef SUPPORT_UCP
498 int prop_type;
499 int prop_value;
500 int prop_fail_result;
501 int prop_category;
502 int prop_chartype;
503 int prop_script;
504 int *prop_test_variable;
505 #endif
506
507 int ctype;
508 int length;
509 int max;
510 int min;
511 int number;
512 int offset;
513 int op;
514 int save_capture_last;
515 int save_offset1, save_offset2, save_offset3;
516 int stacksave[REC_STACK_SAVE_MAX];
517
518 eptrblock newptrb;
519 #endif
520
521 /* These statements are here to stop the compiler complaining about uninitialized
522 variables. */
523
524 #ifdef SUPPORT_UCP
525 prop_value = 0;
526 prop_fail_result = 0;
527 prop_test_variable = NULL;
528 #endif
529
530 /* OK, now we can get on with the real code of the function. Recursive calls
531 are specified by the macro RMATCH and RRETURN is used to return. When
532 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
533 and a "return", respectively (possibly with some debugging if DEBUG is
534 defined). However, RMATCH isn't like a function call because it's quite a
535 complicated macro. It has to be used in one particular way. This shouldn't,
536 however, impact performance when true recursion is being used. */
537
538 /* First check that we haven't called match() too many times, or that we
539 haven't exceeded the recursive call limit. */
540
541 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
542 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
543
544 original_ims = ims; /* Save for resetting on ')' */
545 utf8 = md->utf8; /* Local copy of the flag */
546
547 /* At the start of a bracketed group, add the current subject pointer to the
548 stack of such pointers, to be re-instated at the end of the group when we hit
549 the closing ket. When match() is called in other circumstances, we don't add to
550 this stack. */
551
552 if ((flags & match_isgroup) != 0)
553 {
554 newptrb.epb_prev = eptrb;
555 newptrb.epb_saved_eptr = eptr;
556 eptrb = &newptrb;
557 }
558
559 /* Now start processing the operations. */
560
561 for (;;)
562 {
563 op = *ecode;
564 minimize = FALSE;
565
566 /* For partial matching, remember if we ever hit the end of the subject after
567 matching at least one subject character. */
568
569 if (md->partial &&
570 eptr >= md->end_subject &&
571 eptr > md->start_match)
572 md->hitend = TRUE;
573
574 /* Opening capturing bracket. If there is space in the offset vector, save
575 the current subject position in the working slot at the top of the vector. We
576 mustn't change the current values of the data slot, because they may be set
577 from a previous iteration of this group, and be referred to by a reference
578 inside the group.
579
580 If the bracket fails to match, we need to restore this value and also the
581 values of the final offsets, in case they were set by a previous iteration of
582 the same bracket.
583
584 If there isn't enough space in the offset vector, treat this as if it were a
585 non-capturing bracket. Don't worry about setting the flag for the error case
586 here; that is handled in the code for KET. */
587
588 if (op > OP_BRA)
589 {
590 number = op - OP_BRA;
591
592 /* For extended extraction brackets (large number), we have to fish out the
593 number from a dummy opcode at the start. */
594
595 if (number > EXTRACT_BASIC_MAX)
596 number = GET2(ecode, 2+LINK_SIZE);
597 offset = number << 1;
598
599 #ifdef DEBUG
600 printf("start bracket %d subject=", number);
601 pchars(eptr, 16, TRUE, md);
602 printf("\n");
603 #endif
604
605 if (offset < md->offset_max)
606 {
607 save_offset1 = md->offset_vector[offset];
608 save_offset2 = md->offset_vector[offset+1];
609 save_offset3 = md->offset_vector[md->offset_end - number];
610 save_capture_last = md->capture_last;
611
612 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
613 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
614
615 do
616 {
617 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
618 match_isgroup);
619 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
620 md->capture_last = save_capture_last;
621 ecode += GET(ecode, 1);
622 }
623 while (*ecode == OP_ALT);
624
625 DPRINTF(("bracket %d failed\n", number));
626
627 md->offset_vector[offset] = save_offset1;
628 md->offset_vector[offset+1] = save_offset2;
629 md->offset_vector[md->offset_end - number] = save_offset3;
630
631 RRETURN(MATCH_NOMATCH);
632 }
633
634 /* Insufficient room for saving captured contents */
635
636 else op = OP_BRA;
637 }
638
639 /* Other types of node can be handled by a switch */
640
641 switch(op)
642 {
643 case OP_BRA: /* Non-capturing bracket: optimized */
644 DPRINTF(("start bracket 0\n"));
645 do
646 {
647 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
648 match_isgroup);
649 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
650 ecode += GET(ecode, 1);
651 }
652 while (*ecode == OP_ALT);
653 DPRINTF(("bracket 0 failed\n"));
654 RRETURN(MATCH_NOMATCH);
655
656 /* Conditional group: compilation checked that there are no more than
657 two branches. If the condition is false, skipping the first branch takes us
658 past the end if there is only one branch, but that's OK because that is
659 exactly what going to the ket would do. */
660
661 case OP_COND:
662 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
663 {
664 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
665 condition = (offset == CREF_RECURSE * 2)?
666 (md->recursive != NULL) :
667 (offset < offset_top && md->offset_vector[offset] >= 0);
668 RMATCH(rrc, eptr, ecode + (condition?
669 (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
670 offset_top, md, ims, eptrb, match_isgroup);
671 RRETURN(rrc);
672 }
673
674 /* The condition is an assertion. Call match() to evaluate it - setting the
675 match_condassert flag makes it return as soon as the assertion is decided. */
676
677 else
678 {
679 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
680 match_condassert | match_isgroup);
681 if (rrc == MATCH_MATCH)
682 {
683 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
684 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
685 }
686 else if (rrc != MATCH_NOMATCH)
687 {
688 RRETURN(rrc); /* Need braces because of following else */
689 }
690 else ecode += GET(ecode, 1);
691 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
692 match_isgroup);
693 RRETURN(rrc);
694 }
695 /* Control never reaches here */
696
697 /* Skip over conditional reference or large extraction number data if
698 encountered. */
699
700 case OP_CREF:
701 case OP_BRANUMBER:
702 ecode += 3;
703 break;
704
705 /* End of the pattern. If we are in a recursion, we should restore the
706 offsets appropriately and continue from after the call. */
707
708 case OP_END:
709 if (md->recursive != NULL && md->recursive->group_num == 0)
710 {
711 recursion_info *rec = md->recursive;
712 DPRINTF(("End of pattern in a (?0) recursion\n"));
713 md->recursive = rec->prevrec;
714 memmove(md->offset_vector, rec->offset_save,
715 rec->saved_max * sizeof(int));
716 md->start_match = rec->save_start;
717 ims = original_ims;
718 ecode = rec->after_call;
719 break;
720 }
721
722 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
723 string - backtracking will then try other alternatives, if any. */
724
725 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
726 md->end_match_ptr = eptr; /* Record where we ended */
727 md->end_offset_top = offset_top; /* and how many extracts were taken */
728 RRETURN(MATCH_MATCH);
729
730 /* Change option settings */
731
732 case OP_OPT:
733 ims = ecode[1];
734 ecode += 2;
735 DPRINTF(("ims set to %02lx\n", ims));
736 break;
737
738 /* Assertion brackets. Check the alternative branches in turn - the
739 matching won't pass the KET for an assertion. If any one branch matches,
740 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
741 start of each branch to move the current point backwards, so the code at
742 this level is identical to the lookahead case. */
743
744 case OP_ASSERT:
745 case OP_ASSERTBACK:
746 do
747 {
748 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
749 match_isgroup);
750 if (rrc == MATCH_MATCH) break;
751 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
752 ecode += GET(ecode, 1);
753 }
754 while (*ecode == OP_ALT);
755 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
756
757 /* If checking an assertion for a condition, return MATCH_MATCH. */
758
759 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
760
761 /* Continue from after the assertion, updating the offsets high water
762 mark, since extracts may have been taken during the assertion. */
763
764 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
765 ecode += 1 + LINK_SIZE;
766 offset_top = md->end_offset_top;
767 continue;
768
769 /* Negative assertion: all branches must fail to match */
770
771 case OP_ASSERT_NOT:
772 case OP_ASSERTBACK_NOT:
773 do
774 {
775 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
776 match_isgroup);
777 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
778 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
779 ecode += GET(ecode,1);
780 }
781 while (*ecode == OP_ALT);
782
783 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
784
785 ecode += 1 + LINK_SIZE;
786 continue;
787
788 /* Move the subject pointer back. This occurs only at the start of
789 each branch of a lookbehind assertion. If we are too close to the start to
790 move back, this match function fails. When working with UTF-8 we move
791 back a number of characters, not bytes. */
792
793 case OP_REVERSE:
794 #ifdef SUPPORT_UTF8
795 if (utf8)
796 {
797 c = GET(ecode,1);
798 for (i = 0; i < c; i++)
799 {
800 eptr--;
801 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
802 BACKCHAR(eptr)
803 }
804 }
805 else
806 #endif
807
808 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
809
810 {
811 eptr -= GET(ecode,1);
812 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
813 }
814
815 /* Skip to next op code */
816
817 ecode += 1 + LINK_SIZE;
818 break;
819
820 /* The callout item calls an external function, if one is provided, passing
821 details of the match so far. This is mainly for debugging, though the
822 function is able to force a failure. */
823
824 case OP_CALLOUT:
825 if (pcre_callout != NULL)
826 {
827 pcre_callout_block cb;
828 cb.version = 1; /* Version 1 of the callout block */
829 cb.callout_number = ecode[1];
830 cb.offset_vector = md->offset_vector;
831 cb.subject = (PCRE_SPTR)md->start_subject;
832 cb.subject_length = md->end_subject - md->start_subject;
833 cb.start_match = md->start_match - md->start_subject;
834 cb.current_position = eptr - md->start_subject;
835 cb.pattern_position = GET(ecode, 2);
836 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
837 cb.capture_top = offset_top/2;
838 cb.capture_last = md->capture_last;
839 cb.callout_data = md->callout_data;
840 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
841 if (rrc < 0) RRETURN(rrc);
842 }
843 ecode += 2 + 2*LINK_SIZE;
844 break;
845
846 /* Recursion either matches the current regex, or some subexpression. The
847 offset data is the offset to the starting bracket from the start of the
848 whole pattern. (This is so that it works from duplicated subpatterns.)
849
850 If there are any capturing brackets started but not finished, we have to
851 save their starting points and reinstate them after the recursion. However,
852 we don't know how many such there are (offset_top records the completed
853 total) so we just have to save all the potential data. There may be up to
854 65535 such values, which is too large to put on the stack, but using malloc
855 for small numbers seems expensive. As a compromise, the stack is used when
856 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
857 is used. If the malloc fails, the offsets cannot be saved, so this call of
858 match() gives up and returns PCRE_ERROR_NOMEMORY, which is passed back up
859 to the top level in the same way as any other error return.
860
861 There are also other values that have to be saved. We use a chained
862 sequence of blocks that actually live on the stack. Thanks to Robin Houston
863 for the original version of this logic. */
864
865 case OP_RECURSE:
866 {
867 callpat = md->start_code + GET(ecode, 1);
868 new_recursive.group_num = *callpat - OP_BRA;
869
870 /* For extended extraction brackets (large number), we have to fish out
871 the number from a dummy opcode at the start. */
872
873 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
874 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
875
876 /* Add to "recursing stack" */
877
878 new_recursive.prevrec = md->recursive;
879 md->recursive = &new_recursive;
880
881 /* Find where to continue from afterwards */
882
883 ecode += 1 + LINK_SIZE;
884 new_recursive.after_call = ecode;
885
886 /* Now save the offset data. */
887
888 new_recursive.saved_max = md->offset_end;
889 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
890 new_recursive.offset_save = stacksave;
891 else
892 {
893 new_recursive.offset_save =
894 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
895 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
896 }
897
898 memcpy(new_recursive.offset_save, md->offset_vector,
899 new_recursive.saved_max * sizeof(int));
900 new_recursive.save_start = md->start_match;
901 md->start_match = eptr;
902
903 /* OK, now we can do the recursion. For each top-level alternative we
904 restore the offset and recursion data. */
905
906 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
907 do
908 {
909 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
910 eptrb, match_isgroup);
911 if (rrc == MATCH_MATCH)
912 {
913 DPRINTF(("Recursion matched\n"));
914 md->recursive = new_recursive.prevrec;
915 if (new_recursive.offset_save != stacksave)
916 (pcre_free)(new_recursive.offset_save);
917 RRETURN(MATCH_MATCH);
918 }
919 else if (rrc != MATCH_NOMATCH)
920 {
921 DPRINTF(("Recursion gave error %d\n", rrc));
922 RRETURN(rrc);
923 }
924
925 md->recursive = &new_recursive;
926 memcpy(md->offset_vector, new_recursive.offset_save,
927 new_recursive.saved_max * sizeof(int));
928 callpat += GET(callpat, 1);
929 }
930 while (*callpat == OP_ALT);
931
932 DPRINTF(("Recursion didn't match\n"));
933 md->recursive = new_recursive.prevrec;
934 if (new_recursive.offset_save != stacksave)
935 (pcre_free)(new_recursive.offset_save);
936 RRETURN(MATCH_NOMATCH);
937 }
938 /* Control never reaches here */
939
940 /* "Once" brackets are like assertion brackets except that after a match,
941 the point in the subject string is not moved back. Thus there can never be
942 a move back into the brackets. Friedl calls these "atomic" subpatterns.
943 Check the alternative branches in turn - the matching won't pass the KET
944 for this kind of subpattern. If any one branch matches, we carry on as at
945 the end of a normal bracket, leaving the subject pointer. */
946
947 case OP_ONCE:
948 {
949 prev = ecode;
950 saved_eptr = eptr;
951
952 do
953 {
954 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
955 eptrb, match_isgroup);
956 if (rrc == MATCH_MATCH) break;
957 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
958 ecode += GET(ecode,1);
959 }
960 while (*ecode == OP_ALT);
961
962 /* If we hit the end of the group (which could be repeated), fail */
963
964 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
965
966 /* Continue as from after the assertion, updating the offsets high water
967 mark, since extracts may have been taken. */
968
969 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
970
971 offset_top = md->end_offset_top;
972 eptr = md->end_match_ptr;
973
974 /* For a non-repeating ket, just continue at this level. This also
975 happens for a repeating ket if no characters were matched in the group.
976 This is the forcible breaking of infinite loops as implemented in Perl
977 5.005. If there is an options reset, it will get obeyed in the normal
978 course of events. */
979
980 if (*ecode == OP_KET || eptr == saved_eptr)
981 {
982 ecode += 1+LINK_SIZE;
983 break;
984 }
985
986 /* The repeating kets try the rest of the pattern or restart from the
987 preceding bracket, in the appropriate order. We need to reset any options
988 that changed within the bracket before re-running it, so check the next
989 opcode. */
990
991 if (ecode[1+LINK_SIZE] == OP_OPT)
992 {
993 ims = (ims & ~PCRE_IMS) | ecode[4];
994 DPRINTF(("ims set to %02lx at group repeat\n", ims));
995 }
996
997 if (*ecode == OP_KETRMIN)
998 {
999 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1000 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1001 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1002 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1003 }
1004 else /* OP_KETRMAX */
1005 {
1006 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1007 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1008 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1009 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1010 }
1011 }
1012 RRETURN(MATCH_NOMATCH);
1013
1014 /* An alternation is the end of a branch; scan along to find the end of the
1015 bracketed group and go to there. */
1016
1017 case OP_ALT:
1018 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1019 break;
1020
1021 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1022 that it may occur zero times. It may repeat infinitely, or not at all -
1023 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1024 repeat limits are compiled as a number of copies, with the optional ones
1025 preceded by BRAZERO or BRAMINZERO. */
1026
1027 case OP_BRAZERO:
1028 {
1029 next = ecode+1;
1030 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
1031 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1032 do next += GET(next,1); while (*next == OP_ALT);
1033 ecode = next + 1+LINK_SIZE;
1034 }
1035 break;
1036
1037 case OP_BRAMINZERO:
1038 {
1039 next = ecode+1;
1040 do next += GET(next,1); while (*next == OP_ALT);
1041 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1042 match_isgroup);
1043 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1044 ecode++;
1045 }
1046 break;
1047
1048 /* End of a group, repeated or non-repeating. If we are at the end of
1049 an assertion "group", stop matching and return MATCH_MATCH, but record the
1050 current high water mark for use by positive assertions. Do this also
1051 for the "once" (not backing up) groups. */
1052
1053 case OP_KET:
1054 case OP_KETRMIN:
1055 case OP_KETRMAX:
1056 {
1057 prev = ecode - GET(ecode, 1);
1058 saved_eptr = eptrb->epb_saved_eptr;
1059
1060 /* Back up the stack of bracket start pointers. */
1061
1062 eptrb = eptrb->epb_prev;
1063
1064 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1065 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1066 *prev == OP_ONCE)
1067 {
1068 md->end_match_ptr = eptr; /* For ONCE */
1069 md->end_offset_top = offset_top;
1070 RRETURN(MATCH_MATCH);
1071 }
1072
1073 /* In all other cases except a conditional group we have to check the
1074 group number back at the start and if necessary complete handling an
1075 extraction by setting the offsets and bumping the high water mark. */
1076
1077 if (*prev != OP_COND)
1078 {
1079 number = *prev - OP_BRA;
1080
1081 /* For extended extraction brackets (large number), we have to fish out
1082 the number from a dummy opcode at the start. */
1083
1084 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1085 offset = number << 1;
1086
1087 #ifdef DEBUG
1088 printf("end bracket %d", number);
1089 printf("\n");
1090 #endif
1091
1092 /* Test for a numbered group. This includes groups called as a result
1093 of recursion. Note that whole-pattern recursion is coded as a recurse
1094 into group 0, so it won't be picked up here. Instead, we catch it when
1095 the OP_END is reached. */
1096
1097 if (number > 0)
1098 {
1099 md->capture_last = number;
1100 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1101 {
1102 md->offset_vector[offset] =
1103 md->offset_vector[md->offset_end - number];
1104 md->offset_vector[offset+1] = eptr - md->start_subject;
1105 if (offset_top <= offset) offset_top = offset + 2;
1106 }
1107
1108 /* Handle a recursively called group. Restore the offsets
1109 appropriately and continue from after the call. */
1110
1111 if (md->recursive != NULL && md->recursive->group_num == number)
1112 {
1113 recursion_info *rec = md->recursive;
1114 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1115 md->recursive = rec->prevrec;
1116 md->start_match = rec->save_start;
1117 memcpy(md->offset_vector, rec->offset_save,
1118 rec->saved_max * sizeof(int));
1119 ecode = rec->after_call;
1120 ims = original_ims;
1121 break;
1122 }
1123 }
1124 }
1125
1126 /* Reset the value of the ims flags, in case they got changed during
1127 the group. */
1128
1129 ims = original_ims;
1130 DPRINTF(("ims reset to %02lx\n", ims));
1131
1132 /* For a non-repeating ket, just continue at this level. This also
1133 happens for a repeating ket if no characters were matched in the group.
1134 This is the forcible breaking of infinite loops as implemented in Perl
1135 5.005. If there is an options reset, it will get obeyed in the normal
1136 course of events. */
1137
1138 if (*ecode == OP_KET || eptr == saved_eptr)
1139 {
1140 ecode += 1 + LINK_SIZE;
1141 break;
1142 }
1143
1144 /* The repeating kets try the rest of the pattern or restart from the
1145 preceding bracket, in the appropriate order. */
1146
1147 if (*ecode == OP_KETRMIN)
1148 {
1149 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1150 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1151 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1152 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1153 }
1154 else /* OP_KETRMAX */
1155 {
1156 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1157 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1158 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1159 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1160 }
1161 }
1162
1163 RRETURN(MATCH_NOMATCH);
1164
1165 /* Start of subject unless notbol, or after internal newline if multiline */
1166
1167 case OP_CIRC:
1168 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1169 if ((ims & PCRE_MULTILINE) != 0)
1170 {
1171 if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1172 RRETURN(MATCH_NOMATCH);
1173 ecode++;
1174 break;
1175 }
1176 /* ... else fall through */
1177
1178 /* Start of subject assertion */
1179
1180 case OP_SOD:
1181 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1182 ecode++;
1183 break;
1184
1185 /* Start of match assertion */
1186
1187 case OP_SOM:
1188 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1189 ecode++;
1190 break;
1191
1192 /* Assert before internal newline if multiline, or before a terminating
1193 newline unless endonly is set, else end of subject unless noteol is set. */
1194
1195 case OP_DOLL:
1196 if ((ims & PCRE_MULTILINE) != 0)
1197 {
1198 if (eptr < md->end_subject)
1199 { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1200 else
1201 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1202 ecode++;
1203 break;
1204 }
1205 else
1206 {
1207 if (md->noteol) RRETURN(MATCH_NOMATCH);
1208 if (!md->endonly)
1209 {
1210 if (eptr < md->end_subject - 1 ||
1211 (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1212 RRETURN(MATCH_NOMATCH);
1213 ecode++;
1214 break;
1215 }
1216 }
1217 /* ... else fall through */
1218
1219 /* End of subject assertion (\z) */
1220
1221 case OP_EOD:
1222 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1223 ecode++;
1224 break;
1225
1226 /* End of subject or ending \n assertion (\Z) */
1227
1228 case OP_EODN:
1229 if (eptr < md->end_subject - 1 ||
1230 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1231 ecode++;
1232 break;
1233
1234 /* Word boundary assertions */
1235
1236 case OP_NOT_WORD_BOUNDARY:
1237 case OP_WORD_BOUNDARY:
1238 {
1239
1240 /* Find out if the previous and current characters are "word" characters.
1241 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1242 be "non-word" characters. */
1243
1244 #ifdef SUPPORT_UTF8
1245 if (utf8)
1246 {
1247 if (eptr == md->start_subject) prev_is_word = FALSE; else
1248 {
1249 const uschar *lastptr = eptr - 1;
1250 while((*lastptr & 0xc0) == 0x80) lastptr--;
1251 GETCHAR(c, lastptr);
1252 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1253 }
1254 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1255 {
1256 GETCHAR(c, eptr);
1257 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1258 }
1259 }
1260 else
1261 #endif
1262
1263 /* More streamlined when not in UTF-8 mode */
1264
1265 {
1266 prev_is_word = (eptr != md->start_subject) &&
1267 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1268 cur_is_word = (eptr < md->end_subject) &&
1269 ((md->ctypes[*eptr] & ctype_word) != 0);
1270 }
1271
1272 /* Now see if the situation is what we want */
1273
1274 if ((*ecode++ == OP_WORD_BOUNDARY)?
1275 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1276 RRETURN(MATCH_NOMATCH);
1277 }
1278 break;
1279
1280 /* Match a single character type; inline for speed */
1281
1282 case OP_ANY:
1283 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1284 RRETURN(MATCH_NOMATCH);
1285 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1286 #ifdef SUPPORT_UTF8
1287 if (utf8)
1288 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1289 #endif
1290 ecode++;
1291 break;
1292
1293 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1294 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1295
1296 case OP_ANYBYTE:
1297 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1298 ecode++;
1299 break;
1300
1301 case OP_NOT_DIGIT:
1302 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1303 GETCHARINCTEST(c, eptr);
1304 if (
1305 #ifdef SUPPORT_UTF8
1306 c < 256 &&
1307 #endif
1308 (md->ctypes[c] & ctype_digit) != 0
1309 )
1310 RRETURN(MATCH_NOMATCH);
1311 ecode++;
1312 break;
1313
1314 case OP_DIGIT:
1315 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1316 GETCHARINCTEST(c, eptr);
1317 if (
1318 #ifdef SUPPORT_UTF8
1319 c >= 256 ||
1320 #endif
1321 (md->ctypes[c] & ctype_digit) == 0
1322 )
1323 RRETURN(MATCH_NOMATCH);
1324 ecode++;
1325 break;
1326
1327 case OP_NOT_WHITESPACE:
1328 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1329 GETCHARINCTEST(c, eptr);
1330 if (
1331 #ifdef SUPPORT_UTF8
1332 c < 256 &&
1333 #endif
1334 (md->ctypes[c] & ctype_space) != 0
1335 )
1336 RRETURN(MATCH_NOMATCH);
1337 ecode++;
1338 break;
1339
1340 case OP_WHITESPACE:
1341 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1342 GETCHARINCTEST(c, eptr);
1343 if (
1344 #ifdef SUPPORT_UTF8
1345 c >= 256 ||
1346 #endif
1347 (md->ctypes[c] & ctype_space) == 0
1348 )
1349 RRETURN(MATCH_NOMATCH);
1350 ecode++;
1351 break;
1352
1353 case OP_NOT_WORDCHAR:
1354 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1355 GETCHARINCTEST(c, eptr);
1356 if (
1357 #ifdef SUPPORT_UTF8
1358 c < 256 &&
1359 #endif
1360 (md->ctypes[c] & ctype_word) != 0
1361 )
1362 RRETURN(MATCH_NOMATCH);
1363 ecode++;
1364 break;
1365
1366 case OP_WORDCHAR:
1367 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1368 GETCHARINCTEST(c, eptr);
1369 if (
1370 #ifdef SUPPORT_UTF8
1371 c >= 256 ||
1372 #endif
1373 (md->ctypes[c] & ctype_word) == 0
1374 )
1375 RRETURN(MATCH_NOMATCH);
1376 ecode++;
1377 break;
1378
1379 #ifdef SUPPORT_UCP
1380 /* Check the next character by Unicode property. We will get here only
1381 if the support is in the binary; otherwise a compile-time error occurs. */
1382
1383 case OP_PROP:
1384 case OP_NOTPROP:
1385 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1386 GETCHARINCTEST(c, eptr);
1387 {
1388 int chartype, script;
1389 int category = _pcre_ucp_findprop(c, &chartype, &script);
1390
1391 switch(ecode[1])
1392 {
1393 case PT_ANY:
1394 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1395 break;
1396
1397 case PT_LAMP:
1398 if ((chartype == ucp_Lu ||
1399 chartype == ucp_Ll ||
1400 chartype == ucp_Lt) == (op == OP_NOTPROP))
1401 RRETURN(MATCH_NOMATCH);
1402 break;
1403
1404 case PT_GC:
1405 if ((ecode[2] != category) == (op == OP_PROP))
1406 RRETURN(MATCH_NOMATCH);
1407 break;
1408
1409 case PT_PC:
1410 if ((ecode[2] != chartype) == (op == OP_PROP))
1411 RRETURN(MATCH_NOMATCH);
1412 break;
1413
1414 case PT_SC:
1415 if ((ecode[2] != script) == (op == OP_PROP))
1416 RRETURN(MATCH_NOMATCH);
1417 break;
1418
1419 default:
1420 RRETURN(PCRE_ERROR_INTERNAL);
1421 break;
1422 }
1423
1424 ecode += 3;
1425 }
1426 break;
1427
1428 /* Match an extended Unicode sequence. We will get here only if the support
1429 is in the binary; otherwise a compile-time error occurs. */
1430
1431 case OP_EXTUNI:
1432 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1433 GETCHARINCTEST(c, eptr);
1434 {
1435 int chartype, script;
1436 int category = _pcre_ucp_findprop(c, &chartype, &script);
1437 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1438 while (eptr < md->end_subject)
1439 {
1440 int len = 1;
1441 if (!utf8) c = *eptr; else
1442 {
1443 GETCHARLEN(c, eptr, len);
1444 }
1445 category = _pcre_ucp_findprop(c, &chartype, &script);
1446 if (category != ucp_M) break;
1447 eptr += len;
1448 }
1449 }
1450 ecode++;
1451 break;
1452 #endif
1453
1454
1455 /* Match a back reference, possibly repeatedly. Look past the end of the
1456 item to see if there is repeat information following. The code is similar
1457 to that for character classes, but repeated for efficiency. Then obey
1458 similar code to character type repeats - written out again for speed.
1459 However, if the referenced string is the empty string, always treat
1460 it as matched, any number of times (otherwise there could be infinite
1461 loops). */
1462
1463 case OP_REF:
1464 {
1465 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1466 ecode += 3; /* Advance past item */
1467
1468 /* If the reference is unset, set the length to be longer than the amount
1469 of subject left; this ensures that every attempt at a match fails. We
1470 can't just fail here, because of the possibility of quantifiers with zero
1471 minima. */
1472
1473 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1474 md->end_subject - eptr + 1 :
1475 md->offset_vector[offset+1] - md->offset_vector[offset];
1476
1477 /* Set up for repetition, or handle the non-repeated case */
1478
1479 switch (*ecode)
1480 {
1481 case OP_CRSTAR:
1482 case OP_CRMINSTAR:
1483 case OP_CRPLUS:
1484 case OP_CRMINPLUS:
1485 case OP_CRQUERY:
1486 case OP_CRMINQUERY:
1487 c = *ecode++ - OP_CRSTAR;
1488 minimize = (c & 1) != 0;
1489 min = rep_min[c]; /* Pick up values from tables; */
1490 max = rep_max[c]; /* zero for max => infinity */
1491 if (max == 0) max = INT_MAX;
1492 break;
1493
1494 case OP_CRRANGE:
1495 case OP_CRMINRANGE:
1496 minimize = (*ecode == OP_CRMINRANGE);
1497 min = GET2(ecode, 1);
1498 max = GET2(ecode, 3);
1499 if (max == 0) max = INT_MAX;
1500 ecode += 5;
1501 break;
1502
1503 default: /* No repeat follows */
1504 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1505 eptr += length;
1506 continue; /* With the main loop */
1507 }
1508
1509 /* If the length of the reference is zero, just continue with the
1510 main loop. */
1511
1512 if (length == 0) continue;
1513
1514 /* First, ensure the minimum number of matches are present. We get back
1515 the length of the reference string explicitly rather than passing the
1516 address of eptr, so that eptr can be a register variable. */
1517
1518 for (i = 1; i <= min; i++)
1519 {
1520 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1521 eptr += length;
1522 }
1523
1524 /* If min = max, continue at the same level without recursion.
1525 They are not both allowed to be zero. */
1526
1527 if (min == max) continue;
1528
1529 /* If minimizing, keep trying and advancing the pointer */
1530
1531 if (minimize)
1532 {
1533 for (fi = min;; fi++)
1534 {
1535 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1536 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1537 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1538 RRETURN(MATCH_NOMATCH);
1539 eptr += length;
1540 }
1541 /* Control never gets here */
1542 }
1543
1544 /* If maximizing, find the longest string and work backwards */
1545
1546 else
1547 {
1548 pp = eptr;
1549 for (i = min; i < max; i++)
1550 {
1551 if (!match_ref(offset, eptr, length, md, ims)) break;
1552 eptr += length;
1553 }
1554 while (eptr >= pp)
1555 {
1556 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1557 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1558 eptr -= length;
1559 }
1560 RRETURN(MATCH_NOMATCH);
1561 }
1562 }
1563 /* Control never gets here */
1564
1565
1566
1567 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1568 used when all the characters in the class have values in the range 0-255,
1569 and either the matching is caseful, or the characters are in the range
1570 0-127 when UTF-8 processing is enabled. The only difference between
1571 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1572 encountered.
1573
1574 First, look past the end of the item to see if there is repeat information
1575 following. Then obey similar code to character type repeats - written out
1576 again for speed. */
1577
1578 case OP_NCLASS:
1579 case OP_CLASS:
1580 {
1581 data = ecode + 1; /* Save for matching */
1582 ecode += 33; /* Advance past the item */
1583
1584 switch (*ecode)
1585 {
1586 case OP_CRSTAR:
1587 case OP_CRMINSTAR:
1588 case OP_CRPLUS:
1589 case OP_CRMINPLUS:
1590 case OP_CRQUERY:
1591 case OP_CRMINQUERY:
1592 c = *ecode++ - OP_CRSTAR;
1593 minimize = (c & 1) != 0;
1594 min = rep_min[c]; /* Pick up values from tables; */
1595 max = rep_max[c]; /* zero for max => infinity */
1596 if (max == 0) max = INT_MAX;
1597 break;
1598
1599 case OP_CRRANGE:
1600 case OP_CRMINRANGE:
1601 minimize = (*ecode == OP_CRMINRANGE);
1602 min = GET2(ecode, 1);
1603 max = GET2(ecode, 3);
1604 if (max == 0) max = INT_MAX;
1605 ecode += 5;
1606 break;
1607
1608 default: /* No repeat follows */
1609 min = max = 1;
1610 break;
1611 }
1612
1613 /* First, ensure the minimum number of matches are present. */
1614
1615 #ifdef SUPPORT_UTF8
1616 /* UTF-8 mode */
1617 if (utf8)
1618 {
1619 for (i = 1; i <= min; i++)
1620 {
1621 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1622 GETCHARINC(c, eptr);
1623 if (c > 255)
1624 {
1625 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1626 }
1627 else
1628 {
1629 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1630 }
1631 }
1632 }
1633 else
1634 #endif
1635 /* Not UTF-8 mode */
1636 {
1637 for (i = 1; i <= min; i++)
1638 {
1639 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1640 c = *eptr++;
1641 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1642 }
1643 }
1644
1645 /* If max == min we can continue with the main loop without the
1646 need to recurse. */
1647
1648 if (min == max) continue;
1649
1650 /* If minimizing, keep testing the rest of the expression and advancing
1651 the pointer while it matches the class. */
1652
1653 if (minimize)
1654 {
1655 #ifdef SUPPORT_UTF8
1656 /* UTF-8 mode */
1657 if (utf8)
1658 {
1659 for (fi = min;; fi++)
1660 {
1661 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1662 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1663 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1664 GETCHARINC(c, eptr);
1665 if (c > 255)
1666 {
1667 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1668 }
1669 else
1670 {
1671 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1672 }
1673 }
1674 }
1675 else
1676 #endif
1677 /* Not UTF-8 mode */
1678 {
1679 for (fi = min;; fi++)
1680 {
1681 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1682 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1683 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1684 c = *eptr++;
1685 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1686 }
1687 }
1688 /* Control never gets here */
1689 }
1690
1691 /* If maximizing, find the longest possible run, then work backwards. */
1692
1693 else
1694 {
1695 pp = eptr;
1696
1697 #ifdef SUPPORT_UTF8
1698 /* UTF-8 mode */
1699 if (utf8)
1700 {
1701 for (i = min; i < max; i++)
1702 {
1703 int len = 1;
1704 if (eptr >= md->end_subject) break;
1705 GETCHARLEN(c, eptr, len);
1706 if (c > 255)
1707 {
1708 if (op == OP_CLASS) break;
1709 }
1710 else
1711 {
1712 if ((data[c/8] & (1 << (c&7))) == 0) break;
1713 }
1714 eptr += len;
1715 }
1716 for (;;)
1717 {
1718 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1719 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1720 if (eptr-- == pp) break; /* Stop if tried at original pos */
1721 BACKCHAR(eptr);
1722 }
1723 }
1724 else
1725 #endif
1726 /* Not UTF-8 mode */
1727 {
1728 for (i = min; i < max; i++)
1729 {
1730 if (eptr >= md->end_subject) break;
1731 c = *eptr;
1732 if ((data[c/8] & (1 << (c&7))) == 0) break;
1733 eptr++;
1734 }
1735 while (eptr >= pp)
1736 {
1737 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1738 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1739 eptr--;
1740 }
1741 }
1742
1743 RRETURN(MATCH_NOMATCH);
1744 }
1745 }
1746 /* Control never gets here */
1747
1748
1749 /* Match an extended character class. This opcode is encountered only
1750 in UTF-8 mode, because that's the only time it is compiled. */
1751
1752 #ifdef SUPPORT_UTF8
1753 case OP_XCLASS:
1754 {
1755 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1756 ecode += GET(ecode, 1); /* Advance past the item */
1757
1758 switch (*ecode)
1759 {
1760 case OP_CRSTAR:
1761 case OP_CRMINSTAR:
1762 case OP_CRPLUS:
1763 case OP_CRMINPLUS:
1764 case OP_CRQUERY:
1765 case OP_CRMINQUERY:
1766 c = *ecode++ - OP_CRSTAR;
1767 minimize = (c & 1) != 0;
1768 min = rep_min[c]; /* Pick up values from tables; */
1769 max = rep_max[c]; /* zero for max => infinity */
1770 if (max == 0) max = INT_MAX;
1771 break;
1772
1773 case OP_CRRANGE:
1774 case OP_CRMINRANGE:
1775 minimize = (*ecode == OP_CRMINRANGE);
1776 min = GET2(ecode, 1);
1777 max = GET2(ecode, 3);
1778 if (max == 0) max = INT_MAX;
1779 ecode += 5;
1780 break;
1781
1782 default: /* No repeat follows */
1783 min = max = 1;
1784 break;
1785 }
1786
1787 /* First, ensure the minimum number of matches are present. */
1788
1789 for (i = 1; i <= min; i++)
1790 {
1791 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1792 GETCHARINC(c, eptr);
1793 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1794 }
1795
1796 /* If max == min we can continue with the main loop without the
1797 need to recurse. */
1798
1799 if (min == max) continue;
1800
1801 /* If minimizing, keep testing the rest of the expression and advancing
1802 the pointer while it matches the class. */
1803
1804 if (minimize)
1805 {
1806 for (fi = min;; fi++)
1807 {
1808 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1809 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1810 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1811 GETCHARINC(c, eptr);
1812 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1813 }
1814 /* Control never gets here */
1815 }
1816
1817 /* If maximizing, find the longest possible run, then work backwards. */
1818
1819 else
1820 {
1821 pp = eptr;
1822 for (i = min; i < max; i++)
1823 {
1824 int len = 1;
1825 if (eptr >= md->end_subject) break;
1826 GETCHARLEN(c, eptr, len);
1827 if (!_pcre_xclass(c, data)) break;
1828 eptr += len;
1829 }
1830 for(;;)
1831 {
1832 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1833 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1834 if (eptr-- == pp) break; /* Stop if tried at original pos */
1835 BACKCHAR(eptr)
1836 }
1837 RRETURN(MATCH_NOMATCH);
1838 }
1839
1840 /* Control never gets here */
1841 }
1842 #endif /* End of XCLASS */
1843
1844 /* Match a single character, casefully */
1845
1846 case OP_CHAR:
1847 #ifdef SUPPORT_UTF8
1848 if (utf8)
1849 {
1850 length = 1;
1851 ecode++;
1852 GETCHARLEN(fc, ecode, length);
1853 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1854 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1855 }
1856 else
1857 #endif
1858
1859 /* Non-UTF-8 mode */
1860 {
1861 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1862 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1863 ecode += 2;
1864 }
1865 break;
1866
1867 /* Match a single character, caselessly */
1868
1869 case OP_CHARNC:
1870 #ifdef SUPPORT_UTF8
1871 if (utf8)
1872 {
1873 length = 1;
1874 ecode++;
1875 GETCHARLEN(fc, ecode, length);
1876
1877 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1878
1879 /* If the pattern character's value is < 128, we have only one byte, and
1880 can use the fast lookup table. */
1881
1882 if (fc < 128)
1883 {
1884 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1885 }
1886
1887 /* Otherwise we must pick up the subject character */
1888
1889 else
1890 {
1891 int dc;
1892 GETCHARINC(dc, eptr);
1893 ecode += length;
1894
1895 /* If we have Unicode property support, we can use it to test the other
1896 case of the character, if there is one. */
1897
1898 if (fc != dc)
1899 {
1900 #ifdef SUPPORT_UCP
1901 if (dc != _pcre_ucp_othercase(fc))
1902 #endif
1903 RRETURN(MATCH_NOMATCH);
1904 }
1905 }
1906 }
1907 else
1908 #endif /* SUPPORT_UTF8 */
1909
1910 /* Non-UTF-8 mode */
1911 {
1912 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1913 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1914 ecode += 2;
1915 }
1916 break;
1917
1918 /* Match a single character repeatedly; different opcodes share code. */
1919
1920 case OP_EXACT:
1921 min = max = GET2(ecode, 1);
1922 ecode += 3;
1923 goto REPEATCHAR;
1924
1925 case OP_UPTO:
1926 case OP_MINUPTO:
1927 min = 0;
1928 max = GET2(ecode, 1);
1929 minimize = *ecode == OP_MINUPTO;
1930 ecode += 3;
1931 goto REPEATCHAR;
1932
1933 case OP_STAR:
1934 case OP_MINSTAR:
1935 case OP_PLUS:
1936 case OP_MINPLUS:
1937 case OP_QUERY:
1938 case OP_MINQUERY:
1939 c = *ecode++ - OP_STAR;
1940 minimize = (c & 1) != 0;
1941 min = rep_min[c]; /* Pick up values from tables; */
1942 max = rep_max[c]; /* zero for max => infinity */
1943 if (max == 0) max = INT_MAX;
1944
1945 /* Common code for all repeated single-character matches. We can give
1946 up quickly if there are fewer than the minimum number of characters left in
1947 the subject. */
1948
1949 REPEATCHAR:
1950 #ifdef SUPPORT_UTF8
1951 if (utf8)
1952 {
1953 length = 1;
1954 charptr = ecode;
1955 GETCHARLEN(fc, ecode, length);
1956 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1957 ecode += length;
1958
1959 /* Handle multibyte character matching specially here. There is
1960 support for caseless matching if UCP support is present. */
1961
1962 if (length > 1)
1963 {
1964 int oclength = 0;
1965 uschar occhars[8];
1966
1967 #ifdef SUPPORT_UCP
1968 int othercase;
1969 if ((ims & PCRE_CASELESS) != 0 &&
1970 (othercase = _pcre_ucp_othercase(fc)) >= 0 &&
1971 othercase >= 0)
1972 oclength = _pcre_ord2utf8(othercase, occhars);
1973 #endif /* SUPPORT_UCP */
1974
1975 for (i = 1; i <= min; i++)
1976 {
1977 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1978 /* Need braces because of following else */
1979 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1980 else
1981 {
1982 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1983 eptr += oclength;
1984 }
1985 }
1986
1987 if (min == max) continue;
1988
1989 if (minimize)
1990 {
1991 for (fi = min;; fi++)
1992 {
1993 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1994 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1995 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1996 if (memcmp(eptr, charptr, length) == 0) eptr += length;
1997 /* Need braces because of following else */
1998 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1999 else
2000 {
2001 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2002 eptr += oclength;
2003 }
2004 }
2005 /* Control never gets here */
2006 }
2007 else
2008 {
2009 pp = eptr;
2010 for (i = min; i < max; i++)
2011 {
2012 if (eptr > md->end_subject - length) break;
2013 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2014 else if (oclength == 0) break;
2015 else
2016 {
2017 if (memcmp(eptr, occhars, oclength) != 0) break;
2018 eptr += oclength;
2019 }
2020 }
2021 while (eptr >= pp)
2022 {
2023 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2024 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2025 eptr -= length;
2026 }
2027 RRETURN(MATCH_NOMATCH);
2028 }
2029 /* Control never gets here */
2030 }
2031
2032 /* If the length of a UTF-8 character is 1, we fall through here, and
2033 obey the code as for non-UTF-8 characters below, though in this case the
2034 value of fc will always be < 128. */
2035 }
2036 else
2037 #endif /* SUPPORT_UTF8 */
2038
2039 /* When not in UTF-8 mode, load a single-byte character. */
2040 {
2041 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2042 fc = *ecode++;
2043 }
2044
2045 /* The value of fc at this point is always less than 256, though we may or
2046 may not be in UTF-8 mode. The code is duplicated for the caseless and
2047 caseful cases, for speed, since matching characters is likely to be quite
2048 common. First, ensure the minimum number of matches are present. If min =
2049 max, continue at the same level without recursing. Otherwise, if
2050 minimizing, keep trying the rest of the expression and advancing one
2051 matching character if failing, up to the maximum. Alternatively, if
2052 maximizing, find the maximum number of characters and work backwards. */
2053
2054 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2055 max, eptr));
2056
2057 if ((ims & PCRE_CASELESS) != 0)
2058 {
2059 fc = md->lcc[fc];
2060 for (i = 1; i <= min; i++)
2061 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2062 if (min == max) continue;
2063 if (minimize)
2064 {
2065 for (fi = min;; fi++)
2066 {
2067 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2068 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2069 if (fi >= max || eptr >= md->end_subject ||
2070 fc != md->lcc[*eptr++])
2071 RRETURN(MATCH_NOMATCH);
2072 }
2073 /* Control never gets here */
2074 }
2075 else
2076 {
2077 pp = eptr;
2078 for (i = min; i < max; i++)
2079 {
2080 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2081 eptr++;
2082 }
2083 while (eptr >= pp)
2084 {
2085 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2086 eptr--;
2087 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2088 }
2089 RRETURN(MATCH_NOMATCH);
2090 }
2091 /* Control never gets here */
2092 }
2093
2094 /* Caseful comparisons (includes all multi-byte characters) */
2095
2096 else
2097 {
2098 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2099 if (min == max) continue;
2100 if (minimize)
2101 {
2102 for (fi = min;; fi++)
2103 {
2104 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2105 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2106 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2107 RRETURN(MATCH_NOMATCH);
2108 }
2109 /* Control never gets here */
2110 }
2111 else
2112 {
2113 pp = eptr;
2114 for (i = min; i < max; i++)
2115 {
2116 if (eptr >= md->end_subject || fc != *eptr) break;
2117 eptr++;
2118 }
2119 while (eptr >= pp)
2120 {
2121 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2122 eptr--;
2123 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2124 }
2125 RRETURN(MATCH_NOMATCH);
2126 }
2127 }
2128 /* Control never gets here */
2129
2130 /* Match a negated single one-byte character. The subject character we are
2131 checking against it can be multibyte. */
2132
2133 case OP_NOT:
2134 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2135 ecode++;
2136 GETCHARINCTEST(c, eptr);
2137 if ((ims & PCRE_CASELESS) != 0)
2138 {
2139 #ifdef SUPPORT_UTF8
2140 if (c < 256)
2141 #endif
2142 c = md->lcc[c];
2143 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2144 }
2145 else
2146 {
2147 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2148 }
2149 break;
2150
2151 /* Match a negated single one-byte character repeatedly. This is almost a
2152 repeat of the code for a repeated single character, but I haven't found a
2153 nice way of commoning these up that doesn't require a test of the
2154 positive/negative option for each character match. Maybe that wouldn't add
2155 very much to the time taken, but character matching *is* what this is all
2156 about... */
2157
2158 case OP_NOTEXACT:
2159 min = max = GET2(ecode, 1);
2160 ecode += 3;
2161 goto REPEATNOTCHAR;
2162
2163 case OP_NOTUPTO:
2164 case OP_NOTMINUPTO:
2165 min = 0;
2166 max = GET2(ecode, 1);
2167 minimize = *ecode == OP_NOTMINUPTO;
2168 ecode += 3;
2169 goto REPEATNOTCHAR;
2170
2171 case OP_NOTSTAR:
2172 case OP_NOTMINSTAR:
2173 case OP_NOTPLUS:
2174 case OP_NOTMINPLUS:
2175 case OP_NOTQUERY:
2176 case OP_NOTMINQUERY:
2177 c = *ecode++ - OP_NOTSTAR;
2178 minimize = (c & 1) != 0;
2179 min = rep_min[c]; /* Pick up values from tables; */
2180 max = rep_max[c]; /* zero for max => infinity */
2181 if (max == 0) max = INT_MAX;
2182
2183 /* Common code for all repeated negated single-byte matches. We can give up
2184 quickly if there are fewer than the minimum number of bytes left in the
2185 subject. */
2186
2187 REPEATNOTCHAR:
2188 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2189 fc = *ecode++;
2190
2191 /* The code is duplicated for the caseless and caseful cases, for speed,
2192 since matching characters is likely to be quite common. First, ensure the
2193 minimum number of matches are present. If min = max, continue at the same
2194 level without recursing. Otherwise, if minimizing, keep trying the rest of
2195 the expression and advancing one matching character if failing, up to the
2196 maximum. Alternatively, if maximizing, find the maximum number of
2197 characters and work backwards. */
2198
2199 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2200 max, eptr));
2201
2202 if ((ims & PCRE_CASELESS) != 0)
2203 {
2204 fc = md->lcc[fc];
2205
2206 #ifdef SUPPORT_UTF8
2207 /* UTF-8 mode */
2208 if (utf8)
2209 {
2210 register int d;
2211 for (i = 1; i <= min; i++)
2212 {
2213 GETCHARINC(d, eptr);
2214 if (d < 256) d = md->lcc[d];
2215 if (fc == d) RRETURN(MATCH_NOMATCH);
2216 }
2217 }
2218 else
2219 #endif
2220
2221 /* Not UTF-8 mode */
2222 {
2223 for (i = 1; i <= min; i++)
2224 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2225 }
2226
2227 if (min == max) continue;
2228
2229 if (minimize)
2230 {
2231 #ifdef SUPPORT_UTF8
2232 /* UTF-8 mode */
2233 if (utf8)
2234 {
2235 register int d;
2236 for (fi = min;; fi++)
2237 {
2238 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2239 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2240 GETCHARINC(d, eptr);
2241 if (d < 256) d = md->lcc[d];
2242 if (fi >= max || eptr >= md->end_subject || fc == d)
2243 RRETURN(MATCH_NOMATCH);
2244 }
2245 }
2246 else
2247 #endif
2248 /* Not UTF-8 mode */
2249 {
2250 for (fi = min;; fi++)
2251 {
2252 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2253 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2254 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2255 RRETURN(MATCH_NOMATCH);
2256 }
2257 }
2258 /* Control never gets here */
2259 }
2260
2261 /* Maximize case */
2262
2263 else
2264 {
2265 pp = eptr;
2266
2267 #ifdef SUPPORT_UTF8
2268 /* UTF-8 mode */
2269 if (utf8)
2270 {
2271 register int d;
2272 for (i = min; i < max; i++)
2273 {
2274 int len = 1;
2275 if (eptr >= md->end_subject) break;
2276 GETCHARLEN(d, eptr, len);
2277 if (d < 256) d = md->lcc[d];
2278 if (fc == d) break;
2279 eptr += len;
2280 }
2281 for(;;)
2282 {
2283 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2284 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2285 if (eptr-- == pp) break; /* Stop if tried at original pos */
2286 BACKCHAR(eptr);
2287 }
2288 }
2289 else
2290 #endif
2291 /* Not UTF-8 mode */
2292 {
2293 for (i = min; i < max; i++)
2294 {
2295 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2296 eptr++;
2297 }
2298 while (eptr >= pp)
2299 {
2300 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2301 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2302 eptr--;
2303 }
2304 }
2305
2306 RRETURN(MATCH_NOMATCH);
2307 }
2308 /* Control never gets here */
2309 }
2310
2311 /* Caseful comparisons */
2312
2313 else
2314 {
2315 #ifdef SUPPORT_UTF8
2316 /* UTF-8 mode */
2317 if (utf8)
2318 {
2319 register int d;
2320 for (i = 1; i <= min; i++)
2321 {
2322 GETCHARINC(d, eptr);
2323 if (fc == d) RRETURN(MATCH_NOMATCH);
2324 }
2325 }
2326 else
2327 #endif
2328 /* Not UTF-8 mode */
2329 {
2330 for (i = 1; i <= min; i++)
2331 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2332 }
2333
2334 if (min == max) continue;
2335
2336 if (minimize)
2337 {
2338 #ifdef SUPPORT_UTF8
2339 /* UTF-8 mode */
2340 if (utf8)
2341 {
2342 register int d;
2343 for (fi = min;; fi++)
2344 {
2345 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2346 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2347 GETCHARINC(d, eptr);
2348 if (fi >= max || eptr >= md->end_subject || fc == d)
2349 RRETURN(MATCH_NOMATCH);
2350 }
2351 }
2352 else
2353 #endif
2354 /* Not UTF-8 mode */
2355 {
2356 for (fi = min;; fi++)
2357 {
2358 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2360 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2361 RRETURN(MATCH_NOMATCH);
2362 }
2363 }
2364 /* Control never gets here */
2365 }
2366
2367 /* Maximize case */
2368
2369 else
2370 {
2371 pp = eptr;
2372
2373 #ifdef SUPPORT_UTF8
2374 /* UTF-8 mode */
2375 if (utf8)
2376 {
2377 register int d;
2378 for (i = min; i < max; i++)
2379 {
2380 int len = 1;
2381 if (eptr >= md->end_subject) break;
2382 GETCHARLEN(d, eptr, len);
2383 if (fc == d) break;
2384 eptr += len;
2385 }
2386 for(;;)
2387 {
2388 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2389 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2390 if (eptr-- == pp) break; /* Stop if tried at original pos */
2391 BACKCHAR(eptr);
2392 }
2393 }
2394 else
2395 #endif
2396 /* Not UTF-8 mode */
2397 {
2398 for (i = min; i < max; i++)
2399 {
2400 if (eptr >= md->end_subject || fc == *eptr) break;
2401 eptr++;
2402 }
2403 while (eptr >= pp)
2404 {
2405 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2406 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2407 eptr--;
2408 }
2409 }
2410
2411 RRETURN(MATCH_NOMATCH);
2412 }
2413 }
2414 /* Control never gets here */
2415
2416 /* Match a single character type repeatedly; several different opcodes
2417 share code. This is very similar to the code for single characters, but we
2418 repeat it in the interests of efficiency. */
2419
2420 case OP_TYPEEXACT:
2421 min = max = GET2(ecode, 1);
2422 minimize = TRUE;
2423 ecode += 3;
2424 goto REPEATTYPE;
2425
2426 case OP_TYPEUPTO:
2427 case OP_TYPEMINUPTO:
2428 min = 0;
2429 max = GET2(ecode, 1);
2430 minimize = *ecode == OP_TYPEMINUPTO;
2431 ecode += 3;
2432 goto REPEATTYPE;
2433
2434 case OP_TYPESTAR:
2435 case OP_TYPEMINSTAR:
2436 case OP_TYPEPLUS:
2437 case OP_TYPEMINPLUS:
2438 case OP_TYPEQUERY:
2439 case OP_TYPEMINQUERY:
2440 c = *ecode++ - OP_TYPESTAR;
2441 minimize = (c & 1) != 0;
2442 min = rep_min[c]; /* Pick up values from tables; */
2443 max = rep_max[c]; /* zero for max => infinity */
2444 if (max == 0) max = INT_MAX;
2445
2446 /* Common code for all repeated single character type matches. Note that
2447 in UTF-8 mode, '.' matches a character of any length, but for the other
2448 character types, the valid characters are all one-byte long. */
2449
2450 REPEATTYPE:
2451 ctype = *ecode++; /* Code for the character type */
2452
2453 #ifdef SUPPORT_UCP
2454 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2455 {
2456 prop_fail_result = ctype == OP_NOTPROP;
2457 prop_type = *ecode++;
2458 prop_value = *ecode++;
2459 }
2460 else prop_type = -1;
2461 #endif
2462
2463 /* First, ensure the minimum number of matches are present. Use inline
2464 code for maximizing the speed, and do the type test once at the start
2465 (i.e. keep it out of the loop). Also we can test that there are at least
2466 the minimum number of bytes before we start. This isn't as effective in
2467 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2468 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2469 and single-bytes. */
2470
2471 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2472 if (min > 0)
2473 {
2474 #ifdef SUPPORT_UCP
2475 if (prop_type >= 0)
2476 {
2477 switch(prop_type)
2478 {
2479 case PT_ANY:
2480 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2481 for (i = 1; i <= min; i++)
2482 {
2483 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2484 GETCHARINC(c, eptr);
2485 }
2486 break;
2487
2488 case PT_LAMP:
2489 for (i = 1; i <= min; i++)
2490 {
2491 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2492 GETCHARINC(c, eptr);
2493 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2494 if ((prop_chartype == ucp_Lu ||
2495 prop_chartype == ucp_Ll ||
2496 prop_chartype == ucp_Lt) == prop_fail_result)
2497 RRETURN(MATCH_NOMATCH);
2498 }
2499 break;
2500
2501 case PT_GC:
2502 for (i = 1; i <= min; i++)
2503 {
2504 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2505 GETCHARINC(c, eptr);
2506 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2507 if ((prop_category == prop_value) == prop_fail_result)
2508 RRETURN(MATCH_NOMATCH);
2509 }
2510 break;
2511
2512 case PT_PC:
2513 for (i = 1; i <= min; i++)
2514 {
2515 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2516 GETCHARINC(c, eptr);
2517 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2518 if ((prop_chartype == prop_value) == prop_fail_result)
2519 RRETURN(MATCH_NOMATCH);
2520 }
2521 break;
2522
2523 case PT_SC:
2524 for (i = 1; i <= min; i++)
2525 {
2526 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2527 GETCHARINC(c, eptr);
2528 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2529 if ((prop_script == prop_value) == prop_fail_result)
2530 RRETURN(MATCH_NOMATCH);
2531 }
2532 break;
2533
2534 default:
2535 RRETURN(PCRE_ERROR_INTERNAL);
2536 break;
2537 }
2538 }
2539
2540 /* Match extended Unicode sequences. We will get here only if the
2541 support is in the binary; otherwise a compile-time error occurs. */
2542
2543 else if (ctype == OP_EXTUNI)
2544 {
2545 for (i = 1; i <= min; i++)
2546 {
2547 GETCHARINCTEST(c, eptr);
2548 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2549 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2550 while (eptr < md->end_subject)
2551 {
2552 int len = 1;
2553 if (!utf8) c = *eptr; else
2554 {
2555 GETCHARLEN(c, eptr, len);
2556 }
2557 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2558 if (prop_category != ucp_M) break;
2559 eptr += len;
2560 }
2561 }
2562 }
2563
2564 else
2565 #endif /* SUPPORT_UCP */
2566
2567 /* Handle all other cases when the coding is UTF-8 */
2568
2569 #ifdef SUPPORT_UTF8
2570 if (utf8) switch(ctype)
2571 {
2572 case OP_ANY:
2573 for (i = 1; i <= min; i++)
2574 {
2575 if (eptr >= md->end_subject ||
2576 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
2577 RRETURN(MATCH_NOMATCH);
2578 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2579 }
2580 break;
2581
2582 case OP_ANYBYTE:
2583 eptr += min;
2584 break;
2585
2586 case OP_NOT_DIGIT:
2587 for (i = 1; i <= min; i++)
2588 {
2589 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2590 GETCHARINC(c, eptr);
2591 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2592 RRETURN(MATCH_NOMATCH);
2593 }
2594 break;
2595
2596 case OP_DIGIT:
2597 for (i = 1; i <= min; i++)
2598 {
2599 if (eptr >= md->end_subject ||
2600 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2601 RRETURN(MATCH_NOMATCH);
2602 /* No need to skip more bytes - we know it's a 1-byte character */
2603 }
2604 break;
2605
2606 case OP_NOT_WHITESPACE:
2607 for (i = 1; i <= min; i++)
2608 {
2609 if (eptr >= md->end_subject ||
2610 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2611 RRETURN(MATCH_NOMATCH);
2612 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2613 }
2614 break;
2615
2616 case OP_WHITESPACE:
2617 for (i = 1; i <= min; i++)
2618 {
2619 if (eptr >= md->end_subject ||
2620 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2621 RRETURN(MATCH_NOMATCH);
2622 /* No need to skip more bytes - we know it's a 1-byte character */
2623 }
2624 break;
2625
2626 case OP_NOT_WORDCHAR:
2627 for (i = 1; i <= min; i++)
2628 {
2629 if (eptr >= md->end_subject ||
2630 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2631 RRETURN(MATCH_NOMATCH);
2632 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2633 }
2634 break;
2635
2636 case OP_WORDCHAR:
2637 for (i = 1; i <= min; i++)
2638 {
2639 if (eptr >= md->end_subject ||
2640 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2641 RRETURN(MATCH_NOMATCH);
2642 /* No need to skip more bytes - we know it's a 1-byte character */
2643 }
2644 break;
2645
2646 default:
2647 RRETURN(PCRE_ERROR_INTERNAL);
2648 } /* End switch(ctype) */
2649
2650 else
2651 #endif /* SUPPORT_UTF8 */
2652
2653 /* Code for the non-UTF-8 case for minimum matching of operators other
2654 than OP_PROP and OP_NOTPROP. */
2655
2656 switch(ctype)
2657 {
2658 case OP_ANY:
2659 if ((ims & PCRE_DOTALL) == 0)
2660 {
2661 for (i = 1; i <= min; i++)
2662 if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
2663 }
2664 else eptr += min;
2665 break;
2666
2667 case OP_ANYBYTE:
2668 eptr += min;
2669 break;
2670
2671 case OP_NOT_DIGIT:
2672 for (i = 1; i <= min; i++)
2673 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2674 break;
2675
2676 case OP_DIGIT:
2677 for (i = 1; i <= min; i++)
2678 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2679 break;
2680
2681 case OP_NOT_WHITESPACE:
2682 for (i = 1; i <= min; i++)
2683 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2684 break;
2685
2686 case OP_WHITESPACE:
2687 for (i = 1; i <= min; i++)
2688 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2689 break;
2690
2691 case OP_NOT_WORDCHAR:
2692 for (i = 1; i <= min; i++)
2693 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2694 RRETURN(MATCH_NOMATCH);
2695 break;
2696
2697 case OP_WORDCHAR:
2698 for (i = 1; i <= min; i++)
2699 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2700 RRETURN(MATCH_NOMATCH);
2701 break;
2702
2703 default:
2704 RRETURN(PCRE_ERROR_INTERNAL);
2705 }
2706 }
2707
2708 /* If min = max, continue at the same level without recursing */
2709
2710 if (min == max) continue;
2711
2712 /* If minimizing, we have to test the rest of the pattern before each
2713 subsequent match. Again, separate the UTF-8 case for speed, and also
2714 separate the UCP cases. */
2715
2716 if (minimize)
2717 {
2718 #ifdef SUPPORT_UCP
2719 if (prop_type >= 0)
2720 {
2721 switch(prop_type)
2722 {
2723 case PT_ANY:
2724 for (fi = min;; fi++)
2725 {
2726 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2727 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2728 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2729 GETCHARINC(c, eptr);
2730 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2731 }
2732 break;
2733
2734 case PT_LAMP:
2735 for (fi = min;; fi++)
2736 {
2737 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2738 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2739 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2740 GETCHARINC(c, eptr);
2741 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2742 if ((prop_chartype == ucp_Lu ||
2743 prop_chartype == ucp_Ll ||
2744 prop_chartype == ucp_Lt) == prop_fail_result)
2745 RRETURN(MATCH_NOMATCH);
2746 }
2747 break;
2748
2749 case PT_GC:
2750 for (fi = min;; fi++)
2751 {
2752 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2753 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2754 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2755 GETCHARINC(c, eptr);
2756 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2757 if ((prop_category == prop_value) == prop_fail_result)
2758 RRETURN(MATCH_NOMATCH);
2759 }
2760 break;
2761
2762 case PT_PC:
2763 for (fi = min;; fi++)
2764 {
2765 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2766 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2767 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2768 GETCHARINC(c, eptr);
2769 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2770 if ((prop_chartype == prop_value) == prop_fail_result)
2771 RRETURN(MATCH_NOMATCH);
2772 }
2773 break;
2774
2775 case PT_SC:
2776 for (fi = min;; fi++)
2777 {
2778 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2779 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2780 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2781 GETCHARINC(c, eptr);
2782 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2783 if ((prop_script == prop_value) == prop_fail_result)
2784 RRETURN(MATCH_NOMATCH);
2785 }
2786 break;
2787
2788 default:
2789 RRETURN(PCRE_ERROR_INTERNAL);
2790 break;
2791 }
2792 }
2793
2794 /* Match extended Unicode sequences. We will get here only if the
2795 support is in the binary; otherwise a compile-time error occurs. */
2796
2797 else if (ctype == OP_EXTUNI)
2798 {
2799 for (fi = min;; fi++)
2800 {
2801 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2802 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2803 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2804 GETCHARINCTEST(c, eptr);
2805 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2806 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2807 while (eptr < md->end_subject)
2808 {
2809 int len = 1;
2810 if (!utf8) c = *eptr; else
2811 {
2812 GETCHARLEN(c, eptr, len);
2813 }
2814 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2815 if (prop_category != ucp_M) break;
2816 eptr += len;
2817 }
2818 }
2819 }
2820
2821 else
2822 #endif /* SUPPORT_UCP */
2823
2824 #ifdef SUPPORT_UTF8
2825 /* UTF-8 mode */
2826 if (utf8)
2827 {
2828 for (fi = min;; fi++)
2829 {
2830 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2831 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2832 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2833
2834 GETCHARINC(c, eptr);
2835 switch(ctype)
2836 {
2837 case OP_ANY:
2838 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2839 break;
2840
2841 case OP_ANYBYTE:
2842 break;
2843
2844 case OP_NOT_DIGIT:
2845 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2846 RRETURN(MATCH_NOMATCH);
2847 break;
2848
2849 case OP_DIGIT:
2850 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2851 RRETURN(MATCH_NOMATCH);
2852 break;
2853
2854 case OP_NOT_WHITESPACE:
2855 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2856 RRETURN(MATCH_NOMATCH);
2857 break;
2858
2859 case OP_WHITESPACE:
2860 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2861 RRETURN(MATCH_NOMATCH);
2862 break;
2863
2864 case OP_NOT_WORDCHAR:
2865 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2866 RRETURN(MATCH_NOMATCH);
2867 break;
2868
2869 case OP_WORDCHAR:
2870 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2871 RRETURN(MATCH_NOMATCH);
2872 break;
2873
2874 default:
2875 RRETURN(PCRE_ERROR_INTERNAL);
2876 }
2877 }
2878 }
2879 else
2880 #endif
2881 /* Not UTF-8 mode */
2882 {
2883 for (fi = min;; fi++)
2884 {
2885 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2886 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2887 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2888 c = *eptr++;
2889 switch(ctype)
2890 {
2891 case OP_ANY:
2892 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2893 break;
2894
2895 case OP_ANYBYTE:
2896 break;
2897
2898 case OP_NOT_DIGIT:
2899 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2900 break;
2901
2902 case OP_DIGIT:
2903 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2904 break;
2905
2906 case OP_NOT_WHITESPACE:
2907 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2908 break;
2909
2910 case OP_WHITESPACE:
2911 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2912 break;
2913
2914 case OP_NOT_WORDCHAR:
2915 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2916 break;
2917
2918 case OP_WORDCHAR:
2919 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2920 break;
2921
2922 default:
2923 RRETURN(PCRE_ERROR_INTERNAL);
2924 }
2925 }
2926 }
2927 /* Control never gets here */
2928 }
2929
2930 /* If maximizing it is worth using inline code for speed, doing the type
2931 test once at the start (i.e. keep it out of the loop). Again, keep the
2932 UTF-8 and UCP stuff separate. */
2933
2934 else
2935 {
2936 pp = eptr; /* Remember where we started */
2937
2938 #ifdef SUPPORT_UCP
2939 if (prop_type >= 0)
2940 {
2941 switch(prop_type)
2942 {
2943 case PT_ANY:
2944 for (i = min; i < max; i++)
2945 {
2946 int len = 1;
2947 if (eptr >= md->end_subject) break;
2948 GETCHARLEN(c, eptr, len);
2949 if (prop_fail_result) break;
2950 eptr+= len;
2951 }
2952 break;
2953
2954 case PT_LAMP:
2955 for (i = min; i < max; i++)
2956 {
2957 int len = 1;
2958 if (eptr >= md->end_subject) break;
2959 GETCHARLEN(c, eptr, len);
2960 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2961 if ((prop_chartype == ucp_Lu ||
2962 prop_chartype == ucp_Ll ||
2963 prop_chartype == ucp_Lt) == prop_fail_result)
2964 break;
2965 eptr+= len;
2966 }
2967 break;
2968
2969 case PT_GC:
2970 for (i = min; i < max; i++)
2971 {
2972 int len = 1;
2973 if (eptr >= md->end_subject) break;
2974 GETCHARLEN(c, eptr, len);
2975 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2976 if ((prop_category == prop_value) == prop_fail_result)
2977 break;
2978 eptr+= len;
2979 }
2980 break;
2981
2982 case PT_PC:
2983 for (i = min; i < max; i++)
2984 {
2985 int len = 1;
2986 if (eptr >= md->end_subject) break;
2987 GETCHARLEN(c, eptr, len);
2988 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2989 if ((prop_chartype == prop_value) == prop_fail_result)
2990 break;
2991 eptr+= len;
2992 }
2993 break;
2994
2995 case PT_SC:
2996 for (i = min; i < max; i++)
2997 {
2998 int len = 1;
2999 if (eptr >= md->end_subject) break;
3000 GETCHARLEN(c, eptr, len);
3001 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3002 if ((prop_script == prop_value) == prop_fail_result)
3003 break;
3004 eptr+= len;
3005 }
3006 break;
3007 }
3008
3009 /* eptr is now past the end of the maximum run */
3010
3011 for(;;)
3012 {
3013 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3014 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3015 if (eptr-- == pp) break; /* Stop if tried at original pos */
3016 BACKCHAR(eptr);
3017 }
3018 }
3019
3020 /* Match extended Unicode sequences. We will get here only if the
3021 support is in the binary; otherwise a compile-time error occurs. */
3022
3023 else if (ctype == OP_EXTUNI)
3024 {
3025 for (i = min; i < max; i++)
3026 {
3027 if (eptr >= md->end_subject) break;
3028 GETCHARINCTEST(c, eptr);
3029 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3030 if (prop_category == ucp_M) break;
3031 while (eptr < md->end_subject)
3032 {
3033 int len = 1;
3034 if (!utf8) c = *eptr; else
3035 {
3036 GETCHARLEN(c, eptr, len);
3037 }
3038 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3039 if (prop_category != ucp_M) break;
3040 eptr += len;
3041 }
3042 }
3043
3044 /* eptr is now past the end of the maximum run */
3045
3046 for(;;)
3047 {
3048 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3049 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3050 if (eptr-- == pp) break; /* Stop if tried at original pos */
3051 for (;;) /* Move back over one extended */
3052 {
3053 int len = 1;
3054 BACKCHAR(eptr);
3055 if (!utf8) c = *eptr; else
3056 {
3057 GETCHARLEN(c, eptr, len);
3058 }
3059 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3060 if (prop_category != ucp_M) break;
3061 eptr--;
3062 }
3063 }
3064 }
3065
3066 else
3067 #endif /* SUPPORT_UCP */
3068
3069 #ifdef SUPPORT_UTF8
3070 /* UTF-8 mode */
3071
3072 if (utf8)
3073 {
3074 switch(ctype)
3075 {
3076 case OP_ANY:
3077
3078 /* Special code is required for UTF8, but when the maximum is unlimited
3079 we don't need it, so we repeat the non-UTF8 code. This is probably
3080 worth it, because .* is quite a common idiom. */
3081
3082 if (max < INT_MAX)
3083 {
3084 if ((ims & PCRE_DOTALL) == 0)
3085 {
3086 for (i = min; i < max; i++)
3087 {
3088 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3089 eptr++;
3090 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3091 }
3092 }
3093 else
3094 {
3095 for (i = min; i < max; i++)
3096 {
3097 eptr++;
3098 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3099 }
3100 }
3101 }
3102
3103 /* Handle unlimited UTF-8 repeat */
3104
3105 else
3106 {
3107 if ((ims & PCRE_DOTALL) == 0)
3108 {
3109 for (i = min; i < max; i++)
3110 {
3111 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3112 eptr++;
3113 }
3114 break;
3115 }
3116 else
3117 {
3118 c = max - min;
3119 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3120 eptr += c;
3121 }
3122 }
3123 break;
3124
3125 /* The byte case is the same as non-UTF8 */
3126
3127 case OP_ANYBYTE:
3128 c = max - min;
3129 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3130 eptr += c;
3131 break;
3132
3133 case OP_NOT_DIGIT:
3134 for (i = min; i < max; i++)
3135 {
3136 int len = 1;
3137 if (eptr >= md->end_subject) break;
3138 GETCHARLEN(c, eptr, len);
3139 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3140 eptr+= len;
3141 }
3142 break;
3143
3144 case OP_DIGIT:
3145 for (i = min; i < max; i++)
3146 {
3147 int len = 1;
3148 if (eptr >= md->end_subject) break;
3149 GETCHARLEN(c, eptr, len);
3150 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3151 eptr+= len;
3152 }
3153 break;
3154
3155 case OP_NOT_WHITESPACE:
3156 for (i = min; i < max; i++)
3157 {
3158 int len = 1;
3159 if (eptr >= md->end_subject) break;
3160 GETCHARLEN(c, eptr, len);
3161 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3162 eptr+= len;
3163 }
3164 break;
3165
3166 case OP_WHITESPACE:
3167 for (i = min; i < max; i++)
3168 {
3169 int len = 1;
3170 if (eptr >= md->end_subject) break;
3171 GETCHARLEN(c, eptr, len);
3172 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3173 eptr+= len;
3174 }
3175 break;
3176
3177 case OP_NOT_WORDCHAR:
3178 for (i = min; i < max; i++)
3179 {
3180 int len = 1;
3181 if (eptr >= md->end_subject) break;
3182 GETCHARLEN(c, eptr, len);
3183 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3184 eptr+= len;
3185 }
3186 break;
3187
3188 case OP_WORDCHAR:
3189 for (i = min; i < max; i++)
3190 {
3191 int len = 1;
3192 if (eptr >= md->end_subject) break;
3193 GETCHARLEN(c, eptr, len);
3194 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3195 eptr+= len;
3196 }
3197 break;
3198
3199 default:
3200 RRETURN(PCRE_ERROR_INTERNAL);
3201 }
3202
3203 /* eptr is now past the end of the maximum run */
3204
3205 for(;;)
3206 {
3207 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3208 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3209 if (eptr-- == pp) break; /* Stop if tried at original pos */
3210 BACKCHAR(eptr);
3211 }
3212 }
3213 else
3214 #endif
3215
3216 /* Not UTF-8 mode */
3217 {
3218 switch(ctype)
3219 {
3220 case OP_ANY:
3221 if ((ims & PCRE_DOTALL) == 0)
3222 {
3223 for (i = min; i < max; i++)
3224 {
3225 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3226 eptr++;
3227 }
3228 break;
3229 }
3230 /* For DOTALL case, fall through and treat as \C */
3231
3232 case OP_ANYBYTE:
3233 c = max - min;
3234 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3235 eptr += c;
3236 break;
3237
3238 case OP_NOT_DIGIT:
3239 for (i = min; i < max; i++)
3240 {
3241 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3242 break;
3243 eptr++;
3244 }
3245 break;
3246
3247 case OP_DIGIT:
3248 for (i = min; i < max; i++)
3249 {
3250 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3251 break;
3252 eptr++;
3253 }
3254 break;
3255
3256 case OP_NOT_WHITESPACE:
3257 for (i = min; i < max; i++)
3258 {
3259 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3260 break;
3261 eptr++;
3262 }
3263 break;
3264
3265 case OP_WHITESPACE:
3266 for (i = min; i < max; i++)
3267 {
3268 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3269 break;
3270 eptr++;
3271 }
3272 break;
3273
3274 case OP_NOT_WORDCHAR:
3275 for (i = min; i < max; i++)
3276 {
3277 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3278 break;
3279 eptr++;
3280 }
3281 break;
3282
3283 case OP_WORDCHAR:
3284 for (i = min; i < max; i++)
3285 {
3286 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3287 break;
3288 eptr++;
3289 }
3290 break;
3291
3292 default:
3293 RRETURN(PCRE_ERROR_INTERNAL);
3294 }
3295
3296 /* eptr is now past the end of the maximum run */
3297
3298 while (eptr >= pp)
3299 {
3300 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3301 eptr--;
3302 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3303 }
3304 }
3305
3306 /* Get here if we can't make it match with any permitted repetitions */
3307
3308 RRETURN(MATCH_NOMATCH);
3309 }
3310 /* Control never gets here */
3311
3312 /* There's been some horrible disaster. Since all codes > OP_BRA are
3313 for capturing brackets, and there shouldn't be any gaps between 0 and
3314 OP_BRA, arrival here can only mean there is something seriously wrong
3315 in the code above or the OP_xxx definitions. */
3316
3317 default:
3318 DPRINTF(("Unknown opcode %d\n", *ecode));
3319 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3320 }
3321
3322 /* Do not stick any code in here without much thought; it is assumed
3323 that "continue" in the code above comes out to here to repeat the main
3324 loop. */
3325
3326 } /* End of main loop */
3327 /* Control never reaches here */
3328 }
3329
3330
3331 /***************************************************************************
3332 ****************************************************************************
3333 RECURSION IN THE match() FUNCTION
3334
3335 Undefine all the macros that were defined above to handle this. */
3336
/* Clean-up of the names that were turned into macros for the benefit of
match(). Once they are gone, the rest of this file is free to use the same
spellings as ordinary identifiers - pcre_exec() below, for instance, has
variables called ims, flags, length and pp. The list is kept in alphabetical
order; the order of the #undef lines has no effect on the result. */

#ifdef NO_RECURSE

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef minimize
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* fc and fi are macros whether or not NO_RECURSE is set, so they are removed
unconditionally. */

#undef fc
#undef fi
3383
3384 /***************************************************************************
3385 ***************************************************************************/
3386
3387
3388
3389 /*************************************************
3390 * Execute a Regular Expression *
3391 *************************************************/
3392
3393 /* This function applies a compiled re to a subject string and picks out
3394 portions of the string if it matches. Two elements in the vector are set for
3395 each substring: the offsets to the start and end of the substring.
3396
3397 Arguments:
3398 argument_re points to the compiled expression
3399 extra_data points to extra data or is NULL
3400 subject points to the subject string
3401 length length of subject string (may contain binary zeros)
3402 start_offset where to start in the subject string
3403 options option bits
3404 offsets points to a vector of ints to be filled in with offsets
3405 offsetcount the number of elements in the vector
3406
3407 Returns: > 0 => success; value is the number of elements filled in
3408 = 0 => success, but offsets is not big enough
3409 -1 => failed to match
3410 < -1 => some kind of unexpected problem
3411 */
3412
3413 PCRE_DATA_SCOPE int
3414 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3415 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3416 int offsetcount)
3417 {
3418 int rc, resetcount, ocount;
3419 int first_byte = -1;
3420 int req_byte = -1;
3421 int req_byte2 = -1;
3422 unsigned long int ims = 0;
3423 BOOL using_temporary_offsets = FALSE;
3424 BOOL anchored;
3425 BOOL startline;
3426 BOOL firstline;
3427 BOOL first_byte_caseless = FALSE;
3428 BOOL req_byte_caseless = FALSE;
3429 match_data match_block;
3430 const uschar *tables;
3431 const uschar *start_bits = NULL;
3432 USPTR start_match = (USPTR)subject + start_offset;
3433 USPTR end_subject;
3434 USPTR req_byte_ptr = start_match - 1;
3435
3436 pcre_study_data internal_study;
3437 const pcre_study_data *study;
3438
3439 real_pcre internal_re;
3440 const real_pcre *external_re = (const real_pcre *)argument_re;
3441 const real_pcre *re = external_re;
3442
3443 /* Plausibility checks */
3444
3445 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3446 if (re == NULL || subject == NULL ||
3447 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3448 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3449
3450 /* Fish out the optional data from the extra_data structure, first setting
3451 the default values. */
3452
3453 study = NULL;
3454 match_block.match_limit = MATCH_LIMIT;
3455 match_block.match_limit_recursion = MATCH_LIMIT_RECURSION;
3456 match_block.callout_data = NULL;
3457
3458 /* The table pointer is always in native byte order. */
3459
3460 tables = external_re->tables;
3461
3462 if (extra_data != NULL)
3463 {
3464 register unsigned int flags = extra_data->flags;
3465 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3466 study = (const pcre_study_data *)extra_data->study_data;
3467 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3468 match_block.match_limit = extra_data->match_limit;
3469 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3470 match_block.match_limit_recursion = extra_data->match_limit_recursion;
3471 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3472 match_block.callout_data = extra_data->callout_data;
3473 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3474 }
3475
3476 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3477 is a feature that makes it possible to save compiled regex and re-use them
3478 in other programs later. */
3479
3480 if (tables == NULL) tables = _pcre_default_tables;
3481
3482 /* Check that the first field in the block is the magic number. If it is not,
3483 test for a regex that was compiled on a host of opposite endianness. If this is
3484 the case, flipped values are put in internal_re and internal_study if there was
3485 study data too. */
3486
3487 if (re->magic_number != MAGIC_NUMBER)
3488 {
3489 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3490 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3491 if (study != NULL) study = &internal_study;
3492 }
3493
3494 /* Set up other data */
3495
3496 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3497 startline = (re->options & PCRE_STARTLINE) != 0;
3498 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3499
3500 /* The code starts after the real_pcre block and the capture name table. */
3501
3502 match_block.start_code = (const uschar *)external_re + re->name_table_offset +
3503 re->name_count * re->name_entry_size;
3504
3505 match_block.start_subject = (USPTR)subject;
3506 match_block.start_offset = start_offset;
3507 match_block.end_subject = match_block.start_subject + length;
3508 end_subject = match_block.end_subject;
3509
3510 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3511 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
3512
3513 match_block.notbol = (options & PCRE_NOTBOL) != 0;
3514 match_block.noteol = (options & PCRE_NOTEOL) != 0;
3515 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
3516 match_block.partial = (options & PCRE_PARTIAL) != 0;
3517 match_block.hitend = FALSE;
3518
3519 match_block.recursive = NULL; /* No recursion at top level */
3520
3521 match_block.lcc = tables + lcc_offset;
3522 match_block.ctypes = tables + ctypes_offset;
3523
3524 /* Partial matching is supported only for a restricted set of regexes at the
3525 moment. */
3526
3527 if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
3528 return PCRE_ERROR_BADPARTIAL;
3529
3530 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3531 back the character offset. */
3532
3533 #ifdef SUPPORT_UTF8
3534 if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3535 {
3536 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3537 return PCRE_ERROR_BADUTF8;
3538 if (start_offset > 0 && start_offset < length)
3539 {
3540 int tb = ((uschar *)subject)[start_offset];
3541 if (tb > 127)
3542 {
3543 tb &= 0xc0;
3544 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3545 }
3546 }
3547 }
3548 #endif
3549
3550 /* The ims options can vary during the matching as a result of the presence
3551 of (?ims) items in the pattern. They are kept in a local variable so that
3552 restoring at the exit of a group is easy. */
3553
3554 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3555
3556 /* If the expression has got more back references than the offsets supplied can
3557 hold, we get a temporary chunk of working store to use during the matching.
3558 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3559 of 3. */
3560
3561 ocount = offsetcount - (offsetcount % 3);
3562
3563 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3564 {
3565 ocount = re->top_backref * 3 + 3;
3566 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3567 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3568 using_temporary_offsets = TRUE;
3569 DPRINTF(("Got memory to hold back references\n"));
3570 }
3571 else match_block.offset_vector = offsets;
3572
3573 match_block.offset_end = ocount;
3574 match_block.offset_max = (2*ocount)/3;
3575 match_block.offset_overflow = FALSE;
3576 match_block.capture_last = -1;
3577
3578 /* Compute the minimum number of offsets that we need to reset each time. Doing
3579 this makes a huge difference to execution time when there aren't many brackets
3580 in the pattern. */
3581
3582 resetcount = 2 + re->top_bracket * 2;
3583 if (resetcount > offsetcount) resetcount = ocount;
3584
3585 /* Reset the working variable associated with each extraction. These should
3586 never be used unless previously set, but they get saved and restored, and so we
3587 initialize them to avoid reading uninitialized locations. */
3588
3589 if (match_block.offset_vector != NULL)
3590 {
3591 register int *iptr = match_block.offset_vector + ocount;
3592 register int *iend = iptr - resetcount/2 + 1;
3593 while (--iptr >= iend) *iptr = -1;
3594 }
3595
3596 /* Set up the first character to match, if available. The first_byte value is
3597 never set for an anchored regular expression, but the anchoring may be forced
3598 at run time, so we have to test for anchoring. The first char may be unset for
3599 an unanchored pattern, of course. If there's no first char and the pattern was
3600 studied, there may be a bitmap of possible first characters. */
3601
3602 if (!anchored)
3603 {
3604 if ((re->options & PCRE_FIRSTSET) != 0)
3605 {
3606 first_byte = re->first_byte & 255;
3607 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3608 first_byte = match_block.lcc[first_byte];
3609 }
3610 else
3611 if (!startline && study != NULL &&
3612 (study->options & PCRE_STUDY_MAPPED) != 0)
3613 start_bits = study->start_bits;
3614 }
3615
3616 /* For anchored or unanchored matches, there may be a "last known required
3617 character" set. */
3618
3619 if ((re->options & PCRE_REQCHSET) != 0)
3620 {
3621 req_byte = re->req_byte & 255;
3622 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3623 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3624 }
3625
3626 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3627 the loop runs just once. */
3628
3629 do
3630 {
3631 USPTR save_end_subject = end_subject;
3632
3633 /* Reset the maximum number of extractions we might see. */
3634
3635 if (match_block.offset_vector != NULL)
3636 {
3637 register int *iptr = match_block.offset_vector;
3638 register int *iend = iptr + resetcount;
3639 while (iptr < iend) *iptr++ = -1;
3640 }
3641
3642 /* Advance to a unique first char if possible. If firstline is TRUE, the
3643 start of the match is constrained to the first line of a multiline string.
3644 Implement this by temporarily adjusting end_subject so that we stop scanning
3645 at a newline. If the match fails at the newline, later code breaks this loop.
3646 */
3647
3648 if (firstline)
3649 {
3650 USPTR t = start_match;
3651 while (t < save_end_subject && *t != '\n') t++;
3652 end_subject = t;
3653 }
3654
3655 /* Now test for a unique first byte */
3656
3657 if (first_byte >= 0)
3658 {
3659 if (first_byte_caseless)
3660 while (start_match < end_subject &&
3661 match_block.lcc[*start_match] != first_byte)
3662 start_match++;
3663 else
3664 while (start_match < end_subject && *start_match != first_byte)
3665 start_match++;
3666 }
3667
3668 /* Or to just after \n for a multiline match if possible */
3669
3670 else if (startline)
3671 {
3672 if (start_match > match_block.start_subject + start_offset)
3673 {
3674 while (start_match < end_subject && start_match[-1] != NEWLINE)
3675 start_match++;
3676 }
3677 }
3678
3679 /* Or to a non-unique first char after study */
3680
3681 else if (start_bits != NULL)
3682 {
3683 while (start_match < end_subject)
3684 {
3685 register unsigned int c = *start_match;
3686 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3687 }
3688 }
3689
3690 /* Restore fudged end_subject */
3691
3692 end_subject = save_end_subject;
3693
3694 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3695 printf(">>>> Match against: ");
3696 pchars(start_match, end_subject - start_match, TRUE, &match_block);
3697 printf("\n");
3698 #endif
3699
3700 /* If req_byte is set, we know that that character must appear in the subject
3701 for the match to succeed. If the first character is set, req_byte must be
3702 later in the subject; otherwise the test starts at the match point. This
3703 optimization can save a huge amount of backtracking in patterns with nested
3704 unlimited repeats that aren't going to match. Writing separate code for
3705 cased/caseless versions makes it go faster, as does using an autoincrement
3706 and backing off on a match.
3707
3708 HOWEVER: when the subject string is very, very long, searching to its end can
3709 take a long time, and give bad performance on quite ordinary patterns. This
3710 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3711 don't do this when the string is sufficiently long.
3712
3713 ALSO: this processing is disabled when partial matching is requested.
3714 */
3715
3716 if (req_byte >= 0 &&
3717 end_subject - start_match < REQ_BYTE_MAX &&
3718 !match_block.partial)
3719 {
3720 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
3721
3722 /* We don't need to repeat the search if we haven't yet reached the
3723 place we found it at last time. */
3724
3725 if (p > req_byte_ptr)
3726 {
3727 if (req_byte_caseless)
3728 {
3729 while (p < end_subject)
3730 {
3731 register int pp = *p++;
3732 if (pp == req_byte || pp == req_byte2) { p--; break; }
3733 }
3734 }
3735 else
3736 {
3737 while (p < end_subject)
3738 {
3739 if (*p++ == req_byte) { p--; break; }
3740 }
3741 }
3742
3743 /* If we can't find the required character, break the matching loop */
3744
3745 if (p >= end_subject) break;
3746
3747 /* If we have found the required character, save the point where we
3748 found it, so that we don't search again next time round the loop if
3749 the start hasn't passed this character yet. */
3750
3751 req_byte_ptr = p;
3752 }
3753 }
3754
3755 /* When a match occurs, substrings will be set for all internal extractions;
3756 we just need to set up the whole thing as substring 0 before returning. If
3757 there were too many extractions, set the return code to zero. In the case
3758 where we had to get some local store to hold offsets for backreferences, copy
3759 those back references that we can. In this case there need not be overflow
3760 if certain parts of the pattern were not used. */
3761
3762 match_block.start_match = start_match;
3763 match_block.match_call_count = 0;
3764
3765 rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
3766 match_isgroup, 0);
3767
3768 /* When the result is no match, if the subject's first character was a
3769 newline and the PCRE_FIRSTLINE option is set, break (which will return
3770 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3771 newline in the subject. Otherwise, advance the pointer to the next character
3772 and continue - but the continuation will actually happen only when the
3773 pattern is not anchored. */
3774
3775 if (rc == MATCH_NOMATCH)
3776 {
3777 if (firstline && *start_match == NEWLINE) break;
3778 start_match++;
3779 #ifdef SUPPORT_UTF8
3780 if (match_block.utf8)
3781 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3782 start_match++;
3783 #endif
3784 continue;
3785 }
3786
3787 if (rc != MATCH_MATCH)
3788 {
3789 DPRINTF((">>>> error: returning %d\n", rc));
3790 return rc;
3791 }
3792
3793 /* We have a match! Copy the offset information from temporary store if
3794 necessary */
3795
3796 if (using_temporary_offsets)
3797 {
3798 if (offsetcount >= 4)
3799 {
3800 memcpy(offsets + 2, match_block.offset_vector + 2,
3801 (offsetcount - 2) * sizeof(int));
3802 DPRINTF(("Copied offsets from temporary memory\n"));
3803 }
3804 if (match_block.end_offset_top > offsetcount)
3805 match_block.offset_overflow = TRUE;
3806
3807 DPRINTF(("Freeing temporary memory\n"));
3808 (pcre_free)(match_block.offset_vector);
3809 }
3810
3811 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3812
3813 if (offsetcount < 2) rc = 0; else
3814 {
3815 offsets[0] = start_match - match_block.start_subject;
3816 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3817 }
3818
3819 DPRINTF((">>>> returning %d\n", rc));
3820 return rc;
3821 }
3822
3823 /* This "while" is the end of the "do" above */
3824
3825 while (!anchored && start_match <= end_subject);
3826
3827 if (using_temporary_offsets)
3828 {
3829 DPRINTF(("Freeing temporary memory\n"));
3830 (pcre_free)(match_block.offset_vector);
3831 }
3832
3833 if (match_block.partial && match_block.hitend)
3834 {
3835 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3836 return PCRE_ERROR_PARTIAL;
3837 }
3838 else
3839 {
3840 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3841 return PCRE_ERROR_NOMATCH;
3842 }
3843 }
3844
3845 /* End of pcre_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12