/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91 - (show annotations) (download)
Sat Feb 24 21:41:34 2007 UTC (7 years, 1 month ago) by nigel
File MIME type: text/plain
File size: 119563 byte(s)
Load pcre-6.7 into code/trunk.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Copyright (c) 1997-2006 University of Cambridge
10
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14
15 * Redistributions of source code must retain the above copyright notice,
16 this list of conditions and the following disclaimer.
17
18 * Redistributions in binary form must reproduce the above copyright
19 notice, this list of conditions and the following disclaimer in the
20 documentation and/or other materials provided with the distribution.
21
22 * Neither the name of the University of Cambridge nor the names of its
23 contributors may be used to endorse or promote products derived from
24 this software without specific prior written permission.
25
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39
40
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44
45 #define NLBLOCK md /* The block containing newline information */
46 #include "pcre_internal.h"
47
48
49 /* Structure for building a chain of data that actually lives on the
50 stack, for holding the values of the subject pointer at the start of each
51 subpattern, so as to detect when an empty string has been matched by a
52 subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
53 are on the heap, not on the stack. */
54
55 typedef struct eptrblock {
56 struct eptrblock *epb_prev;
57 USPTR epb_saved_eptr;
58 } eptrblock;
59
60 /* Flag bits for the match() function */
61
#define match_condassert  (1 << 0)  /* Called to check a condition assertion */
#define match_isgroup     (1 << 1)  /* Set if start of bracketed group */

/* Values match() hands back when no error occurred. Errors are reported with
the externally defined PCRE_ERROR_xxx codes, every one of which is negative. */

#define MATCH_NOMATCH  0
#define MATCH_MATCH    1

/* Largest number of offset ints that a recursive call saves on the stack; a
bigger offset vector is saved in memory obtained from malloc instead. The
offset vector is always a multiple of 3 long, so keep this a multiple of 3. */

#define REC_STACK_SAVE_MAX  (3 * 10)
76
77 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
78
/* Both tables have six entries and are indexed by the same value.
NOTE(review): the index is presumably the repeat opcode's offset from the first
common repeat opcode - confirm against the opcode order in pcre_internal.h. */

static const char rep_min[] = {
  0, 0,         /* entries 0-1: minimum of zero */
  1, 1,         /* entries 2-3: minimum of one */
  0, 0          /* entries 4-5: minimum of zero */
};

static const char rep_max[] = {
  0, 0, 0, 0,   /* entries 0-3: no upper limit (0 => infinity) */
  1, 1          /* entries 4-5: at most one */
};
81
82
83
84 #ifdef DEBUG
85 /*************************************************
86 * Debugging function to print chars *
87 *************************************************/
88
89 /* Print a sequence of chars in printable format, stopping at the end of the
90 subject if the requested.
91
92 Arguments:
93 p points to characters
94 length number to print
95 is_subject TRUE if printing from within md->start_subject
96 md pointer to matching data block, if is_subject is TRUE
97
98 Returns: nothing
99 */
100
101 static void
102 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
103 {
104 int c;
105 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
106 while (length-- > 0)
107 if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
108 }
109 #endif
110
111
112
113 /*************************************************
114 * Match a back-reference *
115 *************************************************/
116
117 /* If a back reference hasn't been set, the length that is passed is greater
118 than the number of characters left in the string, so the match fails.
119
120 Arguments:
121 offset index into the offset vector
122 eptr points into the subject
123 length length to be matched
124 md points to match data block
125 ims the ims flags
126
127 Returns: TRUE if matched
128 */
129
130 static BOOL
131 match_ref(int offset, register USPTR eptr, int length, match_data *md,
132 unsigned long int ims)
133 {
134 USPTR p = md->start_subject + md->offset_vector[offset];
135
136 #ifdef DEBUG
137 if (eptr >= md->end_subject)
138 printf("matching subject <null>");
139 else
140 {
141 printf("matching subject ");
142 pchars(eptr, length, TRUE, md);
143 }
144 printf(" against backref ");
145 pchars(p, length, FALSE, md);
146 printf("\n");
147 #endif
148
149 /* Always fail if not enough characters left */
150
151 if (length > md->end_subject - eptr) return FALSE;
152
153 /* Separate the caselesss case for speed */
154
155 if ((ims & PCRE_CASELESS) != 0)
156 {
157 while (length-- > 0)
158 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
159 }
160 else
161 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
162
163 return TRUE;
164 }
165
166
167
168 /***************************************************************************
169 ****************************************************************************
170 RECURSION IN THE match() FUNCTION
171
172 The match() function is highly recursive, though not every recursive call
173 increases the recursive depth. Nevertheless, some regular expressions can cause
174 it to recurse to a great depth. I was writing for Unix, so I just let it call
175 itself recursively. This uses the stack for saving everything that has to be
176 saved for a recursive call. On Unix, the stack can be large, and this works
177 fine.
178
179 It turns out that on some non-Unix-like systems there are problems with
180 programs that use a lot of stack. (This despite the fact that every last chip
181 has oodles of memory these days, and techniques for extending the stack have
182 been known for decades.) So....
183
184 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
185 calls by keeping local variables that need to be preserved in blocks of memory
186 obtained from malloc() instead of on the stack. Macros are used to
187 achieve this so that the actual code doesn't look very different to what it
188 always used to.
189 ****************************************************************************
190 ***************************************************************************/
191
192
193 /* These versions of the macros use the stack, as normal. There are debugging
194 versions and production versions. */
195
196 #ifndef NO_RECURSE
/* With real recursion the hot arguments of match() may live in registers. */

#define REGISTER register

#ifdef DEBUG

/* Tracing variants: each call and each return reports its source line. The
call passes rdepth+1 so that the recursion depth is tracked. */

#define RMATCH(rval,subj,code,top,mblock,opts,eblock,mflags) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  (rval) = match((subj),(code),(top),(mblock),(opts),(eblock),(mflags),rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(rval) \
  { \
  printf("match() returned %d from line %d ", (rval), __LINE__); \
  return (rval); \
  }

#else

/* Production variants: a plain recursive call and a plain return. */

#define RMATCH(rval,subj,code,top,mblock,opts,eblock,mflags) \
  (rval) = match((subj),(code),(top),(mblock),(opts),(eblock),(mflags),rdepth+1)

#define RRETURN(rval) return (rval)

#endif
215
216 #else
217
218
219 /* These versions of the macros manage a private stack on the heap. Note
220 that the rd argument of RMATCH isn't actually used. It's the md argument of
221 match(), which never changes. */
222
/* Frame variables are accessed through a pointer, so "register" is dropped. */

#define REGISTER

/* RMATCH simulates a recursive call of match(). It obtains a new frame, marks
the resume point in the current frame with setjmp(), copies the "arguments"
into the new frame, makes it current, and jumps to HEAP_RECURSE. When the
simulated call finishes, RRETURN longjmps back here and the result is picked up
from the frame. The rd argument is accepted only to keep the same call shape as
the recursive version; it is not used.

If no memory can be obtained for the new frame, the current simulated call
fails with PCRE_ERROR_NOMEMORY (via RRETURN) instead of dereferencing NULL.

After the longjmp, frame is reloaded from md->thisframe, where RRETURN stored
it. NOTE(review): presumably this is because a local changed after setjmp()
cannot be relied upon once longjmp() has been taken. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }

/* RRETURN simulates a return from match(). The current frame is freed and the
previous one becomes current. If there is a previous frame, the result is left
in it and control longjmps to the point where its RMATCH was waiting. If there
is none, this was the top-level call, so a real return is made. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }
263
264
265 /* Structure for remembering the local variables in a private frame */
266
267 typedef struct heapframe {
268 struct heapframe *Xprevframe;
269
270 /* Function arguments that may change */
271
272 const uschar *Xeptr;
273 const uschar *Xecode;
274 int Xoffset_top;
275 long int Xims;
276 eptrblock *Xeptrb;
277 int Xflags;
278 unsigned int Xrdepth;
279
280 /* Function local variables */
281
282 const uschar *Xcallpat;
283 const uschar *Xcharptr;
284 const uschar *Xdata;
285 const uschar *Xnext;
286 const uschar *Xpp;
287 const uschar *Xprev;
288 const uschar *Xsaved_eptr;
289
290 recursion_info Xnew_recursive;
291
292 BOOL Xcur_is_word;
293 BOOL Xcondition;
294 BOOL Xminimize;
295 BOOL Xprev_is_word;
296
297 unsigned long int Xoriginal_ims;
298
299 #ifdef SUPPORT_UCP
300 int Xprop_type;
301 int Xprop_value;
302 int Xprop_fail_result;
303 int Xprop_category;
304 int Xprop_chartype;
305 int Xprop_script;
306 int *Xprop_test_variable;
307 #endif
308
309 int Xctype;
310 int Xfc;
311 int Xfi;
312 int Xlength;
313 int Xmax;
314 int Xmin;
315 int Xnumber;
316 int Xoffset;
317 int Xop;
318 int Xsave_capture_last;
319 int Xsave_offset1, Xsave_offset2, Xsave_offset3;
320 int Xstacksave[REC_STACK_SAVE_MAX];
321
322 eptrblock Xnewptrb;
323
324 /* Place to pass back result, and where to jump back to */
325
326 int Xresult;
327 jmp_buf Xwhere;
328
329 } heapframe;
330
331 #endif
332
333
334 /***************************************************************************
335 ***************************************************************************/
336
337
338
339 /*************************************************
340 * Match from current position *
341 *************************************************/
342
343 /* On entry ecode points to the first opcode, and eptr to the first character
344 in the subject string, while eptrb holds the value of eptr at the start of the
345 last bracketed group - used for breaking infinite loops matching zero-length
346 strings. This function is called recursively in many circumstances. Whenever it
347 returns a negative (error) response, the outer incarnation must also return the
348 same response.
349
350 Performance note: It might be tempting to extract commonly used fields from the
351 md structure (e.g. utf8, end_subject) into individual variables to improve
352 performance. Tests using gcc on a SPARC disproved this; in the first case, it
353 made performance worse.
354
355 Arguments:
356 eptr pointer in subject
357 ecode position in code
358 offset_top current top pointer
359 md pointer to "static" info for the match
360 ims current /i, /m, and /s options
361 eptrb pointer to chain of blocks containing eptr at start of
362 brackets - for testing for empty matches
363 flags can contain
364 match_condassert - this is an assertion condition
365 match_isgroup - this is the start of a bracketed group
366 rdepth the recursion depth
367
368 Returns: MATCH_MATCH if matched ) these values are >= 0
369 MATCH_NOMATCH if failed to match )
370 a negative PCRE_ERROR_xxx value if aborted by an error condition
371 (e.g. stopped by repeated call or recursion limit)
372 */
373
374 static int
375 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
376 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
377 int flags, unsigned int rdepth)
378 {
379 /* These variables do not need to be preserved over recursion in this function,
380 so they can be ordinary variables in all cases. Mark them with "register"
381 because they are used a lot in loops. */
382
383 register int rrc; /* Returns from recursive calls */
384 register int i; /* Used for loops not involving calls to RMATCH() */
385 register unsigned int c; /* Character values not kept over RMATCH() calls */
386 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
387
388 /* When recursion is not being used, all "local" variables that have to be
389 preserved over calls to RMATCH() are part of a "frame" which is obtained from
390 heap storage. Set up the top-level frame here; others are obtained from the
391 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
392
393 #ifdef NO_RECURSE
394 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
395 frame->Xprevframe = NULL; /* Marks the top level */
396
397 /* Copy in the original argument variables */
398
399 frame->Xeptr = eptr;
400 frame->Xecode = ecode;
401 frame->Xoffset_top = offset_top;
402 frame->Xims = ims;
403 frame->Xeptrb = eptrb;
404 frame->Xflags = flags;
405 frame->Xrdepth = rdepth;
406
407 /* This is where control jumps back to in order to effect "recursion" */
408
409 HEAP_RECURSE:
410
411 /* Macros make the argument variables come from the current frame */
412
413 #define eptr frame->Xeptr
414 #define ecode frame->Xecode
415 #define offset_top frame->Xoffset_top
416 #define ims frame->Xims
417 #define eptrb frame->Xeptrb
418 #define flags frame->Xflags
419 #define rdepth frame->Xrdepth
420
421 /* Ditto for the local variables */
422
423 #ifdef SUPPORT_UTF8
424 #define charptr frame->Xcharptr
425 #endif
426 #define callpat frame->Xcallpat
427 #define data frame->Xdata
428 #define next frame->Xnext
429 #define pp frame->Xpp
430 #define prev frame->Xprev
431 #define saved_eptr frame->Xsaved_eptr
432
433 #define new_recursive frame->Xnew_recursive
434
435 #define cur_is_word frame->Xcur_is_word
436 #define condition frame->Xcondition
437 #define minimize frame->Xminimize
438 #define prev_is_word frame->Xprev_is_word
439
440 #define original_ims frame->Xoriginal_ims
441
442 #ifdef SUPPORT_UCP
443 #define prop_type frame->Xprop_type
444 #define prop_value frame->Xprop_value
445 #define prop_fail_result frame->Xprop_fail_result
446 #define prop_category frame->Xprop_category
447 #define prop_chartype frame->Xprop_chartype
448 #define prop_script frame->Xprop_script
449 #define prop_test_variable frame->Xprop_test_variable
450 #endif
451
452 #define ctype frame->Xctype
453 #define fc frame->Xfc
454 #define fi frame->Xfi
455 #define length frame->Xlength
456 #define max frame->Xmax
457 #define min frame->Xmin
458 #define number frame->Xnumber
459 #define offset frame->Xoffset
460 #define op frame->Xop
461 #define save_capture_last frame->Xsave_capture_last
462 #define save_offset1 frame->Xsave_offset1
463 #define save_offset2 frame->Xsave_offset2
464 #define save_offset3 frame->Xsave_offset3
465 #define stacksave frame->Xstacksave
466
467 #define newptrb frame->Xnewptrb
468
469 /* When recursion is being used, local variables are allocated on the stack and
470 get preserved during recursion in the normal way. In this environment, fi and
471 i, and fc and c, can be the same variables. */
472
473 #else
474 #define fi i
475 #define fc c
476
477
478 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
479 const uschar *charptr; /* in small blocks of the code. My normal */
480 #endif /* style of coding would have declared */
481 const uschar *callpat; /* them within each of those blocks. */
482 const uschar *data; /* However, in order to accommodate the */
483 const uschar *next; /* version of this code that uses an */
484 USPTR pp; /* external "stack" implemented on the */
485 const uschar *prev; /* heap, it is easier to declare them all */
486 USPTR saved_eptr; /* here, so the declarations can be cut */
487 /* out in a block. The only declarations */
488 recursion_info new_recursive; /* within blocks below are for variables */
489 /* that do not have to be preserved over */
490 BOOL cur_is_word; /* a recursive call to RMATCH(). */
491 BOOL condition;
492 BOOL minimize;
493 BOOL prev_is_word;
494
495 unsigned long int original_ims;
496
497 #ifdef SUPPORT_UCP
498 int prop_type;
499 int prop_value;
500 int prop_fail_result;
501 int prop_category;
502 int prop_chartype;
503 int prop_script;
504 int *prop_test_variable;
505 #endif
506
507 int ctype;
508 int length;
509 int max;
510 int min;
511 int number;
512 int offset;
513 int op;
514 int save_capture_last;
515 int save_offset1, save_offset2, save_offset3;
516 int stacksave[REC_STACK_SAVE_MAX];
517
518 eptrblock newptrb;
519 #endif
520
521 /* These statements are here to stop the compiler complaining about uninitialized
522 variables. */
523
524 #ifdef SUPPORT_UCP
525 prop_value = 0;
526 prop_fail_result = 0;
527 prop_test_variable = NULL;
528 #endif
529
530 /* This label is used for tail recursion, which is used in a few cases even
531 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
532 used. Thanks to Ian Taylor for noticing this possibility and sending the
533 original patch. */
534
535 TAIL_RECURSE:
536
537 /* OK, now we can get on with the real code of the function. Recursive calls
538 are specified by the macro RMATCH and RRETURN is used to return. When
539 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
540 and a "return", respectively (possibly with some debugging if DEBUG is
541 defined). However, RMATCH isn't like a function call because it's quite a
542 complicated macro. It has to be used in one particular way. This shouldn't,
543 however, impact performance when true recursion is being used. */
544
545 /* First check that we haven't called match() too many times, or that we
546 haven't exceeded the recursive call limit. */
547
548 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
549 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
550
551 original_ims = ims; /* Save for resetting on ')' */
552
553 #ifdef SUPPORT_UTF8
554 utf8 = md->utf8; /* Local copy of the flag */
555 #else
556 utf8 = FALSE;
557 #endif
558
559 /* At the start of a bracketed group, add the current subject pointer to the
560 stack of such pointers, to be re-instated at the end of the group when we hit
561 the closing ket. When match() is called in other circumstances, we don't add to
562 this stack. */
563
564 if ((flags & match_isgroup) != 0)
565 {
566 newptrb.epb_prev = eptrb;
567 newptrb.epb_saved_eptr = eptr;
568 eptrb = &newptrb;
569 }
570
571 /* Now start processing the operations. */
572
573 for (;;)
574 {
575 op = *ecode;
576 minimize = FALSE;
577
578 /* For partial matching, remember if we ever hit the end of the subject after
579 matching at least one subject character. */
580
581 if (md->partial &&
582 eptr >= md->end_subject &&
583 eptr > md->start_match)
584 md->hitend = TRUE;
585
586 /* Opening capturing bracket. If there is space in the offset vector, save
587 the current subject position in the working slot at the top of the vector. We
588 mustn't change the current values of the data slot, because they may be set
589 from a previous iteration of this group, and be referred to by a reference
590 inside the group.
591
592 If the bracket fails to match, we need to restore this value and also the
593 values of the final offsets, in case they were set by a previous iteration of
594 the same bracket.
595
596 If there isn't enough space in the offset vector, treat this as if it were a
597 non-capturing bracket. Don't worry about setting the flag for the error case
598 here; that is handled in the code for KET. */
599
600 if (op > OP_BRA)
601 {
602 number = op - OP_BRA;
603
604 /* For extended extraction brackets (large number), we have to fish out the
605 number from a dummy opcode at the start. */
606
607 if (number > EXTRACT_BASIC_MAX)
608 number = GET2(ecode, 2+LINK_SIZE);
609 offset = number << 1;
610
611 #ifdef DEBUG
612 printf("start bracket %d subject=", number);
613 pchars(eptr, 16, TRUE, md);
614 printf("\n");
615 #endif
616
617 if (offset < md->offset_max)
618 {
619 save_offset1 = md->offset_vector[offset];
620 save_offset2 = md->offset_vector[offset+1];
621 save_offset3 = md->offset_vector[md->offset_end - number];
622 save_capture_last = md->capture_last;
623
624 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
625 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
626
627 do
628 {
629 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
630 match_isgroup);
631 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
632 md->capture_last = save_capture_last;
633 ecode += GET(ecode, 1);
634 }
635 while (*ecode == OP_ALT);
636
637 DPRINTF(("bracket %d failed\n", number));
638
639 md->offset_vector[offset] = save_offset1;
640 md->offset_vector[offset+1] = save_offset2;
641 md->offset_vector[md->offset_end - number] = save_offset3;
642
643 RRETURN(MATCH_NOMATCH);
644 }
645
646 /* Insufficient room for saving captured contents */
647
648 else op = OP_BRA;
649 }
650
651 /* Other types of node can be handled by a switch */
652
653 switch(op)
654 {
655 case OP_BRA: /* Non-capturing bracket: optimized */
656 DPRINTF(("start bracket 0\n"));
657
658 /* Loop for all the alternatives */
659
660 for (;;)
661 {
662 /* When we get to the final alternative within the brackets, we would
663 return the result of a recursive call to match() whatever happened. We
664 can reduce stack usage by turning this into a tail recursion. */
665
666 if (ecode[GET(ecode, 1)] != OP_ALT)
667 {
668 ecode += 1 + LINK_SIZE;
669 flags = match_isgroup;
670 DPRINTF(("bracket 0 tail recursion\n"));
671 goto TAIL_RECURSE;
672 }
673
674 /* For non-final alternatives, continue the loop for a NOMATCH result;
675 otherwise return. */
676
677 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
678 match_isgroup);
679 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
680 ecode += GET(ecode, 1);
681 }
682 /* Control never reaches here. */
683
684 /* Conditional group: compilation checked that there are no more than
685 two branches. If the condition is false, skipping the first branch takes us
686 past the end if there is only one branch, but that's OK because that is
687 exactly what going to the ket would do. As there is only one branch to be
688 obeyed, we can use tail recursion to avoid using another stack frame. */
689
690 case OP_COND:
691 if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
692 {
693 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
694 condition = (offset == CREF_RECURSE * 2)?
695 (md->recursive != NULL) :
696 (offset < offset_top && md->offset_vector[offset] >= 0);
697 ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1));
698 flags = match_isgroup;
699 goto TAIL_RECURSE;
700 }
701
702 /* The condition is an assertion. Call match() to evaluate it - setting
703 the match_condassert flag causes it to stop at the end of an assertion. */
704
705 else
706 {
707 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
708 match_condassert | match_isgroup);
709 if (rrc == MATCH_MATCH)
710 {
711 ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
712 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
713 }
714 else if (rrc != MATCH_NOMATCH)
715 {
716 RRETURN(rrc); /* Need braces because of following else */
717 }
718 else ecode += GET(ecode, 1);
719
720 /* We are now at the branch that is to be obeyed. As there is only one,
721 we can use tail recursion to avoid using another stack frame. */
722
723 ecode += 1 + LINK_SIZE;
724 flags = match_isgroup;
725 goto TAIL_RECURSE;
726 }
727 /* Control never reaches here */
728
729 /* Skip over conditional reference or large extraction number data if
730 encountered. */
731
732 case OP_CREF:
733 case OP_BRANUMBER:
734 ecode += 3;
735 break;
736
737 /* End of the pattern. If we are in a recursion, we should restore the
738 offsets appropriately and continue from after the call. */
739
740 case OP_END:
741 if (md->recursive != NULL && md->recursive->group_num == 0)
742 {
743 recursion_info *rec = md->recursive;
744 DPRINTF(("End of pattern in a (?0) recursion\n"));
745 md->recursive = rec->prevrec;
746 memmove(md->offset_vector, rec->offset_save,
747 rec->saved_max * sizeof(int));
748 md->start_match = rec->save_start;
749 ims = original_ims;
750 ecode = rec->after_call;
751 break;
752 }
753
754 /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
755 string - backtracking will then try other alternatives, if any. */
756
757 if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
758 md->end_match_ptr = eptr; /* Record where we ended */
759 md->end_offset_top = offset_top; /* and how many extracts were taken */
760 RRETURN(MATCH_MATCH);
761
762 /* Change option settings */
763
764 case OP_OPT:
765 ims = ecode[1];
766 ecode += 2;
767 DPRINTF(("ims set to %02lx\n", ims));
768 break;
769
770 /* Assertion brackets. Check the alternative branches in turn - the
771 matching won't pass the KET for an assertion. If any one branch matches,
772 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
773 start of each branch to move the current point backwards, so the code at
774 this level is identical to the lookahead case. */
775
776 case OP_ASSERT:
777 case OP_ASSERTBACK:
778 do
779 {
780 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
781 match_isgroup);
782 if (rrc == MATCH_MATCH) break;
783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
784 ecode += GET(ecode, 1);
785 }
786 while (*ecode == OP_ALT);
787 if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
788
789 /* If checking an assertion for a condition, return MATCH_MATCH. */
790
791 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
792
793 /* Continue from after the assertion, updating the offsets high water
794 mark, since extracts may have been taken during the assertion. */
795
796 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
797 ecode += 1 + LINK_SIZE;
798 offset_top = md->end_offset_top;
799 continue;
800
801 /* Negative assertion: all branches must fail to match */
802
803 case OP_ASSERT_NOT:
804 case OP_ASSERTBACK_NOT:
805 do
806 {
807 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
808 match_isgroup);
809 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
810 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
811 ecode += GET(ecode,1);
812 }
813 while (*ecode == OP_ALT);
814
815 if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
816
817 ecode += 1 + LINK_SIZE;
818 continue;
819
820 /* Move the subject pointer back. This occurs only at the start of
821 each branch of a lookbehind assertion. If we are too close to the start to
822 move back, this match function fails. When working with UTF-8 we move
823 back a number of characters, not bytes. */
824
825 case OP_REVERSE:
826 #ifdef SUPPORT_UTF8
827 if (utf8)
828 {
829 c = GET(ecode,1);
830 for (i = 0; i < c; i++)
831 {
832 eptr--;
833 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
834 BACKCHAR(eptr)
835 }
836 }
837 else
838 #endif
839
840 /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
841
842 {
843 eptr -= GET(ecode,1);
844 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
845 }
846
847 /* Skip to next op code */
848
849 ecode += 1 + LINK_SIZE;
850 break;
851
852 /* The callout item calls an external function, if one is provided, passing
853 details of the match so far. This is mainly for debugging, though the
854 function is able to force a failure. */
855
856 case OP_CALLOUT:
857 if (pcre_callout != NULL)
858 {
859 pcre_callout_block cb;
860 cb.version = 1; /* Version 1 of the callout block */
861 cb.callout_number = ecode[1];
862 cb.offset_vector = md->offset_vector;
863 cb.subject = (PCRE_SPTR)md->start_subject;
864 cb.subject_length = md->end_subject - md->start_subject;
865 cb.start_match = md->start_match - md->start_subject;
866 cb.current_position = eptr - md->start_subject;
867 cb.pattern_position = GET(ecode, 2);
868 cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
869 cb.capture_top = offset_top/2;
870 cb.capture_last = md->capture_last;
871 cb.callout_data = md->callout_data;
872 if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
873 if (rrc < 0) RRETURN(rrc);
874 }
875 ecode += 2 + 2*LINK_SIZE;
876 break;
877
878 /* Recursion either matches the current regex, or some subexpression. The
879 offset data is the offset to the starting bracket from the start of the
880 whole pattern. (This is so that it works from duplicated subpatterns.)
881
882 If there are any capturing brackets started but not finished, we have to
883 save their starting points and reinstate them after the recursion. However,
884 we don't know how many such there are (offset_top records the completed
885 total) so we just have to save all the potential data. There may be up to
886 65535 such values, which is too large to put on the stack, but using malloc
887 for small numbers seems expensive. As a compromise, the stack is used when
888 there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
889 is used. If that malloc fails, the match is abandoned at once: the code below
890 returns PCRE_ERROR_NOMEMORY, which is passed back up to the top level like
891 any other negative (error) return from match().
892
893 There are also other values that have to be saved. We use a chained
894 sequence of blocks that actually live on the stack. Thanks to Robin Houston
895 for the original version of this logic. */
896
897 case OP_RECURSE:
898 {
899 callpat = md->start_code + GET(ecode, 1);
900 new_recursive.group_num = *callpat - OP_BRA;
901
902 /* For extended extraction brackets (large number), we have to fish out
903 the number from a dummy opcode at the start. */
904
905 if (new_recursive.group_num > EXTRACT_BASIC_MAX)
906 new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
907
908 /* Add to "recursing stack" */
909
910 new_recursive.prevrec = md->recursive;
911 md->recursive = &new_recursive;
912
913 /* Find where to continue from afterwards */
914
915 ecode += 1 + LINK_SIZE;
916 new_recursive.after_call = ecode;
917
918 /* Now save the offset data. */
919
920 new_recursive.saved_max = md->offset_end;
921 if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
922 new_recursive.offset_save = stacksave;
923 else
924 {
925 new_recursive.offset_save =
926 (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
927 if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
928 }
929
930 memcpy(new_recursive.offset_save, md->offset_vector,
931 new_recursive.saved_max * sizeof(int));
932 new_recursive.save_start = md->start_match;
933 md->start_match = eptr;
934
935 /* OK, now we can do the recursion. For each top-level alternative we
936 restore the offset and recursion data. */
937
938 DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
939 do
940 {
941 RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
942 eptrb, match_isgroup);
943 if (rrc == MATCH_MATCH)
944 {
945 DPRINTF(("Recursion matched\n"));
946 md->recursive = new_recursive.prevrec;
947 if (new_recursive.offset_save != stacksave)
948 (pcre_free)(new_recursive.offset_save);
949 RRETURN(MATCH_MATCH);
950 }
951 else if (rrc != MATCH_NOMATCH)
952 {
953 DPRINTF(("Recursion gave error %d\n", rrc));
954 RRETURN(rrc);
955 }
956
957 md->recursive = &new_recursive;
958 memcpy(md->offset_vector, new_recursive.offset_save,
959 new_recursive.saved_max * sizeof(int));
960 callpat += GET(callpat, 1);
961 }
962 while (*callpat == OP_ALT);
963
964 DPRINTF(("Recursion didn't match\n"));
965 md->recursive = new_recursive.prevrec;
966 if (new_recursive.offset_save != stacksave)
967 (pcre_free)(new_recursive.offset_save);
968 RRETURN(MATCH_NOMATCH);
969 }
970 /* Control never reaches here */
971
972 /* "Once" brackets are like assertion brackets except that after a match,
973 the point in the subject string is not moved back. Thus there can never be
974 a move back into the brackets. Friedl calls these "atomic" subpatterns.
975 Check the alternative branches in turn - the matching won't pass the KET
976 for this kind of subpattern. If any one branch matches, we carry on as at
977 the end of a normal bracket, leaving the subject pointer. */
978
979 case OP_ONCE:
980 prev = ecode;
981 saved_eptr = eptr;
982
983 do
984 {
985 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
986 eptrb, match_isgroup);
987 if (rrc == MATCH_MATCH) break;
988 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
989 ecode += GET(ecode,1);
990 }
991 while (*ecode == OP_ALT);
992
993 /* If hit the end of the group (which could be repeated), fail */
994
995 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
996
997 /* Continue as from after the atomic group, updating the offsets high water
998 mark, since extracts may have been taken. */
999
1000 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1001
1002 offset_top = md->end_offset_top;
1003 eptr = md->end_match_ptr;
1004
1005 /* For a non-repeating ket, just continue at this level. This also
1006 happens for a repeating ket if no characters were matched in the group.
1007 This is the forcible breaking of infinite loops as implemented in Perl
1008 5.005. If there is an options reset, it will get obeyed in the normal
1009 course of events. */
1010
1011 if (*ecode == OP_KET || eptr == saved_eptr)
1012 {
1013 ecode += 1+LINK_SIZE;
1014 break;
1015 }
1016
1017 /* The repeating kets try the rest of the pattern or restart from the
1018 preceding bracket, in the appropriate order. The second "call" of match()
1019 uses tail recursion, to avoid using another stack frame. We need to reset
1020 any options that changed within the bracket before re-running it, so
1021 check the next opcode. */
1022
1023 if (ecode[1+LINK_SIZE] == OP_OPT)
1024 {
1025 ims = (ims & ~PCRE_IMS) | ecode[4];
1026 DPRINTF(("ims set to %02lx at group repeat\n", ims));
1027 }
1028
1029 if (*ecode == OP_KETRMIN)
1030 {
1031 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1033 ecode = prev;
1034 flags = match_isgroup;
1035 goto TAIL_RECURSE;
1036 }
1037 else /* OP_KETRMAX */
1038 {
1039 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1040 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1041 ecode += 1 + LINK_SIZE;
1042 flags = 0;
1043 goto TAIL_RECURSE;
1044 }
1045 /* Control never gets here */
1046
1047 /* An alternation is the end of a branch; scan along to find the end of the
1048 bracketed group and go to there. */
1049
1050 case OP_ALT:
1051 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1052 break;
1053
1054 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1055 that it may occur zero times. It may repeat infinitely, or not at all -
1056 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1057 repeat limits are compiled as a number of copies, with the optional ones
1058 preceded by BRAZERO or BRAMINZERO. */
1059
1060 case OP_BRAZERO:
1061 {
1062 next = ecode+1;
1063 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
1064 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1065 do next += GET(next,1); while (*next == OP_ALT);
1066 ecode = next + 1+LINK_SIZE;
1067 }
1068 break;
1069
1070 case OP_BRAMINZERO:
1071 {
1072 next = ecode+1;
1073 do next += GET(next,1); while (*next == OP_ALT);
1074 RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1075 match_isgroup);
1076 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1077 ecode++;
1078 }
1079 break;
1080
1081 /* End of a group, repeated or non-repeating. If we are at the end of
1082 an assertion "group", stop matching and return MATCH_MATCH, but record the
1083 current high water mark for use by positive assertions. Do this also
1084 for the "once" (non-backing-up) groups. */
1085
1086 case OP_KET:
1087 case OP_KETRMIN:
1088 case OP_KETRMAX:
1089 prev = ecode - GET(ecode, 1);
1090 saved_eptr = eptrb->epb_saved_eptr;
1091
1092 /* Back up the stack of bracket start pointers. */
1093
1094 eptrb = eptrb->epb_prev;
1095
1096 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1097 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1098 *prev == OP_ONCE)
1099 {
1100 md->end_match_ptr = eptr; /* For ONCE */
1101 md->end_offset_top = offset_top;
1102 RRETURN(MATCH_MATCH);
1103 }
1104
1105 /* In all other cases except a conditional group we have to check the
1106 group number back at the start and if necessary complete handling an
1107 extraction by setting the offsets and bumping the high water mark. */
1108
1109 if (*prev != OP_COND)
1110 {
1111 number = *prev - OP_BRA;
1112
1113 /* For extended extraction brackets (large number), we have to fish out
1114 the number from a dummy opcode at the start. */
1115
1116 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1117 offset = number << 1;
1118
1119 #ifdef DEBUG
1120 printf("end bracket %d", number);
1121 printf("\n");
1122 #endif
1123
1124 /* Test for a numbered group. This includes groups called as a result
1125 of recursion. Note that whole-pattern recursion is coded as a recurse
1126 into group 0, so it won't be picked up here. Instead, we catch it when
1127 the OP_END is reached. */
1128
1129 if (number > 0)
1130 {
1131 md->capture_last = number;
1132 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1133 {
1134 md->offset_vector[offset] =
1135 md->offset_vector[md->offset_end - number];
1136 md->offset_vector[offset+1] = eptr - md->start_subject;
1137 if (offset_top <= offset) offset_top = offset + 2;
1138 }
1139
1140 /* Handle a recursively called group. Restore the offsets
1141 appropriately and continue from after the call. */
1142
1143 if (md->recursive != NULL && md->recursive->group_num == number)
1144 {
1145 recursion_info *rec = md->recursive;
1146 DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1147 md->recursive = rec->prevrec;
1148 md->start_match = rec->save_start;
1149 memcpy(md->offset_vector, rec->offset_save,
1150 rec->saved_max * sizeof(int));
1151 ecode = rec->after_call;
1152 ims = original_ims;
1153 break;
1154 }
1155 }
1156 }
1157
1158 /* Reset the value of the ims flags, in case they got changed during
1159 the group. */
1160
1161 ims = original_ims;
1162 DPRINTF(("ims reset to %02lx\n", ims));
1163
1164 /* For a non-repeating ket, just continue at this level. This also
1165 happens for a repeating ket if no characters were matched in the group.
1166 This is the forcible breaking of infinite loops as implemented in Perl
1167 5.005. If there is an options reset, it will get obeyed in the normal
1168 course of events. */
1169
1170 if (*ecode == OP_KET || eptr == saved_eptr)
1171 {
1172 ecode += 1 + LINK_SIZE;
1173 break;
1174 }
1175
1176 /* The repeating kets try the rest of the pattern or restart from the
1177 preceding bracket, in the appropriate order. In the second case, we can use
1178 tail recursion to avoid using another stack frame. */
1179
1180 if (*ecode == OP_KETRMIN)
1181 {
1182 RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1183 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1184 ecode = prev;
1185 flags = match_isgroup;
1186 goto TAIL_RECURSE;
1187 }
1188 else /* OP_KETRMAX */
1189 {
1190 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1191 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192 ecode += 1 + LINK_SIZE;
1193 flags = 0;
1194 goto TAIL_RECURSE;
1195 }
1196 /* Control never gets here */
1197
1198 /* Start of subject unless notbol, or after internal newline if multiline */
1199
1200 case OP_CIRC:
1201 if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1202 if ((ims & PCRE_MULTILINE) != 0)
1203 {
1204 if (eptr != md->start_subject &&
1205 (eptr == md->end_subject ||
1206 eptr < md->start_subject + md->nllen ||
1207 !IS_NEWLINE(eptr - md->nllen)))
1208 RRETURN(MATCH_NOMATCH);
1209 ecode++;
1210 break;
1211 }
1212 /* ... else fall through */
1213
1214 /* Start of subject assertion */
1215
1216 case OP_SOD:
1217 if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1218 ecode++;
1219 break;
1220
1221 /* Start of match assertion */
1222
1223 case OP_SOM:
1224 if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1225 ecode++;
1226 break;
1227
1228 /* Assert before internal newline if multiline, or before a terminating
1229 newline unless endonly is set, else end of subject unless noteol is set. */
1230
1231 case OP_DOLL:
1232 if ((ims & PCRE_MULTILINE) != 0)
1233 {
1234 if (eptr < md->end_subject)
1235 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1236 else
1237 { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1238 ecode++;
1239 break;
1240 }
1241 else
1242 {
1243 if (md->noteol) RRETURN(MATCH_NOMATCH);
1244 if (!md->endonly)
1245 {
1246 if (eptr != md->end_subject &&
1247 (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
1248 RRETURN(MATCH_NOMATCH);
1249 ecode++;
1250 break;
1251 }
1252 }
1253 /* ... else fall through for endonly */
1254
1255 /* End of subject assertion (\z) */
1256
1257 case OP_EOD:
1258 if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1259 ecode++;
1260 break;
1261
1262 /* End of subject or ending \n assertion (\Z) */
1263
1264 case OP_EODN:
1265 if (eptr != md->end_subject &&
1266 (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
1267 RRETURN(MATCH_NOMATCH);
1268 ecode++;
1269 break;
1270
1271 /* Word boundary assertions */
1272
1273 case OP_NOT_WORD_BOUNDARY:
1274 case OP_WORD_BOUNDARY:
1275 {
1276
1277 /* Find out if the previous and current characters are "word" characters.
1278 It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1279 be "non-word" characters. */
1280
1281 #ifdef SUPPORT_UTF8
1282 if (utf8)
1283 {
1284 if (eptr == md->start_subject) prev_is_word = FALSE; else
1285 {
1286 const uschar *lastptr = eptr - 1;
1287 while((*lastptr & 0xc0) == 0x80) lastptr--;
1288 GETCHAR(c, lastptr);
1289 prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1290 }
1291 if (eptr >= md->end_subject) cur_is_word = FALSE; else
1292 {
1293 GETCHAR(c, eptr);
1294 cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1295 }
1296 }
1297 else
1298 #endif
1299
1300 /* More streamlined when not in UTF-8 mode */
1301
1302 {
1303 prev_is_word = (eptr != md->start_subject) &&
1304 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1305 cur_is_word = (eptr < md->end_subject) &&
1306 ((md->ctypes[*eptr] & ctype_word) != 0);
1307 }
1308
1309 /* Now see if the situation is what we want */
1310
1311 if ((*ecode++ == OP_WORD_BOUNDARY)?
1312 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1313 RRETURN(MATCH_NOMATCH);
1314 }
1315 break;
1316
1317 /* Match a single character type; inline for speed */
1318
1319 case OP_ANY:
1320 if ((ims & PCRE_DOTALL) == 0)
1321 {
1322 if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
1323 RRETURN(MATCH_NOMATCH);
1324 }
1325 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1326 if (utf8)
1327 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1328 ecode++;
1329 break;
1330
1331 /* Match a single byte, even in UTF-8 mode. This opcode really does match
1332 any byte, even newline, independent of the setting of PCRE_DOTALL. */
1333
1334 case OP_ANYBYTE:
1335 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1336 ecode++;
1337 break;
1338
1339 case OP_NOT_DIGIT:
1340 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1341 GETCHARINCTEST(c, eptr);
1342 if (
1343 #ifdef SUPPORT_UTF8
1344 c < 256 &&
1345 #endif
1346 (md->ctypes[c] & ctype_digit) != 0
1347 )
1348 RRETURN(MATCH_NOMATCH);
1349 ecode++;
1350 break;
1351
1352 case OP_DIGIT:
1353 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1354 GETCHARINCTEST(c, eptr);
1355 if (
1356 #ifdef SUPPORT_UTF8
1357 c >= 256 ||
1358 #endif
1359 (md->ctypes[c] & ctype_digit) == 0
1360 )
1361 RRETURN(MATCH_NOMATCH);
1362 ecode++;
1363 break;
1364
1365 case OP_NOT_WHITESPACE:
1366 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1367 GETCHARINCTEST(c, eptr);
1368 if (
1369 #ifdef SUPPORT_UTF8
1370 c < 256 &&
1371 #endif
1372 (md->ctypes[c] & ctype_space) != 0
1373 )
1374 RRETURN(MATCH_NOMATCH);
1375 ecode++;
1376 break;
1377
1378 case OP_WHITESPACE:
1379 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1380 GETCHARINCTEST(c, eptr);
1381 if (
1382 #ifdef SUPPORT_UTF8
1383 c >= 256 ||
1384 #endif
1385 (md->ctypes[c] & ctype_space) == 0
1386 )
1387 RRETURN(MATCH_NOMATCH);
1388 ecode++;
1389 break;
1390
1391 case OP_NOT_WORDCHAR:
1392 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1393 GETCHARINCTEST(c, eptr);
1394 if (
1395 #ifdef SUPPORT_UTF8
1396 c < 256 &&
1397 #endif
1398 (md->ctypes[c] & ctype_word) != 0
1399 )
1400 RRETURN(MATCH_NOMATCH);
1401 ecode++;
1402 break;
1403
1404 case OP_WORDCHAR:
1405 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1406 GETCHARINCTEST(c, eptr);
1407 if (
1408 #ifdef SUPPORT_UTF8
1409 c >= 256 ||
1410 #endif
1411 (md->ctypes[c] & ctype_word) == 0
1412 )
1413 RRETURN(MATCH_NOMATCH);
1414 ecode++;
1415 break;
1416
1417 #ifdef SUPPORT_UCP
1418 /* Check the next character by Unicode property. We will get here only
1419 if the support is in the binary; otherwise a compile-time error occurs. */
1420
1421 case OP_PROP:
1422 case OP_NOTPROP:
1423 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1424 GETCHARINCTEST(c, eptr);
1425 {
1426 int chartype, script;
1427 int category = _pcre_ucp_findprop(c, &chartype, &script);
1428
1429 switch(ecode[1])
1430 {
1431 case PT_ANY:
1432 if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1433 break;
1434
1435 case PT_LAMP:
1436 if ((chartype == ucp_Lu ||
1437 chartype == ucp_Ll ||
1438 chartype == ucp_Lt) == (op == OP_NOTPROP))
1439 RRETURN(MATCH_NOMATCH);
1440 break;
1441
1442 case PT_GC:
1443 if ((ecode[2] != category) == (op == OP_PROP))
1444 RRETURN(MATCH_NOMATCH);
1445 break;
1446
1447 case PT_PC:
1448 if ((ecode[2] != chartype) == (op == OP_PROP))
1449 RRETURN(MATCH_NOMATCH);
1450 break;
1451
1452 case PT_SC:
1453 if ((ecode[2] != script) == (op == OP_PROP))
1454 RRETURN(MATCH_NOMATCH);
1455 break;
1456
1457 default:
1458 RRETURN(PCRE_ERROR_INTERNAL);
1459 break;
1460 }
1461
1462 ecode += 3;
1463 }
1464 break;
1465
1466 /* Match an extended Unicode sequence. We will get here only if the support
1467 is in the binary; otherwise a compile-time error occurs. */
1468
1469 case OP_EXTUNI:
1470 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1471 GETCHARINCTEST(c, eptr);
1472 {
1473 int chartype, script;
1474 int category = _pcre_ucp_findprop(c, &chartype, &script);
1475 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1476 while (eptr < md->end_subject)
1477 {
1478 int len = 1;
1479 if (!utf8) c = *eptr; else
1480 {
1481 GETCHARLEN(c, eptr, len);
1482 }
1483 category = _pcre_ucp_findprop(c, &chartype, &script);
1484 if (category != ucp_M) break;
1485 eptr += len;
1486 }
1487 }
1488 ecode++;
1489 break;
1490 #endif
1491
1492
1493 /* Match a back reference, possibly repeatedly. Look past the end of the
1494 item to see if there is repeat information following. The code is similar
1495 to that for character classes, but repeated for efficiency. Then obey
1496 similar code to character type repeats - written out again for speed.
1497 However, if the referenced string is the empty string, always treat
1498 it as matched, any number of times (otherwise there could be infinite
1499 loops). */
1500
1501 case OP_REF:
1502 {
1503 offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1504 ecode += 3; /* Advance past item */
1505
1506 /* If the reference is unset, set the length to be longer than the amount
1507 of subject left; this ensures that every attempt at a match fails. We
1508 can't just fail here, because of the possibility of quantifiers with zero
1509 minima. */
1510
1511 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1512 md->end_subject - eptr + 1 :
1513 md->offset_vector[offset+1] - md->offset_vector[offset];
1514
1515 /* Set up for repetition, or handle the non-repeated case */
1516
1517 switch (*ecode)
1518 {
1519 case OP_CRSTAR:
1520 case OP_CRMINSTAR:
1521 case OP_CRPLUS:
1522 case OP_CRMINPLUS:
1523 case OP_CRQUERY:
1524 case OP_CRMINQUERY:
1525 c = *ecode++ - OP_CRSTAR;
1526 minimize = (c & 1) != 0;
1527 min = rep_min[c]; /* Pick up values from tables; */
1528 max = rep_max[c]; /* zero for max => infinity */
1529 if (max == 0) max = INT_MAX;
1530 break;
1531
1532 case OP_CRRANGE:
1533 case OP_CRMINRANGE:
1534 minimize = (*ecode == OP_CRMINRANGE);
1535 min = GET2(ecode, 1);
1536 max = GET2(ecode, 3);
1537 if (max == 0) max = INT_MAX;
1538 ecode += 5;
1539 break;
1540
1541 default: /* No repeat follows */
1542 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1543 eptr += length;
1544 continue; /* With the main loop */
1545 }
1546
1547 /* If the length of the reference is zero, just continue with the
1548 main loop. */
1549
1550 if (length == 0) continue;
1551
1552 /* First, ensure the minimum number of matches are present. We get back
1553 the length of the reference string explicitly rather than passing the
1554 address of eptr, so that eptr can be a register variable. */
1555
1556 for (i = 1; i <= min; i++)
1557 {
1558 if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1559 eptr += length;
1560 }
1561
1562 /* If min = max, continue at the same level without recursion.
1563 They are not both allowed to be zero. */
1564
1565 if (min == max) continue;
1566
1567 /* If minimizing, keep trying and advancing the pointer */
1568
1569 if (minimize)
1570 {
1571 for (fi = min;; fi++)
1572 {
1573 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1574 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1575 if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1576 RRETURN(MATCH_NOMATCH);
1577 eptr += length;
1578 }
1579 /* Control never gets here */
1580 }
1581
1582 /* If maximizing, find the longest string and work backwards */
1583
1584 else
1585 {
1586 pp = eptr;
1587 for (i = min; i < max; i++)
1588 {
1589 if (!match_ref(offset, eptr, length, md, ims)) break;
1590 eptr += length;
1591 }
1592 while (eptr >= pp)
1593 {
1594 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1595 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1596 eptr -= length;
1597 }
1598 RRETURN(MATCH_NOMATCH);
1599 }
1600 }
1601 /* Control never gets here */
1602
1603
1604
1605 /* Match a bit-mapped character class, possibly repeatedly. This op code is
1606 used when all the characters in the class have values in the range 0-255,
1607 and either the matching is caseful, or the characters are in the range
1608 0-127 when UTF-8 processing is enabled. The only difference between
1609 OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1610 encountered.
1611
1612 First, look past the end of the item to see if there is repeat information
1613 following. Then obey similar code to character type repeats - written out
1614 again for speed. */
1615
1616 case OP_NCLASS:
1617 case OP_CLASS:
1618 {
1619 data = ecode + 1; /* Save for matching */
1620 ecode += 33; /* Advance past the item */
1621
1622 switch (*ecode)
1623 {
1624 case OP_CRSTAR:
1625 case OP_CRMINSTAR:
1626 case OP_CRPLUS:
1627 case OP_CRMINPLUS:
1628 case OP_CRQUERY:
1629 case OP_CRMINQUERY:
1630 c = *ecode++ - OP_CRSTAR;
1631 minimize = (c & 1) != 0;
1632 min = rep_min[c]; /* Pick up values from tables; */
1633 max = rep_max[c]; /* zero for max => infinity */
1634 if (max == 0) max = INT_MAX;
1635 break;
1636
1637 case OP_CRRANGE:
1638 case OP_CRMINRANGE:
1639 minimize = (*ecode == OP_CRMINRANGE);
1640 min = GET2(ecode, 1);
1641 max = GET2(ecode, 3);
1642 if (max == 0) max = INT_MAX;
1643 ecode += 5;
1644 break;
1645
1646 default: /* No repeat follows */
1647 min = max = 1;
1648 break;
1649 }
1650
1651 /* First, ensure the minimum number of matches are present. */
1652
1653 #ifdef SUPPORT_UTF8
1654 /* UTF-8 mode */
1655 if (utf8)
1656 {
1657 for (i = 1; i <= min; i++)
1658 {
1659 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1660 GETCHARINC(c, eptr);
1661 if (c > 255)
1662 {
1663 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1664 }
1665 else
1666 {
1667 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1668 }
1669 }
1670 }
1671 else
1672 #endif
1673 /* Not UTF-8 mode */
1674 {
1675 for (i = 1; i <= min; i++)
1676 {
1677 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1678 c = *eptr++;
1679 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1680 }
1681 }
1682
1683 /* If max == min we can continue with the main loop without the
1684 need to recurse. */
1685
1686 if (min == max) continue;
1687
1688 /* If minimizing, keep testing the rest of the expression and advancing
1689 the pointer while it matches the class. */
1690
1691 if (minimize)
1692 {
1693 #ifdef SUPPORT_UTF8
1694 /* UTF-8 mode */
1695 if (utf8)
1696 {
1697 for (fi = min;; fi++)
1698 {
1699 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1700 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1701 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1702 GETCHARINC(c, eptr);
1703 if (c > 255)
1704 {
1705 if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1706 }
1707 else
1708 {
1709 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1710 }
1711 }
1712 }
1713 else
1714 #endif
1715 /* Not UTF-8 mode */
1716 {
1717 for (fi = min;; fi++)
1718 {
1719 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1720 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1721 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1722 c = *eptr++;
1723 if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1724 }
1725 }
1726 /* Control never gets here */
1727 }
1728
1729 /* If maximizing, find the longest possible run, then work backwards. */
1730
1731 else
1732 {
1733 pp = eptr;
1734
1735 #ifdef SUPPORT_UTF8
1736 /* UTF-8 mode */
1737 if (utf8)
1738 {
1739 for (i = min; i < max; i++)
1740 {
1741 int len = 1;
1742 if (eptr >= md->end_subject) break;
1743 GETCHARLEN(c, eptr, len);
1744 if (c > 255)
1745 {
1746 if (op == OP_CLASS) break;
1747 }
1748 else
1749 {
1750 if ((data[c/8] & (1 << (c&7))) == 0) break;
1751 }
1752 eptr += len;
1753 }
1754 for (;;)
1755 {
1756 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1757 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1758 if (eptr-- == pp) break; /* Stop if tried at original pos */
1759 BACKCHAR(eptr);
1760 }
1761 }
1762 else
1763 #endif
1764 /* Not UTF-8 mode */
1765 {
1766 for (i = min; i < max; i++)
1767 {
1768 if (eptr >= md->end_subject) break;
1769 c = *eptr;
1770 if ((data[c/8] & (1 << (c&7))) == 0) break;
1771 eptr++;
1772 }
1773 while (eptr >= pp)
1774 {
1775 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1776 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1777 eptr--;
1778 }
1779 }
1780
1781 RRETURN(MATCH_NOMATCH);
1782 }
1783 }
1784 /* Control never gets here */
1785
1786
1787 /* Match an extended character class. This opcode is encountered only
1788 in UTF-8 mode, because that's the only time it is compiled. */
1789
1790 #ifdef SUPPORT_UTF8
1791 case OP_XCLASS:
1792 {
1793 data = ecode + 1 + LINK_SIZE; /* Save for matching */
1794 ecode += GET(ecode, 1); /* Advance past the item */
1795
1796 switch (*ecode)
1797 {
1798 case OP_CRSTAR:
1799 case OP_CRMINSTAR:
1800 case OP_CRPLUS:
1801 case OP_CRMINPLUS:
1802 case OP_CRQUERY:
1803 case OP_CRMINQUERY:
1804 c = *ecode++ - OP_CRSTAR;
1805 minimize = (c & 1) != 0;
1806 min = rep_min[c]; /* Pick up values from tables; */
1807 max = rep_max[c]; /* zero for max => infinity */
1808 if (max == 0) max = INT_MAX;
1809 break;
1810
1811 case OP_CRRANGE:
1812 case OP_CRMINRANGE:
1813 minimize = (*ecode == OP_CRMINRANGE);
1814 min = GET2(ecode, 1);
1815 max = GET2(ecode, 3);
1816 if (max == 0) max = INT_MAX;
1817 ecode += 5;
1818 break;
1819
1820 default: /* No repeat follows */
1821 min = max = 1;
1822 break;
1823 }
1824
1825 /* First, ensure the minimum number of matches are present. */
1826
1827 for (i = 1; i <= min; i++)
1828 {
1829 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1830 GETCHARINC(c, eptr);
1831 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1832 }
1833
1834 /* If max == min we can continue with the main loop without the
1835 need to recurse. */
1836
1837 if (min == max) continue;
1838
1839 /* If minimizing, keep testing the rest of the expression and advancing
1840 the pointer while it matches the class. */
1841
1842 if (minimize)
1843 {
1844 for (fi = min;; fi++)
1845 {
1846 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1848 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1849 GETCHARINC(c, eptr);
1850 if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1851 }
1852 /* Control never gets here */
1853 }
1854
1855 /* If maximizing, find the longest possible run, then work backwards. */
1856
1857 else
1858 {
1859 pp = eptr;
1860 for (i = min; i < max; i++)
1861 {
1862 int len = 1;
1863 if (eptr >= md->end_subject) break;
1864 GETCHARLEN(c, eptr, len);
1865 if (!_pcre_xclass(c, data)) break;
1866 eptr += len;
1867 }
1868 for(;;)
1869 {
1870 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1871 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1872 if (eptr-- == pp) break; /* Stop if tried at original pos */
1873 BACKCHAR(eptr)
1874 }
1875 RRETURN(MATCH_NOMATCH);
1876 }
1877
1878 /* Control never gets here */
1879 }
1880 #endif /* End of XCLASS */
1881
1882 /* Match a single character, casefully */
1883
1884 case OP_CHAR:
1885 #ifdef SUPPORT_UTF8
1886 if (utf8)
1887 {
1888 length = 1;
1889 ecode++;
1890 GETCHARLEN(fc, ecode, length);
1891 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1892 while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1893 }
1894 else
1895 #endif
1896
1897 /* Non-UTF-8 mode */
1898 {
1899 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1900 if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1901 ecode += 2;
1902 }
1903 break;
1904
1905 /* Match a single character, caselessly */
1906
1907 case OP_CHARNC:
1908 #ifdef SUPPORT_UTF8
1909 if (utf8)
1910 {
1911 length = 1;
1912 ecode++;
1913 GETCHARLEN(fc, ecode, length);
1914
1915 if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1916
1917 /* If the pattern character's value is < 128, we have only one byte, and
1918 can use the fast lookup table. */
1919
1920 if (fc < 128)
1921 {
1922 if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1923 }
1924
1925 /* Otherwise we must pick up the subject character */
1926
1927 else
1928 {
1929 int dc;
1930 GETCHARINC(dc, eptr);
1931 ecode += length;
1932
1933 /* If we have Unicode property support, we can use it to test the other
1934 case of the character, if there is one. */
1935
1936 if (fc != dc)
1937 {
1938 #ifdef SUPPORT_UCP
1939 if (dc != _pcre_ucp_othercase(fc))
1940 #endif
1941 RRETURN(MATCH_NOMATCH);
1942 }
1943 }
1944 }
1945 else
1946 #endif /* SUPPORT_UTF8 */
1947
1948 /* Non-UTF-8 mode */
1949 {
1950 if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1951 if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1952 ecode += 2;
1953 }
1954 break;
1955
1956 /* Match a single character repeatedly; different opcodes share code. */
1957
1958 case OP_EXACT:
1959 min = max = GET2(ecode, 1);
1960 ecode += 3;
1961 goto REPEATCHAR;
1962
1963 case OP_UPTO:
1964 case OP_MINUPTO:
1965 min = 0;
1966 max = GET2(ecode, 1);
1967 minimize = *ecode == OP_MINUPTO;
1968 ecode += 3;
1969 goto REPEATCHAR;
1970
1971 case OP_STAR:
1972 case OP_MINSTAR:
1973 case OP_PLUS:
1974 case OP_MINPLUS:
1975 case OP_QUERY:
1976 case OP_MINQUERY:
1977 c = *ecode++ - OP_STAR;
1978 minimize = (c & 1) != 0;
1979 min = rep_min[c]; /* Pick up values from tables; */
1980 max = rep_max[c]; /* zero for max => infinity */
1981 if (max == 0) max = INT_MAX;
1982
1983 /* Common code for all repeated single-character matches. We can give
1984 up quickly if there are fewer than the minimum number of characters left in
1985 the subject. */
1986
1987 REPEATCHAR:
1988 #ifdef SUPPORT_UTF8
1989 if (utf8)
1990 {
1991 length = 1;
1992 charptr = ecode;
1993 GETCHARLEN(fc, ecode, length);
1994 if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1995 ecode += length;
1996
1997 /* Handle multibyte character matching specially here. There is
1998 support for caseless matching if UCP support is present. */
1999
2000 if (length > 1)
2001 {
2002 int oclength = 0;
2003 uschar occhars[8];
2004
2005 #ifdef SUPPORT_UCP
2006 int othercase;
2007 if ((ims & PCRE_CASELESS) != 0 &&
2008 (othercase = _pcre_ucp_othercase(fc)) >= 0 &&
2009 othercase >= 0)
2010 oclength = _pcre_ord2utf8(othercase, occhars);
2011 #endif /* SUPPORT_UCP */
2012
2013 for (i = 1; i <= min; i++)
2014 {
2015 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2016 /* Need braces because of following else */
2017 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2018 else
2019 {
2020 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2021 eptr += oclength;
2022 }
2023 }
2024
2025 if (min == max) continue;
2026
2027 if (minimize)
2028 {
2029 for (fi = min;; fi++)
2030 {
2031 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2032 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2033 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2034 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2035 /* Need braces because of following else */
2036 else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2037 else
2038 {
2039 if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2040 eptr += oclength;
2041 }
2042 }
2043 /* Control never gets here */
2044 }
2045 else
2046 {
2047 pp = eptr;
2048 for (i = min; i < max; i++)
2049 {
2050 if (eptr > md->end_subject - length) break;
2051 if (memcmp(eptr, charptr, length) == 0) eptr += length;
2052 else if (oclength == 0) break;
2053 else
2054 {
2055 if (memcmp(eptr, occhars, oclength) != 0) break;
2056 eptr += oclength;
2057 }
2058 }
2059 while (eptr >= pp)
2060 {
2061 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2062 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2063 eptr -= length;
2064 }
2065 RRETURN(MATCH_NOMATCH);
2066 }
2067 /* Control never gets here */
2068 }
2069
2070 /* If the length of a UTF-8 character is 1, we fall through here, and
2071 obey the code as for non-UTF-8 characters below, though in this case the
2072 value of fc will always be < 128. */
2073 }
2074 else
2075 #endif /* SUPPORT_UTF8 */
2076
2077 /* When not in UTF-8 mode, load a single-byte character. */
2078 {
2079 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2080 fc = *ecode++;
2081 }
2082
2083 /* The value of fc at this point is always less than 256, though we may or
2084 may not be in UTF-8 mode. The code is duplicated for the caseless and
2085 caseful cases, for speed, since matching characters is likely to be quite
2086 common. First, ensure the minimum number of matches are present. If min =
2087 max, continue at the same level without recursing. Otherwise, if
2088 minimizing, keep trying the rest of the expression and advancing one
2089 matching character if failing, up to the maximum. Alternatively, if
2090 maximizing, find the maximum number of characters and work backwards. */
2091
2092 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2093 max, eptr));
2094
2095 if ((ims & PCRE_CASELESS) != 0)
2096 {
2097 fc = md->lcc[fc];
2098 for (i = 1; i <= min; i++)
2099 if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2100 if (min == max) continue;
2101 if (minimize)
2102 {
2103 for (fi = min;; fi++)
2104 {
2105 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2106 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2107 if (fi >= max || eptr >= md->end_subject ||
2108 fc != md->lcc[*eptr++])
2109 RRETURN(MATCH_NOMATCH);
2110 }
2111 /* Control never gets here */
2112 }
2113 else
2114 {
2115 pp = eptr;
2116 for (i = min; i < max; i++)
2117 {
2118 if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2119 eptr++;
2120 }
2121 while (eptr >= pp)
2122 {
2123 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2124 eptr--;
2125 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2126 }
2127 RRETURN(MATCH_NOMATCH);
2128 }
2129 /* Control never gets here */
2130 }
2131
2132 /* Caseful comparisons (includes all multi-byte characters) */
2133
2134 else
2135 {
2136 for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2137 if (min == max) continue;
2138 if (minimize)
2139 {
2140 for (fi = min;; fi++)
2141 {
2142 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2143 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2144 if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2145 RRETURN(MATCH_NOMATCH);
2146 }
2147 /* Control never gets here */
2148 }
2149 else
2150 {
2151 pp = eptr;
2152 for (i = min; i < max; i++)
2153 {
2154 if (eptr >= md->end_subject || fc != *eptr) break;
2155 eptr++;
2156 }
2157 while (eptr >= pp)
2158 {
2159 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2160 eptr--;
2161 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2162 }
2163 RRETURN(MATCH_NOMATCH);
2164 }
2165 }
2166 /* Control never gets here */
2167
2168 /* Match a negated single one-byte character. The pattern character is one
2169 byte, but the subject character we check against it can be multibyte. */
2170
2171 case OP_NOT:
2172 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2173 ecode++;
2174 GETCHARINCTEST(c, eptr);
2175 if ((ims & PCRE_CASELESS) != 0)
2176 {
2177 #ifdef SUPPORT_UTF8
2178 if (c < 256)
2179 #endif
2180 c = md->lcc[c];
2181 if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2182 }
2183 else
2184 {
2185 if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2186 }
2187 break;
2188
2189 /* Match a negated single one-byte character repeatedly. This is almost a
2190 repeat of the code for a repeated single character, but I haven't found a
2191 nice way of commoning these up that doesn't require a test of the
2192 positive/negative option for each character match. Maybe that wouldn't add
2193 very much to the time taken, but character matching *is* what this is all
2194 about... */
2195
2196 case OP_NOTEXACT:
2197 min = max = GET2(ecode, 1);
2198 ecode += 3;
2199 goto REPEATNOTCHAR;
2200
2201 case OP_NOTUPTO:
2202 case OP_NOTMINUPTO:
2203 min = 0;
2204 max = GET2(ecode, 1);
2205 minimize = *ecode == OP_NOTMINUPTO;
2206 ecode += 3;
2207 goto REPEATNOTCHAR;
2208
2209 case OP_NOTSTAR:
2210 case OP_NOTMINSTAR:
2211 case OP_NOTPLUS:
2212 case OP_NOTMINPLUS:
2213 case OP_NOTQUERY:
2214 case OP_NOTMINQUERY:
2215 c = *ecode++ - OP_NOTSTAR;
2216 minimize = (c & 1) != 0;
2217 min = rep_min[c]; /* Pick up values from tables; */
2218 max = rep_max[c]; /* zero for max => infinity */
2219 if (max == 0) max = INT_MAX;
2220
2221 /* Common code for all repeated negated single-byte matches. We can give up
2222 quickly if there are fewer than the minimum number of bytes left in the
2223 subject. */
2224
2225 REPEATNOTCHAR:
2226 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2227 fc = *ecode++;
2228
2229 /* The code is duplicated for the caseless and caseful cases, for speed,
2230 since matching characters is likely to be quite common. First, ensure the
2231 minimum number of matches are present. If min = max, continue at the same
2232 level without recursing. Otherwise, if minimizing, keep trying the rest of
2233 the expression and advancing one matching character if failing, up to the
2234 maximum. Alternatively, if maximizing, find the maximum number of
2235 characters and work backwards. */
2236
2237 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2238 max, eptr));
2239
2240 if ((ims & PCRE_CASELESS) != 0)
2241 {
2242 fc = md->lcc[fc];
2243
2244 #ifdef SUPPORT_UTF8
2245 /* UTF-8 mode */
2246 if (utf8)
2247 {
2248 register int d;
2249 for (i = 1; i <= min; i++)
2250 {
2251 GETCHARINC(d, eptr);
2252 if (d < 256) d = md->lcc[d];
2253 if (fc == d) RRETURN(MATCH_NOMATCH);
2254 }
2255 }
2256 else
2257 #endif
2258
2259 /* Not UTF-8 mode */
2260 {
2261 for (i = 1; i <= min; i++)
2262 if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2263 }
2264
2265 if (min == max) continue;
2266
2267 if (minimize)
2268 {
2269 #ifdef SUPPORT_UTF8
2270 /* UTF-8 mode */
2271 if (utf8)
2272 {
2273 register int d;
2274 for (fi = min;; fi++)
2275 {
2276 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2277 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2278 GETCHARINC(d, eptr);
2279 if (d < 256) d = md->lcc[d];
2280 if (fi >= max || eptr >= md->end_subject || fc == d)
2281 RRETURN(MATCH_NOMATCH);
2282 }
2283 }
2284 else
2285 #endif
2286 /* Not UTF-8 mode */
2287 {
2288 for (fi = min;; fi++)
2289 {
2290 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2291 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2292 if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2293 RRETURN(MATCH_NOMATCH);
2294 }
2295 }
2296 /* Control never gets here */
2297 }
2298
2299 /* Maximize case */
2300
2301 else
2302 {
2303 pp = eptr;
2304
2305 #ifdef SUPPORT_UTF8
2306 /* UTF-8 mode */
2307 if (utf8)
2308 {
2309 register int d;
2310 for (i = min; i < max; i++)
2311 {
2312 int len = 1;
2313 if (eptr >= md->end_subject) break;
2314 GETCHARLEN(d, eptr, len);
2315 if (d < 256) d = md->lcc[d];
2316 if (fc == d) break;
2317 eptr += len;
2318 }
2319 for(;;)
2320 {
2321 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2322 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2323 if (eptr-- == pp) break; /* Stop if tried at original pos */
2324 BACKCHAR(eptr);
2325 }
2326 }
2327 else
2328 #endif
2329 /* Not UTF-8 mode */
2330 {
2331 for (i = min; i < max; i++)
2332 {
2333 if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2334 eptr++;
2335 }
2336 while (eptr >= pp)
2337 {
2338 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2339 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2340 eptr--;
2341 }
2342 }
2343
2344 RRETURN(MATCH_NOMATCH);
2345 }
2346 /* Control never gets here */
2347 }
2348
2349 /* Caseful comparisons */
2350
2351 else
2352 {
2353 #ifdef SUPPORT_UTF8
2354 /* UTF-8 mode */
2355 if (utf8)
2356 {
2357 register int d;
2358 for (i = 1; i <= min; i++)
2359 {
2360 GETCHARINC(d, eptr);
2361 if (fc == d) RRETURN(MATCH_NOMATCH);
2362 }
2363 }
2364 else
2365 #endif
2366 /* Not UTF-8 mode */
2367 {
2368 for (i = 1; i <= min; i++)
2369 if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2370 }
2371
2372 if (min == max) continue;
2373
2374 if (minimize)
2375 {
2376 #ifdef SUPPORT_UTF8
2377 /* UTF-8 mode */
2378 if (utf8)
2379 {
2380 register int d;
2381 for (fi = min;; fi++)
2382 {
2383 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2384 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2385 GETCHARINC(d, eptr);
2386 if (fi >= max || eptr >= md->end_subject || fc == d)
2387 RRETURN(MATCH_NOMATCH);
2388 }
2389 }
2390 else
2391 #endif
2392 /* Not UTF-8 mode */
2393 {
2394 for (fi = min;; fi++)
2395 {
2396 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2397 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2398 if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2399 RRETURN(MATCH_NOMATCH);
2400 }
2401 }
2402 /* Control never gets here */
2403 }
2404
2405 /* Maximize case */
2406
2407 else
2408 {
2409 pp = eptr;
2410
2411 #ifdef SUPPORT_UTF8
2412 /* UTF-8 mode */
2413 if (utf8)
2414 {
2415 register int d;
2416 for (i = min; i < max; i++)
2417 {
2418 int len = 1;
2419 if (eptr >= md->end_subject) break;
2420 GETCHARLEN(d, eptr, len);
2421 if (fc == d) break;
2422 eptr += len;
2423 }
2424 for(;;)
2425 {
2426 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2427 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2428 if (eptr-- == pp) break; /* Stop if tried at original pos */
2429 BACKCHAR(eptr);
2430 }
2431 }
2432 else
2433 #endif
2434 /* Not UTF-8 mode */
2435 {
2436 for (i = min; i < max; i++)
2437 {
2438 if (eptr >= md->end_subject || fc == *eptr) break;
2439 eptr++;
2440 }
2441 while (eptr >= pp)
2442 {
2443 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2444 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2445 eptr--;
2446 }
2447 }
2448
2449 RRETURN(MATCH_NOMATCH);
2450 }
2451 }
2452 /* Control never gets here */
2453
2454 /* Match a single character type repeatedly; several different opcodes
2455 share code. This is very similar to the code for single characters, but we
2456 repeat it in the interests of efficiency. */
2457
2458 case OP_TYPEEXACT:
2459 min = max = GET2(ecode, 1);
2460 minimize = TRUE;
2461 ecode += 3;
2462 goto REPEATTYPE;
2463
2464 case OP_TYPEUPTO:
2465 case OP_TYPEMINUPTO:
2466 min = 0;
2467 max = GET2(ecode, 1);
2468 minimize = *ecode == OP_TYPEMINUPTO;
2469 ecode += 3;
2470 goto REPEATTYPE;
2471
2472 case OP_TYPESTAR:
2473 case OP_TYPEMINSTAR:
2474 case OP_TYPEPLUS:
2475 case OP_TYPEMINPLUS:
2476 case OP_TYPEQUERY:
2477 case OP_TYPEMINQUERY:
2478 c = *ecode++ - OP_TYPESTAR;
2479 minimize = (c & 1) != 0;
2480 min = rep_min[c]; /* Pick up values from tables; */
2481 max = rep_max[c]; /* zero for max => infinity */
2482 if (max == 0) max = INT_MAX;
2483
2484 /* Common code for all repeated single character type matches. Note that
2485 in UTF-8 mode, '.' matches a character of any length, but for the other
2486 character types, the valid characters are all one-byte long. */
2487
2488 REPEATTYPE:
2489 ctype = *ecode++; /* Code for the character type */
2490
2491 #ifdef SUPPORT_UCP
2492 if (ctype == OP_PROP || ctype == OP_NOTPROP)
2493 {
2494 prop_fail_result = ctype == OP_NOTPROP;
2495 prop_type = *ecode++;
2496 prop_value = *ecode++;
2497 }
2498 else prop_type = -1;
2499 #endif
2500
2501 /* First, ensure the minimum number of matches are present. Use inline
2502 code for maximizing the speed, and do the type test once at the start
2503 (i.e. keep it out of the loop). Also we can test that there are at least
2504 the minimum number of bytes before we start. This isn't as effective in
2505 UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2506 is tidier. Also separate the UCP code, which can be the same for both UTF-8
2507 and single-bytes. */
2508
2509 if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2510 if (min > 0)
2511 {
2512 #ifdef SUPPORT_UCP
2513 if (prop_type >= 0)
2514 {
2515 switch(prop_type)
2516 {
2517 case PT_ANY:
2518 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2519 for (i = 1; i <= min; i++)
2520 {
2521 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2522 GETCHARINC(c, eptr);
2523 }
2524 break;
2525
2526 case PT_LAMP:
2527 for (i = 1; i <= min; i++)
2528 {
2529 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2530 GETCHARINC(c, eptr);
2531 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2532 if ((prop_chartype == ucp_Lu ||
2533 prop_chartype == ucp_Ll ||
2534 prop_chartype == ucp_Lt) == prop_fail_result)
2535 RRETURN(MATCH_NOMATCH);
2536 }
2537 break;
2538
2539 case PT_GC:
2540 for (i = 1; i <= min; i++)
2541 {
2542 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2543 GETCHARINC(c, eptr);
2544 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2545 if ((prop_category == prop_value) == prop_fail_result)
2546 RRETURN(MATCH_NOMATCH);
2547 }
2548 break;
2549
2550 case PT_PC:
2551 for (i = 1; i <= min; i++)
2552 {
2553 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2554 GETCHARINC(c, eptr);
2555 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2556 if ((prop_chartype == prop_value) == prop_fail_result)
2557 RRETURN(MATCH_NOMATCH);
2558 }
2559 break;
2560
2561 case PT_SC:
2562 for (i = 1; i <= min; i++)
2563 {
2564 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2565 GETCHARINC(c, eptr);
2566 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2567 if ((prop_script == prop_value) == prop_fail_result)
2568 RRETURN(MATCH_NOMATCH);
2569 }
2570 break;
2571
2572 default:
2573 RRETURN(PCRE_ERROR_INTERNAL);
2574 break;
2575 }
2576 }
2577
2578 /* Match extended Unicode sequences. We will get here only if the
2579 support is in the binary; otherwise a compile-time error occurs. */
2580
2581 else if (ctype == OP_EXTUNI)
2582 {
2583 for (i = 1; i <= min; i++)
2584 {
2585 GETCHARINCTEST(c, eptr);
2586 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2587 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2588 while (eptr < md->end_subject)
2589 {
2590 int len = 1;
2591 if (!utf8) c = *eptr; else
2592 {
2593 GETCHARLEN(c, eptr, len);
2594 }
2595 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2596 if (prop_category != ucp_M) break;
2597 eptr += len;
2598 }
2599 }
2600 }
2601
2602 else
2603 #endif /* SUPPORT_UCP */
2604
2605 /* Handle all other cases when the coding is UTF-8 */
2606
2607 #ifdef SUPPORT_UTF8
2608 if (utf8) switch(ctype)
2609 {
2610 case OP_ANY:
2611 for (i = 1; i <= min; i++)
2612 {
2613 if (eptr >= md->end_subject ||
2614 ((ims & PCRE_DOTALL) == 0 &&
2615 eptr <= md->end_subject - md->nllen &&
2616 IS_NEWLINE(eptr)))
2617 RRETURN(MATCH_NOMATCH);
2618 eptr++;
2619 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2620 }
2621 break;
2622
2623 case OP_ANYBYTE:
2624 eptr += min;
2625 break;
2626
2627 case OP_NOT_DIGIT:
2628 for (i = 1; i <= min; i++)
2629 {
2630 if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2631 GETCHARINC(c, eptr);
2632 if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2633 RRETURN(MATCH_NOMATCH);
2634 }
2635 break;
2636
2637 case OP_DIGIT:
2638 for (i = 1; i <= min; i++)
2639 {
2640 if (eptr >= md->end_subject ||
2641 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2642 RRETURN(MATCH_NOMATCH);
2643 /* No need to skip more bytes - we know it's a 1-byte character */
2644 }
2645 break;
2646
2647 case OP_NOT_WHITESPACE:
2648 for (i = 1; i <= min; i++)
2649 {
2650 if (eptr >= md->end_subject ||
2651 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2652 RRETURN(MATCH_NOMATCH);
2653 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2654 }
2655 break;
2656
2657 case OP_WHITESPACE:
2658 for (i = 1; i <= min; i++)
2659 {
2660 if (eptr >= md->end_subject ||
2661 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2662 RRETURN(MATCH_NOMATCH);
2663 /* No need to skip more bytes - we know it's a 1-byte character */
2664 }
2665 break;
2666
2667 case OP_NOT_WORDCHAR:
2668 for (i = 1; i <= min; i++)
2669 {
2670 if (eptr >= md->end_subject ||
2671 (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2672 RRETURN(MATCH_NOMATCH);
2673 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2674 }
2675 break;
2676
2677 case OP_WORDCHAR:
2678 for (i = 1; i <= min; i++)
2679 {
2680 if (eptr >= md->end_subject ||
2681 *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2682 RRETURN(MATCH_NOMATCH);
2683 /* No need to skip more bytes - we know it's a 1-byte character */
2684 }
2685 break;
2686
2687 default:
2688 RRETURN(PCRE_ERROR_INTERNAL);
2689 } /* End switch(ctype) */
2690
2691 else
2692 #endif /* SUPPORT_UTF8 */
2693
2694 /* Code for the non-UTF-8 case for minimum matching of operators other
2695 than OP_PROP and OP_NOTPROP. */
2696
2697 switch(ctype)
2698 {
2699 case OP_ANY:
2700 if ((ims & PCRE_DOTALL) == 0)
2701 {
2702 for (i = 1; i <= min; i++)
2703 {
2704 if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
2705 RRETURN(MATCH_NOMATCH);
2706 eptr++;
2707 }
2708 }
2709 else eptr += min;
2710 break;
2711
2712 case OP_ANYBYTE:
2713 eptr += min;
2714 break;
2715
2716 case OP_NOT_DIGIT:
2717 for (i = 1; i <= min; i++)
2718 if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2719 break;
2720
2721 case OP_DIGIT:
2722 for (i = 1; i <= min; i++)
2723 if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2724 break;
2725
2726 case OP_NOT_WHITESPACE:
2727 for (i = 1; i <= min; i++)
2728 if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2729 break;
2730
2731 case OP_WHITESPACE:
2732 for (i = 1; i <= min; i++)
2733 if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2734 break;
2735
2736 case OP_NOT_WORDCHAR:
2737 for (i = 1; i <= min; i++)
2738 if ((md->ctypes[*eptr++] & ctype_word) != 0)
2739 RRETURN(MATCH_NOMATCH);
2740 break;
2741
2742 case OP_WORDCHAR:
2743 for (i = 1; i <= min; i++)
2744 if ((md->ctypes[*eptr++] & ctype_word) == 0)
2745 RRETURN(MATCH_NOMATCH);
2746 break;
2747
2748 default:
2749 RRETURN(PCRE_ERROR_INTERNAL);
2750 }
2751 }
2752
2753 /* If min = max, continue at the same level without recursing */
2754
2755 if (min == max) continue;
2756
2757 /* If minimizing, we have to test the rest of the pattern before each
2758 subsequent match. Again, separate the UTF-8 case for speed, and also
2759 separate the UCP cases. */
2760
2761 if (minimize)
2762 {
2763 #ifdef SUPPORT_UCP
2764 if (prop_type >= 0)
2765 {
2766 switch(prop_type)
2767 {
2768 case PT_ANY:
2769 for (fi = min;; fi++)
2770 {
2771 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2772 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2773 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2774 GETCHARINC(c, eptr);
2775 if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2776 }
2777 break;
2778
2779 case PT_LAMP:
2780 for (fi = min;; fi++)
2781 {
2782 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2783 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2784 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2785 GETCHARINC(c, eptr);
2786 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2787 if ((prop_chartype == ucp_Lu ||
2788 prop_chartype == ucp_Ll ||
2789 prop_chartype == ucp_Lt) == prop_fail_result)
2790 RRETURN(MATCH_NOMATCH);
2791 }
2792 break;
2793
2794 case PT_GC:
2795 for (fi = min;; fi++)
2796 {
2797 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2798 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2799 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2800 GETCHARINC(c, eptr);
2801 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2802 if ((prop_category == prop_value) == prop_fail_result)
2803 RRETURN(MATCH_NOMATCH);
2804 }
2805 break;
2806
2807 case PT_PC:
2808 for (fi = min;; fi++)
2809 {
2810 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2811 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2812 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2813 GETCHARINC(c, eptr);
2814 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2815 if ((prop_chartype == prop_value) == prop_fail_result)
2816 RRETURN(MATCH_NOMATCH);
2817 }
2818 break;
2819
2820 case PT_SC:
2821 for (fi = min;; fi++)
2822 {
2823 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2824 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2825 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2826 GETCHARINC(c, eptr);
2827 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2828 if ((prop_script == prop_value) == prop_fail_result)
2829 RRETURN(MATCH_NOMATCH);
2830 }
2831 break;
2832
2833 default:
2834 RRETURN(PCRE_ERROR_INTERNAL);
2835 break;
2836 }
2837 }
2838
2839 /* Match extended Unicode sequences. We will get here only if the
2840 support is in the binary; otherwise a compile-time error occurs. */
2841
2842 else if (ctype == OP_EXTUNI)
2843 {
2844 for (fi = min;; fi++)
2845 {
2846 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2847 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2848 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2849 GETCHARINCTEST(c, eptr);
2850 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2851 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2852 while (eptr < md->end_subject)
2853 {
2854 int len = 1;
2855 if (!utf8) c = *eptr; else
2856 {
2857 GETCHARLEN(c, eptr, len);
2858 }
2859 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2860 if (prop_category != ucp_M) break;
2861 eptr += len;
2862 }
2863 }
2864 }
2865
2866 else
2867 #endif /* SUPPORT_UCP */
2868
2869 #ifdef SUPPORT_UTF8
2870 /* UTF-8 mode */
2871 if (utf8)
2872 {
2873 for (fi = min;; fi++)
2874 {
2875 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2876 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2877 if (fi >= max || eptr >= md->end_subject ||
2878 (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
2879 eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
2880 RRETURN(MATCH_NOMATCH);
2881
2882 GETCHARINC(c, eptr);
2883 switch(ctype)
2884 {
2885 case OP_ANY: /* This is the DOTALL case */
2886 break;
2887
2888 case OP_ANYBYTE:
2889 break;
2890
2891 case OP_NOT_DIGIT:
2892 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2893 RRETURN(MATCH_NOMATCH);
2894 break;
2895
2896 case OP_DIGIT:
2897 if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2898 RRETURN(MATCH_NOMATCH);
2899 break;
2900
2901 case OP_NOT_WHITESPACE:
2902 if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2903 RRETURN(MATCH_NOMATCH);
2904 break;
2905
2906 case OP_WHITESPACE:
2907 if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2908 RRETURN(MATCH_NOMATCH);
2909 break;
2910
2911 case OP_NOT_WORDCHAR:
2912 if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2913 RRETURN(MATCH_NOMATCH);
2914 break;
2915
2916 case OP_WORDCHAR:
2917 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2918 RRETURN(MATCH_NOMATCH);
2919 break;
2920
2921 default:
2922 RRETURN(PCRE_ERROR_INTERNAL);
2923 }
2924 }
2925 }
2926 else
2927 #endif
2928 /* Not UTF-8 mode */
2929 {
2930 for (fi = min;; fi++)
2931 {
2932 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2933 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2934 if (fi >= max || eptr >= md->end_subject ||
2935 ((ims & PCRE_DOTALL) == 0 &&
2936 eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
2937 RRETURN(MATCH_NOMATCH);
2938
2939 c = *eptr++;
2940 switch(ctype)
2941 {
2942 case OP_ANY: /* This is the DOTALL case */
2943 break;
2944
2945 case OP_ANYBYTE:
2946 break;
2947
2948 case OP_NOT_DIGIT:
2949 if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2950 break;
2951
2952 case OP_DIGIT:
2953 if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2954 break;
2955
2956 case OP_NOT_WHITESPACE:
2957 if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2958 break;
2959
2960 case OP_WHITESPACE:
2961 if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2962 break;
2963
2964 case OP_NOT_WORDCHAR:
2965 if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2966 break;
2967
2968 case OP_WORDCHAR:
2969 if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2970 break;
2971
2972 default:
2973 RRETURN(PCRE_ERROR_INTERNAL);
2974 }
2975 }
2976 }
2977 /* Control never gets here */
2978 }
2979
2980 /* If maximizing it is worth using inline code for speed, doing the type
2981 test once at the start (i.e. keep it out of the loop). Again, keep the
2982 UTF-8 and UCP stuff separate. */
2983
2984 else
2985 {
2986 pp = eptr; /* Remember where we started */
2987
2988 #ifdef SUPPORT_UCP
2989 if (prop_type >= 0)
2990 {
2991 switch(prop_type)
2992 {
2993 case PT_ANY:
2994 for (i = min; i < max; i++)
2995 {
2996 int len = 1;
2997 if (eptr >= md->end_subject) break;
2998 GETCHARLEN(c, eptr, len);
2999 if (prop_fail_result) break;
3000 eptr+= len;
3001 }
3002 break;
3003
3004 case PT_LAMP:
3005 for (i = min; i < max; i++)
3006 {
3007 int len = 1;
3008 if (eptr >= md->end_subject) break;
3009 GETCHARLEN(c, eptr, len);
3010 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3011 if ((prop_chartype == ucp_Lu ||
3012 prop_chartype == ucp_Ll ||
3013 prop_chartype == ucp_Lt) == prop_fail_result)
3014 break;
3015 eptr+= len;
3016 }
3017 break;
3018
3019 case PT_GC:
3020 for (i = min; i < max; i++)
3021 {
3022 int len = 1;
3023 if (eptr >= md->end_subject) break;
3024 GETCHARLEN(c, eptr, len);
3025 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3026 if ((prop_category == prop_value) == prop_fail_result)
3027 break;
3028 eptr+= len;
3029 }
3030 break;
3031
3032 case PT_PC:
3033 for (i = min; i < max; i++)
3034 {
3035 int len = 1;
3036 if (eptr >= md->end_subject) break;
3037 GETCHARLEN(c, eptr, len);
3038 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3039 if ((prop_chartype == prop_value) == prop_fail_result)
3040 break;
3041 eptr+= len;
3042 }
3043 break;
3044
3045 case PT_SC:
3046 for (i = min; i < max; i++)
3047 {
3048 int len = 1;
3049 if (eptr >= md->end_subject) break;
3050 GETCHARLEN(c, eptr, len);
3051 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3052 if ((prop_script == prop_value) == prop_fail_result)
3053 break;
3054 eptr+= len;
3055 }
3056 break;
3057 }
3058
3059 /* eptr is now past the end of the maximum run */
3060
3061 for(;;)
3062 {
3063 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3064 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3065 if (eptr-- == pp) break; /* Stop if tried at original pos */
3066 BACKCHAR(eptr);
3067 }
3068 }
3069
3070 /* Match extended Unicode sequences. We will get here only if the
3071 support is in the binary; otherwise a compile-time error occurs. */
3072
3073 else if (ctype == OP_EXTUNI)
3074 {
3075 for (i = min; i < max; i++)
3076 {
3077 if (eptr >= md->end_subject) break;
3078 GETCHARINCTEST(c, eptr);
3079 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3080 if (prop_category == ucp_M) break;
3081 while (eptr < md->end_subject)
3082 {
3083 int len = 1;
3084 if (!utf8) c = *eptr; else
3085 {
3086 GETCHARLEN(c, eptr, len);
3087 }
3088 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3089 if (prop_category != ucp_M) break;
3090 eptr += len;
3091 }
3092 }
3093
3094 /* eptr is now past the end of the maximum run */
3095
3096 for(;;)
3097 {
3098 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3099 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3100 if (eptr-- == pp) break; /* Stop if tried at original pos */
3101 for (;;) /* Move back over one extended */
3102 {
3103 int len = 1;
3104 BACKCHAR(eptr);
3105 if (!utf8) c = *eptr; else
3106 {
3107 GETCHARLEN(c, eptr, len);
3108 }
3109 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3110 if (prop_category != ucp_M) break;
3111 eptr--;
3112 }
3113 }
3114 }
3115
3116 else
3117 #endif /* SUPPORT_UCP */
3118
3119 #ifdef SUPPORT_UTF8
3120 /* UTF-8 mode */
3121
3122 if (utf8)
3123 {
3124 switch(ctype)
3125 {
3126 case OP_ANY:
3127
3128 /* Special code is required for UTF8, but when the maximum is
3129 unlimited we don't need it, so we repeat the non-UTF8 code. This is
3130 probably worth it, because .* is quite a common idiom. */
3131
3132 if (max < INT_MAX)
3133 {
3134 if ((ims & PCRE_DOTALL) == 0)
3135 {
3136 for (i = min; i < max; i++)
3137 {
3138 if (eptr >= md->end_subject ||
3139 (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3140 break;
3141 eptr++;
3142 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3143 }
3144 }
3145 else
3146 {
3147 for (i = min; i < max; i++)
3148 {
3149 if (eptr >= md->end_subject) break;
3150 eptr++;
3151 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3152 }
3153 }
3154 }
3155
3156 /* Handle unlimited UTF-8 repeat */
3157
3158 else
3159 {
3160 if ((ims & PCRE_DOTALL) == 0)
3161 {
3162 for (i = min; i < max; i++)
3163 {
3164 if (eptr >= md->end_subject ||
3165 (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3166 break;
3167 eptr++;
3168 }
3169 break;
3170 }
3171 else
3172 {
3173 c = max - min;
3174 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3175 eptr += c;
3176 }
3177 }
3178 break;
3179
3180 /* The byte case is the same as non-UTF8 */
3181
3182 case OP_ANYBYTE:
3183 c = max - min;
3184 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3185 eptr += c;
3186 break;
3187
3188 case OP_NOT_DIGIT:
3189 for (i = min; i < max; i++)
3190 {
3191 int len = 1;
3192 if (eptr >= md->end_subject) break;
3193 GETCHARLEN(c, eptr, len);
3194 if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3195 eptr+= len;
3196 }
3197 break;
3198
3199 case OP_DIGIT:
3200 for (i = min; i < max; i++)
3201 {
3202 int len = 1;
3203 if (eptr >= md->end_subject) break;
3204 GETCHARLEN(c, eptr, len);
3205 if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3206 eptr+= len;
3207 }
3208 break;
3209
3210 case OP_NOT_WHITESPACE:
3211 for (i = min; i < max; i++)
3212 {
3213 int len = 1;
3214 if (eptr >= md->end_subject) break;
3215 GETCHARLEN(c, eptr, len);
3216 if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3217 eptr+= len;
3218 }
3219 break;
3220
3221 case OP_WHITESPACE:
3222 for (i = min; i < max; i++)
3223 {
3224 int len = 1;
3225 if (eptr >= md->end_subject) break;
3226 GETCHARLEN(c, eptr, len);
3227 if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3228 eptr+= len;
3229 }
3230 break;
3231
3232 case OP_NOT_WORDCHAR:
3233 for (i = min; i < max; i++)
3234 {
3235 int len = 1;
3236 if (eptr >= md->end_subject) break;
3237 GETCHARLEN(c, eptr, len);
3238 if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3239 eptr+= len;
3240 }
3241 break;
3242
3243 case OP_WORDCHAR:
3244 for (i = min; i < max; i++)
3245 {
3246 int len = 1;
3247 if (eptr >= md->end_subject) break;
3248 GETCHARLEN(c, eptr, len);
3249 if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3250 eptr+= len;
3251 }
3252 break;
3253
3254 default:
3255 RRETURN(PCRE_ERROR_INTERNAL);
3256 }
3257
3258 /* eptr is now past the end of the maximum run */
3259
3260 for(;;)
3261 {
3262 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3263 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3264 if (eptr-- == pp) break; /* Stop if tried at original pos */
3265 BACKCHAR(eptr);
3266 }
3267 }
3268 else
3269 #endif
3270
3271 /* Not UTF-8 mode */
3272 {
3273 switch(ctype)
3274 {
3275 case OP_ANY:
3276 if ((ims & PCRE_DOTALL) == 0)
3277 {
3278 for (i = min; i < max; i++)
3279 {
3280 if (eptr >= md->end_subject ||
3281 (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3282 break;
3283 eptr++;
3284 }
3285 break;
3286 }
3287 /* For DOTALL case, fall through and treat as \C */
3288
3289 case OP_ANYBYTE:
3290 c = max - min;
3291 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3292 eptr += c;
3293 break;
3294
3295 case OP_NOT_DIGIT:
3296 for (i = min; i < max; i++)
3297 {
3298 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3299 break;
3300 eptr++;
3301 }
3302 break;
3303
3304 case OP_DIGIT:
3305 for (i = min; i < max; i++)
3306 {
3307 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3308 break;
3309 eptr++;
3310 }
3311 break;
3312
3313 case OP_NOT_WHITESPACE:
3314 for (i = min; i < max; i++)
3315 {
3316 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3317 break;
3318 eptr++;
3319 }
3320 break;
3321
3322 case OP_WHITESPACE:
3323 for (i = min; i < max; i++)
3324 {
3325 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3326 break;
3327 eptr++;
3328 }
3329 break;
3330
3331 case OP_NOT_WORDCHAR:
3332 for (i = min; i < max; i++)
3333 {
3334 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3335 break;
3336 eptr++;
3337 }
3338 break;
3339
3340 case OP_WORDCHAR:
3341 for (i = min; i < max; i++)
3342 {
3343 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3344 break;
3345 eptr++;
3346 }
3347 break;
3348
3349 default:
3350 RRETURN(PCRE_ERROR_INTERNAL);
3351 }
3352
3353 /* eptr is now past the end of the maximum run */
3354
3355 while (eptr >= pp)
3356 {
3357 RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3358 eptr--;
3359 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3360 }
3361 }
3362
3363 /* Get here if we can't make it match with any permitted repetitions */
3364
3365 RRETURN(MATCH_NOMATCH);
3366 }
3367 /* Control never gets here */
3368
3369 /* There's been some horrible disaster. Since all codes > OP_BRA are
3370 for capturing brackets, and there shouldn't be any gaps between 0 and
3371 OP_BRA, arrival here can only mean there is something seriously wrong
3372 in the code above or the OP_xxx definitions. */
3373
3374 default:
3375 DPRINTF(("Unknown opcode %d\n", *ecode));
3376 RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3377 }
3378
3379 /* Do not stick any code in here without much thought; it is assumed
3380 that "continue" in the code above comes out to here to repeat the main
3381 loop. */
3382
3383 } /* End of main loop */
3384 /* Control never reaches here */
3385 }
3386
3387
3388 /***************************************************************************
3389 ****************************************************************************
3390 RECURSION IN THE match() FUNCTION
3391
3392 Undefine all the macros that were defined above to handle this.

The names in the first group are undefined only when NO_RECURSE is set; fc and
fi are undefined unconditionally after the #endif. Several of these names
(for example length, flags and pp) are used again as ordinary identifiers in
pcre_exec() below, so they must not remain defined as macros past this point. */
3393
3394 #ifdef NO_RECURSE
3395 #undef eptr
3396 #undef ecode
3397 #undef offset_top
3398 #undef ims
3399 #undef eptrb
3400 #undef flags
3401
3402 #undef callpat
3403 #undef charptr
3404 #undef data
3405 #undef next
3406 #undef pp
3407 #undef prev
3408 #undef saved_eptr
3409
3410 #undef new_recursive
3411
3412 #undef cur_is_word
3413 #undef condition
3414 #undef minimize
3415 #undef prev_is_word
3416
3417 #undef original_ims
3418
3419 #undef ctype
3420 #undef length
3421 #undef max
3422 #undef min
3423 #undef number
3424 #undef offset
3425 #undef op
3426 #undef save_capture_last
3427 #undef save_offset1
3428 #undef save_offset2
3429 #undef save_offset3
3430 #undef stacksave
3431
3432 #undef newptrb
3433
3434 #endif /* NO_RECURSE */
3435
3436 /* These two are defined as macros in both cases, that is, whether or not
NO_RECURSE is set, so they are undefined outside the conditional group. */
3437
3438 #undef fc
3439 #undef fi
3440
3441 /***************************************************************************
3442 ***************************************************************************/
3443
3444
3445
3446 /*************************************************
3447 * Execute a Regular Expression *
3448 *************************************************/
3449
3450 /* This function applies a compiled re to a subject string and picks out
3451 portions of the string if it matches. Two elements in the vector are set for
3452 each substring: the offsets to the start and end of the substring.
3453
3454 Arguments:
3455 argument_re points to the compiled expression
3456 extra_data points to extra data or is NULL
3457 subject points to the subject string
3458 length length of subject string (may contain binary zeros)
3459 start_offset where to start in the subject string
3460 options option bits
3461 offsets points to a vector of ints to be filled in with offsets
3462 offsetcount the number of elements in the vector
3463
3464 Returns: > 0 => success; value is the number of elements filled in
3465 = 0 => success, but offsets is not big enough
3466 -1 => failed to match
3467 < -1 => some kind of unexpected problem
3468 */
3469
3470 PCRE_DATA_SCOPE int
3471 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3472 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3473 int offsetcount)
3474 {
3475 int rc, resetcount, ocount;
3476 int first_byte = -1;
3477 int req_byte = -1;
3478 int req_byte2 = -1;
3479 int newline;
3480 unsigned long int ims;
3481 BOOL using_temporary_offsets = FALSE;
3482 BOOL anchored;
3483 BOOL startline;
3484 BOOL firstline;
3485 BOOL first_byte_caseless = FALSE;
3486 BOOL req_byte_caseless = FALSE;
3487 match_data match_block;
3488 match_data *md = &match_block;
3489 const uschar *tables;
3490 const uschar *start_bits = NULL;
3491 USPTR start_match = (USPTR)subject + start_offset;
3492 USPTR end_subject;
3493 USPTR req_byte_ptr = start_match - 1;
3494
3495 pcre_study_data internal_study;
3496 const pcre_study_data *study;
3497
3498 real_pcre internal_re;
3499 const real_pcre *external_re = (const real_pcre *)argument_re;
3500 const real_pcre *re = external_re;
3501
3502 /* Plausibility checks */
3503
3504 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3505 if (re == NULL || subject == NULL ||
3506 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3507 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3508
3509 /* Fish out the optional data from the extra_data structure, first setting
3510 the default values. */
3511
3512 study = NULL;
3513 md->match_limit = MATCH_LIMIT;
3514 md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3515 md->callout_data = NULL;
3516
3517 /* The table pointer is always in native byte order. */
3518
3519 tables = external_re->tables;
3520
3521 if (extra_data != NULL)
3522 {
3523 register unsigned int flags = extra_data->flags;
3524 if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3525 study = (const pcre_study_data *)extra_data->study_data;
3526 if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3527 md->match_limit = extra_data->match_limit;
3528 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3529 md->match_limit_recursion = extra_data->match_limit_recursion;
3530 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3531 md->callout_data = extra_data->callout_data;
3532 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3533 }
3534
3535 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3536 is a feature that makes it possible to save compiled regex and re-use them
3537 in other programs later. */
3538
3539 if (tables == NULL) tables = _pcre_default_tables;
3540
3541 /* Check that the first field in the block is the magic number. If it is not,
3542 test for a regex that was compiled on a host of opposite endianness. If this is
3543 the case, flipped values are put in internal_re and internal_study if there was
3544 study data too. */
3545
3546 if (re->magic_number != MAGIC_NUMBER)
3547 {
3548 re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3549 if (re == NULL) return PCRE_ERROR_BADMAGIC;
3550 if (study != NULL) study = &internal_study;
3551 }
3552
3553 /* Set up other data */
3554
3555 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3556 startline = (re->options & PCRE_STARTLINE) != 0;
3557 firstline = (re->options & PCRE_FIRSTLINE) != 0;
3558
3559 /* The code starts after the real_pcre block and the capture name table. */
3560
3561 md->start_code = (const uschar *)external_re + re->name_table_offset +
3562 re->name_count * re->name_entry_size;
3563
3564 md->start_subject = (USPTR)subject;
3565 md->start_offset = start_offset;
3566 md->end_subject = md->start_subject + length;
3567 end_subject = md->end_subject;
3568
3569 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3570 md->utf8 = (re->options & PCRE_UTF8) != 0;
3571
3572 md->notbol = (options & PCRE_NOTBOL) != 0;
3573 md->noteol = (options & PCRE_NOTEOL) != 0;
3574 md->notempty = (options & PCRE_NOTEMPTY) != 0;
3575 md->partial = (options & PCRE_PARTIAL) != 0;
3576 md->hitend = FALSE;
3577
3578 md->recursive = NULL; /* No recursion at top level */
3579
3580 md->lcc = tables + lcc_offset;
3581 md->ctypes = tables + ctypes_offset;
3582
3583 /* Handle different types of newline. The two bits give four cases. If nothing
3584 is set at run time, whatever was used at compile time applies. */
3585
3586 switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
3587 PCRE_NEWLINE_CRLF)
3588 {
3589 default: newline = NEWLINE; break; /* Compile-time default */
3590 case PCRE_NEWLINE_CR: newline = '\r'; break;
3591 case PCRE_NEWLINE_LF: newline = '\n'; break;
3592 case PCRE_NEWLINE_CR+
3593 PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3594 }
3595
3596 if (newline > 255)
3597 {
3598 md->nllen = 2;
3599 md->nl[0] = (newline >> 8) & 255;
3600 md->nl[1] = newline & 255;
3601 }
3602 else
3603 {
3604 md->nllen = 1;
3605 md->nl[0] = newline;
3606 }
3607
3608 /* Partial matching is supported only for a restricted set of regexes at the
3609 moment. */
3610
3611 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3612 return PCRE_ERROR_BADPARTIAL;
3613
3614 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3615 back the character offset. */
3616
3617 #ifdef SUPPORT_UTF8
3618 if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3619 {
3620 if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3621 return PCRE_ERROR_BADUTF8;
3622 if (start_offset > 0 && start_offset < length)
3623 {
3624 int tb = ((uschar *)subject)[start_offset];
3625 if (tb > 127)
3626 {
3627 tb &= 0xc0;
3628 if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3629 }
3630 }
3631 }
3632 #endif
3633
3634 /* The ims options can vary during the matching as a result of the presence
3635 of (?ims) items in the pattern. They are kept in a local variable so that
3636 restoring at the exit of a group is easy. */
3637
3638 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3639
3640 /* If the expression has got more back references than the offsets supplied can
3641 hold, we get a temporary chunk of working store to use during the matching.
3642 Otherwise, we can use the vector supplied, rounding down its size to a multiple
3643 of 3. */
3644
3645 ocount = offsetcount - (offsetcount % 3);
3646
3647 if (re->top_backref > 0 && re->top_backref >= ocount/3)
3648 {
3649 ocount = re->top_backref * 3 + 3;
3650 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3651 if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3652 using_temporary_offsets = TRUE;
3653 DPRINTF(("Got memory to hold back references\n"));
3654 }
3655 else md->offset_vector = offsets;
3656
3657 md->offset_end = ocount;
3658 md->offset_max = (2*ocount)/3;
3659 md->offset_overflow = FALSE;
3660 md->capture_last = -1;
3661
3662 /* Compute the minimum number of offsets that we need to reset each time. Doing
3663 this makes a huge difference to execution time when there aren't many brackets
3664 in the pattern. */
3665
3666 resetcount = 2 + re->top_bracket * 2;
3667 if (resetcount > offsetcount) resetcount = ocount;
3668
3669 /* Reset the working variable associated with each extraction. These should
3670 never be used unless previously set, but they get saved and restored, and so we
3671 initialize them to avoid reading uninitialized locations. */
3672
3673 if (md->offset_vector != NULL)
3674 {
3675 register int *iptr = md->offset_vector + ocount;
3676 register int *iend = iptr - resetcount/2 + 1;
3677 while (--iptr >= iend) *iptr = -1;
3678 }
3679
3680 /* Set up the first character to match, if available. The first_byte value is
3681 never set for an anchored regular expression, but the anchoring may be forced
3682 at run time, so we have to test for anchoring. The first char may be unset for
3683 an unanchored pattern, of course. If there's no first char and the pattern was
3684 studied, there may be a bitmap of possible first characters. */
3685
3686 if (!anchored)
3687 {
3688 if ((re->options & PCRE_FIRSTSET) != 0)
3689 {
3690 first_byte = re->first_byte & 255;
3691 if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3692 first_byte = md->lcc[first_byte];
3693 }
3694 else
3695 if (!startline && study != NULL &&
3696 (study->options & PCRE_STUDY_MAPPED) != 0)
3697 start_bits = study->start_bits;
3698 }
3699
3700 /* For anchored or unanchored matches, there may be a "last known required
3701 character" set. */
3702
3703 if ((re->options & PCRE_REQCHSET) != 0)
3704 {
3705 req_byte = re->req_byte & 255;
3706 req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3707 req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3708 }
3709
3710 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3711 the loop runs just once. */
3712
3713 do
3714 {
3715 USPTR save_end_subject = end_subject;
3716
3717 /* Reset the maximum number of extractions we might see. */
3718
3719 if (md->offset_vector != NULL)
3720 {
3721 register int *iptr = md->offset_vector;
3722 register int *iend = iptr + resetcount;
3723 while (iptr < iend) *iptr++ = -1;
3724 }
3725
3726 /* Advance to a unique first char if possible. If firstline is TRUE, the
3727 start of the match is constrained to the first line of a multiline string.
3728 Implement this by temporarily adjusting end_subject so that we stop scanning
3729 at a newline. If the match fails at the newline, later code breaks this loop.
3730 */
3731
3732 if (firstline)
3733 {
3734 USPTR t = start_match;
3735 while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
3736 end_subject = t;
3737 }
3738
3739 /* Now test for a unique first byte */
3740
3741 if (first_byte >= 0)
3742 {
3743 if (first_byte_caseless)
3744 while (start_match < end_subject &&
3745 md->lcc[*start_match] != first_byte)
3746 start_match++;
3747 else
3748 while (start_match < end_subject && *start_match != first_byte)
3749 start_match++;
3750 }
3751
3752 /* Or to just after a linebreak for a multiline match if possible */
3753
3754 else if (startline)
3755 {
3756 if (start_match >= md->start_subject + md->nllen +
3757 start_offset)
3758 {
3759 while (start_match <= end_subject &&
3760 !IS_NEWLINE(start_match - md->nllen))
3761 start_match++;
3762 }
3763 }
3764
3765 /* Or to a non-unique first char after study */
3766
3767 else if (start_bits != NULL)
3768 {
3769 while (start_match < end_subject)
3770 {
3771 register unsigned int c = *start_match;
3772 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3773 }
3774 }
3775
3776 /* Restore fudged end_subject */
3777
3778 end_subject = save_end_subject;
3779
3780 #ifdef DEBUG /* Sigh. Some compilers never learn. */
3781 printf(">>>> Match against: ");
3782 pchars(start_match, end_subject - start_match, TRUE, md);
3783 printf("\n");
3784 #endif
3785
3786 /* If req_byte is set, we know that that character must appear in the subject
3787 for the match to succeed. If the first character is set, req_byte must be
3788 later in the subject; otherwise the test starts at the match point. This
3789 optimization can save a huge amount of backtracking in patterns with nested
3790 unlimited repeats that aren't going to match. Writing separate code for
3791 cased/caseless versions makes it go faster, as does using an autoincrement
3792 and backing off on a match.
3793
3794 HOWEVER: when the subject string is very, very long, searching to its end can
3795 take a long time, and give bad performance on quite ordinary patterns. This
3796 showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3797 don't do this when the string is sufficiently long.
3798
3799 ALSO: this processing is disabled when partial matching is requested.
3800 */
3801
3802 if (req_byte >= 0 &&
3803 end_subject - start_match < REQ_BYTE_MAX &&
3804 !md->partial)
3805 {
3806 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
3807
3808 /* We don't need to repeat the search if we haven't yet reached the
3809 place we found it at last time. */
3810
3811 if (p > req_byte_ptr)
3812 {
3813 if (req_byte_caseless)
3814 {
3815 while (p < end_subject)
3816 {
3817 register int pp = *p++;
3818 if (pp == req_byte || pp == req_byte2) { p--; break; }
3819 }
3820 }
3821 else
3822 {
3823 while (p < end_subject)
3824 {
3825 if (*p++ == req_byte) { p--; break; }
3826 }
3827 }
3828
3829 /* If we can't find the required character, break the matching loop */
3830
3831 if (p >= end_subject) break;
3832
3833 /* If we have found the required character, save the point where we
3834 found it, so that we don't search again next time round the loop if
3835 the start hasn't passed this character yet. */
3836
3837 req_byte_ptr = p;
3838 }
3839 }
3840
3841 /* When a match occurs, substrings will be set for all internal extractions;
3842 we just need to set up the whole thing as substring 0 before returning. If
3843 there were too many extractions, set the return code to zero. In the case
3844 where we had to get some local store to hold offsets for backreferences, copy
3845 those back references that we can. In this case there need not be overflow
3846 if certain parts of the pattern were not used. */
3847
3848 md->start_match = start_match;
3849 md->match_call_count = 0;
3850
3851 rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0);
3852
3853 /* When the result is no match, if the subject's first character was a
3854 newline and the PCRE_FIRSTLINE option is set, break (which will return
3855 PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3856 newline in the subject. Otherwise, advance the pointer to the next character
3857 and continue - but the continuation will actually happen only when the
3858 pattern is not anchored. */
3859
3860 if (rc == MATCH_NOMATCH)
3861 {
3862 if (firstline &&
3863 start_match <= md->end_subject - md->nllen &&
3864 IS_NEWLINE(start_match))
3865 break;
3866 start_match++;
3867 #ifdef SUPPORT_UTF8
3868 if (md->utf8)
3869 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3870 start_match++;
3871 #endif
3872 continue;
3873 }
3874
3875 if (rc != MATCH_MATCH)
3876 {
3877 DPRINTF((">>>> error: returning %d\n", rc));
3878 return rc;
3879 }
3880
3881 /* We have a match! Copy the offset information from temporary store if
3882 necessary */
3883
3884 if (using_temporary_offsets)
3885 {
3886 if (offsetcount >= 4)
3887 {
3888 memcpy(offsets + 2, md->offset_vector + 2,
3889 (offsetcount - 2) * sizeof(int));
3890 DPRINTF(("Copied offsets from temporary memory\n"));
3891 }
3892 if (md->end_offset_top > offsetcount)
3893 md->offset_overflow = TRUE;
3894
3895 DPRINTF(("Freeing temporary memory\n"));
3896 (pcre_free)(md->offset_vector);
3897 }
3898
3899 rc = md->offset_overflow? 0 : md->end_offset_top/2;
3900
3901 if (offsetcount < 2) rc = 0; else
3902 {
3903 offsets[0] = start_match - md->start_subject;
3904 offsets[1] = md->end_match_ptr - md->start_subject;
3905 }
3906
3907 DPRINTF((">>>> returning %d\n", rc));
3908 return rc;
3909 }
3910
3911 /* This "while" is the end of the "do" above */
3912
3913 while (!anchored && start_match <= end_subject);
3914
3915 if (using_temporary_offsets)
3916 {
3917 DPRINTF(("Freeing temporary memory\n"));
3918 (pcre_free)(md->offset_vector);
3919 }
3920
3921 if (md->partial && md->hitend)
3922 {
3923 DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3924 return PCRE_ERROR_PARTIAL;
3925 }
3926 else
3927 {
3928 DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3929 return PCRE_ERROR_NOMATCH;
3930 }
3931 }
3932
3933 /* End of pcre_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12