/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 145 - (hide annotations) (download)
Wed Apr 4 14:06:52 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 127540 byte(s)
Reworked all the WIN32 __declspec stuff in the hope of getting it right.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 nigel 93 #define NLBLOCK md /* Block containing newline information */
46     #define PSSTART start_subject /* Field containing processed string start */
47     #define PSEND end_subject /* Field containing processed string end */
48    
49 nigel 77 #include "pcre_internal.h"
50    
51 ph10 137 /* Undefine some potentially clashing cpp symbols */
52    
53     #undef min
54     #undef max
55    
56 nigel 93 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57     obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
58 nigel 77
59 nigel 93 #define EPTR_WORK_SIZE (1000)
60 nigel 77
61     /* Flag bits for the match() function */
62    
63 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
64     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65     #define match_tail_recursed 0x04 /* Tail recursive call */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73     /* Maximum number of ints of offset to save on the stack for recursive calls.
74     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75     because the offset vector is always a multiple of 3 long. */
76    
77     #define REC_STACK_SAVE_MAX 30
78    
79     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80    
81     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83    
84    
85    
86     #ifdef DEBUG
87     /*************************************************
88     * Debugging function to print chars *
89     *************************************************/
90    
91     /* Print a sequence of chars in printable format, stopping at the end of the
92     subject if requested.
93    
94     Arguments:
95     p points to characters
96     length number to print
97     is_subject TRUE if printing from within md->start_subject
98     md pointer to matching data block, if is_subject is TRUE
99    
100     Returns: nothing
101     */
102    
103     static void
104     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105     {
106 nigel 93 unsigned int c;
107 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108     while (length-- > 0)
109     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110     }
111     #endif
112    
113    
114    
115     /*************************************************
116     * Match a back-reference *
117     *************************************************/
118    
119     /* If a back reference hasn't been set, the length that is passed is greater
120     than the number of characters left in the string, so the match fails.
121    
122     Arguments:
123     offset index into the offset vector
124     eptr points into the subject
125     length length to be matched
126     md points to match data block
127     ims the ims flags
128    
129     Returns: TRUE if matched
130     */
131    
132     static BOOL
133 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 nigel 77 unsigned long int ims)
135     {
136 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
137 nigel 77
138     #ifdef DEBUG
139     if (eptr >= md->end_subject)
140     printf("matching subject <null>");
141     else
142     {
143     printf("matching subject ");
144     pchars(eptr, length, TRUE, md);
145     }
146     printf(" against backref ");
147     pchars(p, length, FALSE, md);
148     printf("\n");
149     #endif
150    
151     /* Always fail if not enough characters left */
152    
153     if (length > md->end_subject - eptr) return FALSE;
154    
155     /* Separate the caselesss case for speed */
156    
157     if ((ims & PCRE_CASELESS) != 0)
158     {
159     while (length-- > 0)
160     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161     }
162     else
163     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164    
165     return TRUE;
166     }
167    
168    
169    
170     /***************************************************************************
171     ****************************************************************************
172     RECURSION IN THE match() FUNCTION
173    
174 nigel 87 The match() function is highly recursive, though not every recursive call
175     increases the recursive depth. Nevertheless, some regular expressions can cause
176     it to recurse to a great depth. I was writing for Unix, so I just let it call
177     itself recursively. This uses the stack for saving everything that has to be
178     saved for a recursive call. On Unix, the stack can be large, and this works
179     fine.
180 nigel 77
181 nigel 87 It turns out that on some non-Unix-like systems there are problems with
182     programs that use a lot of stack. (This despite the fact that every last chip
183     has oodles of memory these days, and techniques for extending the stack have
184     been known for decades.) So....
185 nigel 77
186     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187     calls by keeping local variables that need to be preserved in blocks of memory
188 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
189 nigel 77 achieve this so that the actual code doesn't look very different to what it
190     always used to.
191     ****************************************************************************
192     ***************************************************************************/
193    
194    
195 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
196     versions and production versions. */
197 nigel 77
198     #ifndef NO_RECURSE
199     #define REGISTER register
200 nigel 87 #ifdef DEBUG
/* Debugging form of RMATCH for true recursion. It prints the source line of
the call site, calls match() with the arguments ra..rg plus rdepth+1 as the
final (recursion depth) argument, stores the value returned in rx, and then
prints the source line again. Note that rdepth is not a macro parameter; it is
picked up from the scope in which the macro is expanded. */
201     #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
202     { \
203     printf("match() called in line %d\n", __LINE__); \
204     rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
205     printf("to line %d\n", __LINE__); \
206     }
/* Debugging form of RRETURN: prints the value being returned and the source
line, then returns ra from the enclosing function. */
207     #define RRETURN(ra) \
208     { \
209     printf("match() returned %d from line %d ", ra, __LINE__); \
210     return ra; \
211     }
212     #else
/* Production forms: RMATCH is a plain recursive call to match() whose result
is assigned to rx, with the depth argument incremented by one; RRETURN is a
plain return statement. */
213     #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
214     rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
215 nigel 77 #define RRETURN(ra) return ra
216 nigel 87 #endif
217    
218 nigel 77 #else
219    
220    
221     /* These versions of the macros manage a private stack on the heap. Note
222     that the rd argument of RMATCH isn't actually used. It's the md argument of
223     match(), which never changes. */
224    
225     #define REGISTER
226    
/* Heap-frame version of RMATCH. It simulates a recursive call to match()
without using the C stack:

  . a new heapframe is obtained from pcre_stack_malloc; if that fails, the
    current incarnation returns PCRE_ERROR_NOMEMORY via RRETURN instead of
    dereferencing a NULL pointer;
  . setjmp() records the resume point in the current frame's Xwhere;
  . on the direct return from setjmp() (value 0) the "argument" fields of the
    new frame are filled in from ra, rb, rc, re, rf and rg, the depth is
    incremented, the new frame is chained to the current one and becomes the
    current frame, and control jumps to the HEAP_RECURSE label;
  . when RRETURN later longjmps back here, the current frame is recovered
    from md->thisframe and the result left in Xresult is copied into rx.

The rd argument is not used; it corresponds to the md argument of match().
The expansion is a braced block, so the macro is used as a statement. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL)\
    {\
    RRETURN(PCRE_ERROR_NOMEMORY);\
    }\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }

/* Heap-frame version of RRETURN. The current frame is unchained and released
with pcre_stack_free. If there is a previous frame, the result is stored in
its Xresult field, md->thisframe is set to it, and longjmp() resumes the
RMATCH that created the frame just freed. If there is no previous frame, this
is the top level and the value is returned to the caller of match(). */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }
265    
266    
267     /* Structure for remembering the local variables in a private frame */
268    
/* One heapframe holds everything that one incarnation of match() must keep
across an RMATCH() "call" when NO_RECURSE is defined. Inside match() each
field is reached through a macro with the same name minus the leading X
(for example, eptr is defined as frame->Xeptr). */
269     typedef struct heapframe {
/* Link to the frame of the calling incarnation; NULL marks the top level. */
270     struct heapframe *Xprevframe;
271    
272     /* Function arguments that may change */
273    
274     const uschar *Xeptr;
275     const uschar *Xecode;
276     int Xoffset_top;
/* NOTE(review): match() declares its ims argument as unsigned long int, but
this field is a signed long int - confirm that the difference is harmless. */
277     long int Xims;
278     eptrblock *Xeptrb;
279     int Xflags;
280 nigel 91 unsigned int Xrdepth;
281 nigel 77
282     /* Function local variables */
283    
284     const uschar *Xcallpat;
285     const uschar *Xcharptr;
286     const uschar *Xdata;
287     const uschar *Xnext;
288     const uschar *Xpp;
289     const uschar *Xprev;
290     const uschar *Xsaved_eptr;
291    
292     recursion_info Xnew_recursive;
293    
294     BOOL Xcur_is_word;
295     BOOL Xcondition;
296     BOOL Xprev_is_word;
297    
298     unsigned long int Xoriginal_ims;
299    
/* These fields exist only when SUPPORT_UCP is defined. */
300     #ifdef SUPPORT_UCP
301     int Xprop_type;
302 nigel 87 int Xprop_value;
303 nigel 77 int Xprop_fail_result;
304     int Xprop_category;
305     int Xprop_chartype;
306 nigel 87 int Xprop_script;
307 ph10 123 int Xoclength;
308     uschar Xocchars[8];
309 nigel 77 #endif
310    
311     int Xctype;
312 nigel 93 unsigned int Xfc;
313 nigel 77 int Xfi;
314     int Xlength;
315     int Xmax;
316     int Xmin;
317     int Xnumber;
318     int Xoffset;
319     int Xop;
320     int Xsave_capture_last;
321     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
/* Fixed-size save area, REC_STACK_SAVE_MAX ints long. */
322     int Xstacksave[REC_STACK_SAVE_MAX];
323    
324     eptrblock Xnewptrb;
325    
326     /* Place to pass back result, and where to jump back to */
327    
/* RRETURN stores the returned value in Xresult before it longjmps to Xwhere;
RMATCH sets Xwhere with setjmp() and reads Xresult after the jump back. */
328     int Xresult;
329     jmp_buf Xwhere;
330    
331     } heapframe;
332    
333     #endif
334    
335    
336     /***************************************************************************
337     ***************************************************************************/
338    
339    
340    
341     /*************************************************
342     * Match from current position *
343     *************************************************/
344    
345 nigel 93 /* This function is called recursively in many circumstances. Whenever it
346 nigel 77 returns a negative (error) response, the outer incarnation must also return the
347     same response.
348    
349     Performance note: It might be tempting to extract commonly used fields from the
350     md structure (e.g. utf8, end_subject) into individual variables to improve
351     performance. Tests using gcc on a SPARC disproved this; in the first case, it
352     made performance worse.
353    
354     Arguments:
355 nigel 93 eptr pointer to current character in subject
356     ecode pointer to current position in compiled code
357 nigel 77 offset_top current top pointer
358     md pointer to "static" info for the match
359     ims current /i, /m, and /s options
360     eptrb pointer to chain of blocks containing eptr at start of
361     brackets - for testing for empty matches
362     flags can contain
363     match_condassert - this is an assertion condition
364 nigel 93 match_cbegroup - this is the start of an unlimited repeat
365     group that can match an empty string
366     match_tail_recursed - this is a tail_recursed group
367 nigel 87 rdepth the recursion depth
368 nigel 77
369     Returns: MATCH_MATCH if matched ) these values are >= 0
370     MATCH_NOMATCH if failed to match )
371     a negative PCRE_ERROR_xxx value if aborted by an error condition
372 nigel 87 (e.g. stopped by repeated call or recursion limit)
373 nigel 77 */
374    
375     static int
376 nigel 87 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
377 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
378 nigel 91 int flags, unsigned int rdepth)
379 nigel 77 {
380     /* These variables do not need to be preserved over recursion in this function,
381 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
382     "register" because they are used a lot in loops. */
383 nigel 77
384 nigel 91 register int rrc; /* Returns from recursive calls */
385     register int i; /* Used for loops not involving calls to RMATCH() */
386 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
387 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
388 nigel 77
389 nigel 93 BOOL minimize, possessive; /* Quantifier options */
390    
391 nigel 77 /* When recursion is not being used, all "local" variables that have to be
392     preserved over calls to RMATCH() are part of a "frame" which is obtained from
393     heap storage. Set up the top-level frame here; others are obtained from the
394     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
395    
396     #ifdef NO_RECURSE
397     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
398     frame->Xprevframe = NULL; /* Marks the top level */
399    
400     /* Copy in the original argument variables */
401    
402     frame->Xeptr = eptr;
403     frame->Xecode = ecode;
404     frame->Xoffset_top = offset_top;
405     frame->Xims = ims;
406     frame->Xeptrb = eptrb;
407     frame->Xflags = flags;
408 nigel 87 frame->Xrdepth = rdepth;
409 nigel 77
410     /* This is where control jumps back to to effect "recursion" */
411    
412     HEAP_RECURSE:
413    
414     /* Macros make the argument variables come from the current frame */
415    
416     #define eptr frame->Xeptr
417     #define ecode frame->Xecode
418     #define offset_top frame->Xoffset_top
419     #define ims frame->Xims
420     #define eptrb frame->Xeptrb
421     #define flags frame->Xflags
422 nigel 87 #define rdepth frame->Xrdepth
423 nigel 77
424     /* Ditto for the local variables */
425    
426     #ifdef SUPPORT_UTF8
427     #define charptr frame->Xcharptr
428     #endif
429     #define callpat frame->Xcallpat
430     #define data frame->Xdata
431     #define next frame->Xnext
432     #define pp frame->Xpp
433     #define prev frame->Xprev
434     #define saved_eptr frame->Xsaved_eptr
435    
436     #define new_recursive frame->Xnew_recursive
437    
438     #define cur_is_word frame->Xcur_is_word
439     #define condition frame->Xcondition
440     #define prev_is_word frame->Xprev_is_word
441    
442     #define original_ims frame->Xoriginal_ims
443    
444     #ifdef SUPPORT_UCP
445     #define prop_type frame->Xprop_type
446 nigel 87 #define prop_value frame->Xprop_value
447 nigel 77 #define prop_fail_result frame->Xprop_fail_result
448     #define prop_category frame->Xprop_category
449     #define prop_chartype frame->Xprop_chartype
450 nigel 87 #define prop_script frame->Xprop_script
451 ph10 115 #define oclength frame->Xoclength
452     #define occhars frame->Xocchars
453 nigel 77 #endif
454    
455     #define ctype frame->Xctype
456     #define fc frame->Xfc
457     #define fi frame->Xfi
458     #define length frame->Xlength
459     #define max frame->Xmax
460     #define min frame->Xmin
461     #define number frame->Xnumber
462     #define offset frame->Xoffset
463     #define op frame->Xop
464     #define save_capture_last frame->Xsave_capture_last
465     #define save_offset1 frame->Xsave_offset1
466     #define save_offset2 frame->Xsave_offset2
467     #define save_offset3 frame->Xsave_offset3
468     #define stacksave frame->Xstacksave
469    
470     #define newptrb frame->Xnewptrb
471    
472     /* When recursion is being used, local variables are allocated on the stack and
473     get preserved during recursion in the normal way. In this environment, fi and
474     i, and fc and c, can be the same variables. */
475    
476 nigel 93 #else /* NO_RECURSE not defined */
477 nigel 77 #define fi i
478     #define fc c
479    
480    
481 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
482     const uschar *charptr; /* in small blocks of the code. My normal */
483     #endif /* style of coding would have declared */
484     const uschar *callpat; /* them within each of those blocks. */
485     const uschar *data; /* However, in order to accommodate the */
486     const uschar *next; /* version of this code that uses an */
487     USPTR pp; /* external "stack" implemented on the */
488     const uschar *prev; /* heap, it is easier to declare them all */
489     USPTR saved_eptr; /* here, so the declarations can be cut */
490     /* out in a block. The only declarations */
491     recursion_info new_recursive; /* within blocks below are for variables */
492     /* that do not have to be preserved over */
493     BOOL cur_is_word; /* a recursive call to RMATCH(). */
494     BOOL condition;
495 nigel 77 BOOL prev_is_word;
496    
497     unsigned long int original_ims;
498    
499     #ifdef SUPPORT_UCP
500     int prop_type;
501 nigel 87 int prop_value;
502 nigel 77 int prop_fail_result;
503     int prop_category;
504     int prop_chartype;
505 nigel 87 int prop_script;
506 ph10 115 int oclength;
507     uschar occhars[8];
508 nigel 77 #endif
509    
510     int ctype;
511     int length;
512     int max;
513     int min;
514     int number;
515     int offset;
516     int op;
517     int save_capture_last;
518     int save_offset1, save_offset2, save_offset3;
519     int stacksave[REC_STACK_SAVE_MAX];
520    
521     eptrblock newptrb;
522 nigel 93 #endif /* NO_RECURSE */
523 nigel 77
524     /* These statements are here to stop the compiler complaining about
525     uninitialized variables. */
526    
527     #ifdef SUPPORT_UCP
528 nigel 87 prop_value = 0;
529 nigel 77 prop_fail_result = 0;
530     #endif
531    
532 nigel 93
533 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
534     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
535     used. Thanks to Ian Taylor for noticing this possibility and sending the
536     original patch. */
537    
538     TAIL_RECURSE:
539    
540 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
541     are specified by the macro RMATCH and RRETURN is used to return. When
542     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
543     and a "return", respectively (possibly with some debugging if DEBUG is
544     defined). However, RMATCH isn't like a function call because it's quite a
545     complicated macro. It has to be used in one particular way. This shouldn't,
546     however, impact performance when true recursion is being used. */
547 nigel 77
548 nigel 87 /* First check that we haven't called match() too many times, or that we
549     haven't exceeded the recursive call limit. */
550    
551 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
552 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
553 nigel 77
554     original_ims = ims; /* Save for resetting on ')' */
555 nigel 91
556     #ifdef SUPPORT_UTF8
557 nigel 77 utf8 = md->utf8; /* Local copy of the flag */
558 nigel 91 #else
559     utf8 = FALSE;
560     #endif
561 nigel 77
562 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
563     string, the match_cbegroup flag is set. When this is the case, add the current
564     subject pointer to the chain of such remembered pointers, to be checked when we
565     hit the closing ket, in order to break infinite loops that match no characters.
566     When match() is called in other circumstances, don't add to the chain. If this
567     is a tail recursion, use a block from the workspace, as the one on the stack is
568     already used. */
569 nigel 77
570 nigel 93 if ((flags & match_cbegroup) != 0)
571 nigel 77 {
572 nigel 93 eptrblock *p;
573     if ((flags & match_tail_recursed) != 0)
574     {
575     if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
576     p = md->eptrchain + md->eptrn++;
577     }
578     else p = &newptrb;
579     p->epb_saved_eptr = eptr;
580     p->epb_prev = eptrb;
581     eptrb = p;
582 nigel 77 }
583    
584 nigel 93 /* Now start processing the opcodes. */
585 nigel 77
586     for (;;)
587     {
588 nigel 93 minimize = possessive = FALSE;
589 nigel 77 op = *ecode;
590    
591     /* For partial matching, remember if we ever hit the end of the subject after
592     matching at least one subject character. */
593    
594     if (md->partial &&
595     eptr >= md->end_subject &&
596     eptr > md->start_match)
597     md->hitend = TRUE;
598    
599 nigel 93 switch(op)
600     {
601     /* Handle a capturing bracket. If there is space in the offset vector, save
602     the current subject position in the working slot at the top of the vector.
603     We mustn't change the current values of the data slot, because they may be
604     set from a previous iteration of this group, and be referred to by a
605     reference inside the group.
606 nigel 77
607 nigel 93 If the bracket fails to match, we need to restore this value and also the
608     values of the final offsets, in case they were set by a previous iteration
609     of the same bracket.
610 nigel 77
611 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
612     a non-capturing bracket. Don't worry about setting the flag for the error
613     case here; that is handled in the code for KET. */
614 nigel 77
615 nigel 93 case OP_CBRA:
616     case OP_SCBRA:
617     number = GET2(ecode, 1+LINK_SIZE);
618 nigel 77 offset = number << 1;
619    
620     #ifdef DEBUG
621 nigel 93 printf("start bracket %d\n", number);
622     printf("subject=");
623 nigel 77 pchars(eptr, 16, TRUE, md);
624     printf("\n");
625     #endif
626    
627     if (offset < md->offset_max)
628     {
629     save_offset1 = md->offset_vector[offset];
630     save_offset2 = md->offset_vector[offset+1];
631     save_offset3 = md->offset_vector[md->offset_end - number];
632     save_capture_last = md->capture_last;
633    
634     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
635     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
636    
637 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
638 nigel 77 do
639     {
640 nigel 93 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
641     ims, eptrb, flags);
642 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
643     md->capture_last = save_capture_last;
644     ecode += GET(ecode, 1);
645     }
646     while (*ecode == OP_ALT);
647    
648     DPRINTF(("bracket %d failed\n", number));
649    
650     md->offset_vector[offset] = save_offset1;
651     md->offset_vector[offset+1] = save_offset2;
652     md->offset_vector[md->offset_end - number] = save_offset3;
653    
654     RRETURN(MATCH_NOMATCH);
655     }
656    
657 nigel 93 /* Insufficient room for saving captured contents. Treat as a non-capturing
658     bracket. */
659 nigel 77
660 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
661 nigel 77
662 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
663     final alternative within the brackets, we would return the result of a
664     recursive call to match() whatever happened. We can reduce stack usage by
665     turning this into a tail recursion. */
666 nigel 77
667 nigel 93 case OP_BRA:
668     case OP_SBRA:
669     DPRINTF(("start non-capturing bracket\n"));
670     flags = (op >= OP_SBRA)? match_cbegroup : 0;
671 nigel 91 for (;;)
672 nigel 77 {
673 nigel 91 if (ecode[GET(ecode, 1)] != OP_ALT)
674 nigel 93 {
675     ecode += _pcre_OP_lengths[*ecode];
676     flags |= match_tail_recursed;
677     DPRINTF(("bracket 0 tail recursion\n"));
678     goto TAIL_RECURSE;
679     }
680 nigel 91
681     /* For non-final alternatives, continue the loop for a NOMATCH result;
682     otherwise return. */
683    
684 nigel 93 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
685     eptrb, flags);
686 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
687     ecode += GET(ecode, 1);
688     }
689 nigel 91 /* Control never reaches here. */
690 nigel 77
691     /* Conditional group: compilation checked that there are no more than
692     two branches. If the condition is false, skipping the first branch takes us
693     past the end if there is only one branch, but that's OK because that is
694 nigel 91 exactly what going to the ket would do. As there is only one branch to be
695     obeyed, we can use tail recursion to avoid using another stack frame. */
696 nigel 77
697     case OP_COND:
698 nigel 93 case OP_SCOND:
699     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
700 nigel 77 {
701 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
702     condition = md->recursive != NULL &&
703     (offset == RREF_ANY || offset == md->recursive->group_num);
704     ecode += condition? 3 : GET(ecode, 1);
705     }
706    
707     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
708     {
709 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
710 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
711     ecode += condition? 3 : GET(ecode, 1);
712 nigel 77 }
713    
714 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
715     {
716     condition = FALSE;
717     ecode += GET(ecode, 1);
718     }
719    
720 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
721 nigel 93 the final argument match_condassert causes it to stop at the end of an
722     assertion. */
723 nigel 77
724     else
725     {
726     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
727 nigel 93 match_condassert);
728 nigel 77 if (rrc == MATCH_MATCH)
729     {
730 nigel 93 condition = TRUE;
731     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
732 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
733     }
734     else if (rrc != MATCH_NOMATCH)
735     {
736     RRETURN(rrc); /* Need braces because of following else */
737     }
738 nigel 93 else
739     {
740     condition = FALSE;
741     ecode += GET(ecode, 1);
742     }
743     }
744 nigel 91
745 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
746     we can use tail recursion to avoid using another stack frame. If the second
747     alternative doesn't exist, we can just plough on. */
748 nigel 91
749 nigel 93 if (condition || *ecode == OP_ALT)
750     {
751 nigel 91 ecode += 1 + LINK_SIZE;
752 nigel 93 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
753 nigel 91 goto TAIL_RECURSE;
754 nigel 77 }
755 nigel 93 else
756     {
757     ecode += 1 + LINK_SIZE;
758     }
759     break;
760 nigel 77
761    
762 nigel 93 /* End of the pattern. If we are in a top-level recursion, we should
763     restore the offsets appropriately and continue from after the call. */
764 nigel 77
765     case OP_END:
766     if (md->recursive != NULL && md->recursive->group_num == 0)
767     {
768     recursion_info *rec = md->recursive;
769 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
770 nigel 77 md->recursive = rec->prevrec;
771     memmove(md->offset_vector, rec->offset_save,
772     rec->saved_max * sizeof(int));
773     md->start_match = rec->save_start;
774     ims = original_ims;
775     ecode = rec->after_call;
776     break;
777     }
778    
779     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
780     string - backtracking will then try other alternatives, if any. */
781    
782     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
783     md->end_match_ptr = eptr; /* Record where we ended */
784     md->end_offset_top = offset_top; /* and how many extracts were taken */
785     RRETURN(MATCH_MATCH);
786    
787     /* Change option settings */
788    
789     case OP_OPT:
790     ims = ecode[1];
791     ecode += 2;
792     DPRINTF(("ims set to %02lx\n", ims));
793     break;
794    
795     /* Assertion brackets. Check the alternative branches in turn - the
796     matching won't pass the KET for an assertion. If any one branch matches,
797     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
798     start of each branch to move the current point backwards, so the code at
799     this level is identical to the lookahead case. */
800    
801     case OP_ASSERT:
802     case OP_ASSERTBACK:
803     do
804     {
805 nigel 93 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
806 nigel 77 if (rrc == MATCH_MATCH) break;
807     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
808     ecode += GET(ecode, 1);
809     }
810     while (*ecode == OP_ALT);
811     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
812    
813     /* If checking an assertion for a condition, return MATCH_MATCH. */
814    
815     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
816    
817     /* Continue from after the assertion, updating the offsets high water
818     mark, since extracts may have been taken during the assertion. */
819    
820     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
821     ecode += 1 + LINK_SIZE;
822     offset_top = md->end_offset_top;
823     continue;
824    
825     /* Negative assertion: all branches must fail to match */
826    
827     case OP_ASSERT_NOT:
828     case OP_ASSERTBACK_NOT:
829     do
830     {
831 nigel 93 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
832 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
833     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
834     ecode += GET(ecode,1);
835     }
836     while (*ecode == OP_ALT);
837    
838     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
839    
840     ecode += 1 + LINK_SIZE;
841     continue;
842    
843     /* Move the subject pointer back. This occurs only at the start of
844     each branch of a lookbehind assertion. If we are too close to the start to
845     move back, this match function fails. When working with UTF-8 we move
846     back a number of characters, not bytes. */
847    
848     case OP_REVERSE:
849     #ifdef SUPPORT_UTF8
850     if (utf8)
851     {
852 nigel 93 i = GET(ecode, 1);
853     while (i-- > 0)
854 nigel 77 {
855     eptr--;
856     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
857     BACKCHAR(eptr)
858     }
859     }
860     else
861     #endif
862    
863     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
864    
865     {
866 nigel 93 eptr -= GET(ecode, 1);
867 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
868     }
869    
870     /* Skip to next op code */
871    
872     ecode += 1 + LINK_SIZE;
873     break;
874    
875     /* The callout item calls an external function, if one is provided, passing
876     details of the match so far. This is mainly for debugging, though the
877     function is able to force a failure. */
878    
879     case OP_CALLOUT:
880     if (pcre_callout != NULL)
881     {
882     pcre_callout_block cb;
883     cb.version = 1; /* Version 1 of the callout block */
884     cb.callout_number = ecode[1];
885     cb.offset_vector = md->offset_vector;
886 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
887 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
888     cb.start_match = md->start_match - md->start_subject;
889     cb.current_position = eptr - md->start_subject;
890     cb.pattern_position = GET(ecode, 2);
891     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
892     cb.capture_top = offset_top/2;
893     cb.capture_last = md->capture_last;
894     cb.callout_data = md->callout_data;
895     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
896     if (rrc < 0) RRETURN(rrc);
897     }
898     ecode += 2 + 2*LINK_SIZE;
899     break;
900    
901     /* Recursion either matches the current regex, or some subexpression. The
902     offset data is the offset to the starting bracket from the start of the
903     whole pattern. (This is so that it works from duplicated subpatterns.)
904    
905     If there are any capturing brackets started but not finished, we have to
906     save their starting points and reinstate them after the recursion. However,
907     we don't know how many such there are (offset_top records the completed
908     total) so we just have to save all the potential data. There may be up to
909     65535 such values, which is too large to put on the stack, but using malloc
910     for small numbers seems expensive. As a compromise, the stack is used when
911     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
912     is used. If the malloc fails, the recursion is not attempted: the match is
913     abandoned by returning PCRE_ERROR_NOMEMORY, which the calling levels of
914     match() pass back up unchanged.
915    
916     There are also other values that have to be saved. We use a chained
917     sequence of blocks that actually live on the stack. Thanks to Robin Houston
918     for the original version of this logic. */
919    
920     case OP_RECURSE:
921     {
922     callpat = md->start_code + GET(ecode, 1);
923 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
924     GET2(callpat, 1 + LINK_SIZE);
925 nigel 77
926     /* Add to "recursing stack" */
927    
928     new_recursive.prevrec = md->recursive;
929     md->recursive = &new_recursive;
930    
931     /* Find where to continue from afterwards */
932    
933     ecode += 1 + LINK_SIZE;
934     new_recursive.after_call = ecode;
935    
936     /* Now save the offset data. */
937    
938     new_recursive.saved_max = md->offset_end;
939     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
940     new_recursive.offset_save = stacksave;
941     else
942     {
943     new_recursive.offset_save =
944     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
945     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
946     }
947    
948     memcpy(new_recursive.offset_save, md->offset_vector,
949     new_recursive.saved_max * sizeof(int));
950     new_recursive.save_start = md->start_match;
951     md->start_match = eptr;
952    
953     /* OK, now we can do the recursion. For each top-level alternative we
954     restore the offset and recursion data. */
955    
956     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
957 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
958 nigel 77 do
959     {
960 nigel 93 RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
961     md, ims, eptrb, flags);
962 nigel 77 if (rrc == MATCH_MATCH)
963     {
964 nigel 87 DPRINTF(("Recursion matched\n"));
965 nigel 77 md->recursive = new_recursive.prevrec;
966     if (new_recursive.offset_save != stacksave)
967     (pcre_free)(new_recursive.offset_save);
968     RRETURN(MATCH_MATCH);
969     }
970 nigel 87 else if (rrc != MATCH_NOMATCH)
971     {
972     DPRINTF(("Recursion gave error %d\n", rrc));
973     RRETURN(rrc);
974     }
975 nigel 77
976     md->recursive = &new_recursive;
977     memcpy(md->offset_vector, new_recursive.offset_save,
978     new_recursive.saved_max * sizeof(int));
979     callpat += GET(callpat, 1);
980     }
981     while (*callpat == OP_ALT);
982    
983     DPRINTF(("Recursion didn't match\n"));
984     md->recursive = new_recursive.prevrec;
985     if (new_recursive.offset_save != stacksave)
986     (pcre_free)(new_recursive.offset_save);
987     RRETURN(MATCH_NOMATCH);
988     }
989     /* Control never reaches here */
990    
991     /* "Once" brackets are like assertion brackets except that after a match,
992     the point in the subject string is not moved back. Thus there can never be
993     a move back into the brackets. Friedl calls these "atomic" subpatterns.
994     Check the alternative branches in turn - the matching won't pass the KET
995     for this kind of subpattern. If any one branch matches, we carry on as at
996     the end of a normal bracket, leaving the subject pointer. */
997    
998     case OP_ONCE:
999 nigel 91 prev = ecode;
1000     saved_eptr = eptr;
1001    
1002     do
1003 nigel 77 {
1004 nigel 91 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1005 nigel 93 eptrb, 0);
1006 nigel 91 if (rrc == MATCH_MATCH) break;
1007     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1008     ecode += GET(ecode,1);
1009     }
1010     while (*ecode == OP_ALT);
1011 nigel 77
1012 nigel 91 /* If hit the end of the group (which could be repeated), fail */
1013 nigel 77
1014 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1015 nigel 77
1016 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1017     mark, since extracts may have been taken. */
1018 nigel 77
1019 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1020 nigel 77
1021 nigel 91 offset_top = md->end_offset_top;
1022     eptr = md->end_match_ptr;
1023 nigel 77
1024 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1025     happens for a repeating ket if no characters were matched in the group.
1026     This is the forcible breaking of infinite loops as implemented in Perl
1027     5.005. If there is an options reset, it will get obeyed in the normal
1028     course of events. */
1029 nigel 77
1030 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1031     {
1032     ecode += 1+LINK_SIZE;
1033     break;
1034     }
1035 nigel 77
1036 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1037     preceding bracket, in the appropriate order. The second "call" of match()
1038     uses tail recursion, to avoid using another stack frame. We need to reset
1039     any options that changed within the bracket before re-running it, so
1040     check the next opcode. */
1041 nigel 77
1042 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1043     {
1044     ims = (ims & ~PCRE_IMS) | ecode[4];
1045     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1046     }
1047 nigel 77
1048 nigel 91 if (*ecode == OP_KETRMIN)
1049     {
1050     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1051     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1052     ecode = prev;
1053 nigel 93 flags = match_tail_recursed;
1054 nigel 91 goto TAIL_RECURSE;
1055 nigel 77 }
1056 nigel 91 else /* OP_KETRMAX */
1057     {
1058 nigel 93 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
1059 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1060     ecode += 1 + LINK_SIZE;
1061 nigel 93 flags = match_tail_recursed;
1062 nigel 91 goto TAIL_RECURSE;
1063     }
1064     /* Control never gets here */
1065 nigel 77
1066     /* An alternation is the end of a branch; scan along to find the end of the
1067     bracketed group and go to there. */
1068    
1069     case OP_ALT:
1070     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1071     break;
1072    
1073     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1074     that it may occur zero times. It may repeat infinitely, or not at all -
1075     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1076     repeat limits are compiled as a number of copies, with the optional ones
1077     preceded by BRAZERO or BRAMINZERO. */
1078    
1079     case OP_BRAZERO:
1080     {
1081     next = ecode+1;
1082 nigel 93 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
1083 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1084     do next += GET(next,1); while (*next == OP_ALT);
1085 nigel 93 ecode = next + 1 + LINK_SIZE;
1086 nigel 77 }
1087     break;
1088    
1089     case OP_BRAMINZERO:
1090     {
1091     next = ecode+1;
1092 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1093     RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1094 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1095     ecode++;
1096     }
1097     break;
1098    
1099 nigel 93 /* End of a group, repeated or non-repeating. */
1100 nigel 77
1101     case OP_KET:
1102     case OP_KETRMIN:
1103     case OP_KETRMAX:
1104 nigel 91 prev = ecode - GET(ecode, 1);
1105 nigel 77
1106 nigel 93 /* If this was a group that remembered the subject start, in order to break
1107     infinite repeats of empty string matches, retrieve the subject start from
1108     the chain. Otherwise, set it NULL. */
1109 nigel 77
1110 nigel 93 if (*prev >= OP_SBRA)
1111     {
1112     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1113     eptrb = eptrb->epb_prev; /* Backup to previous group */
1114     }
1115     else saved_eptr = NULL;
1116 nigel 77
1117 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1118     MATCH_MATCH, but record the current high water mark for use by positive
1119     assertions. Do this also for the "once" (atomic) groups. */
1120    
1121 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1122     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1123     *prev == OP_ONCE)
1124     {
1125     md->end_match_ptr = eptr; /* For ONCE */
1126     md->end_offset_top = offset_top;
1127     RRETURN(MATCH_MATCH);
1128     }
1129 nigel 77
1130 nigel 93 /* For capturing groups we have to check the group number back at the start
1131     and if necessary complete handling an extraction by setting the offsets and
1132     bumping the high water mark. Note that whole-pattern recursion is coded as
1133     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1134     when the OP_END is reached. Other recursion is handled here. */
1135 nigel 77
1136 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1137 nigel 91 {
1138 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1139 nigel 91 offset = number << 1;
1140 nigel 77
1141     #ifdef DEBUG
1142 nigel 91 printf("end bracket %d", number);
1143     printf("\n");
1144 nigel 77 #endif
1145    
1146 nigel 93 md->capture_last = number;
1147     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1148 nigel 91 {
1149 nigel 93 md->offset_vector[offset] =
1150     md->offset_vector[md->offset_end - number];
1151     md->offset_vector[offset+1] = eptr - md->start_subject;
1152     if (offset_top <= offset) offset_top = offset + 2;
1153     }
1154 nigel 77
1155 nigel 93 /* Handle a recursively called group. Restore the offsets
1156     appropriately and continue from after the call. */
1157 nigel 77
1158 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1159     {
1160     recursion_info *rec = md->recursive;
1161     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1162     md->recursive = rec->prevrec;
1163     md->start_match = rec->save_start;
1164     memcpy(md->offset_vector, rec->offset_save,
1165     rec->saved_max * sizeof(int));
1166     ecode = rec->after_call;
1167     ims = original_ims;
1168     break;
1169 nigel 77 }
1170 nigel 91 }
1171 nigel 77
1172 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1173     flags, in case they got changed during the group. */
1174 nigel 77
1175 nigel 91 ims = original_ims;
1176     DPRINTF(("ims reset to %02lx\n", ims));
1177 nigel 77
1178 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1179     happens for a repeating ket if no characters were matched in the group.
1180     This is the forcible breaking of infinite loops as implemented in Perl
1181     5.005. If there is an options reset, it will get obeyed in the normal
1182     course of events. */
1183 nigel 77
1184 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1185     {
1186     ecode += 1 + LINK_SIZE;
1187     break;
1188     }
1189 nigel 77
1190 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1191     preceding bracket, in the appropriate order. In the second case, we can use
1192     tail recursion to avoid using another stack frame. */
1193 nigel 77
1194 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1195    
1196 nigel 91 if (*ecode == OP_KETRMIN)
1197     {
1198     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1199     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1200     ecode = prev;
1201 nigel 93 flags |= match_tail_recursed;
1202 nigel 91 goto TAIL_RECURSE;
1203 nigel 77 }
1204 nigel 91 else /* OP_KETRMAX */
1205     {
1206 nigel 93 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
1207 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1208     ecode += 1 + LINK_SIZE;
1209 nigel 93 flags = match_tail_recursed;
1210 nigel 91 goto TAIL_RECURSE;
1211     }
1212     /* Control never gets here */
1213 nigel 77
1214     /* Start of subject unless notbol, or after internal newline if multiline */
1215    
1216     case OP_CIRC:
1217     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1218     if ((ims & PCRE_MULTILINE) != 0)
1219     {
1220 nigel 91 if (eptr != md->start_subject &&
1221 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1222 nigel 77 RRETURN(MATCH_NOMATCH);
1223     ecode++;
1224     break;
1225     }
1226     /* ... else fall through */
1227    
1228     /* Start of subject assertion */
1229    
1230     case OP_SOD:
1231     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1232     ecode++;
1233     break;
1234    
1235     /* Start of match assertion */
1236    
1237     case OP_SOM:
1238     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1239     ecode++;
1240     break;
1241    
1242     /* Assert before internal newline if multiline, or before a terminating
1243     newline unless endonly is set, else end of subject unless noteol is set. */
1244    
1245     case OP_DOLL:
1246     if ((ims & PCRE_MULTILINE) != 0)
1247     {
1248     if (eptr < md->end_subject)
1249 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1250 nigel 77 else
1251     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1252     ecode++;
1253     break;
1254     }
1255     else
1256     {
1257     if (md->noteol) RRETURN(MATCH_NOMATCH);
1258     if (!md->endonly)
1259     {
1260 nigel 91 if (eptr != md->end_subject &&
1261 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1262 nigel 77 RRETURN(MATCH_NOMATCH);
1263     ecode++;
1264     break;
1265     }
1266     }
1267 nigel 91 /* ... else fall through for endonly */
1268 nigel 77
1269     /* End of subject assertion (\z) */
1270    
1271     case OP_EOD:
1272     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1273     ecode++;
1274     break;
1275    
1276     /* End of subject or ending \n assertion (\Z) */
1277    
1278     case OP_EODN:
1279 nigel 91 if (eptr != md->end_subject &&
1280 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1281 nigel 91 RRETURN(MATCH_NOMATCH);
1282 nigel 77 ecode++;
1283     break;
1284    
1285     /* Word boundary assertions */
1286    
1287     case OP_NOT_WORD_BOUNDARY:
1288     case OP_WORD_BOUNDARY:
1289     {
1290    
1291     /* Find out if the previous and current characters are "word" characters.
1292     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1293     be "non-word" characters. */
1294    
1295     #ifdef SUPPORT_UTF8
1296     if (utf8)
1297     {
1298     if (eptr == md->start_subject) prev_is_word = FALSE; else
1299     {
1300     const uschar *lastptr = eptr - 1;
1301     while((*lastptr & 0xc0) == 0x80) lastptr--;
1302     GETCHAR(c, lastptr);
1303     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1304     }
1305     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1306     {
1307     GETCHAR(c, eptr);
1308     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1309     }
1310     }
1311     else
1312     #endif
1313    
1314     /* More streamlined when not in UTF-8 mode */
1315    
1316     {
1317     prev_is_word = (eptr != md->start_subject) &&
1318     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1319     cur_is_word = (eptr < md->end_subject) &&
1320     ((md->ctypes[*eptr] & ctype_word) != 0);
1321     }
1322    
1323     /* Now see if the situation is what we want */
1324    
1325     if ((*ecode++ == OP_WORD_BOUNDARY)?
1326     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1327     RRETURN(MATCH_NOMATCH);
1328     }
1329     break;
1330    
1331     /* Match a single character type; inline for speed */
1332    
1333     case OP_ANY:
1334 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1335     {
1336 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1337 nigel 91 }
1338 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1339     if (utf8)
1340     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1341     ecode++;
1342     break;
1343    
1344     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1345     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1346    
1347     case OP_ANYBYTE:
1348     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1349     ecode++;
1350     break;
1351    
1352     case OP_NOT_DIGIT:
1353     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1354     GETCHARINCTEST(c, eptr);
1355     if (
1356     #ifdef SUPPORT_UTF8
1357     c < 256 &&
1358     #endif
1359     (md->ctypes[c] & ctype_digit) != 0
1360     )
1361     RRETURN(MATCH_NOMATCH);
1362     ecode++;
1363     break;
1364    
1365     case OP_DIGIT:
1366     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1367     GETCHARINCTEST(c, eptr);
1368     if (
1369     #ifdef SUPPORT_UTF8
1370     c >= 256 ||
1371     #endif
1372     (md->ctypes[c] & ctype_digit) == 0
1373     )
1374     RRETURN(MATCH_NOMATCH);
1375     ecode++;
1376     break;
1377    
1378     case OP_NOT_WHITESPACE:
1379     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1380     GETCHARINCTEST(c, eptr);
1381     if (
1382     #ifdef SUPPORT_UTF8
1383     c < 256 &&
1384     #endif
1385     (md->ctypes[c] & ctype_space) != 0
1386     )
1387     RRETURN(MATCH_NOMATCH);
1388     ecode++;
1389     break;
1390    
1391     case OP_WHITESPACE:
1392     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1393     GETCHARINCTEST(c, eptr);
1394     if (
1395     #ifdef SUPPORT_UTF8
1396     c >= 256 ||
1397     #endif
1398     (md->ctypes[c] & ctype_space) == 0
1399     )
1400     RRETURN(MATCH_NOMATCH);
1401     ecode++;
1402     break;
1403    
1404     case OP_NOT_WORDCHAR:
1405     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1406     GETCHARINCTEST(c, eptr);
1407     if (
1408     #ifdef SUPPORT_UTF8
1409     c < 256 &&
1410     #endif
1411     (md->ctypes[c] & ctype_word) != 0
1412     )
1413     RRETURN(MATCH_NOMATCH);
1414     ecode++;
1415     break;
1416    
1417     case OP_WORDCHAR:
1418     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1419     GETCHARINCTEST(c, eptr);
1420     if (
1421     #ifdef SUPPORT_UTF8
1422     c >= 256 ||
1423     #endif
1424     (md->ctypes[c] & ctype_word) == 0
1425     )
1426     RRETURN(MATCH_NOMATCH);
1427     ecode++;
1428     break;
1429    
1430 nigel 93 case OP_ANYNL:
1431     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1432     GETCHARINCTEST(c, eptr);
1433     switch(c)
1434     {
1435     default: RRETURN(MATCH_NOMATCH);
1436     case 0x000d:
1437     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1438     break;
1439     case 0x000a:
1440     case 0x000b:
1441     case 0x000c:
1442     case 0x0085:
1443     case 0x2028:
1444     case 0x2029:
1445     break;
1446     }
1447     ecode++;
1448     break;
1449    
1450 nigel 77 #ifdef SUPPORT_UCP
1451     /* Check the next character by Unicode property. We will get here only
1452     if the support is in the binary; otherwise a compile-time error occurs. */
1453    
1454     case OP_PROP:
1455     case OP_NOTPROP:
1456     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1457     GETCHARINCTEST(c, eptr);
1458     {
1459 nigel 87 int chartype, script;
1460     int category = _pcre_ucp_findprop(c, &chartype, &script);
1461 nigel 77
1462 nigel 87 switch(ecode[1])
1463     {
1464     case PT_ANY:
1465     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1466     break;
1467 nigel 77
1468 nigel 87 case PT_LAMP:
1469     if ((chartype == ucp_Lu ||
1470     chartype == ucp_Ll ||
1471     chartype == ucp_Lt) == (op == OP_NOTPROP))
1472 nigel 77 RRETURN(MATCH_NOMATCH);
1473 nigel 87 break;
1474    
1475     case PT_GC:
1476     if ((ecode[2] != category) == (op == OP_PROP))
1477 nigel 77 RRETURN(MATCH_NOMATCH);
1478 nigel 87 break;
1479    
1480     case PT_PC:
1481     if ((ecode[2] != chartype) == (op == OP_PROP))
1482     RRETURN(MATCH_NOMATCH);
1483     break;
1484    
1485     case PT_SC:
1486     if ((ecode[2] != script) == (op == OP_PROP))
1487     RRETURN(MATCH_NOMATCH);
1488     break;
1489    
1490     default:
1491     RRETURN(PCRE_ERROR_INTERNAL);
1492 nigel 77 }
1493 nigel 87
1494     ecode += 3;
1495 nigel 77 }
1496     break;
1497    
1498     /* Match an extended Unicode sequence. We will get here only if the support
1499     is in the binary; otherwise a compile-time error occurs. */
1500    
1501     case OP_EXTUNI:
1502     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503     GETCHARINCTEST(c, eptr);
1504     {
1505 nigel 87 int chartype, script;
1506     int category = _pcre_ucp_findprop(c, &chartype, &script);
1507 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1508     while (eptr < md->end_subject)
1509     {
1510     int len = 1;
1511     if (!utf8) c = *eptr; else
1512     {
1513     GETCHARLEN(c, eptr, len);
1514     }
1515 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1516 nigel 77 if (category != ucp_M) break;
1517     eptr += len;
1518     }
1519     }
1520     ecode++;
1521     break;
1522     #endif
1523    
1524    
1525     /* Match a back reference, possibly repeatedly. Look past the end of the
1526     item to see if there is repeat information following. The code is similar
1527     to that for character classes, but repeated for efficiency. Then obey
1528     similar code to character type repeats - written out again for speed.
1529     However, if the referenced string is the empty string, always treat
1530     it as matched, any number of times (otherwise there could be infinite
1531     loops). */
1532    
1533     case OP_REF:
1534     {
1535     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1536     ecode += 3; /* Advance past item */
1537    
1538     /* If the reference is unset, set the length to be longer than the amount
1539     of subject left; this ensures that every attempt at a match fails. We
1540     can't just fail here, because of the possibility of quantifiers with zero
1541     minima. */
1542    
1543     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1544     md->end_subject - eptr + 1 :
1545     md->offset_vector[offset+1] - md->offset_vector[offset];
1546    
1547     /* Set up for repetition, or handle the non-repeated case */
1548    
1549     switch (*ecode)
1550     {
1551     case OP_CRSTAR:
1552     case OP_CRMINSTAR:
1553     case OP_CRPLUS:
1554     case OP_CRMINPLUS:
1555     case OP_CRQUERY:
1556     case OP_CRMINQUERY:
1557     c = *ecode++ - OP_CRSTAR;
1558     minimize = (c & 1) != 0;
1559     min = rep_min[c]; /* Pick up values from tables; */
1560     max = rep_max[c]; /* zero for max => infinity */
1561     if (max == 0) max = INT_MAX;
1562     break;
1563    
1564     case OP_CRRANGE:
1565     case OP_CRMINRANGE:
1566     minimize = (*ecode == OP_CRMINRANGE);
1567     min = GET2(ecode, 1);
1568     max = GET2(ecode, 3);
1569     if (max == 0) max = INT_MAX;
1570     ecode += 5;
1571     break;
1572    
1573     default: /* No repeat follows */
1574     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1575     eptr += length;
1576     continue; /* With the main loop */
1577     }
1578    
1579     /* If the length of the reference is zero, just continue with the
1580     main loop. */
1581    
1582     if (length == 0) continue;
1583    
1584     /* First, ensure the minimum number of matches are present. We get back
1585     the length of the reference string explicitly rather than passing the
1586     address of eptr, so that eptr can be a register variable. */
1587    
1588     for (i = 1; i <= min; i++)
1589     {
1590     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1591     eptr += length;
1592     }
1593    
1594     /* If min = max, continue at the same level without recursion.
1595     They are not both allowed to be zero. */
1596    
1597     if (min == max) continue;
1598    
1599     /* If minimizing, keep trying and advancing the pointer */
1600    
1601     if (minimize)
1602     {
1603     for (fi = min;; fi++)
1604     {
1605     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1606     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1607     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1608     RRETURN(MATCH_NOMATCH);
1609     eptr += length;
1610     }
1611     /* Control never gets here */
1612     }
1613    
1614     /* If maximizing, find the longest string and work backwards */
1615    
1616     else
1617     {
1618     pp = eptr;
1619     for (i = min; i < max; i++)
1620     {
1621     if (!match_ref(offset, eptr, length, md, ims)) break;
1622     eptr += length;
1623     }
1624     while (eptr >= pp)
1625     {
1626     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1627     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1628     eptr -= length;
1629     }
1630     RRETURN(MATCH_NOMATCH);
1631     }
1632     }
1633     /* Control never gets here */
1634    
1635    
1636    
1637     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1638     used when all the characters in the class have values in the range 0-255,
1639     and either the matching is caseful, or the characters are in the range
1640     0-127 when UTF-8 processing is enabled. The only difference between
1641     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1642     encountered.
1643    
1644     First, look past the end of the item to see if there is repeat information
1645     following. Then obey similar code to character type repeats - written out
1646     again for speed. */
1647    
1648     case OP_NCLASS:
1649     case OP_CLASS:
1650     {
1651     data = ecode + 1; /* Save for matching */
1652     ecode += 33; /* Advance past the item */
1653    
1654     switch (*ecode)
1655     {
1656     case OP_CRSTAR:
1657     case OP_CRMINSTAR:
1658     case OP_CRPLUS:
1659     case OP_CRMINPLUS:
1660     case OP_CRQUERY:
1661     case OP_CRMINQUERY:
1662     c = *ecode++ - OP_CRSTAR;
1663     minimize = (c & 1) != 0;
1664     min = rep_min[c]; /* Pick up values from tables; */
1665     max = rep_max[c]; /* zero for max => infinity */
1666     if (max == 0) max = INT_MAX;
1667     break;
1668    
1669     case OP_CRRANGE:
1670     case OP_CRMINRANGE:
1671     minimize = (*ecode == OP_CRMINRANGE);
1672     min = GET2(ecode, 1);
1673     max = GET2(ecode, 3);
1674     if (max == 0) max = INT_MAX;
1675     ecode += 5;
1676     break;
1677    
1678     default: /* No repeat follows */
1679     min = max = 1;
1680     break;
1681     }
1682    
1683     /* First, ensure the minimum number of matches are present. */
1684    
1685     #ifdef SUPPORT_UTF8
1686     /* UTF-8 mode */
1687     if (utf8)
1688     {
1689     for (i = 1; i <= min; i++)
1690     {
1691     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1692     GETCHARINC(c, eptr);
1693     if (c > 255)
1694     {
1695     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1696     }
1697     else
1698     {
1699     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1700     }
1701     }
1702     }
1703     else
1704     #endif
1705     /* Not UTF-8 mode */
1706     {
1707     for (i = 1; i <= min; i++)
1708     {
1709     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1710     c = *eptr++;
1711     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1712     }
1713     }
1714    
1715     /* If max == min we can continue with the main loop without the
1716     need to recurse. */
1717    
1718     if (min == max) continue;
1719    
1720     /* If minimizing, keep testing the rest of the expression and advancing
1721     the pointer while it matches the class. */
1722    
1723     if (minimize)
1724     {
1725     #ifdef SUPPORT_UTF8
1726     /* UTF-8 mode */
1727     if (utf8)
1728     {
1729     for (fi = min;; fi++)
1730     {
1731     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1732     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1733     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1734     GETCHARINC(c, eptr);
1735     if (c > 255)
1736     {
1737     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1738     }
1739     else
1740     {
1741     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1742     }
1743     }
1744     }
1745     else
1746     #endif
1747     /* Not UTF-8 mode */
1748     {
1749     for (fi = min;; fi++)
1750     {
1751     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1752     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1753     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1754     c = *eptr++;
1755     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1756     }
1757     }
1758     /* Control never gets here */
1759     }
1760    
1761     /* If maximizing, find the longest possible run, then work backwards. */
1762    
1763     else
1764     {
1765     pp = eptr;
1766    
1767     #ifdef SUPPORT_UTF8
1768     /* UTF-8 mode */
1769     if (utf8)
1770     {
1771     for (i = min; i < max; i++)
1772     {
1773     int len = 1;
1774     if (eptr >= md->end_subject) break;
1775     GETCHARLEN(c, eptr, len);
1776     if (c > 255)
1777     {
1778     if (op == OP_CLASS) break;
1779     }
1780     else
1781     {
1782     if ((data[c/8] & (1 << (c&7))) == 0) break;
1783     }
1784     eptr += len;
1785     }
1786     for (;;)
1787     {
1788     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1789     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1790     if (eptr-- == pp) break; /* Stop if tried at original pos */
1791     BACKCHAR(eptr);
1792     }
1793     }
1794     else
1795     #endif
1796     /* Not UTF-8 mode */
1797     {
1798     for (i = min; i < max; i++)
1799     {
1800     if (eptr >= md->end_subject) break;
1801     c = *eptr;
1802     if ((data[c/8] & (1 << (c&7))) == 0) break;
1803     eptr++;
1804     }
1805     while (eptr >= pp)
1806     {
1807     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1808 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1809 nigel 77 eptr--;
1810     }
1811     }
1812    
1813     RRETURN(MATCH_NOMATCH);
1814     }
1815     }
1816     /* Control never gets here */
1817    
1818    
1819     /* Match an extended character class. This opcode is encountered only
1820     in UTF-8 mode, because that's the only time it is compiled. */
1821    
1822     #ifdef SUPPORT_UTF8
1823     case OP_XCLASS:
1824     {
1825     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1826     ecode += GET(ecode, 1); /* Advance past the item */
1827    
1828     switch (*ecode)
1829     {
1830     case OP_CRSTAR:
1831     case OP_CRMINSTAR:
1832     case OP_CRPLUS:
1833     case OP_CRMINPLUS:
1834     case OP_CRQUERY:
1835     case OP_CRMINQUERY:
1836     c = *ecode++ - OP_CRSTAR;
1837     minimize = (c & 1) != 0;
1838     min = rep_min[c]; /* Pick up values from tables; */
1839     max = rep_max[c]; /* zero for max => infinity */
1840     if (max == 0) max = INT_MAX;
1841     break;
1842    
1843     case OP_CRRANGE:
1844     case OP_CRMINRANGE:
1845     minimize = (*ecode == OP_CRMINRANGE);
1846     min = GET2(ecode, 1);
1847     max = GET2(ecode, 3);
1848     if (max == 0) max = INT_MAX;
1849     ecode += 5;
1850     break;
1851    
1852     default: /* No repeat follows */
1853     min = max = 1;
1854     break;
1855     }
1856    
1857     /* First, ensure the minimum number of matches are present. */
1858    
1859     for (i = 1; i <= min; i++)
1860     {
1861     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1862     GETCHARINC(c, eptr);
1863     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1864     }
1865    
1866     /* If max == min we can continue with the main loop without the
1867     need to recurse. */
1868    
1869     if (min == max) continue;
1870    
1871     /* If minimizing, keep testing the rest of the expression and advancing
1872     the pointer while it matches the class. */
1873    
1874     if (minimize)
1875     {
1876     for (fi = min;; fi++)
1877     {
1878     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1879     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1880     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1881     GETCHARINC(c, eptr);
1882     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1883     }
1884     /* Control never gets here */
1885     }
1886    
1887     /* If maximizing, find the longest possible run, then work backwards. */
1888    
1889     else
1890     {
1891     pp = eptr;
1892     for (i = min; i < max; i++)
1893     {
1894     int len = 1;
1895     if (eptr >= md->end_subject) break;
1896     GETCHARLEN(c, eptr, len);
1897     if (!_pcre_xclass(c, data)) break;
1898     eptr += len;
1899     }
1900     for(;;)
1901     {
1902     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1903     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1904     if (eptr-- == pp) break; /* Stop if tried at original pos */
1905     BACKCHAR(eptr)
1906     }
1907     RRETURN(MATCH_NOMATCH);
1908     }
1909    
1910     /* Control never gets here */
1911     }
1912     #endif /* End of XCLASS */
1913    
1914     /* Match a single character, casefully */
1915    
1916     case OP_CHAR:
1917     #ifdef SUPPORT_UTF8
1918     if (utf8)
1919     {
1920     length = 1;
1921     ecode++;
1922     GETCHARLEN(fc, ecode, length);
1923     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1924     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1925     }
1926     else
1927     #endif
1928    
1929     /* Non-UTF-8 mode */
1930     {
1931     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1932     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1933     ecode += 2;
1934     }
1935     break;
1936    
1937     /* Match a single character, caselessly */
1938    
1939     case OP_CHARNC:
1940     #ifdef SUPPORT_UTF8
1941     if (utf8)
1942     {
1943     length = 1;
1944     ecode++;
1945     GETCHARLEN(fc, ecode, length);
1946    
1947     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1948    
1949     /* If the pattern character's value is < 128, we have only one byte, and
1950     can use the fast lookup table. */
1951    
1952     if (fc < 128)
1953     {
1954     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1955     }
1956    
1957     /* Otherwise we must pick up the subject character */
1958    
1959     else
1960     {
1961 nigel 93 unsigned int dc;
1962 nigel 77 GETCHARINC(dc, eptr);
1963     ecode += length;
1964    
1965     /* If we have Unicode property support, we can use it to test the other
1966 nigel 87 case of the character, if there is one. */
1967 nigel 77
1968     if (fc != dc)
1969     {
1970     #ifdef SUPPORT_UCP
1971 nigel 87 if (dc != _pcre_ucp_othercase(fc))
1972 nigel 77 #endif
1973     RRETURN(MATCH_NOMATCH);
1974     }
1975     }
1976     }
1977     else
1978     #endif /* SUPPORT_UTF8 */
1979    
1980     /* Non-UTF-8 mode */
1981     {
1982     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1983     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1984     ecode += 2;
1985     }
1986     break;
1987    
1988 nigel 93 /* Match a single character repeatedly. */
1989 nigel 77
1990     case OP_EXACT:
1991     min = max = GET2(ecode, 1);
1992     ecode += 3;
1993     goto REPEATCHAR;
1994    
1995 nigel 93 case OP_POSUPTO:
1996     possessive = TRUE;
1997     /* Fall through */
1998    
1999 nigel 77 case OP_UPTO:
2000     case OP_MINUPTO:
2001     min = 0;
2002     max = GET2(ecode, 1);
2003     minimize = *ecode == OP_MINUPTO;
2004     ecode += 3;
2005     goto REPEATCHAR;
2006    
2007 nigel 93 case OP_POSSTAR:
2008     possessive = TRUE;
2009     min = 0;
2010     max = INT_MAX;
2011     ecode++;
2012     goto REPEATCHAR;
2013    
2014     case OP_POSPLUS:
2015     possessive = TRUE;
2016     min = 1;
2017     max = INT_MAX;
2018     ecode++;
2019     goto REPEATCHAR;
2020    
2021     case OP_POSQUERY:
2022     possessive = TRUE;
2023     min = 0;
2024     max = 1;
2025     ecode++;
2026     goto REPEATCHAR;
2027    
2028 nigel 77 case OP_STAR:
2029     case OP_MINSTAR:
2030     case OP_PLUS:
2031     case OP_MINPLUS:
2032     case OP_QUERY:
2033     case OP_MINQUERY:
2034     c = *ecode++ - OP_STAR;
2035     minimize = (c & 1) != 0;
2036     min = rep_min[c]; /* Pick up values from tables; */
2037     max = rep_max[c]; /* zero for max => infinity */
2038     if (max == 0) max = INT_MAX;
2039    
2040     /* Common code for all repeated single-character matches. We can give
2041     up quickly if there are fewer than the minimum number of characters left in
2042     the subject. */
2043    
2044     REPEATCHAR:
2045     #ifdef SUPPORT_UTF8
2046     if (utf8)
2047     {
2048     length = 1;
2049     charptr = ecode;
2050     GETCHARLEN(fc, ecode, length);
2051     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2052     ecode += length;
2053    
2054     /* Handle multibyte character matching specially here. There is
2055     support for caseless matching if UCP support is present. */
2056    
2057     if (length > 1)
2058     {
2059     #ifdef SUPPORT_UCP
2060 nigel 93 unsigned int othercase;
2061 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2062 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2063 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2064 ph10 115 else oclength = 0;
2065 nigel 77 #endif /* SUPPORT_UCP */
2066    
2067     for (i = 1; i <= min; i++)
2068     {
2069     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2070 ph10 123 #ifdef SUPPORT_UCP
2071 nigel 77 /* Need braces because of following else */
2072     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2073     else
2074     {
2075     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2076     eptr += oclength;
2077     }
2078 ph10 115 #else /* without SUPPORT_UCP */
2079     else { RRETURN(MATCH_NOMATCH); }
2080 ph10 123 #endif /* SUPPORT_UCP */
2081 nigel 77 }
2082    
2083     if (min == max) continue;
2084    
2085     if (minimize)
2086     {
2087     for (fi = min;; fi++)
2088     {
2089     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2090     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2091     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2092     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2093 ph10 123 #ifdef SUPPORT_UCP
2094 nigel 77 /* Need braces because of following else */
2095     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2096     else
2097     {
2098     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2099     eptr += oclength;
2100     }
2101 ph10 115 #else /* without SUPPORT_UCP */
2102     else { RRETURN (MATCH_NOMATCH); }
2103     #endif /* SUPPORT_UCP */
2104 nigel 77 }
2105     /* Control never gets here */
2106     }
2107 nigel 93
2108     else /* Maximize */
2109 nigel 77 {
2110     pp = eptr;
2111     for (i = min; i < max; i++)
2112     {
2113     if (eptr > md->end_subject - length) break;
2114     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2115 ph10 123 #ifdef SUPPORT_UCP
2116 nigel 77 else if (oclength == 0) break;
2117     else
2118     {
2119     if (memcmp(eptr, occhars, oclength) != 0) break;
2120     eptr += oclength;
2121     }
2122 ph10 115 #else /* without SUPPORT_UCP */
2123     else break;
2124 ph10 123 #endif /* SUPPORT_UCP */
2125 nigel 77 }
2126 nigel 93
2127     if (possessive) continue;
2128 ph10 120 for(;;)
2129 nigel 77 {
2130     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2131     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2132 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2133 ph10 115 #ifdef SUPPORT_UCP
2134     eptr--;
2135     BACKCHAR(eptr);
2136 ph10 123 #else /* without SUPPORT_UCP */
2137 nigel 77 eptr -= length;
2138 ph10 123 #endif /* SUPPORT_UCP */
2139 nigel 77 }
2140     }
2141     /* Control never gets here */
2142     }
2143    
2144     /* If the length of a UTF-8 character is 1, we fall through here, and
2145     obey the code as for non-UTF-8 characters below, though in this case the
2146     value of fc will always be < 128. */
2147     }
2148     else
2149     #endif /* SUPPORT_UTF8 */
2150    
2151     /* When not in UTF-8 mode, load a single-byte character. */
2152     {
2153     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2154     fc = *ecode++;
2155     }
2156    
2157     /* The value of fc at this point is always less than 256, though we may or
2158     may not be in UTF-8 mode. The code is duplicated for the caseless and
2159     caseful cases, for speed, since matching characters is likely to be quite
2160     common. First, ensure the minimum number of matches are present. If min =
2161     max, continue at the same level without recursing. Otherwise, if
2162     minimizing, keep trying the rest of the expression and advancing one
2163     matching character if failing, up to the maximum. Alternatively, if
2164     maximizing, find the maximum number of characters and work backwards. */
2165    
2166     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2167     max, eptr));
2168    
2169     if ((ims & PCRE_CASELESS) != 0)
2170     {
2171     fc = md->lcc[fc];
2172     for (i = 1; i <= min; i++)
2173     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2174     if (min == max) continue;
2175     if (minimize)
2176     {
2177     for (fi = min;; fi++)
2178     {
2179     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2180     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2181     if (fi >= max || eptr >= md->end_subject ||
2182     fc != md->lcc[*eptr++])
2183     RRETURN(MATCH_NOMATCH);
2184     }
2185     /* Control never gets here */
2186     }
2187 nigel 93 else /* Maximize */
2188 nigel 77 {
2189     pp = eptr;
2190     for (i = min; i < max; i++)
2191     {
2192     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2193     eptr++;
2194     }
2195 nigel 93 if (possessive) continue;
2196 nigel 77 while (eptr >= pp)
2197     {
2198     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2199     eptr--;
2200     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2201     }
2202     RRETURN(MATCH_NOMATCH);
2203     }
2204     /* Control never gets here */
2205     }
2206    
2207     /* Caseful comparisons (includes all multi-byte characters) */
2208    
2209     else
2210     {
2211     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2212     if (min == max) continue;
2213     if (minimize)
2214     {
2215     for (fi = min;; fi++)
2216     {
2217     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2218     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2219     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2220     RRETURN(MATCH_NOMATCH);
2221     }
2222     /* Control never gets here */
2223     }
2224 nigel 93 else /* Maximize */
2225 nigel 77 {
2226     pp = eptr;
2227     for (i = min; i < max; i++)
2228     {
2229     if (eptr >= md->end_subject || fc != *eptr) break;
2230     eptr++;
2231     }
2232 nigel 93 if (possessive) continue;
2233 nigel 77 while (eptr >= pp)
2234     {
2235     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2236     eptr--;
2237     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2238     }
2239     RRETURN(MATCH_NOMATCH);
2240     }
2241     }
2242     /* Control never gets here */
2243    
2244     /* Match a negated single one-byte character. The character we are
2245     checking can be multibyte. */
2246    
2247     case OP_NOT:
2248     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2249     ecode++;
2250     GETCHARINCTEST(c, eptr);
2251     if ((ims & PCRE_CASELESS) != 0)
2252     {
2253     #ifdef SUPPORT_UTF8
2254     if (c < 256)
2255     #endif
2256     c = md->lcc[c];
2257     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2258     }
2259     else
2260     {
2261     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2262     }
2263     break;
2264    
2265     /* Match a negated single one-byte character repeatedly. This is almost a
2266     repeat of the code for a repeated single character, but I haven't found a
2267     nice way of commoning these up that doesn't require a test of the
2268     positive/negative option for each character match. Maybe that wouldn't add
2269     very much to the time taken, but character matching *is* what this is all
2270     about... */
2271    
2272     case OP_NOTEXACT:
2273     min = max = GET2(ecode, 1);
2274     ecode += 3;
2275     goto REPEATNOTCHAR;
2276    
2277     case OP_NOTUPTO:
2278     case OP_NOTMINUPTO:
2279     min = 0;
2280     max = GET2(ecode, 1);
2281     minimize = *ecode == OP_NOTMINUPTO;
2282     ecode += 3;
2283     goto REPEATNOTCHAR;
2284    
2285 nigel 93 case OP_NOTPOSSTAR:
2286     possessive = TRUE;
2287     min = 0;
2288     max = INT_MAX;
2289     ecode++;
2290     goto REPEATNOTCHAR;
2291    
2292     case OP_NOTPOSPLUS:
2293     possessive = TRUE;
2294     min = 1;
2295     max = INT_MAX;
2296     ecode++;
2297     goto REPEATNOTCHAR;
2298    
2299     case OP_NOTPOSQUERY:
2300     possessive = TRUE;
2301     min = 0;
2302     max = 1;
2303     ecode++;
2304     goto REPEATNOTCHAR;
2305    
2306     case OP_NOTPOSUPTO:
2307     possessive = TRUE;
2308     min = 0;
2309     max = GET2(ecode, 1);
2310     ecode += 3;
2311     goto REPEATNOTCHAR;
2312    
2313 nigel 77 case OP_NOTSTAR:
2314     case OP_NOTMINSTAR:
2315     case OP_NOTPLUS:
2316     case OP_NOTMINPLUS:
2317     case OP_NOTQUERY:
2318     case OP_NOTMINQUERY:
2319     c = *ecode++ - OP_NOTSTAR;
2320     minimize = (c & 1) != 0;
2321     min = rep_min[c]; /* Pick up values from tables; */
2322     max = rep_max[c]; /* zero for max => infinity */
2323     if (max == 0) max = INT_MAX;
2324    
2325     /* Common code for all repeated single-byte matches. We can give up quickly
2326     if there are fewer than the minimum number of bytes left in the
2327     subject. */
2328    
2329     REPEATNOTCHAR:
2330     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2331     fc = *ecode++;
2332    
2333     /* The code is duplicated for the caseless and caseful cases, for speed,
2334     since matching characters is likely to be quite common. First, ensure the
2335     minimum number of matches are present. If min = max, continue at the same
2336     level without recursing. Otherwise, if minimizing, keep trying the rest of
2337     the expression and advancing one matching character if failing, up to the
2338     maximum. Alternatively, if maximizing, find the maximum number of
2339     characters and work backwards. */
2340    
2341     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2342     max, eptr));
2343    
2344     if ((ims & PCRE_CASELESS) != 0)
2345     {
2346     fc = md->lcc[fc];
2347    
2348     #ifdef SUPPORT_UTF8
2349     /* UTF-8 mode */
2350     if (utf8)
2351     {
2352 nigel 93 register unsigned int d;
2353 nigel 77 for (i = 1; i <= min; i++)
2354     {
2355     GETCHARINC(d, eptr);
2356     if (d < 256) d = md->lcc[d];
2357     if (fc == d) RRETURN(MATCH_NOMATCH);
2358     }
2359     }
2360     else
2361     #endif
2362    
2363     /* Not UTF-8 mode */
2364     {
2365     for (i = 1; i <= min; i++)
2366     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2367     }
2368    
2369     if (min == max) continue;
2370    
2371     if (minimize)
2372     {
2373     #ifdef SUPPORT_UTF8
2374     /* UTF-8 mode */
2375     if (utf8)
2376     {
2377 nigel 93 register unsigned int d;
2378 nigel 77 for (fi = min;; fi++)
2379     {
2380     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2381     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2382     GETCHARINC(d, eptr);
2383     if (d < 256) d = md->lcc[d];
2384     if (fi >= max || eptr >= md->end_subject || fc == d)
2385     RRETURN(MATCH_NOMATCH);
2386     }
2387     }
2388     else
2389     #endif
2390     /* Not UTF-8 mode */
2391     {
2392     for (fi = min;; fi++)
2393     {
2394     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2395     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2396     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2397     RRETURN(MATCH_NOMATCH);
2398     }
2399     }
2400     /* Control never gets here */
2401     }
2402    
2403     /* Maximize case */
2404    
2405     else
2406     {
2407     pp = eptr;
2408    
2409     #ifdef SUPPORT_UTF8
2410     /* UTF-8 mode */
2411     if (utf8)
2412     {
2413 nigel 93 register unsigned int d;
2414 nigel 77 for (i = min; i < max; i++)
2415     {
2416     int len = 1;
2417     if (eptr >= md->end_subject) break;
2418     GETCHARLEN(d, eptr, len);
2419     if (d < 256) d = md->lcc[d];
2420     if (fc == d) break;
2421     eptr += len;
2422     }
2423 nigel 93 if (possessive) continue;
2424     for(;;)
2425 nigel 77 {
2426     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2427     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2428     if (eptr-- == pp) break; /* Stop if tried at original pos */
2429     BACKCHAR(eptr);
2430     }
2431     }
2432     else
2433     #endif
2434     /* Not UTF-8 mode */
2435     {
2436     for (i = min; i < max; i++)
2437     {
2438     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2439     eptr++;
2440     }
2441 nigel 93 if (possessive) continue;
2442 nigel 77 while (eptr >= pp)
2443     {
2444     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2445     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2446     eptr--;
2447     }
2448     }
2449    
2450     RRETURN(MATCH_NOMATCH);
2451     }
2452     /* Control never gets here */
2453     }
2454    
2455     /* Caseful comparisons */
2456    
2457     else
2458     {
2459     #ifdef SUPPORT_UTF8
2460     /* UTF-8 mode */
2461     if (utf8)
2462     {
2463 nigel 93 register unsigned int d;
2464 nigel 77 for (i = 1; i <= min; i++)
2465     {
2466     GETCHARINC(d, eptr);
2467     if (fc == d) RRETURN(MATCH_NOMATCH);
2468     }
2469     }
2470     else
2471     #endif
2472     /* Not UTF-8 mode */
2473     {
2474     for (i = 1; i <= min; i++)
2475     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2476     }
2477    
2478     if (min == max) continue;
2479    
2480     if (minimize)
2481     {
2482     #ifdef SUPPORT_UTF8
2483     /* UTF-8 mode */
2484     if (utf8)
2485     {
2486 nigel 93 register unsigned int d;
2487 nigel 77 for (fi = min;; fi++)
2488     {
2489     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2490     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2491     GETCHARINC(d, eptr);
2492     if (fi >= max || eptr >= md->end_subject || fc == d)
2493     RRETURN(MATCH_NOMATCH);
2494     }
2495     }
2496     else
2497     #endif
2498     /* Not UTF-8 mode */
2499     {
2500     for (fi = min;; fi++)
2501     {
2502     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2503     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2504     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2505     RRETURN(MATCH_NOMATCH);
2506     }
2507     }
2508     /* Control never gets here */
2509     }
2510    
2511     /* Maximize case */
2512    
2513     else
2514     {
2515     pp = eptr;
2516    
2517     #ifdef SUPPORT_UTF8
2518     /* UTF-8 mode */
2519     if (utf8)
2520     {
2521 nigel 93 register unsigned int d;
2522 nigel 77 for (i = min; i < max; i++)
2523     {
2524     int len = 1;
2525     if (eptr >= md->end_subject) break;
2526     GETCHARLEN(d, eptr, len);
2527     if (fc == d) break;
2528     eptr += len;
2529     }
2530 nigel 93 if (possessive) continue;
2531 nigel 77 for(;;)
2532     {
2533     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2534     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2535     if (eptr-- == pp) break; /* Stop if tried at original pos */
2536     BACKCHAR(eptr);
2537     }
2538     }
2539     else
2540     #endif
2541     /* Not UTF-8 mode */
2542     {
2543     for (i = min; i < max; i++)
2544     {
2545     if (eptr >= md->end_subject || fc == *eptr) break;
2546     eptr++;
2547     }
2548 nigel 93 if (possessive) continue;
2549 nigel 77 while (eptr >= pp)
2550     {
2551     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2552     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2553     eptr--;
2554     }
2555     }
2556    
2557     RRETURN(MATCH_NOMATCH);
2558     }
2559     }
2560     /* Control never gets here */
2561    
2562     /* Match a single character type repeatedly; several different opcodes
2563     share code. This is very similar to the code for single characters, but we
2564     repeat it in the interests of efficiency. */
2565    
2566     case OP_TYPEEXACT:
2567     min = max = GET2(ecode, 1);
2568     minimize = TRUE;
2569     ecode += 3;
2570     goto REPEATTYPE;
2571    
2572     case OP_TYPEUPTO:
2573     case OP_TYPEMINUPTO:
2574     min = 0;
2575     max = GET2(ecode, 1);
2576     minimize = *ecode == OP_TYPEMINUPTO;
2577     ecode += 3;
2578     goto REPEATTYPE;
2579    
2580 nigel 93 case OP_TYPEPOSSTAR:
2581     possessive = TRUE;
2582     min = 0;
2583     max = INT_MAX;
2584     ecode++;
2585     goto REPEATTYPE;
2586    
2587     case OP_TYPEPOSPLUS:
2588     possessive = TRUE;
2589     min = 1;
2590     max = INT_MAX;
2591     ecode++;
2592     goto REPEATTYPE;
2593    
2594     case OP_TYPEPOSQUERY:
2595     possessive = TRUE;
2596     min = 0;
2597     max = 1;
2598     ecode++;
2599     goto REPEATTYPE;
2600    
2601     case OP_TYPEPOSUPTO:
2602     possessive = TRUE;
2603     min = 0;
2604     max = GET2(ecode, 1);
2605     ecode += 3;
2606     goto REPEATTYPE;
2607    
2608 nigel 77 case OP_TYPESTAR:
2609     case OP_TYPEMINSTAR:
2610     case OP_TYPEPLUS:
2611     case OP_TYPEMINPLUS:
2612     case OP_TYPEQUERY:
2613     case OP_TYPEMINQUERY:
2614     c = *ecode++ - OP_TYPESTAR;
2615     minimize = (c & 1) != 0;
2616     min = rep_min[c]; /* Pick up values from tables; */
2617     max = rep_max[c]; /* zero for max => infinity */
2618     if (max == 0) max = INT_MAX;
2619    
2620     /* Common code for all repeated single character type matches. Note that
2621     in UTF-8 mode, '.' matches a character of any length, but for the other
2622     character types, the valid characters are all one-byte long. */
2623    
2624     REPEATTYPE:
2625     ctype = *ecode++; /* Code for the character type */
2626    
2627     #ifdef SUPPORT_UCP
2628     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2629     {
2630     prop_fail_result = ctype == OP_NOTPROP;
2631     prop_type = *ecode++;
2632 nigel 87 prop_value = *ecode++;
2633 nigel 77 }
2634     else prop_type = -1;
2635     #endif
2636    
2637     /* First, ensure the minimum number of matches are present. Use inline
2638     code for maximizing the speed, and do the type test once at the start
2639     (i.e. keep it out of the loop). Also we can test that there are at least
2640     the minimum number of bytes before we start. This isn't as effective in
2641     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2642     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2643     and single-bytes. */
2644    
2645     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2646     if (min > 0)
2647     {
2648     #ifdef SUPPORT_UCP
2649 nigel 87 if (prop_type >= 0)
2650 nigel 77 {
2651 nigel 87 switch(prop_type)
2652 nigel 77 {
2653 nigel 87 case PT_ANY:
2654     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2655     for (i = 1; i <= min; i++)
2656     {
2657     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2658     GETCHARINC(c, eptr);
2659     }
2660     break;
2661    
2662     case PT_LAMP:
2663     for (i = 1; i <= min; i++)
2664     {
2665     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2666     GETCHARINC(c, eptr);
2667     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2668     if ((prop_chartype == ucp_Lu ||
2669     prop_chartype == ucp_Ll ||
2670     prop_chartype == ucp_Lt) == prop_fail_result)
2671     RRETURN(MATCH_NOMATCH);
2672     }
2673     break;
2674    
2675     case PT_GC:
2676     for (i = 1; i <= min; i++)
2677     {
2678     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2679     GETCHARINC(c, eptr);
2680     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2681     if ((prop_category == prop_value) == prop_fail_result)
2682     RRETURN(MATCH_NOMATCH);
2683     }
2684     break;
2685    
2686     case PT_PC:
2687     for (i = 1; i <= min; i++)
2688     {
2689     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2690     GETCHARINC(c, eptr);
2691     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2692     if ((prop_chartype == prop_value) == prop_fail_result)
2693     RRETURN(MATCH_NOMATCH);
2694     }
2695     break;
2696    
2697     case PT_SC:
2698     for (i = 1; i <= min; i++)
2699     {
2700     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2701     GETCHARINC(c, eptr);
2702     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2703     if ((prop_script == prop_value) == prop_fail_result)
2704     RRETURN(MATCH_NOMATCH);
2705     }
2706     break;
2707    
2708     default:
2709     RRETURN(PCRE_ERROR_INTERNAL);
2710 nigel 77 }
2711     }
2712    
2713     /* Match extended Unicode sequences. We will get here only if the
2714     support is in the binary; otherwise a compile-time error occurs. */
2715    
2716     else if (ctype == OP_EXTUNI)
2717     {
2718     for (i = 1; i <= min; i++)
2719     {
2720     GETCHARINCTEST(c, eptr);
2721 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2722 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2723     while (eptr < md->end_subject)
2724     {
2725     int len = 1;
2726     if (!utf8) c = *eptr; else
2727     {
2728     GETCHARLEN(c, eptr, len);
2729     }
2730 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2731 nigel 77 if (prop_category != ucp_M) break;
2732     eptr += len;
2733     }
2734     }
2735     }
2736    
2737     else
2738     #endif /* SUPPORT_UCP */
2739    
2740     /* Handle all other cases when the coding is UTF-8 */
2741    
2742     #ifdef SUPPORT_UTF8
2743     if (utf8) switch(ctype)
2744     {
2745     case OP_ANY:
2746     for (i = 1; i <= min; i++)
2747     {
2748     if (eptr >= md->end_subject ||
2749 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2750 nigel 77 RRETURN(MATCH_NOMATCH);
2751 nigel 91 eptr++;
2752 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2753     }
2754     break;
2755    
2756     case OP_ANYBYTE:
2757     eptr += min;
2758     break;
2759    
2760 nigel 93 case OP_ANYNL:
2761     for (i = 1; i <= min; i++)
2762     {
2763     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2764     GETCHARINC(c, eptr);
2765     switch(c)
2766     {
2767     default: RRETURN(MATCH_NOMATCH);
2768     case 0x000d:
2769     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2770     break;
2771     case 0x000a:
2772     case 0x000b:
2773     case 0x000c:
2774     case 0x0085:
2775     case 0x2028:
2776     case 0x2029:
2777     break;
2778     }
2779     }
2780     break;
2781    
2782 nigel 77 case OP_NOT_DIGIT:
2783     for (i = 1; i <= min; i++)
2784     {
2785     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2786     GETCHARINC(c, eptr);
2787     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2788     RRETURN(MATCH_NOMATCH);
2789     }
2790     break;
2791    
2792     case OP_DIGIT:
2793     for (i = 1; i <= min; i++)
2794     {
2795     if (eptr >= md->end_subject ||
2796     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2797     RRETURN(MATCH_NOMATCH);
2798     /* No need to skip more bytes - we know it's a 1-byte character */
2799     }
2800     break;
2801    
2802     case OP_NOT_WHITESPACE:
2803     for (i = 1; i <= min; i++)
2804     {
2805     if (eptr >= md->end_subject ||
2806     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2807     RRETURN(MATCH_NOMATCH);
2808     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2809     }
2810     break;
2811    
2812     case OP_WHITESPACE:
2813     for (i = 1; i <= min; i++)
2814     {
2815     if (eptr >= md->end_subject ||
2816     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2817     RRETURN(MATCH_NOMATCH);
2818     /* No need to skip more bytes - we know it's a 1-byte character */
2819     }
2820     break;
2821    
2822     case OP_NOT_WORDCHAR:
2823     for (i = 1; i <= min; i++)
2824     {
2825     if (eptr >= md->end_subject ||
2826     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2827     RRETURN(MATCH_NOMATCH);
2828     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2829     }
2830     break;
2831    
2832     case OP_WORDCHAR:
2833     for (i = 1; i <= min; i++)
2834     {
2835     if (eptr >= md->end_subject ||
2836     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2837     RRETURN(MATCH_NOMATCH);
2838     /* No need to skip more bytes - we know it's a 1-byte character */
2839     }
2840     break;
2841    
2842     default:
2843     RRETURN(PCRE_ERROR_INTERNAL);
2844     } /* End switch(ctype) */
2845    
2846     else
2847     #endif /* SUPPORT_UTF8 */
2848    
2849     /* Code for the non-UTF-8 case for minimum matching of operators other
2850 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2851     number of bytes present, as this was tested above. */
2852 nigel 77
2853     switch(ctype)
2854     {
2855     case OP_ANY:
2856     if ((ims & PCRE_DOTALL) == 0)
2857     {
2858     for (i = 1; i <= min; i++)
2859 nigel 91 {
2860 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2861 nigel 91 eptr++;
2862     }
2863 nigel 77 }
2864     else eptr += min;
2865     break;
2866    
2867     case OP_ANYBYTE:
2868     eptr += min;
2869     break;
2870    
2871 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
2872     bytes are present in this case. */
2873    
2874     case OP_ANYNL:
2875     for (i = 1; i <= min; i++)
2876     {
2877     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2878     switch(*eptr++)
2879     {
2880     default: RRETURN(MATCH_NOMATCH);
2881     case 0x000d:
2882     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2883     break;
2884     case 0x000a:
2885     case 0x000b:
2886     case 0x000c:
2887     case 0x0085:
2888     break;
2889     }
2890     }
2891     break;
2892    
2893 nigel 77 case OP_NOT_DIGIT:
2894     for (i = 1; i <= min; i++)
2895     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2896     break;
2897    
2898     case OP_DIGIT:
2899     for (i = 1; i <= min; i++)
2900     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2901     break;
2902    
2903     case OP_NOT_WHITESPACE:
2904     for (i = 1; i <= min; i++)
2905     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2906     break;
2907    
2908     case OP_WHITESPACE:
2909     for (i = 1; i <= min; i++)
2910     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2911     break;
2912    
2913     case OP_NOT_WORDCHAR:
2914     for (i = 1; i <= min; i++)
2915     if ((md->ctypes[*eptr++] & ctype_word) != 0)
2916     RRETURN(MATCH_NOMATCH);
2917     break;
2918    
2919     case OP_WORDCHAR:
2920     for (i = 1; i <= min; i++)
2921     if ((md->ctypes[*eptr++] & ctype_word) == 0)
2922     RRETURN(MATCH_NOMATCH);
2923     break;
2924    
2925     default:
2926     RRETURN(PCRE_ERROR_INTERNAL);
2927     }
2928     }
2929    
2930     /* If min = max, continue at the same level without recursing */
2931    
2932     if (min == max) continue;
2933    
2934     /* If minimizing, we have to test the rest of the pattern before each
2935     subsequent match. Again, separate the UTF-8 case for speed, and also
2936     separate the UCP cases. */
2937    
2938     if (minimize)
2939     {
2940     #ifdef SUPPORT_UCP
2941 nigel 87 if (prop_type >= 0)
2942 nigel 77 {
2943 nigel 87 switch(prop_type)
2944 nigel 77 {
2945 nigel 87 case PT_ANY:
2946     for (fi = min;; fi++)
2947     {
2948     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2949     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2950     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951     GETCHARINC(c, eptr);
2952     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2953     }
2954 nigel 93 /* Control never gets here */
2955 nigel 87
2956     case PT_LAMP:
2957     for (fi = min;; fi++)
2958     {
2959     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2960     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2961     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962     GETCHARINC(c, eptr);
2963     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2964     if ((prop_chartype == ucp_Lu ||
2965     prop_chartype == ucp_Ll ||
2966     prop_chartype == ucp_Lt) == prop_fail_result)
2967     RRETURN(MATCH_NOMATCH);
2968     }
2969 nigel 93 /* Control never gets here */
2970 nigel 87
2971     case PT_GC:
2972     for (fi = min;; fi++)
2973     {
2974     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2975     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2976     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2977     GETCHARINC(c, eptr);
2978     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2979     if ((prop_category == prop_value) == prop_fail_result)
2980     RRETURN(MATCH_NOMATCH);
2981     }
2982 nigel 93 /* Control never gets here */
2983 nigel 87
2984     case PT_PC:
2985     for (fi = min;; fi++)
2986     {
2987     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2988     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2989     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2990     GETCHARINC(c, eptr);
2991     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2992     if ((prop_chartype == prop_value) == prop_fail_result)
2993     RRETURN(MATCH_NOMATCH);
2994     }
2995 nigel 93 /* Control never gets here */
2996 nigel 87
2997     case PT_SC:
2998     for (fi = min;; fi++)
2999     {
3000     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3001     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3002     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3003     GETCHARINC(c, eptr);
3004     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3005     if ((prop_script == prop_value) == prop_fail_result)
3006     RRETURN(MATCH_NOMATCH);
3007     }
3008 nigel 93 /* Control never gets here */
3009 nigel 87
3010     default:
3011     RRETURN(PCRE_ERROR_INTERNAL);
3012 nigel 77 }
3013     }
3014    
3015     /* Match extended Unicode sequences. We will get here only if the
3016     support is in the binary; otherwise a compile-time error occurs. */
3017    
3018     else if (ctype == OP_EXTUNI)
3019     {
3020     for (fi = min;; fi++)
3021     {
3022     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3023     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3024     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3025     GETCHARINCTEST(c, eptr);
3026 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3027 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3028     while (eptr < md->end_subject)
3029     {
3030     int len = 1;
3031     if (!utf8) c = *eptr; else
3032     {
3033     GETCHARLEN(c, eptr, len);
3034     }
3035 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3036 nigel 77 if (prop_category != ucp_M) break;
3037     eptr += len;
3038     }
3039     }
3040     }
3041    
3042     else
3043     #endif /* SUPPORT_UCP */
3044    
3045     #ifdef SUPPORT_UTF8
3046     /* UTF-8 mode */
3047     if (utf8)
3048     {
3049     for (fi = min;; fi++)
3050     {
3051     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3052     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3053 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3054     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3055 nigel 93 IS_NEWLINE(eptr)))
3056 nigel 91 RRETURN(MATCH_NOMATCH);
3057 nigel 77
3058     GETCHARINC(c, eptr);
3059     switch(ctype)
3060     {
3061 nigel 91 case OP_ANY: /* This is the DOTALL case */
3062 nigel 77 break;
3063    
3064     case OP_ANYBYTE:
3065     break;
3066    
3067 nigel 93 case OP_ANYNL:
3068     switch(c)
3069     {
3070     default: RRETURN(MATCH_NOMATCH);
3071     case 0x000d:
3072     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3073     break;
3074     case 0x000a:
3075     case 0x000b:
3076     case 0x000c:
3077     case 0x0085:
3078     case 0x2028:
3079     case 0x2029:
3080     break;
3081     }
3082     break;
3083    
3084 nigel 77 case OP_NOT_DIGIT:
3085     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3086     RRETURN(MATCH_NOMATCH);
3087     break;
3088    
3089     case OP_DIGIT:
3090     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3091     RRETURN(MATCH_NOMATCH);
3092     break;
3093    
3094     case OP_NOT_WHITESPACE:
3095     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3096     RRETURN(MATCH_NOMATCH);
3097     break;
3098    
3099     case OP_WHITESPACE:
3100     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3101     RRETURN(MATCH_NOMATCH);
3102     break;
3103    
3104     case OP_NOT_WORDCHAR:
3105     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3106     RRETURN(MATCH_NOMATCH);
3107     break;
3108    
3109     case OP_WORDCHAR:
3110     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3111     RRETURN(MATCH_NOMATCH);
3112     break;
3113    
3114     default:
3115     RRETURN(PCRE_ERROR_INTERNAL);
3116     }
3117     }
3118     }
3119     else
3120     #endif
3121     /* Not UTF-8 mode */
3122     {
3123     for (fi = min;; fi++)
3124     {
3125     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3126     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3127 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3128 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3129 nigel 91 RRETURN(MATCH_NOMATCH);
3130    
3131 nigel 77 c = *eptr++;
3132     switch(ctype)
3133     {
3134 nigel 91 case OP_ANY: /* This is the DOTALL case */
3135 nigel 77 break;
3136    
3137     case OP_ANYBYTE:
3138     break;
3139    
3140 nigel 93 case OP_ANYNL:
3141     switch(c)
3142     {
3143     default: RRETURN(MATCH_NOMATCH);
3144     case 0x000d:
3145     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3146     break;
3147     case 0x000a:
3148     case 0x000b:
3149     case 0x000c:
3150     case 0x0085:
3151     break;
3152     }
3153     break;
3154    
3155 nigel 77 case OP_NOT_DIGIT:
3156     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3157     break;
3158    
3159     case OP_DIGIT:
3160     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3161     break;
3162    
3163     case OP_NOT_WHITESPACE:
3164     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3165     break;
3166    
3167     case OP_WHITESPACE:
3168     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3169     break;
3170    
3171     case OP_NOT_WORDCHAR:
3172     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3173     break;
3174    
3175     case OP_WORDCHAR:
3176     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3177     break;
3178    
3179     default:
3180     RRETURN(PCRE_ERROR_INTERNAL);
3181     }
3182     }
3183     }
3184     /* Control never gets here */
3185     }
3186    
3187 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3188 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3189     UTF-8 and UCP stuff separate. */
3190    
3191     else
3192     {
3193     pp = eptr; /* Remember where we started */
3194    
3195     #ifdef SUPPORT_UCP
3196 nigel 87 if (prop_type >= 0)
3197 nigel 77 {
3198 nigel 87 switch(prop_type)
3199 nigel 77 {
3200 nigel 87 case PT_ANY:
3201     for (i = min; i < max; i++)
3202     {
3203     int len = 1;
3204     if (eptr >= md->end_subject) break;
3205     GETCHARLEN(c, eptr, len);
3206     if (prop_fail_result) break;
3207     eptr+= len;
3208     }
3209     break;
3210    
3211     case PT_LAMP:
3212     for (i = min; i < max; i++)
3213     {
3214     int len = 1;
3215     if (eptr >= md->end_subject) break;
3216     GETCHARLEN(c, eptr, len);
3217     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3218     if ((prop_chartype == ucp_Lu ||
3219     prop_chartype == ucp_Ll ||
3220     prop_chartype == ucp_Lt) == prop_fail_result)
3221     break;
3222     eptr+= len;
3223     }
3224     break;
3225    
3226     case PT_GC:
3227     for (i = min; i < max; i++)
3228     {
3229     int len = 1;
3230     if (eptr >= md->end_subject) break;
3231     GETCHARLEN(c, eptr, len);
3232     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3233     if ((prop_category == prop_value) == prop_fail_result)
3234     break;
3235     eptr+= len;
3236     }
3237     break;
3238    
3239     case PT_PC:
3240     for (i = min; i < max; i++)
3241     {
3242     int len = 1;
3243     if (eptr >= md->end_subject) break;
3244     GETCHARLEN(c, eptr, len);
3245     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3246     if ((prop_chartype == prop_value) == prop_fail_result)
3247     break;
3248     eptr+= len;
3249     }
3250     break;
3251    
3252     case PT_SC:
3253     for (i = min; i < max; i++)
3254     {
3255     int len = 1;
3256     if (eptr >= md->end_subject) break;
3257     GETCHARLEN(c, eptr, len);
3258     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3259     if ((prop_script == prop_value) == prop_fail_result)
3260     break;
3261     eptr+= len;
3262     }
3263     break;
3264 nigel 77 }
3265    
3266     /* eptr is now past the end of the maximum run */
3267    
3268 nigel 93 if (possessive) continue;
3269 nigel 77 for(;;)
3270     {
3271     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3272     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3273     if (eptr-- == pp) break; /* Stop if tried at original pos */
3274     BACKCHAR(eptr);
3275     }
3276     }
3277    
3278     /* Match extended Unicode sequences. We will get here only if the
3279     support is in the binary; otherwise a compile-time error occurs. */
3280    
3281     else if (ctype == OP_EXTUNI)
3282     {
3283     for (i = min; i < max; i++)
3284     {
3285     if (eptr >= md->end_subject) break;
3286     GETCHARINCTEST(c, eptr);
3287 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3288 nigel 77 if (prop_category == ucp_M) break;
3289     while (eptr < md->end_subject)
3290     {
3291     int len = 1;
3292     if (!utf8) c = *eptr; else
3293     {
3294     GETCHARLEN(c, eptr, len);
3295     }
3296 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3297 nigel 77 if (prop_category != ucp_M) break;
3298     eptr += len;
3299     }
3300     }
3301    
3302     /* eptr is now past the end of the maximum run */
3303    
3304 nigel 93 if (possessive) continue;
3305 nigel 77 for(;;)
3306     {
3307     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3308     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3309     if (eptr-- == pp) break; /* Stop if tried at original pos */
3310     for (;;) /* Move back over one extended */
3311     {
3312     int len = 1;
3313     BACKCHAR(eptr);
3314     if (!utf8) c = *eptr; else
3315     {
3316     GETCHARLEN(c, eptr, len);
3317     }
3318 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3319 nigel 77 if (prop_category != ucp_M) break;
3320     eptr--;
3321     }
3322     }
3323     }
3324    
3325     else
3326     #endif /* SUPPORT_UCP */
3327    
3328     #ifdef SUPPORT_UTF8
3329     /* UTF-8 mode */
3330    
3331     if (utf8)
3332     {
3333     switch(ctype)
3334     {
3335     case OP_ANY:
3336    
3337 nigel 91 /* Special code is required for UTF8, but when the maximum is
3338     unlimited we don't need it, so we repeat the non-UTF8 code. This is
3339     probably worth it, because .* is quite a common idiom. */
3340 nigel 77
3341     if (max < INT_MAX)
3342     {
3343     if ((ims & PCRE_DOTALL) == 0)
3344     {
3345     for (i = min; i < max; i++)
3346     {
3347 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3348 nigel 77 eptr++;
3349     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3350     }
3351     }
3352     else
3353     {
3354     for (i = min; i < max; i++)
3355     {
3356 nigel 91 if (eptr >= md->end_subject) break;
3357 nigel 77 eptr++;
3358     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3359     }
3360     }
3361     }
3362    
3363     /* Handle unlimited UTF-8 repeat */
3364    
3365     else
3366     {
3367     if ((ims & PCRE_DOTALL) == 0)
3368     {
3369     for (i = min; i < max; i++)
3370     {
3371 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3372 nigel 77 eptr++;
3373     }
3374     break;
3375     }
3376     else
3377     {
3378     c = max - min;
3379 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3380     c = md->end_subject - eptr;
3381 nigel 77 eptr += c;
3382     }
3383     }
3384     break;
3385    
3386     /* The byte case is the same as non-UTF8 */
3387    
3388     case OP_ANYBYTE:
3389     c = max - min;
3390 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3391     c = md->end_subject - eptr;
3392 nigel 77 eptr += c;
3393     break;
3394    
3395 nigel 93 case OP_ANYNL:
3396     for (i = min; i < max; i++)
3397     {
3398     int len = 1;
3399     if (eptr >= md->end_subject) break;
3400     GETCHARLEN(c, eptr, len);
3401     if (c == 0x000d)
3402     {
3403     if (++eptr >= md->end_subject) break;
3404     if (*eptr == 0x000a) eptr++;
3405     }
3406     else
3407     {
3408     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3409     c != 0x0085 && c != 0x2028 && c != 0x2029)
3410     break;
3411     eptr += len;
3412     }
3413     }
3414     break;
3415    
3416 nigel 77 case OP_NOT_DIGIT:
3417     for (i = min; i < max; i++)
3418     {
3419     int len = 1;
3420     if (eptr >= md->end_subject) break;
3421     GETCHARLEN(c, eptr, len);
3422     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3423     eptr+= len;
3424     }
3425     break;
3426    
3427     case OP_DIGIT:
3428     for (i = min; i < max; i++)
3429     {
3430     int len = 1;
3431     if (eptr >= md->end_subject) break;
3432     GETCHARLEN(c, eptr, len);
3433     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3434     eptr+= len;
3435     }
3436     break;
3437    
3438     case OP_NOT_WHITESPACE:
3439     for (i = min; i < max; i++)
3440     {
3441     int len = 1;
3442     if (eptr >= md->end_subject) break;
3443     GETCHARLEN(c, eptr, len);
3444     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3445     eptr+= len;
3446     }
3447     break;
3448    
3449     case OP_WHITESPACE:
3450     for (i = min; i < max; i++)
3451     {
3452     int len = 1;
3453     if (eptr >= md->end_subject) break;
3454     GETCHARLEN(c, eptr, len);
3455     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3456     eptr+= len;
3457     }
3458     break;
3459    
3460     case OP_NOT_WORDCHAR:
3461     for (i = min; i < max; i++)
3462     {
3463     int len = 1;
3464     if (eptr >= md->end_subject) break;
3465     GETCHARLEN(c, eptr, len);
3466     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3467     eptr+= len;
3468     }
3469     break;
3470    
3471     case OP_WORDCHAR:
3472     for (i = min; i < max; i++)
3473     {
3474     int len = 1;
3475     if (eptr >= md->end_subject) break;
3476     GETCHARLEN(c, eptr, len);
3477     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3478     eptr+= len;
3479     }
3480     break;
3481    
3482     default:
3483     RRETURN(PCRE_ERROR_INTERNAL);
3484     }
3485    
3486     /* eptr is now past the end of the maximum run */
3487    
3488 nigel 93 if (possessive) continue;
3489 nigel 77 for(;;)
3490     {
3491     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3492     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3493     if (eptr-- == pp) break; /* Stop if tried at original pos */
3494     BACKCHAR(eptr);
3495     }
3496     }
3497     else
3498     #endif
3499    
3500     /* Not UTF-8 mode */
3501     {
3502     switch(ctype)
3503     {
3504     case OP_ANY:
3505     if ((ims & PCRE_DOTALL) == 0)
3506     {
3507     for (i = min; i < max; i++)
3508     {
3509 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3510 nigel 77 eptr++;
3511     }
3512     break;
3513     }
3514     /* For DOTALL case, fall through and treat as \C */
3515    
3516     case OP_ANYBYTE:
3517     c = max - min;
3518 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3519     c = md->end_subject - eptr;
3520 nigel 77 eptr += c;
3521     break;
3522    
3523 nigel 93 case OP_ANYNL:
3524     for (i = min; i < max; i++)
3525     {
3526     if (eptr >= md->end_subject) break;
3527     c = *eptr;
3528     if (c == 0x000d)
3529     {
3530     if (++eptr >= md->end_subject) break;
3531     if (*eptr == 0x000a) eptr++;
3532     }
3533     else
3534     {
3535     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3536     break;
3537     eptr++;
3538     }
3539     }
3540     break;
3541    
3542 nigel 77 case OP_NOT_DIGIT:
3543     for (i = min; i < max; i++)
3544     {
3545     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3546     break;
3547     eptr++;
3548     }
3549     break;
3550    
3551     case OP_DIGIT:
3552     for (i = min; i < max; i++)
3553     {
3554     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3555     break;
3556     eptr++;
3557     }
3558     break;
3559    
3560     case OP_NOT_WHITESPACE:
3561     for (i = min; i < max; i++)
3562     {
3563     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3564     break;
3565     eptr++;
3566     }
3567     break;
3568    
3569     case OP_WHITESPACE:
3570     for (i = min; i < max; i++)
3571     {
3572     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3573     break;
3574     eptr++;
3575     }
3576     break;
3577    
3578     case OP_NOT_WORDCHAR:
3579     for (i = min; i < max; i++)
3580     {
3581     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3582     break;
3583     eptr++;
3584     }
3585     break;
3586    
3587     case OP_WORDCHAR:
3588     for (i = min; i < max; i++)
3589     {
3590     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3591     break;
3592     eptr++;
3593     }
3594     break;
3595    
3596     default:
3597     RRETURN(PCRE_ERROR_INTERNAL);
3598     }
3599    
3600     /* eptr is now past the end of the maximum run */
3601    
3602 nigel 93 if (possessive) continue;
3603 nigel 77 while (eptr >= pp)
3604     {
3605     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3606     eptr--;
3607     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3608     }
3609     }
3610    
3611     /* Get here if we can't make it match with any permitted repetitions */
3612    
3613     RRETURN(MATCH_NOMATCH);
3614     }
3615     /* Control never gets here */
3616    
3617 nigel 93 /* There's been some horrible disaster. Arrival here can only mean there is
3618     something seriously wrong in the code above or the OP_xxx definitions. */
3619 nigel 77
3620     default:
3621     DPRINTF(("Unknown opcode %d\n", *ecode));
3622 nigel 93 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3623 nigel 77 }
3624    
3625     /* Do not stick any code in here without much thought; it is assumed
3626     that "continue" in the code above comes out to here to repeat the main
3627     loop. */
3628    
3629     } /* End of main loop */
3630     /* Control never reaches here */
3631     }
3632    
3633    
3634     /***************************************************************************
3635     ****************************************************************************
3636     RECURSION IN THE match() FUNCTION
3637    
3638     Undefine all the macros that were defined above to handle this. */
3639    
3640     #ifdef NO_RECURSE /* the names in this group are macros only in this build */
3641     #undef eptr /* eptr, ecode, offset_top, ims, eptrb are passed in the RMATCH() calls above */
3642     #undef ecode
3643     #undef offset_top
3644     #undef ims
3645     #undef eptrb
3646     #undef flags /* presumably the remaining match() argument -- confirm at its #define */
3647     /* Presumably match()'s working variables; pp is used above to remember where a maximizing repeat started. */
3648     #undef callpat
3649     #undef charptr
3650     #undef data
3651     #undef next
3652     #undef pp
3653     #undef prev
3654     #undef saved_eptr
3655    
3656     #undef new_recursive
3657    
3658     #undef cur_is_word
3659     #undef condition
3660     #undef prev_is_word
3661    
3662     #undef original_ims
3663     /* ctype, min and max are the repeat type and bounds used by the repeat code above; the others are presumably further temporaries -- confirm. */
3664     #undef ctype
3665     #undef length
3666     #undef max
3667     #undef min
3668     #undef number
3669     #undef offset
3670     #undef op
3671     #undef save_capture_last
3672     #undef save_offset1
3673     #undef save_offset2
3674     #undef save_offset3
3675     #undef stacksave
3676    
3677     #undef newptrb
3678    
3679     #endif /* NO_RECURSE */
3680    
3681     /* These two are defined as macros in both cases, i.e. with or without NO_RECURSE, so they are undefined unconditionally. */
3682    
3683     #undef fc
3684     #undef fi /* fi is the iteration counter of the minimizing repeat loops above */
3685    
3686     /***************************************************************************
3687     ***************************************************************************/
3688    
3689    
3690    
3691     /*************************************************
3692     * Execute a Regular Expression *
3693     *************************************************/
3694    
3695     /* This function applies a compiled re to a subject string and picks out
3696     portions of the string if it matches. Two elements in the vector are set for
3697     each substring: the offsets to the start and end of the substring.
3698    
3699     Arguments:
3700     argument_re points to the compiled expression
3701     extra_data points to extra data or is NULL
3702     subject points to the subject string
3703     length length of subject string (may contain binary zeros)
3704     start_offset where to start in the subject string
3705     options option bits
3706     offsets points to a vector of ints to be filled in with offsets
3707     offsetcount the number of elements in the vector
3708    
3709     Returns: > 0 => success; value is the number of elements filled in
3710     = 0 => success, but offsets is not big enough
3711     -1 => failed to match
3712     < -1 => some kind of unexpected problem
3713     */
3714    
3715 ph10 145 PCRE_EXP_DEFN int
3716 nigel 77 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3717 nigel 87 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3718 nigel 77 int offsetcount)
3719     {
3720     int rc, resetcount, ocount;
3721     int first_byte = -1;
3722     int req_byte = -1;
3723     int req_byte2 = -1;
3724 nigel 91 int newline;
3725     unsigned long int ims;
3726 nigel 77 BOOL using_temporary_offsets = FALSE;
3727     BOOL anchored;
3728     BOOL startline;
3729     BOOL firstline;
3730     BOOL first_byte_caseless = FALSE;
3731     BOOL req_byte_caseless = FALSE;
3732 nigel 93 BOOL utf8;
3733 nigel 77 match_data match_block;
3734 nigel 91 match_data *md = &match_block;
3735 nigel 77 const uschar *tables;
3736     const uschar *start_bits = NULL;
3737 nigel 87 USPTR start_match = (USPTR)subject + start_offset;
3738     USPTR end_subject;
3739     USPTR req_byte_ptr = start_match - 1;
3740 nigel 93 eptrblock eptrchain[EPTR_WORK_SIZE];
3741 nigel 77
3742     pcre_study_data internal_study;
3743     const pcre_study_data *study;
3744    
3745     real_pcre internal_re;
3746     const real_pcre *external_re = (const real_pcre *)argument_re;
3747     const real_pcre *re = external_re;
3748    
3749     /* Plausibility checks */
3750    
3751     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3752     if (re == NULL || subject == NULL ||
3753     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3754     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3755    
3756     /* Fish out the optional data from the extra_data structure, first setting
3757     the default values. */
3758    
3759     study = NULL;
3760 nigel 91 md->match_limit = MATCH_LIMIT;
3761     md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3762     md->callout_data = NULL;
3763 nigel 77
3764     /* The table pointer is always in native byte order. */
3765    
3766     tables = external_re->tables;
3767    
3768     if (extra_data != NULL)
3769     {
3770     register unsigned int flags = extra_data->flags;
3771     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3772     study = (const pcre_study_data *)extra_data->study_data;
3773     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3774 nigel 91 md->match_limit = extra_data->match_limit;
3775 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3776 nigel 91 md->match_limit_recursion = extra_data->match_limit_recursion;
3777 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3778 nigel 91 md->callout_data = extra_data->callout_data;
3779 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3780     }
3781    
3782     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3783     is a feature that makes it possible to save compiled regex and re-use them
3784     in other programs later. */
3785    
3786     if (tables == NULL) tables = _pcre_default_tables;
3787    
3788     /* Check that the first field in the block is the magic number. If it is not,
3789     test for a regex that was compiled on a host of opposite endianness. If this is
3790     the case, flipped values are put in internal_re and internal_study if there was
3791     study data too. */
3792    
3793     if (re->magic_number != MAGIC_NUMBER)
3794     {
3795     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3796     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3797     if (study != NULL) study = &internal_study;
3798     }
3799    
3800     /* Set up other data */
3801    
3802     anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3803     startline = (re->options & PCRE_STARTLINE) != 0;
3804     firstline = (re->options & PCRE_FIRSTLINE) != 0;
3805    
3806     /* The code starts after the real_pcre block and the capture name table. */
3807    
3808 nigel 91 md->start_code = (const uschar *)external_re + re->name_table_offset +
3809 nigel 77 re->name_count * re->name_entry_size;
3810    
3811 nigel 91 md->start_subject = (USPTR)subject;
3812     md->start_offset = start_offset;
3813     md->end_subject = md->start_subject + length;
3814     end_subject = md->end_subject;
3815 nigel 77
3816 nigel 91 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3817 nigel 93 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3818 nigel 77
3819 nigel 91 md->notbol = (options & PCRE_NOTBOL) != 0;
3820     md->noteol = (options & PCRE_NOTEOL) != 0;
3821     md->notempty = (options & PCRE_NOTEMPTY) != 0;
3822     md->partial = (options & PCRE_PARTIAL) != 0;
3823     md->hitend = FALSE;
3824 nigel 77
3825 nigel 91 md->recursive = NULL; /* No recursion at top level */
3826 nigel 93 md->eptrchain = eptrchain; /* Make workspace generally available */
3827 nigel 77
3828 nigel 91 md->lcc = tables + lcc_offset;
3829     md->ctypes = tables + ctypes_offset;
3830 nigel 77
3831 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3832     nothing is set at run time, whatever was used at compile time applies. */
3833 nigel 91
3834 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3835 nigel 93 PCRE_NEWLINE_BITS)
3836 nigel 91 {
3837 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3838 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
3839     case PCRE_NEWLINE_LF: newline = '\n'; break;
3840     case PCRE_NEWLINE_CR+
3841     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3842 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3843     default: return PCRE_ERROR_BADNEWLINE;
3844 nigel 91 }
3845    
3846 nigel 93 if (newline < 0)
3847 nigel 91 {
3848 nigel 93 md->nltype = NLTYPE_ANY;
3849 nigel 91 }
3850     else
3851     {
3852 nigel 93 md->nltype = NLTYPE_FIXED;
3853     if (newline > 255)
3854     {
3855     md->nllen = 2;
3856     md->nl[0] = (newline >> 8) & 255;
3857     md->nl[1] = newline & 255;
3858     }
3859     else
3860     {
3861     md->nllen = 1;
3862     md->nl[0] = newline;
3863     }
3864 nigel 91 }
3865    
3866 nigel 77 /* Partial matching is supported only for a restricted set of regexes at the
3867     moment. */
3868    
3869 nigel 91 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3870 nigel 77 return PCRE_ERROR_BADPARTIAL;
3871    
3872     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3873     back the character offset. */
3874    
3875     #ifdef SUPPORT_UTF8
3876 nigel 93 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3877 nigel 77 {
3878     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3879     return PCRE_ERROR_BADUTF8;
3880     if (start_offset > 0 && start_offset < length)
3881     {
3882     int tb = ((uschar *)subject)[start_offset];
3883     if (tb > 127)
3884     {
3885     tb &= 0xc0;
3886     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3887     }
3888     }
3889     }
3890     #endif
3891    
3892     /* The ims options can vary during the matching as a result of the presence
3893     of (?ims) items in the pattern. They are kept in a local variable so that
3894     restoring at the exit of a group is easy. */
3895    
3896     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3897    
3898     /* If the expression has got more back references than the offsets supplied can
3899     hold, we get a temporary chunk of working store to use during the matching.
3900     Otherwise, we can use the vector supplied, rounding down its size to a multiple
3901     of 3. */
3902    
3903     ocount = offsetcount - (offsetcount % 3);
3904    
3905     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3906     {
3907     ocount = re->top_backref * 3 + 3;
3908 nigel 91 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3909     if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3910 nigel 77 using_temporary_offsets = TRUE;
3911     DPRINTF(("Got memory to hold back references\n"));
3912     }
3913 nigel 91 else md->offset_vector = offsets;
3914 nigel 77
3915 nigel 91 md->offset_end = ocount;
3916     md->offset_max = (2*ocount)/3;
3917     md->offset_overflow = FALSE;
3918     md->capture_last = -1;
3919 nigel 77
3920     /* Compute the minimum number of offsets that we need to reset each time. Doing
3921     this makes a huge difference to execution time when there aren't many brackets
3922     in the pattern. */
3923    
3924     resetcount = 2 + re->top_bracket * 2;
3925     if (resetcount > offsetcount) resetcount = ocount;
3926    
3927     /* Reset the working variable associated with each extraction. These should
3928     never be used unless previously set, but they get saved and restored, and so we
3929     initialize them to avoid reading uninitialized locations. */
3930    
3931 nigel 91 if (md->offset_vector != NULL)
3932 nigel 77 {
3933 nigel 91 register int *iptr = md->offset_vector + ocount;
3934 nigel 77 register int *iend = iptr - resetcount/2 + 1;
3935     while (--iptr >= iend) *iptr = -1;
3936     }
3937    
3938     /* Set up the first character to match, if available. The first_byte value is
3939     never set for an anchored regular expression, but the anchoring may be forced
3940     at run time, so we have to test for anchoring. The first char may be unset for
3941     an unanchored pattern, of course. If there's no first char and the pattern was
3942     studied, there may be a bitmap of possible first characters. */
3943    
3944     if (!anchored)
3945     {
3946     if ((re->options & PCRE_FIRSTSET) != 0)
3947     {
3948     first_byte = re->first_byte & 255;
3949     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3950 nigel 91 first_byte = md->lcc[first_byte];
3951 nigel 77 }
3952     else
3953     if (!startline && study != NULL &&
3954     (study->options & PCRE_STUDY_MAPPED) != 0)
3955     start_bits = study->start_bits;
3956     }
3957    
3958     /* For anchored or unanchored matches, there may be a "last known required
3959     character" set. */
3960    
3961     if ((re->options & PCRE_REQCHSET) != 0)
3962     {
3963     req_byte = re->req_byte & 255;
3964     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3965     req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3966     }
3967    
3968 nigel 93
3969     /* ==========================================================================*/
3970    
3971 nigel 77 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
3972     the loop runs just once. */
3973    
3974 nigel 93 for(;;)
3975 nigel 77 {
3976 nigel 87 USPTR save_end_subject = end_subject;
3977 nigel 77
3978     /* Reset the maximum number of extractions we might see. */
3979    
3980 nigel 91 if (md->offset_vector != NULL)
3981 nigel 77 {
3982 nigel 91 register int *iptr = md->offset_vector;
3983 nigel 77 register int *iend = iptr + resetcount;
3984     while (iptr < iend) *iptr++ = -1;
3985     }
3986    
3987     /* Advance to a unique first char if possible. If firstline is TRUE, the
3988     start of the match is constrained to the first line of a multiline string.
3989 nigel 93 That is, the match must be before or at the first newline. Implement this by
3990     temporarily adjusting end_subject so that we stop scanning at a newline. If
3991     the match fails at the newline, later code breaks this loop. */
3992 nigel 77
3993     if (firstline)
3994     {
3995 nigel 87 USPTR t = start_match;
3996 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3997 nigel 77 end_subject = t;
3998     }
3999    
4000     /* Now test for a unique first byte */
4001    
4002     if (first_byte >= 0)
4003     {
4004     if (first_byte_caseless)
4005     while (start_match < end_subject &&
4006 nigel 91 md->lcc[*start_match] != first_byte)
4007 nigel 77 start_match++;
4008     else
4009     while (start_match < end_subject && *start_match != first_byte)
4010     start_match++;
4011     }
4012    
4013 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
4014 nigel 77
4015     else if (startline)
4016     {
4017 nigel 93 if (start_match > md->start_subject + start_offset)
4018 nigel 77 {
4019 nigel 93 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4020 nigel 77 start_match++;
4021 ph10 134
4022 ph10 130 /* If we have just passed a CR and the newline option is ANY, and we are
4023     now at a LF, advance the match position by one more character. */
4024 ph10 134
4025 ph10 130 if (start_match[-1] == '\r' &&
4026     md->nltype == NLTYPE_ANY &&
4027     start_match < end_subject &&
4028     *start_match == '\n')
4029     start_match++;
4030 nigel 77 }
4031     }
4032    
4033     /* Or to a non-unique first char after study */
4034    
4035     else if (start_bits != NULL)
4036     {
4037     while (start_match < end_subject)
4038     {
4039     register unsigned int c = *start_match;
4040     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4041     }
4042     }
4043    
4044     /* Restore fudged end_subject */
4045    
4046     end_subject = save_end_subject;
4047    
4048     #ifdef DEBUG /* Sigh. Some compilers never learn. */
4049     printf(">>>> Match against: ");
4050 nigel 91 pchars(start_match, end_subject - start_match, TRUE, md);
4051 nigel 77 printf("\n");
4052     #endif
4053    
4054     /* If req_byte is set, we know that that character must appear in the subject
4055     for the match to succeed. If the first character is set, req_byte must be
4056     later in the subject; otherwise the test starts at the match point. This
4057     optimization can save a huge amount of backtracking in patterns with nested
4058     unlimited repeats that aren't going to match. Writing separate code for
4059     cased/caseless versions makes it go faster, as does using an autoincrement
4060     and backing off on a match.
4061    
4062     HOWEVER: when the subject string is very, very long, searching to its end can
4063     take a long time, and give bad performance on quite ordinary patterns. This
4064 nigel 93 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4065     string... so we don't do this when the string is sufficiently long.
4066 nigel 77
4067     ALSO: this processing is disabled when partial matching is requested.
4068     */
4069    
4070     if (req_byte >= 0 &&
4071     end_subject - start_match < REQ_BYTE_MAX &&
4072 nigel 91 !md->partial)
4073 nigel 77 {
4074 nigel 87 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4075 nigel 77
4076     /* We don't need to repeat the search if we haven't yet reached the
4077     place we found it at last time. */
4078    
4079     if (p > req_byte_ptr)
4080     {
4081     if (req_byte_caseless)
4082     {
4083     while (p < end_subject)
4084     {
4085     register int pp = *p++;
4086     if (pp == req_byte || pp == req_byte2) { p--; break; }
4087     }
4088     }
4089     else
4090     {
4091     while (p < end_subject)
4092     {
4093     if (*p++ == req_byte) { p--; break; }
4094     }
4095     }
4096    
4097 nigel 93 /* If we can't find the required character, break the matching loop,
4098     forcing a match failure. */
4099 nigel 77
4100 nigel 93 if (p >= end_subject)
4101     {
4102     rc = MATCH_NOMATCH;
4103     break;
4104     }
4105 nigel 77
4106     /* If we have found the required character, save the point where we
4107     found it, so that we don't search again next time round the loop if
4108     the start hasn't passed this character yet. */
4109    
4110     req_byte_ptr = p;
4111     }
4112     }
4113    
4114 nigel 93 /* OK, we can now run the match. */
4115 nigel 77
4116 nigel 91 md->start_match = start_match;
4117     md->match_call_count = 0;
4118 nigel 93 md->eptrn = 0; /* Next free eptrchain slot */
4119     rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4120 nigel 77
4121 nigel 93 /* Any return other than MATCH_NOMATCH breaks the loop. */
4122 nigel 77
4123 nigel 93 if (rc != MATCH_NOMATCH) break;
4124 nigel 77
4125 nigel 93 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4126     newline in the subject (though it may continue over the newline). Therefore,
4127     if we have just failed to match, starting at a newline, do not continue. */
4128    
4129     if (firstline && IS_NEWLINE(start_match)) break;
4130    
4131     /* Advance the match position by one character. */
4132    
4133     start_match++;
4134 nigel 77 #ifdef SUPPORT_UTF8
4135 nigel 93 if (utf8)
4136     while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4137     start_match++;
4138 nigel 77 #endif
4139    
4140 nigel 93 /* Break the loop if the pattern is anchored or if we have passed the end of
4141     the subject. */
4142 nigel 77
4143 nigel 93 if (anchored || start_match > end_subject) break;
4144 nigel 77
4145 nigel 93 /* If we have just passed a CR and the newline option is CRLF or ANY, and we
4146     are now at a LF, advance the match position by one more character. */
4147    
4148     if (start_match[-1] == '\r' &&
4149     (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
4150     start_match < end_subject &&
4151     *start_match == '\n')
4152     start_match++;
4153    
4154     } /* End of for(;;) "bumpalong" loop */
4155    
4156     /* ==========================================================================*/
4157    
4158     /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4159     conditions is true:
4160    
4161     (1) The pattern is anchored;
4162    
4163     (2) We are past the end of the subject;
4164    
4165     (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4166     this option requests that a match occur at or before the first newline in
4167     the subject.
4168    
4169     When we have a match and the offset vector is big enough to deal with any
4170     backreferences, captured substring offsets will already be set up. In the case
4171     where we had to get some local store to hold offsets for backreference
4172     processing, copy those that we can. In this case there need not be overflow if
4173     certain parts of the pattern were not used, even though there are more
4174     capturing parentheses than vector slots. */
4175    
4176     if (rc == MATCH_MATCH)
4177     {
4178 nigel 77 if (using_temporary_offsets)
4179     {
4180     if (offsetcount >= 4)
4181     {
4182 nigel 91 memcpy(offsets + 2, md->offset_vector + 2,
4183 nigel 77 (offsetcount - 2) * sizeof(int));
4184     DPRINTF(("Copied offsets from temporary memory\n"));
4185     }
4186 nigel 93 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4187 nigel 77 DPRINTF(("Freeing temporary memory\n"));
4188 nigel 91 (pcre_free)(md->offset_vector);
4189 nigel 77 }
4190    
4191 nigel 93 /* Set the return code to the number of captured strings, or 0 if there are
4192     too many to fit into the vector. */
4193    
4194 nigel 91 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4195 nigel 77
4196 nigel 93 /* If there is space, set up the whole thing as substring 0. */
4197    
4198 nigel 77 if (offsetcount < 2) rc = 0; else