/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 91 - (hide annotations) (download)
Sat Feb 24 21:41:34 2007 UTC (7 years, 9 months ago) by nigel
File MIME type: text/plain
File size: 119563 byte(s)
Load pcre-6.7 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 nigel 91 #define NLBLOCK md /* The block containing newline information */
46 nigel 77 #include "pcre_internal.h"
47    
48    
49     /* Structure for building a chain of data that actually lives on the
50     stack, for holding the values of the subject pointer at the start of each
51     subpattern, so as to detect when an empty string has been matched by a
52     subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
53     are on the heap, not on the stack. */
54    
/* One link in the chain of saved subject pointers. match() fills one of these
in at the start of a bracketed group and makes it the new head of the chain. */
55     typedef struct eptrblock {
56     struct eptrblock *epb_prev; /* Previous head of the chain (the enclosing group's block) */
57 nigel 87 USPTR epb_saved_eptr; /* Subject pointer as it was when the group was entered */
58 nigel 77 } eptrblock;
59    
60     /* Flag bits for the match() function */
61    
62     #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_isgroup 0x02 /* Set if start of bracketed group */
64    
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71     /* Maximum number of ints of offset to save on the stack for recursive calls.
72     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
73     because the offset vector is always a multiple of 3 long. */
74    
75     #define REC_STACK_SAVE_MAX 30
76    
77     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
78    
79     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
80     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
81    
82    
83    
84     #ifdef DEBUG
85     /*************************************************
86     * Debugging function to print chars *
87     *************************************************/
88    
89     /* Print a sequence of chars in printable format, stopping at the end of the
90     subject if requested.
91    
92     Arguments:
93     p points to characters
94     length number to print
95     is_subject TRUE if printing from within md->start_subject
96     md pointer to matching data block, if is_subject is TRUE
97    
98     Returns: nothing
99     */
100    
101     static void
102     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
103     {
104     int c;
105     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
106     while (length-- > 0)
107     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
108     }
109     #endif
110    
111    
112    
113     /*************************************************
114     * Match a back-reference *
115     *************************************************/
116    
117     /* If a back reference hasn't been set, the length that is passed is greater
118     than the number of characters left in the string, so the match fails.
119    
120     Arguments:
121     offset index into the offset vector
122     eptr points into the subject
123     length length to be matched
124     md points to match data block
125     ims the ims flags
126    
127     Returns: TRUE if matched
128     */
129    
130     static BOOL
131 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
132 nigel 77 unsigned long int ims)
133     {
134 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
135 nigel 77
136     #ifdef DEBUG
137     if (eptr >= md->end_subject)
138     printf("matching subject <null>");
139     else
140     {
141     printf("matching subject ");
142     pchars(eptr, length, TRUE, md);
143     }
144     printf(" against backref ");
145     pchars(p, length, FALSE, md);
146     printf("\n");
147     #endif
148    
149     /* Always fail if not enough characters left */
150    
151     if (length > md->end_subject - eptr) return FALSE;
152    
153     /* Separate the caselesss case for speed */
154    
155     if ((ims & PCRE_CASELESS) != 0)
156     {
157     while (length-- > 0)
158     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
159     }
160     else
161     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
162    
163     return TRUE;
164     }
165    
166    
167    
168     /***************************************************************************
169     ****************************************************************************
170     RECURSION IN THE match() FUNCTION
171    
172 nigel 87 The match() function is highly recursive, though not every recursive call
173     increases the recursive depth. Nevertheless, some regular expressions can cause
174     it to recurse to a great depth. I was writing for Unix, so I just let it call
175     itself recursively. This uses the stack for saving everything that has to be
176     saved for a recursive call. On Unix, the stack can be large, and this works
177     fine.
178 nigel 77
179 nigel 87 It turns out that on some non-Unix-like systems there are problems with
180     programs that use a lot of stack. (This despite the fact that every last chip
181     has oodles of memory these days, and techniques for extending the stack have
182     been known for decades.) So....
183 nigel 77
184     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
185     calls by keeping local variables that need to be preserved in blocks of memory
186 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
187 nigel 77 achieve this so that the actual code doesn't look very different to what it
188     always used to.
189     ****************************************************************************
190     ***************************************************************************/
191    
192    
193 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
194     versions and production versions. */
195 nigel 77
196     #ifndef NO_RECURSE
/* In the recursive build REGISTER expands to the "register" keyword; match()
applies it to its eptr and ecode parameters. */
197     #define REGISTER register
198 nigel 87 #ifdef DEBUG
/* Debugging RMATCH: calls match() one level deeper (rdepth+1), stores the
result in rx, and prints the source line before and after the call. The
expansion is a brace-enclosed block, so a following "else" needs care. */
199     #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
200     { \
201     printf("match() called in line %d\n", __LINE__); \
202     rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
203     printf("to line %d\n", __LINE__); \
204     }
/* Debugging RRETURN: prints the value and source line, then returns. Note that
the argument appears twice in the expansion. */
205     #define RRETURN(ra) \
206     { \
207     printf("match() returned %d from line %d ", ra, __LINE__); \
208     return ra; \
209     }
210     #else
/* Production versions: a plain recursive call with rdepth+1, and a plain
return statement. */
211     #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
212     rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
213 nigel 77 #define RRETURN(ra) return ra
214 nigel 87 #endif
215    
216 nigel 77 #else
217    
218    
219     /* These versions of the macros manage a private stack on the heap. Note
220     that the rd argument of RMATCH isn't actually used. It's the md argument of
221     match(), which never changes. */
222    
223     #define REGISTER
224    
/* RMATCH for the heap-based build. It obtains a new frame, records the return
point in the current frame with setjmp(), loads the new frame with the
"arguments" (the recursion depth is the current frame's depth plus one), makes
it the current frame, and jumps to HEAP_RECURSE. When RRETURN later longjmp()s
back, the current frame is reloaded from md->thisframe and its Xresult is
copied into rx. The rd argument is not used.

If the frame cannot be allocated, the match is abandoned with
PCRE_ERROR_NOMEMORY rather than writing through a NULL pointer. The check is
made before setjmp() so that the current frame's return point is untouched. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }

/* RRETURN for the heap-based build. It frees the current frame and steps back
to the previous one. If there is a previous frame, the result is stored in it,
md->thisframe is pointed at it, and control longjmp()s to the point recorded by
RMATCH. If there is none (top level), this is a real return from match(). */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }
263    
264    
265     /* Structure for remembering the local variables in a private frame */
266    
267     typedef struct heapframe {
268     struct heapframe *Xprevframe;
269    
270     /* Function arguments that may change */
271    
272     const uschar *Xeptr;
273     const uschar *Xecode;
274     int Xoffset_top;
275     long int Xims;
276     eptrblock *Xeptrb;
277     int Xflags;
278 nigel 91 unsigned int Xrdepth;
279 nigel 77
280     /* Function local variables */
281    
282     const uschar *Xcallpat;
283     const uschar *Xcharptr;
284     const uschar *Xdata;
285     const uschar *Xnext;
286     const uschar *Xpp;
287     const uschar *Xprev;
288     const uschar *Xsaved_eptr;
289    
290     recursion_info Xnew_recursive;
291    
292     BOOL Xcur_is_word;
293     BOOL Xcondition;
294     BOOL Xminimize;
295     BOOL Xprev_is_word;
296    
297     unsigned long int Xoriginal_ims;
298    
299     #ifdef SUPPORT_UCP
300     int Xprop_type;
301 nigel 87 int Xprop_value;
302 nigel 77 int Xprop_fail_result;
303     int Xprop_category;
304     int Xprop_chartype;
305 nigel 87 int Xprop_script;
306 nigel 77 int *Xprop_test_variable;
307     #endif
308    
309     int Xctype;
310     int Xfc;
311     int Xfi;
312     int Xlength;
313     int Xmax;
314     int Xmin;
315     int Xnumber;
316     int Xoffset;
317     int Xop;
318     int Xsave_capture_last;
319     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
320     int Xstacksave[REC_STACK_SAVE_MAX];
321    
322     eptrblock Xnewptrb;
323    
324     /* Place to pass back result, and where to jump back to */
325    
326     int Xresult;
327     jmp_buf Xwhere;
328    
329     } heapframe;
330    
331     #endif
332    
333    
334     /***************************************************************************
335     ***************************************************************************/
336    
337    
338    
339     /*************************************************
340     * Match from current position *
341     *************************************************/
342    
343     /* On entry ecode points to the first opcode, and eptr to the first character
344     in the subject string, while eptrb holds the value of eptr at the start of the
345     last bracketed group - used for breaking infinite loops matching zero-length
346     strings. This function is called recursively in many circumstances. Whenever it
347     returns a negative (error) response, the outer incarnation must also return the
348     same response.
349    
350     Performance note: It might be tempting to extract commonly used fields from the
351     md structure (e.g. utf8, end_subject) into individual variables to improve
352     performance. Tests using gcc on a SPARC disproved this; in the first case, it
353     made performance worse.
354    
355     Arguments:
356     eptr pointer in subject
357     ecode position in code
358     offset_top current top pointer
359     md pointer to "static" info for the match
360     ims current /i, /m, and /s options
361     eptrb pointer to chain of blocks containing eptr at start of
362     brackets - for testing for empty matches
363     flags can contain
364     match_condassert - this is an assertion condition
365     match_isgroup - this is the start of a bracketed group
366 nigel 87 rdepth the recursion depth
367 nigel 77
368     Returns: MATCH_MATCH if matched ) these values are >= 0
369     MATCH_NOMATCH if failed to match )
370     a negative PCRE_ERROR_xxx value if aborted by an error condition
371 nigel 87 (e.g. stopped by repeated call or recursion limit)
372 nigel 77 */
373    
374     static int
375 nigel 87 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
376 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
377 nigel 91 int flags, unsigned int rdepth)
378 nigel 77 {
379     /* These variables do not need to be preserved over recursion in this function,
380     so they can be ordinary variables in all cases. Mark them with "register"
381     because they are used a lot in loops. */
382    
383 nigel 91 register int rrc; /* Returns from recursive calls */
384     register int i; /* Used for loops not involving calls to RMATCH() */
385     register unsigned int c; /* Character values not kept over RMATCH() calls */
386     register BOOL utf8; /* Local copy of UTF-8 flag for speed */
387 nigel 77
388     /* When recursion is not being used, all "local" variables that have to be
389     preserved over calls to RMATCH() are part of a "frame" which is obtained from
390     heap storage. Set up the top-level frame here; others are obtained from the
391     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
392    
393     #ifdef NO_RECURSE
394     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
395     frame->Xprevframe = NULL; /* Marks the top level */
396    
397     /* Copy in the original argument variables */
398    
399     frame->Xeptr = eptr;
400     frame->Xecode = ecode;
401     frame->Xoffset_top = offset_top;
402     frame->Xims = ims;
403     frame->Xeptrb = eptrb;
404     frame->Xflags = flags;
405 nigel 87 frame->Xrdepth = rdepth;
406 nigel 77
407     /* This is where control jumps back to in order to effect "recursion" */
408    
409     HEAP_RECURSE:
410    
411     /* Macros make the argument variables come from the current frame */
412    
413     #define eptr frame->Xeptr
414     #define ecode frame->Xecode
415     #define offset_top frame->Xoffset_top
416     #define ims frame->Xims
417     #define eptrb frame->Xeptrb
418     #define flags frame->Xflags
419 nigel 87 #define rdepth frame->Xrdepth
420 nigel 77
421     /* Ditto for the local variables */
422    
423     #ifdef SUPPORT_UTF8
424     #define charptr frame->Xcharptr
425     #endif
426     #define callpat frame->Xcallpat
427     #define data frame->Xdata
428     #define next frame->Xnext
429     #define pp frame->Xpp
430     #define prev frame->Xprev
431     #define saved_eptr frame->Xsaved_eptr
432    
433     #define new_recursive frame->Xnew_recursive
434    
435     #define cur_is_word frame->Xcur_is_word
436     #define condition frame->Xcondition
437     #define minimize frame->Xminimize
438     #define prev_is_word frame->Xprev_is_word
439    
440     #define original_ims frame->Xoriginal_ims
441    
442     #ifdef SUPPORT_UCP
443     #define prop_type frame->Xprop_type
444 nigel 87 #define prop_value frame->Xprop_value
445 nigel 77 #define prop_fail_result frame->Xprop_fail_result
446     #define prop_category frame->Xprop_category
447     #define prop_chartype frame->Xprop_chartype
448 nigel 87 #define prop_script frame->Xprop_script
449 nigel 77 #define prop_test_variable frame->Xprop_test_variable
450     #endif
451    
452     #define ctype frame->Xctype
453     #define fc frame->Xfc
454     #define fi frame->Xfi
455     #define length frame->Xlength
456     #define max frame->Xmax
457     #define min frame->Xmin
458     #define number frame->Xnumber
459     #define offset frame->Xoffset
460     #define op frame->Xop
461     #define save_capture_last frame->Xsave_capture_last
462     #define save_offset1 frame->Xsave_offset1
463     #define save_offset2 frame->Xsave_offset2
464     #define save_offset3 frame->Xsave_offset3
465     #define stacksave frame->Xstacksave
466    
467     #define newptrb frame->Xnewptrb
468    
469     /* When recursion is being used, local variables are allocated on the stack and
470     get preserved during recursion in the normal way. In this environment, fi and
471     i, and fc and c, can be the same variables. */
472    
473     #else
474     #define fi i
475     #define fc c
476    
477    
478 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
479     const uschar *charptr; /* in small blocks of the code. My normal */
480     #endif /* style of coding would have declared */
481     const uschar *callpat; /* them within each of those blocks. */
482     const uschar *data; /* However, in order to accommodate the */
483     const uschar *next; /* version of this code that uses an */
484     USPTR pp; /* external "stack" implemented on the */
485     const uschar *prev; /* heap, it is easier to declare them all */
486     USPTR saved_eptr; /* here, so the declarations can be cut */
487     /* out in a block. The only declarations */
488     recursion_info new_recursive; /* within blocks below are for variables */
489     /* that do not have to be preserved over */
490     BOOL cur_is_word; /* a recursive call to RMATCH(). */
491     BOOL condition;
492 nigel 77 BOOL minimize;
493     BOOL prev_is_word;
494    
495     unsigned long int original_ims;
496    
497     #ifdef SUPPORT_UCP
498     int prop_type;
499 nigel 87 int prop_value;
500 nigel 77 int prop_fail_result;
501     int prop_category;
502     int prop_chartype;
503 nigel 87 int prop_script;
504 nigel 77 int *prop_test_variable;
505     #endif
506    
507     int ctype;
508     int length;
509     int max;
510     int min;
511     int number;
512     int offset;
513     int op;
514     int save_capture_last;
515     int save_offset1, save_offset2, save_offset3;
516     int stacksave[REC_STACK_SAVE_MAX];
517    
518     eptrblock newptrb;
519     #endif
520    
521     /* These statements are here to stop the compiler complaining about uninitialized
522     variables. */
523    
524     #ifdef SUPPORT_UCP
525 nigel 87 prop_value = 0;
526 nigel 77 prop_fail_result = 0;
527     prop_test_variable = NULL;
528     #endif
529    
530 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
531     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
532     used. Thanks to Ian Taylor for noticing this possibility and sending the
533     original patch. */
534    
535     TAIL_RECURSE:
536    
537 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
538     are specified by the macro RMATCH and RRETURN is used to return. When
539     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
540     and a "return", respectively (possibly with some debugging if DEBUG is
541     defined). However, RMATCH isn't like a function call because it's quite a
542     complicated macro. It has to be used in one particular way. This shouldn't,
543     however, impact performance when true recursion is being used. */
544 nigel 77
545 nigel 87 /* First check that we haven't called match() too many times, or that we
546     haven't exceeded the recursive call limit. */
547    
548 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
549 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
550 nigel 77
551     original_ims = ims; /* Save for resetting on ')' */
552 nigel 91
553     #ifdef SUPPORT_UTF8
554 nigel 77 utf8 = md->utf8; /* Local copy of the flag */
555 nigel 91 #else
556     utf8 = FALSE;
557     #endif
558 nigel 77
559     /* At the start of a bracketed group, add the current subject pointer to the
560     stack of such pointers, to be re-instated at the end of the group when we hit
561     the closing ket. When match() is called in other circumstances, we don't add to
562     this stack. */
563    
564     if ((flags & match_isgroup) != 0)
565     {
566     newptrb.epb_prev = eptrb;
567     newptrb.epb_saved_eptr = eptr;
568     eptrb = &newptrb;
569     }
570    
571     /* Now start processing the operations. */
572    
573     for (;;)
574     {
575     op = *ecode;
576     minimize = FALSE;
577    
578     /* For partial matching, remember if we ever hit the end of the subject after
579     matching at least one subject character. */
580    
581     if (md->partial &&
582     eptr >= md->end_subject &&
583     eptr > md->start_match)
584     md->hitend = TRUE;
585    
586     /* Opening capturing bracket. If there is space in the offset vector, save
587     the current subject position in the working slot at the top of the vector. We
588     mustn't change the current values of the data slot, because they may be set
589     from a previous iteration of this group, and be referred to by a reference
590     inside the group.
591    
592     If the bracket fails to match, we need to restore this value and also the
593     values of the final offsets, in case they were set by a previous iteration of
594     the same bracket.
595    
596     If there isn't enough space in the offset vector, treat this as if it were a
597     non-capturing bracket. Don't worry about setting the flag for the error case
598     here; that is handled in the code for KET. */
599    
600     if (op > OP_BRA)
601     {
602     number = op - OP_BRA;
603    
604     /* For extended extraction brackets (large number), we have to fish out the
605     number from a dummy opcode at the start. */
606    
607     if (number > EXTRACT_BASIC_MAX)
608     number = GET2(ecode, 2+LINK_SIZE);
609     offset = number << 1;
610    
611     #ifdef DEBUG
612     printf("start bracket %d subject=", number);
613     pchars(eptr, 16, TRUE, md);
614     printf("\n");
615     #endif
616    
617     if (offset < md->offset_max)
618     {
619     save_offset1 = md->offset_vector[offset];
620     save_offset2 = md->offset_vector[offset+1];
621     save_offset3 = md->offset_vector[md->offset_end - number];
622     save_capture_last = md->capture_last;
623    
624     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
625     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
626    
627     do
628     {
629     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
630     match_isgroup);
631     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
632     md->capture_last = save_capture_last;
633     ecode += GET(ecode, 1);
634     }
635     while (*ecode == OP_ALT);
636    
637     DPRINTF(("bracket %d failed\n", number));
638    
639     md->offset_vector[offset] = save_offset1;
640     md->offset_vector[offset+1] = save_offset2;
641     md->offset_vector[md->offset_end - number] = save_offset3;
642    
643     RRETURN(MATCH_NOMATCH);
644     }
645    
646     /* Insufficient room for saving captured contents */
647    
648     else op = OP_BRA;
649     }
650    
651     /* Other types of node can be handled by a switch */
652    
653     switch(op)
654     {
655     case OP_BRA: /* Non-capturing bracket: optimized */
656     DPRINTF(("start bracket 0\n"));
657 nigel 91
658     /* Loop for all the alternatives */
659    
660     for (;;)
661 nigel 77 {
662 nigel 91 /* When we get to the final alternative within the brackets, we would
663     return the result of a recursive call to match() whatever happened. We
664     can reduce stack usage by turning this into a tail recursion. */
665    
666     if (ecode[GET(ecode, 1)] != OP_ALT)
667     {
668     ecode += 1 + LINK_SIZE;
669     flags = match_isgroup;
670     DPRINTF(("bracket 0 tail recursion\n"));
671     goto TAIL_RECURSE;
672     }
673    
674     /* For non-final alternatives, continue the loop for a NOMATCH result;
675     otherwise return. */
676    
677 nigel 77 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
678     match_isgroup);
679     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
680     ecode += GET(ecode, 1);
681     }
682 nigel 91 /* Control never reaches here. */
683 nigel 77
684     /* Conditional group: compilation checked that there are no more than
685     two branches. If the condition is false, skipping the first branch takes us
686     past the end if there is only one branch, but that's OK because that is
687 nigel 91 exactly what going to the ket would do. As there is only one branch to be
688     obeyed, we can use tail recursion to avoid using another stack frame. */
689 nigel 77
690     case OP_COND:
691     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
692     {
693     offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
694     condition = (offset == CREF_RECURSE * 2)?
695     (md->recursive != NULL) :
696     (offset < offset_top && md->offset_vector[offset] >= 0);
697 nigel 91 ecode += condition? (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1));
698     flags = match_isgroup;
699     goto TAIL_RECURSE;
700 nigel 77 }
701    
702     /* The condition is an assertion. Call match() to evaluate it - setting
703     the match_condassert flag causes it to stop at the end of an assertion. */
704    
705     else
706     {
707     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
708     match_condassert | match_isgroup);
709     if (rrc == MATCH_MATCH)
710     {
711     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
712     while (*ecode == OP_ALT) ecode += GET(ecode, 1);
713     }
714     else if (rrc != MATCH_NOMATCH)
715     {
716     RRETURN(rrc); /* Need braces because of following else */
717     }
718     else ecode += GET(ecode, 1);
719 nigel 91
720     /* We are now at the branch that is to be obeyed. As there is only one,
721     we can use tail recursion to avoid using another stack frame. */
722    
723     ecode += 1 + LINK_SIZE;
724     flags = match_isgroup;
725     goto TAIL_RECURSE;
726 nigel 77 }
727     /* Control never reaches here */
728    
729     /* Skip over conditional reference or large extraction number data if
730     encountered. */
731    
732     case OP_CREF:
733     case OP_BRANUMBER:
734     ecode += 3;
735     break;
736    
737     /* End of the pattern. If we are in a recursion, we should restore the
738     offsets appropriately and continue from after the call. */
739    
740     case OP_END:
741     if (md->recursive != NULL && md->recursive->group_num == 0)
742     {
743     recursion_info *rec = md->recursive;
744 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
745 nigel 77 md->recursive = rec->prevrec;
746     memmove(md->offset_vector, rec->offset_save,
747     rec->saved_max * sizeof(int));
748     md->start_match = rec->save_start;
749     ims = original_ims;
750     ecode = rec->after_call;
751     break;
752     }
753    
754     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
755     string - backtracking will then try other alternatives, if any. */
756    
757     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
758     md->end_match_ptr = eptr; /* Record where we ended */
759     md->end_offset_top = offset_top; /* and how many extracts were taken */
760     RRETURN(MATCH_MATCH);
761    
762     /* Change option settings */
763    
764     case OP_OPT:
765     ims = ecode[1];
766     ecode += 2;
767     DPRINTF(("ims set to %02lx\n", ims));
768     break;
769    
770     /* Assertion brackets. Check the alternative branches in turn - the
771     matching won't pass the KET for an assertion. If any one branch matches,
772     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
773     start of each branch to move the current point backwards, so the code at
774     this level is identical to the lookahead case. */
775    
776     case OP_ASSERT:
777     case OP_ASSERTBACK:
778     do
779     {
780     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
781     match_isgroup);
782     if (rrc == MATCH_MATCH) break;
783     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
784     ecode += GET(ecode, 1);
785     }
786     while (*ecode == OP_ALT);
787     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
788    
789     /* If checking an assertion for a condition, return MATCH_MATCH. */
790    
791     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
792    
793     /* Continue from after the assertion, updating the offsets high water
794     mark, since extracts may have been taken during the assertion. */
795    
796     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
797     ecode += 1 + LINK_SIZE;
798     offset_top = md->end_offset_top;
799     continue;
800    
801     /* Negative assertion: all branches must fail to match */
802    
803     case OP_ASSERT_NOT:
804     case OP_ASSERTBACK_NOT:
805     do
806     {
807     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
808     match_isgroup);
809     if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
810     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
811     ecode += GET(ecode,1);
812     }
813     while (*ecode == OP_ALT);
814    
815     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
816    
817     ecode += 1 + LINK_SIZE;
818     continue;
819    
820     /* Move the subject pointer back. This occurs only at the start of
821     each branch of a lookbehind assertion. If we are too close to the start to
822     move back, this match function fails. When working with UTF-8 we move
823     back a number of characters, not bytes. */
824    
825     case OP_REVERSE:
826     #ifdef SUPPORT_UTF8
827     if (utf8)
828     {
829     c = GET(ecode,1);
830     for (i = 0; i < c; i++)
831     {
832     eptr--;
833     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
834     BACKCHAR(eptr)
835     }
836     }
837     else
838     #endif
839    
840     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
841    
842     {
843     eptr -= GET(ecode,1);
844     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
845     }
846    
847     /* Skip to next op code */
848    
849     ecode += 1 + LINK_SIZE;
850     break;
851    
852     /* The callout item calls an external function, if one is provided, passing
853     details of the match so far. This is mainly for debugging, though the
854     function is able to force a failure. */
855    
856     case OP_CALLOUT:
857     if (pcre_callout != NULL)
858     {
859     pcre_callout_block cb;
860     cb.version = 1; /* Version 1 of the callout block */
861     cb.callout_number = ecode[1];
862     cb.offset_vector = md->offset_vector;
863 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
864 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
865     cb.start_match = md->start_match - md->start_subject;
866     cb.current_position = eptr - md->start_subject;
867     cb.pattern_position = GET(ecode, 2);
868     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
869     cb.capture_top = offset_top/2;
870     cb.capture_last = md->capture_last;
871     cb.callout_data = md->callout_data;
872     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
873     if (rrc < 0) RRETURN(rrc);
874     }
875     ecode += 2 + 2*LINK_SIZE;
876     break;
877    
878     /* Recursion either matches the current regex, or some subexpression. The
879     offset data is the offset to the starting bracket from the start of the
880     whole pattern. (This is so that it works from duplicated subpatterns.)
881    
882     If there are any capturing brackets started but not finished, we have to
883     save their starting points and reinstate them after the recursion. However,
884     we don't know how many such there are (offset_top records the completed
885     total) so we just have to save all the potential data. There may be up to
886     65535 such values, which is too large to put on the stack, but using malloc
887     for small numbers seems expensive. As a compromise, the stack is used when
888     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
889     is used. If the malloc fails, PCRE_ERROR_NOMEMORY is returned from this
890     call of match() before the recursion is attempted; no partial save of the
891     offsets is made.
892    
893     There are also other values that have to be saved. We use a chained
894     sequence of blocks that actually live on the stack. Thanks to Robin Houston
895     for the original version of this logic. */
896    
897     case OP_RECURSE:
898     {
899     callpat = md->start_code + GET(ecode, 1);
900     new_recursive.group_num = *callpat - OP_BRA;
901    
902     /* For extended extraction brackets (large number), we have to fish out
903     the number from a dummy opcode at the start. */
904    
905     if (new_recursive.group_num > EXTRACT_BASIC_MAX)
906     new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
907    
908     /* Add to "recursing stack" */
909    
910     new_recursive.prevrec = md->recursive;
911     md->recursive = &new_recursive;
912    
913     /* Find where to continue from afterwards */
914    
915     ecode += 1 + LINK_SIZE;
916     new_recursive.after_call = ecode;
917    
918     /* Now save the offset data. */
919    
920     new_recursive.saved_max = md->offset_end;
921     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
922     new_recursive.offset_save = stacksave;
923     else
924     {
925     new_recursive.offset_save =
926     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
927     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
928     }
929    
930     memcpy(new_recursive.offset_save, md->offset_vector,
931     new_recursive.saved_max * sizeof(int));
932     new_recursive.save_start = md->start_match;
933     md->start_match = eptr;
934    
935     /* OK, now we can do the recursion. For each top-level alternative we
936     restore the offset and recursion data. */
937    
938     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
939     do
940     {
941     RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
942     eptrb, match_isgroup);
943     if (rrc == MATCH_MATCH)
944     {
945 nigel 87 DPRINTF(("Recursion matched\n"));
946 nigel 77 md->recursive = new_recursive.prevrec;
947     if (new_recursive.offset_save != stacksave)
948     (pcre_free)(new_recursive.offset_save);
949     RRETURN(MATCH_MATCH);
950     }
951 nigel 87 else if (rrc != MATCH_NOMATCH)
952     {
953     DPRINTF(("Recursion gave error %d\n", rrc));
954     RRETURN(rrc);
955     }
956 nigel 77
957     md->recursive = &new_recursive;
958     memcpy(md->offset_vector, new_recursive.offset_save,
959     new_recursive.saved_max * sizeof(int));
960     callpat += GET(callpat, 1);
961     }
962     while (*callpat == OP_ALT);
963    
964     DPRINTF(("Recursion didn't match\n"));
965     md->recursive = new_recursive.prevrec;
966     if (new_recursive.offset_save != stacksave)
967     (pcre_free)(new_recursive.offset_save);
968     RRETURN(MATCH_NOMATCH);
969     }
970     /* Control never reaches here */
971    
972     /* "Once" brackets are like assertion brackets except that after a match,
973     the point in the subject string is not moved back. Thus there can never be
974     a move back into the brackets. Friedl calls these "atomic" subpatterns.
975     Check the alternative branches in turn - the matching won't pass the KET
976     for this kind of subpattern. If any one branch matches, we carry on as at
977     the end of a normal bracket, leaving the subject pointer. */
978    
979     case OP_ONCE:
980 nigel 91 prev = ecode;
981     saved_eptr = eptr;
982    
983     do
984 nigel 77 {
985 nigel 91 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
986     eptrb, match_isgroup);
987     if (rrc == MATCH_MATCH) break;
988     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
989     ecode += GET(ecode,1);
990     }
991     while (*ecode == OP_ALT);
992 nigel 77
993 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
994 nigel 77
995 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
996 nigel 77
997 nigel 91 /* Continue as from after the assertion, updating the offsets high water
998     mark, since extracts may have been taken. */
999 nigel 77
1000 nigel 91 do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1001 nigel 77
1002 nigel 91 offset_top = md->end_offset_top;
1003     eptr = md->end_match_ptr;
1004 nigel 77
1005 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1006     happens for a repeating ket if no characters were matched in the group.
1007     This is the forcible breaking of infinite loops as implemented in Perl
1008     5.005. If there is an options reset, it will get obeyed in the normal
1009     course of events. */
1010 nigel 77
1011 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1012     {
1013     ecode += 1+LINK_SIZE;
1014     break;
1015     }
1016 nigel 77
1017 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1018     preceding bracket, in the appropriate order. The second "call" of match()
1019     uses tail recursion, to avoid using another stack frame. We need to reset
1020     any options that changed within the bracket before re-running it, so
1021     check the next opcode. */
1022 nigel 77
1023 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1024     {
1025     ims = (ims & ~PCRE_IMS) | ecode[4];
1026     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1027     }
1028 nigel 77
1029 nigel 91 if (*ecode == OP_KETRMIN)
1030     {
1031     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1032     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1033     ecode = prev;
1034     flags = match_isgroup;
1035     goto TAIL_RECURSE;
1036 nigel 77 }
1037 nigel 91 else /* OP_KETRMAX */
1038     {
1039     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1040     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1041     ecode += 1 + LINK_SIZE;
1042     flags = 0;
1043     goto TAIL_RECURSE;
1044     }
1045     /* Control never gets here */
1046 nigel 77
1047     /* An alternation is the end of a branch; scan along to find the end of the
1048     bracketed group and go to there. */
1049    
1050     case OP_ALT:
1051     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1052     break;
1053    
1054     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1055     that it may occur zero times. It may repeat infinitely, or not at all -
1056     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1057     repeat limits are compiled as a number of copies, with the optional ones
1058     preceded by BRAZERO or BRAMINZERO. */
1059    
1060     case OP_BRAZERO:
1061     {
1062     next = ecode+1;
1063     RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
1064     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1065     do next += GET(next,1); while (*next == OP_ALT);
1066     ecode = next + 1+LINK_SIZE;
1067     }
1068     break;
1069    
1070     case OP_BRAMINZERO:
1071     {
1072     next = ecode+1;
1073     do next += GET(next,1); while (*next == OP_ALT);
1074     RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1075     match_isgroup);
1076     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1077     ecode++;
1078     }
1079     break;
1080    
1081     /* End of a group, repeated or non-repeating. If we are at the end of
1082     an assertion "group", stop matching and return MATCH_MATCH, but record the
1083     current high water mark for use by positive assertions. Do this also
1084     for the "once" (atomic, never backed up into) groups. */
1085    
1086     case OP_KET:
1087     case OP_KETRMIN:
1088     case OP_KETRMAX:
1089 nigel 91 prev = ecode - GET(ecode, 1);
1090     saved_eptr = eptrb->epb_saved_eptr;
1091 nigel 77
1092 nigel 91 /* Back up the stack of bracket start pointers. */
1093 nigel 77
1094 nigel 91 eptrb = eptrb->epb_prev;
1095 nigel 77
1096 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1097     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1098     *prev == OP_ONCE)
1099     {
1100     md->end_match_ptr = eptr; /* For ONCE */
1101     md->end_offset_top = offset_top;
1102     RRETURN(MATCH_MATCH);
1103     }
1104 nigel 77
1105 nigel 91 /* In all other cases except a conditional group we have to check the
1106     group number back at the start and if necessary complete handling an
1107     extraction by setting the offsets and bumping the high water mark. */
1108 nigel 77
1109 nigel 91 if (*prev != OP_COND)
1110     {
1111     number = *prev - OP_BRA;
1112 nigel 77
1113 nigel 91 /* For extended extraction brackets (large number), we have to fish out
1114     the number from a dummy opcode at the start. */
1115 nigel 77
1116 nigel 91 if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1117     offset = number << 1;
1118 nigel 77
1119     #ifdef DEBUG
1120 nigel 91 printf("end bracket %d", number);
1121     printf("\n");
1122 nigel 77 #endif
1123    
1124 nigel 91 /* Test for a numbered group. This includes groups called as a result
1125     of recursion. Note that whole-pattern recursion is coded as a recurse
1126     into group 0, so it won't be picked up here. Instead, we catch it when
1127     the OP_END is reached. */
1128 nigel 77
1129 nigel 91 if (number > 0)
1130     {
1131     md->capture_last = number;
1132     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1133 nigel 77 {
1134 nigel 91 md->offset_vector[offset] =
1135     md->offset_vector[md->offset_end - number];
1136     md->offset_vector[offset+1] = eptr - md->start_subject;
1137     if (offset_top <= offset) offset_top = offset + 2;
1138     }
1139 nigel 77
1140 nigel 91 /* Handle a recursively called group. Restore the offsets
1141     appropriately and continue from after the call. */
1142 nigel 77
1143 nigel 91 if (md->recursive != NULL && md->recursive->group_num == number)
1144     {
1145     recursion_info *rec = md->recursive;
1146     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1147     md->recursive = rec->prevrec;
1148     md->start_match = rec->save_start;
1149     memcpy(md->offset_vector, rec->offset_save,
1150     rec->saved_max * sizeof(int));
1151     ecode = rec->after_call;
1152     ims = original_ims;
1153     break;
1154 nigel 77 }
1155     }
1156 nigel 91 }
1157 nigel 77
1158 nigel 91 /* Reset the value of the ims flags, in case they got changed during
1159     the group. */
1160 nigel 77
1161 nigel 91 ims = original_ims;
1162     DPRINTF(("ims reset to %02lx\n", ims));
1163 nigel 77
1164 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1165     happens for a repeating ket if no characters were matched in the group.
1166     This is the forcible breaking of infinite loops as implemented in Perl
1167     5.005. If there is an options reset, it will get obeyed in the normal
1168     course of events. */
1169 nigel 77
1170 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1171     {
1172     ecode += 1 + LINK_SIZE;
1173     break;
1174     }
1175 nigel 77
1176 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1177     preceding bracket, in the appropriate order. In the second case, we can use
1178     tail recursion to avoid using another stack frame. */
1179 nigel 77
1180 nigel 91 if (*ecode == OP_KETRMIN)
1181     {
1182     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1183     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1184     ecode = prev;
1185     flags = match_isgroup;
1186     goto TAIL_RECURSE;
1187 nigel 77 }
1188 nigel 91 else /* OP_KETRMAX */
1189     {
1190     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1191     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192     ecode += 1 + LINK_SIZE;
1193     flags = 0;
1194     goto TAIL_RECURSE;
1195     }
1196     /* Control never gets here */
1197 nigel 77
1198     /* Start of subject unless notbol, or after internal newline if multiline */
1199    
1200     case OP_CIRC:
1201     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1202     if ((ims & PCRE_MULTILINE) != 0)
1203     {
1204 nigel 91 if (eptr != md->start_subject &&
1205     (eptr == md->end_subject ||
1206     eptr < md->start_subject + md->nllen ||
1207     !IS_NEWLINE(eptr - md->nllen)))
1208 nigel 77 RRETURN(MATCH_NOMATCH);
1209     ecode++;
1210     break;
1211     }
1212     /* ... else fall through */
1213    
1214     /* Start of subject assertion */
1215    
1216     case OP_SOD:
1217     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1218     ecode++;
1219     break;
1220    
1221     /* Start of match assertion */
1222    
1223     case OP_SOM:
1224     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1225     ecode++;
1226     break;
1227    
1228     /* Assert before internal newline if multiline, or before a terminating
1229     newline unless endonly is set, else end of subject unless noteol is set. */
1230    
1231     case OP_DOLL:
1232     if ((ims & PCRE_MULTILINE) != 0)
1233     {
1234     if (eptr < md->end_subject)
1235 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1236 nigel 77 else
1237     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1238     ecode++;
1239     break;
1240     }
1241     else
1242     {
1243     if (md->noteol) RRETURN(MATCH_NOMATCH);
1244     if (!md->endonly)
1245     {
1246 nigel 91 if (eptr != md->end_subject &&
1247     (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
1248 nigel 77 RRETURN(MATCH_NOMATCH);
1249     ecode++;
1250     break;
1251     }
1252     }
1253 nigel 91 /* ... else fall through for endonly */
1254 nigel 77
1255     /* End of subject assertion (\z) */
1256    
1257     case OP_EOD:
1258     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1259     ecode++;
1260     break;
1261    
1262     /* End of subject or ending \n assertion (\Z) */
1263    
1264     case OP_EODN:
1265 nigel 91 if (eptr != md->end_subject &&
1266     (eptr != md->end_subject - md->nllen || !IS_NEWLINE(eptr)))
1267     RRETURN(MATCH_NOMATCH);
1268 nigel 77 ecode++;
1269     break;
1270    
1271     /* Word boundary assertions */
1272    
1273     case OP_NOT_WORD_BOUNDARY:
1274     case OP_WORD_BOUNDARY:
1275     {
1276    
1277     /* Find out if the previous and current characters are "word" characters.
1278     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1279     be "non-word" characters. */
1280    
1281     #ifdef SUPPORT_UTF8
1282     if (utf8)
1283     {
1284     if (eptr == md->start_subject) prev_is_word = FALSE; else
1285     {
1286     const uschar *lastptr = eptr - 1;
1287     while((*lastptr & 0xc0) == 0x80) lastptr--;
1288     GETCHAR(c, lastptr);
1289     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1290     }
1291     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1292     {
1293     GETCHAR(c, eptr);
1294     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1295     }
1296     }
1297     else
1298     #endif
1299    
1300     /* More streamlined when not in UTF-8 mode */
1301    
1302     {
1303     prev_is_word = (eptr != md->start_subject) &&
1304     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1305     cur_is_word = (eptr < md->end_subject) &&
1306     ((md->ctypes[*eptr] & ctype_word) != 0);
1307     }
1308    
1309     /* Now see if the situation is what we want */
1310    
1311     if ((*ecode++ == OP_WORD_BOUNDARY)?
1312     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1313     RRETURN(MATCH_NOMATCH);
1314     }
1315     break;
1316    
1317     /* Match a single character type; inline for speed */
1318    
1319     case OP_ANY:
1320 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1321     {
1322     if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
1323     RRETURN(MATCH_NOMATCH);
1324     }
1325 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1326     if (utf8)
1327     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1328     ecode++;
1329     break;
1330    
1331     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1332     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1333    
1334     case OP_ANYBYTE:
1335     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1336     ecode++;
1337     break;
1338    
1339     case OP_NOT_DIGIT:
1340     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1341     GETCHARINCTEST(c, eptr);
1342     if (
1343     #ifdef SUPPORT_UTF8
1344     c < 256 &&
1345     #endif
1346     (md->ctypes[c] & ctype_digit) != 0
1347     )
1348     RRETURN(MATCH_NOMATCH);
1349     ecode++;
1350     break;
1351    
1352     case OP_DIGIT:
1353     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1354     GETCHARINCTEST(c, eptr);
1355     if (
1356     #ifdef SUPPORT_UTF8
1357     c >= 256 ||
1358     #endif
1359     (md->ctypes[c] & ctype_digit) == 0
1360     )
1361     RRETURN(MATCH_NOMATCH);
1362     ecode++;
1363     break;
1364    
1365     case OP_NOT_WHITESPACE:
1366     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1367     GETCHARINCTEST(c, eptr);
1368     if (
1369     #ifdef SUPPORT_UTF8
1370     c < 256 &&
1371     #endif
1372     (md->ctypes[c] & ctype_space) != 0
1373     )
1374     RRETURN(MATCH_NOMATCH);
1375     ecode++;
1376     break;
1377    
1378     case OP_WHITESPACE:
1379     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1380     GETCHARINCTEST(c, eptr);
1381     if (
1382     #ifdef SUPPORT_UTF8
1383     c >= 256 ||
1384     #endif
1385     (md->ctypes[c] & ctype_space) == 0
1386     )
1387     RRETURN(MATCH_NOMATCH);
1388     ecode++;
1389     break;
1390    
1391     case OP_NOT_WORDCHAR:
1392     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1393     GETCHARINCTEST(c, eptr);
1394     if (
1395     #ifdef SUPPORT_UTF8
1396     c < 256 &&
1397     #endif
1398     (md->ctypes[c] & ctype_word) != 0
1399     )
1400     RRETURN(MATCH_NOMATCH);
1401     ecode++;
1402     break;
1403    
1404     case OP_WORDCHAR:
1405     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1406     GETCHARINCTEST(c, eptr);
1407     if (
1408     #ifdef SUPPORT_UTF8
1409     c >= 256 ||
1410     #endif
1411     (md->ctypes[c] & ctype_word) == 0
1412     )
1413     RRETURN(MATCH_NOMATCH);
1414     ecode++;
1415     break;
1416    
1417     #ifdef SUPPORT_UCP
1418     /* Check the next character by Unicode property. We will get here only
1419     if the support is in the binary; otherwise a compile-time error occurs. */
1420    
1421     case OP_PROP:
1422     case OP_NOTPROP:
1423     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1424     GETCHARINCTEST(c, eptr);
1425     {
1426 nigel 87 int chartype, script;
1427     int category = _pcre_ucp_findprop(c, &chartype, &script);
1428 nigel 77
1429 nigel 87 switch(ecode[1])
1430     {
1431     case PT_ANY:
1432     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1433     break;
1434 nigel 77
1435 nigel 87 case PT_LAMP:
1436     if ((chartype == ucp_Lu ||
1437     chartype == ucp_Ll ||
1438     chartype == ucp_Lt) == (op == OP_NOTPROP))
1439 nigel 77 RRETURN(MATCH_NOMATCH);
1440 nigel 87 break;
1441    
1442     case PT_GC:
1443     if ((ecode[2] != category) == (op == OP_PROP))
1444 nigel 77 RRETURN(MATCH_NOMATCH);
1445 nigel 87 break;
1446    
1447     case PT_PC:
1448     if ((ecode[2] != chartype) == (op == OP_PROP))
1449     RRETURN(MATCH_NOMATCH);
1450     break;
1451    
1452     case PT_SC:
1453     if ((ecode[2] != script) == (op == OP_PROP))
1454     RRETURN(MATCH_NOMATCH);
1455     break;
1456    
1457     default:
1458     RRETURN(PCRE_ERROR_INTERNAL);
1459     break;
1460 nigel 77 }
1461 nigel 87
1462     ecode += 3;
1463 nigel 77 }
1464     break;
1465    
1466     /* Match an extended Unicode sequence. We will get here only if the support
1467     is in the binary; otherwise a compile-time error occurs. */
1468    
1469     case OP_EXTUNI:
1470     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1471     GETCHARINCTEST(c, eptr);
1472     {
1473 nigel 87 int chartype, script;
1474     int category = _pcre_ucp_findprop(c, &chartype, &script);
1475 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1476     while (eptr < md->end_subject)
1477     {
1478     int len = 1;
1479     if (!utf8) c = *eptr; else
1480     {
1481     GETCHARLEN(c, eptr, len);
1482     }
1483 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1484 nigel 77 if (category != ucp_M) break;
1485     eptr += len;
1486     }
1487     }
1488     ecode++;
1489     break;
1490     #endif
1491    
1492    
1493     /* Match a back reference, possibly repeatedly. Look past the end of the
1494     item to see if there is repeat information following. The code is similar
1495     to that for character classes, but repeated for efficiency. Then obey
1496     similar code to character type repeats - written out again for speed.
1497     However, if the referenced string is the empty string, always treat
1498     it as matched, any number of times (otherwise there could be infinite
1499     loops). */
1500    
1501     case OP_REF:
1502     {
1503     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1504     ecode += 3; /* Advance past item */
1505    
1506     /* If the reference is unset, set the length to be longer than the amount
1507     of subject left; this ensures that every attempt at a match fails. We
1508     can't just fail here, because of the possibility of quantifiers with zero
1509     minima. */
1510    
1511     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1512     md->end_subject - eptr + 1 :
1513     md->offset_vector[offset+1] - md->offset_vector[offset];
1514    
1515     /* Set up for repetition, or handle the non-repeated case */
1516    
1517     switch (*ecode)
1518     {
1519     case OP_CRSTAR:
1520     case OP_CRMINSTAR:
1521     case OP_CRPLUS:
1522     case OP_CRMINPLUS:
1523     case OP_CRQUERY:
1524     case OP_CRMINQUERY:
1525     c = *ecode++ - OP_CRSTAR;
1526     minimize = (c & 1) != 0;
1527     min = rep_min[c]; /* Pick up values from tables; */
1528     max = rep_max[c]; /* zero for max => infinity */
1529     if (max == 0) max = INT_MAX;
1530     break;
1531    
1532     case OP_CRRANGE:
1533     case OP_CRMINRANGE:
1534     minimize = (*ecode == OP_CRMINRANGE);
1535     min = GET2(ecode, 1);
1536     max = GET2(ecode, 3);
1537     if (max == 0) max = INT_MAX;
1538     ecode += 5;
1539     break;
1540    
1541     default: /* No repeat follows */
1542     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1543     eptr += length;
1544     continue; /* With the main loop */
1545     }
1546    
1547     /* If the length of the reference is zero, just continue with the
1548     main loop. */
1549    
1550     if (length == 0) continue;
1551    
1552     /* First, ensure the minimum number of matches are present. We get back
1553     the length of the reference string explicitly rather than passing the
1554     address of eptr, so that eptr can be a register variable. */
1555    
1556     for (i = 1; i <= min; i++)
1557     {
1558     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1559     eptr += length;
1560     }
1561    
1562     /* If min = max, continue at the same level without recursion.
1563     They are not both allowed to be zero. */
1564    
1565     if (min == max) continue;
1566    
1567     /* If minimizing, keep trying and advancing the pointer */
1568    
1569     if (minimize)
1570     {
1571     for (fi = min;; fi++)
1572     {
1573     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1574     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1575     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1576     RRETURN(MATCH_NOMATCH);
1577     eptr += length;
1578     }
1579     /* Control never gets here */
1580     }
1581    
1582     /* If maximizing, find the longest string and work backwards */
1583    
1584     else
1585     {
1586     pp = eptr;
1587     for (i = min; i < max; i++)
1588     {
1589     if (!match_ref(offset, eptr, length, md, ims)) break;
1590     eptr += length;
1591     }
1592     while (eptr >= pp)
1593     {
1594     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1595     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1596     eptr -= length;
1597     }
1598     RRETURN(MATCH_NOMATCH);
1599     }
1600     }
1601     /* Control never gets here */
1602    
1603    
1604    
1605     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1606     used when all the characters in the class have values in the range 0-255,
1607     and either the matching is caseful, or the characters are in the range
1608     0-127 when UTF-8 processing is enabled. The only difference between
1609     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1610     encountered.
1611    
1612     First, look past the end of the item to see if there is repeat information
1613     following. Then obey similar code to character type repeats - written out
1614     again for speed. */
1615    
1616     case OP_NCLASS:
1617     case OP_CLASS:
1618     {
1619     data = ecode + 1; /* Save for matching */
1620     ecode += 33; /* Advance past the item */
1621    
1622     switch (*ecode)
1623     {
1624     case OP_CRSTAR:
1625     case OP_CRMINSTAR:
1626     case OP_CRPLUS:
1627     case OP_CRMINPLUS:
1628     case OP_CRQUERY:
1629     case OP_CRMINQUERY:
1630     c = *ecode++ - OP_CRSTAR;
1631     minimize = (c & 1) != 0;
1632     min = rep_min[c]; /* Pick up values from tables; */
1633     max = rep_max[c]; /* zero for max => infinity */
1634     if (max == 0) max = INT_MAX;
1635     break;
1636    
1637     case OP_CRRANGE:
1638     case OP_CRMINRANGE:
1639     minimize = (*ecode == OP_CRMINRANGE);
1640     min = GET2(ecode, 1);
1641     max = GET2(ecode, 3);
1642     if (max == 0) max = INT_MAX;
1643     ecode += 5;
1644     break;
1645    
1646     default: /* No repeat follows */
1647     min = max = 1;
1648     break;
1649     }
1650    
1651     /* First, ensure the minimum number of matches are present. */
1652    
1653     #ifdef SUPPORT_UTF8
1654     /* UTF-8 mode */
1655     if (utf8)
1656     {
1657     for (i = 1; i <= min; i++)
1658     {
1659     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1660     GETCHARINC(c, eptr);
1661     if (c > 255)
1662     {
1663     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1664     }
1665     else
1666     {
1667     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1668     }
1669     }
1670     }
1671     else
1672     #endif
1673     /* Not UTF-8 mode */
1674     {
1675     for (i = 1; i <= min; i++)
1676     {
1677     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1678     c = *eptr++;
1679     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1680     }
1681     }
1682    
1683     /* If max == min we can continue with the main loop without the
1684     need to recurse. */
1685    
1686     if (min == max) continue;
1687    
1688     /* If minimizing, keep testing the rest of the expression and advancing
1689     the pointer while it matches the class. */
1690    
1691     if (minimize)
1692     {
1693     #ifdef SUPPORT_UTF8
1694     /* UTF-8 mode */
1695     if (utf8)
1696     {
1697     for (fi = min;; fi++)
1698     {
1699     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1700     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1701     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1702     GETCHARINC(c, eptr);
1703     if (c > 255)
1704     {
1705     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1706     }
1707     else
1708     {
1709     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1710     }
1711     }
1712     }
1713     else
1714     #endif
1715     /* Not UTF-8 mode */
1716     {
1717     for (fi = min;; fi++)
1718     {
1719     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1720     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1721     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1722     c = *eptr++;
1723     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1724     }
1725     }
1726     /* Control never gets here */
1727     }
1728    
1729     /* If maximizing, find the longest possible run, then work backwards. */
1730    
1731     else
1732     {
1733     pp = eptr;
1734    
1735     #ifdef SUPPORT_UTF8
1736     /* UTF-8 mode */
1737     if (utf8)
1738     {
1739     for (i = min; i < max; i++)
1740     {
1741     int len = 1;
1742     if (eptr >= md->end_subject) break;
1743     GETCHARLEN(c, eptr, len);
1744     if (c > 255)
1745     {
1746     if (op == OP_CLASS) break;
1747     }
1748     else
1749     {
1750     if ((data[c/8] & (1 << (c&7))) == 0) break;
1751     }
1752     eptr += len;
1753     }
1754     for (;;)
1755     {
1756     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1757     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1758     if (eptr-- == pp) break; /* Stop if tried at original pos */
1759     BACKCHAR(eptr);
1760     }
1761     }
1762     else
1763     #endif
1764     /* Not UTF-8 mode */
1765     {
1766     for (i = min; i < max; i++)
1767     {
1768     if (eptr >= md->end_subject) break;
1769     c = *eptr;
1770     if ((data[c/8] & (1 << (c&7))) == 0) break;
1771     eptr++;
1772     }
1773     while (eptr >= pp)
1774     {
1775     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1776 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1777 nigel 77 eptr--;
1778     }
1779     }
1780    
1781     RRETURN(MATCH_NOMATCH);
1782     }
1783     }
1784     /* Control never gets here */
1785    
1786    
1787     /* Match an extended character class. This opcode is encountered only
1788     in UTF-8 mode, because that's the only time it is compiled. */
1789    
1790     #ifdef SUPPORT_UTF8
1791     case OP_XCLASS:
1792     {
1793     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1794     ecode += GET(ecode, 1); /* Advance past the item */
1795    
1796     switch (*ecode)
1797     {
1798     case OP_CRSTAR:
1799     case OP_CRMINSTAR:
1800     case OP_CRPLUS:
1801     case OP_CRMINPLUS:
1802     case OP_CRQUERY:
1803     case OP_CRMINQUERY:
1804     c = *ecode++ - OP_CRSTAR;
1805     minimize = (c & 1) != 0;
1806     min = rep_min[c]; /* Pick up values from tables; */
1807     max = rep_max[c]; /* zero for max => infinity */
1808     if (max == 0) max = INT_MAX;
1809     break;
1810    
1811     case OP_CRRANGE:
1812     case OP_CRMINRANGE:
1813     minimize = (*ecode == OP_CRMINRANGE);
1814     min = GET2(ecode, 1);
1815     max = GET2(ecode, 3);
1816     if (max == 0) max = INT_MAX;
1817     ecode += 5;
1818     break;
1819    
1820     default: /* No repeat follows */
1821     min = max = 1;
1822     break;
1823     }
1824    
1825     /* First, ensure the minimum number of matches are present. */
1826    
1827     for (i = 1; i <= min; i++)
1828     {
1829     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1830     GETCHARINC(c, eptr);
1831     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1832     }
1833    
1834     /* If max == min we can continue with the main loop without the
1835     need to recurse. */
1836    
1837     if (min == max) continue;
1838    
1839     /* If minimizing, keep testing the rest of the expression and advancing
1840     the pointer while it matches the class. */
1841    
1842     if (minimize)
1843     {
1844     for (fi = min;; fi++)
1845     {
1846     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1847     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1848     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1849     GETCHARINC(c, eptr);
1850     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1851     }
1852     /* Control never gets here */
1853     }
1854    
1855     /* If maximizing, find the longest possible run, then work backwards. */
1856    
1857     else
1858     {
1859     pp = eptr;
1860     for (i = min; i < max; i++)
1861     {
1862     int len = 1;
1863     if (eptr >= md->end_subject) break;
1864     GETCHARLEN(c, eptr, len);
1865     if (!_pcre_xclass(c, data)) break;
1866     eptr += len;
1867     }
1868     for(;;)
1869     {
1870     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1871     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1872     if (eptr-- == pp) break; /* Stop if tried at original pos */
1873     BACKCHAR(eptr)
1874     }
1875     RRETURN(MATCH_NOMATCH);
1876     }
1877    
1878     /* Control never gets here */
1879     }
1880     #endif /* End of XCLASS */
1881    
1882     /* Match a single character, casefully */
1883    
1884     case OP_CHAR:
1885     #ifdef SUPPORT_UTF8
1886     if (utf8)
1887     {
1888     length = 1;
1889     ecode++;
1890     GETCHARLEN(fc, ecode, length);
1891     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1892     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1893     }
1894     else
1895     #endif
1896    
1897     /* Non-UTF-8 mode */
1898     {
1899     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1900     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1901     ecode += 2;
1902     }
1903     break;
1904    
1905     /* Match a single character, caselessly */
1906    
1907     case OP_CHARNC:
1908     #ifdef SUPPORT_UTF8
1909     if (utf8)
1910     {
1911     length = 1;
1912     ecode++;
1913     GETCHARLEN(fc, ecode, length);
1914    
1915     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1916    
1917     /* If the pattern character's value is < 128, we have only one byte, and
1918     can use the fast lookup table. */
1919    
1920     if (fc < 128)
1921     {
1922     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1923     }
1924    
1925     /* Otherwise we must pick up the subject character */
1926    
1927     else
1928     {
1929     int dc;
1930     GETCHARINC(dc, eptr);
1931     ecode += length;
1932    
1933     /* If we have Unicode property support, we can use it to test the other
1934 nigel 87 case of the character, if there is one. */
1935 nigel 77
1936     if (fc != dc)
1937     {
1938     #ifdef SUPPORT_UCP
1939 nigel 87 if (dc != _pcre_ucp_othercase(fc))
1940 nigel 77 #endif
1941     RRETURN(MATCH_NOMATCH);
1942     }
1943     }
1944     }
1945     else
1946     #endif /* SUPPORT_UTF8 */
1947    
1948     /* Non-UTF-8 mode */
1949     {
1950     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1951     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1952     ecode += 2;
1953     }
1954     break;
1955    
1956     /* Match a single character repeatedly; different opcodes share code. */
1957    
1958     case OP_EXACT:
1959     min = max = GET2(ecode, 1);
1960     ecode += 3;
1961     goto REPEATCHAR;
1962    
1963     case OP_UPTO:
1964     case OP_MINUPTO:
1965     min = 0;
1966     max = GET2(ecode, 1);
1967     minimize = *ecode == OP_MINUPTO;
1968     ecode += 3;
1969     goto REPEATCHAR;
1970    
1971     case OP_STAR:
1972     case OP_MINSTAR:
1973     case OP_PLUS:
1974     case OP_MINPLUS:
1975     case OP_QUERY:
1976     case OP_MINQUERY:
1977     c = *ecode++ - OP_STAR;
1978     minimize = (c & 1) != 0;
1979     min = rep_min[c]; /* Pick up values from tables; */
1980     max = rep_max[c]; /* zero for max => infinity */
1981     if (max == 0) max = INT_MAX;
1982    
1983     /* Common code for all repeated single-character matches. We can give
1984     up quickly if there are fewer than the minimum number of characters left in
1985     the subject. */
1986    
1987     REPEATCHAR:
1988     #ifdef SUPPORT_UTF8
1989     if (utf8)
1990     {
1991     length = 1;
1992     charptr = ecode;
1993     GETCHARLEN(fc, ecode, length);
1994     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1995     ecode += length;
1996    
1997     /* Handle multibyte character matching specially here. There is
1998     support for caseless matching if UCP support is present. */
1999    
2000     if (length > 1)
2001     {
2002     int oclength = 0;
2003     uschar occhars[8];
2004    
2005     #ifdef SUPPORT_UCP
2006     int othercase;
2007     if ((ims & PCRE_CASELESS) != 0 &&
2008 nigel 87 (othercase = _pcre_ucp_othercase(fc)) >= 0 &&
2009     othercase >= 0)
2010 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2011     #endif /* SUPPORT_UCP */
2012    
2013     for (i = 1; i <= min; i++)
2014     {
2015     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2016     /* Need braces because of following else */
2017     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2018     else
2019     {
2020     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2021     eptr += oclength;
2022     }
2023     }
2024    
2025     if (min == max) continue;
2026    
2027     if (minimize)
2028     {
2029     for (fi = min;; fi++)
2030     {
2031     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2032     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2033     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2034     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2035     /* Need braces because of following else */
2036     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2037     else
2038     {
2039     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2040     eptr += oclength;
2041     }
2042     }
2043     /* Control never gets here */
2044     }
2045     else
2046     {
2047     pp = eptr;
2048     for (i = min; i < max; i++)
2049     {
2050     if (eptr > md->end_subject - length) break;
2051     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2052     else if (oclength == 0) break;
2053     else
2054     {
2055     if (memcmp(eptr, occhars, oclength) != 0) break;
2056     eptr += oclength;
2057     }
2058     }
2059     while (eptr >= pp)
2060     {
2061     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2062     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2063     eptr -= length;
2064     }
2065     RRETURN(MATCH_NOMATCH);
2066     }
2067     /* Control never gets here */
2068     }
2069    
2070     /* If the length of a UTF-8 character is 1, we fall through here, and
2071     obey the code as for non-UTF-8 characters below, though in this case the
2072     value of fc will always be < 128. */
2073     }
2074     else
2075     #endif /* SUPPORT_UTF8 */
2076    
2077     /* When not in UTF-8 mode, load a single-byte character. */
2078     {
2079     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2080     fc = *ecode++;
2081     }
2082    
2083     /* The value of fc at this point is always less than 256, though we may or
2084     may not be in UTF-8 mode. The code is duplicated for the caseless and
2085     caseful cases, for speed, since matching characters is likely to be quite
2086     common. First, ensure the minimum number of matches are present. If min =
2087     max, continue at the same level without recursing. Otherwise, if
2088     minimizing, keep trying the rest of the expression and advancing one
2089     matching character if failing, up to the maximum. Alternatively, if
2090     maximizing, find the maximum number of characters and work backwards. */
2091    
2092     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2093     max, eptr));
2094    
2095     if ((ims & PCRE_CASELESS) != 0)
2096     {
2097     fc = md->lcc[fc];
2098     for (i = 1; i <= min; i++)
2099     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2100     if (min == max) continue;
2101     if (minimize)
2102     {
2103     for (fi = min;; fi++)
2104     {
2105     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2106     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2107     if (fi >= max || eptr >= md->end_subject ||
2108     fc != md->lcc[*eptr++])
2109     RRETURN(MATCH_NOMATCH);
2110     }
2111     /* Control never gets here */
2112     }
2113     else
2114     {
2115     pp = eptr;
2116     for (i = min; i < max; i++)
2117     {
2118     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2119     eptr++;
2120     }
2121     while (eptr >= pp)
2122     {
2123     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2124     eptr--;
2125     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2126     }
2127     RRETURN(MATCH_NOMATCH);
2128     }
2129     /* Control never gets here */
2130     }
2131    
2132     /* Caseful comparisons (multi-byte characters never get here; they are handled above) */
2133    
2134     else
2135     {
2136     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2137     if (min == max) continue;
2138     if (minimize)
2139     {
2140     for (fi = min;; fi++)
2141     {
2142     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2143     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2144     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2145     RRETURN(MATCH_NOMATCH);
2146     }
2147     /* Control never gets here */
2148     }
2149     else
2150     {
2151     pp = eptr;
2152     for (i = min; i < max; i++)
2153     {
2154     if (eptr >= md->end_subject || fc != *eptr) break;
2155     eptr++;
2156     }
2157     while (eptr >= pp)
2158     {
2159     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2160     eptr--;
2161     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2162     }
2163     RRETURN(MATCH_NOMATCH);
2164     }
2165     }
2166     /* Control never gets here */
2167    
2168     /* Match a negated single one-byte character. The pattern character is one
2169     byte, but the subject character we are checking can be multibyte. */
2170    
2171     case OP_NOT:
2172     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2173     ecode++;
2174     GETCHARINCTEST(c, eptr);
2175     if ((ims & PCRE_CASELESS) != 0)
2176     {
2177     #ifdef SUPPORT_UTF8
2178     if (c < 256)
2179     #endif
2180     c = md->lcc[c];
2181     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2182     }
2183     else
2184     {
2185     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2186     }
2187     break;
2188    
2189     /* Match a negated single one-byte character repeatedly. This is almost a
2190     repeat of the code for a repeated single character, but I haven't found a
2191     nice way of commoning these up that doesn't require a test of the
2192     positive/negative option for each character match. Maybe that wouldn't add
2193     very much to the time taken, but character matching *is* what this is all
2194     about... */
2195    
2196     case OP_NOTEXACT:
2197     min = max = GET2(ecode, 1);
2198     ecode += 3;
2199     goto REPEATNOTCHAR;
2200    
2201     case OP_NOTUPTO:
2202     case OP_NOTMINUPTO:
2203     min = 0;
2204     max = GET2(ecode, 1);
2205     minimize = *ecode == OP_NOTMINUPTO;
2206     ecode += 3;
2207     goto REPEATNOTCHAR;
2208    
2209     case OP_NOTSTAR:
2210     case OP_NOTMINSTAR:
2211     case OP_NOTPLUS:
2212     case OP_NOTMINPLUS:
2213     case OP_NOTQUERY:
2214     case OP_NOTMINQUERY:
2215     c = *ecode++ - OP_NOTSTAR;
2216     minimize = (c & 1) != 0;
2217     min = rep_min[c]; /* Pick up values from tables; */
2218     max = rep_max[c]; /* zero for max => infinity */
2219     if (max == 0) max = INT_MAX;
2220    
2221     /* Common code for all repeated single-byte matches. We can give up quickly
2222     if there are fewer than the minimum number of bytes left in the
2223     subject. */
2224    
2225     REPEATNOTCHAR:
2226     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2227     fc = *ecode++;
2228    
2229     /* The code is duplicated for the caseless and caseful cases, for speed,
2230     since matching characters is likely to be quite common. First, ensure the
2231     minimum number of matches are present. If min = max, continue at the same
2232     level without recursing. Otherwise, if minimizing, keep trying the rest of
2233     the expression and advancing one matching character if failing, up to the
2234     maximum. Alternatively, if maximizing, find the maximum number of
2235     characters and work backwards. */
2236    
2237     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2238     max, eptr));
2239    
2240     if ((ims & PCRE_CASELESS) != 0)
2241     {
2242     fc = md->lcc[fc];
2243    
2244     #ifdef SUPPORT_UTF8
2245     /* UTF-8 mode */
2246     if (utf8)
2247     {
2248     register int d;
2249     for (i = 1; i <= min; i++)
2250     {
2251     GETCHARINC(d, eptr);
2252     if (d < 256) d = md->lcc[d];
2253     if (fc == d) RRETURN(MATCH_NOMATCH);
2254     }
2255     }
2256     else
2257     #endif
2258    
2259     /* Not UTF-8 mode */
2260     {
2261     for (i = 1; i <= min; i++)
2262     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2263     }
2264    
2265     if (min == max) continue;
2266    
2267     if (minimize)
2268     {
2269     #ifdef SUPPORT_UTF8
2270     /* UTF-8 mode */
2271     if (utf8)
2272     {
2273     register int d;
2274     for (fi = min;; fi++)
2275     {
2276     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2277     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2278     GETCHARINC(d, eptr);
2279     if (d < 256) d = md->lcc[d];
2280     if (fi >= max || eptr >= md->end_subject || fc == d)
2281     RRETURN(MATCH_NOMATCH);
2282     }
2283     }
2284     else
2285     #endif
2286     /* Not UTF-8 mode */
2287     {
2288     for (fi = min;; fi++)
2289     {
2290     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2291     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2292     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2293     RRETURN(MATCH_NOMATCH);
2294     }
2295     }
2296     /* Control never gets here */
2297     }
2298    
2299     /* Maximize case */
2300    
2301     else
2302     {
2303     pp = eptr;
2304    
2305     #ifdef SUPPORT_UTF8
2306     /* UTF-8 mode */
2307     if (utf8)
2308     {
2309     register int d;
2310     for (i = min; i < max; i++)
2311     {
2312     int len = 1;
2313     if (eptr >= md->end_subject) break;
2314     GETCHARLEN(d, eptr, len);
2315     if (d < 256) d = md->lcc[d];
2316     if (fc == d) break;
2317     eptr += len;
2318     }
2319     for(;;)
2320     {
2321     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2322     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2323     if (eptr-- == pp) break; /* Stop if tried at original pos */
2324     BACKCHAR(eptr);
2325     }
2326     }
2327     else
2328     #endif
2329     /* Not UTF-8 mode */
2330     {
2331     for (i = min; i < max; i++)
2332     {
2333     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2334     eptr++;
2335     }
2336     while (eptr >= pp)
2337     {
2338     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2339     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2340     eptr--;
2341     }
2342     }
2343    
2344     RRETURN(MATCH_NOMATCH);
2345     }
2346     /* Control never gets here */
2347     }
2348    
2349     /* Caseful comparisons */
2350    
2351     else
2352     {
2353     #ifdef SUPPORT_UTF8
2354     /* UTF-8 mode */
2355     if (utf8)
2356     {
2357     register int d;
2358     for (i = 1; i <= min; i++)
2359     {
2360     GETCHARINC(d, eptr);
2361     if (fc == d) RRETURN(MATCH_NOMATCH);
2362     }
2363     }
2364     else
2365     #endif
2366     /* Not UTF-8 mode */
2367     {
2368     for (i = 1; i <= min; i++)
2369     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2370     }
2371    
2372     if (min == max) continue;
2373    
2374     if (minimize)
2375     {
2376     #ifdef SUPPORT_UTF8
2377     /* UTF-8 mode */
2378     if (utf8)
2379     {
2380     register int d;
2381     for (fi = min;; fi++)
2382     {
2383     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2384     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2385     GETCHARINC(d, eptr);
2386     if (fi >= max || eptr >= md->end_subject || fc == d)
2387     RRETURN(MATCH_NOMATCH);
2388     }
2389     }
2390     else
2391     #endif
2392     /* Not UTF-8 mode */
2393     {
2394     for (fi = min;; fi++)
2395     {
2396     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2397     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2398     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2399     RRETURN(MATCH_NOMATCH);
2400     }
2401     }
2402     /* Control never gets here */
2403     }
2404    
2405     /* Maximize case */
2406    
2407     else
2408     {
2409     pp = eptr;
2410    
2411     #ifdef SUPPORT_UTF8
2412     /* UTF-8 mode */
2413     if (utf8)
2414     {
2415     register int d;
2416     for (i = min; i < max; i++)
2417     {
2418     int len = 1;
2419     if (eptr >= md->end_subject) break;
2420     GETCHARLEN(d, eptr, len);
2421     if (fc == d) break;
2422     eptr += len;
2423     }
2424     for(;;)
2425     {
2426     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2427     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2428     if (eptr-- == pp) break; /* Stop if tried at original pos */
2429     BACKCHAR(eptr);
2430     }
2431     }
2432     else
2433     #endif
2434     /* Not UTF-8 mode */
2435     {
2436     for (i = min; i < max; i++)
2437     {
2438     if (eptr >= md->end_subject || fc == *eptr) break;
2439     eptr++;
2440     }
2441     while (eptr >= pp)
2442     {
2443     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2444     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2445     eptr--;
2446     }
2447     }
2448    
2449     RRETURN(MATCH_NOMATCH);
2450     }
2451     }
2452     /* Control never gets here */
2453    
2454     /* Match a single character type repeatedly; several different opcodes
2455     share code. This is very similar to the code for single characters, but we
2456     repeat it in the interests of efficiency. */
2457    
2458     case OP_TYPEEXACT:
2459     min = max = GET2(ecode, 1);
2460     minimize = TRUE;
2461     ecode += 3;
2462     goto REPEATTYPE;
2463    
2464     case OP_TYPEUPTO:
2465     case OP_TYPEMINUPTO:
2466     min = 0;
2467     max = GET2(ecode, 1);
2468     minimize = *ecode == OP_TYPEMINUPTO;
2469     ecode += 3;
2470     goto REPEATTYPE;
2471    
2472     case OP_TYPESTAR:
2473     case OP_TYPEMINSTAR:
2474     case OP_TYPEPLUS:
2475     case OP_TYPEMINPLUS:
2476     case OP_TYPEQUERY:
2477     case OP_TYPEMINQUERY:
2478     c = *ecode++ - OP_TYPESTAR;
2479     minimize = (c & 1) != 0;
2480     min = rep_min[c]; /* Pick up values from tables; */
2481     max = rep_max[c]; /* zero for max => infinity */
2482     if (max == 0) max = INT_MAX;
2483    
2484     /* Common code for all repeated single character type matches. Note that
2485     in UTF-8 mode, '.' matches a character of any length, but for the other
2486     character types, the valid characters are all one-byte long. */
2487    
2488     REPEATTYPE:
2489     ctype = *ecode++; /* Code for the character type */
2490    
2491     #ifdef SUPPORT_UCP
2492     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2493     {
2494     prop_fail_result = ctype == OP_NOTPROP;
2495     prop_type = *ecode++;
2496 nigel 87 prop_value = *ecode++;
2497 nigel 77 }
2498     else prop_type = -1;
2499     #endif
2500    
2501     /* First, ensure the minimum number of matches are present. Use inline
2502     code for maximizing the speed, and do the type test once at the start
2503     (i.e. keep it out of the loop). Also we can test that there are at least
2504     the minimum number of bytes before we start. This isn't as effective in
2505     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2506     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2507     and single-bytes. */
2508    
2509     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2510     if (min > 0)
2511     {
2512     #ifdef SUPPORT_UCP
2513 nigel 87 if (prop_type >= 0)
2514 nigel 77 {
2515 nigel 87 switch(prop_type)
2516 nigel 77 {
2517 nigel 87 case PT_ANY:
2518     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2519     for (i = 1; i <= min; i++)
2520     {
2521     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2522     GETCHARINC(c, eptr);
2523     }
2524     break;
2525    
2526     case PT_LAMP:
2527     for (i = 1; i <= min; i++)
2528     {
2529     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2530     GETCHARINC(c, eptr);
2531     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2532     if ((prop_chartype == ucp_Lu ||
2533     prop_chartype == ucp_Ll ||
2534     prop_chartype == ucp_Lt) == prop_fail_result)
2535     RRETURN(MATCH_NOMATCH);
2536     }
2537     break;
2538    
2539     case PT_GC:
2540     for (i = 1; i <= min; i++)
2541     {
2542     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2543     GETCHARINC(c, eptr);
2544     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2545     if ((prop_category == prop_value) == prop_fail_result)
2546     RRETURN(MATCH_NOMATCH);
2547     }
2548     break;
2549    
2550     case PT_PC:
2551     for (i = 1; i <= min; i++)
2552     {
2553     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2554     GETCHARINC(c, eptr);
2555     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2556     if ((prop_chartype == prop_value) == prop_fail_result)
2557     RRETURN(MATCH_NOMATCH);
2558     }
2559     break;
2560    
2561     case PT_SC:
2562     for (i = 1; i <= min; i++)
2563     {
2564     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2565     GETCHARINC(c, eptr);
2566     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2567     if ((prop_script == prop_value) == prop_fail_result)
2568     RRETURN(MATCH_NOMATCH);
2569     }
2570     break;
2571    
2572     default:
2573     RRETURN(PCRE_ERROR_INTERNAL);
2574     break;
2575 nigel 77 }
2576     }
2577    
2578     /* Match extended Unicode sequences. We will get here only if the
2579     support is in the binary; otherwise a compile-time error occurs. */
2580    
2581     else if (ctype == OP_EXTUNI)
2582     {
2583     for (i = 1; i <= min; i++)
2584     {
2585     GETCHARINCTEST(c, eptr);
2586 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2587 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2588     while (eptr < md->end_subject)
2589     {
2590     int len = 1;
2591     if (!utf8) c = *eptr; else
2592     {
2593     GETCHARLEN(c, eptr, len);
2594     }
2595 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2596 nigel 77 if (prop_category != ucp_M) break;
2597     eptr += len;
2598     }
2599     }
2600     }
2601    
2602     else
2603     #endif /* SUPPORT_UCP */
2604    
2605     /* Handle all other cases when the coding is UTF-8 */
2606    
2607     #ifdef SUPPORT_UTF8
2608     if (utf8) switch(ctype)
2609     {
2610     case OP_ANY:
2611     for (i = 1; i <= min; i++)
2612     {
2613     if (eptr >= md->end_subject ||
2614 nigel 91 ((ims & PCRE_DOTALL) == 0 &&
2615     eptr <= md->end_subject - md->nllen &&
2616     IS_NEWLINE(eptr)))
2617 nigel 77 RRETURN(MATCH_NOMATCH);
2618 nigel 91 eptr++;
2619 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2620     }
2621     break;
2622    
2623     case OP_ANYBYTE:
2624     eptr += min;
2625     break;
2626    
2627     case OP_NOT_DIGIT:
2628     for (i = 1; i <= min; i++)
2629     {
2630     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2631     GETCHARINC(c, eptr);
2632     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2633     RRETURN(MATCH_NOMATCH);
2634     }
2635     break;
2636    
2637     case OP_DIGIT:
2638     for (i = 1; i <= min; i++)
2639     {
2640     if (eptr >= md->end_subject ||
2641     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2642     RRETURN(MATCH_NOMATCH);
2643     /* No need to skip more bytes - we know it's a 1-byte character */
2644     }
2645     break;
2646    
2647     case OP_NOT_WHITESPACE:
2648     for (i = 1; i <= min; i++)
2649     {
2650     if (eptr >= md->end_subject ||
2651     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2652     RRETURN(MATCH_NOMATCH);
2653     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2654     }
2655     break;
2656    
2657     case OP_WHITESPACE:
2658     for (i = 1; i <= min; i++)
2659     {
2660     if (eptr >= md->end_subject ||
2661     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2662     RRETURN(MATCH_NOMATCH);
2663     /* No need to skip more bytes - we know it's a 1-byte character */
2664     }
2665     break;
2666    
2667     case OP_NOT_WORDCHAR:
2668     for (i = 1; i <= min; i++)
2669     {
2670     if (eptr >= md->end_subject ||
2671     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2672     RRETURN(MATCH_NOMATCH);
2673     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2674     }
2675     break;
2676    
2677     case OP_WORDCHAR:
2678     for (i = 1; i <= min; i++)
2679     {
2680     if (eptr >= md->end_subject ||
2681     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2682     RRETURN(MATCH_NOMATCH);
2683     /* No need to skip more bytes - we know it's a 1-byte character */
2684     }
2685     break;
2686    
2687     default:
2688     RRETURN(PCRE_ERROR_INTERNAL);
2689     } /* End switch(ctype) */
2690    
2691     else
2692     #endif /* SUPPORT_UTF8 */
2693    
2694     /* Code for the non-UTF-8 case for minimum matching of operators other
2695     than OP_PROP and OP_NOTPROP. */
2696    
2697     switch(ctype)
2698     {
2699     case OP_ANY:
2700     if ((ims & PCRE_DOTALL) == 0)
2701     {
2702     for (i = 1; i <= min; i++)
2703 nigel 91 {
2704     if (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr))
2705     RRETURN(MATCH_NOMATCH);
2706     eptr++;
2707     }
2708 nigel 77 }
2709     else eptr += min;
2710     break;
2711    
2712     case OP_ANYBYTE:
2713     eptr += min;
2714     break;
2715    
2716     case OP_NOT_DIGIT:
2717     for (i = 1; i <= min; i++)
2718     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2719     break;
2720    
2721     case OP_DIGIT:
2722     for (i = 1; i <= min; i++)
2723     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2724     break;
2725    
2726     case OP_NOT_WHITESPACE:
2727     for (i = 1; i <= min; i++)
2728     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2729     break;
2730    
2731     case OP_WHITESPACE:
2732     for (i = 1; i <= min; i++)
2733     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2734     break;
2735    
2736     case OP_NOT_WORDCHAR:
2737     for (i = 1; i <= min; i++)
2738     if ((md->ctypes[*eptr++] & ctype_word) != 0)
2739     RRETURN(MATCH_NOMATCH);
2740     break;
2741    
2742     case OP_WORDCHAR:
2743     for (i = 1; i <= min; i++)
2744     if ((md->ctypes[*eptr++] & ctype_word) == 0)
2745     RRETURN(MATCH_NOMATCH);
2746     break;
2747    
2748     default:
2749     RRETURN(PCRE_ERROR_INTERNAL);
2750     }
2751     }
2752    
2753     /* If min = max, continue at the same level without recursing */
2754    
2755     if (min == max) continue;
2756    
2757     /* If minimizing, we have to test the rest of the pattern before each
2758     subsequent match. Again, separate the UTF-8 case for speed, and also
2759     separate the UCP cases. */
2760    
2761     if (minimize)
2762     {
2763     #ifdef SUPPORT_UCP
2764 nigel 87 if (prop_type >= 0)
2765 nigel 77 {
2766 nigel 87 switch(prop_type)
2767 nigel 77 {
2768 nigel 87 case PT_ANY:
2769     for (fi = min;; fi++)
2770     {
2771     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2772     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2773     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2774     GETCHARINC(c, eptr);
2775     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2776     }
2777     break;
2778    
2779     case PT_LAMP:
2780     for (fi = min;; fi++)
2781     {
2782     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2783     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2784     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2785     GETCHARINC(c, eptr);
2786     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2787     if ((prop_chartype == ucp_Lu ||
2788     prop_chartype == ucp_Ll ||
2789     prop_chartype == ucp_Lt) == prop_fail_result)
2790     RRETURN(MATCH_NOMATCH);
2791     }
2792     break;
2793    
2794     case PT_GC:
2795     for (fi = min;; fi++)
2796     {
2797     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2798     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2799     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2800     GETCHARINC(c, eptr);
2801     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2802     if ((prop_category == prop_value) == prop_fail_result)
2803     RRETURN(MATCH_NOMATCH);
2804     }
2805     break;
2806    
2807     case PT_PC:
2808     for (fi = min;; fi++)
2809     {
2810     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2811     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2812     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2813     GETCHARINC(c, eptr);
2814     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2815     if ((prop_chartype == prop_value) == prop_fail_result)
2816     RRETURN(MATCH_NOMATCH);
2817     }
2818     break;
2819    
2820     case PT_SC:
2821     for (fi = min;; fi++)
2822     {
2823     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2824     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2825     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2826     GETCHARINC(c, eptr);
2827     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2828     if ((prop_script == prop_value) == prop_fail_result)
2829     RRETURN(MATCH_NOMATCH);
2830     }
2831     break;
2832    
2833     default:
2834     RRETURN(PCRE_ERROR_INTERNAL);
2835     break;
2836 nigel 77 }
2837     }
2838    
2839     /* Match extended Unicode sequences. We will get here only if the
2840     support is in the binary; otherwise a compile-time error occurs. */
2841    
2842     else if (ctype == OP_EXTUNI)
2843     {
2844     for (fi = min;; fi++)
2845     {
2846     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2847     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2848     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2849     GETCHARINCTEST(c, eptr);
2850 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2851 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2852     while (eptr < md->end_subject)
2853     {
2854     int len = 1;
2855     if (!utf8) c = *eptr; else
2856     {
2857     GETCHARLEN(c, eptr, len);
2858     }
2859 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2860 nigel 77 if (prop_category != ucp_M) break;
2861     eptr += len;
2862     }
2863     }
2864     }
2865    
2866     else
2867     #endif /* SUPPORT_UCP */
2868    
2869     #ifdef SUPPORT_UTF8
2870     /* UTF-8 mode */
2871     if (utf8)
2872     {
2873     for (fi = min;; fi++)
2874     {
2875     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2876     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2877 nigel 91 if (fi >= max || eptr >= md->end_subject ||
2878     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
2879     eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
2880     RRETURN(MATCH_NOMATCH);
2881 nigel 77
2882     GETCHARINC(c, eptr);
2883     switch(ctype)
2884     {
2885 nigel 91 case OP_ANY: /* This is the DOTALL case */
2886 nigel 77 break;
2887    
2888     case OP_ANYBYTE:
2889     break;
2890    
2891     case OP_NOT_DIGIT:
2892     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2893     RRETURN(MATCH_NOMATCH);
2894     break;
2895    
2896     case OP_DIGIT:
2897     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2898     RRETURN(MATCH_NOMATCH);
2899     break;
2900    
2901     case OP_NOT_WHITESPACE:
2902     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2903     RRETURN(MATCH_NOMATCH);
2904     break;
2905    
2906     case OP_WHITESPACE:
2907     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2908     RRETURN(MATCH_NOMATCH);
2909     break;
2910    
2911     case OP_NOT_WORDCHAR:
2912     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2913     RRETURN(MATCH_NOMATCH);
2914     break;
2915    
2916     case OP_WORDCHAR:
2917     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2918     RRETURN(MATCH_NOMATCH);
2919     break;
2920    
2921     default:
2922     RRETURN(PCRE_ERROR_INTERNAL);
2923     }
2924     }
2925     }
2926     else
2927     #endif
2928     /* Not UTF-8 mode */
2929     {
2930     for (fi = min;; fi++)
2931     {
2932     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2933     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2934 nigel 91 if (fi >= max || eptr >= md->end_subject ||
2935     ((ims & PCRE_DOTALL) == 0 &&
2936     eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
2937     RRETURN(MATCH_NOMATCH);
2938    
2939 nigel 77 c = *eptr++;
2940     switch(ctype)
2941     {
2942 nigel 91 case OP_ANY: /* This is the DOTALL case */
2943 nigel 77 break;
2944    
2945     case OP_ANYBYTE:
2946     break;
2947    
2948     case OP_NOT_DIGIT:
2949     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2950     break;
2951    
2952     case OP_DIGIT:
2953     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2954     break;
2955    
2956     case OP_NOT_WHITESPACE:
2957     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2958     break;
2959    
2960     case OP_WHITESPACE:
2961     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2962     break;
2963    
2964     case OP_NOT_WORDCHAR:
2965     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2966     break;
2967    
2968     case OP_WORDCHAR:
2969     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2970     break;
2971    
2972     default:
2973     RRETURN(PCRE_ERROR_INTERNAL);
2974     }
2975     }
2976     }
2977     /* Control never gets here */
2978     }
2979    
2980     /* If maximizing it is worth using inline code for speed, doing the type
2981     test once at the start (i.e. keep it out of the loop). Again, keep the
2982     UTF-8 and UCP stuff separate. */
2983    
2984     else
2985     {
2986     pp = eptr; /* Remember where we started */
2987    
2988     #ifdef SUPPORT_UCP
2989 nigel 87 if (prop_type >= 0)
2990 nigel 77 {
2991 nigel 87 switch(prop_type)
2992 nigel 77 {
2993 nigel 87 case PT_ANY:
2994     for (i = min; i < max; i++)
2995     {
2996     int len = 1;
2997     if (eptr >= md->end_subject) break;
2998     GETCHARLEN(c, eptr, len);
2999     if (prop_fail_result) break;
3000     eptr+= len;
3001     }
3002     break;
3003    
3004     case PT_LAMP:
3005     for (i = min; i < max; i++)
3006     {
3007     int len = 1;
3008     if (eptr >= md->end_subject) break;
3009     GETCHARLEN(c, eptr, len);
3010     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3011     if ((prop_chartype == ucp_Lu ||
3012     prop_chartype == ucp_Ll ||
3013     prop_chartype == ucp_Lt) == prop_fail_result)
3014     break;
3015     eptr+= len;
3016     }
3017     break;
3018    
3019     case PT_GC:
3020     for (i = min; i < max; i++)
3021     {
3022     int len = 1;
3023     if (eptr >= md->end_subject) break;
3024     GETCHARLEN(c, eptr, len);
3025     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3026     if ((prop_category == prop_value) == prop_fail_result)
3027     break;
3028     eptr+= len;
3029     }
3030     break;
3031    
3032     case PT_PC:
3033     for (i = min; i < max; i++)
3034     {
3035     int len = 1;
3036     if (eptr >= md->end_subject) break;
3037     GETCHARLEN(c, eptr, len);
3038     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3039     if ((prop_chartype == prop_value) == prop_fail_result)
3040     break;
3041     eptr+= len;
3042     }
3043     break;
3044    
3045     case PT_SC:
3046     for (i = min; i < max; i++)
3047     {
3048     int len = 1;
3049     if (eptr >= md->end_subject) break;
3050     GETCHARLEN(c, eptr, len);
3051     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3052     if ((prop_script == prop_value) == prop_fail_result)
3053     break;
3054     eptr+= len;
3055     }
3056     break;
3057 nigel 77 }
3058    
3059     /* eptr is now past the end of the maximum run */
3060    
3061     for(;;)
3062     {
3063     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3064     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3065     if (eptr-- == pp) break; /* Stop if tried at original pos */
3066     BACKCHAR(eptr);
3067     }
3068     }
3069    
3070     /* Match extended Unicode sequences. We will get here only if the
3071     support is in the binary; otherwise a compile-time error occurs. */
3072    
3073     else if (ctype == OP_EXTUNI)
3074     {
3075     for (i = min; i < max; i++)
3076     {
3077     if (eptr >= md->end_subject) break;
3078     GETCHARINCTEST(c, eptr);
3079 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3080 nigel 77 if (prop_category == ucp_M) break;
3081     while (eptr < md->end_subject)
3082     {
3083     int len = 1;
3084     if (!utf8) c = *eptr; else
3085     {
3086     GETCHARLEN(c, eptr, len);
3087     }
3088 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3089 nigel 77 if (prop_category != ucp_M) break;
3090     eptr += len;
3091     }
3092     }
3093    
3094     /* eptr is now past the end of the maximum run */
3095    
3096     for(;;)
3097     {
3098     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3099     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3100     if (eptr-- == pp) break; /* Stop if tried at original pos */
3101     for (;;) /* Move back over one extended */
3102     {
3103     int len = 1;
3104     BACKCHAR(eptr);
3105     if (!utf8) c = *eptr; else
3106     {
3107     GETCHARLEN(c, eptr, len);
3108     }
3109 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3110 nigel 77 if (prop_category != ucp_M) break;
3111     eptr--;
3112     }
3113     }
3114     }
3115    
3116     else
3117     #endif /* SUPPORT_UCP */
3118    
3119     #ifdef SUPPORT_UTF8
3120     /* UTF-8 mode */
3121    
3122     if (utf8)
3123     {
3124     switch(ctype)
3125     {
3126     case OP_ANY:
3127    
3128 nigel 91 /* Special code is required for UTF8, but when the maximum is
3129     unlimited we don't need it, so we repeat the non-UTF8 code. This is
3130     probably worth it, because .* is quite a common idiom. */
3131 nigel 77
3132     if (max < INT_MAX)
3133     {
3134     if ((ims & PCRE_DOTALL) == 0)
3135     {
3136     for (i = min; i < max; i++)
3137     {
3138 nigel 91 if (eptr >= md->end_subject ||
3139     (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3140     break;
3141 nigel 77 eptr++;
3142     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3143     }
3144     }
3145     else
3146     {
3147     for (i = min; i < max; i++)
3148     {
3149 nigel 91 if (eptr >= md->end_subject) break;
3150 nigel 77 eptr++;
3151     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3152     }
3153     }
3154     }
3155    
3156     /* Handle unlimited UTF-8 repeat */
3157    
3158     else
3159     {
3160     if ((ims & PCRE_DOTALL) == 0)
3161     {
3162     for (i = min; i < max; i++)
3163     {
3164 nigel 91 if (eptr >= md->end_subject ||
3165     (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3166     break;
3167 nigel 77 eptr++;
3168     }
3169     break;
3170     }
3171     else
3172     {
3173     c = max - min;
3174     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3175     eptr += c;
3176     }
3177     }
3178     break;
3179    
3180     /* The byte case is the same as non-UTF8 */
3181    
3182     case OP_ANYBYTE:
3183     c = max - min;
3184     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3185     eptr += c;
3186     break;
3187    
3188     case OP_NOT_DIGIT:
3189     for (i = min; i < max; i++)
3190     {
3191     int len = 1;
3192     if (eptr >= md->end_subject) break;
3193     GETCHARLEN(c, eptr, len);
3194     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3195     eptr+= len;
3196     }
3197     break;
3198    
3199     case OP_DIGIT:
3200     for (i = min; i < max; i++)
3201     {
3202     int len = 1;
3203     if (eptr >= md->end_subject) break;
3204     GETCHARLEN(c, eptr, len);
3205     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3206     eptr+= len;
3207     }
3208     break;
3209    
3210     case OP_NOT_WHITESPACE:
3211     for (i = min; i < max; i++)
3212     {
3213     int len = 1;
3214     if (eptr >= md->end_subject) break;
3215     GETCHARLEN(c, eptr, len);
3216     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3217     eptr+= len;
3218     }
3219     break;
3220    
3221     case OP_WHITESPACE:
3222     for (i = min; i < max; i++)
3223     {
3224     int len = 1;
3225     if (eptr >= md->end_subject) break;
3226     GETCHARLEN(c, eptr, len);
3227     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3228     eptr+= len;
3229     }
3230     break;
3231    
3232     case OP_NOT_WORDCHAR:
3233     for (i = min; i < max; i++)
3234     {
3235     int len = 1;
3236     if (eptr >= md->end_subject) break;
3237     GETCHARLEN(c, eptr, len);
3238     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3239     eptr+= len;
3240     }
3241     break;
3242    
3243     case OP_WORDCHAR:
3244     for (i = min; i < max; i++)
3245     {
3246     int len = 1;
3247     if (eptr >= md->end_subject) break;
3248     GETCHARLEN(c, eptr, len);
3249     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3250     eptr+= len;
3251     }
3252     break;
3253    
3254     default:
3255     RRETURN(PCRE_ERROR_INTERNAL);
3256     }
3257    
3258     /* eptr is now past the end of the maximum run */
3259    
3260     for(;;)
3261     {
3262     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3263     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3264     if (eptr-- == pp) break; /* Stop if tried at original pos */
3265     BACKCHAR(eptr);
3266     }
3267     }
3268     else
3269     #endif
3270    
3271     /* Not UTF-8 mode */
3272     {
3273     switch(ctype)
3274     {
3275     case OP_ANY:
3276     if ((ims & PCRE_DOTALL) == 0)
3277     {
3278     for (i = min; i < max; i++)
3279     {
3280 nigel 91 if (eptr >= md->end_subject ||
3281     (eptr <= md->end_subject - md->nllen && IS_NEWLINE(eptr)))
3282     break;
3283 nigel 77 eptr++;
3284     }
3285     break;
3286     }
3287     /* For DOTALL case, fall through and treat as \C */
3288    
3289     case OP_ANYBYTE:
3290     c = max - min;
3291     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3292     eptr += c;
3293     break;
3294    
3295     case OP_NOT_DIGIT:
3296     for (i = min; i < max; i++)
3297     {
3298     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3299     break;
3300     eptr++;
3301     }
3302     break;
3303    
3304     case OP_DIGIT:
3305     for (i = min; i < max; i++)
3306     {
3307     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3308     break;
3309     eptr++;
3310     }
3311     break;
3312    
3313     case OP_NOT_WHITESPACE:
3314     for (i = min; i < max; i++)
3315     {
3316     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3317     break;
3318     eptr++;
3319     }
3320     break;
3321    
3322     case OP_WHITESPACE:
3323     for (i = min; i < max; i++)
3324     {
3325     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3326     break;
3327     eptr++;
3328     }
3329     break;
3330    
3331     case OP_NOT_WORDCHAR:
3332     for (i = min; i < max; i++)
3333     {
3334     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3335     break;
3336     eptr++;
3337     }
3338     break;
3339    
3340     case OP_WORDCHAR:
3341     for (i = min; i < max; i++)
3342     {
3343     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3344     break;
3345     eptr++;
3346     }
3347     break;
3348    
3349     default:
3350     RRETURN(PCRE_ERROR_INTERNAL);
3351     }
3352    
3353     /* eptr is now past the end of the maximum run */
3354    
3355     while (eptr >= pp)
3356     {
3357     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3358     eptr--;
3359     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3360     }
3361     }
3362    
3363     /* Get here if we can't make it match with any permitted repetitions */
3364    
3365     RRETURN(MATCH_NOMATCH);
3366     }
3367     /* Control never gets here */
3368    
3369     /* There's been some horrible disaster. Since all codes > OP_BRA are
3370     for capturing brackets, and there shouldn't be any gaps between 0 and
3371     OP_BRA, arrival here can only mean there is something seriously wrong
3372     in the code above or the OP_xxx definitions. */
3373    
3374     default:
3375     DPRINTF(("Unknown opcode %d\n", *ecode));
3376     RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3377     }
3378    
3379     /* Do not stick any code in here without much thought; it is assumed
3380     that "continue" in the code above comes out to here to repeat the main
3381     loop. */
3382    
3383     } /* End of main loop */
3384     /* Control never reaches here */
3385     }
3386    
3387    
3388     /***************************************************************************
3389     ****************************************************************************
3390     RECURSION IN THE match() FUNCTION
3391    
3392     Undefine all the macros that were defined above to handle this. */
3393     /* pcre_exec() below declares its own ims, flags, length and pp; left defined, these macros would be expanded inside those declarations. */
3394     #ifdef NO_RECURSE
3395     #undef eptr
3396     #undef ecode
3397     #undef offset_top
3398     #undef ims
3399     #undef eptrb
3400     #undef flags
3401     /* NOTE(review): the blank-line grouping presumably mirrors the order of the macro definitions earlier in the file - confirm there. */
3402     #undef callpat
3403     #undef charptr
3404     #undef data
3405     #undef next
3406     #undef pp
3407     #undef prev
3408     #undef saved_eptr
3409    
3410     #undef new_recursive
3411    
3412     #undef cur_is_word
3413     #undef condition
3414     #undef minimize
3415     #undef prev_is_word
3416    
3417     #undef original_ims
3418    
3419     #undef ctype
3420     #undef length
3421     #undef max
3422     #undef min
3423     #undef number
3424     #undef offset
3425     #undef op
3426     #undef save_capture_last
3427     #undef save_offset1
3428     #undef save_offset2
3429     #undef save_offset3
3430     #undef stacksave
3431    
3432     #undef newptrb
3433    
3434     #endif /* NO_RECURSE */
3435    
3436     /* fc and fi are defined as macros in both cases (with or without NO_RECURSE), so they are undefined outside the conditional above */
3437    
3438     #undef fc
3439     #undef fi
3440    
3441     /***************************************************************************
3442     ***************************************************************************/
3443    
3444    
3445    
3446     /*************************************************
3447     * Execute a Regular Expression *
3448     *************************************************/
3449    
3450     /* This function applies a compiled re to a subject string and picks out
3451     portions of the string if it matches. Two elements in the vector are set for
3452     each substring: the offsets to the start and end of the substring.
3453    
3454     Arguments:
3455     argument_re points to the compiled expression
3456     extra_data points to extra data or is NULL
3457     subject points to the subject string
3458     length length of subject string (may contain binary zeros)
3459     start_offset where to start in the subject string
3460     options option bits
3461     offsets points to a vector of ints to be filled in with offsets
3462     offsetcount the number of elements in the vector
3463    
3464     Returns: > 0 => success; value is the number of elements filled in
3465     = 0 => success, but offsets is not big enough
3466     -1 => failed to match
3467     < -1 => some kind of unexpected problem
3468     */
3469    
3470 nigel 87 PCRE_DATA_SCOPE int
3471 nigel 77 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3472 nigel 87 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3473 nigel 77 int offsetcount)
3474     {
3475     int rc, resetcount, ocount;
3476     int first_byte = -1;
3477     int req_byte = -1;
3478     int req_byte2 = -1;
3479 nigel 91 int newline;
3480     unsigned long int ims;
3481 nigel 77 BOOL using_temporary_offsets = FALSE;
3482     BOOL anchored;
3483     BOOL startline;
3484     BOOL firstline;
3485     BOOL first_byte_caseless = FALSE;
3486     BOOL req_byte_caseless = FALSE;
3487     match_data match_block;
3488 nigel 91 match_data *md = &match_block;
3489 nigel 77 const uschar *tables;
3490     const uschar *start_bits = NULL;
3491 nigel 87 USPTR start_match = (USPTR)subject + start_offset;
3492     USPTR end_subject;
3493     USPTR req_byte_ptr = start_match - 1;
3494 nigel 77
3495     pcre_study_data internal_study;
3496     const pcre_study_data *study;
3497    
3498     real_pcre internal_re;
3499     const real_pcre *external_re = (const real_pcre *)argument_re;
3500     const real_pcre *re = external_re;
3501    
3502     /* Plausibility checks */
3503    
3504     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3505     if (re == NULL || subject == NULL ||
3506     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3507     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3508    
3509     /* Fish out the optional data from the extra_data structure, first setting
3510     the default values. */
3511    
3512     study = NULL;
3513 nigel 91 md->match_limit = MATCH_LIMIT;
3514     md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3515     md->callout_data = NULL;
3516 nigel 77
3517     /* The table pointer is always in native byte order. */
3518    
3519     tables = external_re->tables;
3520    
3521     if (extra_data != NULL)
3522     {
3523     register unsigned int flags = extra_data->flags;
3524     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3525     study = (const pcre_study_data *)extra_data->study_data;
3526     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3527 nigel 91 md->match_limit = extra_data->match_limit;
3528 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3529 nigel 91 md->match_limit_recursion = extra_data->match_limit_recursion;
3530 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3531 nigel 91 md->callout_data = extra_data->callout_data;
3532 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3533     }
3534    
3535     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3536     is a feature that makes it possible to save compiled regexes and re-use them
3537     in other programs later. */
3538    
3539     if (tables == NULL) tables = _pcre_default_tables;
3540    
3541     /* Check that the first field in the block is the magic number. If it is not,
3542     test for a regex that was compiled on a host of opposite endianness. If this is
3543     the case, flipped values are put in internal_re and internal_study if there was
3544     study data too. */
3545    
3546     if (re->magic_number != MAGIC_NUMBER)
3547     {
3548     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3549     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3550     if (study != NULL) study = &internal_study;
3551     }
3552    
3553     /* Set up other data */
3554    
3555     anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3556     startline = (re->options & PCRE_STARTLINE) != 0;
3557     firstline = (re->options & PCRE_FIRSTLINE) != 0;
3558    
3559     /* The code starts after the real_pcre block and the capture name table. */
3560    
3561 nigel 91 md->start_code = (const uschar *)external_re + re->name_table_offset +
3562 nigel 77 re->name_count * re->name_entry_size;
3563    
3564 nigel 91 md->start_subject = (USPTR)subject;
3565     md->start_offset = start_offset;
3566     md->end_subject = md->start_subject + length;
3567     end_subject = md->end_subject;
3568 nigel 77
3569 nigel 91 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3570     md->utf8 = (re->options & PCRE_UTF8) != 0;
3571 nigel 77
3572 nigel 91 md->notbol = (options & PCRE_NOTBOL) != 0;
3573     md->noteol = (options & PCRE_NOTEOL) != 0;
3574     md->notempty = (options & PCRE_NOTEMPTY) != 0;
3575     md->partial = (options & PCRE_PARTIAL) != 0;
3576     md->hitend = FALSE;
3577 nigel 77
3578 nigel 91 md->recursive = NULL; /* No recursion at top level */
3579 nigel 77
3580 nigel 91 md->lcc = tables + lcc_offset;
3581     md->ctypes = tables + ctypes_offset;
3582 nigel 77
3583 nigel 91 /* Handle different types of newline. The two bits give four cases. If nothing
3584     is set at run time, whatever was used at compile time applies. */
3585    
3586     switch ((((options & PCRE_NEWLINE_CRLF) == 0)? re->options : options) &
3587     PCRE_NEWLINE_CRLF)
3588     {
3589     default: newline = NEWLINE; break; /* Compile-time default */
3590     case PCRE_NEWLINE_CR: newline = '\r'; break;
3591     case PCRE_NEWLINE_LF: newline = '\n'; break;
3592     case PCRE_NEWLINE_CR+
3593     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3594     }
3595    
3596     if (newline > 255)
3597     {
3598     md->nllen = 2;
3599     md->nl[0] = (newline >> 8) & 255;
3600     md->nl[1] = newline & 255;
3601     }
3602     else
3603     {
3604     md->nllen = 1;
3605     md->nl[0] = newline;
3606     }
3607    
3608 nigel 77 /* Partial matching is supported only for a restricted set of regexes at the
3609     moment. */
3610    
3611 nigel 91 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3612 nigel 77 return PCRE_ERROR_BADPARTIAL;
3613    
3614     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3615     back the character offset. */
3616    
3617     #ifdef SUPPORT_UTF8
3618 nigel 91 if (md->utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3619 nigel 77 {
3620     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3621     return PCRE_ERROR_BADUTF8;
3622     if (start_offset > 0 && start_offset < length)
3623     {
3624     int tb = ((uschar *)subject)[start_offset];
3625     if (tb > 127)
3626     {
3627     tb &= 0xc0;
3628     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3629     }
3630     }
3631     }
3632     #endif
3633    
3634     /* The ims options can vary during the matching as a result of the presence
3635     of (?ims) items in the pattern. They are kept in a local variable so that
3636     restoring at the exit of a group is easy. */
3637    
3638     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3639    
3640     /* If the expression has got more back references than the offsets supplied can
3641     hold, we get a temporary chunk of working store to use during the matching.
3642     Otherwise, we can use the vector supplied, rounding down its size to a multiple
3643     of 3. */
3644    
3645     ocount = offsetcount - (offsetcount % 3);
3646    
3647     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3648     {
3649     ocount = re->top_backref * 3 + 3;
3650 nigel 91 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3651     if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3652 nigel 77 using_temporary_offsets = TRUE;
3653     DPRINTF(("Got memory to hold back references\n"));
3654     }
3655 nigel 91 else md->offset_vector = offsets;
3656 nigel 77
3657 nigel 91 md->offset_end = ocount;
3658     md->offset_max = (2*ocount)/3;
3659     md->offset_overflow = FALSE;
3660     md->capture_last = -1;
3661 nigel 77
3662     /* Compute the minimum number of offsets that we need to reset each time. Doing
3663     this makes a huge difference to execution time when there aren't many brackets
3664     in the pattern. */
3665    
3666     resetcount = 2 + re->top_bracket * 2;
3667     if (resetcount > offsetcount) resetcount = ocount;
3668    
3669     /* Reset the working variable associated with each extraction. These should
3670     never be used unless previously set, but they get saved and restored, and so we
3671     initialize them to avoid reading uninitialized locations. */
3672    
3673 nigel 91 if (md->offset_vector != NULL)
3674 nigel 77 {
3675 nigel 91 register int *iptr = md->offset_vector + ocount;
3676 nigel 77 register int *iend = iptr - resetcount/2 + 1;
3677     while (--iptr >= iend) *iptr = -1;
3678     }
3679    
3680     /* Set up the first character to match, if available. The first_byte value is
3681     never set for an anchored regular expression, but the anchoring may be forced
3682     at run time, so we have to test for anchoring. The first char may be unset for
3683     an unanchored pattern, of course. If there's no first char and the pattern was
3684     studied, there may be a bitmap of possible first characters. */
3685    
3686     if (!anchored)
3687     {
3688     if ((re->options & PCRE_FIRSTSET) != 0)
3689     {
3690     first_byte = re->first_byte & 255;
3691     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3692 nigel 91 first_byte = md->lcc[first_byte];
3693 nigel 77 }
3694     else
3695     if (!startline && study != NULL &&
3696     (study->options & PCRE_STUDY_MAPPED) != 0)
3697     start_bits = study->start_bits;
3698     }
3699    
3700     /* For anchored or unanchored matches, there may be a "last known required
3701     character" set. */
3702    
3703     if ((re->options & PCRE_REQCHSET) != 0)
3704     {
3705     req_byte = re->req_byte & 255;
3706     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3707     req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3708     }
3709    
3710     /* Loop for handling unanchored repeated matching attempts; for anchored regexes
3711     the loop runs just once. */
3712    
3713     do
3714     {
3715 nigel 87 USPTR save_end_subject = end_subject;
3716 nigel 77
3717     /* Reset the maximum number of extractions we might see. */
3718    
3719 nigel 91 if (md->offset_vector != NULL)
3720 nigel 77 {
3721 nigel 91 register int *iptr = md->offset_vector;
3722 nigel 77 register int *iend = iptr + resetcount;
3723     while (iptr < iend) *iptr++ = -1;
3724     }
3725    
3726     /* Advance to a unique first char if possible. If firstline is TRUE, the
3727     start of the match is constrained to the first line of a multiline string.
3728     Implement this by temporarily adjusting end_subject so that we stop scanning
3729     at a newline. If the match fails at the newline, later code breaks this loop.
3730     */
3731    
3732     if (firstline)
3733     {
3734 nigel 87 USPTR t = start_match;
3735 nigel 91 while (t <= save_end_subject - md->nllen && !IS_NEWLINE(t)) t++;
3736 nigel 77 end_subject = t;
3737     }
3738    
3739     /* Now test for a unique first byte */
3740    
3741     if (first_byte >= 0)
3742     {
3743     if (first_byte_caseless)
3744     while (start_match < end_subject &&
3745 nigel 91 md->lcc[*start_match] != first_byte)
3746 nigel 77 start_match++;
3747     else
3748     while (start_match < end_subject && *start_match != first_byte)
3749     start_match++;
3750     }
3751    
3752 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
3753 nigel 77
3754     else if (startline)
3755     {
3756 nigel 91 if (start_match >= md->start_subject + md->nllen +
3757     start_offset)
3758 nigel 77 {
3759 nigel 91 while (start_match <= end_subject &&
3760     !IS_NEWLINE(start_match - md->nllen))
3761 nigel 77 start_match++;
3762     }
3763     }
3764    
3765     /* Or to a non-unique first char after study */
3766    
3767     else if (start_bits != NULL)
3768     {
3769     while (start_match < end_subject)
3770     {
3771     register unsigned int c = *start_match;
3772     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3773     }
3774     }
3775    
3776     /* Restore fudged end_subject */
3777    
3778     end_subject = save_end_subject;
3779    
3780     #ifdef DEBUG /* Sigh. Some compilers never learn. */
3781     printf(">>>> Match against: ");
3782 nigel 91 pchars(start_match, end_subject - start_match, TRUE, md);
3783 nigel 77 printf("\n");
3784     #endif
3785    
3786     /* If req_byte is set, we know that that character must appear in the subject
3787     for the match to succeed. If the first character is set, req_byte must be
3788     later in the subject; otherwise the test starts at the match point. This
3789     optimization can save a huge amount of backtracking in patterns with nested
3790     unlimited repeats that aren't going to match. Writing separate code for
3791     cased/caseless versions makes it go faster, as does using an autoincrement
3792     and backing off on a match.
3793    
3794     HOWEVER: when the subject string is very, very long, searching to its end can
3795     take a long time, and give bad performance on quite ordinary patterns. This
3796     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3797     don't do this when the string is sufficiently long.
3798    
3799     ALSO: this processing is disabled when partial matching is requested.
3800     */
3801    
3802     if (req_byte >= 0 &&
3803     end_subject - start_match < REQ_BYTE_MAX &&
3804 nigel 91 !md->partial)
3805 nigel 77 {
3806 nigel 87 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
3807 nigel 77
3808     /* We don't need to repeat the search if we haven't yet reached the
3809     place we found it at last time. */
3810    
3811     if (p > req_byte_ptr)
3812     {
3813     if (req_byte_caseless)
3814     {
3815     while (p < end_subject)
3816     {
3817     register int pp = *p++;
3818     if (pp == req_byte || pp == req_byte2) { p--; break; }
3819     }
3820     }
3821     else
3822     {
3823     while (p < end_subject)
3824     {
3825     if (*p++ == req_byte) { p--; break; }
3826     }
3827     }
3828    
3829     /* If we can't find the required character, break the matching loop */
3830    
3831     if (p >= end_subject) break;
3832    
3833     /* If we have found the required character, save the point where we
3834     found it, so that we don't search again next time round the loop if
3835     the start hasn't passed this character yet. */
3836    
3837     req_byte_ptr = p;
3838     }
3839     }
3840    
3841     /* When a match occurs, substrings will be set for all internal extractions;
3842     we just need to set up the whole thing as substring 0 before returning. If
3843     there were too many extractions, set the return code to zero. In the case
3844     where we had to get some local store to hold offsets for backreferences, copy
3845     those back references that we can. In this case there need not be overflow
3846     if certain parts of the pattern were not used. */
3847    
3848 nigel 91 md->start_match = start_match;
3849     md->match_call_count = 0;
3850 nigel 77
3851 nigel 91 rc = match(start_match, md->start_code, 2, md, ims, NULL, match_isgroup, 0);
3852 nigel 77
3853     /* When the result is no match, if the subject's first character was a
3854     newline and the PCRE_FIRSTLINE option is set, break (which will return
3855     PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3856     newline in the subject. Otherwise, advance the pointer to the next character
3857     and continue - but the continuation will actually happen only when the
3858     pattern is not anchored. */
3859    
3860     if (rc == MATCH_NOMATCH)
3861     {
3862 nigel 91 if (firstline &&
3863     start_match <= md->end_subject - md->nllen &&
3864     IS_NEWLINE(start_match))
3865     break;
3866 nigel 77 start_match++;
3867     #ifdef SUPPORT_UTF8
3868 nigel 91 if (md->utf8)
3869 nigel 77 while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3870     start_match++;
3871     #endif
3872     continue;
3873     }
3874    
3875     if (rc != MATCH_MATCH)
3876     {
3877     DPRINTF((">>>> error: returning %d\n", rc));
3878     return rc;
3879     }
3880    
3881     /* We have a match! Copy the offset information from temporary store if
3882     necessary */
3883    
3884     if (using_temporary_offsets)
3885     {
3886     if (offsetcount >= 4)
3887     {
3888 nigel 91 memcpy(offsets + 2, md->offset_vector + 2,
3889 nigel 77 (offsetcount - 2) * sizeof(int));
3890     DPRINTF(("Copied offsets from temporary memory\n"));
3891     }
3892 nigel 91 if (md->end_offset_top > offsetcount)
3893     md->offset_overflow = TRUE;
3894 nigel 77
3895     DPRINTF(("Freeing temporary memory\n"));
3896 nigel 91 (pcre_free)(md->offset_vector);
3897 nigel 77 }
3898    
3899 nigel 91 rc = md->offset_overflow? 0 : md->end_offset_top/2;
3900 nigel 77
3901     if (offsetcount < 2) rc = 0; else
3902     {
3903 nigel 91 offsets[0] = start_match - md->start_subject;
3904     offsets[1] = md->end_match_ptr - md->start_subject;
3905 nigel 77 }
3906    
3907     DPRINTF((">>>> returning %d\n", rc));
3908     return rc;
3909     }
3910    
3911     /* This "while" is the end of the "do" above */
3912    
3913     while (!anchored && start_match <= end_subject);
3914    
3915     if (using_temporary_offsets)
3916     {
3917     DPRINTF(("Freeing temporary memory\n"));
3918 nigel 91 (pcre_free)(md->offset_vector);
3919 nigel 77 }
3920    
3921 nigel 91 if (md->partial && md->hitend)
3922 nigel 77 {
3923     DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3924     return PCRE_ERROR_PARTIAL;
3925     }
3926     else
3927     {
3928     DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3929     return PCRE_ERROR_NOMATCH;
3930     }
3931     }
3932    
3933     /* End of pcre_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12