/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 77 - (hide annotations) (download)
Sat Feb 24 21:40:45 2007 UTC (7 years, 6 months ago) by nigel
File MIME type: text/plain
File size: 110756 byte(s)
Load pcre-6.0 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9     Copyright (c) 1997-2005 University of Cambridge
10    
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45    
46     #include "pcre_internal.h"
47    
48    
49     /* Structure for building a chain of data that actually lives on the
50     stack, for holding the values of the subject pointer at the start of each
51     subpattern, so as to detect when an empty string has been matched by a
52     subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
53     are on the heap, not on the stack. */
54    
55     typedef struct eptrblock {
56     struct eptrblock *epb_prev;
57     const uschar *epb_saved_eptr;
58     } eptrblock;
59    
/* Flag bits for the "flags" argument of the match() function. They are
distinct bits, so they may be ORed together. */

#define match_condassert   0x01  /* Called to check a condition assertion */
#define match_isgroup      0x02  /* Set if start of bracketed group */
64    
/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative, so any value >= 0 is one
of the two below. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0
70    
/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30
76    
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry i of each describes the same repeat.
NOTE(review): presumably indexed by (opcode - first repeat opcode) for the six
single-character repeat forms - confirm against the code that uses them. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
81    
82    
83    
84     #ifdef DEBUG
85     /*************************************************
86     * Debugging function to print chars *
87     *************************************************/
88    
89     /* Print a sequence of chars in printable format, stopping at the end of the
90     subject if the requested.
91    
92     Arguments:
93     p points to characters
94     length number to print
95     is_subject TRUE if printing from within md->start_subject
96     md pointer to matching data block, if is_subject is TRUE
97    
98     Returns: nothing
99     */
100    
101     static void
102     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
103     {
104     int c;
105     if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
106     while (length-- > 0)
107     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
108     }
109     #endif
110    
111    
112    
113     /*************************************************
114     * Match a back-reference *
115     *************************************************/
116    
117     /* If a back reference hasn't been set, the length that is passed is greater
118     than the number of characters left in the string, so the match fails.
119    
120     Arguments:
121     offset index into the offset vector
122     eptr points into the subject
123     length length to be matched
124     md points to match data block
125     ims the ims flags
126    
127     Returns: TRUE if matched
128     */
129    
130     static BOOL
131     match_ref(int offset, register const uschar *eptr, int length, match_data *md,
132     unsigned long int ims)
133     {
134     const uschar *p = md->start_subject + md->offset_vector[offset];
135    
136     #ifdef DEBUG
137     if (eptr >= md->end_subject)
138     printf("matching subject <null>");
139     else
140     {
141     printf("matching subject ");
142     pchars(eptr, length, TRUE, md);
143     }
144     printf(" against backref ");
145     pchars(p, length, FALSE, md);
146     printf("\n");
147     #endif
148    
149     /* Always fail if not enough characters left */
150    
151     if (length > md->end_subject - eptr) return FALSE;
152    
153     /* Separate the caselesss case for speed */
154    
155     if ((ims & PCRE_CASELESS) != 0)
156     {
157     while (length-- > 0)
158     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
159     }
160     else
161     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
162    
163     return TRUE;
164     }
165    
166    
167    
168     /***************************************************************************
169     ****************************************************************************
170     RECURSION IN THE match() FUNCTION
171    
172     The match() function is highly recursive. Some regular expressions can cause
173     it to recurse thousands of times. I was writing for Unix, so I just let it
174     call itself recursively. This uses the stack for saving everything that has
175     to be saved for a recursive call. On Unix, the stack can be large, and this
176     works fine.
177    
178     It turns out that on non-Unix systems there are problems with programs that
179     use a lot of stack. (This despite the fact that every last chip has oodles
180     of memory these days, and techniques for extending the stack have been known
181     for decades.) So....
182    
183     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
184     calls by keeping local variables that need to be preserved in blocks of memory
185     obtained from malloc instead of on the stack. Macros are used to
186     achieve this so that the actual code doesn't look very different to what it
187     always used to.
188     ****************************************************************************
189     ***************************************************************************/
190    
191    
192     /* These versions of the macros use the stack, as normal */
193    
194     #ifndef NO_RECURSE
/* REGISTER marks the eptr and ecode arguments of match() as register
variables. RMATCH is a genuine recursive call of match() whose result is
assigned to rx; RRETURN is a plain return. */

#define REGISTER register
#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) rx = match(ra,rb,rc,rd,re,rf,rg)
#define RRETURN(ra) return (ra)
198     #else
199    
200    
/* These versions of the macros manage a private stack on the heap. Note
that the rd argument of RMATCH isn't actually used. It's the md argument of
match(), which never changes.

REGISTER is empty in this configuration.

RMATCH obtains a new frame, records in the current frame (via setjmp) the
point to resume at, copies the "arguments" into the new frame, chains it to
the current one, makes it current, and jumps to HEAP_RECURSE. When the
"called" level finishes, RRETURN longjmps back here; "frame" is then reloaded
from md->thisframe rather than trusted across the longjmp, and the result is
taken from the frame's Xresult field.

If no memory can be obtained for the new frame, the current level gives up
with PCRE_ERROR_NOMEMORY instead of writing through a NULL pointer. */

#define REGISTER

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  do\
    {\
    heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
    if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
    if (setjmp(frame->Xwhere) == 0)\
      {\
      newframe->Xeptr = ra;\
      newframe->Xecode = rb;\
      newframe->Xoffset_top = rc;\
      newframe->Xims = re;\
      newframe->Xeptrb = rf;\
      newframe->Xflags = rg;\
      newframe->Xprevframe = frame;\
      frame = newframe;\
      DPRINTF(("restarting from line %d\n", __LINE__));\
      goto HEAP_RECURSE;\
      }\
    else\
      {\
      DPRINTF(("longjumped back to line %d\n", __LINE__));\
      frame = md->thisframe;\
      rx = frame->Xresult;\
      }\
    }\
  while (0)
230    
/* RRETURN, heap version: finish the current level with result ra. The value
is captured first, because the current frame is about to be freed and "frame"
repointed at the previous one, after which an expression that refers to frame
variables would no longer mean what the caller wrote. If there is a previous
frame, the result is left in its Xresult field, md->thisframe is set to it,
and control longjmps to the point recorded by the RMATCH that started this
level. At the top level (no previous frame) match() really returns. */

#define RRETURN(ra)\
  do\
    {\
    int rreturn_value = (ra);\
    heapframe *newframe = frame;\
    frame = newframe->Xprevframe;\
    (pcre_stack_free)(newframe);\
    if (frame != NULL)\
      {\
      frame->Xresult = rreturn_value;\
      md->thisframe = frame;\
      longjmp(frame->Xwhere, 1);\
      }\
    return rreturn_value;\
    }\
  while (0)
244    
245    
246     /* Structure for remembering the local variables in a private frame */
247    
248     typedef struct heapframe {
249     struct heapframe *Xprevframe;
250    
251     /* Function arguments that may change */
252    
253     const uschar *Xeptr;
254     const uschar *Xecode;
255     int Xoffset_top;
256     long int Xims;
257     eptrblock *Xeptrb;
258     int Xflags;
259    
260     /* Function local variables */
261    
262     const uschar *Xcallpat;
263     const uschar *Xcharptr;
264     const uschar *Xdata;
265     const uschar *Xnext;
266     const uschar *Xpp;
267     const uschar *Xprev;
268     const uschar *Xsaved_eptr;
269    
270     recursion_info Xnew_recursive;
271    
272     BOOL Xcur_is_word;
273     BOOL Xcondition;
274     BOOL Xminimize;
275     BOOL Xprev_is_word;
276    
277     unsigned long int Xoriginal_ims;
278    
279     #ifdef SUPPORT_UCP
280     int Xprop_type;
281     int Xprop_fail_result;
282     int Xprop_category;
283     int Xprop_chartype;
284     int Xprop_othercase;
285     int Xprop_test_against;
286     int *Xprop_test_variable;
287     #endif
288    
289     int Xctype;
290     int Xfc;
291     int Xfi;
292     int Xlength;
293     int Xmax;
294     int Xmin;
295     int Xnumber;
296     int Xoffset;
297     int Xop;
298     int Xsave_capture_last;
299     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
300     int Xstacksave[REC_STACK_SAVE_MAX];
301    
302     eptrblock Xnewptrb;
303    
304     /* Place to pass back result, and where to jump back to */
305    
306     int Xresult;
307     jmp_buf Xwhere;
308    
309     } heapframe;
310    
311     #endif
312    
313    
314     /***************************************************************************
315     ***************************************************************************/
316    
317    
318    
319     /*************************************************
320     * Match from current position *
321     *************************************************/
322    
323     /* On entry ecode points to the first opcode, and eptr to the first character
324     in the subject string, while eptrb holds the value of eptr at the start of the
325     last bracketed group - used for breaking infinite loops matching zero-length
326     strings. This function is called recursively in many circumstances. Whenever it
327     returns a negative (error) response, the outer incarnation must also return the
328     same response.
329    
330     Performance note: It might be tempting to extract commonly used fields from the
331     md structure (e.g. utf8, end_subject) into individual variables to improve
332     performance. Tests using gcc on a SPARC disproved this; in the first case, it
333     made performance worse.
334    
335     Arguments:
336     eptr pointer in subject
337     ecode position in code
338     offset_top current top pointer
339     md pointer to "static" info for the match
340     ims current /i, /m, and /s options
341     eptrb pointer to chain of blocks containing eptr at start of
342     brackets - for testing for empty matches
343     flags can contain
344     match_condassert - this is an assertion condition
345     match_isgroup - this is the start of a bracketed group
346    
347     Returns: MATCH_MATCH if matched ) these values are >= 0
348     MATCH_NOMATCH if failed to match )
349     a negative PCRE_ERROR_xxx value if aborted by an error condition
350     (e.g. stopped by recursion limit)
351     */
352    
353     static int
354     match(REGISTER const uschar *eptr, REGISTER const uschar *ecode,
355     int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
356     int flags)
357     {
358     /* These variables do not need to be preserved over recursion in this function,
359     so they can be ordinary variables in all cases. Mark them with "register"
360     because they are used a lot in loops. */
361    
362     register int rrc; /* Returns from recursive calls */
363     register int i; /* Used for loops not involving calls to RMATCH() */
364     register int c; /* Character values not kept over RMATCH() calls */
365     register BOOL utf8; /* Local copy of UTF-8 flag for speed */
366    
367     /* When recursion is not being used, all "local" variables that have to be
368     preserved over calls to RMATCH() are part of a "frame" which is obtained from
369     heap storage. Set up the top-level frame here; others are obtained from the
370     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
371    
372     #ifdef NO_RECURSE
373     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
374     frame->Xprevframe = NULL; /* Marks the top level */
375    
376     /* Copy in the original argument variables */
377    
378     frame->Xeptr = eptr;
379     frame->Xecode = ecode;
380     frame->Xoffset_top = offset_top;
381     frame->Xims = ims;
382     frame->Xeptrb = eptrb;
383     frame->Xflags = flags;
384    
385     /* This is where control jumps back to to effect "recursion" */
386    
387     HEAP_RECURSE:
388    
389     /* Macros make the argument variables come from the current frame */
390    
391     #define eptr frame->Xeptr
392     #define ecode frame->Xecode
393     #define offset_top frame->Xoffset_top
394     #define ims frame->Xims
395     #define eptrb frame->Xeptrb
396     #define flags frame->Xflags
397    
398     /* Ditto for the local variables */
399    
400     #ifdef SUPPORT_UTF8
401     #define charptr frame->Xcharptr
402     #endif
403     #define callpat frame->Xcallpat
404     #define data frame->Xdata
405     #define next frame->Xnext
406     #define pp frame->Xpp
407     #define prev frame->Xprev
408     #define saved_eptr frame->Xsaved_eptr
409    
410     #define new_recursive frame->Xnew_recursive
411    
412     #define cur_is_word frame->Xcur_is_word
413     #define condition frame->Xcondition
414     #define minimize frame->Xminimize
415     #define prev_is_word frame->Xprev_is_word
416    
417     #define original_ims frame->Xoriginal_ims
418    
419     #ifdef SUPPORT_UCP
420     #define prop_type frame->Xprop_type
421     #define prop_fail_result frame->Xprop_fail_result
422     #define prop_category frame->Xprop_category
423     #define prop_chartype frame->Xprop_chartype
424     #define prop_othercase frame->Xprop_othercase
425     #define prop_test_against frame->Xprop_test_against
426     #define prop_test_variable frame->Xprop_test_variable
427     #endif
428    
429     #define ctype frame->Xctype
430     #define fc frame->Xfc
431     #define fi frame->Xfi
432     #define length frame->Xlength
433     #define max frame->Xmax
434     #define min frame->Xmin
435     #define number frame->Xnumber
436     #define offset frame->Xoffset
437     #define op frame->Xop
438     #define save_capture_last frame->Xsave_capture_last
439     #define save_offset1 frame->Xsave_offset1
440     #define save_offset2 frame->Xsave_offset2
441     #define save_offset3 frame->Xsave_offset3
442     #define stacksave frame->Xstacksave
443    
444     #define newptrb frame->Xnewptrb
445    
446     /* When recursion is being used, local variables are allocated on the stack and
447     get preserved during recursion in the normal way. In this environment, fi and
448     i, and fc and c, can be the same variables. */
449    
450     #else
451     #define fi i
452     #define fc c
453    
454    
455     #ifdef SUPPORT_UTF8 /* Many of these variables are used ony */
456     const uschar *charptr; /* small blocks of the code. My normal */
457     #endif /* style of coding would have declared */
458     const uschar *callpat; /* them within each of those blocks. */
459     const uschar *data; /* However, in order to accommodate the */
460     const uschar *next; /* version of this code that uses an */
461     const uschar *pp; /* external "stack" implemented on the */
462     const uschar *prev; /* heap, it is easier to declare them */
463     const uschar *saved_eptr; /* all here, so the declarations can */
464     /* be cut out in a block. The only */
465     recursion_info new_recursive; /* declarations within blocks below are */
466     /* for variables that do not have to */
467     BOOL cur_is_word; /* be preserved over a recursive call */
468     BOOL condition; /* to RMATCH(). */
469     BOOL minimize;
470     BOOL prev_is_word;
471    
472     unsigned long int original_ims;
473    
474     #ifdef SUPPORT_UCP
475     int prop_type;
476     int prop_fail_result;
477     int prop_category;
478     int prop_chartype;
479     int prop_othercase;
480     int prop_test_against;
481     int *prop_test_variable;
482     #endif
483    
484     int ctype;
485     int length;
486     int max;
487     int min;
488     int number;
489     int offset;
490     int op;
491     int save_capture_last;
492     int save_offset1, save_offset2, save_offset3;
493     int stacksave[REC_STACK_SAVE_MAX];
494    
495     eptrblock newptrb;
496     #endif
497    
498     /* These statements are here to stop the compiler complaining about uninitialized
499     variables. */
500    
501     #ifdef SUPPORT_UCP
502     prop_fail_result = 0;
503     prop_test_against = 0;
504     prop_test_variable = NULL;
505     #endif
506    
507     /* OK, now we can get on with the real code of the function. Recursion is
508     specified by the macros RMATCH and RRETURN. When NO_RECURSE is *not* defined,
509     these just turn into a recursive call to match() and a "return", respectively.
510     However, RMATCH isn't like a function call because it's quite a complicated
511     macro. It has to be used in one particular way. This shouldn't, however, impact
512     performance when true recursion is being used. */
513    
514     if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
515    
516     original_ims = ims; /* Save for resetting on ')' */
517     utf8 = md->utf8; /* Local copy of the flag */
518    
519     /* At the start of a bracketed group, add the current subject pointer to the
520     stack of such pointers, to be re-instated at the end of the group when we hit
521     the closing ket. When match() is called in other circumstances, we don't add to
522     this stack. */
523    
524     if ((flags & match_isgroup) != 0)
525     {
526     newptrb.epb_prev = eptrb;
527     newptrb.epb_saved_eptr = eptr;
528     eptrb = &newptrb;
529     }
530    
531     /* Now start processing the operations. */
532    
533     for (;;)
534     {
535     op = *ecode;
536     minimize = FALSE;
537    
538     /* For partial matching, remember if we ever hit the end of the subject after
539     matching at least one subject character. */
540    
541     if (md->partial &&
542     eptr >= md->end_subject &&
543     eptr > md->start_match)
544     md->hitend = TRUE;
545    
546     /* Opening capturing bracket. If there is space in the offset vector, save
547     the current subject position in the working slot at the top of the vector. We
548     mustn't change the current values of the data slot, because they may be set
549     from a previous iteration of this group, and be referred to by a reference
550     inside the group.
551    
552     If the bracket fails to match, we need to restore this value and also the
553     values of the final offsets, in case they were set by a previous iteration of
554     the same bracket.
555    
556     If there isn't enough space in the offset vector, treat this as if it were a
557     non-capturing bracket. Don't worry about setting the flag for the error case
558     here; that is handled in the code for KET. */
559    
560     if (op > OP_BRA)
561     {
562     number = op - OP_BRA;
563    
564     /* For extended extraction brackets (large number), we have to fish out the
565     number from a dummy opcode at the start. */
566    
567     if (number > EXTRACT_BASIC_MAX)
568     number = GET2(ecode, 2+LINK_SIZE);
569     offset = number << 1;
570    
571     #ifdef DEBUG
572     printf("start bracket %d subject=", number);
573     pchars(eptr, 16, TRUE, md);
574     printf("\n");
575     #endif
576    
577     if (offset < md->offset_max)
578     {
579     save_offset1 = md->offset_vector[offset];
580     save_offset2 = md->offset_vector[offset+1];
581     save_offset3 = md->offset_vector[md->offset_end - number];
582     save_capture_last = md->capture_last;
583    
584     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
585     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
586    
587     do
588     {
589     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
590     match_isgroup);
591     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
592     md->capture_last = save_capture_last;
593     ecode += GET(ecode, 1);
594     }
595     while (*ecode == OP_ALT);
596    
597     DPRINTF(("bracket %d failed\n", number));
598    
599     md->offset_vector[offset] = save_offset1;
600     md->offset_vector[offset+1] = save_offset2;
601     md->offset_vector[md->offset_end - number] = save_offset3;
602    
603     RRETURN(MATCH_NOMATCH);
604     }
605    
606     /* Insufficient room for saving captured contents */
607    
608     else op = OP_BRA;
609     }
610    
611     /* Other types of node can be handled by a switch */
612    
613     switch(op)
614     {
615     case OP_BRA: /* Non-capturing bracket: optimized */
616     DPRINTF(("start bracket 0\n"));
617     do
618     {
619     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
620     match_isgroup);
621     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
622     ecode += GET(ecode, 1);
623     }
624     while (*ecode == OP_ALT);
625     DPRINTF(("bracket 0 failed\n"));
626     RRETURN(MATCH_NOMATCH);
627    
628     /* Conditional group: compilation checked that there are no more than
629     two branches. If the condition is false, skipping the first branch takes us
630     past the end if there is only one branch, but that's OK because that is
631     exactly what going to the ket would do. */
632    
633     case OP_COND:
634     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
635     {
636     offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
637     condition = (offset == CREF_RECURSE * 2)?
638     (md->recursive != NULL) :
639     (offset < offset_top && md->offset_vector[offset] >= 0);
640     RMATCH(rrc, eptr, ecode + (condition?
641     (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
642     offset_top, md, ims, eptrb, match_isgroup);
643     RRETURN(rrc);
644     }
645    
646     /* The condition is an assertion. Call match() to evaluate it - setting
647     the final argument TRUE causes it to stop at the end of an assertion. */
648    
649     else
650     {
651     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
652     match_condassert | match_isgroup);
653     if (rrc == MATCH_MATCH)
654     {
655     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
656     while (*ecode == OP_ALT) ecode += GET(ecode, 1);
657     }
658     else if (rrc != MATCH_NOMATCH)
659     {
660     RRETURN(rrc); /* Need braces because of following else */
661     }
662     else ecode += GET(ecode, 1);
663     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
664     match_isgroup);
665     RRETURN(rrc);
666     }
667     /* Control never reaches here */
668    
669     /* Skip over conditional reference or large extraction number data if
670     encountered. */
671    
672     case OP_CREF:
673     case OP_BRANUMBER:
674     ecode += 3;
675     break;
676    
677     /* End of the pattern. If we are in a recursion, we should restore the
678     offsets appropriately and continue from after the call. */
679    
680     case OP_END:
681     if (md->recursive != NULL && md->recursive->group_num == 0)
682     {
683     recursion_info *rec = md->recursive;
684     DPRINTF(("Hit the end in a (?0) recursion\n"));
685     md->recursive = rec->prevrec;
686     memmove(md->offset_vector, rec->offset_save,
687     rec->saved_max * sizeof(int));
688     md->start_match = rec->save_start;
689     ims = original_ims;
690     ecode = rec->after_call;
691     break;
692     }
693    
694     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
695     string - backtracking will then try other alternatives, if any. */
696    
697     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
698     md->end_match_ptr = eptr; /* Record where we ended */
699     md->end_offset_top = offset_top; /* and how many extracts were taken */
700     RRETURN(MATCH_MATCH);
701    
702     /* Change option settings */
703    
704     case OP_OPT:
705     ims = ecode[1];
706     ecode += 2;
707     DPRINTF(("ims set to %02lx\n", ims));
708     break;
709    
710     /* Assertion brackets. Check the alternative branches in turn - the
711     matching won't pass the KET for an assertion. If any one branch matches,
712     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
713     start of each branch to move the current point backwards, so the code at
714     this level is identical to the lookahead case. */
715    
716     case OP_ASSERT:
717     case OP_ASSERTBACK:
718     do
719     {
720     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
721     match_isgroup);
722     if (rrc == MATCH_MATCH) break;
723     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
724     ecode += GET(ecode, 1);
725     }
726     while (*ecode == OP_ALT);
727     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
728    
729     /* If checking an assertion for a condition, return MATCH_MATCH. */
730    
731     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
732    
733     /* Continue from after the assertion, updating the offsets high water
734     mark, since extracts may have been taken during the assertion. */
735    
736     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
737     ecode += 1 + LINK_SIZE;
738     offset_top = md->end_offset_top;
739     continue;
740    
741     /* Negative assertion: all branches must fail to match */
742    
743     case OP_ASSERT_NOT:
744     case OP_ASSERTBACK_NOT:
745     do
746     {
747     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
748     match_isgroup);
749     if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
750     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
751     ecode += GET(ecode,1);
752     }
753     while (*ecode == OP_ALT);
754    
755     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
756    
757     ecode += 1 + LINK_SIZE;
758     continue;
759    
760     /* Move the subject pointer back. This occurs only at the start of
761     each branch of a lookbehind assertion. If we are too close to the start to
762     move back, this match function fails. When working with UTF-8 we move
763     back a number of characters, not bytes. */
764    
765     case OP_REVERSE:
766     #ifdef SUPPORT_UTF8
767     if (utf8)
768     {
769     c = GET(ecode,1);
770     for (i = 0; i < c; i++)
771     {
772     eptr--;
773     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
774     BACKCHAR(eptr)
775     }
776     }
777     else
778     #endif
779    
780     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
781    
782     {
783     eptr -= GET(ecode,1);
784     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
785     }
786    
787     /* Skip to next op code */
788    
789     ecode += 1 + LINK_SIZE;
790     break;
791    
792     /* The callout item calls an external function, if one is provided, passing
793     details of the match so far. This is mainly for debugging, though the
794     function is able to force a failure. */
795    
796     case OP_CALLOUT:
797     if (pcre_callout != NULL)
798     {
799     pcre_callout_block cb;
800     cb.version = 1; /* Version 1 of the callout block */
801     cb.callout_number = ecode[1];
802     cb.offset_vector = md->offset_vector;
803     cb.subject = (const char *)md->start_subject;
804     cb.subject_length = md->end_subject - md->start_subject;
805     cb.start_match = md->start_match - md->start_subject;
806     cb.current_position = eptr - md->start_subject;
807     cb.pattern_position = GET(ecode, 2);
808     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
809     cb.capture_top = offset_top/2;
810     cb.capture_last = md->capture_last;
811     cb.callout_data = md->callout_data;
812     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
813     if (rrc < 0) RRETURN(rrc);
814     }
815     ecode += 2 + 2*LINK_SIZE;
816     break;
817    
818     /* Recursion either matches the current regex, or some subexpression. The
819     offset data is the offset to the starting bracket from the start of the
820     whole pattern. (This is so that it works from duplicated subpatterns.)
821    
822     If there are any capturing brackets started but not finished, we have to
823     save their starting points and reinstate them after the recursion. However,
824     we don't know how many such there are (offset_top records the completed
825     total) so we just have to save all the potential data. There may be up to
826     65535 such values, which is too large to put on the stack, but using malloc
827     for small numbers seems expensive. As a compromise, the stack is used when
828     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
829     is used. A problem is what to do if the malloc fails ... there is no way of
830     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
831     values on the stack, and accept that the rest may be wrong.
832    
833     There are also other values that have to be saved. We use a chained
834     sequence of blocks that actually live on the stack. Thanks to Robin Houston
835     for the original version of this logic. */
836    
837     case OP_RECURSE:
838     {
839     callpat = md->start_code + GET(ecode, 1);
840     new_recursive.group_num = *callpat - OP_BRA;
841    
842     /* For extended extraction brackets (large number), we have to fish out
843     the number from a dummy opcode at the start. */
844    
845     if (new_recursive.group_num > EXTRACT_BASIC_MAX)
846     new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
847    
848     /* Add to "recursing stack" */
849    
850     new_recursive.prevrec = md->recursive;
851     md->recursive = &new_recursive;
852    
853     /* Find where to continue from afterwards */
854    
855     ecode += 1 + LINK_SIZE;
856     new_recursive.after_call = ecode;
857    
858     /* Now save the offset data. */
859    
860     new_recursive.saved_max = md->offset_end;
861     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
862     new_recursive.offset_save = stacksave;
863     else
864     {
865     new_recursive.offset_save =
866     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
867     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
868     }
869    
870     memcpy(new_recursive.offset_save, md->offset_vector,
871     new_recursive.saved_max * sizeof(int));
872     new_recursive.save_start = md->start_match;
873     md->start_match = eptr;
874    
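875     /* OK, now we can do the recursion. For each top-level alternative we
876     restore the offset and recursion data. NOTE(review): the error return in
877     the loop does not appear to free a malloc'ed offset_save - confirm. */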
878     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
879     do
880     {
881     RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
882     eptrb, match_isgroup);
883     if (rrc == MATCH_MATCH)
884     {
885     md->recursive = new_recursive.prevrec;
886     if (new_recursive.offset_save != stacksave)
887     (pcre_free)(new_recursive.offset_save);
888     RRETURN(MATCH_MATCH);
889     }
890     else if (rrc != MATCH_NOMATCH) RRETURN(rrc);
891    
892     md->recursive = &new_recursive;
893     memcpy(md->offset_vector, new_recursive.offset_save,
894     new_recursive.saved_max * sizeof(int));
895     callpat += GET(callpat, 1);
896     }
897     while (*callpat == OP_ALT);
898    
899     DPRINTF(("Recursion didn't match\n"));
900     md->recursive = new_recursive.prevrec;
901     if (new_recursive.offset_save != stacksave)
902     (pcre_free)(new_recursive.offset_save);
903     RRETURN(MATCH_NOMATCH);
904     }
905     /* Control never reaches here */
906    
907     /* "Once" brackets are like assertion brackets except that after a match,
908     the point in the subject string is not moved back. Thus there can never be
909     a move back into the brackets. Friedl calls these "atomic" subpatterns.
910     Check the alternative branches in turn - the matching won't pass the KET
911     for this kind of subpattern. If any one branch matches, we carry on as at
912     the end of a normal bracket, leaving the subject pointer. */
913    
914     case OP_ONCE:
915     {
916     prev = ecode;
917     saved_eptr = eptr;
918    
919     do
920     {
921     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
922     eptrb, match_isgroup);
923     if (rrc == MATCH_MATCH) break;
924     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
925     ecode += GET(ecode,1);
926     }
927     while (*ecode == OP_ALT);
928    
929     /* If we hit the end of the group (which could be repeated), fail */
930    
931     if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
932    
933     /* Continue as from after the assertion, updating the offsets high water
934     mark, since extracts may have been taken. */
935    
936     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
937    
938     offset_top = md->end_offset_top;
939     eptr = md->end_match_ptr;
940    
941     /* For a non-repeating ket, just continue at this level. This also
942     happens for a repeating ket if no characters were matched in the group.
943     This is the forcible breaking of infinite loops as implemented in Perl
944     5.005. If there is an options reset, it will get obeyed in the normal
945     course of events. */
946    
947     if (*ecode == OP_KET || eptr == saved_eptr)
948     {
949     ecode += 1+LINK_SIZE;
950     break;
951     }
952    
953     /* The repeating kets try the rest of the pattern or restart from the
954     preceding bracket, in the appropriate order. Options changed within the
955     bracket must be reset before re-running it, so check the next opcode.
956     NOTE(review): ecode[4] is 2+LINK_SIZE only if LINK_SIZE is 2 - confirm. */
957    
958     if (ecode[1+LINK_SIZE] == OP_OPT)
959     {
960     ims = (ims & ~PCRE_IMS) | ecode[4];
961     DPRINTF(("ims set to %02lx at group repeat\n", ims));
962     }
963    
964     if (*ecode == OP_KETRMIN)
965     {
966     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
967     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
968     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
969     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
970     }
971     else /* OP_KETRMAX */
972     {
973     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
974     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
975     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
976     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
977     }
978     }
979     RRETURN(MATCH_NOMATCH);
980    
981     /* An alternation is the end of a branch; scan along to find the end of the
982     bracketed group and go to there. */
983    
984     case OP_ALT:
985     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
986     break;
987    
988     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
989     that it may occur zero times. It may repeat infinitely, or not at all -
990     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
991     repeat limits are compiled as a number of copies, with the optional ones
992     preceded by BRAZERO or BRAMINZERO. */
993    
994     case OP_BRAZERO:
995     {
996     next = ecode+1;
997     RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
998     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
999     do next += GET(next,1); while (*next == OP_ALT);
1000     ecode = next + 1+LINK_SIZE;
1001     }
1002     break;
1003    
1004     case OP_BRAMINZERO:
1005     {
1006     next = ecode+1;
1007     do next += GET(next,1); while (*next == OP_ALT);
1008     RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1009     match_isgroup);
1010     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1011     ecode++;
1012     }
1013     break;
1014    
1015     /* End of a group, repeated or non-repeating. If we are at the end of
1016     an assertion "group", stop matching and return MATCH_MATCH, but record the
1017     current high water mark for use by positive assertions. Do this also
1018     for the "once" (not-backing-up) groups. */
1019    
1020     case OP_KET:
1021     case OP_KETRMIN:
1022     case OP_KETRMAX:
1023     {
1024     prev = ecode - GET(ecode, 1);
1025     saved_eptr = eptrb->epb_saved_eptr;
1026    
1027     /* Back up the stack of bracket start pointers. */
1028    
1029     eptrb = eptrb->epb_prev;
1030    
1031     if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1032     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1033     *prev == OP_ONCE)
1034     {
1035     md->end_match_ptr = eptr; /* For ONCE */
1036     md->end_offset_top = offset_top;
1037     RRETURN(MATCH_MATCH);
1038     }
1039    
1040     /* In all other cases except a conditional group we have to check the
1041     group number back at the start and if necessary complete handling an
1042     extraction by setting the offsets and bumping the high water mark. */
1043    
1044     if (*prev != OP_COND)
1045     {
1046     number = *prev - OP_BRA;
1047    
1048     /* For extended extraction brackets (large number), we have to fish out
1049     the number from a dummy opcode at the start. */
1050    
1051     if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1052     offset = number << 1;
1053    
1054     #ifdef DEBUG
1055     printf("end bracket %d", number);
1056     printf("\n");
1057     #endif
1058    
1059     /* Test for a numbered group. This includes groups called as a result
1060     of recursion. Note that whole-pattern recursion is coded as a recurse
1061     into group 0, so it won't be picked up here. Instead, we catch it when
1062     the OP_END is reached. */
1063    
1064     if (number > 0)
1065     {
1066     md->capture_last = number;
1067     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1068     {
1069     md->offset_vector[offset] =
1070     md->offset_vector[md->offset_end - number];
1071     md->offset_vector[offset+1] = eptr - md->start_subject;
1072     if (offset_top <= offset) offset_top = offset + 2;
1073     }
1074    
1075     /* Handle a recursively called group. Restore the offsets
1076     appropriately and continue from after the call. */
1077    
1078     if (md->recursive != NULL && md->recursive->group_num == number)
1079     {
1080     recursion_info *rec = md->recursive;
1081     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1082     md->recursive = rec->prevrec;
1083     md->start_match = rec->save_start;
1084     memcpy(md->offset_vector, rec->offset_save,
1085     rec->saved_max * sizeof(int));
1086     ecode = rec->after_call;
1087     ims = original_ims;
1088     break;
1089     }
1090     }
1091     }
1092    
1093     /* Reset the value of the ims flags, in case they got changed during
1094     the group. */
1095    
1096     ims = original_ims;
1097     DPRINTF(("ims reset to %02lx\n", ims));
1098    
1099     /* For a non-repeating ket, just continue at this level. This also
1100     happens for a repeating ket if no characters were matched in the group.
1101     This is the forcible breaking of infinite loops as implemented in Perl
1102     5.005. If there is an options reset, it will get obeyed in the normal
1103     course of events. */
1104    
1105     if (*ecode == OP_KET || eptr == saved_eptr)
1106     {
1107     ecode += 1 + LINK_SIZE;
1108     break;
1109     }
1110    
1111     /* The repeating kets try the rest of the pattern or restart from the
1112     preceding bracket, in the appropriate order. */
1113    
1114     if (*ecode == OP_KETRMIN)
1115     {
1116     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1117     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1118     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1119     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1120     }
1121     else /* OP_KETRMAX */
1122     {
1123     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1124     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1125     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1126     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1127     }
1128     }
1129    
1130     RRETURN(MATCH_NOMATCH);
1131    
1132     /* Start of subject unless notbol, or after internal newline if multiline */
1133    
1134     case OP_CIRC:
1135     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1136     if ((ims & PCRE_MULTILINE) != 0)
1137     {
1138     if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1139     RRETURN(MATCH_NOMATCH);
1140     ecode++;
1141     break;
1142     }
1143     /* ... else fall through */
1144    
1145     /* Start of subject assertion */
1146    
1147     case OP_SOD:
1148     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1149     ecode++;
1150     break;
1151    
1152     /* Start of match assertion */
1153    
1154     case OP_SOM:
1155     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1156     ecode++;
1157     break;
1158    
1159     /* Assert before internal newline if multiline, or before a terminating
1160     newline unless endonly is set, else end of subject unless noteol is set. */
1161    
1162     case OP_DOLL:
1163     if ((ims & PCRE_MULTILINE) != 0)
1164     {
1165     if (eptr < md->end_subject)
1166     { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1167     else
1168     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1169     ecode++;
1170     break;
1171     }
1172     else
1173     {
1174     if (md->noteol) RRETURN(MATCH_NOMATCH);
1175     if (!md->endonly)
1176     {
1177     if (eptr < md->end_subject - 1 ||
1178     (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1179     RRETURN(MATCH_NOMATCH);
1180     ecode++;
1181     break;
1182     }
1183     }
1184     /* ... else fall through */
1185    
1186     /* End of subject assertion (\z) */
1187    
1188     case OP_EOD:
1189     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1190     ecode++;
1191     break;
1192    
1193     /* End of subject or ending \n assertion (\Z) */
1194    
1195     case OP_EODN:
1196     if (eptr < md->end_subject - 1 ||
1197     (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1198     ecode++;
1199     break;
1200    
1201     /* Word boundary assertions */
1202    
1203     case OP_NOT_WORD_BOUNDARY:
1204     case OP_WORD_BOUNDARY:
1205     {
1206    
1207     /* Find out if the previous and current characters are "word" characters.
1208     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1209     be "non-word" characters. */
1210    
1211     #ifdef SUPPORT_UTF8
1212     if (utf8)
1213     {
1214     if (eptr == md->start_subject) prev_is_word = FALSE; else
1215     {
1216     const uschar *lastptr = eptr - 1;
1217     while((*lastptr & 0xc0) == 0x80) lastptr--;
1218     GETCHAR(c, lastptr);
1219     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1220     }
1221     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1222     {
1223     GETCHAR(c, eptr);
1224     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1225     }
1226     }
1227     else
1228     #endif
1229    
1230     /* More streamlined when not in UTF-8 mode */
1231    
1232     {
1233     prev_is_word = (eptr != md->start_subject) &&
1234     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1235     cur_is_word = (eptr < md->end_subject) &&
1236     ((md->ctypes[*eptr] & ctype_word) != 0);
1237     }
1238    
1239     /* Now see if the situation is what we want */
1240    
1241     if ((*ecode++ == OP_WORD_BOUNDARY)?
1242     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1243     RRETURN(MATCH_NOMATCH);
1244     }
1245     break;
1246    
1247     /* Match a single character type; inline for speed */
1248    
1249     case OP_ANY:
1250     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1251     RRETURN(MATCH_NOMATCH);
1252     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1253     #ifdef SUPPORT_UTF8
1254     if (utf8)
1255     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1256     #endif
1257     ecode++;
1258     break;
1259    
1260     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1261     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1262    
1263     case OP_ANYBYTE:
1264     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1265     ecode++;
1266     break;
1267    
1268     case OP_NOT_DIGIT:
1269     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1270     GETCHARINCTEST(c, eptr);
1271     if (
1272     #ifdef SUPPORT_UTF8
1273     c < 256 &&
1274     #endif
1275     (md->ctypes[c] & ctype_digit) != 0
1276     )
1277     RRETURN(MATCH_NOMATCH);
1278     ecode++;
1279     break;
1280    
1281     case OP_DIGIT:
1282     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1283     GETCHARINCTEST(c, eptr);
1284     if (
1285     #ifdef SUPPORT_UTF8
1286     c >= 256 ||
1287     #endif
1288     (md->ctypes[c] & ctype_digit) == 0
1289     )
1290     RRETURN(MATCH_NOMATCH);
1291     ecode++;
1292     break;
1293    
1294     case OP_NOT_WHITESPACE:
1295     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1296     GETCHARINCTEST(c, eptr);
1297     if (
1298     #ifdef SUPPORT_UTF8
1299     c < 256 &&
1300     #endif
1301     (md->ctypes[c] & ctype_space) != 0
1302     )
1303     RRETURN(MATCH_NOMATCH);
1304     ecode++;
1305     break;
1306    
1307     case OP_WHITESPACE:
1308     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1309     GETCHARINCTEST(c, eptr);
1310     if (
1311     #ifdef SUPPORT_UTF8
1312     c >= 256 ||
1313     #endif
1314     (md->ctypes[c] & ctype_space) == 0
1315     )
1316     RRETURN(MATCH_NOMATCH);
1317     ecode++;
1318     break;
1319    
1320     case OP_NOT_WORDCHAR:
1321     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1322     GETCHARINCTEST(c, eptr);
1323     if (
1324     #ifdef SUPPORT_UTF8
1325     c < 256 &&
1326     #endif
1327     (md->ctypes[c] & ctype_word) != 0
1328     )
1329     RRETURN(MATCH_NOMATCH);
1330     ecode++;
1331     break;
1332    
1333     case OP_WORDCHAR:
1334     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1335     GETCHARINCTEST(c, eptr);
1336     if (
1337     #ifdef SUPPORT_UTF8
1338     c >= 256 ||
1339     #endif
1340     (md->ctypes[c] & ctype_word) == 0
1341     )
1342     RRETURN(MATCH_NOMATCH);
1343     ecode++;
1344     break;
1345    
1346     #ifdef SUPPORT_UCP
1347     /* Check the next character by Unicode property. We will get here only
1348     if the support is in the binary; otherwise a compile-time error occurs. */
1349    
1350     case OP_PROP:
1351     case OP_NOTPROP:
1352     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1353     GETCHARINCTEST(c, eptr);
1354     {
1355     int chartype, rqdtype;
1356     int othercase;
1357     int category = ucp_findchar(c, &chartype, &othercase);
1358    
1359     rqdtype = *(++ecode);
1360     ecode++;
1361    
1362     if (rqdtype >= 128)
1363     {
1364     if ((rqdtype - 128 != category) == (op == OP_PROP))
1365     RRETURN(MATCH_NOMATCH);
1366     }
1367     else
1368     {
1369     if ((rqdtype != chartype) == (op == OP_PROP))
1370     RRETURN(MATCH_NOMATCH);
1371     }
1372     }
1373     break;
1374    
1375     /* Match an extended Unicode sequence. We will get here only if the support
1376     is in the binary; otherwise a compile-time error occurs. */
1377    
1378     case OP_EXTUNI:
1379     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1380     GETCHARINCTEST(c, eptr);
1381     {
1382     int chartype;
1383     int othercase;
1384     int category = ucp_findchar(c, &chartype, &othercase);
1385     if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1386     while (eptr < md->end_subject)
1387     {
1388     int len = 1;
1389     if (!utf8) c = *eptr; else
1390     {
1391     GETCHARLEN(c, eptr, len);
1392     }
1393     category = ucp_findchar(c, &chartype, &othercase);
1394     if (category != ucp_M) break;
1395     eptr += len;
1396     }
1397     }
1398     ecode++;
1399     break;
1400     #endif
1401    
1402    
1403     /* Match a back reference, possibly repeatedly. Look past the end of the
1404     item to see if there is repeat information following. The code is similar
1405     to that for character classes, but repeated for efficiency. Then obey
1406     similar code to character type repeats - written out again for speed.
1407     However, if the referenced string is the empty string, always treat
1408     it as matched, any number of times (otherwise there could be infinite
1409     loops). */
1410    
1411     case OP_REF:
1412     {
1413     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1414     ecode += 3; /* Advance past item */
1415    
1416     /* If the reference is unset, set the length to be longer than the amount
1417     of subject left; this ensures that every attempt at a match fails. We
1418     can't just fail here, because of the possibility of quantifiers with zero
1419     minima. */
1420    
1421     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1422     md->end_subject - eptr + 1 :
1423     md->offset_vector[offset+1] - md->offset_vector[offset];
1424    
1425     /* Set up for repetition, or handle the non-repeated case */
1426    
1427     switch (*ecode)
1428     {
1429     case OP_CRSTAR:
1430     case OP_CRMINSTAR:
1431     case OP_CRPLUS:
1432     case OP_CRMINPLUS:
1433     case OP_CRQUERY:
1434     case OP_CRMINQUERY:
1435     c = *ecode++ - OP_CRSTAR;
1436     minimize = (c & 1) != 0;
1437     min = rep_min[c]; /* Pick up values from tables; */
1438     max = rep_max[c]; /* zero for max => infinity */
1439     if (max == 0) max = INT_MAX;
1440     break;
1441    
1442     case OP_CRRANGE:
1443     case OP_CRMINRANGE:
1444     minimize = (*ecode == OP_CRMINRANGE);
1445     min = GET2(ecode, 1);
1446     max = GET2(ecode, 3);
1447     if (max == 0) max = INT_MAX;
1448     ecode += 5;
1449     break;
1450    
1451     default: /* No repeat follows */
1452     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1453     eptr += length;
1454     continue; /* With the main loop */
1455     }
1456    
1457     /* If the length of the reference is zero, just continue with the
1458     main loop. */
1459    
1460     if (length == 0) continue;
1461    
1462     /* First, ensure the minimum number of matches are present. We get back
1463     the length of the reference string explicitly rather than passing the
1464     address of eptr, so that eptr can be a register variable. */
1465    
1466     for (i = 1; i <= min; i++)
1467     {
1468     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1469     eptr += length;
1470     }
1471    
1472     /* If min = max, continue at the same level without recursion.
1473     They are not both allowed to be zero. */
1474    
1475     if (min == max) continue;
1476    
1477     /* If minimizing, keep trying and advancing the pointer */
1478    
1479     if (minimize)
1480     {
1481     for (fi = min;; fi++)
1482     {
1483     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1484     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1485     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1486     RRETURN(MATCH_NOMATCH);
1487     eptr += length;
1488     }
1489     /* Control never gets here */
1490     }
1491    
1492     /* If maximizing, find the longest string and work backwards */
1493    
1494     else
1495     {
1496     pp = eptr;
1497     for (i = min; i < max; i++)
1498     {
1499     if (!match_ref(offset, eptr, length, md, ims)) break;
1500     eptr += length;
1501     }
1502     while (eptr >= pp)
1503     {
1504     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1505     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1506     eptr -= length;
1507     }
1508     RRETURN(MATCH_NOMATCH);
1509     }
1510     }
1511     /* Control never gets here */
1512    
1513    
1514    
1515     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1516     used when all the characters in the class have values in the range 0-255,
1517     and either the matching is caseful, or the characters are in the range
1518     0-127 when UTF-8 processing is enabled. The only difference between
1519     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1520     encountered.
1521    
1522     First, look past the end of the item to see if there is repeat information
1523     following. Then obey similar code to character type repeats - written out
1524     again for speed. */
1525    
1526     case OP_NCLASS:
1527     case OP_CLASS:
1528     {
1529     data = ecode + 1; /* Save for matching */
1530     ecode += 33; /* Advance past the item */
1531    
1532     switch (*ecode)
1533     {
1534     case OP_CRSTAR:
1535     case OP_CRMINSTAR:
1536     case OP_CRPLUS:
1537     case OP_CRMINPLUS:
1538     case OP_CRQUERY:
1539     case OP_CRMINQUERY:
1540     c = *ecode++ - OP_CRSTAR;
1541     minimize = (c & 1) != 0;
1542     min = rep_min[c]; /* Pick up values from tables; */
1543     max = rep_max[c]; /* zero for max => infinity */
1544     if (max == 0) max = INT_MAX;
1545     break;
1546    
1547     case OP_CRRANGE:
1548     case OP_CRMINRANGE:
1549     minimize = (*ecode == OP_CRMINRANGE);
1550     min = GET2(ecode, 1);
1551     max = GET2(ecode, 3);
1552     if (max == 0) max = INT_MAX;
1553     ecode += 5;
1554     break;
1555    
1556     default: /* No repeat follows */
1557     min = max = 1;
1558     break;
1559     }
1560    
1561     /* First, ensure the minimum number of matches are present. */
1562    
1563     #ifdef SUPPORT_UTF8
1564     /* UTF-8 mode */
1565     if (utf8)
1566     {
1567     for (i = 1; i <= min; i++)
1568     {
1569     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1570     GETCHARINC(c, eptr);
1571     if (c > 255)
1572     {
1573     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1574     }
1575     else
1576     {
1577     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1578     }
1579     }
1580     }
1581     else
1582     #endif
1583     /* Not UTF-8 mode */
1584     {
1585     for (i = 1; i <= min; i++)
1586     {
1587     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1588     c = *eptr++;
1589     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1590     }
1591     }
1592    
1593     /* If max == min we can continue with the main loop without the
1594     need to recurse. */
1595    
1596     if (min == max) continue;
1597    
1598     /* If minimizing, keep testing the rest of the expression and advancing
1599     the pointer while it matches the class. */
1600    
1601     if (minimize)
1602     {
1603     #ifdef SUPPORT_UTF8
1604     /* UTF-8 mode */
1605     if (utf8)
1606     {
1607     for (fi = min;; fi++)
1608     {
1609     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1610     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1611     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1612     GETCHARINC(c, eptr);
1613     if (c > 255)
1614     {
1615     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1616     }
1617     else
1618     {
1619     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1620     }
1621     }
1622     }
1623     else
1624     #endif
1625     /* Not UTF-8 mode */
1626     {
1627     for (fi = min;; fi++)
1628     {
1629     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1630     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1631     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1632     c = *eptr++;
1633     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1634     }
1635     }
1636     /* Control never gets here */
1637     }
1638    
1639     /* If maximizing, find the longest possible run, then work backwards. */
1640    
1641     else
1642     {
1643     pp = eptr;
1644    
1645     #ifdef SUPPORT_UTF8
1646     /* UTF-8 mode */
1647     if (utf8)
1648     {
1649     for (i = min; i < max; i++)
1650     {
1651     int len = 1;
1652     if (eptr >= md->end_subject) break;
1653     GETCHARLEN(c, eptr, len);
1654     if (c > 255)
1655     {
1656     if (op == OP_CLASS) break;
1657     }
1658     else
1659     {
1660     if ((data[c/8] & (1 << (c&7))) == 0) break;
1661     }
1662     eptr += len;
1663     }
1664     for (;;)
1665     {
1666     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1667     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1668     if (eptr-- == pp) break; /* Stop if tried at original pos */
1669     BACKCHAR(eptr);
1670     }
1671     }
1672     else
1673     #endif
1674     /* Not UTF-8 mode */
1675     {
1676     for (i = min; i < max; i++)
1677     {
1678     if (eptr >= md->end_subject) break;
1679     c = *eptr;
1680     if ((data[c/8] & (1 << (c&7))) == 0) break;
1681     eptr++;
1682     }
1683     while (eptr >= pp)
1684     {
1685     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1686     eptr--;
1687     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1688     }
1689     }
1690    
1691     RRETURN(MATCH_NOMATCH);
1692     }
1693     }
1694     /* Control never gets here */
1695    
1696    
1697     /* Match an extended character class. This opcode is encountered only
1698     in UTF-8 mode, because that's the only time it is compiled. */
1699    
1700     #ifdef SUPPORT_UTF8
1701     case OP_XCLASS:
1702     {
1703     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1704     ecode += GET(ecode, 1); /* Advance past the item */
1705    
1706     switch (*ecode)
1707     {
1708     case OP_CRSTAR:
1709     case OP_CRMINSTAR:
1710     case OP_CRPLUS:
1711     case OP_CRMINPLUS:
1712     case OP_CRQUERY:
1713     case OP_CRMINQUERY:
1714     c = *ecode++ - OP_CRSTAR;
1715     minimize = (c & 1) != 0;
1716     min = rep_min[c]; /* Pick up values from tables; */
1717     max = rep_max[c]; /* zero for max => infinity */
1718     if (max == 0) max = INT_MAX;
1719     break;
1720    
1721     case OP_CRRANGE:
1722     case OP_CRMINRANGE:
1723     minimize = (*ecode == OP_CRMINRANGE);
1724     min = GET2(ecode, 1);
1725     max = GET2(ecode, 3);
1726     if (max == 0) max = INT_MAX;
1727     ecode += 5;
1728     break;
1729    
1730     default: /* No repeat follows */
1731     min = max = 1;
1732     break;
1733     }
1734    
1735     /* First, ensure the minimum number of matches are present. */
1736    
1737     for (i = 1; i <= min; i++)
1738     {
1739     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1740     GETCHARINC(c, eptr);
1741     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1742     }
1743    
1744     /* If max == min we can continue with the main loop without the
1745     need to recurse. */
1746    
1747     if (min == max) continue;
1748    
1749     /* If minimizing, keep testing the rest of the expression and advancing
1750     the pointer while it matches the class. */
1751    
1752     if (minimize)
1753     {
1754     for (fi = min;; fi++)
1755     {
1756     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1757     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1758     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1759     GETCHARINC(c, eptr);
1760     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1761     }
1762     /* Control never gets here */
1763     }
1764    
1765     /* If maximizing, find the longest possible run, then work backwards. */
1766    
1767     else
1768     {
1769     pp = eptr;
1770     for (i = min; i < max; i++)
1771     {
1772     int len = 1;
1773     if (eptr >= md->end_subject) break;
1774     GETCHARLEN(c, eptr, len);
1775     if (!_pcre_xclass(c, data)) break;
1776     eptr += len;
1777     }
1778     for(;;)
1779     {
1780     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1781     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1782     if (eptr-- == pp) break; /* Stop if tried at original pos */
1783     BACKCHAR(eptr)
1784     }
1785     RRETURN(MATCH_NOMATCH);
1786     }
1787    
1788     /* Control never gets here */
1789     }
1790     #endif /* End of XCLASS */
1791    
1792     /* Match a single character, casefully */
1793    
1794     case OP_CHAR:
1795     #ifdef SUPPORT_UTF8
1796     if (utf8)
1797     {
1798     length = 1;
1799     ecode++;
1800     GETCHARLEN(fc, ecode, length);
1801     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1802     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1803     }
1804     else
1805     #endif
1806    
1807     /* Non-UTF-8 mode */
1808     {
1809     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1810     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1811     ecode += 2;
1812     }
1813     break;
1814    
1815     /* Match a single character, caselessly */
1816    
1817     case OP_CHARNC:
1818     #ifdef SUPPORT_UTF8
1819     if (utf8)
1820     {
1821     length = 1;
1822     ecode++;
1823     GETCHARLEN(fc, ecode, length);
1824    
1825     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1826    
1827     /* If the pattern character's value is < 128, we have only one byte, and
1828     can use the fast lookup table. */
1829    
1830     if (fc < 128)
1831     {
1832     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1833     }
1834    
1835     /* Otherwise we must pick up the subject character */
1836    
1837     else
1838     {
1839     int dc;
1840     GETCHARINC(dc, eptr);
1841     ecode += length;
1842    
1843     /* If we have Unicode property support, we can use it to test the other
1844     case of the character, if there is one. The result of ucp_findchar() is
1845     < 0 if the char isn't found, and othercase is returned as zero if there
1846     isn't one. */
1847    
1848     if (fc != dc)
1849     {
1850     #ifdef SUPPORT_UCP
1851     int chartype;
1852     int othercase;
1853     if (ucp_findchar(fc, &chartype, &othercase) < 0 || dc != othercase)
1854     #endif
1855     RRETURN(MATCH_NOMATCH);
1856     }
1857     }
1858     }
1859     else
1860     #endif /* SUPPORT_UTF8 */
1861    
1862     /* Non-UTF-8 mode */
1863     {
1864     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1865     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1866     ecode += 2;
1867     }
1868     break;
1869    
1870     /* Match a single character repeatedly; different opcodes share code. */
1871    
1872     case OP_EXACT:
1873     min = max = GET2(ecode, 1);
1874     ecode += 3;
1875     goto REPEATCHAR;
1876    
1877     case OP_UPTO:
1878     case OP_MINUPTO:
1879     min = 0;
1880     max = GET2(ecode, 1);
1881     minimize = *ecode == OP_MINUPTO;
1882     ecode += 3;
1883     goto REPEATCHAR;
1884    
1885     case OP_STAR:
1886     case OP_MINSTAR:
1887     case OP_PLUS:
1888     case OP_MINPLUS:
1889     case OP_QUERY:
1890     case OP_MINQUERY:
1891     c = *ecode++ - OP_STAR;
1892     minimize = (c & 1) != 0;
1893     min = rep_min[c]; /* Pick up values from tables; */
1894     max = rep_max[c]; /* zero for max => infinity */
1895     if (max == 0) max = INT_MAX;
1896    
1897     /* Common code for all repeated single-character matches. We can give
1898     up quickly if there are fewer than the minimum number of characters left in
1899     the subject. */
1900    
1901     REPEATCHAR:
1902     #ifdef SUPPORT_UTF8
1903     if (utf8)
1904     {
1905     length = 1;
1906     charptr = ecode;
1907     GETCHARLEN(fc, ecode, length);
1908     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1909     ecode += length;
1910    
1911     /* Handle multibyte character matching specially here. There is
1912     support for caseless matching if UCP support is present. */
1913    
1914     if (length > 1)
1915     {
1916     int oclength = 0;
1917     uschar occhars[8];
1918    
1919     #ifdef SUPPORT_UCP
1920     int othercase;
1921     int chartype;
1922     if ((ims & PCRE_CASELESS) != 0 &&
1923     ucp_findchar(fc, &chartype, &othercase) >= 0 &&
1924     othercase > 0)
1925     oclength = _pcre_ord2utf8(othercase, occhars);
1926     #endif /* SUPPORT_UCP */
1927    
1928     for (i = 1; i <= min; i++)
1929     {
1930     if (memcmp(eptr, charptr, length) == 0) eptr += length;
1931     /* Need braces because of following else */
1932     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1933     else
1934     {
1935     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1936     eptr += oclength;
1937     }
1938     }
1939    
1940     if (min == max) continue;
1941    
1942     if (minimize)
1943     {
1944     for (fi = min;; fi++)
1945     {
1946     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1947     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1948     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1949     if (memcmp(eptr, charptr, length) == 0) eptr += length;
1950     /* Need braces because of following else */
1951     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1952     else
1953     {
1954     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1955     eptr += oclength;
1956     }
1957     }
1958     /* Control never gets here */
1959     }
1960     else
1961     {
1962     pp = eptr;
1963     for (i = min; i < max; i++)
1964     {
1965     if (eptr > md->end_subject - length) break;
1966     if (memcmp(eptr, charptr, length) == 0) eptr += length;
1967     else if (oclength == 0) break;
1968     else
1969     {
1970     if (memcmp(eptr, occhars, oclength) != 0) break;
1971     eptr += oclength;
1972     }
1973     }
1974     while (eptr >= pp)
1975     {
1976     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1977     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1978     eptr -= length;
1979     }
1980     RRETURN(MATCH_NOMATCH);
1981     }
1982     /* Control never gets here */
1983     }
1984    
1985     /* If the length of a UTF-8 character is 1, we fall through here, and
1986     obey the code as for non-UTF-8 characters below, though in this case the
1987     value of fc will always be < 128. */
1988     }
1989     else
1990     #endif /* SUPPORT_UTF8 */
1991    
1992     /* When not in UTF-8 mode, load a single-byte character. */
1993     {
1994     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1995     fc = *ecode++;
1996     }
1997    
1998     /* The value of fc at this point is always less than 256, though we may or
1999     may not be in UTF-8 mode. The code is duplicated for the caseless and
2000     caseful cases, for speed, since matching characters is likely to be quite
2001     common. First, ensure the minimum number of matches are present. If min =
2002     max, continue at the same level without recursing. Otherwise, if
2003     minimizing, keep trying the rest of the expression and advancing one
2004     matching character if failing, up to the maximum. Alternatively, if
2005     maximizing, find the maximum number of characters and work backwards. */
2006    
2007     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2008     max, eptr));
2009    
2010     if ((ims & PCRE_CASELESS) != 0)
2011     {
2012     fc = md->lcc[fc];
2013     for (i = 1; i <= min; i++)
2014     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2015     if (min == max) continue;
2016     if (minimize)
2017     {
2018     for (fi = min;; fi++)
2019     {
2020     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2021     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2022     if (fi >= max || eptr >= md->end_subject ||
2023     fc != md->lcc[*eptr++])
2024     RRETURN(MATCH_NOMATCH);
2025     }
2026     /* Control never gets here */
2027     }
2028     else
2029     {
2030     pp = eptr;
2031     for (i = min; i < max; i++)
2032     {
2033     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2034     eptr++;
2035     }
2036     while (eptr >= pp)
2037     {
2038     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2039     eptr--;
2040     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2041     }
2042     RRETURN(MATCH_NOMATCH);
2043     }
2044     /* Control never gets here */
2045     }
2046    
2047     /* Caseful comparisons (single-byte characters only; multi-byte UTF-8 characters are handled completely by the length > 1 branch above) */
2048    
2049     else
2050     {
2051     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2052     if (min == max) continue;
2053     if (minimize)
2054     {
2055     for (fi = min;; fi++)
2056     {
2057     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2058     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2059     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2060     RRETURN(MATCH_NOMATCH);
2061     }
2062     /* Control never gets here */
2063     }
2064     else
2065     {
2066     pp = eptr;
2067     for (i = min; i < max; i++)
2068     {
2069     if (eptr >= md->end_subject || fc != *eptr) break;
2070     eptr++;
2071     }
2072     while (eptr >= pp)
2073     {
2074     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2075     eptr--;
2076     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2077     }
2078     RRETURN(MATCH_NOMATCH);
2079     }
2080     }
2081     /* Control never gets here */
2082    
2083     /* Match a negated single one-byte character. The pattern character is one
2084     byte, but the subject character being checked can be multibyte. */
2085    
2086     case OP_NOT:
2087     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2088     ecode++;
2089     GETCHARINCTEST(c, eptr);
2090     if ((ims & PCRE_CASELESS) != 0)
2091     {
2092     #ifdef SUPPORT_UTF8
2093     if (c < 256)
2094     #endif
2095     c = md->lcc[c];
2096     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2097     }
2098     else
2099     {
2100     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2101     }
2102     break;
2103    
2104     /* Match a negated single one-byte character repeatedly. This is almost a
2105     repeat of the code for a repeated single character, but I haven't found a
2106     nice way of commoning these up that doesn't require a test of the
2107     positive/negative option for each character match. Maybe that wouldn't add
2108     very much to the time taken, but character matching *is* what this is all
2109     about... */
2110    
2111     case OP_NOTEXACT:
2112     min = max = GET2(ecode, 1);
2113     ecode += 3;
2114     goto REPEATNOTCHAR;
2115    
2116     case OP_NOTUPTO:
2117     case OP_NOTMINUPTO:
2118     min = 0;
2119     max = GET2(ecode, 1);
2120     minimize = *ecode == OP_NOTMINUPTO;
2121     ecode += 3;
2122     goto REPEATNOTCHAR;
2123    
2124     case OP_NOTSTAR:
2125     case OP_NOTMINSTAR:
2126     case OP_NOTPLUS:
2127     case OP_NOTMINPLUS:
2128     case OP_NOTQUERY:
2129     case OP_NOTMINQUERY:
2130     c = *ecode++ - OP_NOTSTAR;
2131     minimize = (c & 1) != 0;
2132     min = rep_min[c]; /* Pick up values from tables; */
2133     max = rep_max[c]; /* zero for max => infinity */
2134     if (max == 0) max = INT_MAX;
2135    
2136     /* Common code for all repeated single-byte matches. We can give up quickly
2137     if there are fewer than the minimum number of bytes left in the
2138     subject. */
2139    
2140     REPEATNOTCHAR:
2141     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2142     fc = *ecode++;
2143    
2144     /* The code is duplicated for the caseless and caseful cases, for speed,
2145     since matching characters is likely to be quite common. First, ensure the
2146     minimum number of matches are present. If min = max, continue at the same
2147     level without recursing. Otherwise, if minimizing, keep trying the rest of
2148     the expression and advancing one matching character if failing, up to the
2149     maximum. Alternatively, if maximizing, find the maximum number of
2150     characters and work backwards. */
2151    
2152     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2153     max, eptr));
2154    
2155     if ((ims & PCRE_CASELESS) != 0)
2156     {
2157     fc = md->lcc[fc];
2158    
2159     #ifdef SUPPORT_UTF8
2160     /* UTF-8 mode */
2161     if (utf8)
2162     {
2163     register int d;
2164     for (i = 1; i <= min; i++)
2165     {
2166     GETCHARINC(d, eptr);
2167     if (d < 256) d = md->lcc[d];
2168     if (fc == d) RRETURN(MATCH_NOMATCH);
2169     }
2170     }
2171     else
2172     #endif
2173    
2174     /* Not UTF-8 mode */
2175     {
2176     for (i = 1; i <= min; i++)
2177     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2178     }
2179    
2180     if (min == max) continue;
2181    
2182     if (minimize)
2183     {
2184     #ifdef SUPPORT_UTF8
2185     /* UTF-8 mode */
2186     if (utf8)
2187     {
2188     register int d;
2189     for (fi = min;; fi++)
2190     {
2191     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2192     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2193     GETCHARINC(d, eptr);
2194     if (d < 256) d = md->lcc[d];
2195     if (fi >= max || eptr >= md->end_subject || fc == d)
2196     RRETURN(MATCH_NOMATCH);
2197     }
2198     }
2199     else
2200     #endif
2201     /* Not UTF-8 mode */
2202     {
2203     for (fi = min;; fi++)
2204     {
2205     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2206     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2207     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2208     RRETURN(MATCH_NOMATCH);
2209     }
2210     }
2211     /* Control never gets here */
2212     }
2213    
2214     /* Maximize case */
2215    
2216     else
2217     {
2218     pp = eptr;
2219    
2220     #ifdef SUPPORT_UTF8
2221     /* UTF-8 mode */
2222     if (utf8)
2223     {
2224     register int d;
2225     for (i = min; i < max; i++)
2226     {
2227     int len = 1;
2228     if (eptr >= md->end_subject) break;
2229     GETCHARLEN(d, eptr, len);
2230     if (d < 256) d = md->lcc[d];
2231     if (fc == d) break;
2232     eptr += len;
2233     }
2234     for(;;)
2235     {
2236     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2237     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2238     if (eptr-- == pp) break; /* Stop if tried at original pos */
2239     BACKCHAR(eptr);
2240     }
2241     }
2242     else
2243     #endif
2244     /* Not UTF-8 mode */
2245     {
2246     for (i = min; i < max; i++)
2247     {
2248     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2249     eptr++;
2250     }
2251     while (eptr >= pp)
2252     {
2253     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2254     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2255     eptr--;
2256     }
2257     }
2258    
2259     RRETURN(MATCH_NOMATCH);
2260     }
2261     /* Control never gets here */
2262     }
2263    
2264     /* Caseful comparisons */
2265    
2266     else
2267     {
2268     #ifdef SUPPORT_UTF8
2269     /* UTF-8 mode */
2270     if (utf8)
2271     {
2272     register int d;
2273     for (i = 1; i <= min; i++)
2274     {
2275     GETCHARINC(d, eptr);
2276     if (fc == d) RRETURN(MATCH_NOMATCH);
2277     }
2278     }
2279     else
2280     #endif
2281     /* Not UTF-8 mode */
2282     {
2283     for (i = 1; i <= min; i++)
2284     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2285     }
2286    
2287     if (min == max) continue;
2288    
2289     if (minimize)
2290     {
2291     #ifdef SUPPORT_UTF8
2292     /* UTF-8 mode */
2293     if (utf8)
2294     {
2295     register int d;
2296     for (fi = min;; fi++)
2297     {
2298     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2299     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2300     GETCHARINC(d, eptr);
2301     if (fi >= max || eptr >= md->end_subject || fc == d)
2302     RRETURN(MATCH_NOMATCH);
2303     }
2304     }
2305     else
2306     #endif
2307     /* Not UTF-8 mode */
2308     {
2309     for (fi = min;; fi++)
2310     {
2311     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2312     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2313     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2314     RRETURN(MATCH_NOMATCH);
2315     }
2316     }
2317     /* Control never gets here */
2318     }
2319    
2320     /* Maximize case */
2321    
2322     else
2323     {
2324     pp = eptr;
2325    
2326     #ifdef SUPPORT_UTF8
2327     /* UTF-8 mode */
2328     if (utf8)
2329     {
2330     register int d;
2331     for (i = min; i < max; i++)
2332     {
2333     int len = 1;
2334     if (eptr >= md->end_subject) break;
2335     GETCHARLEN(d, eptr, len);
2336     if (fc == d) break;
2337     eptr += len;
2338     }
2339     for(;;)
2340     {
2341     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2342     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2343     if (eptr-- == pp) break; /* Stop if tried at original pos */
2344     BACKCHAR(eptr);
2345     }
2346     }
2347     else
2348     #endif
2349     /* Not UTF-8 mode */
2350     {
2351     for (i = min; i < max; i++)
2352     {
2353     if (eptr >= md->end_subject || fc == *eptr) break;
2354     eptr++;
2355     }
2356     while (eptr >= pp)
2357     {
2358     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2359     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2360     eptr--;
2361     }
2362     }
2363    
2364     RRETURN(MATCH_NOMATCH);
2365     }
2366     }
2367     /* Control never gets here */
2368    
2369     /* Match a single character type repeatedly; several different opcodes
2370     share code. This is very similar to the code for single characters, but we
2371     repeat it in the interests of efficiency. */
2372    
2373     case OP_TYPEEXACT:
2374     min = max = GET2(ecode, 1);
2375     minimize = TRUE;
2376     ecode += 3;
2377     goto REPEATTYPE;
2378    
2379     case OP_TYPEUPTO:
2380     case OP_TYPEMINUPTO:
2381     min = 0;
2382     max = GET2(ecode, 1);
2383     minimize = *ecode == OP_TYPEMINUPTO;
2384     ecode += 3;
2385     goto REPEATTYPE;
2386    
2387     case OP_TYPESTAR:
2388     case OP_TYPEMINSTAR:
2389     case OP_TYPEPLUS:
2390     case OP_TYPEMINPLUS:
2391     case OP_TYPEQUERY:
2392     case OP_TYPEMINQUERY:
2393     c = *ecode++ - OP_TYPESTAR;
2394     minimize = (c & 1) != 0;
2395     min = rep_min[c]; /* Pick up values from tables; */
2396     max = rep_max[c]; /* zero for max => infinity */
2397     if (max == 0) max = INT_MAX;
2398    
2399     /* Common code for all repeated single character type matches. Note that
2400     in UTF-8 mode, '.' matches a character of any length, but for the other
2401     character types, the valid characters are all one-byte long. */
2402    
2403     REPEATTYPE:
2404     ctype = *ecode++; /* Code for the character type */
2405    
2406     #ifdef SUPPORT_UCP
2407     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2408     {
2409     prop_fail_result = ctype == OP_NOTPROP;
2410     prop_type = *ecode++;
2411     if (prop_type >= 128)
2412     {
2413     prop_test_against = prop_type - 128;
2414     prop_test_variable = &prop_category;
2415     }
2416     else
2417     {
2418     prop_test_against = prop_type;
2419     prop_test_variable = &prop_chartype;
2420     }
2421     }
2422     else prop_type = -1;
2423     #endif
2424    
2425     /* First, ensure the minimum number of matches are present. Use inline
2426     code for maximizing the speed, and do the type test once at the start
2427     (i.e. keep it out of the loop). Also we can test that there are at least
2428     the minimum number of bytes before we start. This isn't as effective in
2429     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2430     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2431     and single-bytes. */
2432    
2433     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2434     if (min > 0)
2435     {
2436     #ifdef SUPPORT_UCP
2437     if (prop_type > 0)
2438     {
2439     for (i = 1; i <= min; i++)
2440     {
2441     GETCHARINC(c, eptr);
2442     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2443     if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2444     RRETURN(MATCH_NOMATCH);
2445     }
2446     }
2447    
2448     /* Match extended Unicode sequences. We will get here only if the
2449     support is in the binary; otherwise a compile-time error occurs. */
2450    
2451     else if (ctype == OP_EXTUNI)
2452     {
2453     for (i = 1; i <= min; i++)
2454     {
2455     GETCHARINCTEST(c, eptr);
2456     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2457     if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2458     while (eptr < md->end_subject)
2459     {
2460     int len = 1;
2461     if (!utf8) c = *eptr; else
2462     {
2463     GETCHARLEN(c, eptr, len);
2464     }
2465     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2466     if (prop_category != ucp_M) break;
2467     eptr += len;
2468     }
2469     }
2470     }
2471    
2472     else
2473     #endif /* SUPPORT_UCP */
2474    
2475     /* Handle all other cases when the coding is UTF-8 */
2476    
2477     #ifdef SUPPORT_UTF8
2478     if (utf8) switch(ctype)
2479     {
2480     case OP_ANY:
2481     for (i = 1; i <= min; i++)
2482     {
2483     if (eptr >= md->end_subject ||
2484     (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
2485     RRETURN(MATCH_NOMATCH);
2486     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2487     }
2488     break;
2489    
2490     case OP_ANYBYTE:
2491     eptr += min;
2492     break;
2493    
2494     case OP_NOT_DIGIT:
2495     for (i = 1; i <= min; i++)
2496     {
2497     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2498     GETCHARINC(c, eptr);
2499     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2500     RRETURN(MATCH_NOMATCH);
2501     }
2502     break;
2503    
2504     case OP_DIGIT:
2505     for (i = 1; i <= min; i++)
2506     {
2507     if (eptr >= md->end_subject ||
2508     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2509     RRETURN(MATCH_NOMATCH);
2510     /* No need to skip more bytes - we know it's a 1-byte character */
2511     }
2512     break;
2513    
2514     case OP_NOT_WHITESPACE:
2515     for (i = 1; i <= min; i++)
2516     {
2517     if (eptr >= md->end_subject ||
2518     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2519     RRETURN(MATCH_NOMATCH);
2520     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2521     }
2522     break;
2523    
2524     case OP_WHITESPACE:
2525     for (i = 1; i <= min; i++)
2526     {
2527     if (eptr >= md->end_subject ||
2528     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2529     RRETURN(MATCH_NOMATCH);
2530     /* No need to skip more bytes - we know it's a 1-byte character */
2531     }
2532     break;
2533    
2534     case OP_NOT_WORDCHAR:
2535     for (i = 1; i <= min; i++)
2536     {
2537     if (eptr >= md->end_subject ||
2538     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2539     RRETURN(MATCH_NOMATCH);
2540     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2541     }
2542     break;
2543    
2544     case OP_WORDCHAR:
2545     for (i = 1; i <= min; i++)
2546     {
2547     if (eptr >= md->end_subject ||
2548     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2549     RRETURN(MATCH_NOMATCH);
2550     /* No need to skip more bytes - we know it's a 1-byte character */
2551     }
2552     break;
2553    
2554     default:
2555     RRETURN(PCRE_ERROR_INTERNAL);
2556     } /* End switch(ctype) */
2557    
2558     else
2559     #endif /* SUPPORT_UTF8 */
2560    
2561     /* Code for the non-UTF-8 case for minimum matching of operators other
2562     than OP_PROP and OP_NOTPROP. */
2563    
2564     switch(ctype)
2565     {
2566     case OP_ANY:
2567     if ((ims & PCRE_DOTALL) == 0)
2568     {
2569     for (i = 1; i <= min; i++)
2570     if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
2571     }
2572     else eptr += min;
2573     break;
2574    
2575     case OP_ANYBYTE:
2576     eptr += min;
2577     break;
2578    
2579     case OP_NOT_DIGIT:
2580     for (i = 1; i <= min; i++)
2581     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2582     break;
2583    
2584     case OP_DIGIT:
2585     for (i = 1; i <= min; i++)
2586     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2587     break;
2588    
2589     case OP_NOT_WHITESPACE:
2590     for (i = 1; i <= min; i++)
2591     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2592     break;
2593    
2594     case OP_WHITESPACE:
2595     for (i = 1; i <= min; i++)
2596     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2597     break;
2598    
2599     case OP_NOT_WORDCHAR:
2600     for (i = 1; i <= min; i++)
2601     if ((md->ctypes[*eptr++] & ctype_word) != 0)
2602     RRETURN(MATCH_NOMATCH);
2603     break;
2604    
2605     case OP_WORDCHAR:
2606     for (i = 1; i <= min; i++)
2607     if ((md->ctypes[*eptr++] & ctype_word) == 0)
2608     RRETURN(MATCH_NOMATCH);
2609     break;
2610    
2611     default:
2612     RRETURN(PCRE_ERROR_INTERNAL);
2613     }
2614     }
2615    
2616     /* If min = max, continue at the same level without recursing */
2617    
2618     if (min == max) continue;
2619    
2620     /* If minimizing, we have to test the rest of the pattern before each
2621     subsequent match. Again, separate the UTF-8 case for speed, and also
2622     separate the UCP cases. */
2623    
2624     if (minimize)
2625     {
2626     #ifdef SUPPORT_UCP
2627     if (prop_type > 0)
2628     {
2629     for (fi = min;; fi++)
2630     {
2631     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2632     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2633     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2634     GETCHARINC(c, eptr);
2635     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2636     if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2637     RRETURN(MATCH_NOMATCH);
2638     }
2639     }
2640    
2641     /* Match extended Unicode sequences. We will get here only if the
2642     support is in the binary; otherwise a compile-time error occurs. */
2643    
2644     else if (ctype == OP_EXTUNI)
2645     {
2646     for (fi = min;; fi++)
2647     {
2648     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2649     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2650     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2651     GETCHARINCTEST(c, eptr);
2652     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2653     if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2654     while (eptr < md->end_subject)
2655     {
2656     int len = 1;
2657     if (!utf8) c = *eptr; else
2658     {
2659     GETCHARLEN(c, eptr, len);
2660     }
2661     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2662     if (prop_category != ucp_M) break;
2663     eptr += len;
2664     }
2665     }
2666     }
2667    
2668     else
2669     #endif /* SUPPORT_UCP */
2670    
2671     #ifdef SUPPORT_UTF8
2672     /* UTF-8 mode */
2673     if (utf8)
2674     {
2675     for (fi = min;; fi++)
2676     {
2677     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2678     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2679     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2680    
2681     GETCHARINC(c, eptr);
2682     switch(ctype)
2683     {
2684     case OP_ANY:
2685     if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2686     break;
2687    
2688     case OP_ANYBYTE:
2689     break;
2690    
2691     case OP_NOT_DIGIT:
2692     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2693     RRETURN(MATCH_NOMATCH);
2694     break;
2695    
2696     case OP_DIGIT:
2697     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2698     RRETURN(MATCH_NOMATCH);
2699     break;
2700    
2701     case OP_NOT_WHITESPACE:
2702     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2703     RRETURN(MATCH_NOMATCH);
2704     break;
2705    
2706     case OP_WHITESPACE:
2707     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2708     RRETURN(MATCH_NOMATCH);
2709     break;
2710    
2711     case OP_NOT_WORDCHAR:
2712     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2713     RRETURN(MATCH_NOMATCH);
2714     break;
2715    
2716     case OP_WORDCHAR:
2717     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2718     RRETURN(MATCH_NOMATCH);
2719     break;
2720    
2721     default:
2722     RRETURN(PCRE_ERROR_INTERNAL);
2723     }
2724     }
2725     }
2726     else
2727     #endif
2728     /* Not UTF-8 mode */
2729     {
2730     for (fi = min;; fi++)
2731     {
2732     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2733     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2734     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2735     c = *eptr++;
2736     switch(ctype)
2737     {
2738     case OP_ANY:
2739     if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2740     break;
2741    
2742     case OP_ANYBYTE:
2743     break;
2744    
2745     case OP_NOT_DIGIT:
2746     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2747     break;
2748    
2749     case OP_DIGIT:
2750     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2751     break;
2752    
2753     case OP_NOT_WHITESPACE:
2754     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2755     break;
2756    
2757     case OP_WHITESPACE:
2758     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2759     break;
2760    
2761     case OP_NOT_WORDCHAR:
2762     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2763     break;
2764    
2765     case OP_WORDCHAR:
2766     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2767     break;
2768    
2769     default:
2770     RRETURN(PCRE_ERROR_INTERNAL);
2771     }
2772     }
2773     }
2774     /* Control never gets here */
2775     }
2776    
2777     /* If maximizing it is worth using inline code for speed, doing the type
2778     test once at the start (i.e. keep it out of the loop). Again, keep the
2779     UTF-8 and UCP stuff separate. */
2780    
2781     else
2782     {
2783     pp = eptr; /* Remember where we started */
2784    
2785     #ifdef SUPPORT_UCP
2786     if (prop_type > 0)
2787     {
2788     for (i = min; i < max; i++)
2789     {
2790     int len = 1;
2791     if (eptr >= md->end_subject) break;
2792     GETCHARLEN(c, eptr, len);
2793     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2794     if ((*prop_test_variable == prop_test_against) == prop_fail_result)
2795     break;
2796     eptr+= len;
2797     }
2798    
2799     /* eptr is now past the end of the maximum run */
2800    
2801     for(;;)
2802     {
2803     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2804     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2805     if (eptr-- == pp) break; /* Stop if tried at original pos */
2806     BACKCHAR(eptr);
2807     }
2808     }
2809    
2810     /* Match extended Unicode sequences. We will get here only if the
2811     support is in the binary; otherwise a compile-time error occurs. */
2812    
2813     else if (ctype == OP_EXTUNI)
2814     {
2815     for (i = min; i < max; i++)
2816     {
2817     if (eptr >= md->end_subject) break;
2818     GETCHARINCTEST(c, eptr);
2819     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2820     if (prop_category == ucp_M) break;
2821     while (eptr < md->end_subject)
2822     {
2823     int len = 1;
2824     if (!utf8) c = *eptr; else
2825     {
2826     GETCHARLEN(c, eptr, len);
2827     }
2828     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2829     if (prop_category != ucp_M) break;
2830     eptr += len;
2831     }
2832     }
2833    
2834     /* eptr is now past the end of the maximum run */
2835    
2836     for(;;)
2837     {
2838     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2839     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2840     if (eptr-- == pp) break; /* Stop if tried at original pos */
2841     for (;;) /* Move back over one extended */
2842     {
2843     int len = 1;
2844     BACKCHAR(eptr);
2845     if (!utf8) c = *eptr; else
2846     {
2847     GETCHARLEN(c, eptr, len);
2848     }
2849     prop_category = ucp_findchar(c, &prop_chartype, &prop_othercase);
2850     if (prop_category != ucp_M) break;
2851     eptr--;
2852     }
2853     }
2854     }
2855    
2856     else
2857     #endif /* SUPPORT_UCP */
2858    
2859     #ifdef SUPPORT_UTF8
2860     /* UTF-8 mode */
2861    
2862     if (utf8)
2863     {
2864     switch(ctype)
2865     {
2866     case OP_ANY:
2867    
2868     /* Special code is required for UTF8, but when the maximum is unlimited
2869     we don't need it, so we repeat the non-UTF8 code. This is probably
2870     worth it, because .* is quite a common idiom. */
2871    
2872     if (max < INT_MAX)
2873     {
2874     if ((ims & PCRE_DOTALL) == 0)
2875     {
2876     for (i = min; i < max; i++)
2877     {
2878     if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2879     eptr++;
2880     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2881     }
2882     }
2883     else
2884     {
2885     for (i = min; i < max; i++)
2886     {
2887     eptr++;
2888     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2889     }
2890     }
2891     }
2892    
2893     /* Handle unlimited UTF-8 repeat */
2894    
2895     else
2896     {
2897     if ((ims & PCRE_DOTALL) == 0)
2898     {
2899     for (i = min; i < max; i++)
2900     {
2901     if (eptr >= md->end_subject || *eptr == NEWLINE) break;
2902     eptr++;
2903     }
2904     break;
2905     }
2906     else
2907     {
2908     c = max - min;
2909     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2910     eptr += c;
2911     }
2912     }
2913     break;
2914    
2915     /* The byte case is the same as non-UTF8 */
2916    
2917     case OP_ANYBYTE:
2918     c = max - min;
2919     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
2920     eptr += c;
2921     break;
2922    
2923     case OP_NOT_DIGIT:
2924     for (i = min; i < max; i++)
2925     {
2926     int len = 1;
2927     if (eptr >= md->end_subject) break;
2928     GETCHARLEN(c, eptr, len);
2929     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
2930     eptr+= len;
2931     }
2932     break;
2933    
2934     case OP_DIGIT:
2935     for (i = min; i < max; i++)
2936     {
2937     int len = 1;
2938     if (eptr >= md->end_subject) break;
2939     GETCHARLEN(c, eptr, len);
2940     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
2941     eptr+= len;
2942     }
2943     break;
2944    
2945     case OP_NOT_WHITESPACE:
2946     for (i = min; i < max; i++)
2947     {
2948     int len = 1;
2949     if (eptr >= md->end_subject) break;
2950     GETCHARLEN(c, eptr, len);
2951     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
2952     eptr+= len;
2953     }
2954     break;
2955    
2956     case OP_WHITESPACE:
2957     for (i = min; i < max; i++)
2958     {
2959     int len = 1;
2960     if (eptr >= md->end_subject) break;
2961     GETCHARLEN(c, eptr, len);
2962     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
2963     eptr+= len;
2964     }
2965     break;
2966    
2967     case OP_NOT_WORDCHAR:
2968     for (i = min; i < max; i++)
2969     {
2970     int len = 1;
2971     if (eptr >= md->end_subject) break;
2972     GETCHARLEN(c, eptr, len);
2973     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
2974     eptr+= len;
2975     }
2976     break;
2977    
2978     case OP_WORDCHAR:
2979     for (i = min; i < max; i++)
2980     {
2981     int len = 1;
2982     if (eptr >= md->end_subject) break;
2983     GETCHARLEN(c, eptr, len);
2984     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
2985     eptr+= len;
2986     }
2987     break;
2988    
2989     default:
2990     RRETURN(PCRE_ERROR_INTERNAL);
2991     }
2992    
2993     /* eptr is now past the end of the maximum run */
2994    
2995     for(;;)
2996     {
2997     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2998     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2999     if (eptr-- == pp) break; /* Stop if tried at original pos */
3000     BACKCHAR(eptr);
3001     }
3002     }
3003     else
3004     #endif
3005    
3006     /* Not UTF-8 mode */
3007     {
3008     switch(ctype)
3009     {
3010     case OP_ANY:
3011     if ((ims & PCRE_DOTALL) == 0)
3012     {
3013     for (i = min; i < max; i++)
3014     {
3015     if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3016     eptr++;
3017     }
3018     break;
3019     }
3020     /* For DOTALL case, fall through and treat as \C */
3021    
3022     case OP_ANYBYTE:
3023     c = max - min;
3024     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3025     eptr += c;
3026     break;
3027    
3028     case OP_NOT_DIGIT:
3029     for (i = min; i < max; i++)
3030     {
3031     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3032     break;
3033     eptr++;
3034     }
3035     break;
3036    
3037     case OP_DIGIT:
3038     for (i = min; i < max; i++)
3039     {
3040     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3041     break;
3042     eptr++;
3043     }
3044     break;
3045    
3046     case OP_NOT_WHITESPACE:
3047     for (i = min; i < max; i++)
3048     {
3049     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3050     break;
3051     eptr++;
3052     }
3053     break;
3054    
3055     case OP_WHITESPACE:
3056     for (i = min; i < max; i++)
3057     {
3058     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3059     break;
3060     eptr++;
3061     }
3062     break;
3063    
3064     case OP_NOT_WORDCHAR:
3065     for (i = min; i < max; i++)
3066     {
3067     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3068     break;
3069     eptr++;
3070     }
3071     break;
3072    
3073     case OP_WORDCHAR:
3074     for (i = min; i < max; i++)
3075     {
3076     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3077     break;
3078     eptr++;
3079     }
3080     break;
3081    
3082     default:
3083     RRETURN(PCRE_ERROR_INTERNAL);
3084     }
3085    
3086     /* eptr is now past the end of the maximum run */
3087    
3088     while (eptr >= pp)
3089     {
3090     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3091     eptr--;
3092     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3093     }
3094     }
3095    
3096     /* Get here if we can't make it match with any permitted repetitions */
3097    
3098     RRETURN(MATCH_NOMATCH);
3099     }
3100     /* Control never gets here */
3101    
3102     /* There's been some horrible disaster. Since all codes > OP_BRA are
3103     for capturing brackets, and there shouldn't be any gaps between 0 and
3104     OP_BRA, arrival here can only mean there is something seriously wrong
3105     in the code above or the OP_xxx definitions. */
3106    
3107     default:
3108     DPRINTF(("Unknown opcode %d\n", *ecode));
3109     RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3110     }
3111    
3112     /* Do not stick any code in here without much thought; it is assumed
3113     that "continue" in the code above comes out to here to repeat the main
3114     loop. */
3115    
3116     } /* End of main loop */
3117     /* Control never reaches here */
3118     }
3119    
3120    
3121     /***************************************************************************
3122     ****************************************************************************
3123     RECURSION IN THE match() FUNCTION
3124    
3125     Undefine all the macros that were defined above to handle this. */
3126    
3127     #ifdef NO_RECURSE /* the names in this section are macros only in NO_RECURSE builds */
3128     #undef eptr
3129     #undef ecode
3130     #undef offset_top
3131     #undef ims
3132     #undef eptrb
3133     #undef flags
3134     /* NOTE(review): the group above looks like match()'s argument names (cf. the RMATCH calls); the groups below look like its working variables - confirm against the #define list earlier in the file */
3135     #undef callpat
3136     #undef charptr
3137     #undef data
3138     #undef next
3139     #undef pp
3140     #undef prev
3141     #undef saved_eptr
3142    
3143     #undef new_recursive
3144    
3145     #undef cur_is_word
3146     #undef condition
3147     #undef minimize
3148     #undef prev_is_word
3149    
3150     #undef original_ims
3151    
3152     #undef ctype
3153     #undef length
3154     #undef max
3155     #undef min
3156     #undef number
3157     #undef offset
3158     #undef op
3159     #undef save_capture_last
3160     #undef save_offset1
3161     #undef save_offset2
3162     #undef save_offset3
3163     #undef stacksave
3164    
3165     #undef newptrb
3166     /* Removing the macros lets pcre_exec() below use ims, flags, length and pp as ordinary identifiers */
3167     #endif /* NO_RECURSE */
3168    
3169     /* These two are defined as macros in both cases, so they are undefined
3170     unconditionally, outside the NO_RECURSE conditional. */
3171     #undef fc
3172     #undef fi
3173    
3174     /***************************************************************************
3175     ***************************************************************************/
3176    
3177    
3178    
3179     /*************************************************
3180     * Execute a Regular Expression *
3181     *************************************************/
3182    
3183     /* This function applies a compiled re to a subject string and picks out
3184     portions of the string if it matches. Two elements in the vector are set for
3185     each substring: the offsets to the start and end of the substring.
3186    
3187     Arguments:
3188     argument_re points to the compiled expression
3189     extra_data points to extra data or is NULL
3190     subject points to the subject string
3191     length length of subject string (may contain binary zeros)
3192     start_offset where to start in the subject string
3193     options option bits
3194     offsets points to a vector of ints to be filled in with offsets
3195     offsetcount the number of elements in the vector
3196    
3197     Returns: > 0 => success; value is the number of elements filled in
3198     = 0 => success, but offsets is not big enough
3199     -1 => failed to match
3200     < -1 => some kind of unexpected problem
3201     */
3202    
3203     EXPORT int
3204     pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3205     const char *subject, int length, int start_offset, int options, int *offsets,
3206     int offsetcount)
3207     {
3208     int rc, resetcount, ocount;
3209     int first_byte = -1;
3210     int req_byte = -1;
3211     int req_byte2 = -1;
3212     unsigned long int ims = 0;
3213     BOOL using_temporary_offsets = FALSE;
3214     BOOL anchored;
3215     BOOL startline;
3216     BOOL firstline;
3217     BOOL first_byte_caseless = FALSE;
3218     BOOL req_byte_caseless = FALSE;
3219     match_data match_block;
3220     const uschar *tables;
3221     const uschar *start_bits = NULL;
3222     const uschar *start_match = (const uschar *)subject + start_offset;
3223     const uschar *end_subject;
3224     const uschar *req_byte_ptr = start_match - 1;
3225    
3226     pcre_study_data internal_study;
3227     const pcre_study_data *study;
3228    
3229     real_pcre internal_re;
3230     const real_pcre *external_re = (const real_pcre *)argument_re;
3231     const real_pcre *re = external_re;
3232    
3233     /* Plausibility checks */
3234    
3235     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3236     if (re == NULL || subject == NULL ||
3237     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3238     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3239    
3240     /* Fish out the optional data from the extra_data structure, first setting
3241     the default values. */
3242    
3243     study = NULL;
3244     match_block.match_limit = MATCH_LIMIT;
3245     match_block.callout_data = NULL;
3246    
3247     /* The table pointer is always in native byte order. */
3248    
3249     tables = external_re->tables;
3250    
3251     if (extra_data != NULL)
3252     {
3253     register unsigned int flags = extra_data->flags;
3254     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3255     study = (const pcre_study_data *)extra_data->study_data;
3256     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3257     match_block.match_limit = extra_data->match_limit;
3258     if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3259     match_block.callout_data = extra_data->callout_data;
3260     if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3261     }
3262    
3263     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3264     is a feature that makes it possible to save compiled regex and re-use them
3265     in other programs later. */
3266    
3267     if (tables == NULL) tables = _pcre_default_tables;
3268    
3269     /* Check that the first field in the block is the magic number. If it is not,
3270     test for a regex that was compiled on a host of opposite endianness. If this is
3271     the case, flipped values are put in internal_re and internal_study if there was
3272     study data too. */
3273    
3274     if (re->magic_number != MAGIC_NUMBER)
3275     {
3276     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3277     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3278     if (study != NULL) study = &internal_study;
3279     }
3280    
3281     /* Set up other data */
3282    
3283     anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3284     startline = (re->options & PCRE_STARTLINE) != 0;
3285     firstline = (re->options & PCRE_FIRSTLINE) != 0;
3286    
3287     /* The code starts after the real_pcre block and the capture name table. */
3288    
3289     match_block.start_code = (const uschar *)external_re + re->name_table_offset +
3290     re->name_count * re->name_entry_size;
3291    
3292     match_block.start_subject = (const uschar *)subject;
3293     match_block.start_offset = start_offset;
3294     match_block.end_subject = match_block.start_subject + length;
3295     end_subject = match_block.end_subject;
3296    
3297     match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3298     match_block.utf8 = (re->options & PCRE_UTF8) != 0;
3299    
3300     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3301     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3302     match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
3303     match_block.partial = (options & PCRE_PARTIAL) != 0;
3304     match_block.hitend = FALSE;
3305    
3306     match_block.recursive = NULL; /* No recursion at top level */
3307    
3308     match_block.lcc = tables + lcc_offset;
3309     match_block.ctypes = tables + ctypes_offset;
3310    
3311     /* Partial matching is supported only for a restricted set of regexes at the
3312     moment. */
3313    
3314     if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
3315     return PCRE_ERROR_BADPARTIAL;
3316    
3317     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3318     back the character offset. */
3319    
3320     #ifdef SUPPORT_UTF8
3321     if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3322     {
3323     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3324     return PCRE_ERROR_BADUTF8;
3325     if (start_offset > 0 && start_offset < length)
3326     {
3327     int tb = ((uschar *)subject)[start_offset];
3328     if (tb > 127)
3329     {
3330     tb &= 0xc0;
3331     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3332     }
3333     }
3334     }
3335     #endif
3336    
3337     /* The ims options can vary during the matching as a result of the presence
3338     of (?ims) items in the pattern. They are kept in a local variable so that
3339     restoring at the exit of a group is easy. */
3340    
3341     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3342    
3343     /* If the expression has got more back references than the offsets supplied can
3344     hold, we get a temporary chunk of working store to use during the matching.
3345     Otherwise, we can use the vector supplied, rounding down its size to a multiple
3346     of 3. */
3347    
3348     ocount = offsetcount - (offsetcount % 3);
3349    
3350     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3351     {
3352     ocount = re->top_backref * 3 + 3;
3353     match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3354     if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3355     using_temporary_offsets = TRUE;
3356     DPRINTF(("Got memory to hold back references\n"));
3357     }
3358     else match_block.offset_vector = offsets;
3359    
3360     match_block.offset_end = ocount;
3361     match_block.offset_max = (2*ocount)/3;
3362     match_block.offset_overflow = FALSE;
3363     match_block.capture_last = -1;
3364    
3365     /* Compute the minimum number of offsets that we need to reset each time. Doing
3366     this makes a huge difference to execution time when there aren't many brackets
3367     in the pattern. */
3368    
3369     resetcount = 2 + re->top_bracket * 2;
3370     if (resetcount > offsetcount) resetcount = ocount;
3371    
3372     /* Reset the working variable associated with each extraction. These should
3373     never be used unless previously set, but they get saved and restored, and so we
3374     initialize them to avoid reading uninitialized locations. */
3375    
3376     if (match_block.offset_vector != NULL)
3377     {
3378     register int *iptr = match_block.offset_vector + ocount;
3379     register int *iend = iptr - resetcount/2 + 1;
3380     while (--iptr >= iend) *iptr = -1;
3381     }
3382    
3383     /* Set up the first character to match, if available. The first_byte value is
3384     never set for an anchored regular expression, but the anchoring may be forced
3385     at run time, so we have to test for anchoring. The first char may be unset for
3386     an unanchored pattern, of course. If there's no first char and the pattern was
3387     studied, there may be a bitmap of possible first characters. */
3388    
3389     if (!anchored)
3390     {
3391     if ((re->options & PCRE_FIRSTSET) != 0)
3392     {
3393     first_byte = re->first_byte & 255;
3394     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3395     first_byte = match_block.lcc[first_byte];
3396     }
3397     else
3398     if (!startline && study != NULL &&
3399     (study->options & PCRE_STUDY_MAPPED) != 0)
3400     start_bits = study->start_bits;
3401     }
3402    
3403     /* For anchored or unanchored matches, there may be a "last known required
3404     character" set. */
3405    
3406     if ((re->options & PCRE_REQCHSET) != 0)
3407     {
3408     req_byte = re->req_byte & 255;
3409     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3410     req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3411     }
3412    
3413     /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3414     the loop runs just once. */
3415    
3416     do
3417     {
3418     const uschar *save_end_subject = end_subject;
3419    
3420     /* Reset the maximum number of extractions we might see. */
3421    
3422     if (match_block.offset_vector != NULL)
3423     {
3424     register int *iptr = match_block.offset_vector;
3425     register int *iend = iptr + resetcount;
3426     while (iptr < iend) *iptr++ = -1;
3427     }
3428    
3429     /* Advance to a unique first char if possible. If firstline is TRUE, the
3430     start of the match is constrained to the first line of a multiline string.
3431     Implement this by temporarily adjusting end_subject so that we stop scanning
3432     at a newline. If the match fails at the newline, later code breaks this loop.
3433     */
3434    
3435     if (firstline)
3436     {
3437     const uschar *t = start_match;
3438     while (t < save_end_subject && *t != '\n') t++;
3439     end_subject = t;
3440     }
3441    
3442     /* Now test for a unique first byte */
3443    
3444     if (first_byte >= 0)
3445     {
3446     if (first_byte_caseless)
3447     while (start_match < end_subject &&
3448     match_block.lcc[*start_match] != first_byte)
3449     start_match++;
3450     else
3451     while (start_match < end_subject && *start_match != first_byte)
3452     start_match++;
3453     }
3454    
3455     /* Or to just after \n for a multiline match if possible */
3456    
3457     else if (startline)
3458     {
3459     if (start_match > match_block.start_subject + start_offset)
3460     {
3461     while (start_match < end_subject && start_match[-1] != NEWLINE)
3462     start_match++;
3463     }
3464     }
3465    
3466     /* Or to a non-unique first char after study */
3467    
3468     else if (start_bits != NULL)
3469     {
3470     while (start_match < end_subject)
3471     {
3472     register unsigned int c = *start_match;
3473     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3474     }
3475     }
3476    
3477     /* Restore fudged end_subject */
3478    
3479     end_subject = save_end_subject;
3480    
3481     #ifdef DEBUG /* Sigh. Some compilers never learn. */
3482     printf(">>>> Match against: ");
3483     pchars(start_match, end_subject - start_match, TRUE, &match_block);
3484     printf("\n");
3485     #endif
3486    
3487     /* If req_byte is set, we know that that character must appear in the subject
3488     for the match to succeed. If the first character is set, req_byte must be
3489     later in the subject; otherwise the test starts at the match point. This
3490     optimization can save a huge amount of backtracking in patterns with nested
3491     unlimited repeats that aren't going to match. Writing separate code for
3492     cased/caseless versions makes it go faster, as does using an autoincrement
3493     and backing off on a match.
3494    
3495     HOWEVER: when the subject string is very, very long, searching to its end can
3496     take a long time, and give bad performance on quite ordinary patterns. This
3497     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3498     don't do this when the string is sufficiently long.
3499    
3500     ALSO: this processing is disabled when partial matching is requested.
3501     */
3502    
3503     if (req_byte >= 0 &&
3504     end_subject - start_match < REQ_BYTE_MAX &&
3505     !match_block.partial)
3506     {
3507     register const uschar *p = start_match + ((first_byte >= 0)? 1 : 0);
3508    
3509     /* We don't need to repeat the search if we haven't yet reached the
3510     place we found it at last time. */
3511    
3512     if (p > req_byte_ptr)
3513     {
3514     if (req_byte_caseless)
3515     {
3516     while (p < end_subject)
3517     {
3518     register int pp = *p++;
3519     if (pp == req_byte || pp == req_byte2) { p--; break; }
3520     }
3521     }
3522     else
3523     {
3524     while (p < end_subject)
3525     {
3526     if (*p++ == req_byte) { p--; break; }
3527     }
3528     }
3529    
3530     /* If we can't find the required character, break the matching loop */
3531    
3532     if (p >= end_subject) break;
3533    
3534     /* If we have found the required character, save the point where we
3535     found it, so that we don't search again next time round the loop if
3536     the start hasn't passed this character yet. */
3537    
3538     req_byte_ptr = p;
3539     }
3540     }
3541    
3542     /* When a match occurs, substrings will be set for all internal extractions;
3543     we just need to set up the whole thing as substring 0 before returning. If
3544     there were too many extractions, set the return code to zero. In the case
3545     where we had to get some local store to hold offsets for backreferences, copy
3546     those back references that we can. In this case there need not be overflow
3547     if certain parts of the pattern were not used. */
3548    
3549     match_block.start_match = start_match;
3550     match_block.match_call_count = 0;
3551    
3552     rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
3553     match_isgroup);
3554    
3555     /* When the result is no match, if the subject's first character was a
3556     newline and the PCRE_FIRSTLINE option is set, break (which will return
3557     PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3558     newline in the subject. Otherwise, advance the pointer to the next character
3559     and continue - but the continuation will actually happen only when the
3560     pattern is not anchored. */
3561    
3562     if (rc == MATCH_NOMATCH)
3563     {
3564     if (firstline && *start_match == NEWLINE) break;
3565     start_match++;
3566     #ifdef SUPPORT_UTF8
3567     if (match_block.utf8)
3568     while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3569     start_match++;
3570     #endif
3571     continue;
3572     }
3573    
3574     if (rc != MATCH_MATCH)
3575     {
3576     DPRINTF((">>>> error: returning %d\n", rc));
3577     return rc;
3578     }
3579    
3580     /* We have a match! Copy the offset information from temporary store if
3581     necessary */
3582    
3583     if (using_temporary_offsets)
3584     {
3585     if (offsetcount >= 4)
3586     {
3587     memcpy(offsets + 2, match_block.offset_vector + 2,
3588     (offsetcount - 2) * sizeof(int));
3589     DPRINTF(("Copied offsets from temporary memory\n"));
3590     }
3591     if (match_block.end_offset_top > offsetcount)
3592     match_block.offset_overflow = TRUE;
3593    
3594     DPRINTF(("Freeing temporary memory\n"));
3595     (pcre_free)(match_block.offset_vector);
3596     }
3597    
3598     rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3599    
3600     if (offsetcount < 2) rc = 0; else
3601     {
3602     offsets[0] = start_match - match_block.start_subject;
3603     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3604     }
3605    
3606     DPRINTF((">>>> returning %d\n", rc));
3607     return rc;
3608     }
3609    
3610     /* This "while" is the end of the "do" above */
3611    
3612     while (!anchored && start_match <= end_subject);
3613    
3614     if (using_temporary_offsets)
3615     {
3616     DPRINTF(("Freeing temporary memory\n"));
3617     (pcre_free)(match_block.offset_vector);
3618     }
3619    
3620     if (match_block.partial && match_block.hitend)
3621     {
3622     DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3623     return PCRE_ERROR_PARTIAL;
3624     }
3625     else
3626     {
3627     DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3628     return PCRE_ERROR_NOMATCH;
3629     }
3630     }
3631    
3632     /* End of pcre_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12