/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 87 - (hide annotations) (download)
Sat Feb 24 21:41:21 2007 UTC (7 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 117538 byte(s)
Load pcre-6.5 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45    
46     #include "pcre_internal.h"
47    
48    
49     /* Structure for building a chain of data that actually lives on the
50     stack, for holding the values of the subject pointer at the start of each
51     subpattern, so as to detect when an empty string has been matched by a
52     subpattern - to break infinite loops. When NO_RECURSE is set, these blocks
53     are on the heap, not on the stack. */
54    
55     typedef struct eptrblock {
56     struct eptrblock *epb_prev;
57 nigel 87 USPTR epb_saved_eptr;
58 nigel 77 } eptrblock;
59    
/* Flag bits for the match() function */

#define match_condassert   0x01    /* Called to check a condition assertion */
#define match_isgroup      0x02    /* Set if start of bracketed group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* Min and max values for the common repeats; for the maxima, 0 => infinity */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
81    
82    
83    
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Characters for which isprint() is false are written as
\xhh escapes.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
/* When printing from the subject, never run past md->end_subject. */

if (is_subject && length > md->end_subject - p)
  length = md->end_subject - p;

for (; length > 0; length--)
  {
  int c = *p++;
  if (isprint(c)) printf("%c", c); else printf("\\x%02x", c);
  }
}
#endif
110    
111    
112    
113     /*************************************************
114     * Match a back-reference *
115     *************************************************/
116    
117     /* If a back reference hasn't been set, the length that is passed is greater
118     than the number of characters left in the string, so the match fails.
119    
120     Arguments:
121     offset index into the offset vector
122     eptr points into the subject
123     length length to be matched
124     md points to match data block
125     ims the ims flags
126    
127     Returns: TRUE if matched
128     */
129    
130     static BOOL
131 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
132 nigel 77 unsigned long int ims)
133     {
134 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
135 nigel 77
136     #ifdef DEBUG
137     if (eptr >= md->end_subject)
138     printf("matching subject <null>");
139     else
140     {
141     printf("matching subject ");
142     pchars(eptr, length, TRUE, md);
143     }
144     printf(" against backref ");
145     pchars(p, length, FALSE, md);
146     printf("\n");
147     #endif
148    
149     /* Always fail if not enough characters left */
150    
151     if (length > md->end_subject - eptr) return FALSE;
152    
153     /* Separate the caselesss case for speed */
154    
155     if ((ims & PCRE_CASELESS) != 0)
156     {
157     while (length-- > 0)
158     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
159     }
160     else
161     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
162    
163     return TRUE;
164     }
165    
166    
167    
168     /***************************************************************************
169     ****************************************************************************
170     RECURSION IN THE match() FUNCTION
171    
172 nigel 87 The match() function is highly recursive, though not every recursive call
173     increases the recursive depth. Nevertheless, some regular expressions can cause
174     it to recurse to a great depth. I was writing for Unix, so I just let it call
175     itself recursively. This uses the stack for saving everything that has to be
176     saved for a recursive call. On Unix, the stack can be large, and this works
177     fine.
178 nigel 77
179 nigel 87 It turns out that on some non-Unix-like systems there are problems with
180     programs that use a lot of stack. (This despite the fact that every last chip
181     has oodles of memory these days, and techniques for extending the stack have
182     been known for decades.) So....
183 nigel 77
184     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
185     calls by keeping local variables that need to be preserved in blocks of memory
186 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
187 nigel 77 achieve this so that the actual code doesn't look very different to what it
188     always used to.
189     ****************************************************************************
190     ***************************************************************************/
191    
192    
193 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
194     versions and production versions. */
195 nigel 77
196     #ifndef NO_RECURSE
/* With true recursion the frequently used arguments of match() can be given
the "register" storage class. */

#define REGISTER register

#ifdef DEBUG

/* Debugging versions: report the source line of each recursive call to
match() and of each return. RMATCH stores the result of the call in rx; the
call is made with a recursion depth one greater than the caller's rdepth. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }

#else

/* Production versions: a plain recursive call and a plain return. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
  rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#endif
215    
216 nigel 77 #else
217    
218    
219     /* These versions of the macros manage a private stack on the heap. Note
220     that the rd argument of RMATCH isn't actually used. It's the md argument of
221     match(), which never changes. */
222    
223     #define REGISTER
224    
/* Heap version of RMATCH: simulate a recursive call to match() without using
the C stack. A new heapframe is obtained from pcre_stack_malloc, loaded with
the "arguments" of the call (with the recursion depth incremented), and chained
to the current frame; control then jumps to HEAP_RECURSE. The setjmp() context
saved in the current frame is where RRETURN longjmps back to, at which point
the current frame is recovered from md->thisframe and the result of the "call"
is taken from its Xresult field and stored in rx. The rd argument is not used.

If the new frame cannot be allocated, the current invocation is abandoned with
PCRE_ERROR_NOMEMORY rather than writing through a null pointer. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }
249    
/* Heap version of RRETURN: release the current frame with pcre_stack_free and
step back to the frame that "called" it. If there is such a frame, the result
is left in its Xresult field, md->thisframe is set to it, and control is
transferred by longjmp() to the setjmp() context saved in it by RMATCH. If
there is no previous frame, this was the top-level frame, and match() itself
returns the value. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = ra;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return ra;\
  }
263    
264    
265     /* Structure for remembering the local variables in a private frame */
266    
267     typedef struct heapframe {
268     struct heapframe *Xprevframe;
269    
270     /* Function arguments that may change */
271    
272     const uschar *Xeptr;
273     const uschar *Xecode;
274     int Xoffset_top;
275     long int Xims;
276     eptrblock *Xeptrb;
277     int Xflags;
278 nigel 87 int Xrdepth;
279 nigel 77
280     /* Function local variables */
281    
282     const uschar *Xcallpat;
283     const uschar *Xcharptr;
284     const uschar *Xdata;
285     const uschar *Xnext;
286     const uschar *Xpp;
287     const uschar *Xprev;
288     const uschar *Xsaved_eptr;
289    
290     recursion_info Xnew_recursive;
291    
292     BOOL Xcur_is_word;
293     BOOL Xcondition;
294     BOOL Xminimize;
295     BOOL Xprev_is_word;
296    
297     unsigned long int Xoriginal_ims;
298    
299     #ifdef SUPPORT_UCP
300     int Xprop_type;
301 nigel 87 int Xprop_value;
302 nigel 77 int Xprop_fail_result;
303     int Xprop_category;
304     int Xprop_chartype;
305 nigel 87 int Xprop_script;
306 nigel 77 int *Xprop_test_variable;
307     #endif
308    
309     int Xctype;
310     int Xfc;
311     int Xfi;
312     int Xlength;
313     int Xmax;
314     int Xmin;
315     int Xnumber;
316     int Xoffset;
317     int Xop;
318     int Xsave_capture_last;
319     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
320     int Xstacksave[REC_STACK_SAVE_MAX];
321    
322     eptrblock Xnewptrb;
323    
324     /* Place to pass back result, and where to jump back to */
325    
326     int Xresult;
327     jmp_buf Xwhere;
328    
329     } heapframe;
330    
331     #endif
332    
333    
334     /***************************************************************************
335     ***************************************************************************/
336    
337    
338    
339     /*************************************************
340     * Match from current position *
341     *************************************************/
342    
343     /* On entry ecode points to the first opcode, and eptr to the first character
344     in the subject string, while eptrb holds the value of eptr at the start of the
345     last bracketed group - used for breaking infinite loops matching zero-length
346     strings. This function is called recursively in many circumstances. Whenever it
347     returns a negative (error) response, the outer incarnation must also return the
348     same response.
349    
350     Performance note: It might be tempting to extract commonly used fields from the
351     md structure (e.g. utf8, end_subject) into individual variables to improve
352     performance. Tests using gcc on a SPARC disproved this; in the first case, it
353     made performance worse.
354    
355     Arguments:
356     eptr pointer in subject
357     ecode position in code
358     offset_top current top pointer
359     md pointer to "static" info for the match
360     ims current /i, /m, and /s options
361     eptrb pointer to chain of blocks containing eptr at start of
362     brackets - for testing for empty matches
363     flags can contain
364     match_condassert - this is an assertion condition
365     match_isgroup - this is the start of a bracketed group
366 nigel 87 rdepth the recursion depth
367 nigel 77
368     Returns: MATCH_MATCH if matched ) these values are >= 0
369     MATCH_NOMATCH if failed to match )
370     a negative PCRE_ERROR_xxx value if aborted by an error condition
371 nigel 87 (e.g. stopped by repeated call or recursion limit)
372 nigel 77 */
373    
374     static int
375 nigel 87 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
376 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
377 nigel 87 int flags, int rdepth)
378 nigel 77 {
379     /* These variables do not need to be preserved over recursion in this function,
380     so they can be ordinary variables in all cases. Mark them with "register"
381     because they are used a lot in loops. */
382    
383     register int rrc; /* Returns from recursive calls */
384     register int i; /* Used for loops not involving calls to RMATCH() */
385     register int c; /* Character values not kept over RMATCH() calls */
386     register BOOL utf8; /* Local copy of UTF-8 flag for speed */
387    
388     /* When recursion is not being used, all "local" variables that have to be
389     preserved over calls to RMATCH() are part of a "frame" which is obtained from
390     heap storage. Set up the top-level frame here; others are obtained from the
391     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
392    
393     #ifdef NO_RECURSE
394     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
395     frame->Xprevframe = NULL; /* Marks the top level */
396    
397     /* Copy in the original argument variables */
398    
399     frame->Xeptr = eptr;
400     frame->Xecode = ecode;
401     frame->Xoffset_top = offset_top;
402     frame->Xims = ims;
403     frame->Xeptrb = eptrb;
404     frame->Xflags = flags;
405 nigel 87 frame->Xrdepth = rdepth;
406 nigel 77
407     /* This is where control jumps back to to effect "recursion" */
408    
409     HEAP_RECURSE:
410    
411     /* Macros make the argument variables come from the current frame */
412    
413     #define eptr frame->Xeptr
414     #define ecode frame->Xecode
415     #define offset_top frame->Xoffset_top
416     #define ims frame->Xims
417     #define eptrb frame->Xeptrb
418     #define flags frame->Xflags
419 nigel 87 #define rdepth frame->Xrdepth
420 nigel 77
421     /* Ditto for the local variables */
422    
423     #ifdef SUPPORT_UTF8
424     #define charptr frame->Xcharptr
425     #endif
426     #define callpat frame->Xcallpat
427     #define data frame->Xdata
428     #define next frame->Xnext
429     #define pp frame->Xpp
430     #define prev frame->Xprev
431     #define saved_eptr frame->Xsaved_eptr
432    
433     #define new_recursive frame->Xnew_recursive
434    
435     #define cur_is_word frame->Xcur_is_word
436     #define condition frame->Xcondition
437     #define minimize frame->Xminimize
438     #define prev_is_word frame->Xprev_is_word
439    
440     #define original_ims frame->Xoriginal_ims
441    
442     #ifdef SUPPORT_UCP
443     #define prop_type frame->Xprop_type
444 nigel 87 #define prop_value frame->Xprop_value
445 nigel 77 #define prop_fail_result frame->Xprop_fail_result
446     #define prop_category frame->Xprop_category
447     #define prop_chartype frame->Xprop_chartype
448 nigel 87 #define prop_script frame->Xprop_script
449 nigel 77 #define prop_test_variable frame->Xprop_test_variable
450     #endif
451    
452     #define ctype frame->Xctype
453     #define fc frame->Xfc
454     #define fi frame->Xfi
455     #define length frame->Xlength
456     #define max frame->Xmax
457     #define min frame->Xmin
458     #define number frame->Xnumber
459     #define offset frame->Xoffset
460     #define op frame->Xop
461     #define save_capture_last frame->Xsave_capture_last
462     #define save_offset1 frame->Xsave_offset1
463     #define save_offset2 frame->Xsave_offset2
464     #define save_offset3 frame->Xsave_offset3
465     #define stacksave frame->Xstacksave
466    
467     #define newptrb frame->Xnewptrb
468    
469     /* When recursion is being used, local variables are allocated on the stack and
470     get preserved during recursion in the normal way. In this environment, fi and
471     i, and fc and c, can be the same variables. */
472    
473     #else
474     #define fi i
475     #define fc c
476    
477    
478 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
479     const uschar *charptr; /* in small blocks of the code. My normal */
480     #endif /* style of coding would have declared */
481     const uschar *callpat; /* them within each of those blocks. */
482     const uschar *data; /* However, in order to accommodate the */
483     const uschar *next; /* version of this code that uses an */
484     USPTR pp; /* external "stack" implemented on the */
485     const uschar *prev; /* heap, it is easier to declare them all */
486     USPTR saved_eptr; /* here, so the declarations can be cut */
487     /* out in a block. The only declarations */
488     recursion_info new_recursive; /* within blocks below are for variables */
489     /* that do not have to be preserved over */
490     BOOL cur_is_word; /* a recursive call to RMATCH(). */
491     BOOL condition;
492 nigel 77 BOOL minimize;
493     BOOL prev_is_word;
494    
495     unsigned long int original_ims;
496    
497     #ifdef SUPPORT_UCP
498     int prop_type;
499 nigel 87 int prop_value;
500 nigel 77 int prop_fail_result;
501     int prop_category;
502     int prop_chartype;
503 nigel 87 int prop_script;
504 nigel 77 int *prop_test_variable;
505     #endif
506    
507     int ctype;
508     int length;
509     int max;
510     int min;
511     int number;
512     int offset;
513     int op;
514     int save_capture_last;
515     int save_offset1, save_offset2, save_offset3;
516     int stacksave[REC_STACK_SAVE_MAX];
517    
518     eptrblock newptrb;
519     #endif
520    
521     /* These statements are here to stop the compiler complaining about uninitialized
522     variables. */
523    
524     #ifdef SUPPORT_UCP
525 nigel 87 prop_value = 0;
526 nigel 77 prop_fail_result = 0;
527     prop_test_variable = NULL;
528     #endif
529    
530 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
531     are specified by the macro RMATCH and RRETURN is used to return. When
532     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
533     and a "return", respectively (possibly with some debugging if DEBUG is
534     defined). However, RMATCH isn't like a function call because it's quite a
535     complicated macro. It has to be used in one particular way. This shouldn't,
536     however, impact performance when true recursion is being used. */
537 nigel 77
538 nigel 87 /* First check that we haven't called match() too many times, or that we
539     haven't exceeded the recursive call limit. */
540    
541 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
542 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
543 nigel 77
544     original_ims = ims; /* Save for resetting on ')' */
545     utf8 = md->utf8; /* Local copy of the flag */
546    
547     /* At the start of a bracketed group, add the current subject pointer to the
548     stack of such pointers, to be re-instated at the end of the group when we hit
549     the closing ket. When match() is called in other circumstances, we don't add to
550     this stack. */
551    
552     if ((flags & match_isgroup) != 0)
553     {
554     newptrb.epb_prev = eptrb;
555     newptrb.epb_saved_eptr = eptr;
556     eptrb = &newptrb;
557     }
558    
559     /* Now start processing the operations. */
560    
561     for (;;)
562     {
563     op = *ecode;
564     minimize = FALSE;
565    
566     /* For partial matching, remember if we ever hit the end of the subject after
567     matching at least one subject character. */
568    
569     if (md->partial &&
570     eptr >= md->end_subject &&
571     eptr > md->start_match)
572     md->hitend = TRUE;
573    
574     /* Opening capturing bracket. If there is space in the offset vector, save
575     the current subject position in the working slot at the top of the vector. We
576     mustn't change the current values of the data slot, because they may be set
577     from a previous iteration of this group, and be referred to by a reference
578     inside the group.
579    
580     If the bracket fails to match, we need to restore this value and also the
581     values of the final offsets, in case they were set by a previous iteration of
582     the same bracket.
583    
584     If there isn't enough space in the offset vector, treat this as if it were a
585     non-capturing bracket. Don't worry about setting the flag for the error case
586     here; that is handled in the code for KET. */
587    
588     if (op > OP_BRA)
589     {
590     number = op - OP_BRA;
591    
592     /* For extended extraction brackets (large number), we have to fish out the
593     number from a dummy opcode at the start. */
594    
595     if (number > EXTRACT_BASIC_MAX)
596     number = GET2(ecode, 2+LINK_SIZE);
597     offset = number << 1;
598    
599     #ifdef DEBUG
600     printf("start bracket %d subject=", number);
601     pchars(eptr, 16, TRUE, md);
602     printf("\n");
603     #endif
604    
605     if (offset < md->offset_max)
606     {
607     save_offset1 = md->offset_vector[offset];
608     save_offset2 = md->offset_vector[offset+1];
609     save_offset3 = md->offset_vector[md->offset_end - number];
610     save_capture_last = md->capture_last;
611    
612     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
613     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
614    
615     do
616     {
617     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
618     match_isgroup);
619     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
620     md->capture_last = save_capture_last;
621     ecode += GET(ecode, 1);
622     }
623     while (*ecode == OP_ALT);
624    
625     DPRINTF(("bracket %d failed\n", number));
626    
627     md->offset_vector[offset] = save_offset1;
628     md->offset_vector[offset+1] = save_offset2;
629     md->offset_vector[md->offset_end - number] = save_offset3;
630    
631     RRETURN(MATCH_NOMATCH);
632     }
633    
634     /* Insufficient room for saving captured contents */
635    
636     else op = OP_BRA;
637     }
638    
639     /* Other types of node can be handled by a switch */
640    
641     switch(op)
642     {
643     case OP_BRA: /* Non-capturing bracket: optimized */
644     DPRINTF(("start bracket 0\n"));
645     do
646     {
647     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
648     match_isgroup);
649     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
650     ecode += GET(ecode, 1);
651     }
652     while (*ecode == OP_ALT);
653     DPRINTF(("bracket 0 failed\n"));
654     RRETURN(MATCH_NOMATCH);
655    
656     /* Conditional group: compilation checked that there are no more than
657     two branches. If the condition is false, skipping the first branch takes us
658     past the end if there is only one branch, but that's OK because that is
659     exactly what going to the ket would do. */
660    
661     case OP_COND:
662     if (ecode[LINK_SIZE+1] == OP_CREF) /* Condition extract or recurse test */
663     {
664     offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
665     condition = (offset == CREF_RECURSE * 2)?
666     (md->recursive != NULL) :
667     (offset < offset_top && md->offset_vector[offset] >= 0);
668     RMATCH(rrc, eptr, ecode + (condition?
669     (LINK_SIZE + 4) : (LINK_SIZE + 1 + GET(ecode, 1))),
670     offset_top, md, ims, eptrb, match_isgroup);
671     RRETURN(rrc);
672     }
673    
674     /* The condition is an assertion. Call match() to evaluate it - setting
675     the final argument TRUE causes it to stop at the end of an assertion. */
676    
677     else
678     {
679     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
680     match_condassert | match_isgroup);
681     if (rrc == MATCH_MATCH)
682     {
683     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE+2);
684     while (*ecode == OP_ALT) ecode += GET(ecode, 1);
685     }
686     else if (rrc != MATCH_NOMATCH)
687     {
688     RRETURN(rrc); /* Need braces because of following else */
689     }
690     else ecode += GET(ecode, 1);
691     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb,
692     match_isgroup);
693     RRETURN(rrc);
694     }
695     /* Control never reaches here */
696    
697     /* Skip over conditional reference or large extraction number data if
698     encountered. */
699    
700     case OP_CREF:
701     case OP_BRANUMBER:
702     ecode += 3;
703     break;
704    
705     /* End of the pattern. If we are in a recursion, we should restore the
706     offsets appropriately and continue from after the call. */
707    
708     case OP_END:
709     if (md->recursive != NULL && md->recursive->group_num == 0)
710     {
711     recursion_info *rec = md->recursive;
712 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
713 nigel 77 md->recursive = rec->prevrec;
714     memmove(md->offset_vector, rec->offset_save,
715     rec->saved_max * sizeof(int));
716     md->start_match = rec->save_start;
717     ims = original_ims;
718     ecode = rec->after_call;
719     break;
720     }
721    
722     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
723     string - backtracking will then try other alternatives, if any. */
724    
725     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
726     md->end_match_ptr = eptr; /* Record where we ended */
727     md->end_offset_top = offset_top; /* and how many extracts were taken */
728     RRETURN(MATCH_MATCH);
729    
730     /* Change option settings */
731    
732     case OP_OPT:
733     ims = ecode[1];
734     ecode += 2;
735     DPRINTF(("ims set to %02lx\n", ims));
736     break;
737    
738     /* Assertion brackets. Check the alternative branches in turn - the
739     matching won't pass the KET for an assertion. If any one branch matches,
740     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
741     start of each branch to move the current point backwards, so the code at
742     this level is identical to the lookahead case. */
743    
744     case OP_ASSERT:
745     case OP_ASSERTBACK:
746     do
747     {
748     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
749     match_isgroup);
750     if (rrc == MATCH_MATCH) break;
751     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
752     ecode += GET(ecode, 1);
753     }
754     while (*ecode == OP_ALT);
755     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
756    
757     /* If checking an assertion for a condition, return MATCH_MATCH. */
758    
759     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
760    
761     /* Continue from after the assertion, updating the offsets high water
762     mark, since extracts may have been taken during the assertion. */
763    
764     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
765     ecode += 1 + LINK_SIZE;
766     offset_top = md->end_offset_top;
767     continue;
768    
769     /* Negative assertion: all branches must fail to match */
770    
771     case OP_ASSERT_NOT:
772     case OP_ASSERTBACK_NOT:
773     do
774     {
775     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
776     match_isgroup);
777     if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
778     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
779     ecode += GET(ecode,1);
780     }
781     while (*ecode == OP_ALT);
782    
783     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
784    
785     ecode += 1 + LINK_SIZE;
786     continue;
787    
788     /* Move the subject pointer back. This occurs only at the start of
789     each branch of a lookbehind assertion. If we are too close to the start to
790     move back, this match function fails. When working with UTF-8 we move
791     back a number of characters, not bytes. */
792    
793     case OP_REVERSE:
794     #ifdef SUPPORT_UTF8
795     if (utf8)
796     {
797     c = GET(ecode,1);
798     for (i = 0; i < c; i++)
799     {
800     eptr--;
801     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
802     BACKCHAR(eptr)
803     }
804     }
805     else
806     #endif
807    
808     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
809    
810     {
811     eptr -= GET(ecode,1);
812     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
813     }
814    
815     /* Skip to next op code */
816    
817     ecode += 1 + LINK_SIZE;
818     break;
819    
820     /* The callout item calls an external function, if one is provided, passing
821     details of the match so far. This is mainly for debugging, though the
822     function is able to force a failure. */
823    
824     case OP_CALLOUT:
825     if (pcre_callout != NULL)
826     {
827     pcre_callout_block cb;
828     cb.version = 1; /* Version 1 of the callout block */
829     cb.callout_number = ecode[1];
830     cb.offset_vector = md->offset_vector;
831 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
832 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
833     cb.start_match = md->start_match - md->start_subject;
834     cb.current_position = eptr - md->start_subject;
835     cb.pattern_position = GET(ecode, 2);
836     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
837     cb.capture_top = offset_top/2;
838     cb.capture_last = md->capture_last;
839     cb.callout_data = md->callout_data;
840     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
841     if (rrc < 0) RRETURN(rrc);
842     }
843     ecode += 2 + 2*LINK_SIZE;
844     break;
845    
846     /* Recursion either matches the current regex, or some subexpression. The
847     offset data is the offset to the starting bracket from the start of the
848     whole pattern. (This is so that it works from duplicated subpatterns.)
849    
850     If there are any capturing brackets started but not finished, we have to
851     save their starting points and reinstate them after the recursion. However,
852     we don't know how many such there are (offset_top records the completed
853     total) so we just have to save all the potential data. There may be up to
854     65535 such values, which is too large to put on the stack, but using malloc
855     for small numbers seems expensive. As a compromise, the stack is used when
856     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
857     is used. If the malloc fails, the saved offsets cannot be kept, so the
858     match is abandoned at that point and PCRE_ERROR_NOMEMORY is returned
859     via RRETURN.
860    
861     There are also other values that have to be saved. We use a chained
862     sequence of blocks that actually live on the stack. Thanks to Robin Houston
863     for the original version of this logic. */
864    
865     case OP_RECURSE:
866     {
867     callpat = md->start_code + GET(ecode, 1);
868     new_recursive.group_num = *callpat - OP_BRA;
869    
870     /* For extended extraction brackets (large number), we have to fish out
871     the number from a dummy opcode at the start. */
872    
873     if (new_recursive.group_num > EXTRACT_BASIC_MAX)
874     new_recursive.group_num = GET2(callpat, 2+LINK_SIZE);
875    
876     /* Add to "recursing stack" */
877    
878     new_recursive.prevrec = md->recursive;
879     md->recursive = &new_recursive;
880    
881     /* Find where to continue from afterwards */
882    
883     ecode += 1 + LINK_SIZE;
884     new_recursive.after_call = ecode;
885    
886     /* Now save the offset data. */
887    
888     new_recursive.saved_max = md->offset_end;
889     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
890     new_recursive.offset_save = stacksave;
891     else
892     {
893     new_recursive.offset_save =
894     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
895     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
896     }
897    
898     memcpy(new_recursive.offset_save, md->offset_vector,
899     new_recursive.saved_max * sizeof(int));
900     new_recursive.save_start = md->start_match;
901     md->start_match = eptr;
902    
903     /* OK, now we can do the recursion. For each top-level alternative we
904     restore the offset and recursion data. */
905    
906     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
907     do
908     {
909     RMATCH(rrc, eptr, callpat + 1 + LINK_SIZE, offset_top, md, ims,
910     eptrb, match_isgroup);
911     if (rrc == MATCH_MATCH)
912     {
913 nigel 87 DPRINTF(("Recursion matched\n"));
914 nigel 77 md->recursive = new_recursive.prevrec;
915     if (new_recursive.offset_save != stacksave)
916     (pcre_free)(new_recursive.offset_save);
917     RRETURN(MATCH_MATCH);
918     }
919 nigel 87 else if (rrc != MATCH_NOMATCH)
920     {
921     DPRINTF(("Recursion gave error %d\n", rrc));
922     RRETURN(rrc);
923     }
924 nigel 77
925     md->recursive = &new_recursive;
926     memcpy(md->offset_vector, new_recursive.offset_save,
927     new_recursive.saved_max * sizeof(int));
928     callpat += GET(callpat, 1);
929     }
930     while (*callpat == OP_ALT);
931    
932     DPRINTF(("Recursion didn't match\n"));
933     md->recursive = new_recursive.prevrec;
934     if (new_recursive.offset_save != stacksave)
935     (pcre_free)(new_recursive.offset_save);
936     RRETURN(MATCH_NOMATCH);
937     }
938     /* Control never reaches here */
939    
940     /* "Once" brackets are like assertion brackets except that after a match,
941     the point in the subject string is not moved back. Thus there can never be
942     a move back into the brackets. Friedl calls these "atomic" subpatterns.
943     Check the alternative branches in turn - the matching won't pass the KET
944     for this kind of subpattern. If any one branch matches, we carry on as at
945     the end of a normal bracket, leaving the subject pointer. */
946    
947     case OP_ONCE:
948     {
949     prev = ecode;
950     saved_eptr = eptr;
951    
952     do
953     {
954     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
955     eptrb, match_isgroup);
956     if (rrc == MATCH_MATCH) break;
957     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
958     ecode += GET(ecode,1);
959     }
960     while (*ecode == OP_ALT);
961    
962     /* If hit the end of the group (which could be repeated), fail */
963    
964     if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
965    
966     /* Continue as from after the assertion, updating the offsets high water
967     mark, since extracts may have been taken. */
968    
969     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
970    
971     offset_top = md->end_offset_top;
972     eptr = md->end_match_ptr;
973    
974     /* For a non-repeating ket, just continue at this level. This also
975     happens for a repeating ket if no characters were matched in the group.
976     This is the forcible breaking of infinite loops as implemented in Perl
977     5.005. If there is an options reset, it will get obeyed in the normal
978     course of events. */
979    
980     if (*ecode == OP_KET || eptr == saved_eptr)
981     {
982     ecode += 1+LINK_SIZE;
983     break;
984     }
985    
986     /* The repeating kets try the rest of the pattern or restart from the
987     preceding bracket, in the appropriate order. We need to reset any options
988     that changed within the bracket before re-running it, so check the next
989     opcode. */
990    
991     if (ecode[1+LINK_SIZE] == OP_OPT)
992     {
993     ims = (ims & ~PCRE_IMS) | ecode[4];
994     DPRINTF(("ims set to %02lx at group repeat\n", ims));
995     }
996    
997     if (*ecode == OP_KETRMIN)
998     {
999     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1000     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1001     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1002     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1003     }
1004     else /* OP_KETRMAX */
1005     {
1006     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1007     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1008     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1009     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1010     }
1011     }
1012     RRETURN(MATCH_NOMATCH);
1013    
1014     /* An alternation is the end of a branch; scan along to find the end of the
1015     bracketed group and go to there. */
1016    
1017     case OP_ALT:
1018     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1019     break;
1020    
1021     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1022     that it may occur zero times. It may repeat infinitely, or not at all -
1023     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1024     repeat limits are compiled as a number of copies, with the optional ones
1025     preceded by BRAZERO or BRAMINZERO. */
1026    
1027     case OP_BRAZERO:
1028     {
1029     next = ecode+1;
1030     RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, match_isgroup);
1031     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1032     do next += GET(next,1); while (*next == OP_ALT);
1033     ecode = next + 1+LINK_SIZE;
1034     }
1035     break;
1036    
1037     case OP_BRAMINZERO:
1038     {
1039     next = ecode+1;
1040     do next += GET(next,1); while (*next == OP_ALT);
1041     RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb,
1042     match_isgroup);
1043     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1044     ecode++;
1045     }
1046     break;
1047    
1048     /* End of a group, repeated or non-repeating. If we are at the end of
1049     an assertion "group", stop matching and return MATCH_MATCH, but record the
1050     current high water mark for use by positive assertions. Do this also
1051     for the "once" (atomic, i.e. not backing up) groups. */
1052    
1053     case OP_KET:
1054     case OP_KETRMIN:
1055     case OP_KETRMAX:
1056     {
1057     prev = ecode - GET(ecode, 1);
1058     saved_eptr = eptrb->epb_saved_eptr;
1059    
1060     /* Back up the stack of bracket start pointers. */
1061    
1062     eptrb = eptrb->epb_prev;
1063    
1064     if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1065     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1066     *prev == OP_ONCE)
1067     {
1068     md->end_match_ptr = eptr; /* For ONCE */
1069     md->end_offset_top = offset_top;
1070     RRETURN(MATCH_MATCH);
1071     }
1072    
1073     /* In all other cases except a conditional group we have to check the
1074     group number back at the start and if necessary complete handling an
1075     extraction by setting the offsets and bumping the high water mark. */
1076    
1077     if (*prev != OP_COND)
1078     {
1079     number = *prev - OP_BRA;
1080    
1081     /* For extended extraction brackets (large number), we have to fish out
1082     the number from a dummy opcode at the start. */
1083    
1084     if (number > EXTRACT_BASIC_MAX) number = GET2(prev, 2+LINK_SIZE);
1085     offset = number << 1;
1086    
1087     #ifdef DEBUG
1088     printf("end bracket %d", number);
1089     printf("\n");
1090     #endif
1091    
1092     /* Test for a numbered group. This includes groups called as a result
1093     of recursion. Note that whole-pattern recursion is coded as a recurse
1094     into group 0, so it won't be picked up here. Instead, we catch it when
1095     the OP_END is reached. */
1096    
1097     if (number > 0)
1098     {
1099     md->capture_last = number;
1100     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1101     {
1102     md->offset_vector[offset] =
1103     md->offset_vector[md->offset_end - number];
1104     md->offset_vector[offset+1] = eptr - md->start_subject;
1105     if (offset_top <= offset) offset_top = offset + 2;
1106     }
1107    
1108     /* Handle a recursively called group. Restore the offsets
1109     appropriately and continue from after the call. */
1110    
1111     if (md->recursive != NULL && md->recursive->group_num == number)
1112     {
1113     recursion_info *rec = md->recursive;
1114     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1115     md->recursive = rec->prevrec;
1116     md->start_match = rec->save_start;
1117     memcpy(md->offset_vector, rec->offset_save,
1118     rec->saved_max * sizeof(int));
1119     ecode = rec->after_call;
1120     ims = original_ims;
1121     break;
1122     }
1123     }
1124     }
1125    
1126     /* Reset the value of the ims flags, in case they got changed during
1127     the group. */
1128    
1129     ims = original_ims;
1130     DPRINTF(("ims reset to %02lx\n", ims));
1131    
1132     /* For a non-repeating ket, just continue at this level. This also
1133     happens for a repeating ket if no characters were matched in the group.
1134     This is the forcible breaking of infinite loops as implemented in Perl
1135     5.005. If there is an options reset, it will get obeyed in the normal
1136     course of events. */
1137    
1138     if (*ecode == OP_KET || eptr == saved_eptr)
1139     {
1140     ecode += 1 + LINK_SIZE;
1141     break;
1142     }
1143    
1144     /* The repeating kets try the rest of the pattern or restart from the
1145     preceding bracket, in the appropriate order. */
1146    
1147     if (*ecode == OP_KETRMIN)
1148     {
1149     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1150     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1151     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1152     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1153     }
1154     else /* OP_KETRMAX */
1155     {
1156     RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_isgroup);
1157     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1158     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1159     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1160     }
1161     }
1162    
1163     RRETURN(MATCH_NOMATCH);
1164    
1165     /* Start of subject unless notbol, or after internal newline if multiline */
1166    
1167     case OP_CIRC:
1168     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1169     if ((ims & PCRE_MULTILINE) != 0)
1170     {
1171     if (eptr != md->start_subject && eptr[-1] != NEWLINE)
1172     RRETURN(MATCH_NOMATCH);
1173     ecode++;
1174     break;
1175     }
1176     /* ... else fall through */
1177    
1178     /* Start of subject assertion */
1179    
1180     case OP_SOD:
1181     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1182     ecode++;
1183     break;
1184    
1185     /* Start of match assertion */
1186    
1187     case OP_SOM:
1188     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1189     ecode++;
1190     break;
1191    
1192     /* Assert before internal newline if multiline, or before a terminating
1193     newline unless endonly is set, else end of subject unless noteol is set. */
1194    
1195     case OP_DOLL:
1196     if ((ims & PCRE_MULTILINE) != 0)
1197     {
1198     if (eptr < md->end_subject)
1199     { if (*eptr != NEWLINE) RRETURN(MATCH_NOMATCH); }
1200     else
1201     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1202     ecode++;
1203     break;
1204     }
1205     else
1206     {
1207     if (md->noteol) RRETURN(MATCH_NOMATCH);
1208     if (!md->endonly)
1209     {
1210     if (eptr < md->end_subject - 1 ||
1211     (eptr == md->end_subject - 1 && *eptr != NEWLINE))
1212     RRETURN(MATCH_NOMATCH);
1213     ecode++;
1214     break;
1215     }
1216     }
1217     /* ... else fall through */
1218    
1219     /* End of subject assertion (\z) */
1220    
1221     case OP_EOD:
1222     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1223     ecode++;
1224     break;
1225    
1226     /* End of subject or ending \n assertion (\Z) */
1227    
1228     case OP_EODN:
1229     if (eptr < md->end_subject - 1 ||
1230     (eptr == md->end_subject - 1 && *eptr != NEWLINE)) RRETURN(MATCH_NOMATCH);
1231     ecode++;
1232     break;
1233    
1234     /* Word boundary assertions */
1235    
1236     case OP_NOT_WORD_BOUNDARY:
1237     case OP_WORD_BOUNDARY:
1238     {
1239    
1240     /* Find out if the previous and current characters are "word" characters.
1241     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1242     be "non-word" characters. */
1243    
1244     #ifdef SUPPORT_UTF8
1245     if (utf8)
1246     {
1247     if (eptr == md->start_subject) prev_is_word = FALSE; else
1248     {
1249     const uschar *lastptr = eptr - 1;
1250     while((*lastptr & 0xc0) == 0x80) lastptr--;
1251     GETCHAR(c, lastptr);
1252     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1253     }
1254     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1255     {
1256     GETCHAR(c, eptr);
1257     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1258     }
1259     }
1260     else
1261     #endif
1262    
1263     /* More streamlined when not in UTF-8 mode */
1264    
1265     {
1266     prev_is_word = (eptr != md->start_subject) &&
1267     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1268     cur_is_word = (eptr < md->end_subject) &&
1269     ((md->ctypes[*eptr] & ctype_word) != 0);
1270     }
1271    
1272     /* Now see if the situation is what we want */
1273    
1274     if ((*ecode++ == OP_WORD_BOUNDARY)?
1275     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1276     RRETURN(MATCH_NOMATCH);
1277     }
1278     break;
1279    
1280     /* Match a single character type; inline for speed */
1281    
1282     case OP_ANY:
1283     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
1284     RRETURN(MATCH_NOMATCH);
1285     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1286     #ifdef SUPPORT_UTF8
1287     if (utf8)
1288     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1289     #endif
1290     ecode++;
1291     break;
1292    
1293     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1294     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1295    
1296     case OP_ANYBYTE:
1297     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1298     ecode++;
1299     break;
1300    
1301     case OP_NOT_DIGIT:
1302     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1303     GETCHARINCTEST(c, eptr);
1304     if (
1305     #ifdef SUPPORT_UTF8
1306     c < 256 &&
1307     #endif
1308     (md->ctypes[c] & ctype_digit) != 0
1309     )
1310     RRETURN(MATCH_NOMATCH);
1311     ecode++;
1312     break;
1313    
1314     case OP_DIGIT:
1315     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1316     GETCHARINCTEST(c, eptr);
1317     if (
1318     #ifdef SUPPORT_UTF8
1319     c >= 256 ||
1320     #endif
1321     (md->ctypes[c] & ctype_digit) == 0
1322     )
1323     RRETURN(MATCH_NOMATCH);
1324     ecode++;
1325     break;
1326    
1327     case OP_NOT_WHITESPACE:
1328     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1329     GETCHARINCTEST(c, eptr);
1330     if (
1331     #ifdef SUPPORT_UTF8
1332     c < 256 &&
1333     #endif
1334     (md->ctypes[c] & ctype_space) != 0
1335     )
1336     RRETURN(MATCH_NOMATCH);
1337     ecode++;
1338     break;
1339    
1340     case OP_WHITESPACE:
1341     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1342     GETCHARINCTEST(c, eptr);
1343     if (
1344     #ifdef SUPPORT_UTF8
1345     c >= 256 ||
1346     #endif
1347     (md->ctypes[c] & ctype_space) == 0
1348     )
1349     RRETURN(MATCH_NOMATCH);
1350     ecode++;
1351     break;
1352    
1353     case OP_NOT_WORDCHAR:
1354     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1355     GETCHARINCTEST(c, eptr);
1356     if (
1357     #ifdef SUPPORT_UTF8
1358     c < 256 &&
1359     #endif
1360     (md->ctypes[c] & ctype_word) != 0
1361     )
1362     RRETURN(MATCH_NOMATCH);
1363     ecode++;
1364     break;
1365    
1366     case OP_WORDCHAR:
1367     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1368     GETCHARINCTEST(c, eptr);
1369     if (
1370     #ifdef SUPPORT_UTF8
1371     c >= 256 ||
1372     #endif
1373     (md->ctypes[c] & ctype_word) == 0
1374     )
1375     RRETURN(MATCH_NOMATCH);
1376     ecode++;
1377     break;
1378    
1379     #ifdef SUPPORT_UCP
1380     /* Check the next character by Unicode property. We will get here only
1381     if the support is in the binary; otherwise a compile-time error occurs. */
1382    
1383     case OP_PROP:
1384     case OP_NOTPROP:
1385     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1386     GETCHARINCTEST(c, eptr);
1387     {
1388 nigel 87 int chartype, script;
1389     int category = _pcre_ucp_findprop(c, &chartype, &script);
1390 nigel 77
1391 nigel 87 switch(ecode[1])
1392     {
1393     case PT_ANY:
1394     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1395     break;
1396 nigel 77
1397 nigel 87 case PT_LAMP:
1398     if ((chartype == ucp_Lu ||
1399     chartype == ucp_Ll ||
1400     chartype == ucp_Lt) == (op == OP_NOTPROP))
1401 nigel 77 RRETURN(MATCH_NOMATCH);
1402 nigel 87 break;
1403    
1404     case PT_GC:
1405     if ((ecode[2] != category) == (op == OP_PROP))
1406 nigel 77 RRETURN(MATCH_NOMATCH);
1407 nigel 87 break;
1408    
1409     case PT_PC:
1410     if ((ecode[2] != chartype) == (op == OP_PROP))
1411     RRETURN(MATCH_NOMATCH);
1412     break;
1413    
1414     case PT_SC:
1415     if ((ecode[2] != script) == (op == OP_PROP))
1416     RRETURN(MATCH_NOMATCH);
1417     break;
1418    
1419     default:
1420     RRETURN(PCRE_ERROR_INTERNAL);
1421     break;
1422 nigel 77 }
1423 nigel 87
1424     ecode += 3;
1425 nigel 77 }
1426     break;
1427    
1428     /* Match an extended Unicode sequence. We will get here only if the support
1429     is in the binary; otherwise a compile-time error occurs. */
1430    
1431     case OP_EXTUNI:
1432     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1433     GETCHARINCTEST(c, eptr);
1434     {
1435 nigel 87 int chartype, script;
1436     int category = _pcre_ucp_findprop(c, &chartype, &script);
1437 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1438     while (eptr < md->end_subject)
1439     {
1440     int len = 1;
1441     if (!utf8) c = *eptr; else
1442     {
1443     GETCHARLEN(c, eptr, len);
1444     }
1445 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1446 nigel 77 if (category != ucp_M) break;
1447     eptr += len;
1448     }
1449     }
1450     ecode++;
1451     break;
1452     #endif
1453    
1454    
1455     /* Match a back reference, possibly repeatedly. Look past the end of the
1456     item to see if there is repeat information following. The code is similar
1457     to that for character classes, but repeated for efficiency. Then obey
1458     similar code to character type repeats - written out again for speed.
1459     However, if the referenced string is the empty string, always treat
1460     it as matched, any number of times (otherwise there could be infinite
1461     loops). */
1462    
1463     case OP_REF:
1464     {
1465     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1466     ecode += 3; /* Advance past item */
1467    
1468     /* If the reference is unset, set the length to be longer than the amount
1469     of subject left; this ensures that every attempt at a match fails. We
1470     can't just fail here, because of the possibility of quantifiers with zero
1471     minima. */
1472    
1473     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1474     md->end_subject - eptr + 1 :
1475     md->offset_vector[offset+1] - md->offset_vector[offset];
1476    
1477     /* Set up for repetition, or handle the non-repeated case */
1478    
1479     switch (*ecode)
1480     {
1481     case OP_CRSTAR:
1482     case OP_CRMINSTAR:
1483     case OP_CRPLUS:
1484     case OP_CRMINPLUS:
1485     case OP_CRQUERY:
1486     case OP_CRMINQUERY:
1487     c = *ecode++ - OP_CRSTAR;
1488     minimize = (c & 1) != 0;
1489     min = rep_min[c]; /* Pick up values from tables; */
1490     max = rep_max[c]; /* zero for max => infinity */
1491     if (max == 0) max = INT_MAX;
1492     break;
1493    
1494     case OP_CRRANGE:
1495     case OP_CRMINRANGE:
1496     minimize = (*ecode == OP_CRMINRANGE);
1497     min = GET2(ecode, 1);
1498     max = GET2(ecode, 3);
1499     if (max == 0) max = INT_MAX;
1500     ecode += 5;
1501     break;
1502    
1503     default: /* No repeat follows */
1504     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1505     eptr += length;
1506     continue; /* With the main loop */
1507     }
1508    
1509     /* If the length of the reference is zero, just continue with the
1510     main loop. */
1511    
1512     if (length == 0) continue;
1513    
1514     /* First, ensure the minimum number of matches are present. We get back
1515     the length of the reference string explicitly rather than passing the
1516     address of eptr, so that eptr can be a register variable. */
1517    
1518     for (i = 1; i <= min; i++)
1519     {
1520     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1521     eptr += length;
1522     }
1523    
1524     /* If min = max, continue at the same level without recursion.
1525     They are not both allowed to be zero. */
1526    
1527     if (min == max) continue;
1528    
1529     /* If minimizing, keep trying and advancing the pointer */
1530    
1531     if (minimize)
1532     {
1533     for (fi = min;; fi++)
1534     {
1535     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1536     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1537     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1538     RRETURN(MATCH_NOMATCH);
1539     eptr += length;
1540     }
1541     /* Control never gets here */
1542     }
1543    
1544     /* If maximizing, find the longest string and work backwards */
1545    
1546     else
1547     {
1548     pp = eptr;
1549     for (i = min; i < max; i++)
1550     {
1551     if (!match_ref(offset, eptr, length, md, ims)) break;
1552     eptr += length;
1553     }
1554     while (eptr >= pp)
1555     {
1556     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1557     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1558     eptr -= length;
1559     }
1560     RRETURN(MATCH_NOMATCH);
1561     }
1562     }
1563     /* Control never gets here */
1564    
1565    
1566    
1567     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1568     used when all the characters in the class have values in the range 0-255,
1569     and either the matching is caseful, or the characters are in the range
1570     0-127 when UTF-8 processing is enabled. The only difference between
1571     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1572     encountered.
1573    
1574     First, look past the end of the item to see if there is repeat information
1575     following. Then obey similar code to character type repeats - written out
1576     again for speed. */
1577    
1578     case OP_NCLASS:
1579     case OP_CLASS:
1580     {
1581     data = ecode + 1; /* Save for matching */
1582     ecode += 33; /* Advance past the item */
1583    
1584     switch (*ecode)
1585     {
1586     case OP_CRSTAR:
1587     case OP_CRMINSTAR:
1588     case OP_CRPLUS:
1589     case OP_CRMINPLUS:
1590     case OP_CRQUERY:
1591     case OP_CRMINQUERY:
1592     c = *ecode++ - OP_CRSTAR;
1593     minimize = (c & 1) != 0;
1594     min = rep_min[c]; /* Pick up values from tables; */
1595     max = rep_max[c]; /* zero for max => infinity */
1596     if (max == 0) max = INT_MAX;
1597     break;
1598    
1599     case OP_CRRANGE:
1600     case OP_CRMINRANGE:
1601     minimize = (*ecode == OP_CRMINRANGE);
1602     min = GET2(ecode, 1);
1603     max = GET2(ecode, 3);
1604     if (max == 0) max = INT_MAX;
1605     ecode += 5;
1606     break;
1607    
1608     default: /* No repeat follows */
1609     min = max = 1;
1610     break;
1611     }
1612    
1613     /* First, ensure the minimum number of matches are present. */
1614    
1615     #ifdef SUPPORT_UTF8
1616     /* UTF-8 mode */
1617     if (utf8)
1618     {
1619     for (i = 1; i <= min; i++)
1620     {
1621     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1622     GETCHARINC(c, eptr);
1623     if (c > 255)
1624     {
1625     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1626     }
1627     else
1628     {
1629     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1630     }
1631     }
1632     }
1633     else
1634     #endif
1635     /* Not UTF-8 mode */
1636     {
1637     for (i = 1; i <= min; i++)
1638     {
1639     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1640     c = *eptr++;
1641     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1642     }
1643     }
1644    
1645     /* If max == min we can continue with the main loop without the
1646     need to recurse. */
1647    
1648     if (min == max) continue;
1649    
1650     /* If minimizing, keep testing the rest of the expression and advancing
1651     the pointer while it matches the class. */
1652    
1653     if (minimize)
1654     {
1655     #ifdef SUPPORT_UTF8
1656     /* UTF-8 mode */
1657     if (utf8)
1658     {
1659     for (fi = min;; fi++)
1660     {
1661     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1662     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1663     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1664     GETCHARINC(c, eptr);
1665     if (c > 255)
1666     {
1667     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1668     }
1669     else
1670     {
1671     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1672     }
1673     }
1674     }
1675     else
1676     #endif
1677     /* Not UTF-8 mode */
1678     {
1679     for (fi = min;; fi++)
1680     {
1681     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1682     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1683     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1684     c = *eptr++;
1685     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1686     }
1687     }
1688     /* Control never gets here */
1689     }
1690    
1691     /* If maximizing, find the longest possible run, then work backwards. */
1692    
1693     else
1694     {
1695     pp = eptr;
1696    
1697     #ifdef SUPPORT_UTF8
1698     /* UTF-8 mode */
1699     if (utf8)
1700     {
1701     for (i = min; i < max; i++)
1702     {
1703     int len = 1;
1704     if (eptr >= md->end_subject) break;
1705     GETCHARLEN(c, eptr, len);
1706     if (c > 255)
1707     {
1708     if (op == OP_CLASS) break;
1709     }
1710     else
1711     {
1712     if ((data[c/8] & (1 << (c&7))) == 0) break;
1713     }
1714     eptr += len;
1715     }
1716     for (;;)
1717     {
1718     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1719     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1720     if (eptr-- == pp) break; /* Stop if tried at original pos */
1721     BACKCHAR(eptr);
1722     }
1723     }
1724     else
1725     #endif
1726     /* Not UTF-8 mode */
1727     {
1728     for (i = min; i < max; i++)
1729     {
1730     if (eptr >= md->end_subject) break;
1731     c = *eptr;
1732     if ((data[c/8] & (1 << (c&7))) == 0) break;
1733     eptr++;
1734     }
1735     while (eptr >= pp)
1736     {
1737     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1738 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1739 nigel 77 eptr--;
1740     }
1741     }
1742    
1743     RRETURN(MATCH_NOMATCH);
1744     }
1745     }
1746     /* Control never gets here */
1747    
1748    
1749     /* Match an extended character class. This opcode is encountered only
1750     in UTF-8 mode, because that's the only time it is compiled. */
1751    
1752     #ifdef SUPPORT_UTF8
1753     case OP_XCLASS:
1754     {
1755     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1756     ecode += GET(ecode, 1); /* Advance past the item */
1757    
1758     switch (*ecode)
1759     {
1760     case OP_CRSTAR:
1761     case OP_CRMINSTAR:
1762     case OP_CRPLUS:
1763     case OP_CRMINPLUS:
1764     case OP_CRQUERY:
1765     case OP_CRMINQUERY:
1766     c = *ecode++ - OP_CRSTAR;
1767     minimize = (c & 1) != 0;
1768     min = rep_min[c]; /* Pick up values from tables; */
1769     max = rep_max[c]; /* zero for max => infinity */
1770     if (max == 0) max = INT_MAX;
1771     break;
1772    
1773     case OP_CRRANGE:
1774     case OP_CRMINRANGE:
1775     minimize = (*ecode == OP_CRMINRANGE);
1776     min = GET2(ecode, 1);
1777     max = GET2(ecode, 3);
1778     if (max == 0) max = INT_MAX;
1779     ecode += 5;
1780     break;
1781    
1782     default: /* No repeat follows */
1783     min = max = 1;
1784     break;
1785     }
1786    
1787     /* First, ensure the minimum number of matches are present. */
1788    
1789     for (i = 1; i <= min; i++)
1790     {
1791     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1792     GETCHARINC(c, eptr);
1793     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1794     }
1795    
1796     /* If max == min we can continue with the main loop without the
1797     need to recurse. */
1798    
1799     if (min == max) continue;
1800    
1801     /* If minimizing, keep testing the rest of the expression and advancing
1802     the pointer while it matches the class. */
1803    
1804     if (minimize)
1805     {
1806     for (fi = min;; fi++)
1807     {
1808     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1809     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1810     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1811     GETCHARINC(c, eptr);
1812     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1813     }
1814     /* Control never gets here */
1815     }
1816    
1817     /* If maximizing, find the longest possible run, then work backwards. */
1818    
1819     else
1820     {
1821     pp = eptr;
1822     for (i = min; i < max; i++)
1823     {
1824     int len = 1;
1825     if (eptr >= md->end_subject) break;
1826     GETCHARLEN(c, eptr, len);
1827     if (!_pcre_xclass(c, data)) break;
1828     eptr += len;
1829     }
1830     for(;;)
1831     {
1832     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1833     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1834     if (eptr-- == pp) break; /* Stop if tried at original pos */
1835     BACKCHAR(eptr)
1836     }
1837     RRETURN(MATCH_NOMATCH);
1838     }
1839    
1840     /* Control never gets here */
1841     }
1842     #endif /* End of XCLASS */
1843    
1844     /* Match a single character, casefully */
1845    
1846     case OP_CHAR:
1847     #ifdef SUPPORT_UTF8
1848     if (utf8)
1849     {
1850     length = 1;
1851     ecode++;
1852     GETCHARLEN(fc, ecode, length);
1853     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1854     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1855     }
1856     else
1857     #endif
1858    
1859     /* Non-UTF-8 mode */
1860     {
1861     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1862     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1863     ecode += 2;
1864     }
1865     break;
1866    
1867     /* Match a single character, caselessly */
1868    
1869     case OP_CHARNC:
1870     #ifdef SUPPORT_UTF8
1871     if (utf8)
1872     {
1873     length = 1;
1874     ecode++;
1875     GETCHARLEN(fc, ecode, length);
1876    
1877     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1878    
1879     /* If the pattern character's value is < 128, we have only one byte, and
1880     can use the fast lookup table. */
1881    
1882     if (fc < 128)
1883     {
1884     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1885     }
1886    
1887     /* Otherwise we must pick up the subject character */
1888    
1889     else
1890     {
1891     int dc;
1892     GETCHARINC(dc, eptr);
1893     ecode += length;
1894    
1895     /* If we have Unicode property support, we can use it to test the other
1896 nigel 87 case of the character, if there is one. */
1897 nigel 77
1898     if (fc != dc)
1899     {
1900     #ifdef SUPPORT_UCP
1901 nigel 87 if (dc != _pcre_ucp_othercase(fc))
1902 nigel 77 #endif
1903     RRETURN(MATCH_NOMATCH);
1904     }
1905     }
1906     }
1907     else
1908     #endif /* SUPPORT_UTF8 */
1909    
1910     /* Non-UTF-8 mode */
1911     {
1912     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1913     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1914     ecode += 2;
1915     }
1916     break;
1917    
1918     /* Match a single character repeatedly; different opcodes share code. */
1919    
1920     case OP_EXACT:
1921     min = max = GET2(ecode, 1);
1922     ecode += 3;
1923     goto REPEATCHAR;
1924    
1925     case OP_UPTO:
1926     case OP_MINUPTO:
1927     min = 0;
1928     max = GET2(ecode, 1);
1929     minimize = *ecode == OP_MINUPTO;
1930     ecode += 3;
1931     goto REPEATCHAR;
1932    
1933     case OP_STAR:
1934     case OP_MINSTAR:
1935     case OP_PLUS:
1936     case OP_MINPLUS:
1937     case OP_QUERY:
1938     case OP_MINQUERY:
1939     c = *ecode++ - OP_STAR;
1940     minimize = (c & 1) != 0;
1941     min = rep_min[c]; /* Pick up values from tables; */
1942     max = rep_max[c]; /* zero for max => infinity */
1943     if (max == 0) max = INT_MAX;
1944    
1945     /* Common code for all repeated single-character matches. We can give
1946     up quickly if there are fewer than the minimum number of characters left in
1947     the subject. */
1948    
1949     REPEATCHAR:
1950     #ifdef SUPPORT_UTF8
1951     if (utf8)
1952     {
1953     length = 1;
1954     charptr = ecode;
1955     GETCHARLEN(fc, ecode, length);
1956     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1957     ecode += length;
1958    
1959     /* Handle multibyte character matching specially here. There is
1960     support for caseless matching if UCP support is present. */
1961    
1962     if (length > 1)
1963     {
1964     int oclength = 0;
1965     uschar occhars[8];
1966    
1967     #ifdef SUPPORT_UCP
1968     int othercase;
1969     if ((ims & PCRE_CASELESS) != 0 &&
1970 nigel 87 (othercase = _pcre_ucp_othercase(fc)) >= 0 &&
1971     othercase >= 0)
1972 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
1973     #endif /* SUPPORT_UCP */
1974    
1975     for (i = 1; i <= min; i++)
1976     {
1977     if (memcmp(eptr, charptr, length) == 0) eptr += length;
1978     /* Need braces because of following else */
1979     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1980     else
1981     {
1982     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
1983     eptr += oclength;
1984     }
1985     }
1986    
1987     if (min == max) continue;
1988    
1989     if (minimize)
1990     {
1991     for (fi = min;; fi++)
1992     {
1993     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1994     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1995     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1996     if (memcmp(eptr, charptr, length) == 0) eptr += length;
1997     /* Need braces because of following else */
1998     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
1999     else
2000     {
2001     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2002     eptr += oclength;
2003     }
2004     }
2005     /* Control never gets here */
2006     }
2007     else
2008     {
2009     pp = eptr;
2010     for (i = min; i < max; i++)
2011     {
2012     if (eptr > md->end_subject - length) break;
2013     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2014     else if (oclength == 0) break;
2015     else
2016     {
2017     if (memcmp(eptr, occhars, oclength) != 0) break;
2018     eptr += oclength;
2019     }
2020     }
2021     while (eptr >= pp)
2022     {
2023     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2024     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2025     eptr -= length;
2026     }
2027     RRETURN(MATCH_NOMATCH);
2028     }
2029     /* Control never gets here */
2030     }
2031    
2032     /* If the length of a UTF-8 character is 1, we fall through here, and
2033     obey the code as for non-UTF-8 characters below, though in this case the
2034     value of fc will always be < 128. */
2035     }
2036     else
2037     #endif /* SUPPORT_UTF8 */
2038    
2039     /* When not in UTF-8 mode, load a single-byte character. */
2040     {
2041     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2042     fc = *ecode++;
2043     }
2044    
2045     /* The value of fc at this point is always less than 256, though we may or
2046     may not be in UTF-8 mode. The code is duplicated for the caseless and
2047     caseful cases, for speed, since matching characters is likely to be quite
2048     common. First, ensure the minimum number of matches are present. If min =
2049     max, continue at the same level without recursing. Otherwise, if
2050     minimizing, keep trying the rest of the expression and advancing one
2051     matching character if failing, up to the maximum. Alternatively, if
2052     maximizing, find the maximum number of characters and work backwards. */
2053    
2054     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2055     max, eptr));
2056    
2057     if ((ims & PCRE_CASELESS) != 0)
2058     {
2059     fc = md->lcc[fc];
2060     for (i = 1; i <= min; i++)
2061     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2062     if (min == max) continue;
2063     if (minimize)
2064     {
2065     for (fi = min;; fi++)
2066     {
2067     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2068     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2069     if (fi >= max || eptr >= md->end_subject ||
2070     fc != md->lcc[*eptr++])
2071     RRETURN(MATCH_NOMATCH);
2072     }
2073     /* Control never gets here */
2074     }
2075     else
2076     {
2077     pp = eptr;
2078     for (i = min; i < max; i++)
2079     {
2080     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2081     eptr++;
2082     }
2083     while (eptr >= pp)
2084     {
2085     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2086     eptr--;
2087     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2088     }
2089     RRETURN(MATCH_NOMATCH);
2090     }
2091     /* Control never gets here */
2092     }
2093    
2094     /* Caseful comparisons (includes all multi-byte characters) */
2095    
2096     else
2097     {
2098     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2099     if (min == max) continue;
2100     if (minimize)
2101     {
2102     for (fi = min;; fi++)
2103     {
2104     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2105     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2106     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2107     RRETURN(MATCH_NOMATCH);
2108     }
2109     /* Control never gets here */
2110     }
2111     else
2112     {
2113     pp = eptr;
2114     for (i = min; i < max; i++)
2115     {
2116     if (eptr >= md->end_subject || fc != *eptr) break;
2117     eptr++;
2118     }
2119     while (eptr >= pp)
2120     {
2121     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2122     eptr--;
2123     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2124     }
2125     RRETURN(MATCH_NOMATCH);
2126     }
2127     }
2128     /* Control never gets here */
2129    
2130     /* Match a negated single one-byte character. The character we are
2131     checking can be multibyte. */
2132    
2133     case OP_NOT:
2134     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2135     ecode++;
2136     GETCHARINCTEST(c, eptr);
2137     if ((ims & PCRE_CASELESS) != 0)
2138     {
2139     #ifdef SUPPORT_UTF8
2140     if (c < 256)
2141     #endif
2142     c = md->lcc[c];
2143     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2144     }
2145     else
2146     {
2147     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2148     }
2149     break;
2150    
2151     /* Match a negated single one-byte character repeatedly. This is almost a
2152     repeat of the code for a repeated single character, but I haven't found a
2153     nice way of commoning these up that doesn't require a test of the
2154     positive/negative option for each character match. Maybe that wouldn't add
2155     very much to the time taken, but character matching *is* what this is all
2156     about... */
2157    
2158     case OP_NOTEXACT:
2159     min = max = GET2(ecode, 1);
2160     ecode += 3;
2161     goto REPEATNOTCHAR;
2162    
2163     case OP_NOTUPTO:
2164     case OP_NOTMINUPTO:
2165     min = 0;
2166     max = GET2(ecode, 1);
2167     minimize = *ecode == OP_NOTMINUPTO;
2168     ecode += 3;
2169     goto REPEATNOTCHAR;
2170    
2171     case OP_NOTSTAR:
2172     case OP_NOTMINSTAR:
2173     case OP_NOTPLUS:
2174     case OP_NOTMINPLUS:
2175     case OP_NOTQUERY:
2176     case OP_NOTMINQUERY:
2177     c = *ecode++ - OP_NOTSTAR;
2178     minimize = (c & 1) != 0;
2179     min = rep_min[c]; /* Pick up values from tables; */
2180     max = rep_max[c]; /* zero for max => infinity */
2181     if (max == 0) max = INT_MAX;
2182    
2183     /* Common code for all repeated single-byte matches. We can give up quickly
2184     if there are fewer than the minimum number of bytes left in the
2185     subject. */
2186    
2187     REPEATNOTCHAR:
2188     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2189     fc = *ecode++;
2190    
2191     /* The code is duplicated for the caseless and caseful cases, for speed,
2192     since matching characters is likely to be quite common. First, ensure the
2193     minimum number of matches are present. If min = max, continue at the same
2194     level without recursing. Otherwise, if minimizing, keep trying the rest of
2195     the expression and advancing one matching character if failing, up to the
2196     maximum. Alternatively, if maximizing, find the maximum number of
2197     characters and work backwards. */
2198    
2199     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2200     max, eptr));
2201    
2202     if ((ims & PCRE_CASELESS) != 0)
2203     {
2204     fc = md->lcc[fc];
2205    
2206     #ifdef SUPPORT_UTF8
2207     /* UTF-8 mode */
2208     if (utf8)
2209     {
2210     register int d;
2211     for (i = 1; i <= min; i++)
2212     {
2213     GETCHARINC(d, eptr);
2214     if (d < 256) d = md->lcc[d];
2215     if (fc == d) RRETURN(MATCH_NOMATCH);
2216     }
2217     }
2218     else
2219     #endif
2220    
2221     /* Not UTF-8 mode */
2222     {
2223     for (i = 1; i <= min; i++)
2224     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2225     }
2226    
2227     if (min == max) continue;
2228    
2229     if (minimize)
2230     {
2231     #ifdef SUPPORT_UTF8
2232     /* UTF-8 mode */
2233     if (utf8)
2234     {
2235     register int d;
2236     for (fi = min;; fi++)
2237     {
2238     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2239     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2240     GETCHARINC(d, eptr);
2241     if (d < 256) d = md->lcc[d];
2242     if (fi >= max || eptr >= md->end_subject || fc == d)
2243     RRETURN(MATCH_NOMATCH);
2244     }
2245     }
2246     else
2247     #endif
2248     /* Not UTF-8 mode */
2249     {
2250     for (fi = min;; fi++)
2251     {
2252     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2253     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2254     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2255     RRETURN(MATCH_NOMATCH);
2256     }
2257     }
2258     /* Control never gets here */
2259     }
2260    
2261     /* Maximize case */
2262    
2263     else
2264     {
2265     pp = eptr;
2266    
2267     #ifdef SUPPORT_UTF8
2268     /* UTF-8 mode */
2269     if (utf8)
2270     {
2271     register int d;
2272     for (i = min; i < max; i++)
2273     {
2274     int len = 1;
2275     if (eptr >= md->end_subject) break;
2276     GETCHARLEN(d, eptr, len);
2277     if (d < 256) d = md->lcc[d];
2278     if (fc == d) break;
2279     eptr += len;
2280     }
2281     for(;;)
2282     {
2283     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2284     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2285     if (eptr-- == pp) break; /* Stop if tried at original pos */
2286     BACKCHAR(eptr);
2287     }
2288     }
2289     else
2290     #endif
2291     /* Not UTF-8 mode */
2292     {
2293     for (i = min; i < max; i++)
2294     {
2295     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2296     eptr++;
2297     }
2298     while (eptr >= pp)
2299     {
2300     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2301     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2302     eptr--;
2303     }
2304     }
2305    
2306     RRETURN(MATCH_NOMATCH);
2307     }
2308     /* Control never gets here */
2309     }
2310    
2311     /* Caseful comparisons */
2312    
2313     else
2314     {
2315     #ifdef SUPPORT_UTF8
2316     /* UTF-8 mode */
2317     if (utf8)
2318     {
2319     register int d;
2320     for (i = 1; i <= min; i++)
2321     {
2322     GETCHARINC(d, eptr);
2323     if (fc == d) RRETURN(MATCH_NOMATCH);
2324     }
2325     }
2326     else
2327     #endif
2328     /* Not UTF-8 mode */
2329     {
2330     for (i = 1; i <= min; i++)
2331     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2332     }
2333    
2334     if (min == max) continue;
2335    
2336     if (minimize)
2337     {
2338     #ifdef SUPPORT_UTF8
2339     /* UTF-8 mode */
2340     if (utf8)
2341     {
2342     register int d;
2343     for (fi = min;; fi++)
2344     {
2345     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2346     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2347     GETCHARINC(d, eptr);
2348     if (fi >= max || eptr >= md->end_subject || fc == d)
2349     RRETURN(MATCH_NOMATCH);
2350     }
2351     }
2352     else
2353     #endif
2354     /* Not UTF-8 mode */
2355     {
2356     for (fi = min;; fi++)
2357     {
2358     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2359     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2360     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2361     RRETURN(MATCH_NOMATCH);
2362     }
2363     }
2364     /* Control never gets here */
2365     }
2366    
2367     /* Maximize case */
2368    
2369     else
2370     {
2371     pp = eptr;
2372    
2373     #ifdef SUPPORT_UTF8
2374     /* UTF-8 mode */
2375     if (utf8)
2376     {
2377     register int d;
2378     for (i = min; i < max; i++)
2379     {
2380     int len = 1;
2381     if (eptr >= md->end_subject) break;
2382     GETCHARLEN(d, eptr, len);
2383     if (fc == d) break;
2384     eptr += len;
2385     }
2386     for(;;)
2387     {
2388     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2389     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2390     if (eptr-- == pp) break; /* Stop if tried at original pos */
2391     BACKCHAR(eptr);
2392     }
2393     }
2394     else
2395     #endif
2396     /* Not UTF-8 mode */
2397     {
2398     for (i = min; i < max; i++)
2399     {
2400     if (eptr >= md->end_subject || fc == *eptr) break;
2401     eptr++;
2402     }
2403     while (eptr >= pp)
2404     {
2405     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2406     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2407     eptr--;
2408     }
2409     }
2410    
2411     RRETURN(MATCH_NOMATCH);
2412     }
2413     }
2414     /* Control never gets here */
2415    
2416     /* Match a single character type repeatedly; several different opcodes
2417     share code. This is very similar to the code for single characters, but we
2418     repeat it in the interests of efficiency. */
2419    
2420     case OP_TYPEEXACT:
2421     min = max = GET2(ecode, 1);
2422     minimize = TRUE;
2423     ecode += 3;
2424     goto REPEATTYPE;
2425    
2426     case OP_TYPEUPTO:
2427     case OP_TYPEMINUPTO:
2428     min = 0;
2429     max = GET2(ecode, 1);
2430     minimize = *ecode == OP_TYPEMINUPTO;
2431     ecode += 3;
2432     goto REPEATTYPE;
2433    
2434     case OP_TYPESTAR:
2435     case OP_TYPEMINSTAR:
2436     case OP_TYPEPLUS:
2437     case OP_TYPEMINPLUS:
2438     case OP_TYPEQUERY:
2439     case OP_TYPEMINQUERY:
2440     c = *ecode++ - OP_TYPESTAR;
2441     minimize = (c & 1) != 0;
2442     min = rep_min[c]; /* Pick up values from tables; */
2443     max = rep_max[c]; /* zero for max => infinity */
2444     if (max == 0) max = INT_MAX;
2445    
2446     /* Common code for all repeated single character type matches. Note that
2447     in UTF-8 mode, '.' matches a character of any length, but for the other
2448     character types, the valid characters are all one-byte long. */
2449    
2450     REPEATTYPE:
2451     ctype = *ecode++; /* Code for the character type */
2452    
2453     #ifdef SUPPORT_UCP
2454     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2455     {
2456     prop_fail_result = ctype == OP_NOTPROP;
2457     prop_type = *ecode++;
2458 nigel 87 prop_value = *ecode++;
2459 nigel 77 }
2460     else prop_type = -1;
2461     #endif
2462    
2463     /* First, ensure the minimum number of matches are present. Use inline
2464     code for maximizing the speed, and do the type test once at the start
2465     (i.e. keep it out of the loop). Also we can test that there are at least
2466     the minimum number of bytes before we start. This isn't as effective in
2467     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2468     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2469     and single-bytes. */
2470    
2471     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2472     if (min > 0)
2473     {
2474     #ifdef SUPPORT_UCP
2475 nigel 87 if (prop_type >= 0)
2476 nigel 77 {
2477 nigel 87 switch(prop_type)
2478 nigel 77 {
2479 nigel 87 case PT_ANY:
2480     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2481     for (i = 1; i <= min; i++)
2482     {
2483     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2484     GETCHARINC(c, eptr);
2485     }
2486     break;
2487    
2488     case PT_LAMP:
2489     for (i = 1; i <= min; i++)
2490     {
2491     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2492     GETCHARINC(c, eptr);
2493     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2494     if ((prop_chartype == ucp_Lu ||
2495     prop_chartype == ucp_Ll ||
2496     prop_chartype == ucp_Lt) == prop_fail_result)
2497     RRETURN(MATCH_NOMATCH);
2498     }
2499     break;
2500    
2501     case PT_GC:
2502     for (i = 1; i <= min; i++)
2503     {
2504     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2505     GETCHARINC(c, eptr);
2506     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2507     if ((prop_category == prop_value) == prop_fail_result)
2508     RRETURN(MATCH_NOMATCH);
2509     }
2510     break;
2511    
2512     case PT_PC:
2513     for (i = 1; i <= min; i++)
2514     {
2515     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2516     GETCHARINC(c, eptr);
2517     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2518     if ((prop_chartype == prop_value) == prop_fail_result)
2519     RRETURN(MATCH_NOMATCH);
2520     }
2521     break;
2522    
2523     case PT_SC:
2524     for (i = 1; i <= min; i++)
2525     {
2526     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2527     GETCHARINC(c, eptr);
2528     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2529     if ((prop_script == prop_value) == prop_fail_result)
2530     RRETURN(MATCH_NOMATCH);
2531     }
2532     break;
2533    
2534     default:
2535     RRETURN(PCRE_ERROR_INTERNAL);
2536     break;
2537 nigel 77 }
2538     }
2539    
2540     /* Match extended Unicode sequences. We will get here only if the
2541     support is in the binary; otherwise a compile-time error occurs. */
2542    
2543     else if (ctype == OP_EXTUNI)
2544     {
2545     for (i = 1; i <= min; i++)
2546     {
2547     GETCHARINCTEST(c, eptr);
2548 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2549 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2550     while (eptr < md->end_subject)
2551     {
2552     int len = 1;
2553     if (!utf8) c = *eptr; else
2554     {
2555     GETCHARLEN(c, eptr, len);
2556     }
2557 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2558 nigel 77 if (prop_category != ucp_M) break;
2559     eptr += len;
2560     }
2561     }
2562     }
2563    
2564     else
2565     #endif /* SUPPORT_UCP */
2566    
2567     /* Handle all other cases when the coding is UTF-8 */
2568    
2569     #ifdef SUPPORT_UTF8
2570     if (utf8) switch(ctype)
2571     {
2572     case OP_ANY:
2573     for (i = 1; i <= min; i++)
2574     {
2575     if (eptr >= md->end_subject ||
2576     (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
2577     RRETURN(MATCH_NOMATCH);
2578     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2579     }
2580     break;
2581    
2582     case OP_ANYBYTE:
2583     eptr += min;
2584     break;
2585    
2586     case OP_NOT_DIGIT:
2587     for (i = 1; i <= min; i++)
2588     {
2589     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2590     GETCHARINC(c, eptr);
2591     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2592     RRETURN(MATCH_NOMATCH);
2593     }
2594     break;
2595    
2596     case OP_DIGIT:
2597     for (i = 1; i <= min; i++)
2598     {
2599     if (eptr >= md->end_subject ||
2600     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2601     RRETURN(MATCH_NOMATCH);
2602     /* No need to skip more bytes - we know it's a 1-byte character */
2603     }
2604     break;
2605    
2606     case OP_NOT_WHITESPACE:
2607     for (i = 1; i <= min; i++)
2608     {
2609     if (eptr >= md->end_subject ||
2610     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2611     RRETURN(MATCH_NOMATCH);
2612     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2613     }
2614     break;
2615    
2616     case OP_WHITESPACE:
2617     for (i = 1; i <= min; i++)
2618     {
2619     if (eptr >= md->end_subject ||
2620     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2621     RRETURN(MATCH_NOMATCH);
2622     /* No need to skip more bytes - we know it's a 1-byte character */
2623     }
2624     break;
2625    
2626     case OP_NOT_WORDCHAR:
2627     for (i = 1; i <= min; i++)
2628     {
2629     if (eptr >= md->end_subject ||
2630     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2631     RRETURN(MATCH_NOMATCH);
2632     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2633     }
2634     break;
2635    
2636     case OP_WORDCHAR:
2637     for (i = 1; i <= min; i++)
2638     {
2639     if (eptr >= md->end_subject ||
2640     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2641     RRETURN(MATCH_NOMATCH);
2642     /* No need to skip more bytes - we know it's a 1-byte character */
2643     }
2644     break;
2645    
2646     default:
2647     RRETURN(PCRE_ERROR_INTERNAL);
2648     } /* End switch(ctype) */
2649    
2650     else
2651     #endif /* SUPPORT_UTF8 */
2652    
2653     /* Code for the non-UTF-8 case for minimum matching of operators other
2654     than OP_PROP and OP_NOTPROP. */
2655    
2656     switch(ctype)
2657     {
2658     case OP_ANY:
2659     if ((ims & PCRE_DOTALL) == 0)
2660     {
2661     for (i = 1; i <= min; i++)
2662     if (*eptr++ == NEWLINE) RRETURN(MATCH_NOMATCH);
2663     }
2664     else eptr += min;
2665     break;
2666    
2667     case OP_ANYBYTE:
2668     eptr += min;
2669     break;
2670    
2671     case OP_NOT_DIGIT:
2672     for (i = 1; i <= min; i++)
2673     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2674     break;
2675    
2676     case OP_DIGIT:
2677     for (i = 1; i <= min; i++)
2678     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2679     break;
2680    
2681     case OP_NOT_WHITESPACE:
2682     for (i = 1; i <= min; i++)
2683     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2684     break;
2685    
2686     case OP_WHITESPACE:
2687     for (i = 1; i <= min; i++)
2688     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2689     break;
2690    
2691     case OP_NOT_WORDCHAR:
2692     for (i = 1; i <= min; i++)
2693     if ((md->ctypes[*eptr++] & ctype_word) != 0)
2694     RRETURN(MATCH_NOMATCH);
2695     break;
2696    
2697     case OP_WORDCHAR:
2698     for (i = 1; i <= min; i++)
2699     if ((md->ctypes[*eptr++] & ctype_word) == 0)
2700     RRETURN(MATCH_NOMATCH);
2701     break;
2702    
2703     default:
2704     RRETURN(PCRE_ERROR_INTERNAL);
2705     }
2706     }
2707    
2708     /* If min = max, continue at the same level without recursing */
2709    
2710     if (min == max) continue;
2711    
2712     /* If minimizing, we have to test the rest of the pattern before each
2713     subsequent match. Again, separate the UTF-8 case for speed, and also
2714     separate the UCP cases. */
2715    
2716     if (minimize)
2717     {
2718     #ifdef SUPPORT_UCP
2719 nigel 87 if (prop_type >= 0)
2720 nigel 77 {
2721 nigel 87 switch(prop_type)
2722 nigel 77 {
2723 nigel 87 case PT_ANY:
2724     for (fi = min;; fi++)
2725     {
2726     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2727     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2728     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2729     GETCHARINC(c, eptr);
2730     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2731     }
2732     break;
2733    
2734     case PT_LAMP:
2735     for (fi = min;; fi++)
2736     {
2737     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2738     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2739     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2740     GETCHARINC(c, eptr);
2741     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2742     if ((prop_chartype == ucp_Lu ||
2743     prop_chartype == ucp_Ll ||
2744     prop_chartype == ucp_Lt) == prop_fail_result)
2745     RRETURN(MATCH_NOMATCH);
2746     }
2747     break;
2748    
2749     case PT_GC:
2750     for (fi = min;; fi++)
2751     {
2752     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2753     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2754     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2755     GETCHARINC(c, eptr);
2756     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2757     if ((prop_category == prop_value) == prop_fail_result)
2758     RRETURN(MATCH_NOMATCH);
2759     }
2760     break;
2761    
2762     case PT_PC:
2763     for (fi = min;; fi++)
2764     {
2765     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2766     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2767     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2768     GETCHARINC(c, eptr);
2769     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2770     if ((prop_chartype == prop_value) == prop_fail_result)
2771     RRETURN(MATCH_NOMATCH);
2772     }
2773     break;
2774    
2775     case PT_SC:
2776     for (fi = min;; fi++)
2777     {
2778     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2779     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2780     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2781     GETCHARINC(c, eptr);
2782     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2783     if ((prop_script == prop_value) == prop_fail_result)
2784     RRETURN(MATCH_NOMATCH);
2785     }
2786     break;
2787    
2788     default:
2789     RRETURN(PCRE_ERROR_INTERNAL);
2790     break;
2791 nigel 77 }
2792     }
2793    
2794     /* Match extended Unicode sequences. We will get here only if the
2795     support is in the binary; otherwise a compile-time error occurs. */
2796    
2797     else if (ctype == OP_EXTUNI)
2798     {
2799     for (fi = min;; fi++)
2800     {
2801     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2802     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2803     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2804     GETCHARINCTEST(c, eptr);
2805 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2806 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2807     while (eptr < md->end_subject)
2808     {
2809     int len = 1;
2810     if (!utf8) c = *eptr; else
2811     {
2812     GETCHARLEN(c, eptr, len);
2813     }
2814 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2815 nigel 77 if (prop_category != ucp_M) break;
2816     eptr += len;
2817     }
2818     }
2819     }
2820    
2821     else
2822     #endif /* SUPPORT_UCP */
2823    
2824     #ifdef SUPPORT_UTF8
2825     /* UTF-8 mode */
2826     if (utf8)
2827     {
2828     for (fi = min;; fi++)
2829     {
2830     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2831     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2832     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2833    
2834     GETCHARINC(c, eptr);
2835     switch(ctype)
2836     {
2837     case OP_ANY:
2838     if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2839     break;
2840    
2841     case OP_ANYBYTE:
2842     break;
2843    
2844     case OP_NOT_DIGIT:
2845     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
2846     RRETURN(MATCH_NOMATCH);
2847     break;
2848    
2849     case OP_DIGIT:
2850     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
2851     RRETURN(MATCH_NOMATCH);
2852     break;
2853    
2854     case OP_NOT_WHITESPACE:
2855     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
2856     RRETURN(MATCH_NOMATCH);
2857     break;
2858    
2859     case OP_WHITESPACE:
2860     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
2861     RRETURN(MATCH_NOMATCH);
2862     break;
2863    
2864     case OP_NOT_WORDCHAR:
2865     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
2866     RRETURN(MATCH_NOMATCH);
2867     break;
2868    
2869     case OP_WORDCHAR:
2870     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
2871     RRETURN(MATCH_NOMATCH);
2872     break;
2873    
2874     default:
2875     RRETURN(PCRE_ERROR_INTERNAL);
2876     }
2877     }
2878     }
2879     else
2880     #endif
2881     /* Not UTF-8 mode */
2882     {
2883     for (fi = min;; fi++)
2884     {
2885     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2886     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2887     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2888     c = *eptr++;
2889     switch(ctype)
2890     {
2891     case OP_ANY:
2892     if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) RRETURN(MATCH_NOMATCH);
2893     break;
2894    
2895     case OP_ANYBYTE:
2896     break;
2897    
2898     case OP_NOT_DIGIT:
2899     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2900     break;
2901    
2902     case OP_DIGIT:
2903     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2904     break;
2905    
2906     case OP_NOT_WHITESPACE:
2907     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2908     break;
2909    
2910     case OP_WHITESPACE:
2911     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2912     break;
2913    
2914     case OP_NOT_WORDCHAR:
2915     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
2916     break;
2917    
2918     case OP_WORDCHAR:
2919     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
2920     break;
2921    
2922     default:
2923     RRETURN(PCRE_ERROR_INTERNAL);
2924     }
2925     }
2926     }
2927     /* Control never gets here */
2928     }
2929    
2930     /* If maximizing it is worth using inline code for speed, doing the type
2931     test once at the start (i.e. keep it out of the loop). Again, keep the
2932     UTF-8 and UCP stuff separate. */
2933    
2934     else
2935     {
2936     pp = eptr; /* Remember where we started */
2937    
2938     #ifdef SUPPORT_UCP
2939 nigel 87 if (prop_type >= 0)
2940 nigel 77 {
2941 nigel 87 switch(prop_type)
2942 nigel 77 {
2943 nigel 87 case PT_ANY:
2944     for (i = min; i < max; i++)
2945     {
2946     int len = 1;
2947     if (eptr >= md->end_subject) break;
2948     GETCHARLEN(c, eptr, len);
2949     if (prop_fail_result) break;
2950     eptr+= len;
2951     }
2952     break;
2953    
2954     case PT_LAMP:
2955     for (i = min; i < max; i++)
2956     {
2957     int len = 1;
2958     if (eptr >= md->end_subject) break;
2959     GETCHARLEN(c, eptr, len);
2960     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2961     if ((prop_chartype == ucp_Lu ||
2962     prop_chartype == ucp_Ll ||
2963     prop_chartype == ucp_Lt) == prop_fail_result)
2964     break;
2965     eptr+= len;
2966     }
2967     break;
2968    
2969     case PT_GC:
2970     for (i = min; i < max; i++)
2971     {
2972     int len = 1;
2973     if (eptr >= md->end_subject) break;
2974     GETCHARLEN(c, eptr, len);
2975     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2976     if ((prop_category == prop_value) == prop_fail_result)
2977     break;
2978     eptr+= len;
2979     }
2980     break;
2981    
2982     case PT_PC:
2983     for (i = min; i < max; i++)
2984     {
2985     int len = 1;
2986     if (eptr >= md->end_subject) break;
2987     GETCHARLEN(c, eptr, len);
2988     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2989     if ((prop_chartype == prop_value) == prop_fail_result)
2990     break;
2991     eptr+= len;
2992     }
2993     break;
2994    
2995     case PT_SC:
2996     for (i = min; i < max; i++)
2997     {
2998     int len = 1;
2999     if (eptr >= md->end_subject) break;
3000     GETCHARLEN(c, eptr, len);
3001     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3002     if ((prop_script == prop_value) == prop_fail_result)
3003     break;
3004     eptr+= len;
3005     }
3006     break;
3007 nigel 77 }
3008    
3009     /* eptr is now past the end of the maximum run */
3010    
3011     for(;;)
3012     {
3013     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3014     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3015     if (eptr-- == pp) break; /* Stop if tried at original pos */
3016     BACKCHAR(eptr);
3017     }
3018     }
3019    
3020     /* Match extended Unicode sequences. We will get here only if the
3021     support is in the binary; otherwise a compile-time error occurs. */
3022    
3023     else if (ctype == OP_EXTUNI)
3024     {
3025     for (i = min; i < max; i++)
3026     {
3027     if (eptr >= md->end_subject) break;
3028     GETCHARINCTEST(c, eptr);
3029 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3030 nigel 77 if (prop_category == ucp_M) break;
3031     while (eptr < md->end_subject)
3032     {
3033     int len = 1;
3034     if (!utf8) c = *eptr; else
3035     {
3036     GETCHARLEN(c, eptr, len);
3037     }
3038 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3039 nigel 77 if (prop_category != ucp_M) break;
3040     eptr += len;
3041     }
3042     }
3043    
3044     /* eptr is now past the end of the maximum run */
3045    
3046     for(;;)
3047     {
3048     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3049     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3050     if (eptr-- == pp) break; /* Stop if tried at original pos */
3051     for (;;) /* Move back over one extended */
3052     {
3053     int len = 1;
3054     BACKCHAR(eptr);
3055     if (!utf8) c = *eptr; else
3056     {
3057     GETCHARLEN(c, eptr, len);
3058     }
3059 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3060 nigel 77 if (prop_category != ucp_M) break;
3061     eptr--;
3062     }
3063     }
3064     }
3065    
3066     else
3067     #endif /* SUPPORT_UCP */
3068    
3069     #ifdef SUPPORT_UTF8
3070     /* UTF-8 mode */
3071    
3072     if (utf8)
3073     {
3074     switch(ctype)
3075     {
3076     case OP_ANY:
3077    
3078     /* Special code is required for UTF8, but when the maximum is unlimited
3079     we don't need it, so we repeat the non-UTF8 code. This is probably
3080     worth it, because .* is quite a common idiom. */
3081    
3082     if (max < INT_MAX)
3083     {
3084     if ((ims & PCRE_DOTALL) == 0)
3085     {
3086     for (i = min; i < max; i++)
3087     {
3088     if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3089     eptr++;
3090     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3091     }
3092     }
3093     else
3094     {
3095     for (i = min; i < max; i++)
3096     {
3097     eptr++;
3098     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3099     }
3100     }
3101     }
3102    
3103     /* Handle unlimited UTF-8 repeat */
3104    
3105     else
3106     {
3107     if ((ims & PCRE_DOTALL) == 0)
3108     {
3109     for (i = min; i < max; i++)
3110     {
3111     if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3112     eptr++;
3113     }
3114     break;
3115     }
3116     else
3117     {
3118     c = max - min;
3119     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3120     eptr += c;
3121     }
3122     }
3123     break;
3124    
3125     /* The byte case is the same as non-UTF8 */
3126    
3127     case OP_ANYBYTE:
3128     c = max - min;
3129     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3130     eptr += c;
3131     break;
3132    
3133     case OP_NOT_DIGIT:
3134     for (i = min; i < max; i++)
3135     {
3136     int len = 1;
3137     if (eptr >= md->end_subject) break;
3138     GETCHARLEN(c, eptr, len);
3139     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3140     eptr+= len;
3141     }
3142     break;
3143    
3144     case OP_DIGIT:
3145     for (i = min; i < max; i++)
3146     {
3147     int len = 1;
3148     if (eptr >= md->end_subject) break;
3149     GETCHARLEN(c, eptr, len);
3150     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3151     eptr+= len;
3152     }
3153     break;
3154    
3155     case OP_NOT_WHITESPACE:
3156     for (i = min; i < max; i++)
3157     {
3158     int len = 1;
3159     if (eptr >= md->end_subject) break;
3160     GETCHARLEN(c, eptr, len);
3161     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3162     eptr+= len;
3163     }
3164     break;
3165    
3166     case OP_WHITESPACE:
3167     for (i = min; i < max; i++)
3168     {
3169     int len = 1;
3170     if (eptr >= md->end_subject) break;
3171     GETCHARLEN(c, eptr, len);
3172     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3173     eptr+= len;
3174     }
3175     break;
3176    
3177     case OP_NOT_WORDCHAR:
3178     for (i = min; i < max; i++)
3179     {
3180     int len = 1;
3181     if (eptr >= md->end_subject) break;
3182     GETCHARLEN(c, eptr, len);
3183     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3184     eptr+= len;
3185     }
3186     break;
3187    
3188     case OP_WORDCHAR:
3189     for (i = min; i < max; i++)
3190     {
3191     int len = 1;
3192     if (eptr >= md->end_subject) break;
3193     GETCHARLEN(c, eptr, len);
3194     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3195     eptr+= len;
3196     }
3197     break;
3198    
3199     default:
3200     RRETURN(PCRE_ERROR_INTERNAL);
3201     }
3202    
3203     /* eptr is now past the end of the maximum run */
3204    
3205     for(;;)
3206     {
3207     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3208     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3209     if (eptr-- == pp) break; /* Stop if tried at original pos */
3210     BACKCHAR(eptr);
3211     }
3212     }
3213     else
3214     #endif
3215    
3216     /* Not UTF-8 mode */
3217     {
3218     switch(ctype)
3219     {
3220     case OP_ANY:
3221     if ((ims & PCRE_DOTALL) == 0)
3222     {
3223     for (i = min; i < max; i++)
3224     {
3225     if (eptr >= md->end_subject || *eptr == NEWLINE) break;
3226     eptr++;
3227     }
3228     break;
3229     }
3230     /* For DOTALL case, fall through and treat as \C */
3231    
3232     case OP_ANYBYTE:
3233     c = max - min;
3234     if (c > md->end_subject - eptr) c = md->end_subject - eptr;
3235     eptr += c;
3236     break;
3237    
3238     case OP_NOT_DIGIT:
3239     for (i = min; i < max; i++)
3240     {
3241     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3242     break;
3243     eptr++;
3244     }
3245     break;
3246    
3247     case OP_DIGIT:
3248     for (i = min; i < max; i++)
3249     {
3250     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3251     break;
3252     eptr++;
3253     }
3254     break;
3255    
3256     case OP_NOT_WHITESPACE:
3257     for (i = min; i < max; i++)
3258     {
3259     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3260     break;
3261     eptr++;
3262     }
3263     break;
3264    
3265     case OP_WHITESPACE:
3266     for (i = min; i < max; i++)
3267     {
3268     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3269     break;
3270     eptr++;
3271     }
3272     break;
3273    
3274     case OP_NOT_WORDCHAR:
3275     for (i = min; i < max; i++)
3276     {
3277     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3278     break;
3279     eptr++;
3280     }
3281     break;
3282    
3283     case OP_WORDCHAR:
3284     for (i = min; i < max; i++)
3285     {
3286     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3287     break;
3288     eptr++;
3289     }
3290     break;
3291    
3292     default:
3293     RRETURN(PCRE_ERROR_INTERNAL);
3294     }
3295    
3296     /* eptr is now past the end of the maximum run */
3297    
3298     while (eptr >= pp)
3299     {
3300     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3301     eptr--;
3302     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3303     }
3304     }
3305    
3306     /* Get here if we can't make it match with any permitted repetitions */
3307    
3308     RRETURN(MATCH_NOMATCH);
3309     }
3310     /* Control never gets here */
3311    
3312     /* There's been some horrible disaster. Since all codes > OP_BRA are
3313     for capturing brackets, and there shouldn't be any gaps between 0 and
3314     OP_BRA, arrival here can only mean there is something seriously wrong
3315     in the code above or the OP_xxx definitions. */
3316    
3317     default:
3318     DPRINTF(("Unknown opcode %d\n", *ecode));
3319     RRETURN(PCRE_ERROR_UNKNOWN_NODE);
3320     }
3321    
3322     /* Do not stick any code in here without much thought; it is assumed
3323     that "continue" in the code above comes out to here to repeat the main
3324     loop. */
3325    
3326     } /* End of main loop */
3327     /* Control never reaches here */
3328     }
3329    
3330    
/***************************************************************************
****************************************************************************
                   RECURSION IN THE match() FUNCTION

The macros that were set up above to handle recursion in match() are of no
further use from here on, so get rid of every one of them. */

#ifdef NO_RECURSE

/* These names exist as macros only in the NO_RECURSE build; they are listed
alphabetically so that a missing or duplicated entry is easy to spot. */

#undef callpat
#undef charptr
#undef condition
#undef ctype
#undef cur_is_word
#undef data
#undef ecode
#undef eptr
#undef eptrb
#undef flags
#undef ims
#undef length
#undef max
#undef min
#undef minimize
#undef new_recursive
#undef newptrb
#undef next
#undef number
#undef offset
#undef offset_top
#undef op
#undef original_ims
#undef pp
#undef prev
#undef prev_is_word
#undef save_capture_last
#undef save_offset1
#undef save_offset2
#undef save_offset3
#undef saved_eptr
#undef stacksave

#endif  /* NO_RECURSE */

/* The remaining two names are macros whichever way match() was built, so they
are removed unconditionally. */

#undef fi
#undef fc

/***************************************************************************
***************************************************************************/
3386    
3387    
3388    
3389     /*************************************************
3390     * Execute a Regular Expression *
3391     *************************************************/
3392    
3393     /* This function applies a compiled re to a subject string and picks out
3394     portions of the string if it matches. Two elements in the vector are set for
3395     each substring: the offsets to the start and end of the substring.
3396    
3397     Arguments:
3398     argument_re points to the compiled expression
3399     extra_data points to extra data or is NULL
3400     subject points to the subject string
3401     length length of subject string (may contain binary zeros)
3402     start_offset where to start in the subject string
3403     options option bits
3404     offsets points to a vector of ints to be filled in with offsets
3405     offsetcount the number of elements in the vector
3406    
3407     Returns: > 0 => success; value is the number of elements filled in
3408     = 0 => success, but offsets is not big enough
3409     -1 => failed to match
3410     < -1 => some kind of unexpected problem
3411     */
3412    
3413 nigel 87 PCRE_DATA_SCOPE int
3414 nigel 77 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3415 nigel 87 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3416 nigel 77 int offsetcount)
3417     {
3418     int rc, resetcount, ocount;
3419     int first_byte = -1;
3420     int req_byte = -1;
3421     int req_byte2 = -1;
3422     unsigned long int ims = 0;
3423     BOOL using_temporary_offsets = FALSE;
3424     BOOL anchored;
3425     BOOL startline;
3426     BOOL firstline;
3427     BOOL first_byte_caseless = FALSE;
3428     BOOL req_byte_caseless = FALSE;
3429     match_data match_block;
3430     const uschar *tables;
3431     const uschar *start_bits = NULL;
3432 nigel 87 USPTR start_match = (USPTR)subject + start_offset;
3433     USPTR end_subject;
3434     USPTR req_byte_ptr = start_match - 1;
3435 nigel 77
3436     pcre_study_data internal_study;
3437     const pcre_study_data *study;
3438    
3439     real_pcre internal_re;
3440     const real_pcre *external_re = (const real_pcre *)argument_re;
3441     const real_pcre *re = external_re;
3442    
3443     /* Plausibility checks */
3444    
3445     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3446     if (re == NULL || subject == NULL ||
3447     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3448     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3449    
3450     /* Fish out the optional data from the extra_data structure, first setting
3451     the default values. */
3452    
3453     study = NULL;
3454     match_block.match_limit = MATCH_LIMIT;
3455 nigel 87 match_block.match_limit_recursion = MATCH_LIMIT_RECURSION;
3456 nigel 77 match_block.callout_data = NULL;
3457    
3458     /* The table pointer is always in native byte order. */
3459    
3460     tables = external_re->tables;
3461    
3462     if (extra_data != NULL)
3463     {
3464     register unsigned int flags = extra_data->flags;
3465     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3466     study = (const pcre_study_data *)extra_data->study_data;
3467     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3468     match_block.match_limit = extra_data->match_limit;
3469 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3470     match_block.match_limit_recursion = extra_data->match_limit_recursion;
3471 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3472     match_block.callout_data = extra_data->callout_data;
3473     if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3474     }
3475    
3476     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3477     is a feature that makes it possible to save compiled regex and re-use them
3478     in other programs later. */
3479    
3480     if (tables == NULL) tables = _pcre_default_tables;
3481    
3482     /* Check that the first field in the block is the magic number. If it is not,
3483     test for a regex that was compiled on a host of opposite endianness. If this is
3484     the case, flipped values are put in internal_re and internal_study if there was
3485     study data too. */
3486    
3487     if (re->magic_number != MAGIC_NUMBER)
3488     {
3489     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3490     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3491     if (study != NULL) study = &internal_study;
3492     }
3493    
3494     /* Set up other data */
3495    
3496     anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3497     startline = (re->options & PCRE_STARTLINE) != 0;
3498     firstline = (re->options & PCRE_FIRSTLINE) != 0;
3499    
3500     /* The code starts after the real_pcre block and the capture name table. */
3501    
3502     match_block.start_code = (const uschar *)external_re + re->name_table_offset +
3503     re->name_count * re->name_entry_size;
3504    
3505 nigel 87 match_block.start_subject = (USPTR)subject;
3506 nigel 77 match_block.start_offset = start_offset;
3507     match_block.end_subject = match_block.start_subject + length;
3508     end_subject = match_block.end_subject;
3509    
3510     match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3511     match_block.utf8 = (re->options & PCRE_UTF8) != 0;
3512    
3513     match_block.notbol = (options & PCRE_NOTBOL) != 0;
3514     match_block.noteol = (options & PCRE_NOTEOL) != 0;
3515     match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
3516     match_block.partial = (options & PCRE_PARTIAL) != 0;
3517     match_block.hitend = FALSE;
3518    
3519     match_block.recursive = NULL; /* No recursion at top level */
3520    
3521     match_block.lcc = tables + lcc_offset;
3522     match_block.ctypes = tables + ctypes_offset;
3523    
3524     /* Partial matching is supported only for a restricted set of regexes at the
3525     moment. */
3526    
3527     if (match_block.partial && (re->options & PCRE_NOPARTIAL) != 0)
3528     return PCRE_ERROR_BADPARTIAL;
3529    
3530     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3531     back the character offset. */
3532    
3533     #ifdef SUPPORT_UTF8
3534     if (match_block.utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3535     {
3536     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3537     return PCRE_ERROR_BADUTF8;
3538     if (start_offset > 0 && start_offset < length)
3539     {
3540     int tb = ((uschar *)subject)[start_offset];
3541     if (tb > 127)
3542     {
3543     tb &= 0xc0;
3544     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3545     }
3546     }
3547     }
3548     #endif
3549    
3550     /* The ims options can vary during the matching as a result of the presence
3551     of (?ims) items in the pattern. They are kept in a local variable so that
3552     restoring at the exit of a group is easy. */
3553    
3554     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3555    
3556     /* If the expression has got more back references than the offsets supplied can
3557     hold, we get a temporary chunk of working store to use during the matching.
3558     Otherwise, we can use the vector supplied, rounding down its size to a multiple
3559     of 3. */
3560    
3561     ocount = offsetcount - (offsetcount % 3);
3562    
3563     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3564     {
3565     ocount = re->top_backref * 3 + 3;
3566     match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3567     if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3568     using_temporary_offsets = TRUE;
3569     DPRINTF(("Got memory to hold back references\n"));
3570     }
3571     else match_block.offset_vector = offsets;
3572    
3573     match_block.offset_end = ocount;
3574     match_block.offset_max = (2*ocount)/3;
3575     match_block.offset_overflow = FALSE;
3576     match_block.capture_last = -1;
3577    
3578     /* Compute the minimum number of offsets that we need to reset each time. Doing
3579     this makes a huge difference to execution time when there aren't many brackets
3580     in the pattern. */
3581    
3582     resetcount = 2 + re->top_bracket * 2;
3583     if (resetcount > offsetcount) resetcount = ocount;
3584    
3585     /* Reset the working variable associated with each extraction. These should
3586     never be used unless previously set, but they get saved and restored, and so we
3587     initialize them to avoid reading uninitialized locations. */
3588    
3589     if (match_block.offset_vector != NULL)
3590     {
3591     register int *iptr = match_block.offset_vector + ocount;
3592     register int *iend = iptr - resetcount/2 + 1;
3593     while (--iptr >= iend) *iptr = -1;
3594     }
3595    
3596     /* Set up the first character to match, if available. The first_byte value is
3597     never set for an anchored regular expression, but the anchoring may be forced
3598     at run time, so we have to test for anchoring. The first char may be unset for
3599     an unanchored pattern, of course. If there's no first char and the pattern was
3600     studied, there may be a bitmap of possible first characters. */
3601    
3602     if (!anchored)
3603     {
3604     if ((re->options & PCRE_FIRSTSET) != 0)
3605     {
3606     first_byte = re->first_byte & 255;
3607     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3608     first_byte = match_block.lcc[first_byte];
3609     }
3610     else
3611     if (!startline && study != NULL &&
3612     (study->options & PCRE_STUDY_MAPPED) != 0)
3613     start_bits = study->start_bits;
3614     }
3615    
3616     /* For anchored or unanchored matches, there may be a "last known required
3617     character" set. */
3618    
3619     if ((re->options & PCRE_REQCHSET) != 0)
3620     {
3621     req_byte = re->req_byte & 255;
3622     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3623     req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3624     }
3625    
3626     /* Loop for handling unanchored repeated matching attempts; for anchored regexs
3627     the loop runs just once. */
3628    
3629     do
3630     {
3631 nigel 87 USPTR save_end_subject = end_subject;
3632 nigel 77
3633     /* Reset the maximum number of extractions we might see. */
3634    
3635     if (match_block.offset_vector != NULL)
3636     {
3637     register int *iptr = match_block.offset_vector;
3638     register int *iend = iptr + resetcount;
3639     while (iptr < iend) *iptr++ = -1;
3640     }
3641    
3642     /* Advance to a unique first char if possible. If firstline is TRUE, the
3643     start of the match is constrained to the first line of a multiline string.
3644     Implement this by temporarily adjusting end_subject so that we stop scanning
3645     at a newline. If the match fails at the newline, later code breaks this loop.
3646     */
3647    
3648     if (firstline)
3649     {
3650 nigel 87 USPTR t = start_match;
3651 nigel 77 while (t < save_end_subject && *t != '\n') t++;
3652     end_subject = t;
3653     }
3654    
3655     /* Now test for a unique first byte */
3656    
3657     if (first_byte >= 0)
3658     {
3659     if (first_byte_caseless)
3660     while (start_match < end_subject &&
3661     match_block.lcc[*start_match] != first_byte)
3662     start_match++;
3663     else
3664     while (start_match < end_subject && *start_match != first_byte)
3665     start_match++;
3666     }
3667    
3668     /* Or to just after \n for a multiline match if possible */
3669    
3670     else if (startline)
3671     {
3672     if (start_match > match_block.start_subject + start_offset)
3673     {
3674     while (start_match < end_subject && start_match[-1] != NEWLINE)
3675     start_match++;
3676     }
3677     }
3678    
3679     /* Or to a non-unique first char after study */
3680    
3681     else if (start_bits != NULL)
3682     {
3683     while (start_match < end_subject)
3684     {
3685     register unsigned int c = *start_match;
3686     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
3687     }
3688     }
3689    
3690     /* Restore fudged end_subject */
3691    
3692     end_subject = save_end_subject;
3693    
3694     #ifdef DEBUG /* Sigh. Some compilers never learn. */
3695     printf(">>>> Match against: ");
3696     pchars(start_match, end_subject - start_match, TRUE, &match_block);
3697     printf("\n");
3698     #endif
3699    
3700     /* If req_byte is set, we know that that character must appear in the subject
3701     for the match to succeed. If the first character is set, req_byte must be
3702     later in the subject; otherwise the test starts at the match point. This
3703     optimization can save a huge amount of backtracking in patterns with nested
3704     unlimited repeats that aren't going to match. Writing separate code for
3705     cased/caseless versions makes it go faster, as does using an autoincrement
3706     and backing off on a match.
3707    
3708     HOWEVER: when the subject string is very, very long, searching to its end can
3709     take a long time, and give bad performance on quite ordinary patterns. This
3710     showed up when somebody was matching /^C/ on a 32-megabyte string... so we
3711     don't do this when the string is sufficiently long.
3712    
3713     ALSO: this processing is disabled when partial matching is requested.
3714     */
3715    
3716     if (req_byte >= 0 &&
3717     end_subject - start_match < REQ_BYTE_MAX &&
3718     !match_block.partial)
3719     {
3720 nigel 87 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
3721 nigel 77
3722     /* We don't need to repeat the search if we haven't yet reached the
3723     place we found it at last time. */
3724    
3725     if (p > req_byte_ptr)
3726     {
3727     if (req_byte_caseless)
3728     {
3729     while (p < end_subject)
3730     {
3731     register int pp = *p++;
3732     if (pp == req_byte || pp == req_byte2) { p--; break; }
3733     }
3734     }
3735     else
3736     {
3737     while (p < end_subject)
3738     {
3739     if (*p++ == req_byte) { p--; break; }
3740     }
3741     }
3742    
3743     /* If we can't find the required character, break the matching loop */
3744    
3745     if (p >= end_subject) break;
3746    
3747     /* If we have found the required character, save the point where we
3748     found it, so that we don't search again next time round the loop if
3749     the start hasn't passed this character yet. */
3750    
3751     req_byte_ptr = p;
3752     }
3753     }
3754    
3755     /* When a match occurs, substrings will be set for all internal extractions;
3756     we just need to set up the whole thing as substring 0 before returning. If
3757     there were too many extractions, set the return code to zero. In the case
3758     where we had to get some local store to hold offsets for backreferences, copy
3759     those back references that we can. In this case there need not be overflow
3760     if certain parts of the pattern were not used. */
3761    
3762     match_block.start_match = start_match;
3763     match_block.match_call_count = 0;
3764    
3765     rc = match(start_match, match_block.start_code, 2, &match_block, ims, NULL,
3766 nigel 87 match_isgroup, 0);
3767 nigel 77
3768     /* When the result is no match, if the subject's first character was a
3769     newline and the PCRE_FIRSTLINE option is set, break (which will return
3770     PCRE_ERROR_NOMATCH). The option requests that a match occur before the first
3771     newline in the subject. Otherwise, advance the pointer to the next character
3772     and continue - but the continuation will actually happen only when the
3773     pattern is not anchored. */
3774    
3775     if (rc == MATCH_NOMATCH)
3776     {
3777     if (firstline && *start_match == NEWLINE) break;
3778     start_match++;
3779     #ifdef SUPPORT_UTF8
3780     if (match_block.utf8)
3781     while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
3782     start_match++;
3783     #endif
3784     continue;
3785     }
3786    
3787     if (rc != MATCH_MATCH)
3788     {
3789     DPRINTF((">>>> error: returning %d\n", rc));
3790     return rc;
3791     }
3792    
3793     /* We have a match! Copy the offset information from temporary store if
3794     necessary */
3795    
3796     if (using_temporary_offsets)
3797     {
3798     if (offsetcount >= 4)
3799     {
3800     memcpy(offsets + 2, match_block.offset_vector + 2,
3801     (offsetcount - 2) * sizeof(int));
3802     DPRINTF(("Copied offsets from temporary memory\n"));
3803     }
3804     if (match_block.end_offset_top > offsetcount)
3805     match_block.offset_overflow = TRUE;
3806    
3807     DPRINTF(("Freeing temporary memory\n"));
3808     (pcre_free)(match_block.offset_vector);
3809     }
3810    
3811     rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
3812    
3813     if (offsetcount < 2) rc = 0; else
3814     {
3815     offsets[0] = start_match - match_block.start_subject;
3816     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
3817     }
3818    
3819     DPRINTF((">>>> returning %d\n", rc));
3820     return rc;
3821     }
3822    
3823     /* This "while" is the end of the "do" above */
3824    
3825     while (!anchored && start_match <= end_subject);
3826    
3827     if (using_temporary_offsets)
3828     {
3829     DPRINTF(("Freeing temporary memory\n"));
3830     (pcre_free)(match_block.offset_vector);
3831     }
3832    
3833     if (match_block.partial && match_block.hitend)
3834     {
3835     DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
3836     return PCRE_ERROR_PARTIAL;
3837     }
3838     else
3839     {
3840     DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
3841     return PCRE_ERROR_NOMATCH;
3842     }
3843     }
3844    
3845     /* End of pcre_exec.c */

webmaster@exim.org
ViewVC Help
Powered by ViewVC 1.1.12