/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 93 - (hide annotations) (download)
Sat Feb 24 21:41:42 2007 UTC (7 years, 5 months ago) by nigel
File MIME type: text/plain
File size: 126511 byte(s)
Load pcre-7.0 into code/trunk.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 nigel 87 Copyright (c) 1997-2006 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 nigel 93 #define NLBLOCK md /* Block containing newline information */
46     #define PSSTART start_subject /* Field containing processed string start */
47     #define PSEND end_subject /* Field containing processed string end */
48    
49 nigel 77 #include "pcre_internal.h"
50    
51 nigel 93 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
52     obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
53 nigel 77
54 nigel 93 #define EPTR_WORK_SIZE (1000)
55 nigel 77
56     /* Flag bits for the match() function */
57    
58 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
59     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
60     #define match_tail_recursed 0x04 /* Tail recursive call */
61 nigel 77
62     /* Non-error returns from the match() function. Error returns are externally
63     defined PCRE_ERROR_xxx codes, which are all negative. */
64    
65     #define MATCH_MATCH 1
66     #define MATCH_NOMATCH 0
67    
68     /* Maximum number of ints of offset to save on the stack for recursive calls.
69     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
70     because the offset vector is always a multiple of 3 long. */
71    
72     #define REC_STACK_SAVE_MAX 30
73    
74     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
75    
76     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
77     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
78    
79    
80    
81     #ifdef DEBUG
82     /*************************************************
83     * Debugging function to print chars *
84     *************************************************/
85    
86     /* Print a sequence of chars in printable format, stopping at the end of the
87     subject if requested.
88    
89     Arguments:
90     p points to characters
91     length number to print
92     is_subject TRUE if printing from within md->start_subject
93     md pointer to matching data block, if is_subject is TRUE
94    
95     Returns: nothing
96     */
97    
98     static void
99     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
100     {
101 nigel 93 unsigned int c;
102 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
103     while (length-- > 0)
104     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
105     }
106     #endif
107    
108    
109    
110     /*************************************************
111     * Match a back-reference *
112     *************************************************/
113    
114     /* If a back reference hasn't been set, the length that is passed is greater
115     than the number of characters left in the string, so the match fails.
116    
117     Arguments:
118     offset index into the offset vector
119     eptr points into the subject
120     length length to be matched
121     md points to match data block
122     ims the ims flags
123    
124     Returns: TRUE if matched
125     */
126    
127     static BOOL
128 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
129 nigel 77 unsigned long int ims)
130     {
131 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
132 nigel 77
133     #ifdef DEBUG
134     if (eptr >= md->end_subject)
135     printf("matching subject <null>");
136     else
137     {
138     printf("matching subject ");
139     pchars(eptr, length, TRUE, md);
140     }
141     printf(" against backref ");
142     pchars(p, length, FALSE, md);
143     printf("\n");
144     #endif
145    
146     /* Always fail if not enough characters left */
147    
148     if (length > md->end_subject - eptr) return FALSE;
149    
150     /* Separate the caselesss case for speed */
151    
152     if ((ims & PCRE_CASELESS) != 0)
153     {
154     while (length-- > 0)
155     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
156     }
157     else
158     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
159    
160     return TRUE;
161     }
162    
163    
164    
165     /***************************************************************************
166     ****************************************************************************
167     RECURSION IN THE match() FUNCTION
168    
169 nigel 87 The match() function is highly recursive, though not every recursive call
170     increases the recursive depth. Nevertheless, some regular expressions can cause
171     it to recurse to a great depth. I was writing for Unix, so I just let it call
172     itself recursively. This uses the stack for saving everything that has to be
173     saved for a recursive call. On Unix, the stack can be large, and this works
174     fine.
175 nigel 77
176 nigel 87 It turns out that on some non-Unix-like systems there are problems with
177     programs that use a lot of stack. (This despite the fact that every last chip
178     has oodles of memory these days, and techniques for extending the stack have
179     been known for decades.) So....
180 nigel 77
181     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
182     calls by keeping local variables that need to be preserved in blocks of memory
183 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
184 nigel 77 achieve this so that the actual code doesn't look very different to what it
185     always used to.
186     ****************************************************************************
187     ***************************************************************************/
188    
189    
190 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
191     versions and production versions. */
192 nigel 77
193     #ifndef NO_RECURSE
194     #define REGISTER register
195 nigel 87 #ifdef DEBUG
/* Debugging form of RMATCH: makes a genuine recursive call to match() with
the depth argument set to rdepth+1, stores the result in rx, and prints the
source line of the call site before and after the call. The expansion is a
brace block, not a single statement. */
196     #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
197     { \
198     printf("match() called in line %d\n", __LINE__); \
199     rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
200     printf("to line %d\n", __LINE__); \
201     }
/* Debugging form of RRETURN: prints the value being returned and the source
line, then returns it. Note that the argument appears twice in the expansion,
so it is evaluated twice. Because this is also a brace block, an
"if (x) RRETURN(y); else ..." needs explicit braces around the RRETURN. */
202     #define RRETURN(ra) \
203     { \
204     printf("match() returned %d from line %d ", ra, __LINE__); \
205     return ra; \
206     }
207     #else
/* Production forms: RMATCH is just an assignment from the recursive call to
match() (one level deeper), and RRETURN is a plain return statement. */
208     #define RMATCH(rx,ra,rb,rc,rd,re,rf,rg) \
209     rx = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
210 nigel 77 #define RRETURN(ra) return ra
211 nigel 87 #endif
212    
213 nigel 77 #else
214    
215    
216     /* These versions of the macros manage a private stack on the heap. Note
217     that the rd argument of RMATCH isn't actually used. It's the md argument of
218     match(), which never changes. */
219    
220     #define REGISTER
221    
/* Heap-frame replacement for a recursive call to match(). A new heapframe is
obtained from pcre_stack_malloc and loaded with the argument values for the
"called" incarnation: ra -> eptr, rb -> ecode, rc -> offset_top, re -> ims,
rf -> eptrb, rg -> flags. The rd argument (md) is not copied because it never
changes. The recursion depth is one more than that of the current frame. The
new frame is chained to the current one, setjmp() records the resumption point
in the current frame, and control jumps to HEAP_RECURSE. When the "called"
incarnation finishes, RRETURN longjmps back here, and the result is picked up
from the Xresult field of the frame recorded in md->thisframe and put in rx.

If no memory can be obtained for the new frame, the match is abandoned with
PCRE_ERROR_NOMEMORY instead of storing through a null pointer. The error goes
back via RRETURN, so the current frame is released in the usual way. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL)\
    {\
    RRETURN(PCRE_ERROR_NOMEMORY);\
    }\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }
246    
/* Heap-frame replacement for "return" from match(). The current frame is
unchained and handed back to pcre_stack_free. If there is a previous frame,
the result is stored in its Xresult field, md->thisframe is set to it, and
longjmp() resumes the RMATCH that created the frame being discarded. If there
is no previous frame (Xprevframe is NULL, which marks the top level), this is
a real return from match().

The argument is evaluated exactly once, before the frame pointer is changed
and the frame is freed. In this configuration the "local" variables of match()
are macros that expand to fields of *frame, so evaluating the argument any
later would read the wrong frame, or go through a NULL frame pointer at the
top level. */

#define RRETURN(ra)\
  {\
  int rreturn_value = (ra);\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    frame->Xresult = rreturn_value;\
    md->thisframe = frame;\
    longjmp(frame->Xwhere, 1);\
    }\
  return rreturn_value;\
  }
260    
261    
262     /* Structure for remembering the local variables in a private frame */
263    
264     typedef struct heapframe {
265     struct heapframe *Xprevframe;
266    
267     /* Function arguments that may change */
268    
269     const uschar *Xeptr;
270     const uschar *Xecode;
271     int Xoffset_top;
272     long int Xims;
273     eptrblock *Xeptrb;
274     int Xflags;
275 nigel 91 unsigned int Xrdepth;
276 nigel 77
277     /* Function local variables */
278    
279     const uschar *Xcallpat;
280     const uschar *Xcharptr;
281     const uschar *Xdata;
282     const uschar *Xnext;
283     const uschar *Xpp;
284     const uschar *Xprev;
285     const uschar *Xsaved_eptr;
286    
287     recursion_info Xnew_recursive;
288    
289     BOOL Xcur_is_word;
290     BOOL Xcondition;
291     BOOL Xprev_is_word;
292    
293     unsigned long int Xoriginal_ims;
294    
295     #ifdef SUPPORT_UCP
296     int Xprop_type;
297 nigel 87 int Xprop_value;
298 nigel 77 int Xprop_fail_result;
299     int Xprop_category;
300     int Xprop_chartype;
301 nigel 87 int Xprop_script;
302 nigel 77 #endif
303    
304     int Xctype;
305 nigel 93 unsigned int Xfc;
306 nigel 77 int Xfi;
307     int Xlength;
308     int Xmax;
309     int Xmin;
310     int Xnumber;
311     int Xoffset;
312     int Xop;
313     int Xsave_capture_last;
314     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
315     int Xstacksave[REC_STACK_SAVE_MAX];
316    
317     eptrblock Xnewptrb;
318    
319     /* Place to pass back result, and where to jump back to */
320    
321     int Xresult;
322     jmp_buf Xwhere;
323    
324     } heapframe;
325    
326     #endif
327    
328    
329     /***************************************************************************
330     ***************************************************************************/
331    
332    
333    
334     /*************************************************
335     * Match from current position *
336     *************************************************/
337    
338 nigel 93 /* This function is called recursively in many circumstances. Whenever it
339 nigel 77 returns a negative (error) response, the outer incarnation must also return the
340     same response.
341    
342     Performance note: It might be tempting to extract commonly used fields from the
343     md structure (e.g. utf8, end_subject) into individual variables to improve
344     performance. Tests using gcc on a SPARC disproved this; in the first case, it
345     made performance worse.
346    
347     Arguments:
348 nigel 93 eptr pointer to current character in subject
349     ecode pointer to current position in compiled code
350 nigel 77 offset_top current top pointer
351     md pointer to "static" info for the match
352     ims current /i, /m, and /s options
353     eptrb pointer to chain of blocks containing eptr at start of
354     brackets - for testing for empty matches
355     flags can contain
356     match_condassert - this is an assertion condition
357 nigel 93 match_cbegroup - this is the start of an unlimited repeat
358     group that can match an empty string
359     match_tail_recursed - this is a tail_recursed group
360 nigel 87 rdepth the recursion depth
361 nigel 77
362     Returns: MATCH_MATCH if matched ) these values are >= 0
363     MATCH_NOMATCH if failed to match )
364     a negative PCRE_ERROR_xxx value if aborted by an error condition
365 nigel 87 (e.g. stopped by repeated call or recursion limit)
366 nigel 77 */
367    
368     static int
369 nigel 87 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
370 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
371 nigel 91 int flags, unsigned int rdepth)
372 nigel 77 {
373     /* These variables do not need to be preserved over recursion in this function,
374 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
375     "register" because they are used a lot in loops. */
376 nigel 77
377 nigel 91 register int rrc; /* Returns from recursive calls */
378     register int i; /* Used for loops not involving calls to RMATCH() */
379 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
380 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
381 nigel 77
382 nigel 93 BOOL minimize, possessive; /* Quantifier options */
383    
384 nigel 77 /* When recursion is not being used, all "local" variables that have to be
385     preserved over calls to RMATCH() are part of a "frame" which is obtained from
386     heap storage. Set up the top-level frame here; others are obtained from the
387     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
388    
389     #ifdef NO_RECURSE
390     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
391     frame->Xprevframe = NULL; /* Marks the top level */
392    
393     /* Copy in the original argument variables */
394    
395     frame->Xeptr = eptr;
396     frame->Xecode = ecode;
397     frame->Xoffset_top = offset_top;
398     frame->Xims = ims;
399     frame->Xeptrb = eptrb;
400     frame->Xflags = flags;
401 nigel 87 frame->Xrdepth = rdepth;
402 nigel 77
403     /* This is where control jumps back to to effect "recursion" */
404    
405     HEAP_RECURSE:
406    
407     /* Macros make the argument variables come from the current frame */
408    
409     #define eptr frame->Xeptr
410     #define ecode frame->Xecode
411     #define offset_top frame->Xoffset_top
412     #define ims frame->Xims
413     #define eptrb frame->Xeptrb
414     #define flags frame->Xflags
415 nigel 87 #define rdepth frame->Xrdepth
416 nigel 77
417     /* Ditto for the local variables */
418    
419     #ifdef SUPPORT_UTF8
420     #define charptr frame->Xcharptr
421     #endif
422     #define callpat frame->Xcallpat
423     #define data frame->Xdata
424     #define next frame->Xnext
425     #define pp frame->Xpp
426     #define prev frame->Xprev
427     #define saved_eptr frame->Xsaved_eptr
428    
429     #define new_recursive frame->Xnew_recursive
430    
431     #define cur_is_word frame->Xcur_is_word
432     #define condition frame->Xcondition
433     #define prev_is_word frame->Xprev_is_word
434    
435     #define original_ims frame->Xoriginal_ims
436    
437     #ifdef SUPPORT_UCP
438     #define prop_type frame->Xprop_type
439 nigel 87 #define prop_value frame->Xprop_value
440 nigel 77 #define prop_fail_result frame->Xprop_fail_result
441     #define prop_category frame->Xprop_category
442     #define prop_chartype frame->Xprop_chartype
443 nigel 87 #define prop_script frame->Xprop_script
444 nigel 77 #endif
445    
446     #define ctype frame->Xctype
447     #define fc frame->Xfc
448     #define fi frame->Xfi
449     #define length frame->Xlength
450     #define max frame->Xmax
451     #define min frame->Xmin
452     #define number frame->Xnumber
453     #define offset frame->Xoffset
454     #define op frame->Xop
455     #define save_capture_last frame->Xsave_capture_last
456     #define save_offset1 frame->Xsave_offset1
457     #define save_offset2 frame->Xsave_offset2
458     #define save_offset3 frame->Xsave_offset3
459     #define stacksave frame->Xstacksave
460    
461     #define newptrb frame->Xnewptrb
462    
463     /* When recursion is being used, local variables are allocated on the stack and
464     get preserved during recursion in the normal way. In this environment, fi and
465     i, and fc and c, can be the same variables. */
466    
467 nigel 93 #else /* NO_RECURSE not defined */
468 nigel 77 #define fi i
469     #define fc c
470    
471    
472 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
473     const uschar *charptr; /* in small blocks of the code. My normal */
474     #endif /* style of coding would have declared */
475     const uschar *callpat; /* them within each of those blocks. */
476     const uschar *data; /* However, in order to accommodate the */
477     const uschar *next; /* version of this code that uses an */
478     USPTR pp; /* external "stack" implemented on the */
479     const uschar *prev; /* heap, it is easier to declare them all */
480     USPTR saved_eptr; /* here, so the declarations can be cut */
481     /* out in a block. The only declarations */
482     recursion_info new_recursive; /* within blocks below are for variables */
483     /* that do not have to be preserved over */
484     BOOL cur_is_word; /* a recursive call to RMATCH(). */
485     BOOL condition;
486 nigel 77 BOOL prev_is_word;
487    
488     unsigned long int original_ims;
489    
490     #ifdef SUPPORT_UCP
491     int prop_type;
492 nigel 87 int prop_value;
493 nigel 77 int prop_fail_result;
494     int prop_category;
495     int prop_chartype;
496 nigel 87 int prop_script;
497 nigel 77 #endif
498    
499     int ctype;
500     int length;
501     int max;
502     int min;
503     int number;
504     int offset;
505     int op;
506     int save_capture_last;
507     int save_offset1, save_offset2, save_offset3;
508     int stacksave[REC_STACK_SAVE_MAX];
509    
510     eptrblock newptrb;
511 nigel 93 #endif /* NO_RECURSE */
512 nigel 77
513     /* These statements are here to stop the compiler complaining about uninitialized
514     variables. */
515    
516     #ifdef SUPPORT_UCP
517 nigel 87 prop_value = 0;
518 nigel 77 prop_fail_result = 0;
519     #endif
520    
521 nigel 93
522 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
523     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
524     used. Thanks to Ian Taylor for noticing this possibility and sending the
525     original patch. */
526    
527     TAIL_RECURSE:
528    
529 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
530     are specified by the macro RMATCH and RRETURN is used to return. When
531     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
532     and a "return", respectively (possibly with some debugging if DEBUG is
533     defined). However, RMATCH isn't like a function call because it's quite a
534     complicated macro. It has to be used in one particular way. This shouldn't,
535     however, impact performance when true recursion is being used. */
536 nigel 77
537 nigel 87 /* First check that we haven't called match() too many times, or that we
538     haven't exceeded the recursive call limit. */
539    
540 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
541 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
542 nigel 77
543     original_ims = ims; /* Save for resetting on ')' */
544 nigel 91
545     #ifdef SUPPORT_UTF8
546 nigel 77 utf8 = md->utf8; /* Local copy of the flag */
547 nigel 91 #else
548     utf8 = FALSE;
549     #endif
550 nigel 77
551 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
552     string, the match_cbegroup flag is set. When this is the case, add the current
553     subject pointer to the chain of such remembered pointers, to be checked when we
554     hit the closing ket, in order to break infinite loops that match no characters.
555     When match() is called in other circumstances, don't add to the chain. If this
556     is a tail recursion, use a block from the workspace, as the one on the stack is
557     already used. */
558 nigel 77
559 nigel 93 if ((flags & match_cbegroup) != 0)
560 nigel 77 {
561 nigel 93 eptrblock *p;
562     if ((flags & match_tail_recursed) != 0)
563     {
564     if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
565     p = md->eptrchain + md->eptrn++;
566     }
567     else p = &newptrb;
568     p->epb_saved_eptr = eptr;
569     p->epb_prev = eptrb;
570     eptrb = p;
571 nigel 77 }
572    
573 nigel 93 /* Now start processing the opcodes. */
574 nigel 77
575     for (;;)
576     {
577 nigel 93 minimize = possessive = FALSE;
578 nigel 77 op = *ecode;
579    
580     /* For partial matching, remember if we ever hit the end of the subject after
581     matching at least one subject character. */
582    
583     if (md->partial &&
584     eptr >= md->end_subject &&
585     eptr > md->start_match)
586     md->hitend = TRUE;
587    
588 nigel 93 switch(op)
589     {
590     /* Handle a capturing bracket. If there is space in the offset vector, save
591     the current subject position in the working slot at the top of the vector.
592     We mustn't change the current values of the data slot, because they may be
593     set from a previous iteration of this group, and be referred to by a
594     reference inside the group.
595 nigel 77
596 nigel 93 If the bracket fails to match, we need to restore this value and also the
597     values of the final offsets, in case they were set by a previous iteration
598     of the same bracket.
599 nigel 77
600 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
601     a non-capturing bracket. Don't worry about setting the flag for the error
602     case here; that is handled in the code for KET. */
603 nigel 77
604 nigel 93 case OP_CBRA:
605     case OP_SCBRA:
606     number = GET2(ecode, 1+LINK_SIZE);
607 nigel 77 offset = number << 1;
608    
609     #ifdef DEBUG
610 nigel 93 printf("start bracket %d\n", number);
611     printf("subject=");
612 nigel 77 pchars(eptr, 16, TRUE, md);
613     printf("\n");
614     #endif
615    
616     if (offset < md->offset_max)
617     {
618     save_offset1 = md->offset_vector[offset];
619     save_offset2 = md->offset_vector[offset+1];
620     save_offset3 = md->offset_vector[md->offset_end - number];
621     save_capture_last = md->capture_last;
622    
623     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
624     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
625    
626 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
627 nigel 77 do
628     {
629 nigel 93 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
630     ims, eptrb, flags);
631 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
632     md->capture_last = save_capture_last;
633     ecode += GET(ecode, 1);
634     }
635     while (*ecode == OP_ALT);
636    
637     DPRINTF(("bracket %d failed\n", number));
638    
639     md->offset_vector[offset] = save_offset1;
640     md->offset_vector[offset+1] = save_offset2;
641     md->offset_vector[md->offset_end - number] = save_offset3;
642    
643     RRETURN(MATCH_NOMATCH);
644     }
645    
646 nigel 93 /* Insufficient room for saving captured contents. Treat as a non-capturing
647     bracket. */
648 nigel 77
649 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
650 nigel 77
651 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
652     final alternative within the brackets, we would return the result of a
653     recursive call to match() whatever happened. We can reduce stack usage by
654     turning this into a tail recursion. */
655 nigel 77
656 nigel 93 case OP_BRA:
657     case OP_SBRA:
658     DPRINTF(("start non-capturing bracket\n"));
659     flags = (op >= OP_SBRA)? match_cbegroup : 0;
660 nigel 91 for (;;)
661 nigel 77 {
662 nigel 91 if (ecode[GET(ecode, 1)] != OP_ALT)
663 nigel 93 {
664     ecode += _pcre_OP_lengths[*ecode];
665     flags |= match_tail_recursed;
666     DPRINTF(("bracket 0 tail recursion\n"));
667     goto TAIL_RECURSE;
668     }
669 nigel 91
670     /* For non-final alternatives, continue the loop for a NOMATCH result;
671     otherwise return. */
672    
673 nigel 93 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
674     eptrb, flags);
675 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
676     ecode += GET(ecode, 1);
677     }
678 nigel 91 /* Control never reaches here. */
679 nigel 77
680     /* Conditional group: compilation checked that there are no more than
681     two branches. If the condition is false, skipping the first branch takes us
682     past the end if there is only one branch, but that's OK because that is
683 nigel 91 exactly what going to the ket would do. As there is only one branch to be
684     obeyed, we can use tail recursion to avoid using another stack frame. */
685 nigel 77
686     case OP_COND:
687 nigel 93 case OP_SCOND:
688     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
689 nigel 77 {
690 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
691     condition = md->recursive != NULL &&
692     (offset == RREF_ANY || offset == md->recursive->group_num);
693     ecode += condition? 3 : GET(ecode, 1);
694     }
695    
696     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
697     {
698 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
699 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
700     ecode += condition? 3 : GET(ecode, 1);
701 nigel 77 }
702    
703 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
704     {
705     condition = FALSE;
706     ecode += GET(ecode, 1);
707     }
708    
709 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
710 nigel 93 the final argument match_condassert causes it to stop at the end of an
711     assertion. */
712 nigel 77
713     else
714     {
715     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
716 nigel 93 match_condassert);
717 nigel 77 if (rrc == MATCH_MATCH)
718     {
719 nigel 93 condition = TRUE;
720     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
721 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
722     }
723     else if (rrc != MATCH_NOMATCH)
724     {
725     RRETURN(rrc); /* Need braces because of following else */
726     }
727 nigel 93 else
728     {
729     condition = FALSE;
730     ecode += GET(ecode, 1);
731     }
732     }
733 nigel 91
734 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
735     we can use tail recursion to avoid using another stack frame. If the second
736     alternative doesn't exist, we can just plough on. */
737 nigel 91
738 nigel 93 if (condition || *ecode == OP_ALT)
739     {
740 nigel 91 ecode += 1 + LINK_SIZE;
741 nigel 93 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
742 nigel 91 goto TAIL_RECURSE;
743 nigel 77 }
744 nigel 93 else
745     {
746     ecode += 1 + LINK_SIZE;
747     }
748     break;
749 nigel 77
750    
751 nigel 93 /* End of the pattern. If we are in a top-level recursion, we should
752     restore the offsets appropriately and continue from after the call. */
753 nigel 77
754     case OP_END:
755     if (md->recursive != NULL && md->recursive->group_num == 0)
756     {
757     recursion_info *rec = md->recursive;
758 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
759 nigel 77 md->recursive = rec->prevrec;
760     memmove(md->offset_vector, rec->offset_save,
761     rec->saved_max * sizeof(int));
762     md->start_match = rec->save_start;
763     ims = original_ims;
764     ecode = rec->after_call;
765     break;
766     }
767    
768     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
769     string - backtracking will then try other alternatives, if any. */
770    
771     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
772     md->end_match_ptr = eptr; /* Record where we ended */
773     md->end_offset_top = offset_top; /* and how many extracts were taken */
774     RRETURN(MATCH_MATCH);
775    
776     /* Change option settings */
777    
778     case OP_OPT:
779     ims = ecode[1];
780     ecode += 2;
781     DPRINTF(("ims set to %02lx\n", ims));
782     break;
783    
784     /* Assertion brackets. Check the alternative branches in turn - the
785     matching won't pass the KET for an assertion. If any one branch matches,
786     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
787     start of each branch to move the current point backwards, so the code at
788     this level is identical to the lookahead case. */
789    
790     case OP_ASSERT:
791     case OP_ASSERTBACK:
792     do
793     {
794 nigel 93 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
795 nigel 77 if (rrc == MATCH_MATCH) break;
796     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
797     ecode += GET(ecode, 1);
798     }
799     while (*ecode == OP_ALT);
800     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
801    
802     /* If checking an assertion for a condition, return MATCH_MATCH. */
803    
804     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
805    
806     /* Continue from after the assertion, updating the offsets high water
807     mark, since extracts may have been taken during the assertion. */
808    
809     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
810     ecode += 1 + LINK_SIZE;
811     offset_top = md->end_offset_top;
812     continue;
813    
814     /* Negative assertion: all branches must fail to match */
815    
816     case OP_ASSERT_NOT:
817     case OP_ASSERTBACK_NOT:
818     do
819     {
820 nigel 93 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
821 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
822     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
823     ecode += GET(ecode,1);
824     }
825     while (*ecode == OP_ALT);
826    
827     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
828    
829     ecode += 1 + LINK_SIZE;
830     continue;
831    
832     /* Move the subject pointer back. This occurs only at the start of
833     each branch of a lookbehind assertion. If we are too close to the start to
834     move back, this match function fails. When working with UTF-8 we move
835     back a number of characters, not bytes. */
836    
837     case OP_REVERSE:
838     #ifdef SUPPORT_UTF8
839     if (utf8)
840     {
841 nigel 93 i = GET(ecode, 1);
842     while (i-- > 0)
843 nigel 77 {
844     eptr--;
845     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
846     BACKCHAR(eptr)
847     }
848     }
849     else
850     #endif
851    
852     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
853    
854     {
855 nigel 93 eptr -= GET(ecode, 1);
856 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
857     }
858    
859     /* Skip to next op code */
860    
861     ecode += 1 + LINK_SIZE;
862     break;
863    
864     /* The callout item calls an external function, if one is provided, passing
865     details of the match so far. This is mainly for debugging, though the
866     function is able to force a failure. */
867    
868     case OP_CALLOUT:
869     if (pcre_callout != NULL)
870     {
871     pcre_callout_block cb;
872     cb.version = 1; /* Version 1 of the callout block */
873     cb.callout_number = ecode[1];
874     cb.offset_vector = md->offset_vector;
875 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
876 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
877     cb.start_match = md->start_match - md->start_subject;
878     cb.current_position = eptr - md->start_subject;
879     cb.pattern_position = GET(ecode, 2);
880     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
881     cb.capture_top = offset_top/2;
882     cb.capture_last = md->capture_last;
883     cb.callout_data = md->callout_data;
884     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
885     if (rrc < 0) RRETURN(rrc);
886     }
887     ecode += 2 + 2*LINK_SIZE;
888     break;
889    
890     /* Recursion either matches the current regex, or some subexpression. The
891     offset data is the offset to the starting bracket from the start of the
892     whole pattern. (This is so that it works from duplicated subpatterns.)
893    
894     If there are any capturing brackets started but not finished, we have to
895     save their starting points and reinstate them after the recursion. However,
896     we don't know how many such there are (offset_top records the completed
897     total) so we just have to save all the potential data. There may be up to
898     65535 such values, which is too large to put on the stack, but using malloc
899     for small numbers seems expensive. As a compromise, the stack is used when
900     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
901     is used. A problem is what to do if the malloc fails ... there is no way of
902     returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
903     values on the stack, and accept that the rest may be wrong.
904    
905     There are also other values that have to be saved. We use a chained
906     sequence of blocks that actually live on the stack. Thanks to Robin Houston
907     for the original version of this logic. */
908    
909     case OP_RECURSE:
910     {
911     callpat = md->start_code + GET(ecode, 1);
912 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
913     GET2(callpat, 1 + LINK_SIZE);
914 nigel 77
915     /* Add to "recursing stack" */
916    
917     new_recursive.prevrec = md->recursive;
918     md->recursive = &new_recursive;
919    
920     /* Find where to continue from afterwards */
921    
922     ecode += 1 + LINK_SIZE;
923     new_recursive.after_call = ecode;
924    
925     /* Now save the offset data. */
926    
927     new_recursive.saved_max = md->offset_end;
928     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
929     new_recursive.offset_save = stacksave;
930     else
931     {
932     new_recursive.offset_save =
933     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
934     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
935     }
936    
937     memcpy(new_recursive.offset_save, md->offset_vector,
938     new_recursive.saved_max * sizeof(int));
939     new_recursive.save_start = md->start_match;
940     md->start_match = eptr;
941    
942     /* OK, now we can do the recursion. For each top-level alternative we
943     restore the offset and recursion data. */
944    
945     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
946 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
947 nigel 77 do
948     {
949 nigel 93 RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
950     md, ims, eptrb, flags);
951 nigel 77 if (rrc == MATCH_MATCH)
952     {
953 nigel 87 DPRINTF(("Recursion matched\n"));
954 nigel 77 md->recursive = new_recursive.prevrec;
955     if (new_recursive.offset_save != stacksave)
956     (pcre_free)(new_recursive.offset_save);
957     RRETURN(MATCH_MATCH);
958     }
959 nigel 87 else if (rrc != MATCH_NOMATCH)
960     {
961     DPRINTF(("Recursion gave error %d\n", rrc));
962     RRETURN(rrc);
963     }
964 nigel 77
965     md->recursive = &new_recursive;
966     memcpy(md->offset_vector, new_recursive.offset_save,
967     new_recursive.saved_max * sizeof(int));
968     callpat += GET(callpat, 1);
969     }
970     while (*callpat == OP_ALT);
971    
972     DPRINTF(("Recursion didn't match\n"));
973     md->recursive = new_recursive.prevrec;
974     if (new_recursive.offset_save != stacksave)
975     (pcre_free)(new_recursive.offset_save);
976     RRETURN(MATCH_NOMATCH);
977     }
978     /* Control never reaches here */
979    
980     /* "Once" brackets are like assertion brackets except that after a match,
981     the point in the subject string is not moved back. Thus there can never be
982     a move back into the brackets. Friedl calls these "atomic" subpatterns.
983     Check the alternative branches in turn - the matching won't pass the KET
984     for this kind of subpattern. If any one branch matches, we carry on as at
985     the end of a normal bracket, leaving the subject pointer. */
986    
987     case OP_ONCE:
988 nigel 91 prev = ecode;
989     saved_eptr = eptr;
990    
991     do
992 nigel 77 {
993 nigel 91 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
994 nigel 93 eptrb, 0);
995 nigel 91 if (rrc == MATCH_MATCH) break;
996     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
997     ecode += GET(ecode,1);
998     }
999     while (*ecode == OP_ALT);
1000 nigel 77
1001 nigel 91 /* If hit the end of the group (which could be repeated), fail */
1002 nigel 77
1003 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1004 nigel 77
1005 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1006     mark, since extracts may have been taken. */
1007 nigel 77
1008 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1009 nigel 77
1010 nigel 91 offset_top = md->end_offset_top;
1011     eptr = md->end_match_ptr;
1012 nigel 77
1013 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1014     happens for a repeating ket if no characters were matched in the group.
1015     This is the forcible breaking of infinite loops as implemented in Perl
1016     5.005. If there is an options reset, it will get obeyed in the normal
1017     course of events. */
1018 nigel 77
1019 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1020     {
1021     ecode += 1+LINK_SIZE;
1022     break;
1023     }
1024 nigel 77
1025 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1026     preceding bracket, in the appropriate order. The second "call" of match()
1027     uses tail recursion, to avoid using another stack frame. We need to reset
1028     any options that changed within the bracket before re-running it, so
1029     check the next opcode. */
1030 nigel 77
1031 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1032     {
1033     ims = (ims & ~PCRE_IMS) | ecode[4];
1034     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1035     }
1036 nigel 77
1037 nigel 91 if (*ecode == OP_KETRMIN)
1038     {
1039     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1040     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1041     ecode = prev;
1042 nigel 93 flags = match_tail_recursed;
1043 nigel 91 goto TAIL_RECURSE;
1044 nigel 77 }
1045 nigel 91 else /* OP_KETRMAX */
1046     {
1047 nigel 93 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
1048 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1049     ecode += 1 + LINK_SIZE;
1050 nigel 93 flags = match_tail_recursed;
1051 nigel 91 goto TAIL_RECURSE;
1052     }
1053     /* Control never gets here */
1054 nigel 77
1055     /* An alternation is the end of a branch; scan along to find the end of the
1056     bracketed group and go to there. */
1057    
1058     case OP_ALT:
1059     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1060     break;
1061    
1062     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1063     that it may occur zero times. It may repeat infinitely, or not at all -
1064     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1065     repeat limits are compiled as a number of copies, with the optional ones
1066     preceded by BRAZERO or BRAMINZERO. */
1067    
1068     case OP_BRAZERO:
1069     {
1070     next = ecode+1;
1071 nigel 93 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
1072 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1073     do next += GET(next,1); while (*next == OP_ALT);
1074 nigel 93 ecode = next + 1 + LINK_SIZE;
1075 nigel 77 }
1076     break;
1077    
1078     case OP_BRAMINZERO:
1079     {
1080     next = ecode+1;
1081 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1082     RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1083 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1084     ecode++;
1085     }
1086     break;
1087    
1088 nigel 93 /* End of a group, repeated or non-repeating. */
1089 nigel 77
1090     case OP_KET:
1091     case OP_KETRMIN:
1092     case OP_KETRMAX:
1093 nigel 91 prev = ecode - GET(ecode, 1);
1094 nigel 77
1095 nigel 93 /* If this was a group that remembered the subject start, in order to break
1096     infinite repeats of empty string matches, retrieve the subject start from
1097     the chain. Otherwise, set it NULL. */
1098 nigel 77
1099 nigel 93 if (*prev >= OP_SBRA)
1100     {
1101     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1102     eptrb = eptrb->epb_prev; /* Backup to previous group */
1103     }
1104     else saved_eptr = NULL;
1105 nigel 77
1106 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1107     MATCH_MATCH, but record the current high water mark for use by positive
1108     assertions. Do this also for the "once" (atomic) groups. */
1109    
1110 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1111     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1112     *prev == OP_ONCE)
1113     {
1114     md->end_match_ptr = eptr; /* For ONCE */
1115     md->end_offset_top = offset_top;
1116     RRETURN(MATCH_MATCH);
1117     }
1118 nigel 77
1119 nigel 93 /* For capturing groups we have to check the group number back at the start
1120     and if necessary complete handling an extraction by setting the offsets and
1121     bumping the high water mark. Note that whole-pattern recursion is coded as
1122     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1123     when the OP_END is reached. Other recursion is handled here. */
1124 nigel 77
1125 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1126 nigel 91 {
1127 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1128 nigel 91 offset = number << 1;
1129 nigel 77
1130     #ifdef DEBUG
1131 nigel 91 printf("end bracket %d", number);
1132     printf("\n");
1133 nigel 77 #endif
1134    
1135 nigel 93 md->capture_last = number;
1136     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1137 nigel 91 {
1138 nigel 93 md->offset_vector[offset] =
1139     md->offset_vector[md->offset_end - number];
1140     md->offset_vector[offset+1] = eptr - md->start_subject;
1141     if (offset_top <= offset) offset_top = offset + 2;
1142     }
1143 nigel 77
1144 nigel 93 /* Handle a recursively called group. Restore the offsets
1145     appropriately and continue from after the call. */
1146 nigel 77
1147 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1148     {
1149     recursion_info *rec = md->recursive;
1150     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1151     md->recursive = rec->prevrec;
1152     md->start_match = rec->save_start;
1153     memcpy(md->offset_vector, rec->offset_save,
1154     rec->saved_max * sizeof(int));
1155     ecode = rec->after_call;
1156     ims = original_ims;
1157     break;
1158 nigel 77 }
1159 nigel 91 }
1160 nigel 77
1161 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1162     flags, in case they got changed during the group. */
1163 nigel 77
1164 nigel 91 ims = original_ims;
1165     DPRINTF(("ims reset to %02lx\n", ims));
1166 nigel 77
1167 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1168     happens for a repeating ket if no characters were matched in the group.
1169     This is the forcible breaking of infinite loops as implemented in Perl
1170     5.005. If there is an options reset, it will get obeyed in the normal
1171     course of events. */
1172 nigel 77
1173 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1174     {
1175     ecode += 1 + LINK_SIZE;
1176     break;
1177     }
1178 nigel 77
1179 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1180     preceding bracket, in the appropriate order. In the second case, we can use
1181     tail recursion to avoid using another stack frame. */
1182 nigel 77
1183 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1184    
1185 nigel 91 if (*ecode == OP_KETRMIN)
1186     {
1187     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1188     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1189     ecode = prev;
1190 nigel 93 flags |= match_tail_recursed;
1191 nigel 91 goto TAIL_RECURSE;
1192 nigel 77 }
1193 nigel 91 else /* OP_KETRMAX */
1194     {
1195 nigel 93 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
1196 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1197     ecode += 1 + LINK_SIZE;
1198 nigel 93 flags = match_tail_recursed;
1199 nigel 91 goto TAIL_RECURSE;
1200     }
1201     /* Control never gets here */
1202 nigel 77
1203     /* Start of subject unless notbol, or after internal newline if multiline */
1204    
1205     case OP_CIRC:
1206     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1207     if ((ims & PCRE_MULTILINE) != 0)
1208     {
1209 nigel 91 if (eptr != md->start_subject &&
1210 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1211 nigel 77 RRETURN(MATCH_NOMATCH);
1212     ecode++;
1213     break;
1214     }
1215     /* ... else fall through */
1216    
1217     /* Start of subject assertion */
1218    
1219     case OP_SOD:
1220     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1221     ecode++;
1222     break;
1223    
1224     /* Start of match assertion */
1225    
1226     case OP_SOM:
1227     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1228     ecode++;
1229     break;
1230    
1231     /* Assert before internal newline if multiline, or before a terminating
1232     newline unless endonly is set, else end of subject unless noteol is set. */
1233    
1234     case OP_DOLL:
1235     if ((ims & PCRE_MULTILINE) != 0)
1236     {
1237     if (eptr < md->end_subject)
1238 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1239 nigel 77 else
1240     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1241     ecode++;
1242     break;
1243     }
1244     else
1245     {
1246     if (md->noteol) RRETURN(MATCH_NOMATCH);
1247     if (!md->endonly)
1248     {
1249 nigel 91 if (eptr != md->end_subject &&
1250 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1251 nigel 77 RRETURN(MATCH_NOMATCH);
1252     ecode++;
1253     break;
1254     }
1255     }
1256 nigel 91 /* ... else fall through for endonly */
1257 nigel 77
1258     /* End of subject assertion (\z) */
1259    
1260     case OP_EOD:
1261     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1262     ecode++;
1263     break;
1264    
1265     /* End of subject or ending \n assertion (\Z) */
1266    
1267     case OP_EODN:
1268 nigel 91 if (eptr != md->end_subject &&
1269 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1270 nigel 91 RRETURN(MATCH_NOMATCH);
1271 nigel 77 ecode++;
1272     break;
1273    
1274     /* Word boundary assertions */
1275    
1276     case OP_NOT_WORD_BOUNDARY:
1277     case OP_WORD_BOUNDARY:
1278     {
1279    
1280     /* Find out if the previous and current characters are "word" characters.
1281     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1282     be "non-word" characters. */
1283    
1284     #ifdef SUPPORT_UTF8
1285     if (utf8)
1286     {
1287     if (eptr == md->start_subject) prev_is_word = FALSE; else
1288     {
1289     const uschar *lastptr = eptr - 1;
1290     while((*lastptr & 0xc0) == 0x80) lastptr--;
1291     GETCHAR(c, lastptr);
1292     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1293     }
1294     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1295     {
1296     GETCHAR(c, eptr);
1297     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1298     }
1299     }
1300     else
1301     #endif
1302    
1303     /* More streamlined when not in UTF-8 mode */
1304    
1305     {
1306     prev_is_word = (eptr != md->start_subject) &&
1307     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1308     cur_is_word = (eptr < md->end_subject) &&
1309     ((md->ctypes[*eptr] & ctype_word) != 0);
1310     }
1311    
1312     /* Now see if the situation is what we want */
1313    
1314     if ((*ecode++ == OP_WORD_BOUNDARY)?
1315     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1316     RRETURN(MATCH_NOMATCH);
1317     }
1318     break;
1319    
1320     /* Match a single character type; inline for speed */
1321    
1322     case OP_ANY:
1323 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1324     {
1325 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1326 nigel 91 }
1327 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1328     if (utf8)
1329     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1330     ecode++;
1331     break;
1332    
1333     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1334     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1335    
1336     case OP_ANYBYTE:
1337     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1338     ecode++;
1339     break;
1340    
1341     case OP_NOT_DIGIT:
1342     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1343     GETCHARINCTEST(c, eptr);
1344     if (
1345     #ifdef SUPPORT_UTF8
1346     c < 256 &&
1347     #endif
1348     (md->ctypes[c] & ctype_digit) != 0
1349     )
1350     RRETURN(MATCH_NOMATCH);
1351     ecode++;
1352     break;
1353    
1354     case OP_DIGIT:
1355     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1356     GETCHARINCTEST(c, eptr);
1357     if (
1358     #ifdef SUPPORT_UTF8
1359     c >= 256 ||
1360     #endif
1361     (md->ctypes[c] & ctype_digit) == 0
1362     )
1363     RRETURN(MATCH_NOMATCH);
1364     ecode++;
1365     break;
1366    
1367     case OP_NOT_WHITESPACE:
1368     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1369     GETCHARINCTEST(c, eptr);
1370     if (
1371     #ifdef SUPPORT_UTF8
1372     c < 256 &&
1373     #endif
1374     (md->ctypes[c] & ctype_space) != 0
1375     )
1376     RRETURN(MATCH_NOMATCH);
1377     ecode++;
1378     break;
1379    
1380     case OP_WHITESPACE:
1381     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1382     GETCHARINCTEST(c, eptr);
1383     if (
1384     #ifdef SUPPORT_UTF8
1385     c >= 256 ||
1386     #endif
1387     (md->ctypes[c] & ctype_space) == 0
1388     )
1389     RRETURN(MATCH_NOMATCH);
1390     ecode++;
1391     break;
1392    
1393     case OP_NOT_WORDCHAR:
1394     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1395     GETCHARINCTEST(c, eptr);
1396     if (
1397     #ifdef SUPPORT_UTF8
1398     c < 256 &&
1399     #endif
1400     (md->ctypes[c] & ctype_word) != 0
1401     )
1402     RRETURN(MATCH_NOMATCH);
1403     ecode++;
1404     break;
1405    
1406     case OP_WORDCHAR:
1407     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1408     GETCHARINCTEST(c, eptr);
1409     if (
1410     #ifdef SUPPORT_UTF8
1411     c >= 256 ||
1412     #endif
1413     (md->ctypes[c] & ctype_word) == 0
1414     )
1415     RRETURN(MATCH_NOMATCH);
1416     ecode++;
1417     break;
1418    
1419 nigel 93 case OP_ANYNL:
1420     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1421     GETCHARINCTEST(c, eptr);
1422     switch(c)
1423     {
1424     default: RRETURN(MATCH_NOMATCH);
1425     case 0x000d:
1426     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1427     break;
1428     case 0x000a:
1429     case 0x000b:
1430     case 0x000c:
1431     case 0x0085:
1432     case 0x2028:
1433     case 0x2029:
1434     break;
1435     }
1436     ecode++;
1437     break;
1438    
1439 nigel 77 #ifdef SUPPORT_UCP
1440     /* Check the next character by Unicode property. We will get here only
1441     if the support is in the binary; otherwise a compile-time error occurs. */
1442    
1443     case OP_PROP:
1444     case OP_NOTPROP:
1445     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1446     GETCHARINCTEST(c, eptr);
1447     {
1448 nigel 87 int chartype, script;
1449     int category = _pcre_ucp_findprop(c, &chartype, &script);
1450 nigel 77
1451 nigel 87 switch(ecode[1])
1452     {
1453     case PT_ANY:
1454     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1455     break;
1456 nigel 77
1457 nigel 87 case PT_LAMP:
1458     if ((chartype == ucp_Lu ||
1459     chartype == ucp_Ll ||
1460     chartype == ucp_Lt) == (op == OP_NOTPROP))
1461 nigel 77 RRETURN(MATCH_NOMATCH);
1462 nigel 87 break;
1463    
1464     case PT_GC:
1465     if ((ecode[2] != category) == (op == OP_PROP))
1466 nigel 77 RRETURN(MATCH_NOMATCH);
1467 nigel 87 break;
1468    
1469     case PT_PC:
1470     if ((ecode[2] != chartype) == (op == OP_PROP))
1471     RRETURN(MATCH_NOMATCH);
1472     break;
1473    
1474     case PT_SC:
1475     if ((ecode[2] != script) == (op == OP_PROP))
1476     RRETURN(MATCH_NOMATCH);
1477     break;
1478    
1479     default:
1480     RRETURN(PCRE_ERROR_INTERNAL);
1481 nigel 77 }
1482 nigel 87
1483     ecode += 3;
1484 nigel 77 }
1485     break;
1486    
1487     /* Match an extended Unicode sequence. We will get here only if the support
1488     is in the binary; otherwise a compile-time error occurs. */
1489    
1490     case OP_EXTUNI:
1491     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1492     GETCHARINCTEST(c, eptr);
1493     {
1494 nigel 87 int chartype, script;
1495     int category = _pcre_ucp_findprop(c, &chartype, &script);
1496 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1497     while (eptr < md->end_subject)
1498     {
1499     int len = 1;
1500     if (!utf8) c = *eptr; else
1501     {
1502     GETCHARLEN(c, eptr, len);
1503     }
1504 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1505 nigel 77 if (category != ucp_M) break;
1506     eptr += len;
1507     }
1508     }
1509     ecode++;
1510     break;
1511     #endif
1512    
1513    
1514     /* Match a back reference, possibly repeatedly. Look past the end of the
1515     item to see if there is repeat information following. The code is similar
1516     to that for character classes, but repeated for efficiency. Then obey
1517     similar code to character type repeats - written out again for speed.
1518     However, if the referenced string is the empty string, always treat
1519     it as matched, any number of times (otherwise there could be infinite
1520     loops). */
1521    
1522     case OP_REF:
1523     {
1524     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1525     ecode += 3; /* Advance past item */
1526    
1527     /* If the reference is unset, set the length to be longer than the amount
1528     of subject left; this ensures that every attempt at a match fails. We
1529     can't just fail here, because of the possibility of quantifiers with zero
1530     minima. */
1531    
1532     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1533     md->end_subject - eptr + 1 :
1534     md->offset_vector[offset+1] - md->offset_vector[offset];
1535    
1536     /* Set up for repetition, or handle the non-repeated case */
1537    
1538     switch (*ecode)
1539     {
1540     case OP_CRSTAR:
1541     case OP_CRMINSTAR:
1542     case OP_CRPLUS:
1543     case OP_CRMINPLUS:
1544     case OP_CRQUERY:
1545     case OP_CRMINQUERY:
1546     c = *ecode++ - OP_CRSTAR;
1547     minimize = (c & 1) != 0;
1548     min = rep_min[c]; /* Pick up values from tables; */
1549     max = rep_max[c]; /* zero for max => infinity */
1550     if (max == 0) max = INT_MAX;
1551     break;
1552    
1553     case OP_CRRANGE:
1554     case OP_CRMINRANGE:
1555     minimize = (*ecode == OP_CRMINRANGE);
1556     min = GET2(ecode, 1);
1557     max = GET2(ecode, 3);
1558     if (max == 0) max = INT_MAX;
1559     ecode += 5;
1560     break;
1561    
1562     default: /* No repeat follows */
1563     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1564     eptr += length;
1565     continue; /* With the main loop */
1566     }
1567    
1568     /* If the length of the reference is zero, just continue with the
1569     main loop. */
1570    
1571     if (length == 0) continue;
1572    
1573     /* First, ensure the minimum number of matches are present. We get back
1574     the length of the reference string explicitly rather than passing the
1575     address of eptr, so that eptr can be a register variable. */
1576    
1577     for (i = 1; i <= min; i++)
1578     {
1579     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1580     eptr += length;
1581     }
1582    
1583     /* If min = max, continue at the same level without recursion.
1584     They are not both allowed to be zero. */
1585    
1586     if (min == max) continue;
1587    
1588     /* If minimizing, keep trying and advancing the pointer */
1589    
1590     if (minimize)
1591     {
1592     for (fi = min;; fi++)
1593     {
1594     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1595     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1596     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1597     RRETURN(MATCH_NOMATCH);
1598     eptr += length;
1599     }
1600     /* Control never gets here */
1601     }
1602    
1603     /* If maximizing, find the longest string and work backwards */
1604    
1605     else
1606     {
1607     pp = eptr;
1608     for (i = min; i < max; i++)
1609     {
1610     if (!match_ref(offset, eptr, length, md, ims)) break;
1611     eptr += length;
1612     }
1613     while (eptr >= pp)
1614     {
1615     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1616     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1617     eptr -= length;
1618     }
1619     RRETURN(MATCH_NOMATCH);
1620     }
1621     }
1622     /* Control never gets here */
1623    
1624    
1625    
1626     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1627     used when all the characters in the class have values in the range 0-255,
1628     and either the matching is caseful, or the characters are in the range
1629     0-127 when UTF-8 processing is enabled. The only difference between
1630     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1631     encountered.
1632    
1633     First, look past the end of the item to see if there is repeat information
1634     following. Then obey similar code to character type repeats - written out
1635     again for speed. */
1636    
1637     case OP_NCLASS:
1638     case OP_CLASS:
1639     {
1640     data = ecode + 1; /* Save for matching */
1641     ecode += 33; /* Advance past the item */
1642    
1643     switch (*ecode)
1644     {
1645     case OP_CRSTAR:
1646     case OP_CRMINSTAR:
1647     case OP_CRPLUS:
1648     case OP_CRMINPLUS:
1649     case OP_CRQUERY:
1650     case OP_CRMINQUERY:
1651     c = *ecode++ - OP_CRSTAR;
1652     minimize = (c & 1) != 0;
1653     min = rep_min[c]; /* Pick up values from tables; */
1654     max = rep_max[c]; /* zero for max => infinity */
1655     if (max == 0) max = INT_MAX;
1656     break;
1657    
1658     case OP_CRRANGE:
1659     case OP_CRMINRANGE:
1660     minimize = (*ecode == OP_CRMINRANGE);
1661     min = GET2(ecode, 1);
1662     max = GET2(ecode, 3);
1663     if (max == 0) max = INT_MAX;
1664     ecode += 5;
1665     break;
1666    
1667     default: /* No repeat follows */
1668     min = max = 1;
1669     break;
1670     }
1671    
1672     /* First, ensure the minimum number of matches are present. */
1673    
1674     #ifdef SUPPORT_UTF8
1675     /* UTF-8 mode */
1676     if (utf8)
1677     {
1678     for (i = 1; i <= min; i++)
1679     {
1680     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1681     GETCHARINC(c, eptr);
1682     if (c > 255)
1683     {
1684     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1685     }
1686     else
1687     {
1688     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1689     }
1690     }
1691     }
1692     else
1693     #endif
1694     /* Not UTF-8 mode */
1695     {
1696     for (i = 1; i <= min; i++)
1697     {
1698     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1699     c = *eptr++;
1700     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1701     }
1702     }
1703    
1704     /* If max == min we can continue with the main loop without the
1705     need to recurse. */
1706    
1707     if (min == max) continue;
1708    
1709     /* If minimizing, keep testing the rest of the expression and advancing
1710     the pointer while it matches the class. */
1711    
1712     if (minimize)
1713     {
1714     #ifdef SUPPORT_UTF8
1715     /* UTF-8 mode */
1716     if (utf8)
1717     {
1718     for (fi = min;; fi++)
1719     {
1720     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1721     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1722     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1723     GETCHARINC(c, eptr);
1724     if (c > 255)
1725     {
1726     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1727     }
1728     else
1729     {
1730     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1731     }
1732     }
1733     }
1734     else
1735     #endif
1736     /* Not UTF-8 mode */
1737     {
1738     for (fi = min;; fi++)
1739     {
1740     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1741     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1742     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1743     c = *eptr++;
1744     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1745     }
1746     }
1747     /* Control never gets here */
1748     }
1749    
1750     /* If maximizing, find the longest possible run, then work backwards. */
1751    
1752     else
1753     {
1754     pp = eptr;
1755    
1756     #ifdef SUPPORT_UTF8
1757     /* UTF-8 mode */
1758     if (utf8)
1759     {
1760     for (i = min; i < max; i++)
1761     {
1762     int len = 1;
1763     if (eptr >= md->end_subject) break;
1764     GETCHARLEN(c, eptr, len);
1765     if (c > 255)
1766     {
1767     if (op == OP_CLASS) break;
1768     }
1769     else
1770     {
1771     if ((data[c/8] & (1 << (c&7))) == 0) break;
1772     }
1773     eptr += len;
1774     }
1775     for (;;)
1776     {
1777     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1778     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1779     if (eptr-- == pp) break; /* Stop if tried at original pos */
1780     BACKCHAR(eptr);
1781     }
1782     }
1783     else
1784     #endif
1785     /* Not UTF-8 mode */
1786     {
1787     for (i = min; i < max; i++)
1788     {
1789     if (eptr >= md->end_subject) break;
1790     c = *eptr;
1791     if ((data[c/8] & (1 << (c&7))) == 0) break;
1792     eptr++;
1793     }
1794     while (eptr >= pp)
1795     {
1796     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1797 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1798 nigel 77 eptr--;
1799     }
1800     }
1801    
1802     RRETURN(MATCH_NOMATCH);
1803     }
1804     }
1805     /* Control never gets here */
1806    
1807    
1808     /* Match an extended character class. This opcode is encountered only
1809     in UTF-8 mode, because that's the only time it is compiled. */
1810    
1811     #ifdef SUPPORT_UTF8
1812     case OP_XCLASS:
1813     {
1814     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1815     ecode += GET(ecode, 1); /* Advance past the item */
1816    
1817     switch (*ecode)
1818     {
1819     case OP_CRSTAR:
1820     case OP_CRMINSTAR:
1821     case OP_CRPLUS:
1822     case OP_CRMINPLUS:
1823     case OP_CRQUERY:
1824     case OP_CRMINQUERY:
1825     c = *ecode++ - OP_CRSTAR;
1826     minimize = (c & 1) != 0;
1827     min = rep_min[c]; /* Pick up values from tables; */
1828     max = rep_max[c]; /* zero for max => infinity */
1829     if (max == 0) max = INT_MAX;
1830     break;
1831    
1832     case OP_CRRANGE:
1833     case OP_CRMINRANGE:
1834     minimize = (*ecode == OP_CRMINRANGE);
1835     min = GET2(ecode, 1);
1836     max = GET2(ecode, 3);
1837     if (max == 0) max = INT_MAX;
1838     ecode += 5;
1839     break;
1840    
1841     default: /* No repeat follows */
1842     min = max = 1;
1843     break;
1844     }
1845    
1846     /* First, ensure the minimum number of matches are present. */
1847    
1848     for (i = 1; i <= min; i++)
1849     {
1850     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1851     GETCHARINC(c, eptr);
1852     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1853     }
1854    
1855     /* If max == min we can continue with the main loop without the
1856     need to recurse. */
1857    
1858     if (min == max) continue;
1859    
1860     /* If minimizing, keep testing the rest of the expression and advancing
1861     the pointer while it matches the class. */
1862    
1863     if (minimize)
1864     {
1865     for (fi = min;; fi++)
1866     {
1867     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1868     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1869     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1870     GETCHARINC(c, eptr);
1871     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1872     }
1873     /* Control never gets here */
1874     }
1875    
1876     /* If maximizing, find the longest possible run, then work backwards. */
1877    
1878     else
1879     {
1880     pp = eptr;
1881     for (i = min; i < max; i++)
1882     {
1883     int len = 1;
1884     if (eptr >= md->end_subject) break;
1885     GETCHARLEN(c, eptr, len);
1886     if (!_pcre_xclass(c, data)) break;
1887     eptr += len;
1888     }
1889     for(;;)
1890     {
1891     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1892     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1893     if (eptr-- == pp) break; /* Stop if tried at original pos */
1894     BACKCHAR(eptr)
1895     }
1896     RRETURN(MATCH_NOMATCH);
1897     }
1898    
1899     /* Control never gets here */
1900     }
1901     #endif /* End of XCLASS */
1902    
1903     /* Match a single character, casefully */
1904    
1905     case OP_CHAR:
1906     #ifdef SUPPORT_UTF8
1907     if (utf8)
1908     {
1909     length = 1;
1910     ecode++;
1911     GETCHARLEN(fc, ecode, length);
1912     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1913     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1914     }
1915     else
1916     #endif
1917    
1918     /* Non-UTF-8 mode */
1919     {
1920     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1921     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1922     ecode += 2;
1923     }
1924     break;
1925    
1926     /* Match a single character, caselessly */
1927    
1928     case OP_CHARNC:
1929     #ifdef SUPPORT_UTF8
1930     if (utf8)
1931     {
1932     length = 1;
1933     ecode++;
1934     GETCHARLEN(fc, ecode, length);
1935    
1936     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1937    
1938     /* If the pattern character's value is < 128, we have only one byte, and
1939     can use the fast lookup table. */
1940    
1941     if (fc < 128)
1942     {
1943     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1944     }
1945    
1946     /* Otherwise we must pick up the subject character */
1947    
1948     else
1949     {
1950 nigel 93 unsigned int dc;
1951 nigel 77 GETCHARINC(dc, eptr);
1952     ecode += length;
1953    
1954     /* If we have Unicode property support, we can use it to test the other
1955 nigel 87 case of the character, if there is one. */
1956 nigel 77
1957     if (fc != dc)
1958     {
1959     #ifdef SUPPORT_UCP
1960 nigel 87 if (dc != _pcre_ucp_othercase(fc))
1961 nigel 77 #endif
1962     RRETURN(MATCH_NOMATCH);
1963     }
1964     }
1965     }
1966     else
1967     #endif /* SUPPORT_UTF8 */
1968    
1969     /* Non-UTF-8 mode */
1970     {
1971     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1972     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1973     ecode += 2;
1974     }
1975     break;
1976    
1977 nigel 93 /* Match a single character repeatedly. */
1978 nigel 77
1979     case OP_EXACT:
1980     min = max = GET2(ecode, 1);
1981     ecode += 3;
1982     goto REPEATCHAR;
1983    
1984 nigel 93 case OP_POSUPTO:
1985     possessive = TRUE;
1986     /* Fall through */
1987    
1988 nigel 77 case OP_UPTO:
1989     case OP_MINUPTO:
1990     min = 0;
1991     max = GET2(ecode, 1);
1992     minimize = *ecode == OP_MINUPTO;
1993     ecode += 3;
1994     goto REPEATCHAR;
1995    
1996 nigel 93 case OP_POSSTAR:
1997     possessive = TRUE;
1998     min = 0;
1999     max = INT_MAX;
2000     ecode++;
2001     goto REPEATCHAR;
2002    
2003     case OP_POSPLUS:
2004     possessive = TRUE;
2005     min = 1;
2006     max = INT_MAX;
2007     ecode++;
2008     goto REPEATCHAR;
2009    
2010     case OP_POSQUERY:
2011     possessive = TRUE;
2012     min = 0;
2013     max = 1;
2014     ecode++;
2015     goto REPEATCHAR;
2016    
2017 nigel 77 case OP_STAR:
2018     case OP_MINSTAR:
2019     case OP_PLUS:
2020     case OP_MINPLUS:
2021     case OP_QUERY:
2022     case OP_MINQUERY:
2023     c = *ecode++ - OP_STAR;
2024     minimize = (c & 1) != 0;
2025     min = rep_min[c]; /* Pick up values from tables; */
2026     max = rep_max[c]; /* zero for max => infinity */
2027     if (max == 0) max = INT_MAX;
2028    
2029     /* Common code for all repeated single-character matches. We can give
2030     up quickly if there are fewer than the minimum number of characters left in
2031     the subject. */
2032    
2033     REPEATCHAR:
2034     #ifdef SUPPORT_UTF8
2035     if (utf8)
2036     {
2037     length = 1;
2038     charptr = ecode;
2039     GETCHARLEN(fc, ecode, length);
2040     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2041     ecode += length;
2042    
2043     /* Handle multibyte character matching specially here. There is
2044     support for caseless matching if UCP support is present. */
2045    
2046     if (length > 1)
2047     {
2048     int oclength = 0;
2049     uschar occhars[8];
2050    
2051     #ifdef SUPPORT_UCP
2052 nigel 93 unsigned int othercase;
2053 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2054 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2055 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2056     #endif /* SUPPORT_UCP */
2057    
2058     for (i = 1; i <= min; i++)
2059     {
2060     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2061     /* Need braces because of following else */
2062     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2063     else
2064     {
2065     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2066     eptr += oclength;
2067     }
2068     }
2069    
2070     if (min == max) continue;
2071    
2072     if (minimize)
2073     {
2074     for (fi = min;; fi++)
2075     {
2076     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2077     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2078     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2079     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2080     /* Need braces because of following else */
2081     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2082     else
2083     {
2084     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2085     eptr += oclength;
2086     }
2087     }
2088     /* Control never gets here */
2089     }
2090 nigel 93
2091     else /* Maximize */
2092 nigel 77 {
2093     pp = eptr;
2094     for (i = min; i < max; i++)
2095     {
2096     if (eptr > md->end_subject - length) break;
2097     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2098     else if (oclength == 0) break;
2099     else
2100     {
2101     if (memcmp(eptr, occhars, oclength) != 0) break;
2102     eptr += oclength;
2103     }
2104     }
2105 nigel 93
2106     if (possessive) continue;
2107 nigel 77 while (eptr >= pp)
2108     {
2109     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2110     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2111     eptr -= length;
2112     }
2113     RRETURN(MATCH_NOMATCH);
2114     }
2115     /* Control never gets here */
2116     }
2117    
2118     /* If the length of a UTF-8 character is 1, we fall through here, and
2119     obey the code as for non-UTF-8 characters below, though in this case the
2120     value of fc will always be < 128. */
2121     }
2122     else
2123     #endif /* SUPPORT_UTF8 */
2124    
2125     /* When not in UTF-8 mode, load a single-byte character. */
2126     {
2127     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2128     fc = *ecode++;
2129     }
2130    
2131     /* The value of fc at this point is always less than 256, though we may or
2132     may not be in UTF-8 mode. The code is duplicated for the caseless and
2133     caseful cases, for speed, since matching characters is likely to be quite
2134     common. First, ensure the minimum number of matches are present. If min =
2135     max, continue at the same level without recursing. Otherwise, if
2136     minimizing, keep trying the rest of the expression and advancing one
2137     matching character if failing, up to the maximum. Alternatively, if
2138     maximizing, find the maximum number of characters and work backwards. */
2139    
2140     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2141     max, eptr));
2142    
2143     if ((ims & PCRE_CASELESS) != 0)
2144     {
2145     fc = md->lcc[fc];
2146     for (i = 1; i <= min; i++)
2147     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2148     if (min == max) continue;
2149     if (minimize)
2150     {
2151     for (fi = min;; fi++)
2152     {
2153     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2154     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2155     if (fi >= max || eptr >= md->end_subject ||
2156     fc != md->lcc[*eptr++])
2157     RRETURN(MATCH_NOMATCH);
2158     }
2159     /* Control never gets here */
2160     }
2161 nigel 93 else /* Maximize */
2162 nigel 77 {
2163     pp = eptr;
2164     for (i = min; i < max; i++)
2165     {
2166     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2167     eptr++;
2168     }
2169 nigel 93 if (possessive) continue;
2170 nigel 77 while (eptr >= pp)
2171     {
2172     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2173     eptr--;
2174     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2175     }
2176     RRETURN(MATCH_NOMATCH);
2177     }
2178     /* Control never gets here */
2179     }
2180    
2181     /* Caseful comparisons (single-byte only; multi-byte characters are handled above) */
2182    
2183     else
2184     {
2185     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2186     if (min == max) continue;
2187     if (minimize)
2188     {
2189     for (fi = min;; fi++)
2190     {
2191     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2192     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2193     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2194     RRETURN(MATCH_NOMATCH);
2195     }
2196     /* Control never gets here */
2197     }
2198 nigel 93 else /* Maximize */
2199 nigel 77 {
2200     pp = eptr;
2201     for (i = min; i < max; i++)
2202     {
2203     if (eptr >= md->end_subject || fc != *eptr) break;
2204     eptr++;
2205     }
2206 nigel 93 if (possessive) continue;
2207 nigel 77 while (eptr >= pp)
2208     {
2209     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2210     eptr--;
2211     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2212     }
2213     RRETURN(MATCH_NOMATCH);
2214     }
2215     }
2216     /* Control never gets here */
2217    
2218     /* Match a negated single one-byte character. The pattern character is
2219     one byte, but the subject character tested against it can be multibyte. */
2220    
2221     case OP_NOT:
2222     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2223     ecode++;
2224     GETCHARINCTEST(c, eptr);
2225     if ((ims & PCRE_CASELESS) != 0)
2226     {
2227     #ifdef SUPPORT_UTF8
2228     if (c < 256)
2229     #endif
2230     c = md->lcc[c];
2231     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2232     }
2233     else
2234     {
2235     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2236     }
2237     break;
2238    
2239     /* Match a negated single one-byte character repeatedly. This is almost a
2240     repeat of the code for a repeated single character, but I haven't found a
2241     nice way of commoning these up that doesn't require a test of the
2242     positive/negative option for each character match. Maybe that wouldn't add
2243     very much to the time taken, but character matching *is* what this is all
2244     about... */
2245    
2246     case OP_NOTEXACT:
2247     min = max = GET2(ecode, 1);
2248     ecode += 3;
2249     goto REPEATNOTCHAR;
2250    
2251     case OP_NOTUPTO:
2252     case OP_NOTMINUPTO:
2253     min = 0;
2254     max = GET2(ecode, 1);
2255     minimize = *ecode == OP_NOTMINUPTO;
2256     ecode += 3;
2257     goto REPEATNOTCHAR;
2258    
2259 nigel 93 case OP_NOTPOSSTAR:
2260     possessive = TRUE;
2261     min = 0;
2262     max = INT_MAX;
2263     ecode++;
2264     goto REPEATNOTCHAR;
2265    
2266     case OP_NOTPOSPLUS:
2267     possessive = TRUE;
2268     min = 1;
2269     max = INT_MAX;
2270     ecode++;
2271     goto REPEATNOTCHAR;
2272    
2273     case OP_NOTPOSQUERY:
2274     possessive = TRUE;
2275     min = 0;
2276     max = 1;
2277     ecode++;
2278     goto REPEATNOTCHAR;
2279    
2280     case OP_NOTPOSUPTO:
2281     possessive = TRUE;
2282     min = 0;
2283     max = GET2(ecode, 1);
2284     ecode += 3;
2285     goto REPEATNOTCHAR;
2286    
2287 nigel 77 case OP_NOTSTAR:
2288     case OP_NOTMINSTAR:
2289     case OP_NOTPLUS:
2290     case OP_NOTMINPLUS:
2291     case OP_NOTQUERY:
2292     case OP_NOTMINQUERY:
2293     c = *ecode++ - OP_NOTSTAR;
2294     minimize = (c & 1) != 0;
2295     min = rep_min[c]; /* Pick up values from tables; */
2296     max = rep_max[c]; /* zero for max => infinity */
2297     if (max == 0) max = INT_MAX;
2298    
2299     /* Common code for all repeated single-byte matches. We can give up quickly
2300     if there are fewer than the minimum number of bytes left in the
2301     subject. */
2302    
2303     REPEATNOTCHAR:
2304     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2305     fc = *ecode++;
2306    
2307     /* The code is duplicated for the caseless and caseful cases, for speed,
2308     since matching characters is likely to be quite common. First, ensure the
2309     minimum number of matches are present. If min = max, continue at the same
2310     level without recursing. Otherwise, if minimizing, keep trying the rest of
2311     the expression and advancing one matching character if failing, up to the
2312     maximum. Alternatively, if maximizing, find the maximum number of
2313     characters and work backwards. */
2314    
2315     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2316     max, eptr));
2317    
2318     if ((ims & PCRE_CASELESS) != 0)
2319     {
2320     fc = md->lcc[fc];
2321    
2322     #ifdef SUPPORT_UTF8
2323     /* UTF-8 mode */
2324     if (utf8)
2325     {
2326 nigel 93 register unsigned int d;
2327 nigel 77 for (i = 1; i <= min; i++)
2328     {
2329     GETCHARINC(d, eptr);
2330     if (d < 256) d = md->lcc[d];
2331     if (fc == d) RRETURN(MATCH_NOMATCH);
2332     }
2333     }
2334     else
2335     #endif
2336    
2337     /* Not UTF-8 mode */
2338     {
2339     for (i = 1; i <= min; i++)
2340     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2341     }
2342    
2343     if (min == max) continue;
2344    
2345     if (minimize)
2346     {
2347     #ifdef SUPPORT_UTF8
2348     /* UTF-8 mode */
2349     if (utf8)
2350     {
2351 nigel 93 register unsigned int d;
2352 nigel 77 for (fi = min;; fi++)
2353     {
2354     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2355     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2356     GETCHARINC(d, eptr);
2357     if (d < 256) d = md->lcc[d];
2358     if (fi >= max || eptr >= md->end_subject || fc == d)
2359     RRETURN(MATCH_NOMATCH);
2360     }
2361     }
2362     else
2363     #endif
2364     /* Not UTF-8 mode */
2365     {
2366     for (fi = min;; fi++)
2367     {
2368     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2369     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2370     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2371     RRETURN(MATCH_NOMATCH);
2372     }
2373     }
2374     /* Control never gets here */
2375     }
2376    
2377     /* Maximize case */
2378    
2379     else
2380     {
2381     pp = eptr;
2382    
2383     #ifdef SUPPORT_UTF8
2384     /* UTF-8 mode */
2385     if (utf8)
2386     {
2387 nigel 93 register unsigned int d;
2388 nigel 77 for (i = min; i < max; i++)
2389     {
2390     int len = 1;
2391     if (eptr >= md->end_subject) break;
2392     GETCHARLEN(d, eptr, len);
2393     if (d < 256) d = md->lcc[d];
2394     if (fc == d) break;
2395     eptr += len;
2396     }
2397 nigel 93 if (possessive) continue;
2398     for(;;)
2399 nigel 77 {
2400     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2401     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2402     if (eptr-- == pp) break; /* Stop if tried at original pos */
2403     BACKCHAR(eptr);
2404     }
2405     }
2406     else
2407     #endif
2408     /* Not UTF-8 mode */
2409     {
2410     for (i = min; i < max; i++)
2411     {
2412     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2413     eptr++;
2414     }
2415 nigel 93 if (possessive) continue;
2416 nigel 77 while (eptr >= pp)
2417     {
2418     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2419     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2420     eptr--;
2421     }
2422     }
2423    
2424     RRETURN(MATCH_NOMATCH);
2425     }
2426     /* Control never gets here */
2427     }
2428    
2429     /* Caseful comparisons */
2430    
2431     else
2432     {
2433     #ifdef SUPPORT_UTF8
2434     /* UTF-8 mode */
2435     if (utf8)
2436     {
2437 nigel 93 register unsigned int d;
2438 nigel 77 for (i = 1; i <= min; i++)
2439     {
2440     GETCHARINC(d, eptr);
2441     if (fc == d) RRETURN(MATCH_NOMATCH);
2442     }
2443     }
2444     else
2445     #endif
2446     /* Not UTF-8 mode */
2447     {
2448     for (i = 1; i <= min; i++)
2449     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2450     }
2451    
2452     if (min == max) continue;
2453    
2454     if (minimize)
2455     {
2456     #ifdef SUPPORT_UTF8
2457     /* UTF-8 mode */
2458     if (utf8)
2459     {
2460 nigel 93 register unsigned int d;
2461 nigel 77 for (fi = min;; fi++)
2462     {
2463     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2464     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2465     GETCHARINC(d, eptr);
2466     if (fi >= max || eptr >= md->end_subject || fc == d)
2467     RRETURN(MATCH_NOMATCH);
2468     }
2469     }
2470     else
2471     #endif
2472     /* Not UTF-8 mode */
2473     {
2474     for (fi = min;; fi++)
2475     {
2476     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2477     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2478     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2479     RRETURN(MATCH_NOMATCH);
2480     }
2481     }
2482     /* Control never gets here */
2483     }
2484    
2485     /* Maximize case */
2486    
2487     else
2488     {
2489     pp = eptr;
2490    
2491     #ifdef SUPPORT_UTF8
2492     /* UTF-8 mode */
2493     if (utf8)
2494     {
2495 nigel 93 register unsigned int d;
2496 nigel 77 for (i = min; i < max; i++)
2497     {
2498     int len = 1;
2499     if (eptr >= md->end_subject) break;
2500     GETCHARLEN(d, eptr, len);
2501     if (fc == d) break;
2502     eptr += len;
2503     }
2504 nigel 93 if (possessive) continue;
2505 nigel 77 for(;;)
2506     {
2507     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2508     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2509     if (eptr-- == pp) break; /* Stop if tried at original pos */
2510     BACKCHAR(eptr);
2511     }
2512     }
2513     else
2514     #endif
2515     /* Not UTF-8 mode */
2516     {
2517     for (i = min; i < max; i++)
2518     {
2519     if (eptr >= md->end_subject || fc == *eptr) break;
2520     eptr++;
2521     }
2522 nigel 93 if (possessive) continue;
2523 nigel 77 while (eptr >= pp)
2524     {
2525     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2526     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2527     eptr--;
2528     }
2529     }
2530    
2531     RRETURN(MATCH_NOMATCH);
2532     }
2533     }
2534     /* Control never gets here */
2535    
2536     /* Match a single character type repeatedly; several different opcodes
2537     share code. This is very similar to the code for single characters, but we
2538     repeat it in the interests of efficiency. */
2539    
2540     case OP_TYPEEXACT:
2541     min = max = GET2(ecode, 1);
2542     minimize = TRUE;
2543     ecode += 3;
2544     goto REPEATTYPE;
2545    
2546     case OP_TYPEUPTO:
2547     case OP_TYPEMINUPTO:
2548     min = 0;
2549     max = GET2(ecode, 1);
2550     minimize = *ecode == OP_TYPEMINUPTO;
2551     ecode += 3;
2552     goto REPEATTYPE;
2553    
2554 nigel 93 case OP_TYPEPOSSTAR:
2555     possessive = TRUE;
2556     min = 0;
2557     max = INT_MAX;
2558     ecode++;
2559     goto REPEATTYPE;
2560    
2561     case OP_TYPEPOSPLUS:
2562     possessive = TRUE;
2563     min = 1;
2564     max = INT_MAX;
2565     ecode++;
2566     goto REPEATTYPE;
2567    
2568     case OP_TYPEPOSQUERY:
2569     possessive = TRUE;
2570     min = 0;
2571     max = 1;
2572     ecode++;
2573     goto REPEATTYPE;
2574    
2575     case OP_TYPEPOSUPTO:
2576     possessive = TRUE;
2577     min = 0;
2578     max = GET2(ecode, 1);
2579     ecode += 3;
2580     goto REPEATTYPE;
2581    
2582 nigel 77 case OP_TYPESTAR:
2583     case OP_TYPEMINSTAR:
2584     case OP_TYPEPLUS:
2585     case OP_TYPEMINPLUS:
2586     case OP_TYPEQUERY:
2587     case OP_TYPEMINQUERY:
2588     c = *ecode++ - OP_TYPESTAR;
2589     minimize = (c & 1) != 0;
2590     min = rep_min[c]; /* Pick up values from tables; */
2591     max = rep_max[c]; /* zero for max => infinity */
2592     if (max == 0) max = INT_MAX;
2593    
2594     /* Common code for all repeated single character type matches. Note that
2595     in UTF-8 mode, '.' and the negated types (OP_NOT_DIGIT etc.) accept characters
2596     of any length, but OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR match only one byte. */
2597    
2598     REPEATTYPE:
2599     ctype = *ecode++; /* Code for the character type */
2600    
2601     #ifdef SUPPORT_UCP
2602     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2603     {
2604     prop_fail_result = ctype == OP_NOTPROP;
2605     prop_type = *ecode++;
2606 nigel 87 prop_value = *ecode++;
2607 nigel 77 }
2608     else prop_type = -1;
2609     #endif
2610    
2611     /* First, ensure the minimum number of matches are present. Use inline
2612     code for maximizing the speed, and do the type test once at the start
2613     (i.e. keep it out of the loop). Also we can test that there are at least
2614     the minimum number of bytes before we start. This isn't as effective in
2615     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2616     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2617     and single-bytes. */
2618    
2619     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2620     if (min > 0)
2621     {
2622     #ifdef SUPPORT_UCP
2623 nigel 87 if (prop_type >= 0)
2624 nigel 77 {
2625 nigel 87 switch(prop_type)
2626 nigel 77 {
2627 nigel 87 case PT_ANY:
2628     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2629     for (i = 1; i <= min; i++)
2630     {
2631     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2632     GETCHARINC(c, eptr);
2633     }
2634     break;
2635    
2636     case PT_LAMP:
2637     for (i = 1; i <= min; i++)
2638     {
2639     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2640     GETCHARINC(c, eptr);
2641     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2642     if ((prop_chartype == ucp_Lu ||
2643     prop_chartype == ucp_Ll ||
2644     prop_chartype == ucp_Lt) == prop_fail_result)
2645     RRETURN(MATCH_NOMATCH);
2646     }
2647     break;
2648    
2649     case PT_GC:
2650     for (i = 1; i <= min; i++)
2651     {
2652     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2653     GETCHARINC(c, eptr);
2654     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2655     if ((prop_category == prop_value) == prop_fail_result)
2656     RRETURN(MATCH_NOMATCH);
2657     }
2658     break;
2659    
2660     case PT_PC:
2661     for (i = 1; i <= min; i++)
2662     {
2663     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2664     GETCHARINC(c, eptr);
2665     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2666     if ((prop_chartype == prop_value) == prop_fail_result)
2667     RRETURN(MATCH_NOMATCH);
2668     }
2669     break;
2670    
2671     case PT_SC:
2672     for (i = 1; i <= min; i++)
2673     {
2674     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2675     GETCHARINC(c, eptr);
2676     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2677     if ((prop_script == prop_value) == prop_fail_result)
2678     RRETURN(MATCH_NOMATCH);
2679     }
2680     break;
2681    
2682     default:
2683     RRETURN(PCRE_ERROR_INTERNAL);
2684 nigel 77 }
2685     }
2686    
2687     /* Match extended Unicode sequences. We will get here only if the
2688     support is in the binary; otherwise a compile-time error occurs. */
2689    
2690     else if (ctype == OP_EXTUNI)
2691     {
2692     for (i = 1; i <= min; i++)
2693     {
2694     GETCHARINCTEST(c, eptr);
2695 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2696 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2697     while (eptr < md->end_subject)
2698     {
2699     int len = 1;
2700     if (!utf8) c = *eptr; else
2701     {
2702     GETCHARLEN(c, eptr, len);
2703     }
2704 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2705 nigel 77 if (prop_category != ucp_M) break;
2706     eptr += len;
2707     }
2708     }
2709     }
2710    
2711     else
2712     #endif /* SUPPORT_UCP */
2713    
2714     /* Handle all other cases when the coding is UTF-8 */
2715    
2716     #ifdef SUPPORT_UTF8
2717     if (utf8) switch(ctype)
2718     {
2719     case OP_ANY:
2720     for (i = 1; i <= min; i++)
2721     {
2722     if (eptr >= md->end_subject ||
2723 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2724 nigel 77 RRETURN(MATCH_NOMATCH);
2725 nigel 91 eptr++;
2726 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2727     }
2728     break;
2729    
2730     case OP_ANYBYTE:
2731     eptr += min;
2732     break;
2733    
2734 nigel 93 case OP_ANYNL:
2735     for (i = 1; i <= min; i++)
2736     {
2737     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2738     GETCHARINC(c, eptr);
2739     switch(c)
2740     {
2741     default: RRETURN(MATCH_NOMATCH);
2742     case 0x000d:
2743     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2744     break;
2745     case 0x000a:
2746     case 0x000b:
2747     case 0x000c:
2748     case 0x0085:
2749     case 0x2028:
2750     case 0x2029:
2751     break;
2752     }
2753     }
2754     break;
2755    
2756 nigel 77 case OP_NOT_DIGIT:
2757     for (i = 1; i <= min; i++)
2758     {
2759     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2760     GETCHARINC(c, eptr);
2761     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2762     RRETURN(MATCH_NOMATCH);
2763     }
2764     break;
2765    
2766     case OP_DIGIT:
2767     for (i = 1; i <= min; i++)
2768     {
2769     if (eptr >= md->end_subject ||
2770     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2771     RRETURN(MATCH_NOMATCH);
2772     /* No need to skip more bytes - we know it's a 1-byte character */
2773     }
2774     break;
2775    
2776     case OP_NOT_WHITESPACE:
2777     for (i = 1; i <= min; i++)
2778     {
2779     if (eptr >= md->end_subject ||
2780     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2781     RRETURN(MATCH_NOMATCH);
2782     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2783     }
2784     break;
2785    
2786     case OP_WHITESPACE:
2787     for (i = 1; i <= min; i++)
2788     {
2789     if (eptr >= md->end_subject ||
2790     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2791     RRETURN(MATCH_NOMATCH);
2792     /* No need to skip more bytes - we know it's a 1-byte character */
2793     }
2794     break;
2795    
2796     case OP_NOT_WORDCHAR:
2797     for (i = 1; i <= min; i++)
2798     {
2799     if (eptr >= md->end_subject ||
2800     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2801     RRETURN(MATCH_NOMATCH);
2802     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2803     }
2804     break;
2805    
2806     case OP_WORDCHAR:
2807     for (i = 1; i <= min; i++)
2808     {
2809     if (eptr >= md->end_subject ||
2810     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2811     RRETURN(MATCH_NOMATCH);
2812     /* No need to skip more bytes - we know it's a 1-byte character */
2813     }
2814     break;
2815    
2816     default:
2817     RRETURN(PCRE_ERROR_INTERNAL);
2818     } /* End switch(ctype) */
2819    
2820     else
2821     #endif /* SUPPORT_UTF8 */
2822    
2823     /* Code for the non-UTF-8 case for minimum matching of operators other
2824 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2825     number of bytes present, as this was tested above. */
2826 nigel 77
2827     switch(ctype)
2828     {
2829     case OP_ANY:
2830     if ((ims & PCRE_DOTALL) == 0)
2831     {
2832     for (i = 1; i <= min; i++)
2833 nigel 91 {
2834 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2835 nigel 91 eptr++;
2836     }
2837 nigel 77 }
2838     else eptr += min;
2839     break;
2840    
2841     case OP_ANYBYTE:
2842     eptr += min;
2843     break;
2844    
2845 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
2846     bytes are present in this case. */
2847    
2848     case OP_ANYNL:
2849     for (i = 1; i <= min; i++)
2850     {
2851     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2852     switch(*eptr++)
2853     {
2854     default: RRETURN(MATCH_NOMATCH);
2855     case 0x000d:
2856     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2857     break;
2858     case 0x000a:
2859     case 0x000b:
2860     case 0x000c:
2861     case 0x0085:
2862     break;
2863     }
2864     }
2865     break;
2866    
2867 nigel 77 case OP_NOT_DIGIT:
2868     for (i = 1; i <= min; i++)
2869     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2870     break;
2871    
2872     case OP_DIGIT:
2873     for (i = 1; i <= min; i++)
2874     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2875     break;
2876    
2877     case OP_NOT_WHITESPACE:
2878     for (i = 1; i <= min; i++)
2879     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2880     break;
2881    
2882     case OP_WHITESPACE:
2883     for (i = 1; i <= min; i++)
2884     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2885     break;
2886    
2887     case OP_NOT_WORDCHAR:
2888     for (i = 1; i <= min; i++)
2889     if ((md->ctypes[*eptr++] & ctype_word) != 0)
2890     RRETURN(MATCH_NOMATCH);
2891     break;
2892    
2893     case OP_WORDCHAR:
2894     for (i = 1; i <= min; i++)
2895     if ((md->ctypes[*eptr++] & ctype_word) == 0)
2896     RRETURN(MATCH_NOMATCH);
2897     break;
2898    
2899     default:
2900     RRETURN(PCRE_ERROR_INTERNAL);
2901     }
2902     }
2903    
2904     /* If min = max, continue at the same level without recursing */
2905    
2906     if (min == max) continue;
2907    
2908     /* If minimizing, we have to test the rest of the pattern before each
2909     subsequent match. Again, separate the UTF-8 case for speed, and also
2910     separate the UCP cases. */
2911    
2912     if (minimize)
2913     {
2914     #ifdef SUPPORT_UCP
2915 nigel 87 if (prop_type >= 0)
2916 nigel 77 {
2917 nigel 87 switch(prop_type)
2918 nigel 77 {
2919 nigel 87 case PT_ANY:
2920     for (fi = min;; fi++)
2921     {
2922     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2923     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2924     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2925     GETCHARINC(c, eptr);
2926     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2927     }
2928 nigel 93 /* Control never gets here */
2929 nigel 87
2930     case PT_LAMP:
2931     for (fi = min;; fi++)
2932     {
2933     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2934     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2935     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2936     GETCHARINC(c, eptr);
2937     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2938     if ((prop_chartype == ucp_Lu ||
2939     prop_chartype == ucp_Ll ||
2940     prop_chartype == ucp_Lt) == prop_fail_result)
2941     RRETURN(MATCH_NOMATCH);
2942     }
2943 nigel 93 /* Control never gets here */
2944 nigel 87
2945     case PT_GC:
2946     for (fi = min;; fi++)
2947     {
2948     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2949     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2950     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951     GETCHARINC(c, eptr);
2952     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2953     if ((prop_category == prop_value) == prop_fail_result)
2954     RRETURN(MATCH_NOMATCH);
2955     }
2956 nigel 93 /* Control never gets here */
2957 nigel 87
2958     case PT_PC:
2959     for (fi = min;; fi++)
2960     {
2961     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2962     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2963     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2964     GETCHARINC(c, eptr);
2965     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2966     if ((prop_chartype == prop_value) == prop_fail_result)
2967     RRETURN(MATCH_NOMATCH);
2968     }
2969 nigel 93 /* Control never gets here */
2970 nigel 87
2971     case PT_SC:
2972     for (fi = min;; fi++)
2973     {
2974     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2975     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2976     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2977     GETCHARINC(c, eptr);
2978     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2979     if ((prop_script == prop_value) == prop_fail_result)
2980     RRETURN(MATCH_NOMATCH);
2981     }
2982 nigel 93 /* Control never gets here */
2983 nigel 87
2984     default:
2985     RRETURN(PCRE_ERROR_INTERNAL);
2986 nigel 77 }
2987     }
2988    
2989     /* Match extended Unicode sequences. We will get here only if the
2990     support is in the binary; otherwise a compile-time error occurs. */
2991    
2992     else if (ctype == OP_EXTUNI)
2993     {
2994     for (fi = min;; fi++)
2995     {
2996     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2997     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2998     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2999     GETCHARINCTEST(c, eptr);
3000 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3001 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3002     while (eptr < md->end_subject)
3003     {
3004     int len = 1;
3005     if (!utf8) c = *eptr; else
3006     {
3007     GETCHARLEN(c, eptr, len);
3008     }
3009 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3010 nigel 77 if (prop_category != ucp_M) break;
3011     eptr += len;
3012     }
3013     }
3014     }
3015    
3016     else
3017     #endif /* SUPPORT_UCP */
3018    
3019     #ifdef SUPPORT_UTF8
3020     /* UTF-8 mode */
3021     if (utf8)
3022     {
3023     for (fi = min;; fi++)
3024     {
3025     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3026     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3027 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3028     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3029 nigel 93 IS_NEWLINE(eptr)))
3030 nigel 91 RRETURN(MATCH_NOMATCH);
3031 nigel 77
3032     GETCHARINC(c, eptr);
3033     switch(ctype)
3034     {
3035 nigel 91 case OP_ANY: /* This is the DOTALL case */
3036 nigel 77 break;
3037    
3038     case OP_ANYBYTE:
3039     break;
3040    
3041 nigel 93 case OP_ANYNL:
3042     switch(c)
3043     {
3044     default: RRETURN(MATCH_NOMATCH);
3045     case 0x000d:
3046     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3047     break;
3048     case 0x000a:
3049     case 0x000b:
3050     case 0x000c:
3051     case 0x0085:
3052     case 0x2028:
3053     case 0x2029:
3054     break;
3055     }
3056     break;
3057    
3058 nigel 77 case OP_NOT_DIGIT:
3059     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3060     RRETURN(MATCH_NOMATCH);
3061     break;
3062    
3063     case OP_DIGIT:
3064     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3065     RRETURN(MATCH_NOMATCH);
3066     break;
3067    
3068     case OP_NOT_WHITESPACE:
3069     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3070     RRETURN(MATCH_NOMATCH);
3071     break;
3072    
3073     case OP_WHITESPACE:
3074     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3075     RRETURN(MATCH_NOMATCH);
3076     break;
3077    
3078     case OP_NOT_WORDCHAR:
3079     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3080     RRETURN(MATCH_NOMATCH);
3081     break;
3082    
3083     case OP_WORDCHAR:
3084     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3085     RRETURN(MATCH_NOMATCH);
3086     break;
3087    
3088     default:
3089     RRETURN(PCRE_ERROR_INTERNAL);
3090     }
3091     }
3092     }
3093     else
3094     #endif
3095     /* Not UTF-8 mode */
3096     {
3097     for (fi = min;; fi++)
3098     {
3099     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3100     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3101 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3102 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3103 nigel 91 RRETURN(MATCH_NOMATCH);
3104    
3105 nigel 77 c = *eptr++;
3106     switch(ctype)
3107     {
3108 nigel 91 case OP_ANY: /* This is the DOTALL case */
3109 nigel 77 break;
3110    
3111     case OP_ANYBYTE:
3112     break;
3113    
3114 nigel 93 case OP_ANYNL:
3115     switch(c)
3116     {
3117     default: RRETURN(MATCH_NOMATCH);
3118     case 0x000d:
3119     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3120     break;
3121     case 0x000a:
3122     case 0x000b:
3123     case 0x000c:
3124     case 0x0085:
3125     break;
3126     }
3127     break;
3128    
3129 nigel 77 case OP_NOT_DIGIT:
3130     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3131     break;
3132    
3133     case OP_DIGIT:
3134     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3135     break;
3136    
3137     case OP_NOT_WHITESPACE:
3138     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3139     break;
3140    
3141     case OP_WHITESPACE:
3142     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3143     break;
3144    
3145     case OP_NOT_WORDCHAR:
3146     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3147     break;
3148    
3149     case OP_WORDCHAR:
3150     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3151     break;
3152    
3153     default:
3154     RRETURN(PCRE_ERROR_INTERNAL);
3155     }
3156     }
3157     }
3158     /* Control never gets here */
3159     }
3160    
3161 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3162 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3163     UTF-8 and UCP stuff separate. */
3164    
3165     else
3166     {
3167     pp = eptr; /* Remember where we started */
3168    
3169     #ifdef SUPPORT_UCP
3170 nigel 87 if (prop_type >= 0)
3171 nigel 77 {
3172 nigel 87 switch(prop_type)
3173 nigel 77 {
3174 nigel 87 case PT_ANY:
3175     for (i = min; i < max; i++)
3176     {
3177     int len = 1;
3178     if (eptr >= md->end_subject) break;
3179     GETCHARLEN(c, eptr, len);
3180     if (prop_fail_result) break;
3181     eptr+= len;
3182     }
3183     break;
3184    
3185     case PT_LAMP:
3186     for (i = min; i < max; i++)
3187     {
3188     int len = 1;
3189     if (eptr >= md->end_subject) break;
3190     GETCHARLEN(c, eptr, len);
3191     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3192     if ((prop_chartype == ucp_Lu ||
3193     prop_chartype == ucp_Ll ||
3194     prop_chartype == ucp_Lt) == prop_fail_result)
3195     break;
3196     eptr+= len;
3197     }
3198     break;
3199    
3200     case PT_GC:
3201     for (i = min; i < max; i++)
3202     {
3203     int len = 1;
3204     if (eptr >= md->end_subject) break;
3205     GETCHARLEN(c, eptr, len);
3206     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3207     if ((prop_category == prop_value) == prop_fail_result)
3208     break;
3209     eptr+= len;
3210     }
3211     break;
3212    
3213     case PT_PC:
3214     for (i = min; i < max; i++)
3215     {
3216     int len = 1;
3217     if (eptr >= md->end_subject) break;
3218     GETCHARLEN(c, eptr, len);
3219     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3220     if ((prop_chartype == prop_value) == prop_fail_result)
3221     break;
3222     eptr+= len;
3223     }
3224     break;
3225    
3226     case PT_SC:
3227     for (i = min; i < max; i++)
3228     {
3229     int len = 1;
3230     if (eptr >= md->end_subject) break;
3231     GETCHARLEN(c, eptr, len);
3232     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3233     if ((prop_script == prop_value) == prop_fail_result)
3234     break;
3235     eptr+= len;
3236     }
3237     break;
3238 nigel 77 }
3239    
3240     /* eptr is now past the end of the maximum run */
3241    
3242 nigel 93 if (possessive) continue;
3243 nigel 77 for(;;)
3244     {
3245     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3246     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3247     if (eptr-- == pp) break; /* Stop if tried at original pos */
3248     BACKCHAR(eptr);
3249     }
3250     }
3251    
3252     /* Match extended Unicode sequences. We will get here only if the
3253     support is in the binary; otherwise a compile-time error occurs. */
3254    
3255     else if (ctype == OP_EXTUNI)
3256     {
3257     for (i = min; i < max; i++)
3258     {
3259     if (eptr >= md->end_subject) break;
3260     GETCHARINCTEST(c, eptr);
3261 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3262 nigel 77 if (prop_category == ucp_M) break;
3263     while (eptr < md->end_subject)
3264     {
3265     int len = 1;
3266     if (!utf8) c = *eptr; else
3267     {
3268     GETCHARLEN(c, eptr, len);
3269     }
3270 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3271 nigel 77 if (prop_category != ucp_M) break;
3272     eptr += len;
3273     }
3274     }
3275    
3276     /* eptr is now past the end of the maximum run */
3277    
3278 nigel 93 if (possessive) continue;
3279 nigel 77 for(;;)
3280     {
3281     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3282     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3283     if (eptr-- == pp) break; /* Stop if tried at original pos */
3284     for (;;) /* Move back over one extended */
3285     {
3286     int len = 1;
3287     BACKCHAR(eptr);
3288     if (!utf8) c = *eptr; else
3289     {
3290     GETCHARLEN(c, eptr, len);
3291     }
3292 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3293 nigel 77 if (prop_category != ucp_M) break;
3294     eptr--;
3295     }
3296     }
3297     }
3298    
3299     else
3300     #endif /* SUPPORT_UCP */
3301    
3302     #ifdef SUPPORT_UTF8
3303     /* UTF-8 mode */
3304    
3305     if (utf8)
3306     {
3307     switch(ctype)
3308     {
3309     case OP_ANY:
3310    
3311 nigel 91 /* Special code is required for UTF8, but when the maximum is
3312     unlimited we don't need it, so we repeat the non-UTF8 code. This is
3313     probably worth it, because .* is quite a common idiom. */
3314 nigel 77
3315     if (max < INT_MAX)
3316     {
3317     if ((ims & PCRE_DOTALL) == 0)
3318     {
3319     for (i = min; i < max; i++)
3320     {
3321 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3322 nigel 77 eptr++;
3323     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3324     }
3325     }
3326     else
3327     {
3328     for (i = min; i < max; i++)
3329     {
3330 nigel 91 if (eptr >= md->end_subject) break;
3331 nigel 77 eptr++;
3332     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3333     }
3334     }
3335     }
3336    
3337     /* Handle unlimited UTF-8 repeat */
3338    
3339     else
3340     {
3341     if ((ims & PCRE_DOTALL) == 0)
3342     {
3343     for (i = min; i < max; i++)
3344     {
3345 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3346 nigel 77 eptr++;
3347     }
3348     break;
3349     }
3350     else
3351     {
3352     c = max - min;
3353 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3354     c = md->end_subject - eptr;
3355 nigel 77 eptr += c;
3356     }
3357     }
3358     break;
3359    
3360     /* The byte case is the same as non-UTF8 */
3361    
3362     case OP_ANYBYTE:
3363     c = max - min;
3364 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3365     c = md->end_subject - eptr;
3366 nigel 77 eptr += c;
3367     break;
3368    
3369 nigel 93 case OP_ANYNL:
3370     for (i = min; i < max; i++)
3371     {
3372     int len = 1;
3373     if (eptr >= md->end_subject) break;
3374     GETCHARLEN(c, eptr, len);
3375     if (c == 0x000d)
3376     {
3377     if (++eptr >= md->end_subject) break;
3378     if (*eptr == 0x000a) eptr++;
3379     }
3380     else
3381     {
3382     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3383     c != 0x0085 && c != 0x2028 && c != 0x2029)
3384     break;
3385     eptr += len;
3386     }
3387     }
3388     break;
3389    
3390 nigel 77 case OP_NOT_DIGIT:
3391     for (i = min; i < max; i++)
3392     {
3393     int len = 1;
3394     if (eptr >= md->end_subject) break;
3395     GETCHARLEN(c, eptr, len);
3396     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3397     eptr+= len;
3398     }
3399     break;
3400    
3401     case OP_DIGIT:
3402     for (i = min; i < max; i++)
3403     {
3404     int len = 1;
3405     if (eptr >= md->end_subject) break;
3406     GETCHARLEN(c, eptr, len);
3407     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3408     eptr+= len;
3409     }
3410     break;
3411    
3412     case OP_NOT_WHITESPACE:
3413     for (i = min; i < max; i++)
3414     {
3415     int len = 1;
3416     if (eptr >= md->end_subject) break;
3417     GETCHARLEN(c, eptr, len);
3418     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3419     eptr+= len;
3420     }
3421     break;
3422    
3423     case OP_WHITESPACE:
3424     for (i = min; i < max; i++)
3425     {
3426     int len = 1;
3427     if (eptr >= md->end_subject) break;
3428     GETCHARLEN(c, eptr, len);
3429     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3430     eptr+= len;
3431     }
3432     break;
3433    
3434     case OP_NOT_WORDCHAR:
3435     for (i = min; i < max; i++)
3436     {
3437     int len = 1;
3438     if (eptr >= md->end_subject) break;
3439     GETCHARLEN(c, eptr, len);
3440     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3441     eptr+= len;
3442     }
3443     break;
3444    
3445     case OP_WORDCHAR:
3446     for (i = min; i < max; i++)
3447     {
3448     int len = 1;
3449     if (eptr >= md->end_subject) break;
3450     GETCHARLEN(c, eptr, len);
3451     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3452     eptr+= len;
3453     }
3454     break;
3455    
3456     default:
3457     RRETURN(PCRE_ERROR_INTERNAL);
3458     }
3459    
3460     /* eptr is now past the end of the maximum run */
3461    
3462 nigel 93 if (possessive) continue;
3463 nigel 77 for(;;)
3464     {
3465     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3466     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467     if (eptr-- == pp) break; /* Stop if tried at original pos */
3468     BACKCHAR(eptr);
3469     }
3470     }
3471     else
3472     #endif
3473    
3474     /* Not UTF-8 mode */
3475     {
3476     switch(ctype)
3477     {
3478     case OP_ANY:
3479     if ((ims & PCRE_DOTALL) == 0)
3480     {
3481     for (i = min; i < max; i++)
3482     {
3483 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3484 nigel 77 eptr++;
3485     }
3486     break;
3487     }
3488     /* For DOTALL case, fall through and treat as \C */
3489    
3490     case OP_ANYBYTE:
3491     c = max - min;
3492 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3493     c = md->end_subject - eptr;
3494 nigel 77 eptr += c;
3495     break;
3496    
3497 nigel 93 case OP_ANYNL:
3498     for (i = min; i < max; i++)
3499     {
3500     if (eptr >= md->end_subject) break;
3501     c = *eptr;
3502     if (c == 0x000d)
3503     {
3504     if (++eptr >= md->end_subject) break;
3505     if (*eptr == 0x000a) eptr++;
3506     }
3507     else
3508     {
3509     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3510     break;
3511     eptr++;
3512     }
3513     }
3514     break;
3515    
3516 nigel 77 case OP_NOT_DIGIT:
3517     for (i = min; i < max; i++)
3518     {
3519     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3520     break;
3521     eptr++;
3522     }
3523     break;
3524    
3525     case OP_DIGIT:
3526     for (i = min; i < max; i++)
3527     {
3528     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3529     break;
3530     eptr++;
3531     }
3532     break;
3533    
3534     case OP_NOT_WHITESPACE:
3535     for (i = min; i < max; i++)
3536     {
3537     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3538     break;
3539     eptr++;
3540     }
3541     break;
3542    
3543     case OP_WHITESPACE:
3544     for (i = min; i < max; i++)
3545     {
3546     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3547     break;
3548     eptr++;
3549     }
3550     break;
3551    
3552     case OP_NOT_WORDCHAR:
3553     for (i = min; i < max; i++)
3554     {
3555     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3556     break;
3557     eptr++;
3558     }
3559     break;
3560    
3561     case OP_WORDCHAR:
3562     for (i = min; i < max; i++)
3563     {
3564     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3565     break;
3566     eptr++;
3567     }
3568     break;
3569    
3570     default:
3571     RRETURN(PCRE_ERROR_INTERNAL);
3572     }
3573    
3574     /* eptr is now past the end of the maximum run */
3575    
3576 nigel 93 if (possessive) continue;
3577 nigel 77 while (eptr >= pp)
3578     {
3579     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3580     eptr--;
3581     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3582     }
3583     }
3584    
3585     /* Get here if we can't make it match with any permitted repetitions */
3586    
3587     RRETURN(MATCH_NOMATCH);
3588     }
3589     /* Control never gets here */
3590    
3591 nigel 93 /* There's been some horrible disaster. Arrival here can only mean there is
3592     something seriously wrong in the code above or the OP_xxx definitions. */
3593 nigel 77
3594     default:
3595     DPRINTF(("Unknown opcode %d\n", *ecode));
3596 nigel 93 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3597 nigel 77 }
3598    
3599     /* Do not stick any code in here without much thought; it is assumed
3600     that "continue" in the code above comes out to here to repeat the main
3601     loop. */
3602    
3603     } /* End of main loop */
3604     /* Control never reaches here */
3605     }
3606    
3607    
3608     /***************************************************************************
3609     ****************************************************************************
3610     RECURSION IN THE match() FUNCTION
3611    
3612     Undefine all the macros that were defined above to handle this.

         Why this matters: several of the names removed here ("length", "ims" and
         "flags", for example) are used again as ordinary parameter and variable
         names by pcre_exec() further down. If they were still macros at that point,
         those uses would be macro-expanded, so the macros have to end here.

         NOTE(review): the matching #define lines are earlier in the file; the
         blank-line grouping below presumably mirrors the grouping used there.
         Confirm, and keep the two lists in step when a name is added or removed. */
3613    
3614     #ifdef NO_RECURSE
         /* First group: presumably the names of match()'s own arguments - apart from
         md, they are what each RMATCH() call in match() passes on, in the same
         order. Confirm against the #defines. */
3615     #undef eptr
3616     #undef ecode
3617     #undef offset_top
3618     #undef ims
3619     #undef eptrb
3620     #undef flags
3621    
         /* Remaining groups: names that match() uses as working variables (for
         example pp, ctype, max and min all appear in its repeat-handling code). */
3622     #undef callpat
3623     #undef charptr
3624     #undef data
3625     #undef next
3626     #undef pp
3627     #undef prev
3628     #undef saved_eptr
3629    
3630     #undef new_recursive
3631    
3632     #undef cur_is_word
3633     #undef condition
3634     #undef prev_is_word
3635    
3636     #undef original_ims
3637    
3638     #undef ctype
3639     #undef length
3640     #undef max
3641     #undef min
3642     #undef number
3643     #undef offset
3644     #undef op
3645     #undef save_capture_last
3646     #undef save_offset1
3647     #undef save_offset2
3648     #undef save_offset3
3649     #undef stacksave
3650    
3651     #undef newptrb
3652    
3653     #endif  /* NO_RECURSE */
3654    
3655     /* These two are defined as macros in both cases, that is, whether or not
         NO_RECURSE is defined, which is why they are undefined outside the #ifdef
         above. (fi is the repeat counter used by the minimizing loops in match().) */
3656    
3657     #undef fc
3658     #undef fi
3659    
3660     /***************************************************************************
3661     ***************************************************************************/
3662    
3663    
3664    
3665     /*************************************************
3666     * Execute a Regular Expression *
3667     *************************************************/
3668    
3669     /* This function applies a compiled re to a subject string and picks out
3670     portions of the string if it matches. Two elements in the vector are set for
3671     each substring: the offsets to the start and end of the substring.
3672    
3673     Arguments:
3674     argument_re points to the compiled expression
3675     extra_data points to extra data or is NULL
3676     subject points to the subject string
3677     length length of subject string (may contain binary zeros)
3678     start_offset where to start in the subject string
3679     options option bits
3680     offsets points to a vector of ints to be filled in with offsets
3681     offsetcount the number of elements in the vector
3682    
3683     Returns: > 0 => success; value is the number of elements filled in
3684     = 0 => success, but offsets is not big enough
3685     -1 => failed to match
3686     < -1 => some kind of unexpected problem
3687     */
3688    
3689 nigel 87 PCRE_DATA_SCOPE int
3690 nigel 77 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3691 nigel 87 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3692 nigel 77 int offsetcount)
3693     {
3694     int rc, resetcount, ocount;
3695     int first_byte = -1;
3696     int req_byte = -1;
3697     int req_byte2 = -1;
3698 nigel 91 int newline;
3699     unsigned long int ims;
3700 nigel 77 BOOL using_temporary_offsets = FALSE;
3701     BOOL anchored;
3702     BOOL startline;
3703     BOOL firstline;
3704     BOOL first_byte_caseless = FALSE;
3705     BOOL req_byte_caseless = FALSE;
3706 nigel 93 BOOL utf8;
3707 nigel 77 match_data match_block;
3708 nigel 91 match_data *md = &match_block;
3709 nigel 77 const uschar *tables;
3710     const uschar *start_bits = NULL;
3711 nigel 87 USPTR start_match = (USPTR)subject + start_offset;
3712     USPTR end_subject;
3713     USPTR req_byte_ptr = start_match - 1;
3714 nigel 93 eptrblock eptrchain[EPTR_WORK_SIZE];
3715 nigel 77
3716     pcre_study_data internal_study;
3717     const pcre_study_data *study;
3718    
3719     real_pcre internal_re;
3720     const real_pcre *external_re = (const real_pcre *)argument_re;
3721     const real_pcre *re = external_re;
3722    
3723     /* Plausibility checks */
3724    
3725     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3726     if (re == NULL || subject == NULL ||
3727     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3728     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3729    
3730     /* Fish out the optional data from the extra_data structure, first setting
3731     the default values. */
3732    
3733     study = NULL;
3734 nigel 91 md->match_limit = MATCH_LIMIT;
3735     md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3736     md->callout_data = NULL;
3737 nigel 77
3738     /* The table pointer is always in native byte order. */
3739    
3740     tables = external_re->tables;
3741    
3742     if (extra_data != NULL)
3743     {
3744     register unsigned int flags = extra_data->flags;
3745     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3746     study = (const pcre_study_data *)extra_data->study_data;
3747     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3748 nigel 91 md->match_limit = extra_data->match_limit;
3749 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3750 nigel 91 md->match_limit_recursion = extra_data->match_limit_recursion;
3751 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3752 nigel 91 md->callout_data = extra_data->callout_data;
3753 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3754     }
3755    
3756     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3757     is a feature that makes it possible to save compiled regex and re-use them
3758     in other programs later. */
3759    
3760     if (tables == NULL) tables = _pcre_default_tables;
3761    
3762     /* Check that the first field in the block is the magic number. If it is not,
3763     test for a regex that was compiled on a host of opposite endianness. If this is
3764     the case, flipped values are put in internal_re and internal_study if there was
3765     study data too. */
3766    
3767     if (re->magic_number != MAGIC_NUMBER)
3768     {
3769     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3770     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3771     if (study != NULL) study = &internal_study;
3772     }
3773    
3774     /* Set up other data */
3775    
3776     anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3777     startline = (re->options & PCRE_STARTLINE) != 0;
3778     firstline = (re->options & PCRE_FIRSTLINE) != 0;
3779    
3780     /* The code starts after the real_pcre block and the capture name table. */
3781    
3782 nigel 91 md->start_code = (const uschar *)external_re + re->name_table_offset +
3783 nigel 77 re->name_count * re->name_entry_size;
3784    
3785 nigel 91 md->start_subject = (USPTR)subject;
3786     md->start_offset = start_offset;
3787     md->end_subject = md->start_subject + length;
3788     end_subject = md->end_subject;
3789 nigel 77
3790 nigel 91 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3791 nigel 93 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3792 nigel 77
3793 nigel 91 md->notbol = (options & PCRE_NOTBOL) != 0;
3794     md->noteol = (options & PCRE_NOTEOL) != 0;
3795     md->notempty = (options & PCRE_NOTEMPTY) != 0;
3796     md->partial = (options & PCRE_PARTIAL) != 0;
3797     md->hitend = FALSE;
3798 nigel 77
3799 nigel 91 md->recursive = NULL; /* No recursion at top level */
3800 nigel 93 md->eptrchain = eptrchain; /* Make workspace generally available */
3801 nigel 77
3802 nigel 91 md->lcc = tables + lcc_offset;
3803     md->ctypes = tables + ctypes_offset;
3804 nigel 77
3805 nigel 91 /* Handle different types of newline. The two bits give four cases. If nothing
3806     is set at run time, whatever was used at compile time applies. */
3807    
3808 nigel 93 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
3809     PCRE_NEWLINE_BITS)
3810 nigel 91 {
3811 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3812 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
3813     case PCRE_NEWLINE_LF: newline = '\n'; break;
3814     case PCRE_NEWLINE_CR+
3815     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3816 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3817     default: return PCRE_ERROR_BADNEWLINE;
3818 nigel 91 }
3819    
3820 nigel 93 if (newline < 0)
3821 nigel 91 {
3822 nigel 93 md->nltype = NLTYPE_ANY;
3823 nigel 91 }
3824     else
3825     {
3826 nigel 93 md->nltype = NLTYPE_FIXED;
3827     if (newline > 255)
3828     {
3829     md->nllen = 2;
3830     md->nl[0] = (newline >> 8) & 255;
3831     md->nl[1] = newline & 255;
3832     }
3833     else
3834     {
3835     md->nllen = 1;
3836     md->nl[0] = newline;
3837     }
3838 nigel 91 }
3839    
3840 nigel 77 /* Partial matching is supported only for a restricted set of regexes at the
3841     moment. */
3842    
3843 nigel 91 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3844 nigel 77 return PCRE_ERROR_BADPARTIAL;
3845    
3846     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3847     back the character offset. */
3848    
3849     #ifdef SUPPORT_UTF8
3850 nigel 93 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3851 nigel 77 {
3852     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3853     return PCRE_ERROR_BADUTF8;
3854     if (start_offset > 0 && start_offset < length)
3855     {
3856     int tb = ((uschar *)subject)[start_offset];
3857     if (tb > 127)
3858     {
3859     tb &= 0xc0;
3860     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3861     }
3862     }
3863     }
3864     #endif
3865    
3866     /* The ims options can vary during the matching as a result of the presence
3867     of (?ims) items in the pattern. They are kept in a local variable so that
3868     restoring at the exit of a group is easy. */
3869    
3870     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3871    
3872     /* If the expression has got more back references than the offsets supplied can
3873     hold, we get a temporary chunk of working store to use during the matching.
3874     Otherwise, we can use the vector supplied, rounding down its size to a multiple
3875     of 3. */
3876    
3877     ocount = offsetcount - (offsetcount % 3);
3878    
3879     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3880     {
3881     ocount = re->top_backref * 3 + 3;
3882 nigel 91 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3883     if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3884 nigel 77 using_temporary_offsets = TRUE;
3885     DPRINTF(("Got memory to hold back references\n"));
3886     }
3887 nigel 91 else md->offset_vector = offsets;
3888 nigel 77
3889 nigel 91 md->offset_end = ocount;
3890     md->offset_max = (2*ocount)/3;
3891     md->offset_overflow = FALSE;
3892     md->capture_last = -1;
3893 nigel 77
3894     /* Compute the minimum number of offsets that we need to reset each time. Doing
3895     this makes a huge difference to execution time when there aren't many brackets
3896     in the pattern. */
3897    
3898     resetcount = 2 + re->top_bracket * 2;
3899     if (resetcount > offsetcount) resetcount = ocount;
3900    
3901     /* Reset the working variable associated with each extraction. These should
3902     never be used unless previously set, but they get saved and restored, and so we
3903     initialize them to avoid reading uninitialized locations. */
3904    
3905 nigel 91 if (md->offset_vector != NULL)
3906 nigel 77 {
3907 nigel 91 register int *iptr = md->offset_vector + ocount;
3908 nigel 77 register int *iend = iptr - resetcount/2 + 1;
3909     while (--iptr >= iend) *iptr = -1;
3910     }
3911    
3912     /* Set up the first character to match, if available. The first_byte value is
3913     never set for an anchored regular expression, but the anchoring may be forced
3914     at run time, so we have to test for anchoring. The first char may be unset for
3915     an unanchored pattern, of course. If there's no first char and the pattern was
3916     studied, there may be a bitmap of possible first characters. */
3917    
3918     if (!anchored)
3919     {
3920     if ((re->options & PCRE_FIRSTSET) != 0)
3921     {
3922     first_byte = re->first_byte & 255;
3923     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3924 nigel 91 first_byte = md->lcc[first_byte];
3925 nigel 77 }
3926     else
3927     if (!startline && study != NULL &&
3928     (study->options & PCRE_STUDY_MAPPED) != 0)
3929     start_bits = study->start_bits;
3930     }
3931    
3932     /* For anchored or unanchored matches, there may be a "last known required
3933     character" set. */
3934    
3935     if ((re->options & PCRE_REQCHSET) != 0)
3936     {
3937     req_byte = re->req_byte & 255;
3938     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3939     req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3940     }
3941    
3942 nigel 93
3943     /* ==========================================================================*/
3944    
3945 nigel 77 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
3946     the loop runs just once. */
3947    
3948 nigel 93 for(;;)
3949 nigel 77 {
3950 nigel 87 USPTR save_end_subject = end_subject;
3951 nigel 77
3952     /* Reset the maximum number of extractions we might see. */
3953    
3954 nigel 91 if (md->offset_vector != NULL)
3955 nigel 77 {
3956 nigel 91 register int *iptr = md->offset_vector;
3957 nigel 77 register int *iend = iptr + resetcount;
3958     while (iptr < iend) *iptr++ = -1;
3959     }
3960    
3961     /* Advance to a unique first char if possible. If firstline is TRUE, the
3962     start of the match is constrained to the first line of a multiline string.
3963 nigel 93 That is, the match must be before or at the first newline. Implement this by
3964     temporarily adjusting end_subject so that we stop scanning at a newline. If
3965     the match fails at the newline, later code breaks this loop. */
3966 nigel 77
3967     if (firstline)
3968     {
3969 nigel 87 USPTR t = start_match;
3970 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3971 nigel 77 end_subject = t;
3972     }
3973    
3974     /* Now test for a unique first byte */
3975    
3976     if (first_byte >= 0)
3977     {
3978     if (first_byte_caseless)
3979     while (start_match < end_subject &&
3980 nigel 91 md->lcc[*start_match] != first_byte)
3981 nigel 77 start_match++;
3982     else
3983     while (start_match < end_subject && *start_match != first_byte)
3984     start_match++;
3985     }
3986    
3987 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
3988 nigel 77
3989     else if (startline)
3990     {
3991 nigel 93 if (start_match > md->start_subject + start_offset)
3992 nigel 77 {
3993 nigel 93 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
3994 nigel 77 start_match++;
3995     }
3996     }
3997    
3998     /* Or to a non-unique first char after study */
3999    
4000     else if (start_bits != NULL)
4001     {
4002     while (start_match < end_subject)
4003     {
4004     register unsigned int c = *start_match;
4005     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4006     }
4007     }
4008    
4009     /* Restore fudged end_subject */
4010    
4011     end_subject = save_end_subject;
4012    
4013     #ifdef DEBUG /* Sigh. Some compilers never learn. */
4014     printf(">>>> Match against: ");
4015 nigel 91 pchars(start_match, end_subject - start_match, TRUE, md);
4016 nigel 77 printf("\n");
4017     #endif
4018    
4019     /* If req_byte is set, we know that that character must appear in the subject
4020     for the match to succeed. If the first character is set, req_byte must be
4021     later in the subject; otherwise the test starts at the match point. This
4022     optimization can save a huge amount of backtracking in patterns with nested
4023     unlimited repeats that aren't going to match. Writing separate code for
4024     cased/caseless versions makes it go faster, as does using an autoincrement
4025     and backing off on a match.
4026    
4027     HOWEVER: when the subject string is very, very long, searching to its end can
4028     take a long time, and give bad performance on quite ordinary patterns. This
4029 nigel 93 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4030     string... so we don't do this when the string is sufficiently long.
4031 nigel 77
4032     ALSO: this processing is disabled when partial matching is requested.
4033     */
4034    
4035     if (req_byte >= 0 &&
4036     end_subject - start_match < REQ_BYTE_MAX &&
4037 nigel 91 !md->partial)
4038 nigel 77 {
4039 nigel 87 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4040 nigel 77
4041     /* We don't need to repeat the search if we haven't yet reached the
4042     place we found it at last time. */
4043    
4044     if (p > req_byte_ptr)
4045     {
4046     if (req_byte_caseless)
4047     {
4048     while (p < end_subject)
4049     {
4050     register int pp = *p++;
4051     if (pp == req_byte || pp == req_byte2) { p--; break; }
4052     }
4053     }
4054     else
4055     {
4056     while (p < end_subject)
4057     {
4058     if (*p++ == req_byte) { p--; break; }
4059     }
4060     }
4061    
4062 nigel 93 /* If we can't find the required character, break the matching loop,
4063     forcing a match failure. */
4064 nigel 77
4065 nigel 93 if (p >= end_subject)
4066     {
4067     rc = MATCH_NOMATCH;
4068     break;
4069     }
4070 nigel 77
4071     /* If we have found the required character, save the point where we
4072     found it, so that we don't search again next time round the loop if
4073     the start hasn't passed this character yet. */
4074    
4075     req_byte_ptr = p;
4076     }
4077     }
4078    
4079 nigel 93 /* OK, we can now run the match. */
4080 nigel 77
4081 nigel 91 md->start_match = start_match;
4082     md->match_call_count = 0;
4083 nigel 93 md->eptrn = 0; /* Next free eptrchain slot */
4084     rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4085 nigel 77
4086 nigel 93 /* Any return other than MATCH_NOMATCH breaks the loop. */
4087 nigel 77
4088 nigel 93 if (rc != MATCH_NOMATCH) break;
4089 nigel 77
4090 nigel 93 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4091     newline in the subject (though it may continue over the newline). Therefore,
4092     if we have just failed to match, starting at a newline, do not continue. */
4093    
4094     if (firstline && IS_NEWLINE(start_match)) break;
4095    
4096     /* Advance the match position by one character. */
4097    
4098     start_match++;
4099 nigel 77 #ifdef SUPPORT_UTF8
4100 nigel 93 if (utf8)
4101     while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4102     start_match++;
4103 nigel 77 #endif
4104    
4105 nigel 93 /* Break the loop if the pattern is anchored or if we have passed the end of
4106     the subject. */
4107 nigel 77
4108 nigel 93 if (anchored || start_match > end_subject) break;
4109 nigel 77
4110 nigel 93 /* If we have just passed a CR and the newline option is CRLF or ANY, and we
4111     are now at a LF, advance the match position by one more character. */
4112    
4113     if (start_match[-1] == '\r' &&
4114     (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
4115     start_match < end_subject &&
4116     *start_match == '\n')
4117     start_match++;
4118    
4119     } /* End of for(;;) "bumpalong" loop */
4120    
4121     /* ==========================================================================*/
4122    
4123     /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4124     conditions is true:
4125    
4126     (1) The pattern is anchored;
4127    
4128     (2) We are past the end of the subject;
4129    
4130     (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4131     this option requests that a match occur at or before the first newline in
4132     the subject.
4133    
4134     When we have a match and the offset vector is big enough to deal with any
4135     backreferences, captured substring offsets will already be set up. In the case
4136     where we had to get some local store to hold offsets for backreference
4137     processing, copy those that we can. In this case there need not be overflow if
4138     certain parts of the pattern were not used, even though there are more
4139     capturing parentheses than vector slots. */
4140    
4141     if (rc == MATCH_MATCH)
4142     {
4143 nigel 77 if (using_temporary_offsets)
4144     {
4145     if (offsetcount >= 4)
4146     {
4147 nigel 91 memcpy(offsets + 2, md->offset_vector + 2,
4148 nigel 77 (offsetcount - 2) * sizeof(int));
4149     DPRINTF(("Copied offsets from temporary memory\n"));
4150     }
4151 nigel 93 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4152 nigel 77 DPRINTF(("Freeing temporary memory\n"));
4153 nigel 91 (pcre_free)(md->offset_vector);
4154 nigel 77 }
4155    
4156 nigel 93 /* Set the return code to the number of captured strings, or 0 if there are
4157     too many to fit into the vector. */
4158    
4159 nigel 91 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4160 nigel 77
4161 nigel 93 /* If there is space, set up the whole thing as substring 0. */
4162    
4163 nigel 77 if (offsetcount < 2) rc = 0; else
4164     {
4165 nigel 91 offsets[0] = start_match - md->start_subject;
4166     offsets[1] = md->end_match_ptr - md->start_subject;
4167 nigel 77 }
4168    
4169     DPRINTF((">>>> returning %d\n", rc));
4170     return rc;
4171     }
4172    
4173 nigel 93 /* Control gets here if there has been an error, or if the overall match
4174     attempt has failed at all permitted starting positions. */
4175 nigel 77
4176     if (using_temporary_offsets)
4177     {
4178     DPRINTF(("Freeing temporary memory\n"));
4179 nigel 91 (pcre_free)(md->offset_vector);
4180 nigel 77 }
4181    
4182 nigel 93 if (rc != MATCH_NOMATCH)
4183 nigel 77 {
4184 nigel 93 DPRINTF((">>>> error: returning %d\n", rc));
4185     return rc;
4186     }
4187     else if (md->partial && md->hitend)
4188