/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 117 - (hide annotations) (download)
Fri Mar 9 15:59:06 2007 UTC (7 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 127229 byte(s)
Update copyright years to 2007.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 nigel 93 #define NLBLOCK md /* Block containing newline information */
46     #define PSSTART start_subject /* Field containing processed string start */
47     #define PSEND end_subject /* Field containing processed string end */
48    
49 nigel 77 #include "pcre_internal.h"
50    
51 nigel 93 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
52     obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
53 nigel 77
54 nigel 93 #define EPTR_WORK_SIZE (1000)
55 nigel 77
56     /* Flag bits for the match() function */
57    
58 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
59     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
60     #define match_tail_recursed 0x04 /* Tail recursive call */
61 nigel 77
62     /* Non-error returns from the match() function. Error returns are externally
63     defined PCRE_ERROR_xxx codes, which are all negative. */
64    
65     #define MATCH_MATCH 1
66     #define MATCH_NOMATCH 0
67    
68     /* Maximum number of ints of offset to save on the stack for recursive calls.
69     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
70     because the offset vector is always a multiple of 3 long. */
71    
72     #define REC_STACK_SAVE_MAX 30
73    
74     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
75    
76     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
77     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
78    
79    
80    
81     #ifdef DEBUG
82     /*************************************************
83     * Debugging function to print chars *
84     *************************************************/
85    
86     /* Print a sequence of chars in printable format, stopping at the end of the
87     subject if requested.
88    
89     Arguments:
90     p points to characters
91     length number to print
92     is_subject TRUE if printing from within md->start_subject
93     md pointer to matching data block, if is_subject is TRUE
94    
95     Returns: nothing
96     */
97    
98     static void
99     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
100     {
101 nigel 93 unsigned int c;
102 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
103     while (length-- > 0)
104     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
105     }
106     #endif
107    
108    
109    
110     /*************************************************
111     * Match a back-reference *
112     *************************************************/
113    
114     /* If a back reference hasn't been set, the length that is passed is greater
115     than the number of characters left in the string, so the match fails.
116    
117     Arguments:
118     offset index into the offset vector
119     eptr points into the subject
120     length length to be matched
121     md points to match data block
122     ims the ims flags
123    
124     Returns: TRUE if matched
125     */
126    
127     static BOOL
128 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
129 nigel 77 unsigned long int ims)
130     {
131 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
132 nigel 77
133     #ifdef DEBUG
134     if (eptr >= md->end_subject)
135     printf("matching subject <null>");
136     else
137     {
138     printf("matching subject ");
139     pchars(eptr, length, TRUE, md);
140     }
141     printf(" against backref ");
142     pchars(p, length, FALSE, md);
143     printf("\n");
144     #endif
145    
146     /* Always fail if not enough characters left */
147    
148     if (length > md->end_subject - eptr) return FALSE;
149    
150     /* Separate the caselesss case for speed */
151    
152     if ((ims & PCRE_CASELESS) != 0)
153     {
154     while (length-- > 0)
155     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
156     }
157     else
158     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
159    
160     return TRUE;
161     }
162    
163    
164    
165     /***************************************************************************
166     ****************************************************************************
167     RECURSION IN THE match() FUNCTION
168    
169 nigel 87 The match() function is highly recursive, though not every recursive call
170     increases the recursive depth. Nevertheless, some regular expressions can cause
171     it to recurse to a great depth. I was writing for Unix, so I just let it call
172     itself recursively. This uses the stack for saving everything that has to be
173     saved for a recursive call. On Unix, the stack can be large, and this works
174     fine.
175 nigel 77
176 nigel 87 It turns out that on some non-Unix-like systems there are problems with
177     programs that use a lot of stack. (This despite the fact that every last chip
178     has oodles of memory these days, and techniques for extending the stack have
179     been known for decades.) So....
180 nigel 77
181     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
182     calls by keeping local variables that need to be preserved in blocks of memory
183 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
184 nigel 77 achieve this so that the actual code doesn't look very different to what it
185     always used to.
186     ****************************************************************************
187     ***************************************************************************/
188    
189    
190 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
191     versions and production versions. */
192 nigel 77
193     #ifndef NO_RECURSE
#define REGISTER register

#ifndef DEBUG

/* Production versions: RMATCH is a plain recursive call to match() one level
deeper (rdepth+1), and RRETURN is a plain return. */

#define RMATCH(res,p_eptr,p_ecode,p_top,p_md,p_ims,p_eptrb,p_flags) \
  res = match(p_eptr,p_ecode,p_top,p_md,p_ims,p_eptrb,p_flags,rdepth+1)
#define RRETURN(val) return val

#else

/* Debugging versions: the same call and return, bracketed by trace output
that reports the source line of each call and each return. */

#define RMATCH(res,p_eptr,p_ecode,p_top,p_md,p_ims,p_eptrb,p_flags) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  res = match(p_eptr,p_ecode,p_top,p_md,p_ims,p_eptrb,p_flags,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(val) \
  { \
  printf("match() returned %d from line %d ", val, __LINE__); \
  return val; \
  }

#endif
212    
213 nigel 77 #else
214    
215    
216     /* These versions of the macros manage a private stack on the heap. Note
217     that the rd argument of RMATCH isn't actually used. It's the md argument of
218     match(), which never changes. */
219    
#define REGISTER

/* RMATCH simulates a recursive call to match() without using the C stack.
It obtains a new heapframe, records the resume point in the current frame with
setjmp(), copies the "arguments" into the new frame, makes it current, and
jumps to HEAP_RECURSE. When the simulated call finishes, RRETURN longjmps back
here; the else branch then reloads the current frame from md->thisframe and
delivers the result in rx.

If the frame cannot be allocated, the match is abandoned with
PCRE_ERROR_NOMEMORY instead of dereferencing a null pointer. RRETURN is used
for this so that the error unwinds through the existing frames in the normal
way. The rd argument (md) is not used because it never changes. */

#define RMATCH(rx,ra,rb,rc,rd,re,rf,rg)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  if (setjmp(frame->Xwhere) == 0)\
    {\
    newframe->Xeptr = ra;\
    newframe->Xecode = rb;\
    newframe->Xoffset_top = rc;\
    newframe->Xims = re;\
    newframe->Xeptrb = rf;\
    newframe->Xflags = rg;\
    newframe->Xrdepth = frame->Xrdepth + 1;\
    newframe->Xprevframe = frame;\
    frame = newframe;\
    DPRINTF(("restarting from line %d\n", __LINE__));\
    goto HEAP_RECURSE;\
    }\
  else\
    {\
    DPRINTF(("longjumped back to line %d\n", __LINE__));\
    frame = md->thisframe;\
    rx = frame->Xresult;\
    }\
  }
246    
/* RRETURN simulates a return from match(). The current frame is released and
its predecessor becomes current. With no predecessor this is the top level, so
a real return is done. Otherwise the result is left in the predecessor frame,
which is published in md->thisframe, and control longjmps to the setjmp() that
the predecessor's RMATCH recorded. */

#define RRETURN(ra)\
  {\
  heapframe *finished = frame;\
  frame = finished->Xprevframe;\
  (pcre_stack_free)(finished);\
  if (frame == NULL) return ra;\
  frame->Xresult = ra;\
  md->thisframe = frame;\
  longjmp(frame->Xwhere, 1);\
  }
260    
261    
262     /* Structure for remembering the local variables in a private frame */
263    
264     typedef struct heapframe {
265     struct heapframe *Xprevframe;
266    
267     /* Function arguments that may change */
268    
269     const uschar *Xeptr;
270     const uschar *Xecode;
271     int Xoffset_top;
272     long int Xims;
273     eptrblock *Xeptrb;
274     int Xflags;
275 nigel 91 unsigned int Xrdepth;
276 nigel 77
277     /* Function local variables */
278    
279     const uschar *Xcallpat;
280     const uschar *Xcharptr;
281     const uschar *Xdata;
282     const uschar *Xnext;
283     const uschar *Xpp;
284     const uschar *Xprev;
285     const uschar *Xsaved_eptr;
286    
287     recursion_info Xnew_recursive;
288    
289     BOOL Xcur_is_word;
290     BOOL Xcondition;
291     BOOL Xprev_is_word;
292    
293     unsigned long int Xoriginal_ims;
294    
295     #ifdef SUPPORT_UCP
296     int Xprop_type;
297 nigel 87 int Xprop_value;
298 nigel 77 int Xprop_fail_result;
299     int Xprop_category;
300     int Xprop_chartype;
301 nigel 87 int Xprop_script;
302 ph10 115 int Xoclength;
303     uschar Xocchars[8];
304 nigel 77 #endif
305    
306     int Xctype;
307 nigel 93 unsigned int Xfc;
308 nigel 77 int Xfi;
309     int Xlength;
310     int Xmax;
311     int Xmin;
312     int Xnumber;
313     int Xoffset;
314     int Xop;
315     int Xsave_capture_last;
316     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
317     int Xstacksave[REC_STACK_SAVE_MAX];
318    
319     eptrblock Xnewptrb;
320    
321     /* Place to pass back result, and where to jump back to */
322    
323     int Xresult;
324     jmp_buf Xwhere;
325    
326     } heapframe;
327    
328     #endif
329    
330    
331     /***************************************************************************
332     ***************************************************************************/
333    
334    
335    
336     /*************************************************
337     * Match from current position *
338     *************************************************/
339    
340 nigel 93 /* This function is called recursively in many circumstances. Whenever it
341 nigel 77 returns a negative (error) response, the outer incarnation must also return the
342     same response.
343    
344     Performance note: It might be tempting to extract commonly used fields from the
345     md structure (e.g. utf8, end_subject) into individual variables to improve
346     performance. Tests using gcc on a SPARC disproved this; in the first case, it
347     made performance worse.
348    
349     Arguments:
350 nigel 93 eptr pointer to current character in subject
351     ecode pointer to current position in compiled code
352 nigel 77 offset_top current top pointer
353     md pointer to "static" info for the match
354     ims current /i, /m, and /s options
355     eptrb pointer to chain of blocks containing eptr at start of
356     brackets - for testing for empty matches
357     flags can contain
358     match_condassert - this is an assertion condition
359 nigel 93 match_cbegroup - this is the start of an unlimited repeat
360     group that can match an empty string
361     match_tail_recursed - this is a tail_recursed group
362 nigel 87 rdepth the recursion depth
363 nigel 77
364     Returns: MATCH_MATCH if matched ) these values are >= 0
365     MATCH_NOMATCH if failed to match )
366     a negative PCRE_ERROR_xxx value if aborted by an error condition
367 nigel 87 (e.g. stopped by repeated call or recursion limit)
368 nigel 77 */
369    
370     static int
371 nigel 87 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
372 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
373 nigel 91 int flags, unsigned int rdepth)
374 nigel 77 {
375     /* These variables do not need to be preserved over recursion in this function,
376 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
377     "register" because they are used a lot in loops. */
378 nigel 77
379 nigel 91 register int rrc; /* Returns from recursive calls */
380     register int i; /* Used for loops not involving calls to RMATCH() */
381 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
382 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
383 nigel 77
384 nigel 93 BOOL minimize, possessive; /* Quantifier options */
385    
386 nigel 77 /* When recursion is not being used, all "local" variables that have to be
387     preserved over calls to RMATCH() are part of a "frame" which is obtained from
388     heap storage. Set up the top-level frame here; others are obtained from the
389     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
390    
391     #ifdef NO_RECURSE
392     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
393     frame->Xprevframe = NULL; /* Marks the top level */
394    
395     /* Copy in the original argument variables */
396    
397     frame->Xeptr = eptr;
398     frame->Xecode = ecode;
399     frame->Xoffset_top = offset_top;
400     frame->Xims = ims;
401     frame->Xeptrb = eptrb;
402     frame->Xflags = flags;
403 nigel 87 frame->Xrdepth = rdepth;
404 nigel 77
405     /* This is where control jumps back to to effect "recursion" */
406    
407     HEAP_RECURSE:
408    
409     /* Macros make the argument variables come from the current frame */
410    
411     #define eptr frame->Xeptr
412     #define ecode frame->Xecode
413     #define offset_top frame->Xoffset_top
414     #define ims frame->Xims
415     #define eptrb frame->Xeptrb
416     #define flags frame->Xflags
417 nigel 87 #define rdepth frame->Xrdepth
418 nigel 77
419     /* Ditto for the local variables */
420    
421     #ifdef SUPPORT_UTF8
422     #define charptr frame->Xcharptr
423     #endif
424     #define callpat frame->Xcallpat
425     #define data frame->Xdata
426     #define next frame->Xnext
427     #define pp frame->Xpp
428     #define prev frame->Xprev
429     #define saved_eptr frame->Xsaved_eptr
430    
431     #define new_recursive frame->Xnew_recursive
432    
433     #define cur_is_word frame->Xcur_is_word
434     #define condition frame->Xcondition
435     #define prev_is_word frame->Xprev_is_word
436    
437     #define original_ims frame->Xoriginal_ims
438    
439     #ifdef SUPPORT_UCP
440     #define prop_type frame->Xprop_type
441 nigel 87 #define prop_value frame->Xprop_value
442 nigel 77 #define prop_fail_result frame->Xprop_fail_result
443     #define prop_category frame->Xprop_category
444     #define prop_chartype frame->Xprop_chartype
445 nigel 87 #define prop_script frame->Xprop_script
446 ph10 115 #define oclength frame->Xoclength
447     #define occhars frame->Xocchars
448 nigel 77 #endif
449    
450     #define ctype frame->Xctype
451     #define fc frame->Xfc
452     #define fi frame->Xfi
453     #define length frame->Xlength
454     #define max frame->Xmax
455     #define min frame->Xmin
456     #define number frame->Xnumber
457     #define offset frame->Xoffset
458     #define op frame->Xop
459     #define save_capture_last frame->Xsave_capture_last
460     #define save_offset1 frame->Xsave_offset1
461     #define save_offset2 frame->Xsave_offset2
462     #define save_offset3 frame->Xsave_offset3
463     #define stacksave frame->Xstacksave
464    
465     #define newptrb frame->Xnewptrb
466    
467     /* When recursion is being used, local variables are allocated on the stack and
468     get preserved during recursion in the normal way. In this environment, fi and
469     i, and fc and c, can be the same variables. */
470    
471 nigel 93 #else /* NO_RECURSE not defined */
472 nigel 77 #define fi i
473     #define fc c
474    
475    
476 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
477     const uschar *charptr; /* in small blocks of the code. My normal */
478     #endif /* style of coding would have declared */
479     const uschar *callpat; /* them within each of those blocks. */
480     const uschar *data; /* However, in order to accommodate the */
481     const uschar *next; /* version of this code that uses an */
482     USPTR pp; /* external "stack" implemented on the */
483     const uschar *prev; /* heap, it is easier to declare them all */
484     USPTR saved_eptr; /* here, so the declarations can be cut */
485     /* out in a block. The only declarations */
486     recursion_info new_recursive; /* within blocks below are for variables */
487     /* that do not have to be preserved over */
488     BOOL cur_is_word; /* a recursive call to RMATCH(). */
489     BOOL condition;
490 nigel 77 BOOL prev_is_word;
491    
492     unsigned long int original_ims;
493    
494     #ifdef SUPPORT_UCP
495     int prop_type;
496 nigel 87 int prop_value;
497 nigel 77 int prop_fail_result;
498     int prop_category;
499     int prop_chartype;
500 nigel 87 int prop_script;
501 ph10 115 int oclength;
502     uschar occhars[8];
503 nigel 77 #endif
504    
505     int ctype;
506     int length;
507     int max;
508     int min;
509     int number;
510     int offset;
511     int op;
512     int save_capture_last;
513     int save_offset1, save_offset2, save_offset3;
514     int stacksave[REC_STACK_SAVE_MAX];
515    
516     eptrblock newptrb;
517 nigel 93 #endif /* NO_RECURSE */
518 nigel 77
519     /* These statements are here to stop the compiler complaining about uninitialized
520     variables. */
521    
522     #ifdef SUPPORT_UCP
523 nigel 87 prop_value = 0;
524 nigel 77 prop_fail_result = 0;
525     #endif
526    
527 nigel 93
528 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
529     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
530     used. Thanks to Ian Taylor for noticing this possibility and sending the
531     original patch. */
532    
533     TAIL_RECURSE:
534    
535 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
536     are specified by the macro RMATCH and RRETURN is used to return. When
537     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
538     and a "return", respectively (possibly with some debugging if DEBUG is
539     defined). However, RMATCH isn't like a function call because it's quite a
540     complicated macro. It has to be used in one particular way. This shouldn't,
541     however, impact performance when true recursion is being used. */
542 nigel 77
543 nigel 87 /* First check that we haven't called match() too many times, or that we
544     haven't exceeded the recursive call limit. */
545    
546 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
547 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
548 nigel 77
549     original_ims = ims; /* Save for resetting on ')' */
550 nigel 91
551     #ifdef SUPPORT_UTF8
552 nigel 77 utf8 = md->utf8; /* Local copy of the flag */
553 nigel 91 #else
554     utf8 = FALSE;
555     #endif
556 nigel 77
557 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
558     string, the match_cbegroup flag is set. When this is the case, add the current
559     subject pointer to the chain of such remembered pointers, to be checked when we
560     hit the closing ket, in order to break infinite loops that match no characters.
561     When match() is called in other circumstances, don't add to the chain. If this
562     is a tail recursion, use a block from the workspace, as the one on the stack is
563     already used. */
564 nigel 77
565 nigel 93 if ((flags & match_cbegroup) != 0)
566 nigel 77 {
567 nigel 93 eptrblock *p;
568     if ((flags & match_tail_recursed) != 0)
569     {
570     if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
571     p = md->eptrchain + md->eptrn++;
572     }
573     else p = &newptrb;
574     p->epb_saved_eptr = eptr;
575     p->epb_prev = eptrb;
576     eptrb = p;
577 nigel 77 }
578    
579 nigel 93 /* Now start processing the opcodes. */
580 nigel 77
581     for (;;)
582     {
583 nigel 93 minimize = possessive = FALSE;
584 nigel 77 op = *ecode;
585    
586     /* For partial matching, remember if we ever hit the end of the subject after
587     matching at least one subject character. */
588    
589     if (md->partial &&
590     eptr >= md->end_subject &&
591     eptr > md->start_match)
592     md->hitend = TRUE;
593    
594 nigel 93 switch(op)
595     {
596     /* Handle a capturing bracket. If there is space in the offset vector, save
597     the current subject position in the working slot at the top of the vector.
598     We mustn't change the current values of the data slot, because they may be
599     set from a previous iteration of this group, and be referred to by a
600     reference inside the group.
601 nigel 77
602 nigel 93 If the bracket fails to match, we need to restore this value and also the
603     values of the final offsets, in case they were set by a previous iteration
604     of the same bracket.
605 nigel 77
606 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
607     a non-capturing bracket. Don't worry about setting the flag for the error
608     case here; that is handled in the code for KET. */
609 nigel 77
610 nigel 93 case OP_CBRA:
611     case OP_SCBRA:
612     number = GET2(ecode, 1+LINK_SIZE);
613 nigel 77 offset = number << 1;
614    
615     #ifdef DEBUG
616 nigel 93 printf("start bracket %d\n", number);
617     printf("subject=");
618 nigel 77 pchars(eptr, 16, TRUE, md);
619     printf("\n");
620     #endif
621    
622     if (offset < md->offset_max)
623     {
624     save_offset1 = md->offset_vector[offset];
625     save_offset2 = md->offset_vector[offset+1];
626     save_offset3 = md->offset_vector[md->offset_end - number];
627     save_capture_last = md->capture_last;
628    
629     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
630     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
631    
632 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
633 nigel 77 do
634     {
635 nigel 93 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
636     ims, eptrb, flags);
637 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
638     md->capture_last = save_capture_last;
639     ecode += GET(ecode, 1);
640     }
641     while (*ecode == OP_ALT);
642    
643     DPRINTF(("bracket %d failed\n", number));
644    
645     md->offset_vector[offset] = save_offset1;
646     md->offset_vector[offset+1] = save_offset2;
647     md->offset_vector[md->offset_end - number] = save_offset3;
648    
649     RRETURN(MATCH_NOMATCH);
650     }
651    
652 nigel 93 /* Insufficient room for saving captured contents. Treat as a non-capturing
653     bracket. */
654 nigel 77
655 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
656 nigel 77
657 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
658     final alternative within the brackets, we would return the result of a
659     recursive call to match() whatever happened. We can reduce stack usage by
660     turning this into a tail recursion. */
661 nigel 77
662 nigel 93 case OP_BRA:
663     case OP_SBRA:
664     DPRINTF(("start non-capturing bracket\n"));
665     flags = (op >= OP_SBRA)? match_cbegroup : 0;
666 nigel 91 for (;;)
667 nigel 77 {
668 nigel 91 if (ecode[GET(ecode, 1)] != OP_ALT)
669 nigel 93 {
670     ecode += _pcre_OP_lengths[*ecode];
671     flags |= match_tail_recursed;
672     DPRINTF(("bracket 0 tail recursion\n"));
673     goto TAIL_RECURSE;
674     }
675 nigel 91
676     /* For non-final alternatives, continue the loop for a NOMATCH result;
677     otherwise return. */
678    
679 nigel 93 RMATCH(rrc, eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
680     eptrb, flags);
681 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682     ecode += GET(ecode, 1);
683     }
684 nigel 91 /* Control never reaches here. */
685 nigel 77
686     /* Conditional group: compilation checked that there are no more than
687     two branches. If the condition is false, skipping the first branch takes us
688     past the end if there is only one branch, but that's OK because that is
689 nigel 91 exactly what going to the ket would do. As there is only one branch to be
690     obeyed, we can use tail recursion to avoid using another stack frame. */
691 nigel 77
692     case OP_COND:
693 nigel 93 case OP_SCOND:
694     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
695 nigel 77 {
696 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
697     condition = md->recursive != NULL &&
698     (offset == RREF_ANY || offset == md->recursive->group_num);
699     ecode += condition? 3 : GET(ecode, 1);
700     }
701    
702     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
703     {
704 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
705 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
706     ecode += condition? 3 : GET(ecode, 1);
707 nigel 77 }
708    
709 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
710     {
711     condition = FALSE;
712     ecode += GET(ecode, 1);
713     }
714    
715 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
716 nigel 93 the final argument match_condassert causes it to stop at the end of an
717     assertion. */
718 nigel 77
719     else
720     {
721     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
722 nigel 93 match_condassert);
723 nigel 77 if (rrc == MATCH_MATCH)
724     {
725 nigel 93 condition = TRUE;
726     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
727 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
728     }
729     else if (rrc != MATCH_NOMATCH)
730     {
731     RRETURN(rrc); /* Need braces because of following else */
732     }
733 nigel 93 else
734     {
735     condition = FALSE;
736     ecode += GET(ecode, 1);
737     }
738     }
739 nigel 91
740 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
741     we can use tail recursion to avoid using another stack frame. If the second
742     alternative doesn't exist, we can just plough on. */
743 nigel 91
744 nigel 93 if (condition || *ecode == OP_ALT)
745     {
746 nigel 91 ecode += 1 + LINK_SIZE;
747 nigel 93 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
748 nigel 91 goto TAIL_RECURSE;
749 nigel 77 }
750 nigel 93 else
751     {
752     ecode += 1 + LINK_SIZE;
753     }
754     break;
755 nigel 77
756    
757 nigel 93 /* End of the pattern. If we are in a top-level recursion, we should
758     restore the offsets appropriately and continue from after the call. */
759 nigel 77
760     case OP_END:
761     if (md->recursive != NULL && md->recursive->group_num == 0)
762     {
763     recursion_info *rec = md->recursive;
764 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
765 nigel 77 md->recursive = rec->prevrec;
766     memmove(md->offset_vector, rec->offset_save,
767     rec->saved_max * sizeof(int));
768     md->start_match = rec->save_start;
769     ims = original_ims;
770     ecode = rec->after_call;
771     break;
772     }
773    
774     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
775     string - backtracking will then try other alternatives, if any. */
776    
777     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
778     md->end_match_ptr = eptr; /* Record where we ended */
779     md->end_offset_top = offset_top; /* and how many extracts were taken */
780     RRETURN(MATCH_MATCH);
781    
782     /* Change option settings */
783    
784     case OP_OPT:
785     ims = ecode[1];
786     ecode += 2;
787     DPRINTF(("ims set to %02lx\n", ims));
788     break;
789    
790     /* Assertion brackets. Check the alternative branches in turn - the
791     matching won't pass the KET for an assertion. If any one branch matches,
792     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
793     start of each branch to move the current point backwards, so the code at
794     this level is identical to the lookahead case. */
795    
796     case OP_ASSERT:
797     case OP_ASSERTBACK:
798     do
799     {
800 nigel 93 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
801 nigel 77 if (rrc == MATCH_MATCH) break;
802     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
803     ecode += GET(ecode, 1);
804     }
805     while (*ecode == OP_ALT);
806     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
807    
808     /* If checking an assertion for a condition, return MATCH_MATCH. */
809    
810     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
811    
812     /* Continue from after the assertion, updating the offsets high water
813     mark, since extracts may have been taken during the assertion. */
814    
815     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
816     ecode += 1 + LINK_SIZE;
817     offset_top = md->end_offset_top;
818     continue;
819    
820     /* Negative assertion: all branches must fail to match */
821    
822     case OP_ASSERT_NOT:
823     case OP_ASSERTBACK_NOT:
824     do
825     {
826 nigel 93 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0);
827 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
828     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
829     ecode += GET(ecode,1);
830     }
831     while (*ecode == OP_ALT);
832    
833     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
834    
835     ecode += 1 + LINK_SIZE;
836     continue;
837    
838     /* Move the subject pointer back. This occurs only at the start of
839     each branch of a lookbehind assertion. If we are too close to the start to
840     move back, this match function fails. When working with UTF-8 we move
841     back a number of characters, not bytes. */
842    
843     case OP_REVERSE:
844     #ifdef SUPPORT_UTF8
845     if (utf8)
846     {
847 nigel 93 i = GET(ecode, 1);
848     while (i-- > 0)
849 nigel 77 {
850     eptr--;
851     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
852     BACKCHAR(eptr)
853     }
854     }
855     else
856     #endif
857    
858     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
859    
860     {
861 nigel 93 eptr -= GET(ecode, 1);
862 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
863     }
864    
865     /* Skip to next op code */
866    
867     ecode += 1 + LINK_SIZE;
868     break;
869    
870     /* The callout item calls an external function, if one is provided, passing
871     details of the match so far. This is mainly for debugging, though the
872     function is able to force a failure. */
873    
874     case OP_CALLOUT:
875     if (pcre_callout != NULL)
876     {
877     pcre_callout_block cb;
878     cb.version = 1; /* Version 1 of the callout block */
879     cb.callout_number = ecode[1];
880     cb.offset_vector = md->offset_vector;
881 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
882 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
883     cb.start_match = md->start_match - md->start_subject;
884     cb.current_position = eptr - md->start_subject;
885     cb.pattern_position = GET(ecode, 2);
886     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
887     cb.capture_top = offset_top/2;
888     cb.capture_last = md->capture_last;
889     cb.callout_data = md->callout_data;
890     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
891     if (rrc < 0) RRETURN(rrc);
892     }
893     ecode += 2 + 2*LINK_SIZE;
894     break;
895    
896     /* Recursion either matches the current regex, or some subexpression. The
897     offset data is the offset to the starting bracket from the start of the
898     whole pattern. (This is so that it works from duplicated subpatterns.)
899    
900     If there are any capturing brackets started but not finished, we have to
901     save their starting points and reinstate them after the recursion. However,
902     we don't know how many such there are (offset_top records the completed
903     total) so we just have to save all the potential data. There may be up to
904     65535 such values, which is too large to put on the stack, but using malloc
905     for small numbers seems expensive. As a compromise, the stack is used when
906     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
907     is used. If the malloc fails, nothing is saved and the recursion is not
908     attempted; the match is abandoned at this point by returning
909     PCRE_ERROR_NOMEMORY.
910    
911     There are also other values that have to be saved. We use a chained
912     sequence of blocks that actually live on the stack. Thanks to Robin Houston
913     for the original version of this logic. */
914    
915     case OP_RECURSE:
916     {
917     callpat = md->start_code + GET(ecode, 1);
918 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
919     GET2(callpat, 1 + LINK_SIZE);
920 nigel 77
921     /* Add to "recursing stack" */
922    
923     new_recursive.prevrec = md->recursive;
924     md->recursive = &new_recursive;
925    
926     /* Find where to continue from afterwards */
927    
928     ecode += 1 + LINK_SIZE;
929     new_recursive.after_call = ecode;
930    
931     /* Now save the offset data. */
932    
933     new_recursive.saved_max = md->offset_end;
934     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
935     new_recursive.offset_save = stacksave;
936     else
937     {
938     new_recursive.offset_save =
939     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
940     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
941     }
942    
943     memcpy(new_recursive.offset_save, md->offset_vector,
944     new_recursive.saved_max * sizeof(int));
945     new_recursive.save_start = md->start_match;
946     md->start_match = eptr;
947    
948     /* OK, now we can do the recursion. For each top-level alternative we
949     restore the offset and recursion data. */
950    
951     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
952 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
953 nigel 77 do
954     {
955 nigel 93 RMATCH(rrc, eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
956     md, ims, eptrb, flags);
957 nigel 77 if (rrc == MATCH_MATCH)
958     {
959 nigel 87 DPRINTF(("Recursion matched\n"));
960 nigel 77 md->recursive = new_recursive.prevrec;
961     if (new_recursive.offset_save != stacksave)
962     (pcre_free)(new_recursive.offset_save);
963     RRETURN(MATCH_MATCH);
964     }
965 nigel 87 else if (rrc != MATCH_NOMATCH)
966     {
967     DPRINTF(("Recursion gave error %d\n", rrc));
968     RRETURN(rrc);
969     }
970 nigel 77
971     md->recursive = &new_recursive;
972     memcpy(md->offset_vector, new_recursive.offset_save,
973     new_recursive.saved_max * sizeof(int));
974     callpat += GET(callpat, 1);
975     }
976     while (*callpat == OP_ALT);
977    
978     DPRINTF(("Recursion didn't match\n"));
979     md->recursive = new_recursive.prevrec;
980     if (new_recursive.offset_save != stacksave)
981     (pcre_free)(new_recursive.offset_save);
982     RRETURN(MATCH_NOMATCH);
983     }
984     /* Control never reaches here */
985    
986     /* "Once" brackets are like assertion brackets except that after a match,
987     the point in the subject string is not moved back. Thus there can never be
988     a move back into the brackets. Friedl calls these "atomic" subpatterns.
989     Check the alternative branches in turn - the matching won't pass the KET
990     for this kind of subpattern. If any one branch matches, we carry on as at
991     the end of a normal bracket, leaving the subject pointer where the match inside the group ended. */
992    
993     case OP_ONCE:
994 nigel 91 prev = ecode;
995     saved_eptr = eptr;
996    
997     do
998 nigel 77 {
999 nigel 91 RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1000 nigel 93 eptrb, 0);
1001 nigel 91 if (rrc == MATCH_MATCH) break;
1002     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1003     ecode += GET(ecode,1);
1004     }
1005     while (*ecode == OP_ALT);
1006 nigel 77
1007 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1008 nigel 77
1009 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1010 nigel 77
1011 nigel 91 /* Continue from after the atomic group, updating the offsets high water
1012     mark, since extracts may have been taken. */
1013 nigel 77
1014 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1015 nigel 77
1016 nigel 91 offset_top = md->end_offset_top;
1017     eptr = md->end_match_ptr;
1018 nigel 77
1019 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1020     happens for a repeating ket if no characters were matched in the group.
1021     This is the forcible breaking of infinite loops as implemented in Perl
1022     5.005. If there is an options reset, it will get obeyed in the normal
1023     course of events. */
1024 nigel 77
1025 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1026     {
1027     ecode += 1+LINK_SIZE;
1028     break;
1029     }
1030 nigel 77
1031 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1032     preceding bracket, in the appropriate order. The second "call" of match()
1033     uses tail recursion, to avoid using another stack frame. We need to reset
1034     any options that changed within the bracket before re-running it, so
1035     check the next opcode. */
1036 nigel 77
1037 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1038     {
1039     ims = (ims & ~PCRE_IMS) | ecode[4];
1040     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1041     }
1042 nigel 77
1043 nigel 91 if (*ecode == OP_KETRMIN)
1044     {
1045     RMATCH(rrc, eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0);
1046     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1047     ecode = prev;
1048 nigel 93 flags = match_tail_recursed;
1049 nigel 91 goto TAIL_RECURSE;
1050 nigel 77 }
1051 nigel 91 else /* OP_KETRMAX */
1052     {
1053 nigel 93 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, match_cbegroup);
1054 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1055     ecode += 1 + LINK_SIZE;
1056 nigel 93 flags = match_tail_recursed;
1057 nigel 91 goto TAIL_RECURSE;
1058     }
1059     /* Control never gets here */
1060 nigel 77
1061     /* An alternation is the end of a branch; scan along to find the end of the
1062     bracketed group and go to there. */
1063    
1064     case OP_ALT:
1065     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1066     break;
1067    
1068     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1069     that it may occur zero times. It may repeat infinitely, or not at all -
1070     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1071     repeat limits are compiled as a number of copies, with the optional ones
1072     preceded by BRAZERO or BRAMINZERO. */
1073    
1074     case OP_BRAZERO:
1075     {
1076     next = ecode+1;
1077 nigel 93 RMATCH(rrc, eptr, next, offset_top, md, ims, eptrb, 0);
1078 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1079     do next += GET(next,1); while (*next == OP_ALT);
1080 nigel 93 ecode = next + 1 + LINK_SIZE;
1081 nigel 77 }
1082     break;
1083    
1084     case OP_BRAMINZERO:
1085     {
1086     next = ecode+1;
1087 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1088     RMATCH(rrc, eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1089 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1090     ecode++;
1091     }
1092     break;
1093    
1094 nigel 93 /* End of a group, repeated or non-repeating. */
1095 nigel 77
1096     case OP_KET:
1097     case OP_KETRMIN:
1098     case OP_KETRMAX:
1099 nigel 91 prev = ecode - GET(ecode, 1);
1100 nigel 77
1101 nigel 93 /* If this was a group that remembered the subject start, in order to break
1102     infinite repeats of empty string matches, retrieve the subject start from
1103     the chain. Otherwise, set it NULL. */
1104 nigel 77
1105 nigel 93 if (*prev >= OP_SBRA)
1106     {
1107     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1108     eptrb = eptrb->epb_prev; /* Backup to previous group */
1109     }
1110     else saved_eptr = NULL;
1111 nigel 77
1112 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1113     MATCH_MATCH, but record the current high water mark for use by positive
1114     assertions. Do this also for the "once" (atomic) groups. */
1115    
1116 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1117     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1118     *prev == OP_ONCE)
1119     {
1120     md->end_match_ptr = eptr; /* For ONCE */
1121     md->end_offset_top = offset_top;
1122     RRETURN(MATCH_MATCH);
1123     }
1124 nigel 77
1125 nigel 93 /* For capturing groups we have to check the group number back at the start
1126     and if necessary complete handling an extraction by setting the offsets and
1127     bumping the high water mark. Note that whole-pattern recursion is coded as
1128     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1129     when the OP_END is reached. Other recursion is handled here. */
1130 nigel 77
1131 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1132 nigel 91 {
1133 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1134 nigel 91 offset = number << 1;
1135 nigel 77
1136     #ifdef DEBUG
1137 nigel 91 printf("end bracket %d", number);
1138     printf("\n");
1139 nigel 77 #endif
1140    
1141 nigel 93 md->capture_last = number;
1142     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1143 nigel 91 {
1144 nigel 93 md->offset_vector[offset] =
1145     md->offset_vector[md->offset_end - number];
1146     md->offset_vector[offset+1] = eptr - md->start_subject;
1147     if (offset_top <= offset) offset_top = offset + 2;
1148     }
1149 nigel 77
1150 nigel 93 /* Handle a recursively called group. Restore the offsets
1151     appropriately and continue from after the call. */
1152 nigel 77
1153 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1154     {
1155     recursion_info *rec = md->recursive;
1156     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1157     md->recursive = rec->prevrec;
1158     md->start_match = rec->save_start;
1159     memcpy(md->offset_vector, rec->offset_save,
1160     rec->saved_max * sizeof(int));
1161     ecode = rec->after_call;
1162     ims = original_ims;
1163     break;
1164 nigel 77 }
1165 nigel 91 }
1166 nigel 77
1167 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1168     flags, in case they got changed during the group. */
1169 nigel 77
1170 nigel 91 ims = original_ims;
1171     DPRINTF(("ims reset to %02lx\n", ims));
1172 nigel 77
1173 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1174     happens for a repeating ket if no characters were matched in the group.
1175     This is the forcible breaking of infinite loops as implemented in Perl
1176     5.005. If there is an options reset, it will get obeyed in the normal
1177     course of events. */
1178 nigel 77
1179 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1180     {
1181     ecode += 1 + LINK_SIZE;
1182     break;
1183     }
1184 nigel 77
1185 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1186     preceding bracket, in the appropriate order. In the second case, we can use
1187     tail recursion to avoid using another stack frame. */
1188 nigel 77
1189 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1190    
1191 nigel 91 if (*ecode == OP_KETRMIN)
1192     {
1193     RMATCH(rrc, eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0);
1194     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1195     ecode = prev;
1196 nigel 93 flags |= match_tail_recursed;
1197 nigel 91 goto TAIL_RECURSE;
1198 nigel 77 }
1199 nigel 91 else /* OP_KETRMAX */
1200     {
1201 nigel 93 RMATCH(rrc, eptr, prev, offset_top, md, ims, eptrb, flags);
1202 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1203     ecode += 1 + LINK_SIZE;
1204 nigel 93 flags = match_tail_recursed;
1205 nigel 91 goto TAIL_RECURSE;
1206     }
1207     /* Control never gets here */
1208 nigel 77
1209     /* Start of subject unless notbol, or after internal newline if multiline */
1210    
1211     case OP_CIRC:
1212     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1213     if ((ims & PCRE_MULTILINE) != 0)
1214     {
1215 nigel 91 if (eptr != md->start_subject &&
1216 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1217 nigel 77 RRETURN(MATCH_NOMATCH);
1218     ecode++;
1219     break;
1220     }
1221     /* ... else fall through */
1222    
1223     /* Start of subject assertion */
1224    
1225     case OP_SOD:
1226     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1227     ecode++;
1228     break;
1229    
1230     /* Start of match assertion */
1231    
1232     case OP_SOM:
1233     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1234     ecode++;
1235     break;
1236    
1237     /* Assert before internal newline if multiline, or before a terminating
1238     newline unless endonly is set, else end of subject unless noteol is set. */
1239    
1240     case OP_DOLL:
1241     if ((ims & PCRE_MULTILINE) != 0)
1242     {
1243     if (eptr < md->end_subject)
1244 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1245 nigel 77 else
1246     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1247     ecode++;
1248     break;
1249     }
1250     else
1251     {
1252     if (md->noteol) RRETURN(MATCH_NOMATCH);
1253     if (!md->endonly)
1254     {
1255 nigel 91 if (eptr != md->end_subject &&
1256 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1257 nigel 77 RRETURN(MATCH_NOMATCH);
1258     ecode++;
1259     break;
1260     }
1261     }
1262 nigel 91 /* ... else fall through for endonly */
1263 nigel 77
1264     /* End of subject assertion (\z) */
1265    
1266     case OP_EOD:
1267     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1268     ecode++;
1269     break;
1270    
1271     /* End of subject or ending \n assertion (\Z) */
1272    
1273     case OP_EODN:
1274 nigel 91 if (eptr != md->end_subject &&
1275 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1276 nigel 91 RRETURN(MATCH_NOMATCH);
1277 nigel 77 ecode++;
1278     break;
1279    
1280     /* Word boundary assertions */
1281    
1282     case OP_NOT_WORD_BOUNDARY:
1283     case OP_WORD_BOUNDARY:
1284     {
1285    
1286     /* Find out if the previous and current characters are "word" characters.
1287     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1288     be "non-word" characters. */
1289    
1290     #ifdef SUPPORT_UTF8
1291     if (utf8)
1292     {
1293     if (eptr == md->start_subject) prev_is_word = FALSE; else
1294     {
1295     const uschar *lastptr = eptr - 1;
1296     while((*lastptr & 0xc0) == 0x80) lastptr--;
1297     GETCHAR(c, lastptr);
1298     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1299     }
1300     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1301     {
1302     GETCHAR(c, eptr);
1303     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1304     }
1305     }
1306     else
1307     #endif
1308    
1309     /* More streamlined when not in UTF-8 mode */
1310    
1311     {
1312     prev_is_word = (eptr != md->start_subject) &&
1313     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1314     cur_is_word = (eptr < md->end_subject) &&
1315     ((md->ctypes[*eptr] & ctype_word) != 0);
1316     }
1317    
1318     /* Now see if the situation is what we want */
1319    
1320     if ((*ecode++ == OP_WORD_BOUNDARY)?
1321     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1322     RRETURN(MATCH_NOMATCH);
1323     }
1324     break;
1325    
1326     /* Match a single character type; inline for speed */
1327    
1328     case OP_ANY:
1329 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1330     {
1331 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1332 nigel 91 }
1333 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1334     if (utf8)
1335     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1336     ecode++;
1337     break;
1338    
1339     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1340     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1341    
1342     case OP_ANYBYTE:
1343     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1344     ecode++;
1345     break;
1346    
1347     case OP_NOT_DIGIT:
1348     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1349     GETCHARINCTEST(c, eptr);
1350     if (
1351     #ifdef SUPPORT_UTF8
1352     c < 256 &&
1353     #endif
1354     (md->ctypes[c] & ctype_digit) != 0
1355     )
1356     RRETURN(MATCH_NOMATCH);
1357     ecode++;
1358     break;
1359    
1360     case OP_DIGIT:
1361     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1362     GETCHARINCTEST(c, eptr);
1363     if (
1364     #ifdef SUPPORT_UTF8
1365     c >= 256 ||
1366     #endif
1367     (md->ctypes[c] & ctype_digit) == 0
1368     )
1369     RRETURN(MATCH_NOMATCH);
1370     ecode++;
1371     break;
1372    
1373     case OP_NOT_WHITESPACE:
1374     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1375     GETCHARINCTEST(c, eptr);
1376     if (
1377     #ifdef SUPPORT_UTF8
1378     c < 256 &&
1379     #endif
1380     (md->ctypes[c] & ctype_space) != 0
1381     )
1382     RRETURN(MATCH_NOMATCH);
1383     ecode++;
1384     break;
1385    
1386     case OP_WHITESPACE:
1387     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1388     GETCHARINCTEST(c, eptr);
1389     if (
1390     #ifdef SUPPORT_UTF8
1391     c >= 256 ||
1392     #endif
1393     (md->ctypes[c] & ctype_space) == 0
1394     )
1395     RRETURN(MATCH_NOMATCH);
1396     ecode++;
1397     break;
1398    
1399     case OP_NOT_WORDCHAR:
1400     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1401     GETCHARINCTEST(c, eptr);
1402     if (
1403     #ifdef SUPPORT_UTF8
1404     c < 256 &&
1405     #endif
1406     (md->ctypes[c] & ctype_word) != 0
1407     )
1408     RRETURN(MATCH_NOMATCH);
1409     ecode++;
1410     break;
1411    
1412     case OP_WORDCHAR:
1413     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1414     GETCHARINCTEST(c, eptr);
1415     if (
1416     #ifdef SUPPORT_UTF8
1417     c >= 256 ||
1418     #endif
1419     (md->ctypes[c] & ctype_word) == 0
1420     )
1421     RRETURN(MATCH_NOMATCH);
1422     ecode++;
1423     break;
1424    
1425 nigel 93 case OP_ANYNL:
1426     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1427     GETCHARINCTEST(c, eptr);
1428     switch(c)
1429     {
1430     default: RRETURN(MATCH_NOMATCH);
1431     case 0x000d:
1432     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1433     break;
1434     case 0x000a:
1435     case 0x000b:
1436     case 0x000c:
1437     case 0x0085:
1438     case 0x2028:
1439     case 0x2029:
1440     break;
1441     }
1442     ecode++;
1443     break;
1444    
1445 nigel 77 #ifdef SUPPORT_UCP
1446     /* Check the next character by Unicode property. We will get here only
1447     if the support is in the binary; otherwise a compile-time error occurs. */
1448    
1449     case OP_PROP:
1450     case OP_NOTPROP:
1451     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1452     GETCHARINCTEST(c, eptr);
1453     {
1454 nigel 87 int chartype, script;
1455     int category = _pcre_ucp_findprop(c, &chartype, &script);
1456 nigel 77
1457 nigel 87 switch(ecode[1])
1458     {
1459     case PT_ANY:
1460     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1461     break;
1462 nigel 77
1463 nigel 87 case PT_LAMP:
1464     if ((chartype == ucp_Lu ||
1465     chartype == ucp_Ll ||
1466     chartype == ucp_Lt) == (op == OP_NOTPROP))
1467 nigel 77 RRETURN(MATCH_NOMATCH);
1468 nigel 87 break;
1469    
1470     case PT_GC:
1471     if ((ecode[2] != category) == (op == OP_PROP))
1472 nigel 77 RRETURN(MATCH_NOMATCH);
1473 nigel 87 break;
1474    
1475     case PT_PC:
1476     if ((ecode[2] != chartype) == (op == OP_PROP))
1477     RRETURN(MATCH_NOMATCH);
1478     break;
1479    
1480     case PT_SC:
1481     if ((ecode[2] != script) == (op == OP_PROP))
1482     RRETURN(MATCH_NOMATCH);
1483     break;
1484    
1485     default:
1486     RRETURN(PCRE_ERROR_INTERNAL);
1487 nigel 77 }
1488 nigel 87
1489     ecode += 3;
1490 nigel 77 }
1491     break;
1492    
1493     /* Match an extended Unicode sequence. We will get here only if the support
1494     is in the binary; otherwise a compile-time error occurs. */
1495    
1496     case OP_EXTUNI:
1497     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1498     GETCHARINCTEST(c, eptr);
1499     {
1500 nigel 87 int chartype, script;
1501     int category = _pcre_ucp_findprop(c, &chartype, &script);
1502 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1503     while (eptr < md->end_subject)
1504     {
1505     int len = 1;
1506     if (!utf8) c = *eptr; else
1507     {
1508     GETCHARLEN(c, eptr, len);
1509     }
1510 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1511 nigel 77 if (category != ucp_M) break;
1512     eptr += len;
1513     }
1514     }
1515     ecode++;
1516     break;
1517     #endif
1518    
1519    
1520     /* Match a back reference, possibly repeatedly. Look past the end of the
1521     item to see if there is repeat information following. The code is similar
1522     to that for character classes, but repeated for efficiency. Then obey
1523     similar code to character type repeats - written out again for speed.
1524     However, if the referenced string is the empty string, always treat
1525     it as matched, any number of times (otherwise there could be infinite
1526     loops). */
1527    
1528     case OP_REF:
1529     {
1530     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1531     ecode += 3; /* Advance past item */
1532    
1533     /* If the reference is unset, set the length to be longer than the amount
1534     of subject left; this ensures that every attempt at a match fails. We
1535     can't just fail here, because of the possibility of quantifiers with zero
1536     minima. */
1537    
1538     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1539     md->end_subject - eptr + 1 :
1540     md->offset_vector[offset+1] - md->offset_vector[offset];
1541    
1542     /* Set up for repetition, or handle the non-repeated case */
1543    
1544     switch (*ecode)
1545     {
1546     case OP_CRSTAR:
1547     case OP_CRMINSTAR:
1548     case OP_CRPLUS:
1549     case OP_CRMINPLUS:
1550     case OP_CRQUERY:
1551     case OP_CRMINQUERY:
1552     c = *ecode++ - OP_CRSTAR;
1553     minimize = (c & 1) != 0;
1554     min = rep_min[c]; /* Pick up values from tables; */
1555     max = rep_max[c]; /* zero for max => infinity */
1556     if (max == 0) max = INT_MAX;
1557     break;
1558    
1559     case OP_CRRANGE:
1560     case OP_CRMINRANGE:
1561     minimize = (*ecode == OP_CRMINRANGE);
1562     min = GET2(ecode, 1);
1563     max = GET2(ecode, 3);
1564     if (max == 0) max = INT_MAX;
1565     ecode += 5;
1566     break;
1567    
1568     default: /* No repeat follows */
1569     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1570     eptr += length;
1571     continue; /* With the main loop */
1572     }
1573    
1574     /* If the length of the reference is zero, just continue with the
1575     main loop. */
1576    
1577     if (length == 0) continue;
1578    
1579     /* First, ensure the minimum number of matches are present. We get back
1580     the length of the reference string explicitly rather than passing the
1581     address of eptr, so that eptr can be a register variable. */
1582    
1583     for (i = 1; i <= min; i++)
1584     {
1585     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1586     eptr += length;
1587     }
1588    
1589     /* If min = max, continue at the same level without recursion.
1590     They are not both allowed to be zero. */
1591    
1592     if (min == max) continue;
1593    
1594     /* If minimizing, keep trying and advancing the pointer */
1595    
1596     if (minimize)
1597     {
1598     for (fi = min;; fi++)
1599     {
1600     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1601     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1602     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1603     RRETURN(MATCH_NOMATCH);
1604     eptr += length;
1605     }
1606     /* Control never gets here */
1607     }
1608    
1609     /* If maximizing, find the longest string and work backwards */
1610    
1611     else
1612     {
1613     pp = eptr;
1614     for (i = min; i < max; i++)
1615     {
1616     if (!match_ref(offset, eptr, length, md, ims)) break;
1617     eptr += length;
1618     }
1619     while (eptr >= pp)
1620     {
1621     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1622     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1623     eptr -= length;
1624     }
1625     RRETURN(MATCH_NOMATCH);
1626     }
1627     }
1628     /* Control never gets here */
1629    
1630    
1631    
1632     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1633     used when all the characters in the class have values in the range 0-255,
1634     and either the matching is caseful, or the characters are in the range
1635     0-127 when UTF-8 processing is enabled. The only difference between
1636     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1637     encountered.
1638    
1639     First, look past the end of the item to see if there is repeat information
1640     following. Then obey similar code to character type repeats - written out
1641     again for speed. */
1642    
1643     case OP_NCLASS:
1644     case OP_CLASS:
1645     {
1646     data = ecode + 1; /* Save for matching */
1647     ecode += 33; /* Advance past the item */
1648    
1649     switch (*ecode)
1650     {
1651     case OP_CRSTAR:
1652     case OP_CRMINSTAR:
1653     case OP_CRPLUS:
1654     case OP_CRMINPLUS:
1655     case OP_CRQUERY:
1656     case OP_CRMINQUERY:
1657     c = *ecode++ - OP_CRSTAR;
1658     minimize = (c & 1) != 0;
1659     min = rep_min[c]; /* Pick up values from tables; */
1660     max = rep_max[c]; /* zero for max => infinity */
1661     if (max == 0) max = INT_MAX;
1662     break;
1663    
1664     case OP_CRRANGE:
1665     case OP_CRMINRANGE:
1666     minimize = (*ecode == OP_CRMINRANGE);
1667     min = GET2(ecode, 1);
1668     max = GET2(ecode, 3);
1669     if (max == 0) max = INT_MAX;
1670     ecode += 5;
1671     break;
1672    
1673     default: /* No repeat follows */
1674     min = max = 1;
1675     break;
1676     }
1677    
1678     /* First, ensure the minimum number of matches are present. */
1679    
1680     #ifdef SUPPORT_UTF8
1681     /* UTF-8 mode */
1682     if (utf8)
1683     {
1684     for (i = 1; i <= min; i++)
1685     {
1686     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1687     GETCHARINC(c, eptr);
1688     if (c > 255)
1689     {
1690     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1691     }
1692     else
1693     {
1694     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1695     }
1696     }
1697     }
1698     else
1699     #endif
1700     /* Not UTF-8 mode */
1701     {
1702     for (i = 1; i <= min; i++)
1703     {
1704     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1705     c = *eptr++;
1706     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1707     }
1708     }
1709    
1710     /* If max == min we can continue with the main loop without the
1711     need to recurse. */
1712    
1713     if (min == max) continue;
1714    
1715     /* If minimizing, keep testing the rest of the expression and advancing
1716     the pointer while it matches the class. */
1717    
1718     if (minimize)
1719     {
1720     #ifdef SUPPORT_UTF8
1721     /* UTF-8 mode */
1722     if (utf8)
1723     {
1724     for (fi = min;; fi++)
1725     {
1726     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1727     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1728     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1729     GETCHARINC(c, eptr);
1730     if (c > 255)
1731     {
1732     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1733     }
1734     else
1735     {
1736     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1737     }
1738     }
1739     }
1740     else
1741     #endif
1742     /* Not UTF-8 mode */
1743     {
1744     for (fi = min;; fi++)
1745     {
1746     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1747     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1748     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1749     c = *eptr++;
1750     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1751     }
1752     }
1753     /* Control never gets here */
1754     }
1755    
1756     /* If maximizing, find the longest possible run, then work backwards. */
1757    
1758     else
1759     {
1760     pp = eptr;
1761    
1762     #ifdef SUPPORT_UTF8
1763     /* UTF-8 mode */
1764     if (utf8)
1765     {
1766     for (i = min; i < max; i++)
1767     {
1768     int len = 1;
1769     if (eptr >= md->end_subject) break;
1770     GETCHARLEN(c, eptr, len);
1771     if (c > 255)
1772     {
1773     if (op == OP_CLASS) break;
1774     }
1775     else
1776     {
1777     if ((data[c/8] & (1 << (c&7))) == 0) break;
1778     }
1779     eptr += len;
1780     }
1781     for (;;)
1782     {
1783     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1784     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1785     if (eptr-- == pp) break; /* Stop if tried at original pos */
1786     BACKCHAR(eptr);
1787     }
1788     }
1789     else
1790     #endif
1791     /* Not UTF-8 mode */
1792     {
1793     for (i = min; i < max; i++)
1794     {
1795     if (eptr >= md->end_subject) break;
1796     c = *eptr;
1797     if ((data[c/8] & (1 << (c&7))) == 0) break;
1798     eptr++;
1799     }
1800     while (eptr >= pp)
1801     {
1802     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1803 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1804 nigel 77 eptr--;
1805     }
1806     }
1807    
1808     RRETURN(MATCH_NOMATCH);
1809     }
1810     }
1811     /* Control never gets here */
1812    
1813    
1814     /* Match an extended character class. This opcode is encountered only
1815     in UTF-8 mode, because that's the only time it is compiled. */
1816    
1817     #ifdef SUPPORT_UTF8
1818     case OP_XCLASS:
1819     {
1820     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1821     ecode += GET(ecode, 1); /* Advance past the item */
1822    
1823     switch (*ecode)
1824     {
1825     case OP_CRSTAR:
1826     case OP_CRMINSTAR:
1827     case OP_CRPLUS:
1828     case OP_CRMINPLUS:
1829     case OP_CRQUERY:
1830     case OP_CRMINQUERY:
1831     c = *ecode++ - OP_CRSTAR;
1832     minimize = (c & 1) != 0;
1833     min = rep_min[c]; /* Pick up values from tables; */
1834     max = rep_max[c]; /* zero for max => infinity */
1835     if (max == 0) max = INT_MAX;
1836     break;
1837    
1838     case OP_CRRANGE:
1839     case OP_CRMINRANGE:
1840     minimize = (*ecode == OP_CRMINRANGE);
1841     min = GET2(ecode, 1);
1842     max = GET2(ecode, 3);
1843     if (max == 0) max = INT_MAX;
1844     ecode += 5;
1845     break;
1846    
1847     default: /* No repeat follows */
1848     min = max = 1;
1849     break;
1850     }
1851    
1852     /* First, ensure the minimum number of matches are present. */
1853    
1854     for (i = 1; i <= min; i++)
1855     {
1856     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1857     GETCHARINC(c, eptr);
1858     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1859     }
1860    
1861     /* If max == min we can continue with the main loop without the
1862     need to recurse. */
1863    
1864     if (min == max) continue;
1865    
1866     /* If minimizing, keep testing the rest of the expression and advancing
1867     the pointer while it matches the class. */
1868    
1869     if (minimize)
1870     {
1871     for (fi = min;; fi++)
1872     {
1873     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1874     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1875     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1876     GETCHARINC(c, eptr);
1877     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1878     }
1879     /* Control never gets here */
1880     }
1881    
1882     /* If maximizing, find the longest possible run, then work backwards. */
1883    
1884     else
1885     {
1886     pp = eptr;
1887     for (i = min; i < max; i++)
1888     {
1889     int len = 1;
1890     if (eptr >= md->end_subject) break;
1891     GETCHARLEN(c, eptr, len);
1892     if (!_pcre_xclass(c, data)) break;
1893     eptr += len;
1894     }
1895     for(;;)
1896     {
1897     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
1898     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1899     if (eptr-- == pp) break; /* Stop if tried at original pos */
1900     BACKCHAR(eptr)
1901     }
1902     RRETURN(MATCH_NOMATCH);
1903     }
1904    
1905     /* Control never gets here */
1906     }
1907     #endif /* End of XCLASS */
1908    
1909     /* Match a single character, casefully */
1910    
1911     case OP_CHAR:
1912     #ifdef SUPPORT_UTF8
1913     if (utf8)
1914     {
1915     length = 1;
1916     ecode++;
1917     GETCHARLEN(fc, ecode, length);
1918     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1919     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1920     }
1921     else
1922     #endif
1923    
1924     /* Non-UTF-8 mode */
1925     {
1926     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1927     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1928     ecode += 2;
1929     }
1930     break;
1931    
1932     /* Match a single character, caselessly */
1933    
1934     case OP_CHARNC:
1935     #ifdef SUPPORT_UTF8
1936     if (utf8)
1937     {
1938     length = 1;
1939     ecode++;
1940     GETCHARLEN(fc, ecode, length);
1941    
1942     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1943    
1944     /* If the pattern character's value is < 128, we have only one byte, and
1945     can use the fast lookup table. */
1946    
1947     if (fc < 128)
1948     {
1949     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1950     }
1951    
1952     /* Otherwise we must pick up the subject character */
1953    
1954     else
1955     {
1956 nigel 93 unsigned int dc;
1957 nigel 77 GETCHARINC(dc, eptr);
1958     ecode += length;
1959    
1960     /* If we have Unicode property support, we can use it to test the other
1961 nigel 87 case of the character, if there is one. */
1962 nigel 77
1963     if (fc != dc)
1964     {
1965     #ifdef SUPPORT_UCP
1966 nigel 87 if (dc != _pcre_ucp_othercase(fc))
1967 nigel 77 #endif
1968     RRETURN(MATCH_NOMATCH);
1969     }
1970     }
1971     }
1972     else
1973     #endif /* SUPPORT_UTF8 */
1974    
1975     /* Non-UTF-8 mode */
1976     {
1977     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1978     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1979     ecode += 2;
1980     }
1981     break;
1982    
1983 nigel 93 /* Match a single character repeatedly. */
1984 nigel 77
1985     case OP_EXACT:
1986     min = max = GET2(ecode, 1);
1987     ecode += 3;
1988     goto REPEATCHAR;
1989    
1990 nigel 93 case OP_POSUPTO:
1991     possessive = TRUE;
1992     /* Fall through */
1993    
1994 nigel 77 case OP_UPTO:
1995     case OP_MINUPTO:
1996     min = 0;
1997     max = GET2(ecode, 1);
1998     minimize = *ecode == OP_MINUPTO;
1999     ecode += 3;
2000     goto REPEATCHAR;
2001    
2002 nigel 93 case OP_POSSTAR:
2003     possessive = TRUE;
2004     min = 0;
2005     max = INT_MAX;
2006     ecode++;
2007     goto REPEATCHAR;
2008    
2009     case OP_POSPLUS:
2010     possessive = TRUE;
2011     min = 1;
2012     max = INT_MAX;
2013     ecode++;
2014     goto REPEATCHAR;
2015    
2016     case OP_POSQUERY:
2017     possessive = TRUE;
2018     min = 0;
2019     max = 1;
2020     ecode++;
2021     goto REPEATCHAR;
2022    
2023 nigel 77 case OP_STAR:
2024     case OP_MINSTAR:
2025     case OP_PLUS:
2026     case OP_MINPLUS:
2027     case OP_QUERY:
2028     case OP_MINQUERY:
2029     c = *ecode++ - OP_STAR;
2030     minimize = (c & 1) != 0;
2031     min = rep_min[c]; /* Pick up values from tables; */
2032     max = rep_max[c]; /* zero for max => infinity */
2033     if (max == 0) max = INT_MAX;
2034    
2035     /* Common code for all repeated single-character matches. We can give
2036     up quickly if there are fewer than the minimum number of characters left in
2037     the subject. */
2038    
2039     REPEATCHAR:
2040     #ifdef SUPPORT_UTF8
2041     if (utf8)
2042     {
2043     length = 1;
2044     charptr = ecode;
2045     GETCHARLEN(fc, ecode, length);
2046     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2047     ecode += length;
2048    
2049     /* Handle multibyte character matching specially here. There is
2050     support for caseless matching if UCP support is present. */
2051    
2052     if (length > 1)
2053     {
2054     #ifdef SUPPORT_UCP
2055 nigel 93 unsigned int othercase;
2056 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2057 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2058 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2059 ph10 115 else oclength = 0;
2060 nigel 77 #endif /* SUPPORT_UCP */
2061    
2062     for (i = 1; i <= min; i++)
2063     {
2064     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2065 ph10 115 #ifdef SUPPORT_UCP
2066 nigel 77 /* Need braces because of following else */
2067     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2068     else
2069     {
2070     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2071     eptr += oclength;
2072     }
2073 ph10 115 #else /* without SUPPORT_UCP */
2074     else { RRETURN(MATCH_NOMATCH); }
2075     #endif /* SUPPORT_UCP */
2076 nigel 77 }
2077    
2078     if (min == max) continue;
2079    
2080     if (minimize)
2081     {
2082     for (fi = min;; fi++)
2083     {
2084     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2085     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2086     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2087     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2088 ph10 115 #ifdef SUPPORT_UCP
2089 nigel 77 /* Need braces because of following else */
2090     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2091     else
2092     {
2093     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2094     eptr += oclength;
2095     }
2096 ph10 115 #else /* without SUPPORT_UCP */
2097     else { RRETURN (MATCH_NOMATCH); }
2098     #endif /* SUPPORT_UCP */
2099 nigel 77 }
2100     /* Control never gets here */
2101     }
2102 nigel 93
2103     else /* Maximize */
2104 nigel 77 {
2105     pp = eptr;
2106     for (i = min; i < max; i++)
2107     {
2108     if (eptr > md->end_subject - length) break;
2109     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2110 ph10 115 #ifdef SUPPORT_UCP
2111 nigel 77 else if (oclength == 0) break;
2112     else
2113     {
2114     if (memcmp(eptr, occhars, oclength) != 0) break;
2115     eptr += oclength;
2116     }
2117 ph10 115 #else /* without SUPPORT_UCP */
2118     else break;
2119     #endif /* SUPPORT_UCP */
2120 nigel 77 }
2121 nigel 93
2122     if (possessive) continue;
2123 nigel 77 while (eptr >= pp)
2124     {
2125     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2126     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2127 ph10 115 #ifdef SUPPORT_UCP
2128     eptr--;
2129     BACKCHAR(eptr);
2130     #else /* without SUPPORT_UCP */
2131 nigel 77 eptr -= length;
2132 ph10 115 #endif /* SUPPORT_UCP */
2133 nigel 77 }
2134     RRETURN(MATCH_NOMATCH);
2135     }
2136     /* Control never gets here */
2137     }
2138    
2139     /* If the length of a UTF-8 character is 1, we fall through here, and
2140     obey the code as for non-UTF-8 characters below, though in this case the
2141     value of fc will always be < 128. */
2142     }
2143     else
2144     #endif /* SUPPORT_UTF8 */
2145    
2146     /* When not in UTF-8 mode, load a single-byte character. */
2147     {
2148     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2149     fc = *ecode++;
2150     }
2151    
2152     /* The value of fc at this point is always less than 256, though we may or
2153     may not be in UTF-8 mode. The code is duplicated for the caseless and
2154     caseful cases, for speed, since matching characters is likely to be quite
2155     common. First, ensure the minimum number of matches are present. If min =
2156     max, continue at the same level without recursing. Otherwise, if
2157     minimizing, keep trying the rest of the expression and advancing one
2158     matching character if failing, up to the maximum. Alternatively, if
2159     maximizing, find the maximum number of characters and work backwards. */
2160    
2161     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2162     max, eptr));
2163    
2164     if ((ims & PCRE_CASELESS) != 0)
2165     {
2166     fc = md->lcc[fc];
2167     for (i = 1; i <= min; i++)
2168     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2169     if (min == max) continue;
2170     if (minimize)
2171     {
2172     for (fi = min;; fi++)
2173     {
2174     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2175     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2176     if (fi >= max || eptr >= md->end_subject ||
2177     fc != md->lcc[*eptr++])
2178     RRETURN(MATCH_NOMATCH);
2179     }
2180     /* Control never gets here */
2181     }
2182 nigel 93 else /* Maximize */
2183 nigel 77 {
2184     pp = eptr;
2185     for (i = min; i < max; i++)
2186     {
2187     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2188     eptr++;
2189     }
2190 nigel 93 if (possessive) continue;
2191 nigel 77 while (eptr >= pp)
2192     {
2193     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2194     eptr--;
2195     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2196     }
2197     RRETURN(MATCH_NOMATCH);
2198     }
2199     /* Control never gets here */
2200     }
2201    
2202     /* Caseful comparisons (includes all multi-byte characters) */
2203    
2204     else
2205     {
2206     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2207     if (min == max) continue;
2208     if (minimize)
2209     {
2210     for (fi = min;; fi++)
2211     {
2212     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2213     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2214     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2215     RRETURN(MATCH_NOMATCH);
2216     }
2217     /* Control never gets here */
2218     }
2219 nigel 93 else /* Maximize */
2220 nigel 77 {
2221     pp = eptr;
2222     for (i = min; i < max; i++)
2223     {
2224     if (eptr >= md->end_subject || fc != *eptr) break;
2225     eptr++;
2226     }
2227 nigel 93 if (possessive) continue;
2228 nigel 77 while (eptr >= pp)
2229     {
2230     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2231     eptr--;
2232     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2233     }
2234     RRETURN(MATCH_NOMATCH);
2235     }
2236     }
2237     /* Control never gets here */
2238    
2239     /* Match a negated single one-byte character. The character we are
2240     checking can be multibyte. */
2241    
2242     case OP_NOT:
2243     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2244     ecode++;
2245     GETCHARINCTEST(c, eptr);
2246     if ((ims & PCRE_CASELESS) != 0)
2247     {
2248     #ifdef SUPPORT_UTF8
2249     if (c < 256)
2250     #endif
2251     c = md->lcc[c];
2252     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2253     }
2254     else
2255     {
2256     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2257     }
2258     break;
2259    
2260     /* Match a negated single one-byte character repeatedly. This is almost a
2261     repeat of the code for a repeated single character, but I haven't found a
2262     nice way of commoning these up that doesn't require a test of the
2263     positive/negative option for each character match. Maybe that wouldn't add
2264     very much to the time taken, but character matching *is* what this is all
2265     about... */
2266    
2267     case OP_NOTEXACT:
2268     min = max = GET2(ecode, 1);
2269     ecode += 3;
2270     goto REPEATNOTCHAR;
2271    
2272     case OP_NOTUPTO:
2273     case OP_NOTMINUPTO:
2274     min = 0;
2275     max = GET2(ecode, 1);
2276     minimize = *ecode == OP_NOTMINUPTO;
2277     ecode += 3;
2278     goto REPEATNOTCHAR;
2279    
2280 nigel 93 case OP_NOTPOSSTAR:
2281     possessive = TRUE;
2282     min = 0;
2283     max = INT_MAX;
2284     ecode++;
2285     goto REPEATNOTCHAR;
2286    
2287     case OP_NOTPOSPLUS:
2288     possessive = TRUE;
2289     min = 1;
2290     max = INT_MAX;
2291     ecode++;
2292     goto REPEATNOTCHAR;
2293    
2294     case OP_NOTPOSQUERY:
2295     possessive = TRUE;
2296     min = 0;
2297     max = 1;
2298     ecode++;
2299     goto REPEATNOTCHAR;
2300    
2301     case OP_NOTPOSUPTO:
2302     possessive = TRUE;
2303     min = 0;
2304     max = GET2(ecode, 1);
2305     ecode += 3;
2306     goto REPEATNOTCHAR;
2307    
2308 nigel 77 case OP_NOTSTAR:
2309     case OP_NOTMINSTAR:
2310     case OP_NOTPLUS:
2311     case OP_NOTMINPLUS:
2312     case OP_NOTQUERY:
2313     case OP_NOTMINQUERY:
2314     c = *ecode++ - OP_NOTSTAR;
2315     minimize = (c & 1) != 0;
2316     min = rep_min[c]; /* Pick up values from tables; */
2317     max = rep_max[c]; /* zero for max => infinity */
2318     if (max == 0) max = INT_MAX;
2319    
2320     /* Common code for all repeated single-byte matches. We can give up quickly
2321     if there are fewer than the minimum number of bytes left in the
2322     subject. */
2323    
2324     REPEATNOTCHAR:
2325     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2326     fc = *ecode++;
2327    
2328     /* The code is duplicated for the caseless and caseful cases, for speed,
2329     since matching characters is likely to be quite common. First, ensure the
2330     minimum number of matches are present. If min = max, continue at the same
2331     level without recursing. Otherwise, if minimizing, keep trying the rest of
2332     the expression and advancing one matching character if failing, up to the
2333     maximum. Alternatively, if maximizing, find the maximum number of
2334     characters and work backwards. */
2335    
2336     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2337     max, eptr));
2338    
2339     if ((ims & PCRE_CASELESS) != 0)
2340     {
2341     fc = md->lcc[fc];
2342    
2343     #ifdef SUPPORT_UTF8
2344     /* UTF-8 mode */
2345     if (utf8)
2346     {
2347 nigel 93 register unsigned int d;
2348 nigel 77 for (i = 1; i <= min; i++)
2349     {
2350     GETCHARINC(d, eptr);
2351     if (d < 256) d = md->lcc[d];
2352     if (fc == d) RRETURN(MATCH_NOMATCH);
2353     }
2354     }
2355     else
2356     #endif
2357    
2358     /* Not UTF-8 mode */
2359     {
2360     for (i = 1; i <= min; i++)
2361     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2362     }
2363    
2364     if (min == max) continue;
2365    
2366     if (minimize)
2367     {
2368     #ifdef SUPPORT_UTF8
2369     /* UTF-8 mode */
2370     if (utf8)
2371     {
2372 nigel 93 register unsigned int d;
2373 nigel 77 for (fi = min;; fi++)
2374     {
2375     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2376     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2377     GETCHARINC(d, eptr);
2378     if (d < 256) d = md->lcc[d];
2379     if (fi >= max || eptr >= md->end_subject || fc == d)
2380     RRETURN(MATCH_NOMATCH);
2381     }
2382     }
2383     else
2384     #endif
2385     /* Not UTF-8 mode */
2386     {
2387     for (fi = min;; fi++)
2388     {
2389     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2390     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2391     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2392     RRETURN(MATCH_NOMATCH);
2393     }
2394     }
2395     /* Control never gets here */
2396     }
2397    
2398     /* Maximize case */
2399    
2400     else
2401     {
2402     pp = eptr;
2403    
2404     #ifdef SUPPORT_UTF8
2405     /* UTF-8 mode */
2406     if (utf8)
2407     {
2408 nigel 93 register unsigned int d;
2409 nigel 77 for (i = min; i < max; i++)
2410     {
2411     int len = 1;
2412     if (eptr >= md->end_subject) break;
2413     GETCHARLEN(d, eptr, len);
2414     if (d < 256) d = md->lcc[d];
2415     if (fc == d) break;
2416     eptr += len;
2417     }
2418 nigel 93 if (possessive) continue;
2419     for(;;)
2420 nigel 77 {
2421     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2422     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2423     if (eptr-- == pp) break; /* Stop if tried at original pos */
2424     BACKCHAR(eptr);
2425     }
2426     }
2427     else
2428     #endif
2429     /* Not UTF-8 mode */
2430     {
2431     for (i = min; i < max; i++)
2432     {
2433     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2434     eptr++;
2435     }
2436 nigel 93 if (possessive) continue;
2437 nigel 77 while (eptr >= pp)
2438     {
2439     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2440     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2441     eptr--;
2442     }
2443     }
2444    
2445     RRETURN(MATCH_NOMATCH);
2446     }
2447     /* Control never gets here */
2448     }
2449    
2450     /* Caseful comparisons */
2451    
2452     else
2453     {
2454     #ifdef SUPPORT_UTF8
2455     /* UTF-8 mode */
2456     if (utf8)
2457     {
2458 nigel 93 register unsigned int d;
2459 nigel 77 for (i = 1; i <= min; i++)
2460     {
2461     GETCHARINC(d, eptr);
2462     if (fc == d) RRETURN(MATCH_NOMATCH);
2463     }
2464     }
2465     else
2466     #endif
2467     /* Not UTF-8 mode */
2468     {
2469     for (i = 1; i <= min; i++)
2470     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2471     }
2472    
2473     if (min == max) continue;
2474    
2475     if (minimize)
2476     {
2477     #ifdef SUPPORT_UTF8
2478     /* UTF-8 mode */
2479     if (utf8)
2480     {
2481 nigel 93 register unsigned int d;
2482 nigel 77 for (fi = min;; fi++)
2483     {
2484     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2485     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2486     GETCHARINC(d, eptr);
2487     if (fi >= max || eptr >= md->end_subject || fc == d)
2488     RRETURN(MATCH_NOMATCH);
2489     }
2490     }
2491     else
2492     #endif
2493     /* Not UTF-8 mode */
2494     {
2495     for (fi = min;; fi++)
2496     {
2497     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2498     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2499     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2500     RRETURN(MATCH_NOMATCH);
2501     }
2502     }
2503     /* Control never gets here */
2504     }
2505    
2506     /* Maximize case */
2507    
2508     else
2509     {
2510     pp = eptr;
2511    
2512     #ifdef SUPPORT_UTF8
2513     /* UTF-8 mode */
2514     if (utf8)
2515     {
2516 nigel 93 register unsigned int d;
2517 nigel 77 for (i = min; i < max; i++)
2518     {
2519     int len = 1;
2520     if (eptr >= md->end_subject) break;
2521     GETCHARLEN(d, eptr, len);
2522     if (fc == d) break;
2523     eptr += len;
2524     }
2525 nigel 93 if (possessive) continue;
2526 nigel 77 for(;;)
2527     {
2528     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2529     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2530     if (eptr-- == pp) break; /* Stop if tried at original pos */
2531     BACKCHAR(eptr);
2532     }
2533     }
2534     else
2535     #endif
2536     /* Not UTF-8 mode */
2537     {
2538     for (i = min; i < max; i++)
2539     {
2540     if (eptr >= md->end_subject || fc == *eptr) break;
2541     eptr++;
2542     }
2543 nigel 93 if (possessive) continue;
2544 nigel 77 while (eptr >= pp)
2545     {
2546     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2547     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2548     eptr--;
2549     }
2550     }
2551    
2552     RRETURN(MATCH_NOMATCH);
2553     }
2554     }
2555     /* Control never gets here */
2556    
2557     /* Match a single character type repeatedly; several different opcodes
2558     share code. This is very similar to the code for single characters, but we
2559     repeat it in the interests of efficiency. */
2560    
2561     case OP_TYPEEXACT:
2562     min = max = GET2(ecode, 1);
2563     minimize = TRUE;
2564     ecode += 3;
2565     goto REPEATTYPE;
2566    
2567     case OP_TYPEUPTO:
2568     case OP_TYPEMINUPTO:
2569     min = 0;
2570     max = GET2(ecode, 1);
2571     minimize = *ecode == OP_TYPEMINUPTO;
2572     ecode += 3;
2573     goto REPEATTYPE;
2574    
2575 nigel 93 case OP_TYPEPOSSTAR:
2576     possessive = TRUE;
2577     min = 0;
2578     max = INT_MAX;
2579     ecode++;
2580     goto REPEATTYPE;
2581    
2582     case OP_TYPEPOSPLUS:
2583     possessive = TRUE;
2584     min = 1;
2585     max = INT_MAX;
2586     ecode++;
2587     goto REPEATTYPE;
2588    
2589     case OP_TYPEPOSQUERY:
2590     possessive = TRUE;
2591     min = 0;
2592     max = 1;
2593     ecode++;
2594     goto REPEATTYPE;
2595    
2596     case OP_TYPEPOSUPTO:
2597     possessive = TRUE;
2598     min = 0;
2599     max = GET2(ecode, 1);
2600     ecode += 3;
2601     goto REPEATTYPE;
2602    
2603 nigel 77 case OP_TYPESTAR:
2604     case OP_TYPEMINSTAR:
2605     case OP_TYPEPLUS:
2606     case OP_TYPEMINPLUS:
2607     case OP_TYPEQUERY:
2608     case OP_TYPEMINQUERY:
2609     c = *ecode++ - OP_TYPESTAR;
2610     minimize = (c & 1) != 0;
2611     min = rep_min[c]; /* Pick up values from tables; */
2612     max = rep_max[c]; /* zero for max => infinity */
2613     if (max == 0) max = INT_MAX;
2614    
2615     /* Common code for all repeated single character type matches. Note that
2616     in UTF-8 mode, '.' matches a character of any length, but for the other
2617     character types, the valid characters are all one-byte long. */
2618    
2619     REPEATTYPE:
2620     ctype = *ecode++; /* Code for the character type */
2621    
2622     #ifdef SUPPORT_UCP
2623     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2624     {
2625     prop_fail_result = ctype == OP_NOTPROP;
2626     prop_type = *ecode++;
2627 nigel 87 prop_value = *ecode++;
2628 nigel 77 }
2629     else prop_type = -1;
2630     #endif
2631    
2632     /* First, ensure the minimum number of matches are present. Use inline
2633     code for maximizing the speed, and do the type test once at the start
2634     (i.e. keep it out of the loop). Also we can test that there are at least
2635     the minimum number of bytes before we start. This isn't as effective in
2636     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2637     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2638     and single-bytes. */
2639    
2640     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2641     if (min > 0)
2642     {
2643     #ifdef SUPPORT_UCP
2644 nigel 87 if (prop_type >= 0)
2645 nigel 77 {
2646 nigel 87 switch(prop_type)
2647 nigel 77 {
2648 nigel 87 case PT_ANY:
2649     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2650     for (i = 1; i <= min; i++)
2651     {
2652     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2653     GETCHARINC(c, eptr);
2654     }
2655     break;
2656    
2657     case PT_LAMP:
2658     for (i = 1; i <= min; i++)
2659     {
2660     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2661     GETCHARINC(c, eptr);
2662     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2663     if ((prop_chartype == ucp_Lu ||
2664     prop_chartype == ucp_Ll ||
2665     prop_chartype == ucp_Lt) == prop_fail_result)
2666     RRETURN(MATCH_NOMATCH);
2667     }
2668     break;
2669    
2670     case PT_GC:
2671     for (i = 1; i <= min; i++)
2672     {
2673     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2674     GETCHARINC(c, eptr);
2675     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2676     if ((prop_category == prop_value) == prop_fail_result)
2677     RRETURN(MATCH_NOMATCH);
2678     }
2679     break;
2680    
2681     case PT_PC:
2682     for (i = 1; i <= min; i++)
2683     {
2684     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2685     GETCHARINC(c, eptr);
2686     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2687     if ((prop_chartype == prop_value) == prop_fail_result)
2688     RRETURN(MATCH_NOMATCH);
2689     }
2690     break;
2691    
2692     case PT_SC:
2693     for (i = 1; i <= min; i++)
2694     {
2695     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2696     GETCHARINC(c, eptr);
2697     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2698     if ((prop_script == prop_value) == prop_fail_result)
2699     RRETURN(MATCH_NOMATCH);
2700     }
2701     break;
2702    
2703     default:
2704     RRETURN(PCRE_ERROR_INTERNAL);
2705 nigel 77 }
2706     }
2707    
2708     /* Match extended Unicode sequences. We will get here only if the
2709     support is in the binary; otherwise a compile-time error occurs. */
2710    
2711     else if (ctype == OP_EXTUNI)
2712     {
2713     for (i = 1; i <= min; i++)
2714     {
2715     GETCHARINCTEST(c, eptr);
2716 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2717 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2718     while (eptr < md->end_subject)
2719     {
2720     int len = 1;
2721     if (!utf8) c = *eptr; else
2722     {
2723     GETCHARLEN(c, eptr, len);
2724     }
2725 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2726 nigel 77 if (prop_category != ucp_M) break;
2727     eptr += len;
2728     }
2729     }
2730     }
2731    
2732     else
2733     #endif /* SUPPORT_UCP */
2734    
2735     /* Handle all other cases when the coding is UTF-8 */
2736    
2737     #ifdef SUPPORT_UTF8
2738     if (utf8) switch(ctype)
2739     {
2740     case OP_ANY:
2741     for (i = 1; i <= min; i++)
2742     {
2743     if (eptr >= md->end_subject ||
2744 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2745 nigel 77 RRETURN(MATCH_NOMATCH);
2746 nigel 91 eptr++;
2747 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2748     }
2749     break;
2750    
2751     case OP_ANYBYTE:
2752     eptr += min;
2753     break;
2754    
2755 nigel 93 case OP_ANYNL:
2756     for (i = 1; i <= min; i++)
2757     {
2758     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2759     GETCHARINC(c, eptr);
2760     switch(c)
2761     {
2762     default: RRETURN(MATCH_NOMATCH);
2763     case 0x000d:
2764     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2765     break;
2766     case 0x000a:
2767     case 0x000b:
2768     case 0x000c:
2769     case 0x0085:
2770     case 0x2028:
2771     case 0x2029:
2772     break;
2773     }
2774     }
2775     break;
2776    
2777 nigel 77 case OP_NOT_DIGIT:
2778     for (i = 1; i <= min; i++)
2779     {
2780     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2781     GETCHARINC(c, eptr);
2782     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2783     RRETURN(MATCH_NOMATCH);
2784     }
2785     break;
2786    
2787     case OP_DIGIT:
2788     for (i = 1; i <= min; i++)
2789     {
2790     if (eptr >= md->end_subject ||
2791     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2792     RRETURN(MATCH_NOMATCH);
2793     /* No need to skip more bytes - we know it's a 1-byte character */
2794     }
2795     break;
2796    
2797     case OP_NOT_WHITESPACE:
2798     for (i = 1; i <= min; i++)
2799     {
2800     if (eptr >= md->end_subject ||
2801     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2802     RRETURN(MATCH_NOMATCH);
2803     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2804     }
2805     break;
2806    
2807     case OP_WHITESPACE:
2808     for (i = 1; i <= min; i++)
2809     {
2810     if (eptr >= md->end_subject ||
2811     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2812     RRETURN(MATCH_NOMATCH);
2813     /* No need to skip more bytes - we know it's a 1-byte character */
2814     }
2815     break;
2816    
2817     case OP_NOT_WORDCHAR:
2818     for (i = 1; i <= min; i++)
2819     {
2820     if (eptr >= md->end_subject ||
2821     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2822     RRETURN(MATCH_NOMATCH);
2823     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2824     }
2825     break;
2826    
2827     case OP_WORDCHAR:
2828     for (i = 1; i <= min; i++)
2829     {
2830     if (eptr >= md->end_subject ||
2831     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2832     RRETURN(MATCH_NOMATCH);
2833     /* No need to skip more bytes - we know it's a 1-byte character */
2834     }
2835     break;
2836    
2837     default:
2838     RRETURN(PCRE_ERROR_INTERNAL);
2839     } /* End switch(ctype) */
2840    
2841     else
2842     #endif /* SUPPORT_UTF8 */
2843    
2844     /* Code for the non-UTF-8 case for minimum matching of operators other
2845 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2846     number of bytes present, as this was tested above. */
2847 nigel 77
2848     switch(ctype)
2849     {
2850     case OP_ANY:
2851     if ((ims & PCRE_DOTALL) == 0)
2852     {
2853     for (i = 1; i <= min; i++)
2854 nigel 91 {
2855 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2856 nigel 91 eptr++;
2857     }
2858 nigel 77 }
2859     else eptr += min;
2860     break;
2861    
2862     case OP_ANYBYTE:
2863     eptr += min;
2864     break;
2865    
2866 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
2867     bytes are present in this case. */
2868    
2869     case OP_ANYNL:
2870     for (i = 1; i <= min; i++)
2871     {
2872     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2873     switch(*eptr++)
2874     {
2875     default: RRETURN(MATCH_NOMATCH);
2876     case 0x000d:
2877     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2878     break;
2879     case 0x000a:
2880     case 0x000b:
2881     case 0x000c:
2882     case 0x0085:
2883     break;
2884     }
2885     }
2886     break;
2887    
2888 nigel 77 case OP_NOT_DIGIT:
2889     for (i = 1; i <= min; i++)
2890     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2891     break;
2892    
2893     case OP_DIGIT:
2894     for (i = 1; i <= min; i++)
2895     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2896     break;
2897    
2898     case OP_NOT_WHITESPACE:
2899     for (i = 1; i <= min; i++)
2900     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2901     break;
2902    
2903     case OP_WHITESPACE:
2904     for (i = 1; i <= min; i++)
2905     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2906     break;
2907    
2908     case OP_NOT_WORDCHAR:
2909     for (i = 1; i <= min; i++)
2910     if ((md->ctypes[*eptr++] & ctype_word) != 0)
2911     RRETURN(MATCH_NOMATCH);
2912     break;
2913    
2914     case OP_WORDCHAR:
2915     for (i = 1; i <= min; i++)
2916     if ((md->ctypes[*eptr++] & ctype_word) == 0)
2917     RRETURN(MATCH_NOMATCH);
2918     break;
2919    
2920     default:
2921     RRETURN(PCRE_ERROR_INTERNAL);
2922     }
2923     }
2924    
2925     /* If min = max, continue at the same level without recursing */
2926    
2927     if (min == max) continue;
2928    
2929     /* If minimizing, we have to test the rest of the pattern before each
2930     subsequent match. Again, separate the UTF-8 case for speed, and also
2931     separate the UCP cases. */
2932    
2933     if (minimize)
2934     {
2935     #ifdef SUPPORT_UCP
2936 nigel 87 if (prop_type >= 0)
2937 nigel 77 {
2938 nigel 87 switch(prop_type)
2939 nigel 77 {
2940 nigel 87 case PT_ANY:
2941     for (fi = min;; fi++)
2942     {
2943     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2944     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2945     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2946     GETCHARINC(c, eptr);
2947     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2948     }
2949 nigel 93 /* Control never gets here */
2950 nigel 87
2951     case PT_LAMP:
2952     for (fi = min;; fi++)
2953     {
2954     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2955     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2956     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2957     GETCHARINC(c, eptr);
2958     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2959     if ((prop_chartype == ucp_Lu ||
2960     prop_chartype == ucp_Ll ||
2961     prop_chartype == ucp_Lt) == prop_fail_result)
2962     RRETURN(MATCH_NOMATCH);
2963     }
2964 nigel 93 /* Control never gets here */
2965 nigel 87
2966     case PT_GC:
2967     for (fi = min;; fi++)
2968     {
2969     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2970     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2971     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2972     GETCHARINC(c, eptr);
2973     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2974     if ((prop_category == prop_value) == prop_fail_result)
2975     RRETURN(MATCH_NOMATCH);
2976     }
2977 nigel 93 /* Control never gets here */
2978 nigel 87
2979     case PT_PC:
2980     for (fi = min;; fi++)
2981     {
2982     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2983     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2984     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2985     GETCHARINC(c, eptr);
2986     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2987     if ((prop_chartype == prop_value) == prop_fail_result)
2988     RRETURN(MATCH_NOMATCH);
2989     }
2990 nigel 93 /* Control never gets here */
2991 nigel 87
2992     case PT_SC:
2993     for (fi = min;; fi++)
2994     {
2995     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
2996     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2997     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2998     GETCHARINC(c, eptr);
2999     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3000     if ((prop_script == prop_value) == prop_fail_result)
3001     RRETURN(MATCH_NOMATCH);
3002     }
3003 nigel 93 /* Control never gets here */
3004 nigel 87
3005     default:
3006     RRETURN(PCRE_ERROR_INTERNAL);
3007 nigel 77 }
3008     }
3009    
3010     /* Match extended Unicode sequences. We will get here only if the
3011     support is in the binary; otherwise a compile-time error occurs. */
3012    
3013     else if (ctype == OP_EXTUNI)
3014     {
3015     for (fi = min;; fi++)
3016     {
3017     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3018     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3019     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3020     GETCHARINCTEST(c, eptr);
3021 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3022 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3023     while (eptr < md->end_subject)
3024     {
3025     int len = 1;
3026     if (!utf8) c = *eptr; else
3027     {
3028     GETCHARLEN(c, eptr, len);
3029     }
3030 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3031 nigel 77 if (prop_category != ucp_M) break;
3032     eptr += len;
3033     }
3034     }
3035     }
3036    
3037     else
3038     #endif /* SUPPORT_UCP */
3039    
3040     #ifdef SUPPORT_UTF8
3041     /* UTF-8 mode */
3042     if (utf8)
3043     {
3044     for (fi = min;; fi++)
3045     {
3046     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3047     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3048 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3049     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3050 nigel 93 IS_NEWLINE(eptr)))
3051 nigel 91 RRETURN(MATCH_NOMATCH);
3052 nigel 77
3053     GETCHARINC(c, eptr);
3054     switch(ctype)
3055     {
3056 nigel 91 case OP_ANY: /* This is the DOTALL case */
3057 nigel 77 break;
3058    
3059     case OP_ANYBYTE:
3060     break;
3061    
3062 nigel 93 case OP_ANYNL:
3063     switch(c)
3064     {
3065     default: RRETURN(MATCH_NOMATCH);
3066     case 0x000d:
3067     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3068     break;
3069     case 0x000a:
3070     case 0x000b:
3071     case 0x000c:
3072     case 0x0085:
3073     case 0x2028:
3074     case 0x2029:
3075     break;
3076     }
3077     break;
3078    
3079 nigel 77 case OP_NOT_DIGIT:
3080     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3081     RRETURN(MATCH_NOMATCH);
3082     break;
3083    
3084     case OP_DIGIT:
3085     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3086     RRETURN(MATCH_NOMATCH);
3087     break;
3088    
3089     case OP_NOT_WHITESPACE:
3090     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3091     RRETURN(MATCH_NOMATCH);
3092     break;
3093    
3094     case OP_WHITESPACE:
3095     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3096     RRETURN(MATCH_NOMATCH);
3097     break;
3098    
3099     case OP_NOT_WORDCHAR:
3100     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3101     RRETURN(MATCH_NOMATCH);
3102     break;
3103    
3104     case OP_WORDCHAR:
3105     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3106     RRETURN(MATCH_NOMATCH);
3107     break;
3108    
3109     default:
3110     RRETURN(PCRE_ERROR_INTERNAL);
3111     }
3112     }
3113     }
3114     else
3115     #endif
3116     /* Not UTF-8 mode */
3117     {
3118     for (fi = min;; fi++)
3119     {
3120     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3121     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3122 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3123 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3124 nigel 91 RRETURN(MATCH_NOMATCH);
3125    
3126 nigel 77 c = *eptr++;
3127     switch(ctype)
3128     {
3129 nigel 91 case OP_ANY: /* This is the DOTALL case */
3130 nigel 77 break;
3131    
3132     case OP_ANYBYTE:
3133     break;
3134    
3135 nigel 93 case OP_ANYNL:
3136     switch(c)
3137     {
3138     default: RRETURN(MATCH_NOMATCH);
3139     case 0x000d:
3140     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3141     break;
3142     case 0x000a:
3143     case 0x000b:
3144     case 0x000c:
3145     case 0x0085:
3146     break;
3147     }
3148     break;
3149    
3150 nigel 77 case OP_NOT_DIGIT:
3151     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3152     break;
3153    
3154     case OP_DIGIT:
3155     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3156     break;
3157    
3158     case OP_NOT_WHITESPACE:
3159     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3160     break;
3161    
3162     case OP_WHITESPACE:
3163     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3164     break;
3165    
3166     case OP_NOT_WORDCHAR:
3167     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3168     break;
3169    
3170     case OP_WORDCHAR:
3171     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3172     break;
3173    
3174     default:
3175     RRETURN(PCRE_ERROR_INTERNAL);
3176     }
3177     }
3178     }
3179     /* Control never gets here */
3180     }
3181    
3182 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3183 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3184     UTF-8 and UCP stuff separate. */
3185    
3186     else
3187     {
3188     pp = eptr; /* Remember where we started */
3189    
3190     #ifdef SUPPORT_UCP
3191 nigel 87 if (prop_type >= 0)
3192 nigel 77 {
3193 nigel 87 switch(prop_type)
3194 nigel 77 {
3195 nigel 87 case PT_ANY:
3196     for (i = min; i < max; i++)
3197     {
3198     int len = 1;
3199     if (eptr >= md->end_subject) break;
3200     GETCHARLEN(c, eptr, len);
3201     if (prop_fail_result) break;
3202     eptr+= len;
3203     }
3204     break;
3205    
3206     case PT_LAMP:
3207     for (i = min; i < max; i++)
3208     {
3209     int len = 1;
3210     if (eptr >= md->end_subject) break;
3211     GETCHARLEN(c, eptr, len);
3212     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3213     if ((prop_chartype == ucp_Lu ||
3214     prop_chartype == ucp_Ll ||
3215     prop_chartype == ucp_Lt) == prop_fail_result)
3216     break;
3217     eptr+= len;
3218     }
3219     break;
3220    
3221     case PT_GC:
3222     for (i = min; i < max; i++)
3223     {
3224     int len = 1;
3225     if (eptr >= md->end_subject) break;
3226     GETCHARLEN(c, eptr, len);
3227     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3228     if ((prop_category == prop_value) == prop_fail_result)
3229     break;
3230     eptr+= len;
3231     }
3232     break;
3233    
3234     case PT_PC:
3235     for (i = min; i < max; i++)
3236     {
3237     int len = 1;
3238     if (eptr >= md->end_subject) break;
3239     GETCHARLEN(c, eptr, len);
3240     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3241     if ((prop_chartype == prop_value) == prop_fail_result)
3242     break;
3243     eptr+= len;
3244     }
3245     break;
3246    
3247     case PT_SC:
3248     for (i = min; i < max; i++)
3249     {
3250     int len = 1;
3251     if (eptr >= md->end_subject) break;
3252     GETCHARLEN(c, eptr, len);
3253     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3254     if ((prop_script == prop_value) == prop_fail_result)
3255     break;
3256     eptr+= len;
3257     }
3258     break;
3259 nigel 77 }
3260    
3261     /* eptr is now past the end of the maximum run */
3262    
3263 nigel 93 if (possessive) continue;
3264 nigel 77 for(;;)
3265     {
3266     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3267     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3268     if (eptr-- == pp) break; /* Stop if tried at original pos */
3269     BACKCHAR(eptr);
3270     }
3271     }
3272    
3273     /* Match extended Unicode sequences. We will get here only if the
3274     support is in the binary; otherwise a compile-time error occurs. */
3275    
3276     else if (ctype == OP_EXTUNI)
3277     {
3278     for (i = min; i < max; i++)
3279     {
3280     if (eptr >= md->end_subject) break;
3281     GETCHARINCTEST(c, eptr);
3282 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3283 nigel 77 if (prop_category == ucp_M) break;
3284     while (eptr < md->end_subject)
3285     {
3286     int len = 1;
3287     if (!utf8) c = *eptr; else
3288     {
3289     GETCHARLEN(c, eptr, len);
3290     }
3291 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3292 nigel 77 if (prop_category != ucp_M) break;
3293     eptr += len;
3294     }
3295     }
3296    
3297     /* eptr is now past the end of the maximum run */
3298    
3299 nigel 93 if (possessive) continue;
3300 nigel 77 for(;;)
3301     {
3302     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3303     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3304     if (eptr-- == pp) break; /* Stop if tried at original pos */
3305     for (;;) /* Move back over one extended */
3306     {
3307     int len = 1;
3308     BACKCHAR(eptr);
3309     if (!utf8) c = *eptr; else
3310     {
3311     GETCHARLEN(c, eptr, len);
3312     }
3313 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3314 nigel 77 if (prop_category != ucp_M) break;
3315     eptr--;
3316     }
3317     }
3318     }
3319    
3320     else
3321     #endif /* SUPPORT_UCP */
3322    
3323     #ifdef SUPPORT_UTF8
3324     /* UTF-8 mode */
3325    
3326     if (utf8)
3327     {
3328     switch(ctype)
3329     {
3330     case OP_ANY:
3331    
3332 nigel 91 /* Special code is required for UTF8, but when the maximum is
3333     unlimited we don't need it, so we repeat the non-UTF8 code. This is
3334     probably worth it, because .* is quite a common idiom. */
3335 nigel 77
3336     if (max < INT_MAX)
3337     {
3338     if ((ims & PCRE_DOTALL) == 0)
3339     {
3340     for (i = min; i < max; i++)
3341     {
3342 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3343 nigel 77 eptr++;
3344     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3345     }
3346     }
3347     else
3348     {
3349     for (i = min; i < max; i++)
3350     {
3351 nigel 91 if (eptr >= md->end_subject) break;
3352 nigel 77 eptr++;
3353     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3354     }
3355     }
3356     }
3357    
3358     /* Handle unlimited UTF-8 repeat */
3359    
3360     else
3361     {
3362     if ((ims & PCRE_DOTALL) == 0)
3363     {
3364     for (i = min; i < max; i++)
3365     {
3366 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3367 nigel 77 eptr++;
3368     }
3369     break;
3370     }
3371     else
3372     {
3373     c = max - min;
3374 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3375     c = md->end_subject - eptr;
3376 nigel 77 eptr += c;
3377     }
3378     }
3379     break;
3380    
3381     /* The byte case is the same as non-UTF8 */
3382    
3383     case OP_ANYBYTE:
3384     c = max - min;
3385 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3386     c = md->end_subject - eptr;
3387 nigel 77 eptr += c;
3388     break;
3389    
3390 nigel 93 case OP_ANYNL:
3391     for (i = min; i < max; i++)
3392     {
3393     int len = 1;
3394     if (eptr >= md->end_subject) break;
3395     GETCHARLEN(c, eptr, len);
3396     if (c == 0x000d)
3397     {
3398     if (++eptr >= md->end_subject) break;
3399     if (*eptr == 0x000a) eptr++;
3400     }
3401     else
3402     {
3403     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3404     c != 0x0085 && c != 0x2028 && c != 0x2029)
3405     break;
3406     eptr += len;
3407     }
3408     }
3409     break;
3410    
3411 nigel 77 case OP_NOT_DIGIT:
3412     for (i = min; i < max; i++)
3413     {
3414     int len = 1;
3415     if (eptr >= md->end_subject) break;
3416     GETCHARLEN(c, eptr, len);
3417     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3418     eptr+= len;
3419     }
3420     break;
3421    
3422     case OP_DIGIT:
3423     for (i = min; i < max; i++)
3424     {
3425     int len = 1;
3426     if (eptr >= md->end_subject) break;
3427     GETCHARLEN(c, eptr, len);
3428     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3429     eptr+= len;
3430     }
3431     break;
3432    
3433     case OP_NOT_WHITESPACE:
3434     for (i = min; i < max; i++)
3435     {
3436     int len = 1;
3437     if (eptr >= md->end_subject) break;
3438     GETCHARLEN(c, eptr, len);
3439     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3440     eptr+= len;
3441     }
3442     break;
3443    
3444     case OP_WHITESPACE:
3445     for (i = min; i < max; i++)
3446     {
3447     int len = 1;
3448     if (eptr >= md->end_subject) break;
3449     GETCHARLEN(c, eptr, len);
3450     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3451     eptr+= len;
3452     }
3453     break;
3454    
3455     case OP_NOT_WORDCHAR:
3456     for (i = min; i < max; i++)
3457     {
3458     int len = 1;
3459     if (eptr >= md->end_subject) break;
3460     GETCHARLEN(c, eptr, len);
3461     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3462     eptr+= len;
3463     }
3464     break;
3465    
3466     case OP_WORDCHAR:
3467     for (i = min; i < max; i++)
3468     {
3469     int len = 1;
3470     if (eptr >= md->end_subject) break;
3471     GETCHARLEN(c, eptr, len);
3472     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3473     eptr+= len;
3474     }
3475     break;
3476    
3477     default:
3478     RRETURN(PCRE_ERROR_INTERNAL);
3479     }
3480    
3481     /* eptr is now past the end of the maximum run */
3482    
3483 nigel 93 if (possessive) continue;
3484 nigel 77 for(;;)
3485     {
3486     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3487     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3488     if (eptr-- == pp) break; /* Stop if tried at original pos */
3489     BACKCHAR(eptr);
3490     }
3491     }
3492     else
3493     #endif
3494    
3495     /* Not UTF-8 mode */
3496     {
3497     switch(ctype)
3498     {
3499     case OP_ANY:
3500     if ((ims & PCRE_DOTALL) == 0)
3501     {
3502     for (i = min; i < max; i++)
3503     {
3504 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3505 nigel 77 eptr++;
3506     }
3507     break;
3508     }
3509     /* For DOTALL case, fall through and treat as \C */
3510    
3511     case OP_ANYBYTE:
3512     c = max - min;
3513 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3514     c = md->end_subject - eptr;
3515 nigel 77 eptr += c;
3516     break;
3517    
3518 nigel 93 case OP_ANYNL:
3519     for (i = min; i < max; i++)
3520     {
3521     if (eptr >= md->end_subject) break;
3522     c = *eptr;
3523     if (c == 0x000d)
3524     {
3525     if (++eptr >= md->end_subject) break;
3526     if (*eptr == 0x000a) eptr++;
3527     }
3528     else
3529     {
3530     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3531     break;
3532     eptr++;
3533     }
3534     }
3535     break;
3536    
3537 nigel 77 case OP_NOT_DIGIT:
3538     for (i = min; i < max; i++)
3539     {
3540     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3541     break;
3542     eptr++;
3543     }
3544     break;
3545    
3546     case OP_DIGIT:
3547     for (i = min; i < max; i++)
3548     {
3549     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3550     break;
3551     eptr++;
3552     }
3553     break;
3554    
3555     case OP_NOT_WHITESPACE:
3556     for (i = min; i < max; i++)
3557     {
3558     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3559     break;
3560     eptr++;
3561     }
3562     break;
3563    
3564     case OP_WHITESPACE:
3565     for (i = min; i < max; i++)
3566     {
3567     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3568     break;
3569     eptr++;
3570     }
3571     break;
3572    
3573     case OP_NOT_WORDCHAR:
3574     for (i = min; i < max; i++)
3575     {
3576     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3577     break;
3578     eptr++;
3579     }
3580     break;
3581    
3582     case OP_WORDCHAR:
3583     for (i = min; i < max; i++)
3584     {
3585     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3586     break;
3587     eptr++;
3588     }
3589     break;
3590    
3591     default:
3592     RRETURN(PCRE_ERROR_INTERNAL);
3593     }
3594    
3595     /* eptr is now past the end of the maximum run */
3596    
3597 nigel 93 if (possessive) continue;
3598 nigel 77 while (eptr >= pp)
3599     {
3600     RMATCH(rrc, eptr, ecode, offset_top, md, ims, eptrb, 0);
3601     eptr--;
3602     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3603     }
3604     }
3605    
3606     /* Get here if we can't make it match with any permitted repetitions */
3607    
3608     RRETURN(MATCH_NOMATCH);
3609     }
3610     /* Control never gets here */
3611    
3612 nigel 93 /* There's been some horrible disaster. Arrival here can only mean there is
3613     something seriously wrong in the code above or the OP_xxx definitions. */
3614 nigel 77
3615     default:
3616     DPRINTF(("Unknown opcode %d\n", *ecode));
3617 nigel 93 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3618 nigel 77 }
3619    
3620     /* Do not stick any code in here without much thought; it is assumed
3621     that "continue" in the code above comes out to here to repeat the main
3622     loop. */
3623    
3624     } /* End of main loop */
3625     /* Control never reaches here */
3626     }
3627    
3628    
3629     /***************************************************************************
3630     ****************************************************************************
3631     RECURSION IN THE match() FUNCTION
3632    
3633     Undefine all the macros that were defined above to handle this. */
3634    
3635     #ifdef NO_RECURSE  /* The names in this section are undefined only in NO_RECURSE builds */
3636     #undef eptr
3637     #undef ecode
3638     #undef offset_top
3639     #undef ims
3640     #undef eptrb
3641     #undef flags
3642    
3643     #undef callpat
3644     #undef charptr
3645     #undef data
3646     #undef next
3647     #undef pp
3648     #undef prev
3649     #undef saved_eptr
3650    
3651     #undef new_recursive
3652    
3653     #undef cur_is_word
3654     #undef condition
3655     #undef prev_is_word
3656    
3657     #undef original_ims
3658    
3659     #undef ctype
3660     #undef length
3661     #undef max
3662     #undef min
3663     #undef number
3664     #undef offset
3665     #undef op
3666     #undef save_capture_last
3667     #undef save_offset1
3668     #undef save_offset2
3669     #undef save_offset3
3670     #undef stacksave
3671    
3672     #undef newptrb
3673    
3674     #endif  /* NO_RECURSE */
3675    
3676     /* fc and fi are defined as macros in both cases (with and without
         NO_RECURSE), so they are undefined unconditionally, outside the #ifdef. */
3677    
3678     #undef fc
3679     #undef fi
3680    
3681     /***************************************************************************
3682     ***************************************************************************/
3683    
3684    
3685    
3686     /*************************************************
3687     * Execute a Regular Expression *
3688     *************************************************/
3689    
3690     /* This function applies a compiled re to a subject string and picks out
3691     portions of the string if it matches. Two elements in the vector are set for
3692     each substring: the offsets to the start and end of the substring.
3693    
3694     Arguments:
3695     argument_re points to the compiled expression
3696     extra_data points to extra data or is NULL
3697     subject points to the subject string
3698     length length of subject string (may contain binary zeros)
3699     start_offset where to start in the subject string
3700     options option bits
3701     offsets points to a vector of ints to be filled in with offsets
3702     offsetcount the number of elements in the vector
3703    
3704     Returns: > 0 => success; value is the number of elements filled in
3705     = 0 => success, but offsets is not big enough
3706     -1 => failed to match
3707     < -1 => some kind of unexpected problem
3708     */
3709    
3710 nigel 87 PCRE_DATA_SCOPE int
3711 nigel 77 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3712 nigel 87 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3713 nigel 77 int offsetcount)
3714     {
3715     int rc, resetcount, ocount;
3716     int first_byte = -1;
3717     int req_byte = -1;
3718     int req_byte2 = -1;
3719 nigel 91 int newline;
3720     unsigned long int ims;
3721 nigel 77 BOOL using_temporary_offsets = FALSE;
3722     BOOL anchored;
3723     BOOL startline;
3724     BOOL firstline;
3725     BOOL first_byte_caseless = FALSE;
3726     BOOL req_byte_caseless = FALSE;
3727 nigel 93 BOOL utf8;
3728 nigel 77 match_data match_block;
3729 nigel 91 match_data *md = &match_block;
3730 nigel 77 const uschar *tables;
3731     const uschar *start_bits = NULL;
3732 nigel 87 USPTR start_match = (USPTR)subject + start_offset;
3733     USPTR end_subject;
3734     USPTR req_byte_ptr = start_match - 1;
3735 nigel 93 eptrblock eptrchain[EPTR_WORK_SIZE];
3736 nigel 77
3737     pcre_study_data internal_study;
3738     const pcre_study_data *study;
3739    
3740     real_pcre internal_re;
3741     const real_pcre *external_re = (const real_pcre *)argument_re;
3742     const real_pcre *re = external_re;
3743    
3744     /* Plausibility checks */
3745    
3746     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3747     if (re == NULL || subject == NULL ||
3748     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3749     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3750    
3751     /* Fish out the optional data from the extra_data structure, first setting
3752     the default values. */
3753    
3754     study = NULL;
3755 nigel 91 md->match_limit = MATCH_LIMIT;
3756     md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3757     md->callout_data = NULL;
3758 nigel 77
3759     /* The table pointer is always in native byte order. */
3760    
3761     tables = external_re->tables;
3762    
3763     if (extra_data != NULL)
3764     {
3765     register unsigned int flags = extra_data->flags;
3766     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3767     study = (const pcre_study_data *)extra_data->study_data;
3768     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3769 nigel 91 md->match_limit = extra_data->match_limit;
3770 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3771 nigel 91 md->match_limit_recursion = extra_data->match_limit_recursion;
3772 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3773 nigel 91 md->callout_data = extra_data->callout_data;
3774 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3775     }
3776    
3777     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3778     is a feature that makes it possible to save compiled regex and re-use them
3779     in other programs later. */
3780    
3781     if (tables == NULL) tables = _pcre_default_tables;
3782    
3783     /* Check that the first field in the block is the magic number. If it is not,
3784     test for a regex that was compiled on a host of opposite endianness. If this is
3785     the case, flipped values are put in internal_re and internal_study if there was
3786     study data too. */
3787    
3788     if (re->magic_number != MAGIC_NUMBER)
3789     {
3790     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3791     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3792     if (study != NULL) study = &internal_study;
3793     }
3794    
3795     /* Set up other data */
3796    
3797     anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3798     startline = (re->options & PCRE_STARTLINE) != 0;
3799     firstline = (re->options & PCRE_FIRSTLINE) != 0;
3800    
3801     /* The code starts after the real_pcre block and the capture name table. */
3802    
3803 nigel 91 md->start_code = (const uschar *)external_re + re->name_table_offset +
3804 nigel 77 re->name_count * re->name_entry_size;
3805    
3806 nigel 91 md->start_subject = (USPTR)subject;
3807     md->start_offset = start_offset;
3808     md->end_subject = md->start_subject + length;
3809     end_subject = md->end_subject;
3810 nigel 77
3811 nigel 91 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3812 nigel 93 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3813 nigel 77
3814 nigel 91 md->notbol = (options & PCRE_NOTBOL) != 0;
3815     md->noteol = (options & PCRE_NOTEOL) != 0;
3816     md->notempty = (options & PCRE_NOTEMPTY) != 0;
3817     md->partial = (options & PCRE_PARTIAL) != 0;
3818     md->hitend = FALSE;
3819 nigel 77
3820 nigel 91 md->recursive = NULL; /* No recursion at top level */
3821 nigel 93 md->eptrchain = eptrchain; /* Make workspace generally available */
3822 nigel 77
3823 nigel 91 md->lcc = tables + lcc_offset;
3824     md->ctypes = tables + ctypes_offset;
3825 nigel 77
3826 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3827     nothing is set at run time, whatever was used at compile time applies. */
3828 nigel 91
3829 nigel 93 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : options) &
3830     PCRE_NEWLINE_BITS)
3831 nigel 91 {
3832 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3833 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
3834     case PCRE_NEWLINE_LF: newline = '\n'; break;
3835     case PCRE_NEWLINE_CR+
3836     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3837 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3838     default: return PCRE_ERROR_BADNEWLINE;
3839 nigel 91 }
3840    
3841 nigel 93 if (newline < 0)
3842 nigel 91 {
3843 nigel 93 md->nltype = NLTYPE_ANY;
3844 nigel 91 }
3845     else
3846     {
3847 nigel 93 md->nltype = NLTYPE_FIXED;
3848     if (newline > 255)
3849     {
3850     md->nllen = 2;
3851     md->nl[0] = (newline >> 8) & 255;
3852     md->nl[1] = newline & 255;
3853     }
3854     else
3855     {
3856     md->nllen = 1;
3857     md->nl[0] = newline;
3858     }
3859 nigel 91 }
3860    
3861 nigel 77 /* Partial matching is supported only for a restricted set of regexes at the
3862     moment. */
3863    
3864 nigel 91 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3865 nigel 77 return PCRE_ERROR_BADPARTIAL;
3866    
3867     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3868     back the character offset. */
3869    
3870     #ifdef SUPPORT_UTF8
3871 nigel 93 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3872 nigel 77 {
3873     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3874     return PCRE_ERROR_BADUTF8;
3875     if (start_offset > 0 && start_offset < length)
3876     {
3877     int tb = ((uschar *)subject)[start_offset];
3878     if (tb > 127)
3879     {
3880     tb &= 0xc0;
3881     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3882     }
3883     }
3884     }
3885     #endif
3886    
3887     /* The ims options can vary during the matching as a result of the presence
3888     of (?ims) items in the pattern. They are kept in a local variable so that
3889     restoring at the exit of a group is easy. */
3890    
3891     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3892    
3893     /* If the expression has got more back references than the offsets supplied can
3894     hold, we get a temporary chunk of working store to use during the matching.
3895     Otherwise, we can use the vector supplied, rounding down its size to a multiple
3896     of 3. */
3897    
3898     ocount = offsetcount - (offsetcount % 3);
3899    
3900     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3901     {
3902     ocount = re->top_backref * 3 + 3;
3903 nigel 91 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3904     if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3905 nigel 77 using_temporary_offsets = TRUE;
3906     DPRINTF(("Got memory to hold back references\n"));
3907     }
3908 nigel 91 else md->offset_vector = offsets;
3909 nigel 77
3910 nigel 91 md->offset_end = ocount;
3911     md->offset_max = (2*ocount)/3;
3912     md->offset_overflow = FALSE;
3913     md->capture_last = -1;
3914 nigel 77
3915     /* Compute the minimum number of offsets that we need to reset each time. Doing
3916     this makes a huge difference to execution time when there aren't many brackets
3917     in the pattern. */
3918    
3919     resetcount = 2 + re->top_bracket * 2;
3920     if (resetcount > offsetcount) resetcount = ocount;
3921    
3922     /* Reset the working variables associated with each extraction. These should
3923     never be used unless previously set, but they get saved and restored, and so we
3924     initialize them to avoid reading uninitialized locations. */
3925    
3926 nigel 91 if (md->offset_vector != NULL)
3927 nigel 77 {
3928 nigel 91 register int *iptr = md->offset_vector + ocount;
3929 nigel 77 register int *iend = iptr - resetcount/2 + 1;
3930     while (--iptr >= iend) *iptr = -1;
3931     }
3932    
3933     /* Set up the first character to match, if available. The first_byte value is
3934     never set for an anchored regular expression, but the anchoring may be forced
3935     at run time, so we have to test for anchoring. The first char may be unset for
3936     an unanchored pattern, of course. If there's no first char and the pattern was
3937     studied, there may be a bitmap of possible first characters. */
3938    
3939     if (!anchored)
3940     {
3941     if ((re->options & PCRE_FIRSTSET) != 0)
3942     {
3943     first_byte = re->first_byte & 255;
3944     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3945 nigel 91 first_byte = md->lcc[first_byte];
3946 nigel 77 }
3947     else
3948     if (!startline && study != NULL &&
3949     (study->options & PCRE_STUDY_MAPPED) != 0)
3950     start_bits = study->start_bits;
3951     }
3952    
3953     /* For anchored or unanchored matches, there may be a "last known required
3954     character" set. */
3955    
3956     if ((re->options & PCRE_REQCHSET) != 0)
3957     {
3958     req_byte = re->req_byte & 255;
3959     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
3960     req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
3961     }
3962    
3963 nigel 93
3964     /* ==========================================================================*/
3965    
3966 nigel 77 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
3967     the loop runs just once. */
3968    
3969 nigel 93 for(;;)
3970 nigel 77 {
3971 nigel 87 USPTR save_end_subject = end_subject;
3972 nigel 77
3973     /* Reset the maximum number of extractions we might see. */
3974    
3975 nigel 91 if (md->offset_vector != NULL)
3976 nigel 77 {
3977 nigel 91 register int *iptr = md->offset_vector;
3978 nigel 77 register int *iend = iptr + resetcount;
3979     while (iptr < iend) *iptr++ = -1;
3980     }
3981    
3982     /* Advance to a unique first char if possible. If firstline is TRUE, the
3983     start of the match is constrained to the first line of a multiline string.
3984 nigel 93 That is, the match must be before or at the first newline. Implement this by
3985     temporarily adjusting end_subject so that we stop scanning at a newline. If
3986     the match fails at the newline, later code breaks this loop. */
3987 nigel 77
3988     if (firstline)
3989     {
3990 nigel 87 USPTR t = start_match;
3991 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
3992 nigel 77 end_subject = t;
3993     }
3994    
3995     /* Now test for a unique first byte */
3996    
3997     if (first_byte >= 0)
3998     {
3999     if (first_byte_caseless)
4000     while (start_match < end_subject &&
4001 nigel 91 md->lcc[*start_match] != first_byte)
4002 nigel 77 start_match++;
4003     else
4004     while (start_match < end_subject && *start_match != first_byte)
4005     start_match++;
4006     }
4007    
4008 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
4009 nigel 77
4010     else if (startline)
4011     {
4012 nigel 93 if (start_match > md->start_subject + start_offset)
4013 nigel 77 {
4014 nigel 93 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4015 nigel 77 start_match++;
4016     }
4017     }
4018    
4019     /* Or to a non-unique first char after study */
4020    
4021     else if (start_bits != NULL)
4022     {
4023     while (start_match < end_subject)
4024     {
4025     register unsigned int c = *start_match;
4026     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4027     }
4028     }
4029    
4030     /* Restore fudged end_subject */
4031    
4032     end_subject = save_end_subject;
4033    
4034     #ifdef DEBUG /* Sigh. Some compilers never learn. */
4035     printf(">>>> Match against: ");
4036 nigel 91 pchars(start_match, end_subject - start_match, TRUE, md);
4037 nigel 77 printf("\n");
4038     #endif
4039    
4040     /* If req_byte is set, we know that that character must appear in the subject
4041     for the match to succeed. If the first character is set, req_byte must be
4042     later in the subject; otherwise the test starts at the match point. This
4043     optimization can save a huge amount of backtracking in patterns with nested
4044     unlimited repeats that aren't going to match. Writing separate code for
4045     cased/caseless versions makes it go faster, as does using an autoincrement
4046     and backing off on a match.
4047    
4048     HOWEVER: when the subject string is very, very long, searching to its end can
4049     take a long time, and give bad performance on quite ordinary patterns. This
4050 nigel 93 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4051     string... so we don't do this when the string is sufficiently long.
4052 nigel 77
4053     ALSO: this processing is disabled when partial matching is requested.
4054     */
4055    
4056     if (req_byte >= 0 &&
4057     end_subject - start_match < REQ_BYTE_MAX &&
4058 nigel 91 !md->partial)
4059 nigel 77 {
4060 nigel 87 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4061 nigel 77
4062     /* We don't need to repeat the search if we haven't yet reached the
4063     place we found it at last time. */
4064    
4065     if (p > req_byte_ptr)
4066     {
4067     if (req_byte_caseless)
4068     {
4069     while (p < end_subject)
4070     {
4071     register int pp = *p++;
4072     if (pp == req_byte || pp == req_byte2) { p--; break; }
4073     }
4074     }
4075     else
4076     {
4077     while (p < end_subject)
4078     {
4079     if (*p++ == req_byte) { p--; break; }
4080     }
4081     }
4082    
4083 nigel 93 /* If we can't find the required character, break the matching loop,
4084     forcing a match failure. */
4085 nigel 77
4086 nigel 93 if (p >= end_subject)
4087     {
4088     rc = MATCH_NOMATCH;
4089     break;
4090     }
4091 nigel 77
4092     /* If we have found the required character, save the point where we
4093     found it, so that we don't search again next time round the loop if
4094     the start hasn't passed this character yet. */
4095    
4096     req_byte_ptr = p;
4097     }
4098     }
4099    
4100 nigel 93 /* OK, we can now run the match. */
4101 nigel 77
4102 nigel 91 md->start_match = start_match;
4103     md->match_call_count = 0;
4104 nigel 93 md->eptrn = 0; /* Next free eptrchain slot */
4105     rc = match(start_match, md->start_code, 2, md, ims, NULL, 0, 0);
4106 nigel 77
4107 nigel 93 /* Any return other than MATCH_NOMATCH breaks the loop. */
4108 nigel 77
4109 nigel 93 if (rc != MATCH_NOMATCH) break;
4110 nigel 77
4111 nigel 93 /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4112     newline in the subject (though it may continue over the newline). Therefore,
4113     if we have just failed to match, starting at a newline, do not continue. */
4114    
4115     if (firstline && IS_NEWLINE(start_match)) break;
4116    
4117     /* Advance the match position by one character. */
4118    
4119     start_match++;
4120 nigel 77 #ifdef SUPPORT_UTF8
4121 nigel 93 if (utf8)
4122     while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4123     start_match++;
4124 nigel 77 #endif
4125    
4126 nigel 93 /* Break the loop if the pattern is anchored or if we have passed the end of
4127     the subject. */
4128 nigel 77
4129 nigel 93 if (anchored || start_match > end_subject) break;
4130 nigel 77
4131 nigel 93 /* If we have just passed a CR and the newline option is CRLF or ANY, and we
4132     are now at a LF, advance the match position by one more character. */
4133    
4134     if (start_match[-1] == '\r' &&
4135     (md->nltype == NLTYPE_ANY || md->nllen == 2) &&
4136     start_match < end_subject &&
4137     *start_match == '\n')
4138     start_match++;
4139    
4140     } /* End of for(;;) "bumpalong" loop */
4141    
4142     /* ==========================================================================*/
4143    
4144     /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4145     conditions is true:
4146    
4147     (1) The pattern is anchored;
4148    
4149     (2) We are past the end of the subject;
4150    
4151     (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4152     this option requests that a match occur at or before the first newline in
4153     the subject.
4154    
4155     When we have a match and the offset vector is big enough to deal with any
4156     backreferences, captured substring offsets will already be set up. In the case
4157     where we had to get some local store to hold offsets for backreference
4158     processing, copy those that we can. In this case there need not be overflow if
4159     certain parts of the pattern were not used, even though there are more
4160     capturing parentheses than vector slots. */
4161    
4162     if (rc == MATCH_MATCH)
4163     {
4164 nigel 77 if (using_temporary_offsets)
4165     {
4166     if (offsetcount >= 4)
4167     {
4168 nigel 91 memcpy(offsets + 2, md->offset_vector + 2,
4169 nigel 77 (offsetcount - 2) * sizeof(int));
4170     DPRINTF(("Copied offsets from temporary memory\n"));
4171     }
4172 nigel 93 if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4173 nigel 77 DPRINTF(("Freeing temporary memory\n"));
4174 nigel 91 (pcre_free)(md->offset_vector);
4175 nigel 77 }
4176    
4177 nigel 93 /* Set the return code to the number of captured strings, or 0 if there are
4178     too many to fit into the vector. */
4179    
4180 nigel 91 rc = md->offset_overflow? 0 : md->end_offset_top/2;
4181 nigel 77
4182 nigel 93 /* If there is space, set up the whole thing as substring 0. */
4183    
4184 nigel 77 if (offsetcount < 2) rc = 0; else
4185     {
4186 nigel 91 offsets[0] = start_match - md->start_subject;
4187     offsets[1] = md->end_match_ptr - md->start_subject;
4188 nigel 77 }
4189    
4190     DPRINTF((">>>> returning %d\n", rc));
4191     return rc;