/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 165 - (hide annotations) (download)
Wed May 9 10:50:57 2007 UTC (7 years, 3 months ago) by ph10
File MIME type: text/plain
File size: 129791 byte(s)
Non-longjmp heap recursion.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 nigel 93 #define NLBLOCK md /* Block containing newline information */
46     #define PSSTART start_subject /* Field containing processed string start */
47     #define PSEND end_subject /* Field containing processed string end */
48    
49 nigel 77 #include "pcre_internal.h"
50    
51 ph10 137 /* Undefine some potentially clashing cpp symbols */
52    
53     #undef min
54     #undef max
55    
56 nigel 93 /* The chain of eptrblocks for tail recursions uses memory in stack workspace,
57     obtained at top level, the size of which is defined by EPTR_WORK_SIZE. */
58 nigel 77
59 nigel 93 #define EPTR_WORK_SIZE (1000)
60 nigel 77
61     /* Flag bits for the match() function */
62    
63 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
64     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
65     #define match_tail_recursed 0x04 /* Tail recursive call */
66 nigel 77
67     /* Non-error returns from the match() function. Error returns are externally
68     defined PCRE_ERROR_xxx codes, which are all negative. */
69    
70     #define MATCH_MATCH 1
71     #define MATCH_NOMATCH 0
72    
73     /* Maximum number of ints of offset to save on the stack for recursive calls.
74     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
75     because the offset vector is always a multiple of 3 long. */
76    
77     #define REC_STACK_SAVE_MAX 30
78    
79     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
80    
/* Parallel six-entry tables: entry k of rep_min is the minimum repeat count
and entry k of rep_max the maximum for the k-th "common repeat". A maximum of
0 stands for "no upper limit".
NOTE(review): the code that indexes these tables is outside this view; the
values look like the pairs for *, *?, +, +?, ?, ?? in that order, presumably
indexed by (opcode - first repeat opcode) -- confirm against the users. */
81     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
82     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
83    
84    
85    
86     #ifdef DEBUG
87     /*************************************************
88     * Debugging function to print chars *
89     *************************************************/
90    
91     /* Print a sequence of chars in printable format, stopping at the end of the
92     subject if the requested.
93    
94     Arguments:
95     p points to characters
96     length number to print
97     is_subject TRUE if printing from within md->start_subject
98     md pointer to matching data block, if is_subject is TRUE
99    
100     Returns: nothing
101     */
102    
103     static void
104     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
105     {
106 nigel 93 unsigned int c;
107 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
108     while (length-- > 0)
109     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
110     }
111     #endif
112    
113    
114    
115     /*************************************************
116     * Match a back-reference *
117     *************************************************/
118    
119     /* If a back reference hasn't been set, the length that is passed is greater
120     than the number of characters left in the string, so the match fails.
121    
122     Arguments:
123     offset index into the offset vector
124     eptr points into the subject
125     length length to be matched
126     md points to match data block
127     ims the ims flags
128    
129     Returns: TRUE if matched
130     */
131    
132     static BOOL
133 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
134 nigel 77 unsigned long int ims)
135     {
136 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
137 nigel 77
138     #ifdef DEBUG
139     if (eptr >= md->end_subject)
140     printf("matching subject <null>");
141     else
142     {
143     printf("matching subject ");
144     pchars(eptr, length, TRUE, md);
145     }
146     printf(" against backref ");
147     pchars(p, length, FALSE, md);
148     printf("\n");
149     #endif
150    
151     /* Always fail if not enough characters left */
152    
153     if (length > md->end_subject - eptr) return FALSE;
154    
155     /* Separate the caselesss case for speed */
156    
157     if ((ims & PCRE_CASELESS) != 0)
158     {
159     while (length-- > 0)
160     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
161     }
162     else
163     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
164    
165     return TRUE;
166     }
167    
168    
169    
170     /***************************************************************************
171     ****************************************************************************
172     RECURSION IN THE match() FUNCTION
173    
174 nigel 87 The match() function is highly recursive, though not every recursive call
175     increases the recursive depth. Nevertheless, some regular expressions can cause
176     it to recurse to a great depth. I was writing for Unix, so I just let it call
177     itself recursively. This uses the stack for saving everything that has to be
178     saved for a recursive call. On Unix, the stack can be large, and this works
179     fine.
180 nigel 77
181 nigel 87 It turns out that on some non-Unix-like systems there are problems with
182     programs that use a lot of stack. (This despite the fact that every last chip
183     has oodles of memory these days, and techniques for extending the stack have
184     been known for decades.) So....
185 nigel 77
186     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
187     calls by keeping local variables that need to be preserved in blocks of memory
188 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
189 nigel 77 achieve this so that the actual code doesn't look very different to what it
190     always used to.
191 ph10 164
192 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
193 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
194     Switzer, the use of longjmp() has been abolished, at the cost of having to
195     provide a unique number for each call to RMATCH. There is no way of generating
196     a sequence of numbers at compile time in C. I have given them names, to make
197     them stand out more clearly.
198    
199     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
200     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
201 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
202     don't have indeterminate values; this has meant that the frame size can be
203 ph10 164 reduced because the result can be "passed back" by straight setting of the
204     variable instead of being passed in the frame.
205 nigel 77 ****************************************************************************
206     ***************************************************************************/
207    
208    
/* Identifiers for the individual RMATCH call sites. Each use of RMATCH passes
a different one of these as its last argument so that the heap-based
"recursion" knows where to resume. Numbering starts at 1 and is consecutive. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,
  RM9,  RM10, RM11, RM12, RM13, RM14, RM15, RM16,
  RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24,
  RM25, RM26, RM27, RM28, RM29, RM30, RM31, RM32,
  RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41, RM42, RM43, RM44, RM45, RM46, RM47
};
216    
217 ph10 165
218 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
219 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
220 ph10 164 actually used in this definition. */
221 nigel 77
222     #ifndef NO_RECURSE
223     #define REGISTER register
224 ph10 164
225 nigel 87 #ifdef DEBUG
/* Debugging RMATCH: report the source line of the call, make a genuine
recursive call to match() one level deeper (rdepth+1), leave its result in the
caller's rrc, then report the line again on the way back. The rw argument is
accepted but not used. */
226 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
227 nigel 87 { \
228     printf("match() called in line %d\n", __LINE__); \
229 ph10 164 rrc = match(ra,rb,rc,rd,re,rf,rg,rdepth+1); \
230 nigel 87 printf("to line %d\n", __LINE__); \
231     }
/* Debugging RRETURN: print the value being returned and the source line, then
return it. Note that ra appears twice in the expansion, so it is evaluated
twice. */
232     #define RRETURN(ra) \
233     { \
234     printf("match() returned %d from line %d ", ra, __LINE__); \
235     return ra; \
236     }
237     #else
/* Production RMATCH: just the recursive call, with the result stored in rrc.
The expansion is a bare expression, so the semicolon comes from the call
site. */
238 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
239     rrc = match(ra,rb,rc,rd,re,rf,rg,rdepth+1)
/* Production RRETURN: an ordinary return statement. */
240 nigel 77 #define RRETURN(ra) return ra
241 nigel 87 #endif
242    
243 nigel 77 #else
244    
245    
246 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
247     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
248     argument of match(), which never changes. */
249 nigel 77
250     #define REGISTER
251    
/* RMATCH simulates a recursive call of match() without using the C stack.

  ra, rb, rc   become the new frame's eptr, ecode and offset_top
  rd           the md argument; not used, because md never changes
  re, rf, rg   become the new frame's ims, eptrb and flags
  rw           one of the RMn numbers; it is stored in the calling frame's
               Xwhere and also forms the label L_<rw> placed after the jump,
               which is where execution resumes when the "call" returns

A new frame is obtained from pcre_stack_malloc, chained to the current one via
Xprevframe, made current, and control jumps to HEAP_RECURSE. If the allocator
returns NULL, the current invocation gives up with PCRE_ERROR_NOMEMORY instead
of writing through a null pointer; RRETURN then unwinds the current frame in
the usual way, so the error propagates like any other negative return. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates returning the value ra from the current "invocation". The
current frame is released with pcre_stack_free and its predecessor becomes
current. If there is a predecessor, the value is handed over in rrc and control
goes to HEAP_RETURN (defined outside this macro), which resumes the caller;
otherwise this was the top-level frame and match() really returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
283    
284    
285     /* Structure for remembering the local variables in a private frame */
286    
287     typedef struct heapframe {
288     struct heapframe *Xprevframe;
289    
290     /* Function arguments that may change */
291    
292     const uschar *Xeptr;
293     const uschar *Xecode;
294     int Xoffset_top;
295     long int Xims;
296     eptrblock *Xeptrb;
297     int Xflags;
298 nigel 91 unsigned int Xrdepth;
299 nigel 77
300     /* Function local variables */
301    
302     const uschar *Xcallpat;
303     const uschar *Xcharptr;
304     const uschar *Xdata;
305     const uschar *Xnext;
306     const uschar *Xpp;
307     const uschar *Xprev;
308     const uschar *Xsaved_eptr;
309    
310     recursion_info Xnew_recursive;
311    
312     BOOL Xcur_is_word;
313     BOOL Xcondition;
314     BOOL Xprev_is_word;
315    
316     unsigned long int Xoriginal_ims;
317    
318     #ifdef SUPPORT_UCP
319     int Xprop_type;
320 nigel 87 int Xprop_value;
321 nigel 77 int Xprop_fail_result;
322     int Xprop_category;
323     int Xprop_chartype;
324 nigel 87 int Xprop_script;
325 ph10 123 int Xoclength;
326     uschar Xocchars[8];
327 nigel 77 #endif
328    
329     int Xctype;
330 nigel 93 unsigned int Xfc;
331 nigel 77 int Xfi;
332     int Xlength;
333     int Xmax;
334     int Xmin;
335     int Xnumber;
336     int Xoffset;
337     int Xop;
338     int Xsave_capture_last;
339     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
340     int Xstacksave[REC_STACK_SAVE_MAX];
341    
342     eptrblock Xnewptrb;
343    
344 ph10 164 /* Where to jump back to */
345 nigel 77
346 ph10 164 int Xwhere;
347 ph10 165
348 nigel 77 } heapframe;
349    
350     #endif
351    
352    
353     /***************************************************************************
354     ***************************************************************************/
355    
356    
357    
358     /*************************************************
359     * Match from current position *
360     *************************************************/
361    
362 nigel 93 /* This function is called recursively in many circumstances. Whenever it
363 nigel 77 returns a negative (error) response, the outer incarnation must also return the
364     same response.
365    
366     Performance note: It might be tempting to extract commonly used fields from the
367     md structure (e.g. utf8, end_subject) into individual variables to improve
368     performance. Tests using gcc on a SPARC disproved this; in the first case, it
369     made performance worse.
370    
371     Arguments:
372 nigel 93 eptr pointer to current character in subject
373     ecode pointer to current position in compiled code
374 nigel 77 offset_top current top pointer
375     md pointer to "static" info for the match
376     ims current /i, /m, and /s options
377     eptrb pointer to chain of blocks containing eptr at start of
378     brackets - for testing for empty matches
379     flags can contain
380     match_condassert - this is an assertion condition
381 nigel 93 match_cbegroup - this is the start of an unlimited repeat
382     group that can match an empty string
383     match_tail_recursed - this is a tail_recursed group
384 nigel 87 rdepth the recursion depth
385 nigel 77
386     Returns: MATCH_MATCH if matched ) these values are >= 0
387     MATCH_NOMATCH if failed to match )
388     a negative PCRE_ERROR_xxx value if aborted by an error condition
389 nigel 87 (e.g. stopped by repeated call or recursion limit)
390 nigel 77 */
391    
392     static int
393 nigel 87 match(REGISTER USPTR eptr, REGISTER const uschar *ecode,
394 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
395 nigel 91 int flags, unsigned int rdepth)
396 nigel 77 {
397     /* These variables do not need to be preserved over recursion in this function,
398 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
399     "register" because they are used a lot in loops. */
400 nigel 77
401 nigel 91 register int rrc; /* Returns from recursive calls */
402     register int i; /* Used for loops not involving calls to RMATCH() */
403 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
404 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
405 nigel 77
406 nigel 93 BOOL minimize, possessive; /* Quantifier options */
407    
408 nigel 77 /* When recursion is not being used, all "local" variables that have to be
409     preserved over calls to RMATCH() are part of a "frame" which is obtained from
410     heap storage. Set up the top-level frame here; others are obtained from the
411     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
412    
413     #ifdef NO_RECURSE
414     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
415     frame->Xprevframe = NULL; /* Marks the top level */
416    
417     /* Copy in the original argument variables */
418    
419     frame->Xeptr = eptr;
420     frame->Xecode = ecode;
421     frame->Xoffset_top = offset_top;
422     frame->Xims = ims;
423     frame->Xeptrb = eptrb;
424     frame->Xflags = flags;
425 nigel 87 frame->Xrdepth = rdepth;
426 nigel 77
427     /* This is where control jumps back to to effect "recursion" */
428    
429     HEAP_RECURSE:
430    
431     /* Macros make the argument variables come from the current frame */
432    
433     #define eptr frame->Xeptr
434     #define ecode frame->Xecode
435     #define offset_top frame->Xoffset_top
436     #define ims frame->Xims
437     #define eptrb frame->Xeptrb
438     #define flags frame->Xflags
439 nigel 87 #define rdepth frame->Xrdepth
440 nigel 77
441     /* Ditto for the local variables */
442    
443     #ifdef SUPPORT_UTF8
444     #define charptr frame->Xcharptr
445     #endif
446     #define callpat frame->Xcallpat
447     #define data frame->Xdata
448     #define next frame->Xnext
449     #define pp frame->Xpp
450     #define prev frame->Xprev
451     #define saved_eptr frame->Xsaved_eptr
452    
453     #define new_recursive frame->Xnew_recursive
454    
455     #define cur_is_word frame->Xcur_is_word
456     #define condition frame->Xcondition
457     #define prev_is_word frame->Xprev_is_word
458    
459     #define original_ims frame->Xoriginal_ims
460    
461     #ifdef SUPPORT_UCP
462     #define prop_type frame->Xprop_type
463 nigel 87 #define prop_value frame->Xprop_value
464 nigel 77 #define prop_fail_result frame->Xprop_fail_result
465     #define prop_category frame->Xprop_category
466     #define prop_chartype frame->Xprop_chartype
467 nigel 87 #define prop_script frame->Xprop_script
468 ph10 115 #define oclength frame->Xoclength
469     #define occhars frame->Xocchars
470 nigel 77 #endif
471    
472     #define ctype frame->Xctype
473     #define fc frame->Xfc
474     #define fi frame->Xfi
475     #define length frame->Xlength
476     #define max frame->Xmax
477     #define min frame->Xmin
478     #define number frame->Xnumber
479     #define offset frame->Xoffset
480     #define op frame->Xop
481     #define save_capture_last frame->Xsave_capture_last
482     #define save_offset1 frame->Xsave_offset1
483     #define save_offset2 frame->Xsave_offset2
484     #define save_offset3 frame->Xsave_offset3
485     #define stacksave frame->Xstacksave
486    
487     #define newptrb frame->Xnewptrb
488    
489     /* When recursion is being used, local variables are allocated on the stack and
490     get preserved during recursion in the normal way. In this environment, fi and
491     i, and fc and c, can be the same variables. */
492    
493 nigel 93 #else /* NO_RECURSE not defined */
494 nigel 77 #define fi i
495     #define fc c
496    
497    
498 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
499     const uschar *charptr; /* in small blocks of the code. My normal */
500     #endif /* style of coding would have declared */
501     const uschar *callpat; /* them within each of those blocks. */
502     const uschar *data; /* However, in order to accommodate the */
503     const uschar *next; /* version of this code that uses an */
504     USPTR pp; /* external "stack" implemented on the */
505     const uschar *prev; /* heap, it is easier to declare them all */
506     USPTR saved_eptr; /* here, so the declarations can be cut */
507     /* out in a block. The only declarations */
508     recursion_info new_recursive; /* within blocks below are for variables */
509     /* that do not have to be preserved over */
510     BOOL cur_is_word; /* a recursive call to RMATCH(). */
511     BOOL condition;
512 nigel 77 BOOL prev_is_word;
513    
514     unsigned long int original_ims;
515    
516     #ifdef SUPPORT_UCP
517     int prop_type;
518 nigel 87 int prop_value;
519 nigel 77 int prop_fail_result;
520     int prop_category;
521     int prop_chartype;
522 nigel 87 int prop_script;
523 ph10 115 int oclength;
524     uschar occhars[8];
525 nigel 77 #endif
526    
527     int ctype;
528     int length;
529     int max;
530     int min;
531     int number;
532     int offset;
533     int op;
534     int save_capture_last;
535     int save_offset1, save_offset2, save_offset3;
536     int stacksave[REC_STACK_SAVE_MAX];
537    
538     eptrblock newptrb;
539 nigel 93 #endif /* NO_RECURSE */
540 nigel 77
541     /* These statements are here to stop the compiler complaining about uninitialized
542     variables. */
543    
544     #ifdef SUPPORT_UCP
545 nigel 87 prop_value = 0;
546 nigel 77 prop_fail_result = 0;
547     #endif
548    
549 nigel 93
550 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
551     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
552     used. Thanks to Ian Taylor for noticing this possibility and sending the
553     original patch. */
554    
555     TAIL_RECURSE:
556    
557 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
558     are specified by the macro RMATCH and RRETURN is used to return. When
559     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
560     and a "return", respectively (possibly with some debugging if DEBUG is
561     defined). However, RMATCH isn't like a function call because it's quite a
562     complicated macro. It has to be used in one particular way. This shouldn't,
563     however, impact performance when true recursion is being used. */
564 nigel 77
565 ph10 164 #ifdef SUPPORT_UTF8
566     utf8 = md->utf8; /* Local copy of the flag */
567     #else
568     utf8 = FALSE;
569     #endif
570    
571 nigel 87 /* First check that we haven't called match() too many times, or that we
572     haven't exceeded the recursive call limit. */
573    
574 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
575 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
576 nigel 77
577     original_ims = ims; /* Save for resetting on ')' */
578 nigel 91
579 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
580     string, the match_cbegroup flag is set. When this is the case, add the current
581     subject pointer to the chain of such remembered pointers, to be checked when we
582     hit the closing ket, in order to break infinite loops that match no characters.
583     When match() is called in other circumstances, don't add to the chain. If this
584     is a tail recursion, use a block from the workspace, as the one on the stack is
585     already used. */
586 nigel 77
587 nigel 93 if ((flags & match_cbegroup) != 0)
588 nigel 77 {
589 nigel 93 eptrblock *p;
590     if ((flags & match_tail_recursed) != 0)
591     {
592     if (md->eptrn >= EPTR_WORK_SIZE) RRETURN(PCRE_ERROR_NULLWSLIMIT);
593     p = md->eptrchain + md->eptrn++;
594     }
595     else p = &newptrb;
596     p->epb_saved_eptr = eptr;
597     p->epb_prev = eptrb;
598     eptrb = p;
599 nigel 77 }
600    
601 nigel 93 /* Now start processing the opcodes. */
602 nigel 77
603     for (;;)
604     {
605 nigel 93 minimize = possessive = FALSE;
606 nigel 77 op = *ecode;
607    
608     /* For partial matching, remember if we ever hit the end of the subject after
609     matching at least one subject character. */
610    
611     if (md->partial &&
612     eptr >= md->end_subject &&
613     eptr > md->start_match)
614     md->hitend = TRUE;
615    
616 nigel 93 switch(op)
617     {
618     /* Handle a capturing bracket. If there is space in the offset vector, save
619     the current subject position in the working slot at the top of the vector.
620     We mustn't change the current values of the data slot, because they may be
621     set from a previous iteration of this group, and be referred to by a
622     reference inside the group.
623 nigel 77
624 nigel 93 If the bracket fails to match, we need to restore this value and also the
625     values of the final offsets, in case they were set by a previous iteration
626     of the same bracket.
627 nigel 77
628 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
629     a non-capturing bracket. Don't worry about setting the flag for the error
630     case here; that is handled in the code for KET. */
631 nigel 77
632 nigel 93 case OP_CBRA:
633     case OP_SCBRA:
634     number = GET2(ecode, 1+LINK_SIZE);
635 nigel 77 offset = number << 1;
636    
637     #ifdef DEBUG
638 nigel 93 printf("start bracket %d\n", number);
639     printf("subject=");
640 nigel 77 pchars(eptr, 16, TRUE, md);
641     printf("\n");
642     #endif
643    
644     if (offset < md->offset_max)
645     {
646     save_offset1 = md->offset_vector[offset];
647     save_offset2 = md->offset_vector[offset+1];
648     save_offset3 = md->offset_vector[md->offset_end - number];
649     save_capture_last = md->capture_last;
650    
651     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
652     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
653    
654 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
655 nigel 77 do
656     {
657 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
658     ims, eptrb, flags, RM1);
659 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
660     md->capture_last = save_capture_last;
661     ecode += GET(ecode, 1);
662     }
663     while (*ecode == OP_ALT);
664    
665     DPRINTF(("bracket %d failed\n", number));
666    
667     md->offset_vector[offset] = save_offset1;
668     md->offset_vector[offset+1] = save_offset2;
669     md->offset_vector[md->offset_end - number] = save_offset3;
670    
671     RRETURN(MATCH_NOMATCH);
672     }
673    
674 nigel 93 /* Insufficient room for saving captured contents. Treat as a non-capturing
675     bracket. */
676 nigel 77
677 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
678 nigel 77
679 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
680     final alternative within the brackets, we would return the result of a
681     recursive call to match() whatever happened. We can reduce stack usage by
682     turning this into a tail recursion. */
683 nigel 77
684 nigel 93 case OP_BRA:
685     case OP_SBRA:
686     DPRINTF(("start non-capturing bracket\n"));
687     flags = (op >= OP_SBRA)? match_cbegroup : 0;
688 nigel 91 for (;;)
689 nigel 77 {
690 nigel 91 if (ecode[GET(ecode, 1)] != OP_ALT)
691 nigel 93 {
692     ecode += _pcre_OP_lengths[*ecode];
693     flags |= match_tail_recursed;
694     DPRINTF(("bracket 0 tail recursion\n"));
695     goto TAIL_RECURSE;
696     }
697 nigel 91
698     /* For non-final alternatives, continue the loop for a NOMATCH result;
699     otherwise return. */
700    
701 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
702     eptrb, flags, RM2);
703 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
704     ecode += GET(ecode, 1);
705     }
706 nigel 91 /* Control never reaches here. */
707 nigel 77
708     /* Conditional group: compilation checked that there are no more than
709     two branches. If the condition is false, skipping the first branch takes us
710     past the end if there is only one branch, but that's OK because that is
711 nigel 91 exactly what going to the ket would do. As there is only one branch to be
712     obeyed, we can use tail recursion to avoid using another stack frame. */
713 nigel 77
714     case OP_COND:
715 nigel 93 case OP_SCOND:
716     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
717 nigel 77 {
718 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
719     condition = md->recursive != NULL &&
720     (offset == RREF_ANY || offset == md->recursive->group_num);
721     ecode += condition? 3 : GET(ecode, 1);
722     }
723    
724     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
725     {
726 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
727 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
728     ecode += condition? 3 : GET(ecode, 1);
729 nigel 77 }
730    
731 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
732     {
733     condition = FALSE;
734     ecode += GET(ecode, 1);
735     }
736    
737 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
738 nigel 93 the final argument match_condassert causes it to stop at the end of an
739     assertion. */
740 nigel 77
741     else
742     {
743 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
744     match_condassert, RM3);
745 nigel 77 if (rrc == MATCH_MATCH)
746     {
747 nigel 93 condition = TRUE;
748     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
749 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
750     }
751     else if (rrc != MATCH_NOMATCH)
752     {
753     RRETURN(rrc); /* Need braces because of following else */
754     }
755 nigel 93 else
756     {
757     condition = FALSE;
758     ecode += GET(ecode, 1);
759     }
760     }
761 nigel 91
762 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
763     we can use tail recursion to avoid using another stack frame. If the second
764     alternative doesn't exist, we can just plough on. */
765 nigel 91
766 nigel 93 if (condition || *ecode == OP_ALT)
767     {
768 nigel 91 ecode += 1 + LINK_SIZE;
769 nigel 93 flags = match_tail_recursed | ((op == OP_SCOND)? match_cbegroup : 0);
770 nigel 91 goto TAIL_RECURSE;
771 nigel 77 }
772 nigel 93 else
773     {
774     ecode += 1 + LINK_SIZE;
775     }
776     break;
777 nigel 77
778    
779 nigel 93 /* End of the pattern. If we are in a top-level recursion, we should
780     restore the offsets appropriately and continue from after the call. */
781 nigel 77
782     case OP_END:
783     if (md->recursive != NULL && md->recursive->group_num == 0)
784     {
785     recursion_info *rec = md->recursive;
786 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
787 nigel 77 md->recursive = rec->prevrec;
788     memmove(md->offset_vector, rec->offset_save,
789     rec->saved_max * sizeof(int));
790     md->start_match = rec->save_start;
791     ims = original_ims;
792     ecode = rec->after_call;
793     break;
794     }
795    
796     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
797     string - backtracking will then try other alternatives, if any. */
798    
799     if (md->notempty && eptr == md->start_match) RRETURN(MATCH_NOMATCH);
800     md->end_match_ptr = eptr; /* Record where we ended */
801     md->end_offset_top = offset_top; /* and how many extracts were taken */
802     RRETURN(MATCH_MATCH);
803    
804     /* Change option settings */
805    
806     case OP_OPT:
807     ims = ecode[1];
808     ecode += 2;
809     DPRINTF(("ims set to %02lx\n", ims));
810     break;
811    
812     /* Assertion brackets. Check the alternative branches in turn - the
813     matching won't pass the KET for an assertion. If any one branch matches,
814     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
815     start of each branch to move the current point backwards, so the code at
816     this level is identical to the lookahead case. */
817    
818     case OP_ASSERT:
819     case OP_ASSERTBACK:
820     do
821     {
822 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
823     RM4);
824 nigel 77 if (rrc == MATCH_MATCH) break;
825     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
826     ecode += GET(ecode, 1);
827     }
828     while (*ecode == OP_ALT);
829     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
830    
831     /* If checking an assertion for a condition, return MATCH_MATCH. */
832    
833     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
834    
835     /* Continue from after the assertion, updating the offsets high water
836     mark, since extracts may have been taken during the assertion. */
837    
838     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
839     ecode += 1 + LINK_SIZE;
840     offset_top = md->end_offset_top;
841     continue;
842    
843     /* Negative assertion: all branches must fail to match */
844    
845     case OP_ASSERT_NOT:
846     case OP_ASSERTBACK_NOT:
847     do
848     {
849 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
850     RM5);
851 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
852     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
853     ecode += GET(ecode,1);
854     }
855     while (*ecode == OP_ALT);
856    
857     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
858    
859     ecode += 1 + LINK_SIZE;
860     continue;
861    
862     /* Move the subject pointer back. This occurs only at the start of
863     each branch of a lookbehind assertion. If we are too close to the start to
864     move back, this match function fails. When working with UTF-8 we move
865     back a number of characters, not bytes. */
866    
867     case OP_REVERSE:
868     #ifdef SUPPORT_UTF8
869     if (utf8)
870     {
871 nigel 93 i = GET(ecode, 1);
872     while (i-- > 0)
873 nigel 77 {
874     eptr--;
875     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
876     BACKCHAR(eptr)
877     }
878     }
879     else
880     #endif
881    
882     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
883    
884     {
885 nigel 93 eptr -= GET(ecode, 1);
886 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
887     }
888    
889     /* Skip to next op code */
890    
891     ecode += 1 + LINK_SIZE;
892     break;
893    
894     /* The callout item calls an external function, if one is provided, passing
895     details of the match so far. This is mainly for debugging, though the
896     function is able to force a failure. */
897    
898     case OP_CALLOUT:
899     if (pcre_callout != NULL)
900     {
901     pcre_callout_block cb;
902     cb.version = 1; /* Version 1 of the callout block */
903     cb.callout_number = ecode[1];
904     cb.offset_vector = md->offset_vector;
905 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
906 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
907     cb.start_match = md->start_match - md->start_subject;
908     cb.current_position = eptr - md->start_subject;
909     cb.pattern_position = GET(ecode, 2);
910     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
911     cb.capture_top = offset_top/2;
912     cb.capture_last = md->capture_last;
913     cb.callout_data = md->callout_data;
914     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
915     if (rrc < 0) RRETURN(rrc);
916     }
917     ecode += 2 + 2*LINK_SIZE;
918     break;
919    
920     /* Recursion either matches the current regex, or some subexpression. The
921     offset data is the offset to the starting bracket from the start of the
922     whole pattern. (This is so that it works from duplicated subpatterns.)
923    
924     If there are any capturing brackets started but not finished, we have to
925     save their starting points and reinstate them after the recursion. However,
926     we don't know how many such there are (offset_top records the completed
927     total) so we just have to save all the potential data. There may be up to
928     65535 such values, which is too large to put on the stack, but using malloc
929     for small numbers seems expensive. As a compromise, the stack is used when
930     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
931     is used. If the malloc fails, no recursion is attempted: this call of
932     match() returns PCRE_ERROR_NOMEMORY straight away, so the saved offsets are
933     never left incomplete.
934    
935     There are also other values that have to be saved. We use a chained
936     sequence of blocks that actually live on the stack. Thanks to Robin Houston
937     for the original version of this logic. */
938    
939     case OP_RECURSE:
940     {
941     callpat = md->start_code + GET(ecode, 1);
942 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
943     GET2(callpat, 1 + LINK_SIZE);
944 nigel 77
945     /* Add to "recursing stack" */
946    
947     new_recursive.prevrec = md->recursive;
948     md->recursive = &new_recursive;
949    
950     /* Find where to continue from afterwards */
951    
952     ecode += 1 + LINK_SIZE;
953     new_recursive.after_call = ecode;
954    
955     /* Now save the offset data. */
956    
957     new_recursive.saved_max = md->offset_end;
958     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
959     new_recursive.offset_save = stacksave;
960     else
961     {
962     new_recursive.offset_save =
963     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
964     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
965     }
966    
967     memcpy(new_recursive.offset_save, md->offset_vector,
968     new_recursive.saved_max * sizeof(int));
969     new_recursive.save_start = md->start_match;
970     md->start_match = eptr;
971    
972     /* OK, now we can do the recursion. For each top-level alternative we
973     restore the offset and recursion data. */
974    
975     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
976 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
977 nigel 77 do
978     {
979 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
980     md, ims, eptrb, flags, RM6);
981 nigel 77 if (rrc == MATCH_MATCH)
982     {
983 nigel 87 DPRINTF(("Recursion matched\n"));
984 nigel 77 md->recursive = new_recursive.prevrec;
985     if (new_recursive.offset_save != stacksave)
986     (pcre_free)(new_recursive.offset_save);
987     RRETURN(MATCH_MATCH);
988     }
989 nigel 87 else if (rrc != MATCH_NOMATCH)
990     {
991     DPRINTF(("Recursion gave error %d\n", rrc));
992     RRETURN(rrc);
993     }
994 nigel 77
995     md->recursive = &new_recursive;
996     memcpy(md->offset_vector, new_recursive.offset_save,
997     new_recursive.saved_max * sizeof(int));
998     callpat += GET(callpat, 1);
999     }
1000     while (*callpat == OP_ALT);
1001    
1002     DPRINTF(("Recursion didn't match\n"));
1003     md->recursive = new_recursive.prevrec;
1004     if (new_recursive.offset_save != stacksave)
1005     (pcre_free)(new_recursive.offset_save);
1006     RRETURN(MATCH_NOMATCH);
1007     }
1008     /* Control never reaches here */
1009    
1010     /* "Once" brackets are like assertion brackets except that after a match,
1011     the point in the subject string is not moved back. Thus there can never be
1012     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1013     Check the alternative branches in turn - the matching won't pass the KET
1014     for this kind of subpattern. If any one branch matches, we carry on as at
1015     the end of a normal bracket, leaving the subject pointer. */
1016    
1017     case OP_ONCE:
1018 nigel 91 prev = ecode;
1019     saved_eptr = eptr;
1020    
1021     do
1022 nigel 77 {
1023 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims,
1024     eptrb, 0, RM7);
1025 nigel 91 if (rrc == MATCH_MATCH) break;
1026     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1027     ecode += GET(ecode,1);
1028     }
1029     while (*ecode == OP_ALT);
1030 nigel 77
1031 nigel 91 /* If hit the end of the group (which could be repeated), fail */
1032 nigel 77
1033 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1034 nigel 77
1035 nigel 91 /* Continue from after the atomic group, updating the offsets high water
1036     mark, since extracts may have been taken. */
1037 nigel 77
1038 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1039 nigel 77
1040 nigel 91 offset_top = md->end_offset_top;
1041     eptr = md->end_match_ptr;
1042 nigel 77
1043 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1044     happens for a repeating ket if no characters were matched in the group.
1045     This is the forcible breaking of infinite loops as implemented in Perl
1046     5.005. If there is an options reset, it will get obeyed in the normal
1047     course of events. */
1048 nigel 77
1049 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1050     {
1051     ecode += 1+LINK_SIZE;
1052     break;
1053     }
1054 nigel 77
1055 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1056     preceding bracket, in the appropriate order. The second "call" of match()
1057     uses tail recursion, to avoid using another stack frame. We need to reset
1058     any options that changed within the bracket before re-running it, so
1059     check the next opcode. */
1060 nigel 77
1061 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1062     {
1063     ims = (ims & ~PCRE_IMS) | ecode[4];
1064     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1065     }
1066 nigel 77
1067 nigel 91 if (*ecode == OP_KETRMIN)
1068     {
1069 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0,
1070     RM8);
1071 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1072     ecode = prev;
1073 nigel 93 flags = match_tail_recursed;
1074 nigel 91 goto TAIL_RECURSE;
1075 nigel 77 }
1076 nigel 91 else /* OP_KETRMAX */
1077     {
1078 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1079 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1080     ecode += 1 + LINK_SIZE;
1081 nigel 93 flags = match_tail_recursed;
1082 nigel 91 goto TAIL_RECURSE;
1083     }
1084     /* Control never gets here */
1085 nigel 77
1086     /* An alternation is the end of a branch; scan along to find the end of the
1087     bracketed group and go to there. */
1088    
1089     case OP_ALT:
1090     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1091     break;
1092    
1093     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1094     that it may occur zero times. It may repeat infinitely, or not at all -
1095     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1096     repeat limits are compiled as a number of copies, with the optional ones
1097     preceded by BRAZERO or BRAMINZERO. */
1098    
1099     case OP_BRAZERO:
1100     {
1101     next = ecode+1;
1102 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1103 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1104     do next += GET(next,1); while (*next == OP_ALT);
1105 nigel 93 ecode = next + 1 + LINK_SIZE;
1106 nigel 77 }
1107     break;
1108    
1109     case OP_BRAMINZERO:
1110     {
1111     next = ecode+1;
1112 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1113 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1114 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1115     ecode++;
1116     }
1117     break;
1118    
1119 nigel 93 /* End of a group, repeated or non-repeating. */
1120 nigel 77
1121     case OP_KET:
1122     case OP_KETRMIN:
1123     case OP_KETRMAX:
1124 nigel 91 prev = ecode - GET(ecode, 1);
1125 nigel 77
1126 nigel 93 /* If this was a group that remembered the subject start, in order to break
1127     infinite repeats of empty string matches, retrieve the subject start from
1128     the chain. Otherwise, set it NULL. */
1129 nigel 77
1130 nigel 93 if (*prev >= OP_SBRA)
1131     {
1132     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1133     eptrb = eptrb->epb_prev; /* Backup to previous group */
1134     }
1135     else saved_eptr = NULL;
1136 nigel 77
1137 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1138     MATCH_MATCH, but record the current high water mark for use by positive
1139     assertions. Do this also for the "once" (atomic) groups. */
1140    
1141 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1142     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1143     *prev == OP_ONCE)
1144     {
1145     md->end_match_ptr = eptr; /* For ONCE */
1146     md->end_offset_top = offset_top;
1147     RRETURN(MATCH_MATCH);
1148     }
1149 nigel 77
1150 nigel 93 /* For capturing groups we have to check the group number back at the start
1151     and if necessary complete handling an extraction by setting the offsets and
1152     bumping the high water mark. Note that whole-pattern recursion is coded as
1153     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1154     when the OP_END is reached. Other recursion is handled here. */
1155 nigel 77
1156 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1157 nigel 91 {
1158 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1159 nigel 91 offset = number << 1;
1160 nigel 77
1161     #ifdef DEBUG
1162 nigel 91 printf("end bracket %d", number);
1163     printf("\n");
1164 nigel 77 #endif
1165    
1166 nigel 93 md->capture_last = number;
1167     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1168 nigel 91 {
1169 nigel 93 md->offset_vector[offset] =
1170     md->offset_vector[md->offset_end - number];
1171     md->offset_vector[offset+1] = eptr - md->start_subject;
1172     if (offset_top <= offset) offset_top = offset + 2;
1173     }
1174 nigel 77
1175 nigel 93 /* Handle a recursively called group. Restore the offsets
1176     appropriately and continue from after the call. */
1177 nigel 77
1178 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1179     {
1180     recursion_info *rec = md->recursive;
1181     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1182     md->recursive = rec->prevrec;
1183     md->start_match = rec->save_start;
1184     memcpy(md->offset_vector, rec->offset_save,
1185     rec->saved_max * sizeof(int));
1186     ecode = rec->after_call;
1187     ims = original_ims;
1188     break;
1189 nigel 77 }
1190 nigel 91 }
1191 nigel 77
1192 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1193     flags, in case they got changed during the group. */
1194 nigel 77
1195 nigel 91 ims = original_ims;
1196     DPRINTF(("ims reset to %02lx\n", ims));
1197 nigel 77
1198 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1199     happens for a repeating ket if no characters were matched in the group.
1200     This is the forcible breaking of infinite loops as implemented in Perl
1201     5.005. If there is an options reset, it will get obeyed in the normal
1202     course of events. */
1203 nigel 77
1204 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1205     {
1206     ecode += 1 + LINK_SIZE;
1207     break;
1208     }
1209 nigel 77
1210 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1211     preceding bracket, in the appropriate order. In the second case, we can use
1212     tail recursion to avoid using another stack frame. */
1213 nigel 77
1214 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1215    
1216 nigel 91 if (*ecode == OP_KETRMIN)
1217     {
1218 ph10 164 RMATCH(eptr, ecode + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0,
1219     RM12);
1220 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1221     ecode = prev;
1222 nigel 93 flags |= match_tail_recursed;
1223 nigel 91 goto TAIL_RECURSE;
1224 nigel 77 }
1225 nigel 91 else /* OP_KETRMAX */
1226     {
1227 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1228 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1229     ecode += 1 + LINK_SIZE;
1230 nigel 93 flags = match_tail_recursed;
1231 nigel 91 goto TAIL_RECURSE;
1232     }
1233     /* Control never gets here */
1234 nigel 77
1235     /* Start of subject unless notbol, or after internal newline if multiline */
1236    
1237     case OP_CIRC:
1238     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1239     if ((ims & PCRE_MULTILINE) != 0)
1240     {
1241 nigel 91 if (eptr != md->start_subject &&
1242 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1243 nigel 77 RRETURN(MATCH_NOMATCH);
1244     ecode++;
1245     break;
1246     }
1247     /* ... else fall through */
1248    
1249     /* Start of subject assertion */
1250    
1251     case OP_SOD:
1252     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1253     ecode++;
1254     break;
1255    
1256     /* Start of match assertion */
1257    
1258     case OP_SOM:
1259     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1260     ecode++;
1261     break;
1262    
1263     /* Assert before internal newline if multiline, or before a terminating
1264     newline unless endonly is set, else end of subject unless noteol is set. */
1265    
1266     case OP_DOLL:
1267     if ((ims & PCRE_MULTILINE) != 0)
1268     {
1269     if (eptr < md->end_subject)
1270 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1271 nigel 77 else
1272     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1273     ecode++;
1274     break;
1275     }
1276     else
1277     {
1278     if (md->noteol) RRETURN(MATCH_NOMATCH);
1279     if (!md->endonly)
1280     {
1281 nigel 91 if (eptr != md->end_subject &&
1282 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1283 nigel 77 RRETURN(MATCH_NOMATCH);
1284     ecode++;
1285     break;
1286     }
1287     }
1288 nigel 91 /* ... else fall through for endonly */
1289 nigel 77
1290     /* End of subject assertion (\z) */
1291    
1292     case OP_EOD:
1293     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1294     ecode++;
1295     break;
1296    
1297     /* End of subject or ending \n assertion (\Z) */
1298    
1299     case OP_EODN:
1300 nigel 91 if (eptr != md->end_subject &&
1301 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1302 nigel 91 RRETURN(MATCH_NOMATCH);
1303 nigel 77 ecode++;
1304     break;
1305    
1306     /* Word boundary assertions */
1307    
1308     case OP_NOT_WORD_BOUNDARY:
1309     case OP_WORD_BOUNDARY:
1310     {
1311    
1312     /* Find out if the previous and current characters are "word" characters.
1313     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1314     be "non-word" characters. */
1315    
1316     #ifdef SUPPORT_UTF8
1317     if (utf8)
1318     {
1319     if (eptr == md->start_subject) prev_is_word = FALSE; else
1320     {
1321     const uschar *lastptr = eptr - 1;
1322     while((*lastptr & 0xc0) == 0x80) lastptr--;
1323     GETCHAR(c, lastptr);
1324     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1325     }
1326     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1327     {
1328     GETCHAR(c, eptr);
1329     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1330     }
1331     }
1332     else
1333     #endif
1334    
1335     /* More streamlined when not in UTF-8 mode */
1336    
1337     {
1338     prev_is_word = (eptr != md->start_subject) &&
1339     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1340     cur_is_word = (eptr < md->end_subject) &&
1341     ((md->ctypes[*eptr] & ctype_word) != 0);
1342     }
1343    
1344     /* Now see if the situation is what we want */
1345    
1346     if ((*ecode++ == OP_WORD_BOUNDARY)?
1347     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1348     RRETURN(MATCH_NOMATCH);
1349     }
1350     break;
1351    
1352     /* Match a single character type; inline for speed */
1353    
1354     case OP_ANY:
1355 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1356     {
1357 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1358 nigel 91 }
1359 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1360     if (utf8)
1361     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1362     ecode++;
1363     break;
1364    
1365     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1366     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1367    
1368     case OP_ANYBYTE:
1369     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1370     ecode++;
1371     break;
1372    
1373     case OP_NOT_DIGIT:
1374     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1375     GETCHARINCTEST(c, eptr);
1376     if (
1377     #ifdef SUPPORT_UTF8
1378     c < 256 &&
1379     #endif
1380     (md->ctypes[c] & ctype_digit) != 0
1381     )
1382     RRETURN(MATCH_NOMATCH);
1383     ecode++;
1384     break;
1385    
1386     case OP_DIGIT:
1387     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1388     GETCHARINCTEST(c, eptr);
1389     if (
1390     #ifdef SUPPORT_UTF8
1391     c >= 256 ||
1392     #endif
1393     (md->ctypes[c] & ctype_digit) == 0
1394     )
1395     RRETURN(MATCH_NOMATCH);
1396     ecode++;
1397     break;
1398    
1399     case OP_NOT_WHITESPACE:
1400     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1401     GETCHARINCTEST(c, eptr);
1402     if (
1403     #ifdef SUPPORT_UTF8
1404     c < 256 &&
1405     #endif
1406     (md->ctypes[c] & ctype_space) != 0
1407     )
1408     RRETURN(MATCH_NOMATCH);
1409     ecode++;
1410     break;
1411    
1412     case OP_WHITESPACE:
1413     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1414     GETCHARINCTEST(c, eptr);
1415     if (
1416     #ifdef SUPPORT_UTF8
1417     c >= 256 ||
1418     #endif
1419     (md->ctypes[c] & ctype_space) == 0
1420     )
1421     RRETURN(MATCH_NOMATCH);
1422     ecode++;
1423     break;
1424    
1425     case OP_NOT_WORDCHAR:
1426     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1427     GETCHARINCTEST(c, eptr);
1428     if (
1429     #ifdef SUPPORT_UTF8
1430     c < 256 &&
1431     #endif
1432     (md->ctypes[c] & ctype_word) != 0
1433     )
1434     RRETURN(MATCH_NOMATCH);
1435     ecode++;
1436     break;
1437    
1438     case OP_WORDCHAR:
1439     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1440     GETCHARINCTEST(c, eptr);
1441     if (
1442     #ifdef SUPPORT_UTF8
1443     c >= 256 ||
1444     #endif
1445     (md->ctypes[c] & ctype_word) == 0
1446     )
1447     RRETURN(MATCH_NOMATCH);
1448     ecode++;
1449     break;
1450    
1451 nigel 93 case OP_ANYNL:
1452     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1453     GETCHARINCTEST(c, eptr);
1454     switch(c)
1455     {
1456     default: RRETURN(MATCH_NOMATCH);
1457     case 0x000d:
1458     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1459     break;
1460     case 0x000a:
1461     case 0x000b:
1462     case 0x000c:
1463     case 0x0085:
1464     case 0x2028:
1465     case 0x2029:
1466     break;
1467     }
1468     ecode++;
1469     break;
1470    
1471 nigel 77 #ifdef SUPPORT_UCP
1472     /* Check the next character by Unicode property. We will get here only
1473     if the support is in the binary; otherwise a compile-time error occurs. */
1474    
1475     case OP_PROP:
1476     case OP_NOTPROP:
1477     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1478     GETCHARINCTEST(c, eptr);
1479     {
1480 nigel 87 int chartype, script;
1481     int category = _pcre_ucp_findprop(c, &chartype, &script);
1482 nigel 77
1483 nigel 87 switch(ecode[1])
1484     {
1485     case PT_ANY:
1486     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1487     break;
1488 nigel 77
1489 nigel 87 case PT_LAMP:
1490     if ((chartype == ucp_Lu ||
1491     chartype == ucp_Ll ||
1492     chartype == ucp_Lt) == (op == OP_NOTPROP))
1493 nigel 77 RRETURN(MATCH_NOMATCH);
1494 nigel 87 break;
1495    
1496     case PT_GC:
1497     if ((ecode[2] != category) == (op == OP_PROP))
1498 nigel 77 RRETURN(MATCH_NOMATCH);
1499 nigel 87 break;
1500    
1501     case PT_PC:
1502     if ((ecode[2] != chartype) == (op == OP_PROP))
1503     RRETURN(MATCH_NOMATCH);
1504     break;
1505    
1506     case PT_SC:
1507     if ((ecode[2] != script) == (op == OP_PROP))
1508     RRETURN(MATCH_NOMATCH);
1509     break;
1510    
1511     default:
1512     RRETURN(PCRE_ERROR_INTERNAL);
1513 nigel 77 }
1514 nigel 87
1515     ecode += 3;
1516 nigel 77 }
1517     break;
1518    
1519     /* Match an extended Unicode sequence. We will get here only if the support
1520     is in the binary; otherwise a compile-time error occurs. */
1521    
1522     case OP_EXTUNI:
1523     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1524     GETCHARINCTEST(c, eptr);
1525     {
1526 nigel 87 int chartype, script;
1527     int category = _pcre_ucp_findprop(c, &chartype, &script);
1528 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1529     while (eptr < md->end_subject)
1530     {
1531     int len = 1;
1532     if (!utf8) c = *eptr; else
1533     {
1534     GETCHARLEN(c, eptr, len);
1535     }
1536 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1537 nigel 77 if (category != ucp_M) break;
1538     eptr += len;
1539     }
1540     }
1541     ecode++;
1542     break;
1543     #endif
1544    
1545    
1546     /* Match a back reference, possibly repeatedly. Look past the end of the
1547     item to see if there is repeat information following. The code is similar
1548     to that for character classes, but repeated for efficiency. Then obey
1549     similar code to character type repeats - written out again for speed.
1550     However, if the referenced string is the empty string, always treat
1551     it as matched, any number of times (otherwise there could be infinite
1552     loops). */
1553    
1554     case OP_REF:
1555     {
1556     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1557     ecode += 3; /* Advance past item */
1558    
1559     /* If the reference is unset, set the length to be longer than the amount
1560     of subject left; this ensures that every attempt at a match fails. We
1561     can't just fail here, because of the possibility of quantifiers with zero
1562     minima. */
1563    
1564     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1565     md->end_subject - eptr + 1 :
1566     md->offset_vector[offset+1] - md->offset_vector[offset];
1567    
1568     /* Set up for repetition, or handle the non-repeated case */
1569    
1570     switch (*ecode)
1571     {
1572     case OP_CRSTAR:
1573     case OP_CRMINSTAR:
1574     case OP_CRPLUS:
1575     case OP_CRMINPLUS:
1576     case OP_CRQUERY:
1577     case OP_CRMINQUERY:
1578     c = *ecode++ - OP_CRSTAR;
1579     minimize = (c & 1) != 0;
1580     min = rep_min[c]; /* Pick up values from tables; */
1581     max = rep_max[c]; /* zero for max => infinity */
1582     if (max == 0) max = INT_MAX;
1583     break;
1584    
1585     case OP_CRRANGE:
1586     case OP_CRMINRANGE:
1587     minimize = (*ecode == OP_CRMINRANGE);
1588     min = GET2(ecode, 1);
1589     max = GET2(ecode, 3);
1590     if (max == 0) max = INT_MAX;
1591     ecode += 5;
1592     break;
1593    
1594     default: /* No repeat follows */
1595     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1596     eptr += length;
1597     continue; /* With the main loop */
1598     }
1599    
1600     /* If the length of the reference is zero, just continue with the
1601     main loop. */
1602    
1603     if (length == 0) continue;
1604    
1605     /* First, ensure the minimum number of matches are present. We get back
1606     the length of the reference string explicitly rather than passing the
1607     address of eptr, so that eptr can be a register variable. */
1608    
1609     for (i = 1; i <= min; i++)
1610     {
1611     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1612     eptr += length;
1613     }
1614    
1615     /* If min = max, continue at the same level without recursion.
1616     They are not both allowed to be zero. */
1617    
1618     if (min == max) continue;
1619    
1620     /* If minimizing, keep trying and advancing the pointer */
1621    
1622     if (minimize)
1623     {
1624     for (fi = min;; fi++)
1625     {
1626 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1627 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1628     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1629     RRETURN(MATCH_NOMATCH);
1630     eptr += length;
1631     }
1632     /* Control never gets here */
1633     }
1634    
1635     /* If maximizing, find the longest string and work backwards */
1636    
1637     else
1638     {
1639     pp = eptr;
1640     for (i = min; i < max; i++)
1641     {
1642     if (!match_ref(offset, eptr, length, md, ims)) break;
1643     eptr += length;
1644     }
1645     while (eptr >= pp)
1646     {
1647 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1648 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1649     eptr -= length;
1650     }
1651     RRETURN(MATCH_NOMATCH);
1652     }
1653     }
1654     /* Control never gets here */
1655    
1656    
1657    
1658     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1659     used when all the characters in the class have values in the range 0-255,
1660     and either the matching is caseful, or the characters are in the range
1661     0-127 when UTF-8 processing is enabled. The only difference between
1662     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1663     encountered.
1664    
1665     First, look past the end of the item to see if there is repeat information
1666     following. Then obey similar code to character type repeats - written out
1667     again for speed. */
1668    
1669     case OP_NCLASS:
1670     case OP_CLASS:
1671     {
1672     data = ecode + 1; /* Save for matching */
1673     ecode += 33; /* Advance past the item */
1674    
1675     switch (*ecode)
1676     {
1677     case OP_CRSTAR:
1678     case OP_CRMINSTAR:
1679     case OP_CRPLUS:
1680     case OP_CRMINPLUS:
1681     case OP_CRQUERY:
1682     case OP_CRMINQUERY:
1683     c = *ecode++ - OP_CRSTAR;
1684     minimize = (c & 1) != 0;
1685     min = rep_min[c]; /* Pick up values from tables; */
1686     max = rep_max[c]; /* zero for max => infinity */
1687     if (max == 0) max = INT_MAX;
1688     break;
1689    
1690     case OP_CRRANGE:
1691     case OP_CRMINRANGE:
1692     minimize = (*ecode == OP_CRMINRANGE);
1693     min = GET2(ecode, 1);
1694     max = GET2(ecode, 3);
1695     if (max == 0) max = INT_MAX;
1696     ecode += 5;
1697     break;
1698    
1699     default: /* No repeat follows */
1700     min = max = 1;
1701     break;
1702     }
1703    
1704     /* First, ensure the minimum number of matches are present. */
1705    
1706     #ifdef SUPPORT_UTF8
1707     /* UTF-8 mode */
1708     if (utf8)
1709     {
1710     for (i = 1; i <= min; i++)
1711     {
1712     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1713     GETCHARINC(c, eptr);
1714     if (c > 255)
1715     {
1716     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1717     }
1718     else
1719     {
1720     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1721     }
1722     }
1723     }
1724     else
1725     #endif
1726     /* Not UTF-8 mode */
1727     {
1728     for (i = 1; i <= min; i++)
1729     {
1730     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1731     c = *eptr++;
1732     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1733     }
1734     }
1735    
1736     /* If max == min we can continue with the main loop without the
1737     need to recurse. */
1738    
1739     if (min == max) continue;
1740    
1741     /* If minimizing, keep testing the rest of the expression and advancing
1742     the pointer while it matches the class. */
1743    
1744     if (minimize)
1745     {
1746     #ifdef SUPPORT_UTF8
1747     /* UTF-8 mode */
1748     if (utf8)
1749     {
1750     for (fi = min;; fi++)
1751     {
1752 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1753 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1754     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1755     GETCHARINC(c, eptr);
1756     if (c > 255)
1757     {
1758     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1759     }
1760     else
1761     {
1762     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1763     }
1764     }
1765     }
1766     else
1767     #endif
1768     /* Not UTF-8 mode */
1769     {
1770     for (fi = min;; fi++)
1771     {
1772 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1773 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1774     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1775     c = *eptr++;
1776     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1777     }
1778     }
1779     /* Control never gets here */
1780     }
1781    
1782     /* If maximizing, find the longest possible run, then work backwards. */
1783    
1784     else
1785     {
1786     pp = eptr;
1787    
1788     #ifdef SUPPORT_UTF8
1789     /* UTF-8 mode */
1790     if (utf8)
1791     {
1792     for (i = min; i < max; i++)
1793     {
1794     int len = 1;
1795     if (eptr >= md->end_subject) break;
1796     GETCHARLEN(c, eptr, len);
1797     if (c > 255)
1798     {
1799     if (op == OP_CLASS) break;
1800     }
1801     else
1802     {
1803     if ((data[c/8] & (1 << (c&7))) == 0) break;
1804     }
1805     eptr += len;
1806     }
1807     for (;;)
1808     {
1809 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1810 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1811     if (eptr-- == pp) break; /* Stop if tried at original pos */
1812     BACKCHAR(eptr);
1813     }
1814     }
1815     else
1816     #endif
1817     /* Not UTF-8 mode */
1818     {
1819     for (i = min; i < max; i++)
1820     {
1821     if (eptr >= md->end_subject) break;
1822     c = *eptr;
1823     if ((data[c/8] & (1 << (c&7))) == 0) break;
1824     eptr++;
1825     }
1826     while (eptr >= pp)
1827     {
1828 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1829 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1830 nigel 77 eptr--;
1831     }
1832     }
1833    
1834     RRETURN(MATCH_NOMATCH);
1835     }
1836     }
1837     /* Control never gets here */
1838    
1839    
1840     /* Match an extended character class. This opcode is encountered only
1841     in UTF-8 mode, because that's the only time it is compiled. */
1842    
1843     #ifdef SUPPORT_UTF8
1844     case OP_XCLASS:
1845     {
1846     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1847     ecode += GET(ecode, 1); /* Advance past the item */
1848    
1849     switch (*ecode)
1850     {
1851     case OP_CRSTAR:
1852     case OP_CRMINSTAR:
1853     case OP_CRPLUS:
1854     case OP_CRMINPLUS:
1855     case OP_CRQUERY:
1856     case OP_CRMINQUERY:
1857     c = *ecode++ - OP_CRSTAR;
1858     minimize = (c & 1) != 0;
1859     min = rep_min[c]; /* Pick up values from tables; */
1860     max = rep_max[c]; /* zero for max => infinity */
1861     if (max == 0) max = INT_MAX;
1862     break;
1863    
1864     case OP_CRRANGE:
1865     case OP_CRMINRANGE:
1866     minimize = (*ecode == OP_CRMINRANGE);
1867     min = GET2(ecode, 1);
1868     max = GET2(ecode, 3);
1869     if (max == 0) max = INT_MAX;
1870     ecode += 5;
1871     break;
1872    
1873     default: /* No repeat follows */
1874     min = max = 1;
1875     break;
1876     }
1877    
1878     /* First, ensure the minimum number of matches are present. */
1879    
1880     for (i = 1; i <= min; i++)
1881     {
1882     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1883     GETCHARINC(c, eptr);
1884     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1885     }
1886    
1887     /* If max == min we can continue with the main loop without the
1888     need to recurse. */
1889    
1890     if (min == max) continue;
1891    
1892     /* If minimizing, keep testing the rest of the expression and advancing
1893     the pointer while it matches the class. */
1894    
1895     if (minimize)
1896     {
1897     for (fi = min;; fi++)
1898     {
1899 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
1900 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1901     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1902     GETCHARINC(c, eptr);
1903     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
1904     }
1905     /* Control never gets here */
1906     }
1907    
1908     /* If maximizing, find the longest possible run, then work backwards. */
1909    
1910     else
1911     {
1912     pp = eptr;
1913     for (i = min; i < max; i++)
1914     {
1915     int len = 1;
1916     if (eptr >= md->end_subject) break;
1917     GETCHARLEN(c, eptr, len);
1918     if (!_pcre_xclass(c, data)) break;
1919     eptr += len;
1920     }
1921     for(;;)
1922     {
1923 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
1924 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1925     if (eptr-- == pp) break; /* Stop if tried at original pos */
1926     BACKCHAR(eptr)
1927     }
1928     RRETURN(MATCH_NOMATCH);
1929     }
1930    
1931     /* Control never gets here */
1932     }
1933     #endif /* End of XCLASS */
1934    
1935     /* Match a single character, casefully */
1936    
1937     case OP_CHAR:
1938     #ifdef SUPPORT_UTF8
1939     if (utf8)
1940     {
1941     length = 1;
1942     ecode++;
1943     GETCHARLEN(fc, ecode, length);
1944     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1945     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1946     }
1947     else
1948     #endif
1949    
1950     /* Non-UTF-8 mode */
1951     {
1952     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
1953     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1954     ecode += 2;
1955     }
1956     break;
1957    
1958     /* Match a single character, caselessly */
1959    
1960     case OP_CHARNC:
1961     #ifdef SUPPORT_UTF8
1962     if (utf8)
1963     {
1964     length = 1;
1965     ecode++;
1966     GETCHARLEN(fc, ecode, length);
1967    
1968     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
1969    
1970     /* If the pattern character's value is < 128, we have only one byte, and
1971     can use the fast lookup table. */
1972    
1973     if (fc < 128)
1974     {
1975     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
1976     }
1977    
1978     /* Otherwise we must pick up the subject character */
1979    
1980     else
1981     {
1982 nigel 93 unsigned int dc;
1983 nigel 77 GETCHARINC(dc, eptr);
1984     ecode += length;
1985    
1986     /* If we have Unicode property support, we can use it to test the other
1987 nigel 87 case of the character, if there is one. */
1988 nigel 77
1989     if (fc != dc)
1990     {
1991     #ifdef SUPPORT_UCP
1992 nigel 87 if (dc != _pcre_ucp_othercase(fc))
1993 nigel 77 #endif
1994     RRETURN(MATCH_NOMATCH);
1995     }
1996     }
1997     }
1998     else
1999     #endif /* SUPPORT_UTF8 */
2000    
2001     /* Non-UTF-8 mode */
2002     {
2003     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2004     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2005     ecode += 2;
2006     }
2007     break;
2008    
2009 nigel 93 /* Match a single character repeatedly. */
2010 nigel 77
2011     case OP_EXACT:
2012     min = max = GET2(ecode, 1);
2013     ecode += 3;
2014     goto REPEATCHAR;
2015    
2016 nigel 93 case OP_POSUPTO:
2017     possessive = TRUE;
2018     /* Fall through */
2019    
2020 nigel 77 case OP_UPTO:
2021     case OP_MINUPTO:
2022     min = 0;
2023     max = GET2(ecode, 1);
2024     minimize = *ecode == OP_MINUPTO;
2025     ecode += 3;
2026     goto REPEATCHAR;
2027    
2028 nigel 93 case OP_POSSTAR:
2029     possessive = TRUE;
2030     min = 0;
2031     max = INT_MAX;
2032     ecode++;
2033     goto REPEATCHAR;
2034    
2035     case OP_POSPLUS:
2036     possessive = TRUE;
2037     min = 1;
2038     max = INT_MAX;
2039     ecode++;
2040     goto REPEATCHAR;
2041    
2042     case OP_POSQUERY:
2043     possessive = TRUE;
2044     min = 0;
2045     max = 1;
2046     ecode++;
2047     goto REPEATCHAR;
2048    
2049 nigel 77 case OP_STAR:
2050     case OP_MINSTAR:
2051     case OP_PLUS:
2052     case OP_MINPLUS:
2053     case OP_QUERY:
2054     case OP_MINQUERY:
2055     c = *ecode++ - OP_STAR;
2056     minimize = (c & 1) != 0;
2057     min = rep_min[c]; /* Pick up values from tables; */
2058     max = rep_max[c]; /* zero for max => infinity */
2059     if (max == 0) max = INT_MAX;
2060    
2061     /* Common code for all repeated single-character matches. We can give
2062     up quickly if there are fewer than the minimum number of characters left in
2063     the subject. */
2064    
2065     REPEATCHAR:
2066     #ifdef SUPPORT_UTF8
2067     if (utf8)
2068     {
2069     length = 1;
2070     charptr = ecode;
2071     GETCHARLEN(fc, ecode, length);
2072     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2073     ecode += length;
2074    
2075     /* Handle multibyte character matching specially here. There is
2076     support for caseless matching if UCP support is present. */
2077    
2078     if (length > 1)
2079     {
2080     #ifdef SUPPORT_UCP
2081 nigel 93 unsigned int othercase;
2082 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2083 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2084 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2085 ph10 115 else oclength = 0;
2086 nigel 77 #endif /* SUPPORT_UCP */
2087    
2088     for (i = 1; i <= min; i++)
2089     {
2090     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2091 ph10 123 #ifdef SUPPORT_UCP
2092 nigel 77 /* Need braces because of following else */
2093     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2094     else
2095     {
2096     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2097     eptr += oclength;
2098     }
2099 ph10 115 #else /* without SUPPORT_UCP */
2100     else { RRETURN(MATCH_NOMATCH); }
2101 ph10 123 #endif /* SUPPORT_UCP */
2102 nigel 77 }
2103    
2104     if (min == max) continue;
2105    
2106     if (minimize)
2107     {
2108     for (fi = min;; fi++)
2109     {
2110 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2111 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2112     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2113     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2114 ph10 123 #ifdef SUPPORT_UCP
2115 nigel 77 /* Need braces because of following else */
2116     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2117     else
2118     {
2119     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2120     eptr += oclength;
2121     }
2122 ph10 115 #else /* without SUPPORT_UCP */
2123     else { RRETURN (MATCH_NOMATCH); }
2124     #endif /* SUPPORT_UCP */
2125 nigel 77 }
2126     /* Control never gets here */
2127     }
2128 nigel 93
2129     else /* Maximize */
2130 nigel 77 {
2131     pp = eptr;
2132     for (i = min; i < max; i++)
2133     {
2134     if (eptr > md->end_subject - length) break;
2135     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2136 ph10 123 #ifdef SUPPORT_UCP
2137 nigel 77 else if (oclength == 0) break;
2138     else
2139     {
2140     if (memcmp(eptr, occhars, oclength) != 0) break;
2141     eptr += oclength;
2142     }
2143 ph10 115 #else /* without SUPPORT_UCP */
2144     else break;
2145 ph10 123 #endif /* SUPPORT_UCP */
2146 nigel 77 }
2147 nigel 93
2148     if (possessive) continue;
2149 ph10 120 for(;;)
2150 nigel 77 {
2151 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2152 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2153 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2154 ph10 115 #ifdef SUPPORT_UCP
2155     eptr--;
2156     BACKCHAR(eptr);
2157 ph10 123 #else /* without SUPPORT_UCP */
2158 nigel 77 eptr -= length;
2159 ph10 123 #endif /* SUPPORT_UCP */
2160 nigel 77 }
2161     }
2162     /* Control never gets here */
2163     }
2164    
2165     /* If the length of a UTF-8 character is 1, we fall through here, and
2166     obey the code as for non-UTF-8 characters below, though in this case the
2167     value of fc will always be < 128. */
2168     }
2169     else
2170     #endif /* SUPPORT_UTF8 */
2171    
2172     /* When not in UTF-8 mode, load a single-byte character. */
2173     {
2174     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2175     fc = *ecode++;
2176     }
2177    
2178     /* The value of fc at this point is always less than 256, though we may or
2179     may not be in UTF-8 mode. The code is duplicated for the caseless and
2180     caseful cases, for speed, since matching characters is likely to be quite
2181     common. First, ensure the minimum number of matches are present. If min =
2182     max, continue at the same level without recursing. Otherwise, if
2183     minimizing, keep trying the rest of the expression and advancing one
2184     matching character if failing, up to the maximum. Alternatively, if
2185     maximizing, find the maximum number of characters and work backwards. */
2186    
2187     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2188     max, eptr));
2189    
2190     if ((ims & PCRE_CASELESS) != 0)
2191     {
2192     fc = md->lcc[fc];
2193     for (i = 1; i <= min; i++)
2194     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2195     if (min == max) continue;
2196     if (minimize)
2197     {
2198     for (fi = min;; fi++)
2199     {
2200 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2201 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2202     if (fi >= max || eptr >= md->end_subject ||
2203     fc != md->lcc[*eptr++])
2204     RRETURN(MATCH_NOMATCH);
2205     }
2206     /* Control never gets here */
2207     }
2208 nigel 93 else /* Maximize */
2209 nigel 77 {
2210     pp = eptr;
2211     for (i = min; i < max; i++)
2212     {
2213     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2214     eptr++;
2215     }
2216 nigel 93 if (possessive) continue;
2217 nigel 77 while (eptr >= pp)
2218     {
2219 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2220 nigel 77 eptr--;
2221     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2222     }
2223     RRETURN(MATCH_NOMATCH);
2224     }
2225     /* Control never gets here */
2226     }
2227    
2228     /* Caseful comparisons (includes all multi-byte characters) */
2229    
2230     else
2231     {
2232     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2233     if (min == max) continue;
2234     if (minimize)
2235     {
2236     for (fi = min;; fi++)
2237     {
2238 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2239 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2240     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2241     RRETURN(MATCH_NOMATCH);
2242     }
2243     /* Control never gets here */
2244     }
2245 nigel 93 else /* Maximize */
2246 nigel 77 {
2247     pp = eptr;
2248     for (i = min; i < max; i++)
2249     {
2250     if (eptr >= md->end_subject || fc != *eptr) break;
2251     eptr++;
2252     }
2253 nigel 93 if (possessive) continue;
2254 nigel 77 while (eptr >= pp)
2255     {
2256 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2257 nigel 77 eptr--;
2258     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2259     }
2260     RRETURN(MATCH_NOMATCH);
2261     }
2262     }
2263     /* Control never gets here */
2264    
2265     /* Match a negated single one-byte character. The character we are
2266     checking can be multibyte. */
2267    
2268     case OP_NOT:
2269     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2270     ecode++;
2271     GETCHARINCTEST(c, eptr);
2272     if ((ims & PCRE_CASELESS) != 0)
2273     {
2274     #ifdef SUPPORT_UTF8
2275     if (c < 256)
2276     #endif
2277     c = md->lcc[c];
2278     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2279     }
2280     else
2281     {
2282     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2283     }
2284     break;
2285    
2286     /* Match a negated single one-byte character repeatedly. This is almost a
2287     repeat of the code for a repeated single character, but I haven't found a
2288     nice way of commoning these up that doesn't require a test of the
2289     positive/negative option for each character match. Maybe that wouldn't add
2290     very much to the time taken, but character matching *is* what this is all
2291     about... */
2292    
2293     case OP_NOTEXACT:
2294     min = max = GET2(ecode, 1);
2295     ecode += 3;
2296     goto REPEATNOTCHAR;
2297    
2298     case OP_NOTUPTO:
2299     case OP_NOTMINUPTO:
2300     min = 0;
2301     max = GET2(ecode, 1);
2302     minimize = *ecode == OP_NOTMINUPTO;
2303     ecode += 3;
2304     goto REPEATNOTCHAR;
2305    
2306 nigel 93 case OP_NOTPOSSTAR:
2307     possessive = TRUE;
2308     min = 0;
2309     max = INT_MAX;
2310     ecode++;
2311     goto REPEATNOTCHAR;
2312    
2313     case OP_NOTPOSPLUS:
2314     possessive = TRUE;
2315     min = 1;
2316     max = INT_MAX;
2317     ecode++;
2318     goto REPEATNOTCHAR;
2319    
2320     case OP_NOTPOSQUERY:
2321     possessive = TRUE;
2322     min = 0;
2323     max = 1;
2324     ecode++;
2325     goto REPEATNOTCHAR;
2326    
2327     case OP_NOTPOSUPTO:
2328     possessive = TRUE;
2329     min = 0;
2330     max = GET2(ecode, 1);
2331     ecode += 3;
2332     goto REPEATNOTCHAR;
2333    
2334 nigel 77 case OP_NOTSTAR:
2335     case OP_NOTMINSTAR:
2336     case OP_NOTPLUS:
2337     case OP_NOTMINPLUS:
2338     case OP_NOTQUERY:
2339     case OP_NOTMINQUERY:
2340     c = *ecode++ - OP_NOTSTAR;
2341     minimize = (c & 1) != 0;
2342     min = rep_min[c]; /* Pick up values from tables; */
2343     max = rep_max[c]; /* zero for max => infinity */
2344     if (max == 0) max = INT_MAX;
2345    
2346     /* Common code for all repeated single-byte matches. We can give up quickly
2347     if there are fewer than the minimum number of bytes left in the
2348     subject. */
2349    
2350     REPEATNOTCHAR:
2351     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2352     fc = *ecode++;
2353    
2354     /* The code is duplicated for the caseless and caseful cases, for speed,
2355     since matching characters is likely to be quite common. First, ensure the
2356     minimum number of matches are present. If min = max, continue at the same
2357     level without recursing. Otherwise, if minimizing, keep trying the rest of
2358     the expression and advancing one matching character if failing, up to the
2359     maximum. Alternatively, if maximizing, find the maximum number of
2360     characters and work backwards. */
2361    
2362     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2363     max, eptr));
2364    
2365     if ((ims & PCRE_CASELESS) != 0)
2366     {
2367     fc = md->lcc[fc];
2368    
2369     #ifdef SUPPORT_UTF8
2370     /* UTF-8 mode */
2371     if (utf8)
2372     {
2373 nigel 93 register unsigned int d;
2374 nigel 77 for (i = 1; i <= min; i++)
2375     {
2376     GETCHARINC(d, eptr);
2377     if (d < 256) d = md->lcc[d];
2378     if (fc == d) RRETURN(MATCH_NOMATCH);
2379     }
2380     }
2381     else
2382     #endif
2383    
2384     /* Not UTF-8 mode */
2385     {
2386     for (i = 1; i <= min; i++)
2387     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2388     }
2389    
2390     if (min == max) continue;
2391    
2392     if (minimize)
2393     {
2394     #ifdef SUPPORT_UTF8
2395     /* UTF-8 mode */
2396     if (utf8)
2397     {
2398 nigel 93 register unsigned int d;
2399 nigel 77 for (fi = min;; fi++)
2400     {
2401 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2402 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2403     GETCHARINC(d, eptr);
2404     if (d < 256) d = md->lcc[d];
2405     if (fi >= max || eptr >= md->end_subject || fc == d)
2406     RRETURN(MATCH_NOMATCH);
2407     }
2408     }
2409     else
2410     #endif
2411     /* Not UTF-8 mode */
2412     {
2413     for (fi = min;; fi++)
2414     {
2415 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2416 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2417     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2418     RRETURN(MATCH_NOMATCH);
2419     }
2420     }
2421     /* Control never gets here */
2422     }
2423    
2424     /* Maximize case */
2425    
2426     else
2427     {
2428     pp = eptr;
2429    
2430     #ifdef SUPPORT_UTF8
2431     /* UTF-8 mode */
2432     if (utf8)
2433     {
2434 nigel 93 register unsigned int d;
2435 nigel 77 for (i = min; i < max; i++)
2436     {
2437     int len = 1;
2438     if (eptr >= md->end_subject) break;
2439     GETCHARLEN(d, eptr, len);
2440     if (d < 256) d = md->lcc[d];
2441     if (fc == d) break;
2442     eptr += len;
2443     }
2444 nigel 93 if (possessive) continue;
2445     for(;;)
2446 nigel 77 {
2447 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2448 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2449     if (eptr-- == pp) break; /* Stop if tried at original pos */
2450     BACKCHAR(eptr);
2451     }
2452     }
2453     else
2454     #endif
2455     /* Not UTF-8 mode */
2456     {
2457     for (i = min; i < max; i++)
2458     {
2459     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2460     eptr++;
2461     }
2462 nigel 93 if (possessive) continue;
2463 nigel 77 while (eptr >= pp)
2464     {
2465 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2466 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2467     eptr--;
2468     }
2469     }
2470    
2471     RRETURN(MATCH_NOMATCH);
2472     }
2473     /* Control never gets here */
2474     }
2475    
2476     /* Caseful comparisons */
2477    
2478     else
2479     {
2480     #ifdef SUPPORT_UTF8
2481     /* UTF-8 mode */
2482     if (utf8)
2483     {
2484 nigel 93 register unsigned int d;
2485 nigel 77 for (i = 1; i <= min; i++)
2486     {
2487     GETCHARINC(d, eptr);
2488     if (fc == d) RRETURN(MATCH_NOMATCH);
2489     }
2490     }
2491     else
2492     #endif
2493     /* Not UTF-8 mode */
2494     {
2495     for (i = 1; i <= min; i++)
2496     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2497     }
2498    
2499     if (min == max) continue;
2500    
2501     if (minimize)
2502     {
2503     #ifdef SUPPORT_UTF8
2504     /* UTF-8 mode */
2505     if (utf8)
2506     {
2507 nigel 93 register unsigned int d;
2508 nigel 77 for (fi = min;; fi++)
2509     {
2510 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2511 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2512     GETCHARINC(d, eptr);
2513     if (fi >= max || eptr >= md->end_subject || fc == d)
2514     RRETURN(MATCH_NOMATCH);
2515     }
2516     }
2517     else
2518     #endif
2519     /* Not UTF-8 mode */
2520     {
2521     for (fi = min;; fi++)
2522     {
2523 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2524 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2525     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2526     RRETURN(MATCH_NOMATCH);
2527     }
2528     }
2529     /* Control never gets here */
2530     }
2531    
2532     /* Maximize case */
2533    
2534     else
2535     {
2536     pp = eptr;
2537    
2538     #ifdef SUPPORT_UTF8
2539     /* UTF-8 mode */
2540     if (utf8)
2541     {
2542 nigel 93 register unsigned int d;
2543 nigel 77 for (i = min; i < max; i++)
2544     {
2545     int len = 1;
2546     if (eptr >= md->end_subject) break;
2547     GETCHARLEN(d, eptr, len);
2548     if (fc == d) break;
2549     eptr += len;
2550     }
2551 nigel 93 if (possessive) continue;
2552 nigel 77 for(;;)
2553     {
2554 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2555 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2556     if (eptr-- == pp) break; /* Stop if tried at original pos */
2557     BACKCHAR(eptr);
2558     }
2559     }
2560     else
2561     #endif
2562     /* Not UTF-8 mode */
2563     {
2564     for (i = min; i < max; i++)
2565     {
2566     if (eptr >= md->end_subject || fc == *eptr) break;
2567     eptr++;
2568     }
2569 nigel 93 if (possessive) continue;
2570 nigel 77 while (eptr >= pp)
2571     {
2572 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2573 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2574     eptr--;
2575     }
2576     }
2577    
2578     RRETURN(MATCH_NOMATCH);
2579     }
2580     }
2581     /* Control never gets here */
2582    
2583     /* Match a single character type repeatedly; several different opcodes
2584     share code. This is very similar to the code for single characters, but we
2585     repeat it in the interests of efficiency. */
2586    
2587     case OP_TYPEEXACT:
2588     min = max = GET2(ecode, 1);
2589     minimize = TRUE;
2590     ecode += 3;
2591     goto REPEATTYPE;
2592    
2593     case OP_TYPEUPTO:
2594     case OP_TYPEMINUPTO:
2595     min = 0;
2596     max = GET2(ecode, 1);
2597     minimize = *ecode == OP_TYPEMINUPTO;
2598     ecode += 3;
2599     goto REPEATTYPE;
2600    
2601 nigel 93 case OP_TYPEPOSSTAR:
2602     possessive = TRUE;
2603     min = 0;
2604     max = INT_MAX;
2605     ecode++;
2606     goto REPEATTYPE;
2607    
2608     case OP_TYPEPOSPLUS:
2609     possessive = TRUE;
2610     min = 1;
2611     max = INT_MAX;
2612     ecode++;
2613     goto REPEATTYPE;
2614    
2615     case OP_TYPEPOSQUERY:
2616     possessive = TRUE;
2617     min = 0;
2618     max = 1;
2619     ecode++;
2620     goto REPEATTYPE;
2621    
2622     case OP_TYPEPOSUPTO:
2623     possessive = TRUE;
2624     min = 0;
2625     max = GET2(ecode, 1);
2626     ecode += 3;
2627     goto REPEATTYPE;
2628    
2629 nigel 77 case OP_TYPESTAR:
2630     case OP_TYPEMINSTAR:
2631     case OP_TYPEPLUS:
2632     case OP_TYPEMINPLUS:
2633     case OP_TYPEQUERY:
2634     case OP_TYPEMINQUERY:
2635     c = *ecode++ - OP_TYPESTAR;
2636     minimize = (c & 1) != 0;
2637     min = rep_min[c]; /* Pick up values from tables; */
2638     max = rep_max[c]; /* zero for max => infinity */
2639     if (max == 0) max = INT_MAX;
2640    
2641     /* Common code for all repeated single character type matches. Note that
2642     in UTF-8 mode, '.' matches a character of any length, but for the other
2643     character types, the valid characters are all one-byte long. */
2644    
2645     REPEATTYPE:
2646     ctype = *ecode++; /* Code for the character type */
2647    
2648     #ifdef SUPPORT_UCP
2649     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2650     {
2651     prop_fail_result = ctype == OP_NOTPROP;
2652     prop_type = *ecode++;
2653 nigel 87 prop_value = *ecode++;
2654 nigel 77 }
2655     else prop_type = -1;
2656     #endif
2657    
2658     /* First, ensure the minimum number of matches are present. Use inline
2659     code for maximizing the speed, and do the type test once at the start
2660     (i.e. keep it out of the loop). Also we can test that there are at least
2661     the minimum number of bytes before we start. This isn't as effective in
2662     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2663     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2664     and single-bytes. */
2665    
2666     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2667     if (min > 0)
2668     {
2669     #ifdef SUPPORT_UCP
2670 nigel 87 if (prop_type >= 0)
2671 nigel 77 {
2672 nigel 87 switch(prop_type)
2673 nigel 77 {
2674 nigel 87 case PT_ANY:
2675     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2676     for (i = 1; i <= min; i++)
2677     {
2678     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2679     GETCHARINC(c, eptr);
2680     }
2681     break;
2682    
2683     case PT_LAMP:
2684     for (i = 1; i <= min; i++)
2685     {
2686     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2687     GETCHARINC(c, eptr);
2688     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2689     if ((prop_chartype == ucp_Lu ||
2690     prop_chartype == ucp_Ll ||
2691     prop_chartype == ucp_Lt) == prop_fail_result)
2692     RRETURN(MATCH_NOMATCH);
2693     }
2694     break;
2695    
2696     case PT_GC:
2697     for (i = 1; i <= min; i++)
2698     {
2699     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2700     GETCHARINC(c, eptr);
2701     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2702     if ((prop_category == prop_value) == prop_fail_result)
2703     RRETURN(MATCH_NOMATCH);
2704     }
2705     break;
2706    
2707     case PT_PC:
2708     for (i = 1; i <= min; i++)
2709     {
2710     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2711     GETCHARINC(c, eptr);
2712     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2713     if ((prop_chartype == prop_value) == prop_fail_result)
2714     RRETURN(MATCH_NOMATCH);
2715     }
2716     break;
2717    
2718     case PT_SC:
2719     for (i = 1; i <= min; i++)
2720     {
2721     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2722     GETCHARINC(c, eptr);
2723     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2724     if ((prop_script == prop_value) == prop_fail_result)
2725     RRETURN(MATCH_NOMATCH);
2726     }
2727     break;
2728    
2729     default:
2730     RRETURN(PCRE_ERROR_INTERNAL);
2731 nigel 77 }
2732     }
2733    
2734     /* Match extended Unicode sequences. We will get here only if the
2735     support is in the binary; otherwise a compile-time error occurs. */
2736    
2737     else if (ctype == OP_EXTUNI)
2738     {
2739     for (i = 1; i <= min; i++)
2740     {
2741     GETCHARINCTEST(c, eptr);
2742 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2743 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2744     while (eptr < md->end_subject)
2745     {
2746     int len = 1;
2747     if (!utf8) c = *eptr; else
2748     {
2749     GETCHARLEN(c, eptr, len);
2750     }
2751 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2752 nigel 77 if (prop_category != ucp_M) break;
2753     eptr += len;
2754     }
2755     }
2756     }
2757    
2758     else
2759     #endif /* SUPPORT_UCP */
2760    
2761     /* Handle all other cases when the coding is UTF-8 */
2762    
2763     #ifdef SUPPORT_UTF8
2764     if (utf8) switch(ctype)
2765     {
2766     case OP_ANY:
2767     for (i = 1; i <= min; i++)
2768     {
2769     if (eptr >= md->end_subject ||
2770 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2771 nigel 77 RRETURN(MATCH_NOMATCH);
2772 nigel 91 eptr++;
2773 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2774     }
2775     break;
2776    
2777     case OP_ANYBYTE:
2778     eptr += min;
2779     break;
2780    
2781 nigel 93 case OP_ANYNL:
2782     for (i = 1; i <= min; i++)
2783     {
2784     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2785     GETCHARINC(c, eptr);
2786     switch(c)
2787     {
2788     default: RRETURN(MATCH_NOMATCH);
2789     case 0x000d:
2790     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2791     break;
2792     case 0x000a:
2793     case 0x000b:
2794     case 0x000c:
2795     case 0x0085:
2796     case 0x2028:
2797     case 0x2029:
2798     break;
2799     }
2800     }
2801     break;
2802    
2803 nigel 77 case OP_NOT_DIGIT:
2804     for (i = 1; i <= min; i++)
2805     {
2806     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2807     GETCHARINC(c, eptr);
2808     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
2809     RRETURN(MATCH_NOMATCH);
2810     }
2811     break;
2812    
2813     case OP_DIGIT:
2814     for (i = 1; i <= min; i++)
2815     {
2816     if (eptr >= md->end_subject ||
2817     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
2818     RRETURN(MATCH_NOMATCH);
2819     /* No need to skip more bytes - we know it's a 1-byte character */
2820     }
2821     break;
2822    
2823     case OP_NOT_WHITESPACE:
2824     for (i = 1; i <= min; i++)
2825     {
2826     if (eptr >= md->end_subject ||
2827     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
2828     RRETURN(MATCH_NOMATCH);
2829     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2830     }
2831     break;
2832    
2833     case OP_WHITESPACE:
2834     for (i = 1; i <= min; i++)
2835     {
2836     if (eptr >= md->end_subject ||
2837     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
2838     RRETURN(MATCH_NOMATCH);
2839     /* No need to skip more bytes - we know it's a 1-byte character */
2840     }
2841     break;
2842    
2843     case OP_NOT_WORDCHAR:
2844     for (i = 1; i <= min; i++)
2845     {
2846     if (eptr >= md->end_subject ||
2847     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
2848     RRETURN(MATCH_NOMATCH);
2849     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2850     }
2851     break;
2852    
2853     case OP_WORDCHAR:
2854     for (i = 1; i <= min; i++)
2855     {
2856     if (eptr >= md->end_subject ||
2857     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
2858     RRETURN(MATCH_NOMATCH);
2859     /* No need to skip more bytes - we know it's a 1-byte character */
2860     }
2861     break;
2862    
2863     default:
2864     RRETURN(PCRE_ERROR_INTERNAL);
2865     } /* End switch(ctype) */
2866    
2867     else
2868     #endif /* SUPPORT_UTF8 */
2869    
2870     /* Code for the non-UTF-8 case for minimum matching of operators other
2871 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
2872     number of bytes present, as this was tested above. */
2873 nigel 77
2874     switch(ctype)
2875     {
2876     case OP_ANY:
2877     if ((ims & PCRE_DOTALL) == 0)
2878     {
2879     for (i = 1; i <= min; i++)
2880 nigel 91 {
2881 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
2882 nigel 91 eptr++;
2883     }
2884 nigel 77 }
2885     else eptr += min;
2886     break;
2887    
2888     case OP_ANYBYTE:
2889     eptr += min;
2890     break;
2891    
2892 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
2893     bytes are present in this case. */
2894    
2895     case OP_ANYNL:
2896     for (i = 1; i <= min; i++)
2897     {
2898     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2899     switch(*eptr++)
2900     {
2901     default: RRETURN(MATCH_NOMATCH);
2902     case 0x000d:
2903     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2904     break;
2905     case 0x000a:
2906     case 0x000b:
2907     case 0x000c:
2908     case 0x0085:
2909     break;
2910     }
2911     }
2912     break;
2913    
2914 nigel 77 case OP_NOT_DIGIT:
2915     for (i = 1; i <= min; i++)
2916     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
2917     break;
2918    
2919     case OP_DIGIT:
2920     for (i = 1; i <= min; i++)
2921     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
2922     break;
2923    
2924     case OP_NOT_WHITESPACE:
2925     for (i = 1; i <= min; i++)
2926     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
2927     break;
2928    
2929     case OP_WHITESPACE:
2930     for (i = 1; i <= min; i++)
2931     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
2932     break;
2933    
2934     case OP_NOT_WORDCHAR:
2935     for (i = 1; i <= min; i++)
2936     if ((md->ctypes[*eptr++] & ctype_word) != 0)
2937     RRETURN(MATCH_NOMATCH);
2938     break;
2939    
2940     case OP_WORDCHAR:
2941     for (i = 1; i <= min; i++)
2942     if ((md->ctypes[*eptr++] & ctype_word) == 0)
2943     RRETURN(MATCH_NOMATCH);
2944     break;
2945    
2946     default:
2947     RRETURN(PCRE_ERROR_INTERNAL);
2948     }
2949     }
2950    
2951     /* If min = max, continue at the same level without recursing */
2952    
2953     if (min == max) continue;
2954    
2955     /* If minimizing, we have to test the rest of the pattern before each
2956     subsequent match. Again, separate the UTF-8 case for speed, and also
2957     separate the UCP cases. */
2958    
2959     if (minimize)
2960     {
2961     #ifdef SUPPORT_UCP
2962 nigel 87 if (prop_type >= 0)
2963 nigel 77 {
2964 nigel 87 switch(prop_type)
2965 nigel 77 {
2966 nigel 87 case PT_ANY:
2967     for (fi = min;; fi++)
2968     {
2969 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
2970 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2971     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2972     GETCHARINC(c, eptr);
2973     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2974     }
2975 nigel 93 /* Control never gets here */
2976 nigel 87
2977     case PT_LAMP:
2978     for (fi = min;; fi++)
2979     {
2980 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
2981 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2982     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2983     GETCHARINC(c, eptr);
2984     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2985     if ((prop_chartype == ucp_Lu ||
2986     prop_chartype == ucp_Ll ||
2987     prop_chartype == ucp_Lt) == prop_fail_result)
2988     RRETURN(MATCH_NOMATCH);
2989     }
2990 nigel 93 /* Control never gets here */
2991 nigel 87
2992     case PT_GC:
2993     for (fi = min;; fi++)
2994     {
2995 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
2996 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2997     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2998     GETCHARINC(c, eptr);
2999     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3000     if ((prop_category == prop_value) == prop_fail_result)
3001     RRETURN(MATCH_NOMATCH);
3002     }
3003 nigel 93 /* Control never gets here */
3004 nigel 87
3005     case PT_PC:
3006     for (fi = min;; fi++)
3007     {
3008 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3009 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3010     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3011     GETCHARINC(c, eptr);
3012     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3013     if ((prop_chartype == prop_value) == prop_fail_result)
3014     RRETURN(MATCH_NOMATCH);
3015     }
3016 nigel 93 /* Control never gets here */
3017 nigel 87
3018     case PT_SC:
3019     for (fi = min;; fi++)
3020     {
3021 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3022 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3023     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3024     GETCHARINC(c, eptr);
3025     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3026     if ((prop_script == prop_value) == prop_fail_result)
3027     RRETURN(MATCH_NOMATCH);
3028     }
3029 nigel 93 /* Control never gets here */
3030 nigel 87
3031     default:
3032     RRETURN(PCRE_ERROR_INTERNAL);
3033 nigel 77 }
3034     }
3035    
3036     /* Match extended Unicode sequences. We will get here only if the
3037     support is in the binary; otherwise a compile-time error occurs. */
3038    
3039     else if (ctype == OP_EXTUNI)
3040     {
3041     for (fi = min;; fi++)
3042     {
3043 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3044 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3045     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3046     GETCHARINCTEST(c, eptr);
3047 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3048 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3049     while (eptr < md->end_subject)
3050     {
3051     int len = 1;
3052     if (!utf8) c = *eptr; else
3053     {
3054     GETCHARLEN(c, eptr, len);
3055     }
3056 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3057 nigel 77 if (prop_category != ucp_M) break;
3058     eptr += len;
3059     }
3060     }
3061     }
3062    
3063     else
3064     #endif /* SUPPORT_UCP */
3065    
3066     #ifdef SUPPORT_UTF8
3067     /* UTF-8 mode */
3068     if (utf8)
3069     {
3070     for (fi = min;; fi++)
3071     {
3072 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3073 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3074 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3075     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3076 nigel 93 IS_NEWLINE(eptr)))
3077 nigel 91 RRETURN(MATCH_NOMATCH);
3078 nigel 77
3079     GETCHARINC(c, eptr);
3080     switch(ctype)
3081     {
3082 nigel 91 case OP_ANY: /* This is the DOTALL case */
3083 nigel 77 break;
3084    
3085     case OP_ANYBYTE:
3086     break;
3087    
3088 nigel 93 case OP_ANYNL:
3089     switch(c)
3090     {
3091     default: RRETURN(MATCH_NOMATCH);
3092     case 0x000d:
3093     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3094     break;
3095     case 0x000a:
3096     case 0x000b:
3097     case 0x000c:
3098     case 0x0085:
3099     case 0x2028:
3100     case 0x2029:
3101     break;
3102     }
3103     break;
3104    
3105 nigel 77 case OP_NOT_DIGIT:
3106     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3107     RRETURN(MATCH_NOMATCH);
3108     break;
3109    
3110     case OP_DIGIT:
3111     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3112     RRETURN(MATCH_NOMATCH);
3113     break;
3114    
3115     case OP_NOT_WHITESPACE:
3116     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3117     RRETURN(MATCH_NOMATCH);
3118     break;
3119    
3120     case OP_WHITESPACE:
3121     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3122     RRETURN(MATCH_NOMATCH);
3123     break;
3124    
3125     case OP_NOT_WORDCHAR:
3126     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3127     RRETURN(MATCH_NOMATCH);
3128     break;
3129    
3130     case OP_WORDCHAR:
3131     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3132     RRETURN(MATCH_NOMATCH);
3133     break;
3134    
3135     default:
3136     RRETURN(PCRE_ERROR_INTERNAL);
3137     }
3138     }
3139     }
3140     else
3141     #endif
3142     /* Not UTF-8 mode */
3143     {
3144     for (fi = min;; fi++)
3145     {
3146 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3147 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3148 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3149 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3150 nigel 91 RRETURN(MATCH_NOMATCH);
3151    
3152 nigel 77 c = *eptr++;
3153     switch(ctype)
3154     {
3155 nigel 91 case OP_ANY: /* This is the DOTALL case */
3156 nigel 77 break;
3157    
3158     case OP_ANYBYTE:
3159     break;
3160    
3161 nigel 93 case OP_ANYNL:
3162     switch(c)
3163     {
3164     default: RRETURN(MATCH_NOMATCH);
3165     case 0x000d:
3166     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3167     break;
3168     case 0x000a:
3169     case 0x000b:
3170     case 0x000c:
3171     case 0x0085:
3172     break;
3173     }
3174     break;
3175    
3176 nigel 77 case OP_NOT_DIGIT:
3177     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3178     break;
3179    
3180     case OP_DIGIT:
3181     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3182     break;
3183    
3184     case OP_NOT_WHITESPACE:
3185     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3186     break;
3187    
3188     case OP_WHITESPACE:
3189     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3190     break;
3191    
3192     case OP_NOT_WORDCHAR:
3193     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3194     break;
3195    
3196     case OP_WORDCHAR:
3197     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3198     break;
3199    
3200     default:
3201     RRETURN(PCRE_ERROR_INTERNAL);
3202     }
3203     }
3204     }
3205     /* Control never gets here */
3206     }
3207    
3208 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3209 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3210     UTF-8 and UCP stuff separate. */
3211    
3212     else
3213     {
3214     pp = eptr; /* Remember where we started */
3215    
3216     #ifdef SUPPORT_UCP
3217 nigel 87 if (prop_type >= 0)
3218 nigel 77 {
3219 nigel 87 switch(prop_type)
3220 nigel 77 {
3221 nigel 87 case PT_ANY:
3222     for (i = min; i < max; i++)
3223     {
3224     int len = 1;
3225     if (eptr >= md->end_subject) break;
3226     GETCHARLEN(c, eptr, len);
3227     if (prop_fail_result) break;
3228     eptr+= len;
3229     }
3230     break;
3231    
3232     case PT_LAMP:
3233     for (i = min; i < max; i++)
3234     {
3235     int len = 1;
3236     if (eptr >= md->end_subject) break;
3237     GETCHARLEN(c, eptr, len);
3238     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3239     if ((prop_chartype == ucp_Lu ||
3240     prop_chartype == ucp_Ll ||
3241     prop_chartype == ucp_Lt) == prop_fail_result)
3242     break;
3243     eptr+= len;
3244     }
3245     break;
3246    
3247     case PT_GC:
3248     for (i = min; i < max; i++)
3249     {
3250     int len = 1;
3251     if (eptr >= md->end_subject) break;
3252     GETCHARLEN(c, eptr, len);
3253     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3254     if ((prop_category == prop_value) == prop_fail_result)
3255     break;
3256     eptr+= len;
3257     }
3258     break;
3259    
3260     case PT_PC:
3261     for (i = min; i < max; i++)
3262     {
3263     int len = 1;
3264     if (eptr >= md->end_subject) break;
3265     GETCHARLEN(c, eptr, len);
3266     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3267     if ((prop_chartype == prop_value) == prop_fail_result)
3268     break;
3269     eptr+= len;
3270     }
3271     break;
3272    
3273     case PT_SC:
3274     for (i = min; i < max; i++)
3275     {
3276     int len = 1;
3277     if (eptr >= md->end_subject) break;
3278     GETCHARLEN(c, eptr, len);
3279     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3280     if ((prop_script == prop_value) == prop_fail_result)
3281     break;
3282     eptr+= len;
3283     }
3284     break;
3285 nigel 77 }
3286    
3287     /* eptr is now past the end of the maximum run */
3288    
3289 nigel 93 if (possessive) continue;
3290 nigel 77 for(;;)
3291     {
3292 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3293 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3294     if (eptr-- == pp) break; /* Stop if tried at original pos */
3295     BACKCHAR(eptr);
3296     }
3297     }
3298    
3299     /* Match extended Unicode sequences. We will get here only if the
3300     support is in the binary; otherwise a compile-time error occurs. */
3301    
3302     else if (ctype == OP_EXTUNI)
3303     {
3304     for (i = min; i < max; i++)
3305     {
3306     if (eptr >= md->end_subject) break;
3307     GETCHARINCTEST(c, eptr);
3308 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3309 nigel 77 if (prop_category == ucp_M) break;
3310     while (eptr < md->end_subject)
3311     {
3312     int len = 1;
3313     if (!utf8) c = *eptr; else
3314     {
3315     GETCHARLEN(c, eptr, len);
3316     }
3317 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3318 nigel 77 if (prop_category != ucp_M) break;
3319     eptr += len;
3320     }
3321     }
3322    
3323     /* eptr is now past the end of the maximum run */
3324    
3325 nigel 93 if (possessive) continue;
3326 nigel 77 for(;;)
3327     {
3328 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3329 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3330     if (eptr-- == pp) break; /* Stop if tried at original pos */
3331     for (;;) /* Move back over one extended */
3332     {
3333     int len = 1;
3334     BACKCHAR(eptr);
3335     if (!utf8) c = *eptr; else
3336     {
3337     GETCHARLEN(c, eptr, len);
3338     }
3339 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3340 nigel 77 if (prop_category != ucp_M) break;
3341     eptr--;
3342     }
3343     }
3344     }
3345    
3346     else
3347     #endif /* SUPPORT_UCP */
3348    
3349     #ifdef SUPPORT_UTF8
3350     /* UTF-8 mode */
3351    
3352     if (utf8)
3353     {
3354     switch(ctype)
3355     {
3356     case OP_ANY:
3357    
3358 nigel 91 /* Special code is required for UTF8, but when the maximum is
3359     unlimited we don't need it, so we repeat the non-UTF8 code. This is
3360     probably worth it, because .* is quite a common idiom. */
3361 nigel 77
3362     if (max < INT_MAX)
3363     {
3364     if ((ims & PCRE_DOTALL) == 0)
3365     {
3366     for (i = min; i < max; i++)
3367     {
3368 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3369 nigel 77 eptr++;
3370     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3371     }
3372     }
3373     else
3374     {
3375     for (i = min; i < max; i++)
3376     {
3377 nigel 91 if (eptr >= md->end_subject) break;
3378 nigel 77 eptr++;
3379     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3380     }
3381     }
3382     }
3383    
3384     /* Handle unlimited UTF-8 repeat */
3385    
3386     else
3387     {
3388     if ((ims & PCRE_DOTALL) == 0)
3389     {
3390     for (i = min; i < max; i++)
3391     {
3392 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3393 nigel 77 eptr++;
3394     }
3395     break;
3396     }
3397     else
3398     {
3399     c = max - min;
3400 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3401     c = md->end_subject - eptr;
3402 nigel 77 eptr += c;
3403     }
3404     }
3405     break;
3406    
3407     /* The byte case is the same as non-UTF8 */
3408    
3409     case OP_ANYBYTE:
3410     c = max - min;
3411 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3412     c = md->end_subject - eptr;
3413 nigel 77 eptr += c;
3414     break;
3415    
3416 nigel 93 case OP_ANYNL:
3417     for (i = min; i < max; i++)
3418     {
3419     int len = 1;
3420     if (eptr >= md->end_subject) break;
3421     GETCHARLEN(c, eptr, len);
3422     if (c == 0x000d)
3423     {
3424     if (++eptr >= md->end_subject) break;
3425     if (*eptr == 0x000a) eptr++;
3426     }
3427     else
3428     {
3429     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3430     c != 0x0085 && c != 0x2028 && c != 0x2029)
3431     break;
3432     eptr += len;
3433     }
3434     }
3435     break;
3436    
3437 nigel 77 case OP_NOT_DIGIT:
3438     for (i = min; i < max; i++)
3439     {
3440     int len = 1;
3441     if (eptr >= md->end_subject) break;
3442     GETCHARLEN(c, eptr, len);
3443     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3444     eptr+= len;
3445     }
3446     break;
3447    
3448     case OP_DIGIT:
3449     for (i = min; i < max; i++)
3450     {
3451     int len = 1;
3452     if (eptr >= md->end_subject) break;
3453     GETCHARLEN(c, eptr, len);
3454     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3455     eptr+= len;
3456     }
3457     break;
3458    
3459     case OP_NOT_WHITESPACE:
3460     for (i = min; i < max; i++)
3461     {
3462     int len = 1;
3463     if (eptr >= md->end_subject) break;
3464     GETCHARLEN(c, eptr, len);
3465     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3466     eptr+= len;
3467     }
3468     break;
3469    
3470     case OP_WHITESPACE:
3471     for (i = min; i < max; i++)
3472     {
3473     int len = 1;
3474     if (eptr >= md->end_subject) break;
3475     GETCHARLEN(c, eptr, len);
3476     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3477     eptr+= len;
3478     }
3479     break;
3480    
3481     case OP_NOT_WORDCHAR:
3482     for (i = min; i < max; i++)
3483     {
3484     int len = 1;
3485     if (eptr >= md->end_subject) break;
3486     GETCHARLEN(c, eptr, len);
3487     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3488     eptr+= len;
3489     }
3490     break;
3491    
3492     case OP_WORDCHAR:
3493     for (i = min; i < max; i++)
3494     {
3495     int len = 1;
3496     if (eptr >= md->end_subject) break;
3497     GETCHARLEN(c, eptr, len);
3498     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3499     eptr+= len;
3500     }
3501     break;
3502    
3503     default:
3504     RRETURN(PCRE_ERROR_INTERNAL);
3505     }
3506    
3507     /* eptr is now past the end of the maximum run */
3508    
3509 nigel 93 if (possessive) continue;
3510 nigel 77 for(;;)
3511     {
3512 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
3513 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3514     if (eptr-- == pp) break; /* Stop if tried at original pos */
3515     BACKCHAR(eptr);
3516     }
3517     }
3518     else
3519     #endif
3520    
3521     /* Not UTF-8 mode */
3522     {
3523     switch(ctype)
3524     {
3525     case OP_ANY:
3526     if ((ims & PCRE_DOTALL) == 0)
3527     {
3528     for (i = min; i < max; i++)
3529     {
3530 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3531 nigel 77 eptr++;
3532     }
3533     break;
3534     }
3535     /* For DOTALL case, fall through and treat as \C */
3536    
3537     case OP_ANYBYTE:
3538     c = max - min;
3539 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3540     c = md->end_subject - eptr;
3541 nigel 77 eptr += c;
3542     break;
3543    
3544 nigel 93 case OP_ANYNL:
3545     for (i = min; i < max; i++)
3546     {
3547     if (eptr >= md->end_subject) break;
3548     c = *eptr;
3549     if (c == 0x000d)
3550     {
3551     if (++eptr >= md->end_subject) break;
3552     if (*eptr == 0x000a) eptr++;
3553     }
3554     else
3555     {
3556     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
3557     break;
3558     eptr++;
3559     }
3560     }
3561     break;
3562    
3563 nigel 77 case OP_NOT_DIGIT:
3564     for (i = min; i < max; i++)
3565     {
3566     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
3567     break;
3568     eptr++;
3569     }
3570     break;
3571    
3572     case OP_DIGIT:
3573     for (i = min; i < max; i++)
3574     {
3575     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
3576     break;
3577     eptr++;
3578     }
3579     break;
3580    
3581     case OP_NOT_WHITESPACE:
3582     for (i = min; i < max; i++)
3583     {
3584     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
3585     break;
3586     eptr++;
3587     }
3588     break;
3589    
3590     case OP_WHITESPACE:
3591     for (i = min; i < max; i++)
3592     {
3593     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
3594     break;
3595     eptr++;
3596     }
3597     break;
3598    
3599     case OP_NOT_WORDCHAR:
3600     for (i = min; i < max; i++)
3601     {
3602     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
3603     break;
3604     eptr++;
3605     }
3606     break;
3607    
3608     case OP_WORDCHAR:
3609     for (i = min; i < max; i++)
3610     {
3611     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
3612     break;
3613     eptr++;
3614     }
3615     break;
3616    
3617     default:
3618     RRETURN(PCRE_ERROR_INTERNAL);
3619     }
3620    
3621     /* eptr is now past the end of the maximum run */
3622    
3623 nigel 93 if (possessive) continue;
3624 nigel 77 while (eptr >= pp)
3625     {
3626 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
3627 nigel 77 eptr--;
3628     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3629     }
3630     }
3631    
3632     /* Get here if we can't make it match with any permitted repetitions */
3633    
3634     RRETURN(MATCH_NOMATCH);
3635     }
3636     /* Control never gets here */
3637    
3638 nigel 93 /* There's been some horrible disaster. Arrival here can only mean there is
3639     something seriously wrong in the code above or the OP_xxx definitions. */
3640 nigel 77
3641     default:
3642     DPRINTF(("Unknown opcode %d\n", *ecode));
3643 nigel 93 RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
3644 nigel 77 }
3645    
3646     /* Do not stick any code in here without much thought; it is assumed
3647     that "continue" in the code above comes out to here to repeat the main
3648     loop. */
3649    
3650     } /* End of main loop */
3651     /* Control never reaches here */
3652 ph10 164
3653    
3654 ph10 165 /* When compiling to use the heap rather than the stack for recursive calls to
3655     match(), the RRETURN() macro jumps here. The number that is saved in
3656 ph10 164 frame->Xwhere indicates which label we actually want to return to. */
3657    
3658     #ifdef NO_RECURSE
3659     #define LBL(val) case val: goto L_RM##val;
3660     HEAP_RETURN:
3661     switch (frame->Xwhere)
3662     {
3663     LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
3664     LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16)
3665     LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24)
3666     LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32)
3667     LBL(33) LBL(34) LBL(35) LBL(36) LBL(37) LBL(38) LBL(39) LBL(40)
3668     LBL(41) LBL(42) LBL(43) LBL(44) LBL(45) LBL(46) LBL(47)
3669     default:
3670     DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
3671     return PCRE_ERROR_INTERNAL;
3672     }
3673 ph10 165 #undef LBL
3674 ph10 164 #endif /* NO_RECURSE */
3675 nigel 77 }
3676    
3677    
3678     /***************************************************************************
3679     ****************************************************************************
3680     RECURSION IN THE match() FUNCTION
3681    
3682     Undefine all the macros that were defined above to handle this. */
3683    
3684     #ifdef NO_RECURSE /* Build that uses the heap rather than the stack for recursive calls to match() */
3685     #undef eptr
3686     #undef ecode
3687     #undef offset_top
3688     #undef ims /* pcre_exec() below declares its own local called "ims" */
3689     #undef eptrb
3690     #undef flags /* pcre_exec() below declares its own local called "flags" */
3691    
3692     #undef callpat
3693     #undef charptr
3694     #undef data
3695     #undef next
3696     #undef pp
3697     #undef prev
3698     #undef saved_eptr
3699    
3700     #undef new_recursive
3701    
3702     #undef cur_is_word
3703     #undef condition
3704     #undef prev_is_word
3705    
3706     #undef original_ims
3707    
3708     #undef ctype
3709     #undef length /* pcre_exec() below has a parameter called "length" */
3710     #undef max
3711     #undef min
3712     #undef number
3713     #undef offset
3714     #undef op
3715     #undef save_capture_last
3716     #undef save_offset1
3717     #undef save_offset2
3718     #undef save_offset3
3719     #undef stacksave
3720    
3721     #undef newptrb
3722    
3723     #endif /* NO_RECURSE */
3724    
3725     /* These two are defined as macros in both cases (that is, whether or not NO_RECURSE is defined), so they are removed unconditionally, outside the #ifdef above. */
3726    
3727     #undef fc
3728     #undef fi
3729    
3730     /***************************************************************************
3731     ***************************************************************************/
3732    
3733    
3734    
3735     /*************************************************
3736     * Execute a Regular Expression *
3737     *************************************************/
3738    
3739     /* This function applies a compiled re to a subject string and picks out
3740     portions of the string if it matches. Two elements in the vector are set for
3741     each substring: the offsets to the start and end of the substring.
3742    
3743     Arguments:
3744     argument_re points to the compiled expression
3745     extra_data points to extra data or is NULL
3746     subject points to the subject string
3747     length length of subject string (may contain binary zeros)
3748     start_offset where to start in the subject string
3749     options option bits
3750     offsets points to a vector of ints to be filled in with offsets
3751     offsetcount the number of elements in the vector
3752    
3753     Returns: > 0 => success; value is the number of elements filled in
3754     = 0 => success, but offsets is not big enough
3755     -1 => failed to match
3756     < -1 => some kind of unexpected problem
3757     */
3758    
3759 ph10 145 PCRE_EXP_DEFN int
3760 nigel 77 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
3761 nigel 87 PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
3762 nigel 77 int offsetcount)
3763     {
3764     int rc, resetcount, ocount;
3765     int first_byte = -1;
3766     int req_byte = -1;
3767     int req_byte2 = -1;
3768 nigel 91 int newline;
3769     unsigned long int ims;
3770 nigel 77 BOOL using_temporary_offsets = FALSE;
3771     BOOL anchored;
3772     BOOL startline;
3773     BOOL firstline;
3774     BOOL first_byte_caseless = FALSE;
3775     BOOL req_byte_caseless = FALSE;
3776 nigel 93 BOOL utf8;
3777 nigel 77 match_data match_block;
3778 nigel 91 match_data *md = &match_block;
3779 nigel 77 const uschar *tables;
3780     const uschar *start_bits = NULL;
3781 nigel 87 USPTR start_match = (USPTR)subject + start_offset;
3782     USPTR end_subject;
3783     USPTR req_byte_ptr = start_match - 1;
3784 nigel 93 eptrblock eptrchain[EPTR_WORK_SIZE];
3785 nigel 77
3786     pcre_study_data internal_study;
3787     const pcre_study_data *study;
3788    
3789     real_pcre internal_re;
3790     const real_pcre *external_re = (const real_pcre *)argument_re;
3791     const real_pcre *re = external_re;
3792    
3793     /* Plausibility checks */
3794    
3795     if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
3796     if (re == NULL || subject == NULL ||
3797     (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
3798     if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
3799    
3800     /* Fish out the optional data from the extra_data structure, first setting
3801     the default values. */
3802    
3803     study = NULL;
3804 nigel 91 md->match_limit = MATCH_LIMIT;
3805     md->match_limit_recursion = MATCH_LIMIT_RECURSION;
3806     md->callout_data = NULL;
3807 nigel 77
3808     /* The table pointer is always in native byte order. */
3809    
3810     tables = external_re->tables;
3811    
3812     if (extra_data != NULL)
3813     {
3814     register unsigned int flags = extra_data->flags;
3815     if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
3816     study = (const pcre_study_data *)extra_data->study_data;
3817     if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
3818 nigel 91 md->match_limit = extra_data->match_limit;
3819 nigel 87 if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
3820 nigel 91 md->match_limit_recursion = extra_data->match_limit_recursion;
3821 nigel 77 if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
3822 nigel 91 md->callout_data = extra_data->callout_data;
3823 nigel 77 if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
3824     }
3825    
3826     /* If the exec call supplied NULL for tables, use the inbuilt ones. This
3827     is a feature that makes it possible to save compiled regexes and re-use them
3828     in other programs later. */
3829    
3830     if (tables == NULL) tables = _pcre_default_tables;
3831    
3832     /* Check that the first field in the block is the magic number. If it is not,
3833     test for a regex that was compiled on a host of opposite endianness. If this is
3834     the case, flipped values are put in internal_re and internal_study if there was
3835     study data too. */
3836    
3837     if (re->magic_number != MAGIC_NUMBER)
3838     {
3839     re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
3840     if (re == NULL) return PCRE_ERROR_BADMAGIC;
3841     if (study != NULL) study = &internal_study;
3842     }
3843    
3844     /* Set up other data */
3845    
3846     anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
3847     startline = (re->options & PCRE_STARTLINE) != 0;
3848     firstline = (re->options & PCRE_FIRSTLINE) != 0;
3849    
3850     /* The code starts after the real_pcre block and the capture name table. */
3851    
3852 nigel 91 md->start_code = (const uschar *)external_re + re->name_table_offset +
3853 nigel 77 re->name_count * re->name_entry_size;
3854    
3855 nigel 91 md->start_subject = (USPTR)subject;
3856     md->start_offset = start_offset;
3857     md->end_subject = md->start_subject + length;
3858     end_subject = md->end_subject;
3859 nigel 77
3860 nigel 91 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
3861 nigel 93 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
3862 nigel 77
3863 nigel 91 md->notbol = (options & PCRE_NOTBOL) != 0;
3864     md->noteol = (options & PCRE_NOTEOL) != 0;
3865     md->notempty = (options & PCRE_NOTEMPTY) != 0;
3866     md->partial = (options & PCRE_PARTIAL) != 0;
3867     md->hitend = FALSE;
3868 nigel 77
3869 nigel 91 md->recursive = NULL; /* No recursion at top level */
3870 nigel 93 md->eptrchain = eptrchain; /* Make workspace generally available */
3871 nigel 77
3872 nigel 91 md->lcc = tables + lcc_offset;
3873     md->ctypes = tables + ctypes_offset;
3874 nigel 77
3875 ph10 97 /* Handle different types of newline. The three bits give eight cases. If
3876     nothing is set at run time, whatever was used at compile time applies. */
3877 nigel 91
3878 ph10 144 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
3879 nigel 93 PCRE_NEWLINE_BITS)
3880 nigel 91 {
3881 nigel 93 case 0: newline = NEWLINE; break; /* Compile-time default */
3882 nigel 91 case PCRE_NEWLINE_CR: newline = '\r'; break;
3883     case PCRE_NEWLINE_LF: newline = '\n'; break;
3884     case PCRE_NEWLINE_CR+
3885     PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
3886 nigel 93 case PCRE_NEWLINE_ANY: newline = -1; break;
3887 ph10 150 case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
3888 nigel 93 default: return PCRE_ERROR_BADNEWLINE;
3889 nigel 91 }
3890    
3891 ph10 149 if (newline == -2)
3892 nigel 91 {
3893 ph10 149 md->nltype = NLTYPE_ANYCRLF;
3894     }
3895     else if (newline < 0)
3896     {
3897 nigel 93 md->nltype = NLTYPE_ANY;
3898 nigel 91 }
3899     else
3900     {
3901 nigel 93 md->nltype = NLTYPE_FIXED;
3902     if (newline > 255)
3903     {
3904     md->nllen = 2;
3905     md->nl[0] = (newline >> 8) & 255;
3906     md->nl[1] = newline & 255;
3907     }
3908     else
3909     {
3910     md->nllen = 1;
3911     md->nl[0] = newline;
3912     }
3913 nigel 91 }
3914    
3915 nigel 77 /* Partial matching is supported only for a restricted set of regexes at the
3916     moment. */
3917    
3918 nigel 91 if (md->partial && (re->options & PCRE_NOPARTIAL) != 0)
3919 nigel 77 return PCRE_ERROR_BADPARTIAL;
3920    
3921     /* Check a UTF-8 string if required. Unfortunately there's no way of passing
3922     back the character offset. */
3923    
3924     #ifdef SUPPORT_UTF8
3925 nigel 93 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
3926 nigel 77 {
3927     if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
3928     return PCRE_ERROR_BADUTF8;
3929     if (start_offset > 0 && start_offset < length)
3930     {
3931     int tb = ((uschar *)subject)[start_offset];
3932     if (tb > 127)
3933     {
3934     tb &= 0xc0;
3935     if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
3936     }
3937     }
3938     }
3939     #endif
3940    
3941     /* The ims options can vary during the matching as a result of the presence
3942     of (?ims) items in the pattern. They are kept in a local variable so that
3943     restoring at the exit of a group is easy. */
3944    
3945     ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
3946    
3947     /* If the expression has got more back references than the offsets supplied can
3948     hold, we get a temporary chunk of working store to use during the matching.
3949     Otherwise, we can use the vector supplied, rounding down its size to a multiple
3950     of 3. */
3951    
3952     ocount = offsetcount - (offsetcount % 3);
3953    
3954     if (re->top_backref > 0 && re->top_backref >= ocount/3)
3955     {
3956     ocount = re->top_backref * 3 + 3;
3957 nigel 91 md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
3958     if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
3959 nigel 77 using_temporary_offsets = TRUE;
3960     DPRINTF(("Got memory to hold back references\n"));
3961     }
3962 nigel 91 else md->offset_vector = offsets;
3963 nigel 77
3964 nigel 91 md->offset_end = ocount;
3965     md->offset_max = (2*ocount)/3;
3966     md->offset_overflow = FALSE;
3967     md->capture_last = -1;
3968 nigel 77
3969     /* Compute the minimum number of offsets that we need to reset each time. Doing
3970     this makes a huge difference to execution time when there aren't many brackets
3971     in the pattern. */
3972    
3973     resetcount = 2 + re->top_bracket * 2;
3974     if (resetcount > offsetcount) resetcount = ocount;
3975    
3976     /* Reset the working variable associated with each extraction. These should
3977     never be used unless previously set, but they get saved and restored, and so we
3978     initialize them to avoid reading uninitialized locations. */
3979    
3980 nigel 91 if (md->offset_vector != NULL)
3981 nigel 77 {
3982 nigel 91 register int *iptr = md->offset_vector + ocount;
3983 nigel 77 register int *iend = iptr - resetcount/2 + 1;
3984     while (--iptr >= iend) *iptr = -1;
3985     }
3986    
3987     /* Set up the first character to match, if available. The first_byte value is
3988     never set for an anchored regular expression, but the anchoring may be forced
3989     at run time, so we have to test for anchoring. The first char may be unset for
3990     an unanchored pattern, of course. If there's no first char and the pattern was
3991     studied, there may be a bitmap of possible first characters. */
3992    
3993     if (!anchored)
3994     {
3995     if ((re->options & PCRE_FIRSTSET) != 0)
3996     {
3997     first_byte = re->first_byte & 255;
3998     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
3999 nigel 91 first_byte = md->lcc[first_byte];
4000 nigel 77 }
4001     else
4002     if (!startline && study != NULL &&
4003     (study->options & PCRE_STUDY_MAPPED) != 0)
4004     start_bits = study->start_bits;
4005     }
4006    
4007     /* For anchored or unanchored matches, there may be a "last known required
4008     character" set. */
4009    
4010     if ((re->options & PCRE_REQCHSET) != 0)
4011     {
4012     req_byte = re->req_byte & 255;
4013     req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4014     req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4015     }
4016    
4017 nigel 93
4018     /* ==========================================================================*/
4019    
4020 nigel 77 /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4021     the loop runs just once. */
4022    
4023 nigel 93 for(;;)
4024 nigel 77 {
4025 nigel 87 USPTR save_end_subject = end_subject;
4026 nigel 77
4027     /* Reset the maximum number of extractions we might see. */
4028    
4029 nigel 91 if (md->offset_vector != NULL)
4030 nigel 77 {
4031 nigel 91 register int *iptr = md->offset_vector;
4032 nigel 77 register int *iend = iptr + resetcount;
4033     while (iptr < iend) *iptr++ = -1;
4034     }
4035    
4036     /* Advance to a unique first char if possible. If firstline is TRUE, the
4037     start of the match is constrained to the first line of a multiline string.
4038 nigel 93 That is, the match must be before or at the first newline. Implement this by
4039     temporarily adjusting end_subject so that we stop scanning at a newline. If
4040     the match fails at the newline, later code breaks this loop. */
4041 nigel 77
4042     if (firstline)
4043     {
4044 nigel 87 USPTR t = start_match;
4045 nigel 93 while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4046 nigel 77 end_subject = t;
4047     }
4048    
4049     /* Now test for a unique first byte */
4050    
4051     if (first_byte >= 0)
4052     {
4053     if (first_byte_caseless)
4054     while (start_match < end_subject &&
4055 nigel 91 md->lcc[*start_match] != first_byte)
4056 nigel 77 start_match++;
4057     else
4058     while (start_match < end_subject && *start_match != first_byte)
4059     start_match++;
4060     }
4061    
4062 nigel 91 /* Or to just after a linebreak for a multiline match if possible */
4063 nigel 77
4064     else if (startline)
4065     {
4066 nigel 93 if (start_match > md->start_subject + start_offset)
4067 nigel 77 {
4068 nigel 93 while (start_match <= end_subject && !WAS_NEWLINE(start_match))
4069 nigel 77 start_match++;
4070 ph10 134
4071 ph10 149 /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4072     and we are now at a LF, advance the match position by one more character.
4073     */
4074 ph10 134
4075 ph10 130 if (start_match[-1] == '\r' &&
4076 ph10 149 (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4077 ph10 130 start_match < end_subject &&
4078     *start_match == '\n')
4079     start_match++;
4080 nigel 77 }
4081     }
4082    
4083     /* Or to a non-unique first char after study */
4084    
4085     else if (start_bits != NULL)
4086     {
4087     while (start_match < end_subject)
4088     {
4089     register unsigned int c = *start_match;
4090     if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4091     }
4092     }
4093    
4094     /* Restore fudged end_subject */
4095    
4096     end_subject = save_end_subject;
4097    
4098     #ifdef DEBUG /* Sigh. Some compilers never learn. */
4099     printf(">>>> Match against: ");
4100 nigel 91 pchars(start_match, end_subject - start_match, TRUE, md);
4101 nigel 77 printf("\n");
4102     #endif
4103    
4104     /* If req_byte is set, we know that that character must appear in the subject
4105     for the match to succeed. If the first character is set, req_byte must be
4106     later in the subject; otherwise the test starts at the match point. This
4107     optimization can save a huge amount of backtracking in patterns with nested
4108     unlimited repeats that aren't going to match. Writing separate code for
4109     cased/caseless versions makes it go faster, as does using an autoincrement
4110     and backing off on a match.
4111    
4112     HOWEVER: when the subject string is very, very long, searching to its end can
4113     take a long time, and give bad performance on quite ordinary patterns. This
4114 nigel 93 showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4115     string... so we don't do this when the string is sufficiently long.
4116 nigel 77
4117     ALSO: this processing is disabled when partial matching is requested.
4118     */
4119    
4120     if (req_byte >= 0 &&
4121     end_subject - start_match < REQ_BYTE_MAX &&
4122 nigel 91 !md->partial)
4123 nigel 77 {
4124 nigel 87 register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4125 nigel 77
4126     /* We don't need to repeat the search if we haven't yet reached the
4127     place we found it at last time. */
4128    
4129     if (p > req_byte_ptr)
4130     {
4131     if (req_byte_caseless)
4132     {
4133     while (p < end_subject)
4134     {
4135     register int pp = *p++;
4136     if (pp == req_byte || pp == req_byte2) { p--; break; }
4137     }
4138     }
4139     else
4140     {
4141     while (p < end_subject)
4142     {
4143     if (*p++ == req_byte) { p--; break; }
4144     }
4145     }
4146    
4147 nigel 93 /* If we can't find the required character, break the matching loop,
4148     forcing a match failure. */
4149 nigel 77
4150 nigel 93 if (p >= end_subject)
4151     {
4152     rc = MATCH_NOMATCH;
4153     break;
4154     }
4155 nigel 77
4156