/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 200 - (hide annotations) (download)
Wed Aug 1 09:10:40 2007 UTC (6 years, 8 months ago) by ph10
File MIME type: text/plain
File size: 146761 byte(s)
Correct errors in previous patch; tidy for test release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
/* Undefine some potentially clashing cpp symbols */

#undef min
#undef max

/* Flag bits for the match() function */

#define match_condassert 0x01 /* Called to check a condition assertion */
#define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */

/* Non-error returns from the match() function. Error returns are externally
defined PCRE_ERROR_xxx codes, which are all negative. */

#define MATCH_MATCH 1
#define MATCH_NOMATCH 0

/* Maximum number of ints of offset to save on the stack for recursive calls.
If the offset vector is bigger, malloc is used. This should be a multiple of 3,
because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30

/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The two tables are parallel: entry i of each describes the same repeat. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
81    
82    
83    
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if requested. Printable characters are written as themselves; all
others are written as a \x escape with two hex digits.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int c;

/* When printing from the subject, never run past md->end_subject. md is only
dereferenced when is_subject is TRUE. */

if (is_subject && length > md->end_subject - p)
  length = (int)(md->end_subject - p);

while (length-- > 0)
  {
  c = *p++;
  if (isprint(c)) printf("%c", c); else printf("\\x%02x", c);
  }
}
#endif
110    
111    
112    
113     /*************************************************
114     * Match a back-reference *
115     *************************************************/
116    
117     /* If a back reference hasn't been set, the length that is passed is greater
118     than the number of characters left in the string, so the match fails.
119    
120     Arguments:
121     offset index into the offset vector
122     eptr points into the subject
123     length length to be matched
124     md points to match data block
125     ims the ims flags
126    
127     Returns: TRUE if matched
128     */
129    
130     static BOOL
131 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
132 nigel 77 unsigned long int ims)
133     {
134 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
135 nigel 77
136     #ifdef DEBUG
137     if (eptr >= md->end_subject)
138     printf("matching subject <null>");
139     else
140     {
141     printf("matching subject ");
142     pchars(eptr, length, TRUE, md);
143     }
144     printf(" against backref ");
145     pchars(p, length, FALSE, md);
146     printf("\n");
147     #endif
148    
149     /* Always fail if not enough characters left */
150    
151     if (length > md->end_subject - eptr) return FALSE;
152    
153     /* Separate the caselesss case for speed */
154    
155     if ((ims & PCRE_CASELESS) != 0)
156     {
157     while (length-- > 0)
158     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
159     }
160     else
161     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
162    
163     return TRUE;
164     }
165    
166    
167    
/***************************************************************************
****************************************************************************
                   RECURSION IN THE match() FUNCTION

The match() function is highly recursive, though not every recursive call
increases the recursive depth. Nevertheless, some regular expressions can cause
it to recurse to a great depth. I was writing for Unix, so I just let it call
itself recursively. This uses the stack for saving everything that has to be
saved for a recursive call. On Unix, the stack can be large, and this works
fine.

It turns out that on some non-Unix-like systems there are problems with
programs that use a lot of stack. (This despite the fact that every last chip
has oodles of memory these days, and techniques for extending the stack have
been known for decades.) So....

There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
calls by keeping local variables that need to be preserved in blocks of memory
obtained from malloc() instead of on the stack. Macros are used to achieve
this so that the actual code doesn't look very different to what it always
used to.

The original heap-recursive code used longjmp(). However, it seems that this
can be very slow on some operating systems. Following a suggestion from Stan
Switzer, the use of longjmp() has been abolished, at the cost of having to
provide a unique number for each call to RMATCH. There is no way of generating
a sequence of numbers at compile time in C. I have given them names, to make
them stand out more clearly.

Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
tests. Furthermore, not using longjmp() means that local dynamic variables
don't have indeterminate values; this has meant that the frame size can be
reduced because the result can be "passed back" by straight setting of the
variable instead of being passed in the frame.
****************************************************************************
***************************************************************************/
205    
206    
/* Numbers for RMATCH calls. Each RMATCH invocation passes a distinct one of
these as its final ("rw") argument. The values run consecutively from 1. */

enum { RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
       RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
       RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
       RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
       RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50 };
214 ph10 164
215 ph10 165
/* These versions of the macros use the stack, as normal. There are debugging
versions and production versions. Note that the "rw" argument of RMATCH isn't
actually used in this definition. */
219 nigel 77
220     #ifndef NO_RECURSE
221     #define REGISTER register
222 ph10 164
/* Stack-based RMATCH/RRETURN. RMATCH makes a genuine recursive call to match()
and leaves the result in the caller's "rrc" variable; "mstart" and "rdepth" are
picked up from the enclosing scope, and the "rw" argument is ignored. RRETURN
is a plain return. The DEBUG variants additionally trace each call and return
with printf(); note that the DEBUG RRETURN evaluates its argument twice. */

#ifdef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif
240    
241 nigel 77 #else
242    
243    
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame from pcre_stack_malloc, records in the current
frame (Xwhere) which call site to resume at, fills the new frame with the
"arguments", links it to the current frame, and jumps to HEAP_RECURSE. The
L_<rw> label is where the call site is resumed. If the frame cannot be
allocated, the current incarnation gives up with PCRE_ERROR_NOMEMORY via
RRETURN instead of writing through a NULL pointer. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
269    
/* Heap version of RRETURN: release the current frame with pcre_stack_free and
step back to the previous one. If there is a previous frame, the result is
handed over in "rrc" and control jumps to HEAP_RETURN; if the frame being
released was the top-level one (Xprevframe == NULL), match() really returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
282    
283    
284     /* Structure for remembering the local variables in a private frame */
285    
286     typedef struct heapframe {
287     struct heapframe *Xprevframe;
288    
289     /* Function arguments that may change */
290    
291     const uschar *Xeptr;
292     const uschar *Xecode;
293 ph10 172 const uschar *Xmstart;
294 nigel 77 int Xoffset_top;
295     long int Xims;
296     eptrblock *Xeptrb;
297     int Xflags;
298 nigel 91 unsigned int Xrdepth;
299 nigel 77
300     /* Function local variables */
301    
302     const uschar *Xcallpat;
303     const uschar *Xcharptr;
304     const uschar *Xdata;
305     const uschar *Xnext;
306     const uschar *Xpp;
307     const uschar *Xprev;
308     const uschar *Xsaved_eptr;
309    
310     recursion_info Xnew_recursive;
311    
312     BOOL Xcur_is_word;
313     BOOL Xcondition;
314     BOOL Xprev_is_word;
315    
316     unsigned long int Xoriginal_ims;
317    
318     #ifdef SUPPORT_UCP
319     int Xprop_type;
320 nigel 87 int Xprop_value;
321 nigel 77 int Xprop_fail_result;
322     int Xprop_category;
323     int Xprop_chartype;
324 nigel 87 int Xprop_script;
325 ph10 123 int Xoclength;
326     uschar Xocchars[8];
327 nigel 77 #endif
328    
329     int Xctype;
330 nigel 93 unsigned int Xfc;
331 nigel 77 int Xfi;
332     int Xlength;
333     int Xmax;
334     int Xmin;
335     int Xnumber;
336     int Xoffset;
337     int Xop;
338     int Xsave_capture_last;
339     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
340     int Xstacksave[REC_STACK_SAVE_MAX];
341    
342     eptrblock Xnewptrb;
343    
344 ph10 164 /* Where to jump back to */
345 nigel 77
346 ph10 164 int Xwhere;
347 ph10 165
348 nigel 77 } heapframe;
349    
350     #endif
351    
352    
353     /***************************************************************************
354     ***************************************************************************/
355    
356    
357    
358     /*************************************************
359     * Match from current position *
360     *************************************************/
361    
362 nigel 93 /* This function is called recursively in many circumstances. Whenever it
363 nigel 77 returns a negative (error) response, the outer incarnation must also return the
364     same response.
365    
366     Performance note: It might be tempting to extract commonly used fields from the
367     md structure (e.g. utf8, end_subject) into individual variables to improve
368     performance. Tests using gcc on a SPARC disproved this; in the first case, it
369     made performance worse.
370    
371     Arguments:
372 nigel 93 eptr pointer to current character in subject
373     ecode pointer to current position in compiled code
374 ph10 168 mstart pointer to the current match start position (can be modified
375 ph10 172 by encountering \K)
376 nigel 77 offset_top current top pointer
377     md pointer to "static" info for the match
378     ims current /i, /m, and /s options
379     eptrb pointer to chain of blocks containing eptr at start of
380     brackets - for testing for empty matches
381     flags can contain
382     match_condassert - this is an assertion condition
383 nigel 93 match_cbegroup - this is the start of an unlimited repeat
384     group that can match an empty string
385 nigel 87 rdepth the recursion depth
386 nigel 77
387     Returns: MATCH_MATCH if matched ) these values are >= 0
388     MATCH_NOMATCH if failed to match )
389     a negative PCRE_ERROR_xxx value if aborted by an error condition
390 nigel 87 (e.g. stopped by repeated call or recursion limit)
391 nigel 77 */
392    
393     static int
394 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
395 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
396 nigel 91 int flags, unsigned int rdepth)
397 nigel 77 {
398     /* These variables do not need to be preserved over recursion in this function,
399 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
400     "register" because they are used a lot in loops. */
401 nigel 77
402 nigel 91 register int rrc; /* Returns from recursive calls */
403     register int i; /* Used for loops not involving calls to RMATCH() */
404 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
405 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
406 nigel 77
407 nigel 93 BOOL minimize, possessive; /* Quantifier options */
408    
409 nigel 77 /* When recursion is not being used, all "local" variables that have to be
410     preserved over calls to RMATCH() are part of a "frame" which is obtained from
411     heap storage. Set up the top-level frame here; others are obtained from the
412     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
413    
414     #ifdef NO_RECURSE
415     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
416     frame->Xprevframe = NULL; /* Marks the top level */
417    
418     /* Copy in the original argument variables */
419    
420     frame->Xeptr = eptr;
421     frame->Xecode = ecode;
422 ph10 168 frame->Xmstart = mstart;
423 nigel 77 frame->Xoffset_top = offset_top;
424     frame->Xims = ims;
425     frame->Xeptrb = eptrb;
426     frame->Xflags = flags;
427 nigel 87 frame->Xrdepth = rdepth;
428 nigel 77
429     /* This is where control jumps back to to effect "recursion" */
430    
431     HEAP_RECURSE:
432    
433     /* Macros make the argument variables come from the current frame */
434    
435     #define eptr frame->Xeptr
436     #define ecode frame->Xecode
437 ph10 168 #define mstart frame->Xmstart
438 nigel 77 #define offset_top frame->Xoffset_top
439     #define ims frame->Xims
440     #define eptrb frame->Xeptrb
441     #define flags frame->Xflags
442 nigel 87 #define rdepth frame->Xrdepth
443 nigel 77
444     /* Ditto for the local variables */
445    
446     #ifdef SUPPORT_UTF8
447     #define charptr frame->Xcharptr
448     #endif
449     #define callpat frame->Xcallpat
450     #define data frame->Xdata
451     #define next frame->Xnext
452     #define pp frame->Xpp
453     #define prev frame->Xprev
454     #define saved_eptr frame->Xsaved_eptr
455    
456     #define new_recursive frame->Xnew_recursive
457    
458     #define cur_is_word frame->Xcur_is_word
459     #define condition frame->Xcondition
460     #define prev_is_word frame->Xprev_is_word
461    
462     #define original_ims frame->Xoriginal_ims
463    
464     #ifdef SUPPORT_UCP
465     #define prop_type frame->Xprop_type
466 nigel 87 #define prop_value frame->Xprop_value
467 nigel 77 #define prop_fail_result frame->Xprop_fail_result
468     #define prop_category frame->Xprop_category
469     #define prop_chartype frame->Xprop_chartype
470 nigel 87 #define prop_script frame->Xprop_script
471 ph10 115 #define oclength frame->Xoclength
472     #define occhars frame->Xocchars
473 nigel 77 #endif
474    
475     #define ctype frame->Xctype
476     #define fc frame->Xfc
477     #define fi frame->Xfi
478     #define length frame->Xlength
479     #define max frame->Xmax
480     #define min frame->Xmin
481     #define number frame->Xnumber
482     #define offset frame->Xoffset
483     #define op frame->Xop
484     #define save_capture_last frame->Xsave_capture_last
485     #define save_offset1 frame->Xsave_offset1
486     #define save_offset2 frame->Xsave_offset2
487     #define save_offset3 frame->Xsave_offset3
488     #define stacksave frame->Xstacksave
489    
490     #define newptrb frame->Xnewptrb
491    
492     /* When recursion is being used, local variables are allocated on the stack and
493     get preserved during recursion in the normal way. In this environment, fi and
494     i, and fc and c, can be the same variables. */
495    
496 nigel 93 #else /* NO_RECURSE not defined */
497 nigel 77 #define fi i
498     #define fc c
499    
500    
501 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
502     const uschar *charptr; /* in small blocks of the code. My normal */
503     #endif /* style of coding would have declared */
504     const uschar *callpat; /* them within each of those blocks. */
505     const uschar *data; /* However, in order to accommodate the */
506     const uschar *next; /* version of this code that uses an */
507     USPTR pp; /* external "stack" implemented on the */
508     const uschar *prev; /* heap, it is easier to declare them all */
509     USPTR saved_eptr; /* here, so the declarations can be cut */
510     /* out in a block. The only declarations */
511     recursion_info new_recursive; /* within blocks below are for variables */
512     /* that do not have to be preserved over */
513     BOOL cur_is_word; /* a recursive call to RMATCH(). */
514     BOOL condition;
515 nigel 77 BOOL prev_is_word;
516    
517     unsigned long int original_ims;
518    
519     #ifdef SUPPORT_UCP
520     int prop_type;
521 nigel 87 int prop_value;
522 nigel 77 int prop_fail_result;
523     int prop_category;
524     int prop_chartype;
525 nigel 87 int prop_script;
526 ph10 115 int oclength;
527     uschar occhars[8];
528 nigel 77 #endif
529    
530     int ctype;
531     int length;
532     int max;
533     int min;
534     int number;
535     int offset;
536     int op;
537     int save_capture_last;
538     int save_offset1, save_offset2, save_offset3;
539     int stacksave[REC_STACK_SAVE_MAX];
540    
541     eptrblock newptrb;
542 nigel 93 #endif /* NO_RECURSE */
543 nigel 77
544     /* These statements are here to stop the compiler complaining about unitialized
545     variables. */
546    
547     #ifdef SUPPORT_UCP
548 nigel 87 prop_value = 0;
549 nigel 77 prop_fail_result = 0;
550     #endif
551    
552 nigel 93
553 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
554     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
555     used. Thanks to Ian Taylor for noticing this possibility and sending the
556     original patch. */
557    
558     TAIL_RECURSE:
559    
560 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
561     are specified by the macro RMATCH and RRETURN is used to return. When
562     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
563     and a "return", respectively (possibly with some debugging if DEBUG is
564     defined). However, RMATCH isn't like a function call because it's quite a
565     complicated macro. It has to be used in one particular way. This shouldn't,
566     however, impact performance when true recursion is being used. */
567 nigel 77
568 ph10 164 #ifdef SUPPORT_UTF8
569     utf8 = md->utf8; /* Local copy of the flag */
570     #else
571     utf8 = FALSE;
572     #endif
573    
574 nigel 87 /* First check that we haven't called match() too many times, or that we
575     haven't exceeded the recursive call limit. */
576    
577 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
578 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
579 nigel 77
580     original_ims = ims; /* Save for resetting on ')' */
581 nigel 91
582 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
583     string, the match_cbegroup flag is set. When this is the case, add the current
584     subject pointer to the chain of such remembered pointers, to be checked when we
585     hit the closing ket, in order to break infinite loops that match no characters.
586 ph10 197 When match() is called in other circumstances, don't add to the chain. The
587     match_cbegroup flag must NOT be used with tail recursion, because the memory
588     block that is used is on the stack, so a new one may be required for each
589     match(). */
590 nigel 77
591 nigel 93 if ((flags & match_cbegroup) != 0)
592 nigel 77 {
593 ph10 197 newptrb.epb_saved_eptr = eptr;
594     newptrb.epb_prev = eptrb;
595     eptrb = &newptrb;
596 nigel 77 }
597    
598 nigel 93 /* Now start processing the opcodes. */
599 nigel 77
600     for (;;)
601     {
602 nigel 93 minimize = possessive = FALSE;
603 nigel 77 op = *ecode;
604    
605     /* For partial matching, remember if we ever hit the end of the subject after
606     matching at least one subject character. */
607    
608     if (md->partial &&
609     eptr >= md->end_subject &&
610 ph10 168 eptr > mstart)
611 nigel 77 md->hitend = TRUE;
612    
613 nigel 93 switch(op)
614     {
615     /* Handle a capturing bracket. If there is space in the offset vector, save
616     the current subject position in the working slot at the top of the vector.
617     We mustn't change the current values of the data slot, because they may be
618     set from a previous iteration of this group, and be referred to by a
619     reference inside the group.
620 nigel 77
621 nigel 93 If the bracket fails to match, we need to restore this value and also the
622     values of the final offsets, in case they were set by a previous iteration
623     of the same bracket.
624 nigel 77
625 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
626     a non-capturing bracket. Don't worry about setting the flag for the error
627     case here; that is handled in the code for KET. */
628 nigel 77
629 nigel 93 case OP_CBRA:
630     case OP_SCBRA:
631     number = GET2(ecode, 1+LINK_SIZE);
632 nigel 77 offset = number << 1;
633    
634     #ifdef DEBUG
635 nigel 93 printf("start bracket %d\n", number);
636     printf("subject=");
637 nigel 77 pchars(eptr, 16, TRUE, md);
638     printf("\n");
639     #endif
640    
641     if (offset < md->offset_max)
642     {
643     save_offset1 = md->offset_vector[offset];
644     save_offset2 = md->offset_vector[offset+1];
645     save_offset3 = md->offset_vector[md->offset_end - number];
646     save_capture_last = md->capture_last;
647    
648     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
649     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
650    
651 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
652 nigel 77 do
653     {
654 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
655     ims, eptrb, flags, RM1);
656 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
657     md->capture_last = save_capture_last;
658     ecode += GET(ecode, 1);
659     }
660     while (*ecode == OP_ALT);
661    
662     DPRINTF(("bracket %d failed\n", number));
663    
664     md->offset_vector[offset] = save_offset1;
665     md->offset_vector[offset+1] = save_offset2;
666     md->offset_vector[md->offset_end - number] = save_offset3;
667    
668     RRETURN(MATCH_NOMATCH);
669     }
670    
671 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
672     as a non-capturing bracket. */
673 nigel 77
674 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
675     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
676    
677 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
678 nigel 77
679 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
680     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
681    
682 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
683     final alternative within the brackets, we would return the result of a
684     recursive call to match() whatever happened. We can reduce stack usage by
685 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
686     is set.*/
687 nigel 77
688 nigel 93 case OP_BRA:
689     case OP_SBRA:
690     DPRINTF(("start non-capturing bracket\n"));
691     flags = (op >= OP_SBRA)? match_cbegroup : 0;
692 nigel 91 for (;;)
693 nigel 77 {
694 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
695 nigel 93 {
696 ph10 197 if (flags == 0) /* Not a possibly empty group */
697     {
698     ecode += _pcre_OP_lengths[*ecode];
699     DPRINTF(("bracket 0 tail recursion\n"));
700     goto TAIL_RECURSE;
701     }
702    
703     /* Possibly empty group; can't use tail recursion. */
704    
705     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
706     eptrb, flags, RM48);
707     RRETURN(rrc);
708 nigel 93 }
709 nigel 91
710     /* For non-final alternatives, continue the loop for a NOMATCH result;
711     otherwise return. */
712    
713 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
714     eptrb, flags, RM2);
715 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
716     ecode += GET(ecode, 1);
717     }
718 nigel 91 /* Control never reaches here. */
719 nigel 77
720     /* Conditional group: compilation checked that there are no more than
721     two branches. If the condition is false, skipping the first branch takes us
722     past the end if there is only one branch, but that's OK because that is
723 nigel 91 exactly what going to the ket would do. As there is only one branch to be
724     obeyed, we can use tail recursion to avoid using another stack frame. */
725 nigel 77
726     case OP_COND:
727 nigel 93 case OP_SCOND:
728     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
729 nigel 77 {
730 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
731     condition = md->recursive != NULL &&
732     (offset == RREF_ANY || offset == md->recursive->group_num);
733     ecode += condition? 3 : GET(ecode, 1);
734     }
735    
736     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
737     {
738 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
739 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
740     ecode += condition? 3 : GET(ecode, 1);
741 nigel 77 }
742    
743 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
744     {
745     condition = FALSE;
746     ecode += GET(ecode, 1);
747     }
748    
749 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
750 nigel 93 the final argument match_condassert causes it to stop at the end of an
751     assertion. */
752 nigel 77
753     else
754     {
755 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
756     match_condassert, RM3);
757 nigel 77 if (rrc == MATCH_MATCH)
758     {
759 nigel 93 condition = TRUE;
760     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
761 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
762     }
763     else if (rrc != MATCH_NOMATCH)
764     {
765     RRETURN(rrc); /* Need braces because of following else */
766     }
767 nigel 93 else
768     {
769     condition = FALSE;
770     ecode += GET(ecode, 1);
771     }
772     }
773 nigel 91
774 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
775 ph10 197 we can use tail recursion to avoid using another stack frame, except when
776     match_cbegroup is required for an unlimited repeat of a possibly empty
777     group. If the second alternative doesn't exist, we can just plough on. */
778 nigel 91
779 nigel 93 if (condition || *ecode == OP_ALT)
780     {
781 nigel 91 ecode += 1 + LINK_SIZE;
782 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
783     {
784     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
785     RRETURN(rrc);
786     }
787     else /* Group must match something */
788     {
789     flags = 0;
790     goto TAIL_RECURSE;
791     }
792 nigel 77 }
793 ph10 197 else /* Condition false & no 2nd alternative */
794 nigel 93 {
795     ecode += 1 + LINK_SIZE;
796     }
797     break;
798 nigel 77
799    
800 nigel 93 /* End of the pattern. If we are in a top-level recursion, we should
801     restore the offsets appropriately and continue from after the call. */
802 nigel 77
803     case OP_END:
804     if (md->recursive != NULL && md->recursive->group_num == 0)
805     {
806     recursion_info *rec = md->recursive;
807 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
808 nigel 77 md->recursive = rec->prevrec;
809     memmove(md->offset_vector, rec->offset_save,
810     rec->saved_max * sizeof(int));
811 ph10 168 mstart = rec->save_start;
812 nigel 77 ims = original_ims;
813     ecode = rec->after_call;
814     break;
815     }
816    
817     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
818     string - backtracking will then try other alternatives, if any. */
819    
820 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
821     md->end_match_ptr = eptr; /* Record where we ended */
822     md->end_offset_top = offset_top; /* and how many extracts were taken */
823     md->start_match_ptr = mstart; /* and the start (\K can modify) */
824 nigel 77 RRETURN(MATCH_MATCH);
825    
826     /* Change option settings */
827    
828     case OP_OPT:
829     ims = ecode[1];
830     ecode += 2;
831     DPRINTF(("ims set to %02lx\n", ims));
832     break;
833    
834     /* Assertion brackets. Check the alternative branches in turn - the
835     matching won't pass the KET for an assertion. If any one branch matches,
836     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
837     start of each branch to move the current point backwards, so the code at
838     this level is identical to the lookahead case. */
839    
840     case OP_ASSERT:
841     case OP_ASSERTBACK:
842     do
843     {
844 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
845     RM4);
846 nigel 77 if (rrc == MATCH_MATCH) break;
847     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
848     ecode += GET(ecode, 1);
849     }
850     while (*ecode == OP_ALT);
851     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
852    
853     /* If checking an assertion for a condition, return MATCH_MATCH. */
854    
855     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
856    
857     /* Continue from after the assertion, updating the offsets high water
858     mark, since extracts may have been taken during the assertion. */
859    
860     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
861     ecode += 1 + LINK_SIZE;
862     offset_top = md->end_offset_top;
863     continue;
864    
865     /* Negative assertion: all branches must fail to match */
866    
867     case OP_ASSERT_NOT:
868     case OP_ASSERTBACK_NOT:
869     do
870     {
871 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
872     RM5);
873 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
874     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
875     ecode += GET(ecode,1);
876     }
877     while (*ecode == OP_ALT);
878    
879     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
880    
881     ecode += 1 + LINK_SIZE;
882     continue;
883    
884     /* Move the subject pointer back. This occurs only at the start of
885     each branch of a lookbehind assertion. If we are too close to the start to
886     move back, this match function fails. When working with UTF-8 we move
887     back a number of characters, not bytes. */
888    
889     case OP_REVERSE:
890     #ifdef SUPPORT_UTF8
891     if (utf8)
892     {
893 nigel 93 i = GET(ecode, 1);
894     while (i-- > 0)
895 nigel 77 {
896     eptr--;
897     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
898     BACKCHAR(eptr)
899     }
900     }
901     else
902     #endif
903    
904     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
905    
906     {
907 nigel 93 eptr -= GET(ecode, 1);
908 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
909     }
910    
911     /* Skip to next op code */
912    
913     ecode += 1 + LINK_SIZE;
914     break;
915    
916     /* The callout item calls an external function, if one is provided, passing
917     details of the match so far. This is mainly for debugging, though the
918     function is able to force a failure. */
919    
920     case OP_CALLOUT:
921     if (pcre_callout != NULL)
922     {
923     pcre_callout_block cb;
924     cb.version = 1; /* Version 1 of the callout block */
925     cb.callout_number = ecode[1];
926     cb.offset_vector = md->offset_vector;
927 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
928 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
929 ph10 168 cb.start_match = mstart - md->start_subject;
930 nigel 77 cb.current_position = eptr - md->start_subject;
931     cb.pattern_position = GET(ecode, 2);
932     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
933     cb.capture_top = offset_top/2;
934     cb.capture_last = md->capture_last;
935     cb.callout_data = md->callout_data;
936     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
937     if (rrc < 0) RRETURN(rrc);
938     }
939     ecode += 2 + 2*LINK_SIZE;
940     break;
941    
942     /* Recursion either matches the current regex, or some subexpression. The
943     offset data is the offset to the starting bracket from the start of the
944     whole pattern. (This is so that it works from duplicated subpatterns.)
945    
946     If there are any capturing brackets started but not finished, we have to
947     save their starting points and reinstate them after the recursion. However,
948     we don't know how many such there are (offset_top records the completed
949     total) so we just have to save all the potential data. There may be up to
950     65535 such values, which is too large to put on the stack, but using malloc
951     for small numbers seems expensive. As a compromise, the stack is used when
952     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
953     is used. If that malloc fails, the save cannot be done, so the match is
954     abandoned at once by returning PCRE_ERROR_NOMEMORY; no partial save of the
955     offsets on the stack is attempted.
956    
957     There are also other values that have to be saved. We use a chained
958     sequence of blocks that actually live on the stack. Thanks to Robin Houston
959     for the original version of this logic. */
960    
961     case OP_RECURSE:
962     {
963     callpat = md->start_code + GET(ecode, 1);
964 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
965     GET2(callpat, 1 + LINK_SIZE);
966 nigel 77
967     /* Add to "recursing stack" */
968    
969     new_recursive.prevrec = md->recursive;
970     md->recursive = &new_recursive;
971    
972     /* Find where to continue from afterwards */
973    
974     ecode += 1 + LINK_SIZE;
975     new_recursive.after_call = ecode;
976    
977     /* Now save the offset data. */
978    
979     new_recursive.saved_max = md->offset_end;
980     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
981     new_recursive.offset_save = stacksave;
982     else
983     {
984     new_recursive.offset_save =
985     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
986     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
987     }
988    
989     memcpy(new_recursive.offset_save, md->offset_vector,
990     new_recursive.saved_max * sizeof(int));
991 ph10 168 new_recursive.save_start = mstart;
992     mstart = eptr;
993 nigel 77
994     /* OK, now we can do the recursion. For each top-level alternative we
995     restore the offset and recursion data. */
996    
997     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
998 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
999 nigel 77 do
1000     {
1001 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1002     md, ims, eptrb, flags, RM6);
1003 nigel 77 if (rrc == MATCH_MATCH)
1004     {
1005 nigel 87 DPRINTF(("Recursion matched\n"));
1006 nigel 77 md->recursive = new_recursive.prevrec;
1007     if (new_recursive.offset_save != stacksave)
1008     (pcre_free)(new_recursive.offset_save);
1009     RRETURN(MATCH_MATCH);
1010     }
1011 nigel 87 else if (rrc != MATCH_NOMATCH)
1012     {
1013     DPRINTF(("Recursion gave error %d\n", rrc));
1014     RRETURN(rrc);
1015     }
1016 nigel 77
1017     md->recursive = &new_recursive;
1018     memcpy(md->offset_vector, new_recursive.offset_save,
1019     new_recursive.saved_max * sizeof(int));
1020     callpat += GET(callpat, 1);
1021     }
1022     while (*callpat == OP_ALT);
1023    
1024     DPRINTF(("Recursion didn't match\n"));
1025     md->recursive = new_recursive.prevrec;
1026     if (new_recursive.offset_save != stacksave)
1027     (pcre_free)(new_recursive.offset_save);
1028     RRETURN(MATCH_NOMATCH);
1029     }
1030     /* Control never reaches here */
1031    
1032     /* "Once" brackets are like assertion brackets except that after a match,
1033     the point in the subject string is not moved back. Thus there can never be
1034     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1035     Check the alternative branches in turn - the matching won't pass the KET
1036     for this kind of subpattern. If any one branch matches, we carry on as at
1037     the end of a normal bracket, leaving the subject pointer. */
1038    
1039     case OP_ONCE:
1040 nigel 91 prev = ecode;
1041     saved_eptr = eptr;
1042    
1043     do
1044 nigel 77 {
1045 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1046 nigel 91 if (rrc == MATCH_MATCH) break;
1047     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1048     ecode += GET(ecode,1);
1049     }
1050     while (*ecode == OP_ALT);
1051 nigel 77
1052 nigel 91 /* If hit the end of the group (which could be repeated), fail */
1053 nigel 77
1054 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1055 nigel 77
1056 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1057     mark, since extracts may have been taken. */
1058 nigel 77
1059 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1060 nigel 77
1061 nigel 91 offset_top = md->end_offset_top;
1062     eptr = md->end_match_ptr;
1063 nigel 77
1064 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1065     happens for a repeating ket if no characters were matched in the group.
1066     This is the forcible breaking of infinite loops as implemented in Perl
1067     5.005. If there is an options reset, it will get obeyed in the normal
1068     course of events. */
1069 nigel 77
1070 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1071     {
1072     ecode += 1+LINK_SIZE;
1073     break;
1074     }
1075 nigel 77
1076 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1077     preceding bracket, in the appropriate order. The second "call" of match()
1078     uses tail recursion, to avoid using another stack frame. We need to reset
1079     any options that changed within the bracket before re-running it, so
1080     check the next opcode. */
1081 nigel 77
1082 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1083     {
1084     ims = (ims & ~PCRE_IMS) | ecode[4];
1085     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1086     }
1087 nigel 77
1088 nigel 91 if (*ecode == OP_KETRMIN)
1089     {
1090 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1091 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1092     ecode = prev;
1093 ph10 197 flags = 0;
1094 nigel 91 goto TAIL_RECURSE;
1095 nigel 77 }
1096 nigel 91 else /* OP_KETRMAX */
1097     {
1098 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1099 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1100     ecode += 1 + LINK_SIZE;
1101 ph10 197 flags = 0;
1102 nigel 91 goto TAIL_RECURSE;
1103     }
1104     /* Control never gets here */
1105 nigel 77
1106     /* An alternation is the end of a branch; scan along to find the end of the
1107     bracketed group and go to there. */
1108    
1109     case OP_ALT:
1110     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1111     break;
1112    
1113     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1114     that it may occur zero times. It may repeat infinitely, or not at all -
1115     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1116     repeat limits are compiled as a number of copies, with the optional ones
1117     preceded by BRAZERO or BRAMINZERO. */
1118    
1119     case OP_BRAZERO:
1120     {
1121     next = ecode+1;
1122 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1123 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1124     do next += GET(next,1); while (*next == OP_ALT);
1125 nigel 93 ecode = next + 1 + LINK_SIZE;
1126 nigel 77 }
1127     break;
1128    
1129     case OP_BRAMINZERO:
1130     {
1131     next = ecode+1;
1132 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1133 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1134 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1135     ecode++;
1136     }
1137     break;
1138    
1139 nigel 93 /* End of a group, repeated or non-repeating. */
1140 nigel 77
1141     case OP_KET:
1142     case OP_KETRMIN:
1143     case OP_KETRMAX:
1144 nigel 91 prev = ecode - GET(ecode, 1);
1145 nigel 77
1146 nigel 93 /* If this was a group that remembered the subject start, in order to break
1147     infinite repeats of empty string matches, retrieve the subject start from
1148     the chain. Otherwise, set it NULL. */
1149 nigel 77
1150 nigel 93 if (*prev >= OP_SBRA)
1151     {
1152     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1153     eptrb = eptrb->epb_prev; /* Backup to previous group */
1154     }
1155     else saved_eptr = NULL;
1156 nigel 77
1157 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1158     MATCH_MATCH, but record the current high water mark for use by positive
1159     assertions. Do this also for the "once" (atomic) groups. */
1160    
1161 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1162     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1163     *prev == OP_ONCE)
1164     {
1165     md->end_match_ptr = eptr; /* For ONCE */
1166     md->end_offset_top = offset_top;
1167     RRETURN(MATCH_MATCH);
1168     }
1169 nigel 77
1170 nigel 93 /* For capturing groups we have to check the group number back at the start
1171     and if necessary complete handling an extraction by setting the offsets and
1172     bumping the high water mark. Note that whole-pattern recursion is coded as
1173     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1174     when the OP_END is reached. Other recursion is handled here. */
1175 nigel 77
1176 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1177 nigel 91 {
1178 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1179 nigel 91 offset = number << 1;
1180 nigel 77
1181     #ifdef DEBUG
1182 nigel 91 printf("end bracket %d", number);
1183     printf("\n");
1184 nigel 77 #endif
1185    
1186 nigel 93 md->capture_last = number;
1187     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1188 nigel 91 {
1189 nigel 93 md->offset_vector[offset] =
1190     md->offset_vector[md->offset_end - number];
1191     md->offset_vector[offset+1] = eptr - md->start_subject;
1192     if (offset_top <= offset) offset_top = offset + 2;
1193     }
1194 nigel 77
1195 nigel 93 /* Handle a recursively called group. Restore the offsets
1196     appropriately and continue from after the call. */
1197 nigel 77
1198 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1199     {
1200     recursion_info *rec = md->recursive;
1201     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1202     md->recursive = rec->prevrec;
1203 ph10 168 mstart = rec->save_start;
1204 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1205     rec->saved_max * sizeof(int));
1206     ecode = rec->after_call;
1207     ims = original_ims;
1208     break;
1209 nigel 77 }
1210 nigel 91 }
1211 nigel 77
1212 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1213     flags, in case they got changed during the group. */
1214 nigel 77
1215 nigel 91 ims = original_ims;
1216     DPRINTF(("ims reset to %02lx\n", ims));
1217 nigel 77
1218 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1219     happens for a repeating ket if no characters were matched in the group.
1220     This is the forcible breaking of infinite loops as implemented in Perl
1221     5.005. If there is an options reset, it will get obeyed in the normal
1222     course of events. */
1223 nigel 77
1224 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1225     {
1226     ecode += 1 + LINK_SIZE;
1227     break;
1228     }
1229 nigel 77
1230 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1231     preceding bracket, in the appropriate order. In the second case, we can use
1232 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1233     unlimited repeat of a group that can match an empty string. */
1234 nigel 77
1235 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1236    
1237 nigel 91 if (*ecode == OP_KETRMIN)
1238     {
1239 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1240 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1241 ph10 197 if (flags != 0) /* Could match an empty string */
1242     {
1243     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1244     RRETURN(rrc);
1245     }
1246 nigel 91 ecode = prev;
1247     goto TAIL_RECURSE;
1248 nigel 77 }
1249 nigel 91 else /* OP_KETRMAX */
1250     {
1251 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1252 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1253     ecode += 1 + LINK_SIZE;
1254 ph10 197 flags = 0;
1255 nigel 91 goto TAIL_RECURSE;
1256     }
1257     /* Control never gets here */
1258 nigel 77
1259     /* Start of subject unless notbol, or after internal newline if multiline */
1260    
1261     case OP_CIRC:
1262     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1263     if ((ims & PCRE_MULTILINE) != 0)
1264     {
1265 nigel 91 if (eptr != md->start_subject &&
1266 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1267 nigel 77 RRETURN(MATCH_NOMATCH);
1268     ecode++;
1269     break;
1270     }
1271     /* ... else fall through */
1272    
1273     /* Start of subject assertion */
1274    
1275     case OP_SOD:
1276     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1277     ecode++;
1278     break;
1279    
1280     /* Start of match assertion */
1281    
1282     case OP_SOM:
1283     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1284     ecode++;
1285     break;
1286 ph10 172
1287 ph10 168 /* Reset the start of match point */
1288 ph10 172
1289 ph10 168 case OP_SET_SOM:
1290     mstart = eptr;
1291 ph10 172 ecode++;
1292     break;
1293 nigel 77
1294     /* Assert before internal newline if multiline, or before a terminating
1295     newline unless endonly is set, else end of subject unless noteol is set. */
1296    
1297     case OP_DOLL:
1298     if ((ims & PCRE_MULTILINE) != 0)
1299     {
1300     if (eptr < md->end_subject)
1301 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1302 nigel 77 else
1303     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1304     ecode++;
1305     break;
1306     }
1307     else
1308     {
1309     if (md->noteol) RRETURN(MATCH_NOMATCH);
1310     if (!md->endonly)
1311     {
1312 nigel 91 if (eptr != md->end_subject &&
1313 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1314 nigel 77 RRETURN(MATCH_NOMATCH);
1315     ecode++;
1316     break;
1317     }
1318     }
1319 nigel 91 /* ... else fall through for endonly */
1320 nigel 77
1321     /* End of subject assertion (\z) */
1322    
1323     case OP_EOD:
1324     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1325     ecode++;
1326     break;
1327    
1328     /* End of subject or ending \n assertion (\Z) */
1329    
1330     case OP_EODN:
1331 nigel 91 if (eptr != md->end_subject &&
1332 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1333 nigel 91 RRETURN(MATCH_NOMATCH);
1334 nigel 77 ecode++;
1335     break;
1336    
1337     /* Word boundary assertions */
1338    
1339     case OP_NOT_WORD_BOUNDARY:
1340     case OP_WORD_BOUNDARY:
1341     {
1342    
1343     /* Find out if the previous and current characters are "word" characters.
1344     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1345     be "non-word" characters. */
1346    
1347     #ifdef SUPPORT_UTF8
1348     if (utf8)
1349     {
1350     if (eptr == md->start_subject) prev_is_word = FALSE; else
1351     {
1352     const uschar *lastptr = eptr - 1;
1353     while((*lastptr & 0xc0) == 0x80) lastptr--;
1354     GETCHAR(c, lastptr);
1355     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1356     }
1357     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1358     {
1359     GETCHAR(c, eptr);
1360     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1361     }
1362     }
1363     else
1364     #endif
1365    
1366     /* More streamlined when not in UTF-8 mode */
1367    
1368     {
1369     prev_is_word = (eptr != md->start_subject) &&
1370     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1371     cur_is_word = (eptr < md->end_subject) &&
1372     ((md->ctypes[*eptr] & ctype_word) != 0);
1373     }
1374    
1375     /* Now see if the situation is what we want */
1376    
1377     if ((*ecode++ == OP_WORD_BOUNDARY)?
1378     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1379     RRETURN(MATCH_NOMATCH);
1380     }
1381     break;
1382    
1383     /* Match a single character type; inline for speed */
1384    
1385     case OP_ANY:
1386 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1387     {
1388 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1389 nigel 91 }
1390 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1391     if (utf8)
1392     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1393     ecode++;
1394     break;
1395    
1396     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1397     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1398    
1399     case OP_ANYBYTE:
1400     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1401     ecode++;
1402     break;
1403    
1404     case OP_NOT_DIGIT:
1405     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1406     GETCHARINCTEST(c, eptr);
1407     if (
1408     #ifdef SUPPORT_UTF8
1409     c < 256 &&
1410     #endif
1411     (md->ctypes[c] & ctype_digit) != 0
1412     )
1413     RRETURN(MATCH_NOMATCH);
1414     ecode++;
1415     break;
1416    
1417     case OP_DIGIT:
1418     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1419     GETCHARINCTEST(c, eptr);
1420     if (
1421     #ifdef SUPPORT_UTF8
1422     c >= 256 ||
1423     #endif
1424     (md->ctypes[c] & ctype_digit) == 0
1425     )
1426     RRETURN(MATCH_NOMATCH);
1427     ecode++;
1428     break;
1429    
1430     case OP_NOT_WHITESPACE:
1431     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1432     GETCHARINCTEST(c, eptr);
1433     if (
1434     #ifdef SUPPORT_UTF8
1435     c < 256 &&
1436     #endif
1437     (md->ctypes[c] & ctype_space) != 0
1438     )
1439     RRETURN(MATCH_NOMATCH);
1440     ecode++;
1441     break;
1442    
1443     case OP_WHITESPACE:
1444     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1445     GETCHARINCTEST(c, eptr);
1446     if (
1447     #ifdef SUPPORT_UTF8
1448     c >= 256 ||
1449     #endif
1450     (md->ctypes[c] & ctype_space) == 0
1451     )
1452     RRETURN(MATCH_NOMATCH);
1453     ecode++;
1454     break;
1455    
1456     case OP_NOT_WORDCHAR:
1457     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1458     GETCHARINCTEST(c, eptr);
1459     if (
1460     #ifdef SUPPORT_UTF8
1461     c < 256 &&
1462     #endif
1463     (md->ctypes[c] & ctype_word) != 0
1464     )
1465     RRETURN(MATCH_NOMATCH);
1466     ecode++;
1467     break;
1468    
1469     case OP_WORDCHAR:
1470     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1471     GETCHARINCTEST(c, eptr);
1472     if (
1473     #ifdef SUPPORT_UTF8
1474     c >= 256 ||
1475     #endif
1476     (md->ctypes[c] & ctype_word) == 0
1477     )
1478     RRETURN(MATCH_NOMATCH);
1479     ecode++;
1480     break;
1481    
1482 nigel 93 case OP_ANYNL:
1483     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1484     GETCHARINCTEST(c, eptr);
1485     switch(c)
1486     {
1487     default: RRETURN(MATCH_NOMATCH);
1488     case 0x000d:
1489     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1490     break;
1491     case 0x000a:
1492     case 0x000b:
1493     case 0x000c:
1494     case 0x0085:
1495     case 0x2028:
1496     case 0x2029:
1497     break;
1498     }
1499     ecode++;
1500     break;
1501    
1502 ph10 178 case OP_NOT_HSPACE:
1503     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1504     GETCHARINCTEST(c, eptr);
1505     switch(c)
1506     {
1507     default: break;
1508     case 0x09: /* HT */
1509     case 0x20: /* SPACE */
1510     case 0xa0: /* NBSP */
1511     case 0x1680: /* OGHAM SPACE MARK */
1512     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1513     case 0x2000: /* EN QUAD */
1514     case 0x2001: /* EM QUAD */
1515     case 0x2002: /* EN SPACE */
1516     case 0x2003: /* EM SPACE */
1517     case 0x2004: /* THREE-PER-EM SPACE */
1518     case 0x2005: /* FOUR-PER-EM SPACE */
1519     case 0x2006: /* SIX-PER-EM SPACE */
1520     case 0x2007: /* FIGURE SPACE */
1521     case 0x2008: /* PUNCTUATION SPACE */
1522     case 0x2009: /* THIN SPACE */
1523     case 0x200A: /* HAIR SPACE */
1524     case 0x202f: /* NARROW NO-BREAK SPACE */
1525     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1526     case 0x3000: /* IDEOGRAPHIC SPACE */
1527     RRETURN(MATCH_NOMATCH);
1528     }
1529     ecode++;
1530     break;
1531    
1532     case OP_HSPACE:
1533     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1534     GETCHARINCTEST(c, eptr);
1535     switch(c)
1536     {
1537     default: RRETURN(MATCH_NOMATCH);
1538     case 0x09: /* HT */
1539     case 0x20: /* SPACE */
1540     case 0xa0: /* NBSP */
1541     case 0x1680: /* OGHAM SPACE MARK */
1542     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1543     case 0x2000: /* EN QUAD */
1544     case 0x2001: /* EM QUAD */
1545     case 0x2002: /* EN SPACE */
1546     case 0x2003: /* EM SPACE */
1547     case 0x2004: /* THREE-PER-EM SPACE */
1548     case 0x2005: /* FOUR-PER-EM SPACE */
1549     case 0x2006: /* SIX-PER-EM SPACE */
1550     case 0x2007: /* FIGURE SPACE */
1551     case 0x2008: /* PUNCTUATION SPACE */
1552     case 0x2009: /* THIN SPACE */
1553     case 0x200A: /* HAIR SPACE */
1554     case 0x202f: /* NARROW NO-BREAK SPACE */
1555     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1556     case 0x3000: /* IDEOGRAPHIC SPACE */
1557     break;
1558     }
1559     ecode++;
1560     break;
1561    
1562     case OP_NOT_VSPACE:
1563     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1564     GETCHARINCTEST(c, eptr);
1565     switch(c)
1566     {
1567     default: break;
1568     case 0x0a: /* LF */
1569     case 0x0b: /* VT */
1570     case 0x0c: /* FF */
1571     case 0x0d: /* CR */
1572     case 0x85: /* NEL */
1573     case 0x2028: /* LINE SEPARATOR */
1574     case 0x2029: /* PARAGRAPH SEPARATOR */
1575     RRETURN(MATCH_NOMATCH);
1576     }
1577     ecode++;
1578     break;
1579    
1580     case OP_VSPACE:
1581     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1582     GETCHARINCTEST(c, eptr);
1583     switch(c)
1584     {
1585     default: RRETURN(MATCH_NOMATCH);
1586     case 0x0a: /* LF */
1587     case 0x0b: /* VT */
1588     case 0x0c: /* FF */
1589     case 0x0d: /* CR */
1590     case 0x85: /* NEL */
1591     case 0x2028: /* LINE SEPARATOR */
1592     case 0x2029: /* PARAGRAPH SEPARATOR */
1593     break;
1594     }
1595     ecode++;
1596     break;
1597    
1598 nigel 77 #ifdef SUPPORT_UCP
1599     /* Check the next character by Unicode property. We will get here only
1600     if the support is in the binary; otherwise a compile-time error occurs. */
1601    
1602     case OP_PROP:
1603     case OP_NOTPROP:
1604     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1605     GETCHARINCTEST(c, eptr);
1606     {
1607 nigel 87 int chartype, script;
1608     int category = _pcre_ucp_findprop(c, &chartype, &script);
1609 nigel 77
1610 nigel 87 switch(ecode[1])
1611     {
1612     case PT_ANY:
1613     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1614     break;
1615 nigel 77
1616 nigel 87 case PT_LAMP:
1617     if ((chartype == ucp_Lu ||
1618     chartype == ucp_Ll ||
1619     chartype == ucp_Lt) == (op == OP_NOTPROP))
1620 nigel 77 RRETURN(MATCH_NOMATCH);
1621 nigel 87 break;
1622    
1623     case PT_GC:
1624     if ((ecode[2] != category) == (op == OP_PROP))
1625 nigel 77 RRETURN(MATCH_NOMATCH);
1626 nigel 87 break;
1627    
1628     case PT_PC:
1629     if ((ecode[2] != chartype) == (op == OP_PROP))
1630     RRETURN(MATCH_NOMATCH);
1631     break;
1632    
1633     case PT_SC:
1634     if ((ecode[2] != script) == (op == OP_PROP))
1635     RRETURN(MATCH_NOMATCH);
1636     break;
1637    
1638     default:
1639     RRETURN(PCRE_ERROR_INTERNAL);
1640 nigel 77 }
1641 nigel 87
1642     ecode += 3;
1643 nigel 77 }
1644     break;
1645    
1646     /* Match an extended Unicode sequence. We will get here only if the support
1647     is in the binary; otherwise a compile-time error occurs. */
1648    
1649     case OP_EXTUNI:
1650     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1651     GETCHARINCTEST(c, eptr);
1652     {
1653 nigel 87 int chartype, script;
1654     int category = _pcre_ucp_findprop(c, &chartype, &script);
1655 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1656     while (eptr < md->end_subject)
1657     {
1658     int len = 1;
1659     if (!utf8) c = *eptr; else
1660     {
1661     GETCHARLEN(c, eptr, len);
1662     }
1663 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1664 nigel 77 if (category != ucp_M) break;
1665     eptr += len;
1666     }
1667     }
1668     ecode++;
1669     break;
1670     #endif
1671    
1672    
1673     /* Match a back reference, possibly repeatedly. Look past the end of the
1674     item to see if there is repeat information following. The code is similar
1675     to that for character classes, but repeated for efficiency. Then obey
1676     similar code to character type repeats - written out again for speed.
1677     However, if the referenced string is the empty string, always treat
1678     it as matched, any number of times (otherwise there could be infinite
1679     loops). */
1680    
1681     case OP_REF:
1682     {
1683     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1684     ecode += 3; /* Advance past item */
1685    
1686     /* If the reference is unset, set the length to be longer than the amount
1687     of subject left; this ensures that every attempt at a match fails. We
1688     can't just fail here, because of the possibility of quantifiers with zero
1689     minima. */
1690    
1691     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1692     md->end_subject - eptr + 1 :
1693     md->offset_vector[offset+1] - md->offset_vector[offset];
1694    
1695     /* Set up for repetition, or handle the non-repeated case */
1696    
1697     switch (*ecode)
1698     {
1699     case OP_CRSTAR:
1700     case OP_CRMINSTAR:
1701     case OP_CRPLUS:
1702     case OP_CRMINPLUS:
1703     case OP_CRQUERY:
1704     case OP_CRMINQUERY:
1705     c = *ecode++ - OP_CRSTAR;
1706     minimize = (c & 1) != 0;
1707     min = rep_min[c]; /* Pick up values from tables; */
1708     max = rep_max[c]; /* zero for max => infinity */
1709     if (max == 0) max = INT_MAX;
1710     break;
1711    
1712     case OP_CRRANGE:
1713     case OP_CRMINRANGE:
1714     minimize = (*ecode == OP_CRMINRANGE);
1715     min = GET2(ecode, 1);
1716     max = GET2(ecode, 3);
1717     if (max == 0) max = INT_MAX;
1718     ecode += 5;
1719     break;
1720    
1721     default: /* No repeat follows */
1722     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1723     eptr += length;
1724     continue; /* With the main loop */
1725     }
1726    
1727     /* If the length of the reference is zero, just continue with the
1728     main loop. */
1729    
1730     if (length == 0) continue;
1731    
1732     /* First, ensure the minimum number of matches are present. We get back
1733     the length of the reference string explicitly rather than passing the
1734     address of eptr, so that eptr can be a register variable. */
1735    
1736     for (i = 1; i <= min; i++)
1737     {
1738     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1739     eptr += length;
1740     }
1741    
1742     /* If min = max, continue at the same level without recursion.
1743     They are not both allowed to be zero. */
1744    
1745     if (min == max) continue;
1746    
1747     /* If minimizing, keep trying and advancing the pointer */
1748    
1749     if (minimize)
1750     {
1751     for (fi = min;; fi++)
1752     {
1753 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1754 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1755     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1756     RRETURN(MATCH_NOMATCH);
1757     eptr += length;
1758     }
1759     /* Control never gets here */
1760     }
1761    
1762     /* If maximizing, find the longest string and work backwards */
1763    
1764     else
1765     {
1766     pp = eptr;
1767     for (i = min; i < max; i++)
1768     {
1769     if (!match_ref(offset, eptr, length, md, ims)) break;
1770     eptr += length;
1771     }
1772     while (eptr >= pp)
1773     {
1774 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1775 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1776     eptr -= length;
1777     }
1778     RRETURN(MATCH_NOMATCH);
1779     }
1780     }
1781     /* Control never gets here */
1782    
1783    
1784    
1785     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1786     used when all the characters in the class have values in the range 0-255,
1787     and either the matching is caseful, or the characters are in the range
1788     0-127 when UTF-8 processing is enabled. The only difference between
1789     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1790     encountered.
1791    
1792     First, look past the end of the item to see if there is repeat information
1793     following. Then obey similar code to character type repeats - written out
1794     again for speed. */
1795    
1796     case OP_NCLASS:
1797     case OP_CLASS:
1798     {
1799     data = ecode + 1; /* Save for matching */
1800     ecode += 33; /* Advance past the item */
1801    
1802     switch (*ecode)
1803     {
1804     case OP_CRSTAR:
1805     case OP_CRMINSTAR:
1806     case OP_CRPLUS:
1807     case OP_CRMINPLUS:
1808     case OP_CRQUERY:
1809     case OP_CRMINQUERY:
1810     c = *ecode++ - OP_CRSTAR;
1811     minimize = (c & 1) != 0;
1812     min = rep_min[c]; /* Pick up values from tables; */
1813     max = rep_max[c]; /* zero for max => infinity */
1814     if (max == 0) max = INT_MAX;
1815     break;
1816    
1817     case OP_CRRANGE:
1818     case OP_CRMINRANGE:
1819     minimize = (*ecode == OP_CRMINRANGE);
1820     min = GET2(ecode, 1);
1821     max = GET2(ecode, 3);
1822     if (max == 0) max = INT_MAX;
1823     ecode += 5;
1824     break;
1825    
1826     default: /* No repeat follows */
1827     min = max = 1;
1828     break;
1829     }
1830    
1831     /* First, ensure the minimum number of matches are present. */
1832    
1833     #ifdef SUPPORT_UTF8
1834     /* UTF-8 mode */
1835     if (utf8)
1836     {
1837     for (i = 1; i <= min; i++)
1838     {
1839     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1840     GETCHARINC(c, eptr);
1841     if (c > 255)
1842     {
1843     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1844     }
1845     else
1846     {
1847     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1848     }
1849     }
1850     }
1851     else
1852     #endif
1853     /* Not UTF-8 mode */
1854     {
1855     for (i = 1; i <= min; i++)
1856     {
1857     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1858     c = *eptr++;
1859     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1860     }
1861     }
1862    
1863     /* If max == min we can continue with the main loop without the
1864     need to recurse. */
1865    
1866     if (min == max) continue;
1867    
1868     /* If minimizing, keep testing the rest of the expression and advancing
1869     the pointer while it matches the class. */
1870    
1871     if (minimize)
1872     {
1873     #ifdef SUPPORT_UTF8
1874     /* UTF-8 mode */
1875     if (utf8)
1876     {
1877     for (fi = min;; fi++)
1878     {
1879 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1880 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1881     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1882     GETCHARINC(c, eptr);
1883     if (c > 255)
1884     {
1885     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1886     }
1887     else
1888     {
1889     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1890     }
1891     }
1892     }
1893     else
1894     #endif
1895     /* Not UTF-8 mode */
1896     {
1897     for (fi = min;; fi++)
1898     {
1899 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1900 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1901     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1902     c = *eptr++;
1903     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1904     }
1905     }
1906     /* Control never gets here */
1907     }
1908    
1909     /* If maximizing, find the longest possible run, then work backwards. */
1910    
1911     else
1912     {
1913     pp = eptr;
1914    
1915     #ifdef SUPPORT_UTF8
1916     /* UTF-8 mode */
1917     if (utf8)
1918     {
1919     for (i = min; i < max; i++)
1920     {
1921     int len = 1;
1922     if (eptr >= md->end_subject) break;
1923     GETCHARLEN(c, eptr, len);
1924     if (c > 255)
1925     {
1926     if (op == OP_CLASS) break;
1927     }
1928     else
1929     {
1930     if ((data[c/8] & (1 << (c&7))) == 0) break;
1931     }
1932     eptr += len;
1933     }
1934     for (;;)
1935     {
1936 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1937 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1938     if (eptr-- == pp) break; /* Stop if tried at original pos */
1939     BACKCHAR(eptr);
1940     }
1941     }
1942     else
1943     #endif
1944     /* Not UTF-8 mode */
1945     {
1946     for (i = min; i < max; i++)
1947     {
1948     if (eptr >= md->end_subject) break;
1949     c = *eptr;
1950     if ((data[c/8] & (1 << (c&7))) == 0) break;
1951     eptr++;
1952     }
1953     while (eptr >= pp)
1954     {
1955 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1956 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1957 nigel 77 eptr--;
1958     }
1959     }
1960    
1961     RRETURN(MATCH_NOMATCH);
1962     }
1963     }
1964     /* Control never gets here */
1965    
1966    
1967     /* Match an extended character class. This opcode is encountered only
1968     in UTF-8 mode, because that's the only time it is compiled. */
1969    
1970     #ifdef SUPPORT_UTF8
1971     case OP_XCLASS:
1972     {
1973     data = ecode + 1 + LINK_SIZE; /* Save for matching */
1974     ecode += GET(ecode, 1); /* Advance past the item */
1975    
1976     switch (*ecode)
1977     {
1978     case OP_CRSTAR:
1979     case OP_CRMINSTAR:
1980     case OP_CRPLUS:
1981     case OP_CRMINPLUS:
1982     case OP_CRQUERY:
1983     case OP_CRMINQUERY:
1984     c = *ecode++ - OP_CRSTAR;
1985     minimize = (c & 1) != 0;
1986     min = rep_min[c]; /* Pick up values from tables; */
1987     max = rep_max[c]; /* zero for max => infinity */
1988     if (max == 0) max = INT_MAX;
1989     break;
1990    
1991     case OP_CRRANGE:
1992     case OP_CRMINRANGE:
1993     minimize = (*ecode == OP_CRMINRANGE);
1994     min = GET2(ecode, 1);
1995     max = GET2(ecode, 3);
1996     if (max == 0) max = INT_MAX;
1997     ecode += 5;
1998     break;
1999    
2000     default: /* No repeat follows */
2001     min = max = 1;
2002     break;
2003     }
2004    
2005     /* First, ensure the minimum number of matches are present. */
2006    
2007     for (i = 1; i <= min; i++)
2008     {
2009     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2010     GETCHARINC(c, eptr);
2011     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2012     }
2013    
2014     /* If max == min we can continue with the main loop without the
2015     need to recurse. */
2016    
2017     if (min == max) continue;
2018    
2019     /* If minimizing, keep testing the rest of the expression and advancing
2020     the pointer while it matches the class. */
2021    
2022     if (minimize)
2023     {
2024     for (fi = min;; fi++)
2025     {
2026 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2027 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2028     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2029     GETCHARINC(c, eptr);
2030     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2031     }
2032     /* Control never gets here */
2033     }
2034    
2035     /* If maximizing, find the longest possible run, then work backwards. */
2036    
2037     else
2038     {
2039     pp = eptr;
2040     for (i = min; i < max; i++)
2041     {
2042     int len = 1;
2043     if (eptr >= md->end_subject) break;
2044     GETCHARLEN(c, eptr, len);
2045     if (!_pcre_xclass(c, data)) break;
2046     eptr += len;
2047     }
2048     for(;;)
2049     {
2050 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2051 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2052     if (eptr-- == pp) break; /* Stop if tried at original pos */
2053     BACKCHAR(eptr)
2054     }
2055     RRETURN(MATCH_NOMATCH);
2056     }
2057    
2058     /* Control never gets here */
2059     }
2060     #endif /* End of XCLASS */
2061    
2062     /* Match a single character, casefully */
2063    
2064     case OP_CHAR:
2065     #ifdef SUPPORT_UTF8
2066     if (utf8)
2067     {
2068     length = 1;
2069     ecode++;
2070     GETCHARLEN(fc, ecode, length);
2071     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2072     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2073     }
2074     else
2075     #endif
2076    
2077     /* Non-UTF-8 mode */
2078     {
2079     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2080     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2081     ecode += 2;
2082     }
2083     break;
2084    
2085     /* Match a single character, caselessly */
2086    
2087     case OP_CHARNC:
2088     #ifdef SUPPORT_UTF8
2089     if (utf8)
2090     {
2091     length = 1;
2092     ecode++;
2093     GETCHARLEN(fc, ecode, length);
2094    
2095     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2096    
2097     /* If the pattern character's value is < 128, we have only one byte, and
2098     can use the fast lookup table. */
2099    
2100     if (fc < 128)
2101     {
2102     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2103     }
2104    
2105     /* Otherwise we must pick up the subject character */
2106    
2107     else
2108     {
2109 nigel 93 unsigned int dc;
2110 nigel 77 GETCHARINC(dc, eptr);
2111     ecode += length;
2112    
2113     /* If we have Unicode property support, we can use it to test the other
2114 nigel 87 case of the character, if there is one. */
2115 nigel 77
2116     if (fc != dc)
2117     {
2118     #ifdef SUPPORT_UCP
2119 nigel 87 if (dc != _pcre_ucp_othercase(fc))
2120 nigel 77 #endif
2121     RRETURN(MATCH_NOMATCH);
2122     }
2123     }
2124     }
2125     else
2126     #endif /* SUPPORT_UTF8 */
2127    
2128     /* Non-UTF-8 mode */
2129     {
2130     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2131     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2132     ecode += 2;
2133     }
2134     break;
2135    
2136 nigel 93 /* Match a single character repeatedly. */
2137 nigel 77
2138     case OP_EXACT:
2139     min = max = GET2(ecode, 1);
2140     ecode += 3;
2141     goto REPEATCHAR;
2142    
2143 nigel 93 case OP_POSUPTO:
2144     possessive = TRUE;
2145     /* Fall through */
2146    
2147 nigel 77 case OP_UPTO:
2148     case OP_MINUPTO:
2149     min = 0;
2150     max = GET2(ecode, 1);
2151     minimize = *ecode == OP_MINUPTO;
2152     ecode += 3;
2153     goto REPEATCHAR;
2154    
2155 nigel 93 case OP_POSSTAR:
2156     possessive = TRUE;
2157     min = 0;
2158     max = INT_MAX;
2159     ecode++;
2160     goto REPEATCHAR;
2161    
2162     case OP_POSPLUS:
2163     possessive = TRUE;
2164     min = 1;
2165     max = INT_MAX;
2166     ecode++;
2167     goto REPEATCHAR;
2168    
2169     case OP_POSQUERY:
2170     possessive = TRUE;
2171     min = 0;
2172     max = 1;
2173     ecode++;
2174     goto REPEATCHAR;
2175    
2176 nigel 77 case OP_STAR:
2177     case OP_MINSTAR:
2178     case OP_PLUS:
2179     case OP_MINPLUS:
2180     case OP_QUERY:
2181     case OP_MINQUERY:
2182     c = *ecode++ - OP_STAR;
2183     minimize = (c & 1) != 0;
2184     min = rep_min[c]; /* Pick up values from tables; */
2185     max = rep_max[c]; /* zero for max => infinity */
2186     if (max == 0) max = INT_MAX;
2187    
2188     /* Common code for all repeated single-character matches. We can give
2189     up quickly if there are fewer than the minimum number of characters left in
2190     the subject. */
2191    
2192     REPEATCHAR:
2193     #ifdef SUPPORT_UTF8
2194     if (utf8)
2195     {
2196     length = 1;
2197     charptr = ecode;
2198     GETCHARLEN(fc, ecode, length);
2199     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2200     ecode += length;
2201    
2202     /* Handle multibyte character matching specially here. There is
2203     support for caseless matching if UCP support is present. */
2204    
2205     if (length > 1)
2206     {
2207     #ifdef SUPPORT_UCP
2208 nigel 93 unsigned int othercase;
2209 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2210 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2211 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2212 ph10 115 else oclength = 0;
2213 nigel 77 #endif /* SUPPORT_UCP */
2214    
2215     for (i = 1; i <= min; i++)
2216     {
2217     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2218 ph10 123 #ifdef SUPPORT_UCP
2219 nigel 77 /* Need braces because of following else */
2220     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2221     else
2222     {
2223     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2224     eptr += oclength;
2225     }
2226 ph10 115 #else /* without SUPPORT_UCP */
2227     else { RRETURN(MATCH_NOMATCH); }
2228 ph10 123 #endif /* SUPPORT_UCP */
2229 nigel 77 }
2230    
2231     if (min == max) continue;
2232    
2233     if (minimize)
2234     {
2235     for (fi = min;; fi++)
2236     {
2237 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2238 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2239     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2240     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2241 ph10 123 #ifdef SUPPORT_UCP
2242 nigel 77 /* Need braces because of following else */
2243     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2244     else
2245     {
2246     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2247     eptr += oclength;
2248     }
2249 ph10 115 #else /* without SUPPORT_UCP */
2250     else { RRETURN (MATCH_NOMATCH); }
2251     #endif /* SUPPORT_UCP */
2252 nigel 77 }
2253     /* Control never gets here */
2254     }
2255 nigel 93
2256     else /* Maximize */
2257 nigel 77 {
2258     pp = eptr;
2259     for (i = min; i < max; i++)
2260     {
2261     if (eptr > md->end_subject - length) break;
2262     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2263 ph10 123 #ifdef SUPPORT_UCP
2264 nigel 77 else if (oclength == 0) break;
2265     else
2266     {
2267     if (memcmp(eptr, occhars, oclength) != 0) break;
2268     eptr += oclength;
2269     }
2270 ph10 115 #else /* without SUPPORT_UCP */
2271     else break;
2272 ph10 123 #endif /* SUPPORT_UCP */
2273 nigel 77 }
2274 nigel 93
2275     if (possessive) continue;
2276 ph10 120 for(;;)
2277 nigel 77 {
2278 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2279 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2280 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2281 ph10 115 #ifdef SUPPORT_UCP
2282     eptr--;
2283     BACKCHAR(eptr);
2284 ph10 123 #else /* without SUPPORT_UCP */
2285 nigel 77 eptr -= length;
2286 ph10 123 #endif /* SUPPORT_UCP */
2287 nigel 77 }
2288     }
2289     /* Control never gets here */
2290     }
2291    
2292     /* If the length of a UTF-8 character is 1, we fall through here, and
2293     obey the code as for non-UTF-8 characters below, though in this case the
2294     value of fc will always be < 128. */
2295     }
2296     else
2297     #endif /* SUPPORT_UTF8 */
2298    
2299     /* When not in UTF-8 mode, load a single-byte character. */
2300     {
2301     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2302     fc = *ecode++;
2303     }
2304    
2305     /* The value of fc at this point is always less than 256, though we may or
2306     may not be in UTF-8 mode. The code is duplicated for the caseless and
2307     caseful cases, for speed, since matching characters is likely to be quite
2308     common. First, ensure the minimum number of matches are present. If min =
2309     max, continue at the same level without recursing. Otherwise, if
2310     minimizing, keep trying the rest of the expression and advancing one
2311     matching character if failing, up to the maximum. Alternatively, if
2312     maximizing, find the maximum number of characters and work backwards. */
2313    
2314     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2315     max, eptr));
2316    
2317     if ((ims & PCRE_CASELESS) != 0)
2318     {
2319     fc = md->lcc[fc];
2320     for (i = 1; i <= min; i++)
2321     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2322     if (min == max) continue;
2323     if (minimize)
2324     {
2325     for (fi = min;; fi++)
2326     {
2327 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2328 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2329     if (fi >= max || eptr >= md->end_subject ||
2330     fc != md->lcc[*eptr++])
2331     RRETURN(MATCH_NOMATCH);
2332     }
2333     /* Control never gets here */
2334     }
2335 nigel 93 else /* Maximize */
2336 nigel 77 {
2337     pp = eptr;
2338     for (i = min; i < max; i++)
2339     {
2340     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2341     eptr++;
2342     }
2343 nigel 93 if (possessive) continue;
2344 nigel 77 while (eptr >= pp)
2345     {
2346 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2347 nigel 77 eptr--;
2348     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2349     }
2350     RRETURN(MATCH_NOMATCH);
2351     }
2352     /* Control never gets here */
2353     }
2354    
2355     /* Caseful comparisons (multibyte UTF-8 characters were handled above, so fc is a single byte here) */
2356    
2357     else
2358     {
2359     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2360     if (min == max) continue;
2361     if (minimize)
2362     {
2363     for (fi = min;; fi++)
2364     {
2365 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2366 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2367     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2368     RRETURN(MATCH_NOMATCH);
2369     }
2370     /* Control never gets here */
2371     }
2372 nigel 93 else /* Maximize */
2373 nigel 77 {
2374     pp = eptr;
2375     for (i = min; i < max; i++)
2376     {
2377     if (eptr >= md->end_subject || fc != *eptr) break;
2378     eptr++;
2379     }
2380 nigel 93 if (possessive) continue;
2381 nigel 77 while (eptr >= pp)
2382     {
2383 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2384 nigel 77 eptr--;
2385     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2386     }
2387     RRETURN(MATCH_NOMATCH);
2388     }
2389     }
2390     /* Control never gets here */
2391    
2392     /* Match a negated single one-byte character. The pattern character is one
2393     byte, but the subject character it is compared with can be multibyte. */
2394    
2395     case OP_NOT:
2396     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2397     ecode++;
2398     GETCHARINCTEST(c, eptr);
2399     if ((ims & PCRE_CASELESS) != 0)
2400     {
2401     #ifdef SUPPORT_UTF8
2402     if (c < 256)
2403     #endif
2404     c = md->lcc[c];
2405     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2406     }
2407     else
2408     {
2409     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2410     }
2411     break;
2412    
2413     /* Match a negated single one-byte character repeatedly. This is almost a
2414     repeat of the code for a repeated single character, but I haven't found a
2415     nice way of commoning these up that doesn't require a test of the
2416     positive/negative option for each character match. Maybe that wouldn't add
2417     very much to the time taken, but character matching *is* what this is all
2418     about... */
2419    
2420     case OP_NOTEXACT:
2421     min = max = GET2(ecode, 1);
2422     ecode += 3;
2423     goto REPEATNOTCHAR;
2424    
2425     case OP_NOTUPTO:
2426     case OP_NOTMINUPTO:
2427     min = 0;
2428     max = GET2(ecode, 1);
2429     minimize = *ecode == OP_NOTMINUPTO;
2430     ecode += 3;
2431     goto REPEATNOTCHAR;
2432    
2433 nigel 93 case OP_NOTPOSSTAR:
2434     possessive = TRUE;
2435     min = 0;
2436     max = INT_MAX;
2437     ecode++;
2438     goto REPEATNOTCHAR;
2439    
2440     case OP_NOTPOSPLUS:
2441     possessive = TRUE;
2442     min = 1;
2443     max = INT_MAX;
2444     ecode++;
2445     goto REPEATNOTCHAR;
2446    
2447     case OP_NOTPOSQUERY:
2448     possessive = TRUE;
2449     min = 0;
2450     max = 1;
2451     ecode++;
2452     goto REPEATNOTCHAR;
2453    
2454     case OP_NOTPOSUPTO:
2455     possessive = TRUE;
2456     min = 0;
2457     max = GET2(ecode, 1);
2458     ecode += 3;
2459     goto REPEATNOTCHAR;
2460    
2461 nigel 77 case OP_NOTSTAR:
2462     case OP_NOTMINSTAR:
2463     case OP_NOTPLUS:
2464     case OP_NOTMINPLUS:
2465     case OP_NOTQUERY:
2466     case OP_NOTMINQUERY:
2467     c = *ecode++ - OP_NOTSTAR;
2468     minimize = (c & 1) != 0;
2469     min = rep_min[c]; /* Pick up values from tables; */
2470     max = rep_max[c]; /* zero for max => infinity */
2471     if (max == 0) max = INT_MAX;
2472    
2473     /* Common code for all repeated single-byte matches. We can give up quickly
2474     if there are fewer than the minimum number of bytes left in the
2475     subject. */
2476    
2477     REPEATNOTCHAR:
2478     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2479     fc = *ecode++;
2480    
2481     /* The code is duplicated for the caseless and caseful cases, for speed,
2482     since matching characters is likely to be quite common. First, ensure the
2483     minimum number of matches are present. If min = max, continue at the same
2484     level without recursing. Otherwise, if minimizing, keep trying the rest of
2485     the expression and advancing one matching character if failing, up to the
2486     maximum. Alternatively, if maximizing, find the maximum number of
2487     characters and work backwards. */
2488    
2489     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2490     max, eptr));
2491    
2492     if ((ims & PCRE_CASELESS) != 0)
2493     {
2494     fc = md->lcc[fc];
2495    
2496     #ifdef SUPPORT_UTF8
2497     /* UTF-8 mode */
2498     if (utf8)
2499     {
2500 nigel 93 register unsigned int d;
2501 nigel 77 for (i = 1; i <= min; i++)
2502     {
2503     GETCHARINC(d, eptr);
2504     if (d < 256) d = md->lcc[d];
2505     if (fc == d) RRETURN(MATCH_NOMATCH);
2506     }
2507     }
2508     else
2509     #endif
2510    
2511     /* Not UTF-8 mode */
2512     {
2513     for (i = 1; i <= min; i++)
2514     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2515     }
2516    
2517     if (min == max) continue;
2518    
2519     if (minimize)
2520     {
2521     #ifdef SUPPORT_UTF8
2522     /* UTF-8 mode */
2523     if (utf8)
2524     {
2525 nigel 93 register unsigned int d;
2526 nigel 77 for (fi = min;; fi++)
2527     {
2528 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2529 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2530     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); /* Test the limits BEFORE reading a character, as the non-UTF-8 loop does */
2531     GETCHARINC(d, eptr); /* Safe now: at least one byte remains */
2532     if (d < 256) d = md->lcc[d]; /* lcc is only indexed with values below 256 */
2533     if (fc == d) RRETURN(MATCH_NOMATCH); /* Hit the negated character: give up */
2534     }
2535     }
2536     else
2537     #endif
2538     /* Not UTF-8 mode */
2539     {
2540     for (fi = min;; fi++)
2541     {
2542 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2543 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2544     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2545     RRETURN(MATCH_NOMATCH);
2546     }
2547     }
2548     /* Control never gets here */
2549     }
2550    
2551     /* Maximize case */
2552    
2553     else
2554     {
2555     pp = eptr;
2556    
2557     #ifdef SUPPORT_UTF8
2558     /* UTF-8 mode */
2559     if (utf8)
2560     {
2561 nigel 93 register unsigned int d;
2562 nigel 77 for (i = min; i < max; i++)
2563     {
2564     int len = 1;
2565     if (eptr >= md->end_subject) break;
2566     GETCHARLEN(d, eptr, len);
2567     if (d < 256) d = md->lcc[d];
2568     if (fc == d) break;
2569     eptr += len;
2570     }
2571 nigel 93 if (possessive) continue;
2572     for(;;)
2573 nigel 77 {
2574 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2575 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2576     if (eptr-- == pp) break; /* Stop if tried at original pos */
2577     BACKCHAR(eptr);
2578     }
2579     }
2580     else
2581     #endif
2582     /* Not UTF-8 mode */
2583     {
2584     for (i = min; i < max; i++)
2585     {
2586     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2587     eptr++;
2588     }
2589 nigel 93 if (possessive) continue;
2590 nigel 77 while (eptr >= pp)
2591     {
2592 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2593 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2594     eptr--;
2595     }
2596     }
2597    
2598     RRETURN(MATCH_NOMATCH);
2599     }
2600     /* Control never gets here */
2601     }
2602    
2603     /* Caseful comparisons */
2604    
2605     else
2606     {
2607     #ifdef SUPPORT_UTF8
2608     /* UTF-8 mode */
2609     if (utf8)
2610     {
2611 nigel 93 register unsigned int d;
2612 nigel 77 for (i = 1; i <= min; i++)
2613     {
2614     GETCHARINC(d, eptr);
2615     if (fc == d) RRETURN(MATCH_NOMATCH);
2616     }
2617     }
2618     else
2619     #endif
2620     /* Not UTF-8 mode */
2621     {
2622     for (i = 1; i <= min; i++)
2623     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2624     }
2625    
2626     if (min == max) continue;
2627    
2628     if (minimize)
2629     {
2630     #ifdef SUPPORT_UTF8
2631     /* UTF-8 mode */
2632     if (utf8)
2633     {
2634 nigel 93 register unsigned int d;
2635 nigel 77 for (fi = min;; fi++)
2636     {
2637 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2638 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2639     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH); /* Test the limits BEFORE reading a character, as the non-UTF-8 loop does */
2640     GETCHARINC(d, eptr); /* Safe now: at least one byte remains */
2641     if (fc == d) RRETURN(MATCH_NOMATCH); /* Hit the negated character: give up */
2642     }
2643     }
2644     else
2645     #endif
2646     /* Not UTF-8 mode */
2647     {
2648     for (fi = min;; fi++)
2649     {
2650 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2651 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2652     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2653     RRETURN(MATCH_NOMATCH);
2654     }
2655     }
2656     /* Control never gets here */
2657     }
2658    
2659     /* Maximize case */
2660    
2661     else
2662     {
2663     pp = eptr;
2664    
2665     #ifdef SUPPORT_UTF8
2666     /* UTF-8 mode */
2667     if (utf8)
2668     {
2669 nigel 93 register unsigned int d;
2670 nigel 77 for (i = min; i < max; i++)
2671     {
2672     int len = 1;
2673     if (eptr >= md->end_subject) break;
2674     GETCHARLEN(d, eptr, len);
2675     if (fc == d) break;
2676     eptr += len;
2677     }
2678 nigel 93 if (possessive) continue;
2679 nigel 77 for(;;)
2680     {
2681 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2682 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2683     if (eptr-- == pp) break; /* Stop if tried at original pos */
2684     BACKCHAR(eptr);
2685     }
2686     }
2687     else
2688     #endif
2689     /* Not UTF-8 mode */
2690     {
2691     for (i = min; i < max; i++)
2692     {
2693     if (eptr >= md->end_subject || fc == *eptr) break;
2694     eptr++;
2695     }
2696 nigel 93 if (possessive) continue;
2697 nigel 77 while (eptr >= pp)
2698     {
2699 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2700 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2701     eptr--;
2702     }
2703     }
2704    
2705     RRETURN(MATCH_NOMATCH);
2706     }
2707     }
2708     /* Control never gets here */
2709    
2710     /* Match a single character type repeatedly; several different opcodes
2711     share code. This is very similar to the code for single characters, but we
2712     repeat it in the interests of efficiency. */
2713    
2714     case OP_TYPEEXACT:
2715     min = max = GET2(ecode, 1);
2716     minimize = TRUE;
2717     ecode += 3;
2718     goto REPEATTYPE;
2719    
2720     case OP_TYPEUPTO:
2721     case OP_TYPEMINUPTO:
2722     min = 0;
2723     max = GET2(ecode, 1);
2724     minimize = *ecode == OP_TYPEMINUPTO;
2725     ecode += 3;
2726     goto REPEATTYPE;
2727    
2728 nigel 93 case OP_TYPEPOSSTAR:
2729     possessive = TRUE;
2730     min = 0;
2731     max = INT_MAX;
2732     ecode++;
2733     goto REPEATTYPE;
2734    
2735     case OP_TYPEPOSPLUS:
2736     possessive = TRUE;
2737     min = 1;
2738     max = INT_MAX;
2739     ecode++;
2740     goto REPEATTYPE;
2741    
2742     case OP_TYPEPOSQUERY:
2743     possessive = TRUE;
2744     min = 0;
2745     max = 1;
2746     ecode++;
2747     goto REPEATTYPE;
2748    
2749     case OP_TYPEPOSUPTO:
2750     possessive = TRUE;
2751     min = 0;
2752     max = GET2(ecode, 1);
2753     ecode += 3;
2754     goto REPEATTYPE;
2755    
2756 nigel 77 case OP_TYPESTAR:
2757     case OP_TYPEMINSTAR:
2758     case OP_TYPEPLUS:
2759     case OP_TYPEMINPLUS:
2760     case OP_TYPEQUERY:
2761     case OP_TYPEMINQUERY:
2762     c = *ecode++ - OP_TYPESTAR;
2763     minimize = (c & 1) != 0;
2764     min = rep_min[c]; /* Pick up values from tables; */
2765     max = rep_max[c]; /* zero for max => infinity */
2766     if (max == 0) max = INT_MAX;
2767    
2768     /* Common code for all repeated single character type matches. Note that
2769     in UTF-8 mode, '.' and several other types can match multibyte characters,
2770     but for OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR the valid ones are one-byte long. */
2771    
2772     REPEATTYPE:
2773     ctype = *ecode++; /* Code for the character type */
2774    
2775     #ifdef SUPPORT_UCP
2776     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2777     {
2778     prop_fail_result = ctype == OP_NOTPROP;
2779     prop_type = *ecode++;
2780 nigel 87 prop_value = *ecode++;
2781 nigel 77 }
2782     else prop_type = -1;
2783     #endif
2784    
2785     /* First, ensure the minimum number of matches are present. Use inline
2786     code for maximizing the speed, and do the type test once at the start
2787     (i.e. keep it out of the loop). Also we can test that there are at least
2788     the minimum number of bytes before we start. This isn't as effective in
2789     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2790     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2791     and single-bytes. */
2792    
2793     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2794     if (min > 0)
2795     {
2796     #ifdef SUPPORT_UCP
2797 nigel 87 if (prop_type >= 0)
2798 nigel 77 {
2799 nigel 87 switch(prop_type)
2800 nigel 77 {
2801 nigel 87 case PT_ANY:
2802     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2803     for (i = 1; i <= min; i++)
2804     {
2805     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2806 ph10 184 GETCHARINCTEST(c, eptr);
2807 nigel 87 }
2808     break;
2809    
2810     case PT_LAMP:
2811     for (i = 1; i <= min; i++)
2812     {
2813     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2814 ph10 184 GETCHARINCTEST(c, eptr);
2815 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2816     if ((prop_chartype == ucp_Lu ||
2817     prop_chartype == ucp_Ll ||
2818     prop_chartype == ucp_Lt) == prop_fail_result)
2819     RRETURN(MATCH_NOMATCH);
2820     }
2821     break;
2822    
2823     case PT_GC:
2824     for (i = 1; i <= min; i++)
2825     {
2826     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2827 ph10 184 GETCHARINCTEST(c, eptr);
2828 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2829     if ((prop_category == prop_value) == prop_fail_result)
2830     RRETURN(MATCH_NOMATCH);
2831     }
2832     break;
2833    
2834     case PT_PC:
2835     for (i = 1; i <= min; i++)
2836     {
2837     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2838 ph10 184 GETCHARINCTEST(c, eptr);
2839 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2840     if ((prop_chartype == prop_value) == prop_fail_result)
2841     RRETURN(MATCH_NOMATCH);
2842     }
2843     break;
2844    
2845     case PT_SC:
2846     for (i = 1; i <= min; i++)
2847     {
2848     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2849 ph10 184 GETCHARINCTEST(c, eptr);
2850 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2851     if ((prop_script == prop_value) == prop_fail_result)
2852     RRETURN(MATCH_NOMATCH);
2853     }
2854     break;
2855    
2856     default:
2857     RRETURN(PCRE_ERROR_INTERNAL);
2858 nigel 77 }
2859     }
2860    
2861     /* Match extended Unicode sequences. We will get here only if the
2862     support is in the binary; otherwise a compile-time error occurs. */
2863    
2864     else if (ctype == OP_EXTUNI)
2865     {
2866     for (i = 1; i <= min; i++)
2867     {
2868     GETCHARINCTEST(c, eptr);
2869 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2870 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2871     while (eptr < md->end_subject)
2872     {
2873     int len = 1;
2874     if (!utf8) c = *eptr; else
2875     {
2876     GETCHARLEN(c, eptr, len);
2877     }
2878 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2879 nigel 77 if (prop_category != ucp_M) break;
2880     eptr += len;
2881     }
2882     }
2883     }
2884    
2885     else
2886     #endif /* SUPPORT_UCP */
2887    
2888     /* Handle all other cases when the coding is UTF-8 */
2889    
2890     #ifdef SUPPORT_UTF8
2891     if (utf8) switch(ctype)
2892     {
2893     case OP_ANY:
2894     for (i = 1; i <= min; i++)
2895     {
2896     if (eptr >= md->end_subject ||
2897 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2898 nigel 77 RRETURN(MATCH_NOMATCH);
2899 nigel 91 eptr++;
2900 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2901     }
2902     break;
2903    
2904     case OP_ANYBYTE:
2905     eptr += min;
2906     break;
2907    
2908 nigel 93 case OP_ANYNL:
2909     for (i = 1; i <= min; i++)
2910     {
2911     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2912     GETCHARINC(c, eptr);
2913     switch(c)
2914     {
2915     default: RRETURN(MATCH_NOMATCH);
2916     case 0x000d:
2917     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2918     break;
2919     case 0x000a:
2920     case 0x000b:
2921     case 0x000c:
2922     case 0x0085:
2923     case 0x2028:
2924     case 0x2029:
2925     break;
2926     }
2927     }
2928     break;
2929    
2930 ph10 178 case OP_NOT_HSPACE:
2931     for (i = 1; i <= min; i++)
2932     {
2933     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2934     GETCHARINC(c, eptr);
2935     switch(c)
2936     {
2937     default: break;
2938     case 0x09: /* HT */
2939     case 0x20: /* SPACE */
2940     case 0xa0: /* NBSP */
2941     case 0x1680: /* OGHAM SPACE MARK */
2942     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2943     case 0x2000: /* EN QUAD */
2944     case 0x2001: /* EM QUAD */
2945     case 0x2002: /* EN SPACE */
2946     case 0x2003: /* EM SPACE */
2947     case 0x2004: /* THREE-PER-EM SPACE */
2948     case 0x2005: /* FOUR-PER-EM SPACE */
2949     case 0x2006: /* SIX-PER-EM SPACE */
2950     case 0x2007: /* FIGURE SPACE */
2951     case 0x2008: /* PUNCTUATION SPACE */
2952     case 0x2009: /* THIN SPACE */
2953     case 0x200A: /* HAIR SPACE */
2954     case 0x202f: /* NARROW NO-BREAK SPACE */
2955     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2956     case 0x3000: /* IDEOGRAPHIC SPACE */
2957     RRETURN(MATCH_NOMATCH);
2958     }
2959     }
2960     break;
2961 ph10 182
2962 ph10 178 case OP_HSPACE:
2963     for (i = 1; i <= min; i++)
2964     {
2965     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2966     GETCHARINC(c, eptr);
2967     switch(c)
2968     {
2969     default: RRETURN(MATCH_NOMATCH);
2970     case 0x09: /* HT */
2971     case 0x20: /* SPACE */
2972     case 0xa0: /* NBSP */
2973     case 0x1680: /* OGHAM SPACE MARK */
2974     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2975     case 0x2000: /* EN QUAD */
2976     case 0x2001: /* EM QUAD */
2977     case 0x2002: /* EN SPACE */
2978     case 0x2003: /* EM SPACE */
2979     case 0x2004: /* THREE-PER-EM SPACE */
2980     case 0x2005: /* FOUR-PER-EM SPACE */
2981     case 0x2006: /* SIX-PER-EM SPACE */
2982     case 0x2007: /* FIGURE SPACE */
2983     case 0x2008: /* PUNCTUATION SPACE */
2984     case 0x2009: /* THIN SPACE */
2985     case 0x200A: /* HAIR SPACE */
2986     case 0x202f: /* NARROW NO-BREAK SPACE */
2987     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2988     case 0x3000: /* IDEOGRAPHIC SPACE */
2989     break;
2990     }
2991     }
2992     break;
2993 ph10 182
2994 ph10 178 case OP_NOT_VSPACE:
2995     for (i = 1; i <= min; i++)
2996     {
2997     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2998     GETCHARINC(c, eptr);
2999     switch(c)
3000     {
3001     default: break;
3002     case 0x0a: /* LF */
3003     case 0x0b: /* VT */
3004     case 0x0c: /* FF */
3005     case 0x0d: /* CR */
3006     case 0x85: /* NEL */
3007     case 0x2028: /* LINE SEPARATOR */
3008     case 0x2029: /* PARAGRAPH SEPARATOR */
3009     RRETURN(MATCH_NOMATCH);
3010     }
3011     }
3012     break;
3013 ph10 182
3014 ph10 178 case OP_VSPACE:
3015     for (i = 1; i <= min; i++)
3016     {
3017     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3018     GETCHARINC(c, eptr);
3019     switch(c)
3020     {
3021     default: RRETURN(MATCH_NOMATCH);
3022     case 0x0a: /* LF */
3023     case 0x0b: /* VT */
3024     case 0x0c: /* FF */
3025     case 0x0d: /* CR */
3026     case 0x85: /* NEL */
3027     case 0x2028: /* LINE SEPARATOR */
3028     case 0x2029: /* PARAGRAPH SEPARATOR */
3029 ph10 182 break;
3030 ph10 178 }
3031     }
3032     break;
3033    
3034 nigel 77 case OP_NOT_DIGIT:
3035     for (i = 1; i <= min; i++)
3036     {
3037     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3038     GETCHARINC(c, eptr);
3039     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3040     RRETURN(MATCH_NOMATCH);
3041     }
3042     break;
3043    
3044     case OP_DIGIT:
3045     for (i = 1; i <= min; i++)
3046     {
3047     if (eptr >= md->end_subject ||
3048     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3049     RRETURN(MATCH_NOMATCH);
3050     /* No need to skip more bytes - we know it's a 1-byte character */
3051     }
3052     break;
3053    
3054     case OP_NOT_WHITESPACE:
3055     for (i = 1; i <= min; i++)
3056     {
3057     if (eptr >= md->end_subject ||
3058     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
3059     RRETURN(MATCH_NOMATCH);
3060     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3061     }
3062     break;
3063    
3064     case OP_WHITESPACE:
3065     for (i = 1; i <= min; i++)
3066     {
3067     if (eptr >= md->end_subject ||
3068     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3069     RRETURN(MATCH_NOMATCH);
3070     /* No need to skip more bytes - we know it's a 1-byte character */
3071     }
3072     break;
3073    
3074     case OP_NOT_WORDCHAR:
3075     for (i = 1; i <= min; i++)
3076     {
3077     if (eptr >= md->end_subject ||
3078     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
3079     RRETURN(MATCH_NOMATCH);
3080     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3081     }
3082     break;
3083    
3084     case OP_WORDCHAR:
3085     for (i = 1; i <= min; i++)
3086     {
3087     if (eptr >= md->end_subject ||
3088     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3089     RRETURN(MATCH_NOMATCH);
3090     /* No need to skip more bytes - we know it's a 1-byte character */
3091     }
3092     break;
3093    
3094     default:
3095     RRETURN(PCRE_ERROR_INTERNAL);
3096     } /* End switch(ctype) */
3097    
3098     else
3099     #endif /* SUPPORT_UTF8 */
3100    
3101     /* Code for the non-UTF-8 case for minimum matching of operators other
3102 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3103     number of bytes present, as this was tested above. */
3104 nigel 77
3105     switch(ctype)
3106     {
3107     case OP_ANY:
3108     if ((ims & PCRE_DOTALL) == 0)
3109     {
3110     for (i = 1; i <= min; i++)
3111 nigel 91 {
3112 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3113 nigel 91 eptr++;
3114     }
3115 nigel 77 }
3116     else eptr += min;
3117     break;
3118    
3119     case OP_ANYBYTE:
3120     eptr += min;
3121     break;
3122    
3123 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3124     bytes are present in this case. */
3125    
3126     case OP_ANYNL:
3127     for (i = 1; i <= min; i++)
3128     {
3129     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3130     switch(*eptr++)
3131     {
3132     default: RRETURN(MATCH_NOMATCH);
3133     case 0x000d:
3134     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3135     break;
3136     case 0x000a:
3137     case 0x000b:
3138     case 0x000c:
3139     case 0x0085:
3140     break;
3141     }
3142     }
3143     break;
3144    
3145 ph10 178 case OP_NOT_HSPACE:
3146     for (i = 1; i <= min; i++)
3147     {
3148     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3149     switch(*eptr++)
3150     {
3151     default: break;
3152     case 0x09: /* HT */
3153     case 0x20: /* SPACE */
3154     case 0xa0: /* NBSP */
3155     RRETURN(MATCH_NOMATCH);
3156     }
3157     }
3158     break;
3159    
3160     case OP_HSPACE:
3161     for (i = 1; i <= min; i++)
3162     {
3163     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3164     switch(*eptr++)
3165     {
3166     default: RRETURN(MATCH_NOMATCH);
3167     case 0x09: /* HT */
3168     case 0x20: /* SPACE */
3169     case 0xa0: /* NBSP */
3170 ph10 182 break;
3171 ph10 178 }
3172     }
3173     break;
3174    
3175     case OP_NOT_VSPACE:
3176     for (i = 1; i <= min; i++)
3177     {
3178     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3179     switch(*eptr++)
3180     {
3181     default: break;
3182     case 0x0a: /* LF */
3183     case 0x0b: /* VT */
3184     case 0x0c: /* FF */
3185     case 0x0d: /* CR */
3186     case 0x85: /* NEL */
3187     RRETURN(MATCH_NOMATCH);
3188     }
3189     }
3190     break;
3191    
3192     case OP_VSPACE:
3193     for (i = 1; i <= min; i++)
3194     {
3195     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3196     switch(*eptr++)
3197     {
3198     default: RRETURN(MATCH_NOMATCH);
3199     case 0x0a: /* LF */
3200     case 0x0b: /* VT */
3201     case 0x0c: /* FF */
3202     case 0x0d: /* CR */
3203     case 0x85: /* NEL */
3204 ph10 182 break;
3205 ph10 178 }
3206     }
3207     break;
3208    
3209 nigel 77 case OP_NOT_DIGIT:
3210     for (i = 1; i <= min; i++)
3211     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3212     break;
3213    
3214     case OP_DIGIT:
3215     for (i = 1; i <= min; i++)
3216     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3217     break;
3218    
3219     case OP_NOT_WHITESPACE:
3220     for (i = 1; i <= min; i++)
3221     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3222     break;
3223    
3224     case OP_WHITESPACE:
3225     for (i = 1; i <= min; i++)
3226     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3227     break;
3228    
3229     case OP_NOT_WORDCHAR:
3230     for (i = 1; i <= min; i++)
3231     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3232     RRETURN(MATCH_NOMATCH);
3233     break;
3234    
3235     case OP_WORDCHAR:
3236     for (i = 1; i <= min; i++)
3237     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3238     RRETURN(MATCH_NOMATCH);
3239     break;
3240    
3241     default:
3242     RRETURN(PCRE_ERROR_INTERNAL);
3243     }
3244     }
3245    
3246     /* If min = max, continue at the same level without recursing */
3247    
3248     if (min == max) continue;
3249    
3250     /* If minimizing, we have to test the rest of the pattern before each
3251     subsequent match. Again, separate the UTF-8 case for speed, and also
3252     separate the UCP cases. */
3253    
3254     if (minimize)
3255     {
3256     #ifdef SUPPORT_UCP
3257 nigel 87 if (prop_type >= 0)
3258 nigel 77 {
3259 nigel 87 switch(prop_type)
3260 nigel 77 {
3261 nigel 87 case PT_ANY:
3262     for (fi = min;; fi++)
3263     {
3264 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3265 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3266     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3267     GETCHARINC(c, eptr);
3268     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3269     }
3270 nigel 93 /* Control never gets here */
3271 nigel 87
3272     case PT_LAMP:
3273     for (fi = min;; fi++)
3274     {
3275 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3276 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3277     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3278     GETCHARINC(c, eptr);
3279     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3280     if ((prop_chartype == ucp_Lu ||
3281     prop_chartype == ucp_Ll ||
3282     prop_chartype == ucp_Lt) == prop_fail_result)
3283     RRETURN(MATCH_NOMATCH);
3284     }
3285 nigel 93 /* Control never gets here */
3286 nigel 87
3287     case PT_GC:
3288     for (fi = min;; fi++)
3289     {
3290 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3291 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3292     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3293     GETCHARINC(c, eptr);
3294     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3295     if ((prop_category == prop_value) == prop_fail_result)
3296     RRETURN(MATCH_NOMATCH);
3297     }
3298 nigel 93 /* Control never gets here */
3299 nigel 87
3300     case PT_PC:
3301     for (fi = min;; fi++)
3302     {
3303 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3304 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3305     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3306     GETCHARINC(c, eptr);
3307     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3308     if ((prop_chartype == prop_value) == prop_fail_result)
3309     RRETURN(MATCH_NOMATCH);
3310     }
3311 nigel 93 /* Control never gets here */
3312 nigel 87
3313     case PT_SC:
3314     for (fi = min;; fi++)
3315     {
3316 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3317 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3318     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3319     GETCHARINC(c, eptr);
3320     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3321     if ((prop_script == prop_value) == prop_fail_result)
3322     RRETURN(MATCH_NOMATCH);
3323     }
3324 nigel 93 /* Control never gets here */
3325 nigel 87
3326     default:
3327     RRETURN(PCRE_ERROR_INTERNAL);
3328 nigel 77 }
3329     }
3330    
3331     /* Match extended Unicode sequences. We will get here only if the
3332     support is in the binary; otherwise a compile-time error occurs. */
3333    
3334     else if (ctype == OP_EXTUNI)
3335     {
3336     for (fi = min;; fi++)
3337     {
3338 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3339 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3340     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3341     GETCHARINCTEST(c, eptr);
3342 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3343 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3344     while (eptr < md->end_subject)
3345     {
3346     int len = 1;
3347     if (!utf8) c = *eptr; else
3348     {
3349     GETCHARLEN(c, eptr, len);
3350     }
3351 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3352 nigel 77 if (prop_category != ucp_M) break;
3353     eptr += len;
3354     }
3355     }
3356     }
3357    
3358     else
3359     #endif /* SUPPORT_UCP */
3360    
3361     #ifdef SUPPORT_UTF8
3362     /* UTF-8 mode */
3363     if (utf8)
3364     {
3365     for (fi = min;; fi++)
3366     {
3367 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3368 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3369 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3370     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3371 nigel 93 IS_NEWLINE(eptr)))
3372 nigel 91 RRETURN(MATCH_NOMATCH);
3373 nigel 77
3374     GETCHARINC(c, eptr);
3375     switch(ctype)
3376     {
3377 nigel 91 case OP_ANY: /* This is the DOTALL case */
3378 nigel 77 break;
3379    
3380     case OP_ANYBYTE:
3381     break;
3382    
3383 nigel 93 case OP_ANYNL:
3384     switch(c)
3385     {
3386     default: RRETURN(MATCH_NOMATCH);
3387     case 0x000d:
3388     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3389     break;
3390     case 0x000a:
3391     case 0x000b:
3392     case 0x000c:
3393     case 0x0085:
3394     case 0x2028:
3395     case 0x2029:
3396     break;
3397     }
3398     break;
3399    
3400 ph10 178 case OP_NOT_HSPACE:
3401     switch(c)
3402     {
3403     default: break;
3404     case 0x09: /* HT */
3405     case 0x20: /* SPACE */
3406     case 0xa0: /* NBSP */
3407     case 0x1680: /* OGHAM SPACE MARK */
3408     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3409     case 0x2000: /* EN QUAD */
3410     case 0x2001: /* EM QUAD */
3411     case 0x2002: /* EN SPACE */
3412     case 0x2003: /* EM SPACE */
3413     case 0x2004: /* THREE-PER-EM SPACE */
3414     case 0x2005: /* FOUR-PER-EM SPACE */
3415     case 0x2006: /* SIX-PER-EM SPACE */
3416     case 0x2007: /* FIGURE SPACE */
3417     case 0x2008: /* PUNCTUATION SPACE */
3418     case 0x2009: /* THIN SPACE */
3419     case 0x200A: /* HAIR SPACE */
3420     case 0x202f: /* NARROW NO-BREAK SPACE */
3421     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3422     case 0x3000: /* IDEOGRAPHIC SPACE */
3423     RRETURN(MATCH_NOMATCH);
3424     }
3425     break;
3426    
3427     case OP_HSPACE:
3428     switch(c)
3429     {
3430     default: RRETURN(MATCH_NOMATCH);
3431     case 0x09: /* HT */
3432     case 0x20: /* SPACE */
3433     case 0xa0: /* NBSP */
3434     case 0x1680: /* OGHAM SPACE MARK */
3435     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3436     case 0x2000: /* EN QUAD */
3437     case 0x2001: /* EM QUAD */
3438     case 0x2002: /* EN SPACE */
3439     case 0x2003: /* EM SPACE */
3440     case 0x2004: /* THREE-PER-EM SPACE */
3441     case 0x2005: /* FOUR-PER-EM SPACE */
3442     case 0x2006: /* SIX-PER-EM SPACE */
3443     case 0x2007: /* FIGURE SPACE */
3444     case 0x2008: /* PUNCTUATION SPACE */
3445     case 0x2009: /* THIN SPACE */
3446     case 0x200A: /* HAIR SPACE */
3447     case 0x202f: /* NARROW NO-BREAK SPACE */
3448     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3449     case 0x3000: /* IDEOGRAPHIC SPACE */
3450     break;
3451     }
3452     break;
3453    
3454     case OP_NOT_VSPACE:
3455     switch(c)
3456     {
3457     default: break;
3458     case 0x0a: /* LF */
3459     case 0x0b: /* VT */
3460     case 0x0c: /* FF */
3461     case 0x0d: /* CR */
3462     case 0x85: /* NEL */
3463     case 0x2028: /* LINE SEPARATOR */
3464     case 0x2029: /* PARAGRAPH SEPARATOR */
3465     RRETURN(MATCH_NOMATCH);
3466     }
3467     break;
3468    
3469     case OP_VSPACE:
3470     switch(c)
3471     {
3472     default: RRETURN(MATCH_NOMATCH);
3473     case 0x0a: /* LF */
3474     case 0x0b: /* VT */
3475     case 0x0c: /* FF */
3476     case 0x0d: /* CR */
3477     case 0x85: /* NEL */
3478     case 0x2028: /* LINE SEPARATOR */
3479     case 0x2029: /* PARAGRAPH SEPARATOR */
3480     break;
3481     }
3482     break;
3483    
3484 nigel 77 case OP_NOT_DIGIT:
3485     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3486     RRETURN(MATCH_NOMATCH);
3487     break;
3488    
3489     case OP_DIGIT:
3490     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3491     RRETURN(MATCH_NOMATCH);
3492     break;
3493    
3494     case OP_NOT_WHITESPACE:
3495     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3496     RRETURN(MATCH_NOMATCH);
3497     break;
3498    
3499     case OP_WHITESPACE:
3500     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3501     RRETURN(MATCH_NOMATCH);
3502     break;
3503    
3504     case OP_NOT_WORDCHAR:
3505     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3506     RRETURN(MATCH_NOMATCH);
3507     break;
3508    
3509     case OP_WORDCHAR:
3510     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3511     RRETURN(MATCH_NOMATCH);
3512     break;
3513    
3514     default:
3515     RRETURN(PCRE_ERROR_INTERNAL);
3516     }
3517     }
3518     }
3519     else
3520     #endif
3521     /* Not UTF-8 mode */
3522     {
3523     for (fi = min;; fi++)
3524     {
3525 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3526 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3527 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3528 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3529 nigel 91 RRETURN(MATCH_NOMATCH);
3530    
3531 nigel 77 c = *eptr++;
3532     switch(ctype)
3533     {
3534 nigel 91 case OP_ANY: /* This is the DOTALL case */
3535 nigel 77 break;
3536    
3537     case OP_ANYBYTE:
3538     break;
3539    
3540 nigel 93 case OP_ANYNL:
3541     switch(c)
3542     {
3543     default: RRETURN(MATCH_NOMATCH);
3544     case 0x000d:
3545     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3546     break;
3547     case 0x000a:
3548     case 0x000b:
3549     case 0x000c:
3550     case 0x0085:
3551     break;
3552     }
3553     break;
3554    
3555 ph10 178 case OP_NOT_HSPACE:
3556     switch(c)
3557     {
3558     default: break;
3559     case 0x09: /* HT */
3560     case 0x20: /* SPACE */
3561     case 0xa0: /* NBSP */
3562     RRETURN(MATCH_NOMATCH);
3563     }
3564     break;
3565    
3566     case OP_HSPACE:
3567     switch(c)
3568     {
3569     default: RRETURN(MATCH_NOMATCH);
3570     case 0x09: /* HT */
3571     case 0x20: /* SPACE */
3572     case 0xa0: /* NBSP */
3573     break;
3574     }
3575     break;
3576    
3577     case OP_NOT_VSPACE:
3578     switch(c)
3579     {
3580     default: break;
3581     case 0x0a: /* LF */
3582     case 0x0b: /* VT */
3583     case 0x0c: /* FF */
3584     case 0x0d: /* CR */
3585     case 0x85: /* NEL */
3586     RRETURN(MATCH_NOMATCH);
3587     }
3588     break;
3589    
3590     case OP_VSPACE:
3591     switch(c)
3592     {
3593     default: RRETURN(MATCH_NOMATCH);
3594     case 0x0a: /* LF */
3595     case 0x0b: /* VT */
3596     case 0x0c: /* FF */
3597     case 0x0d: /* CR */
3598     case 0x85: /* NEL */
3599     break;
3600     }
3601     break;
3602    
3603 nigel 77 case OP_NOT_DIGIT:
3604     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3605     break;
3606    
3607     case OP_DIGIT:
3608     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3609     break;
3610    
3611     case OP_NOT_WHITESPACE:
3612     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3613     break;
3614    
3615     case OP_WHITESPACE:
3616     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3617     break;
3618    
3619     case OP_NOT_WORDCHAR:
3620     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3621     break;
3622    
3623     case OP_WORDCHAR:
3624     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3625     break;
3626    
3627     default:
3628     RRETURN(PCRE_ERROR_INTERNAL);
3629     }
3630     }
3631     }
3632     /* Control never gets here */
3633     }
3634    
3635 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3636 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3637     UTF-8 and UCP stuff separate. */
3638    
3639     else
3640     {
3641     pp = eptr; /* Remember where we started */
3642    
3643     #ifdef SUPPORT_UCP
3644 nigel 87 if (prop_type >= 0)
3645 nigel 77 {
3646 nigel 87 switch(prop_type)
3647 nigel 77 {
3648 nigel 87 case PT_ANY:
3649     for (i = min; i < max; i++)
3650     {
3651     int len = 1;
3652     if (eptr >= md->end_subject) break;
3653     GETCHARLEN(c, eptr, len);
3654     if (prop_fail_result) break;
3655     eptr+= len;
3656     }
3657     break;
3658    
3659     case PT_LAMP:
3660     for (i = min; i < max; i++)
3661     {
3662     int len = 1;
3663     if (eptr >= md->end_subject) break;
3664     GETCHARLEN(c, eptr, len);
3665     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3666     if ((prop_chartype == ucp_Lu ||
3667     prop_chartype == ucp_Ll ||
3668     prop_chartype == ucp_Lt) == prop_fail_result)
3669     break;
3670     eptr+= len;
3671     }
3672     break;
3673    
3674     case PT_GC:
3675     for (i = min; i < max; i++)
3676     {
3677     int len = 1;
3678     if (eptr >= md->end_subject) break;
3679     GETCHARLEN(c, eptr, len);
3680     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3681     if ((prop_category == prop_value) == prop_fail_result)
3682     break;
3683     eptr+= len;
3684     }
3685     break;
3686    
3687     case PT_PC:
3688     for (i = min; i < max; i++)
3689     {
3690     int len = 1;
3691     if (eptr >= md->end_subject) break;
3692     GETCHARLEN(c, eptr, len);
3693     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3694     if ((prop_chartype == prop_value) == prop_fail_result)
3695     break;
3696     eptr+= len;
3697     }
3698     break;
3699    
3700     case PT_SC:
3701     for (i = min; i < max; i++)
3702     {
3703     int len = 1;
3704     if (eptr >= md->end_subject) break;
3705     GETCHARLEN(c, eptr, len);
3706     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3707     if ((prop_script == prop_value) == prop_fail_result)
3708     break;
3709     eptr+= len;
3710     }
3711     break;
3712 nigel 77 }
3713    
3714     /* eptr is now past the end of the maximum run */
3715    
3716 nigel 93 if (possessive) continue;
3717 nigel 77 for(;;)
3718     {
3719 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3720 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3721     if (eptr-- == pp) break; /* Stop if tried at original pos */
3722     BACKCHAR(eptr);
3723     }
3724     }
3725    
3726     /* Match extended Unicode sequences. We will get here only if the
3727     support is in the binary; otherwise a compile-time error occurs. */
3728    
3729     else if (ctype == OP_EXTUNI)
3730     {
3731     for (i = min; i < max; i++)
3732     {
3733     if (eptr >= md->end_subject) break;
3734     GETCHARINCTEST(c, eptr);
3735 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3736 nigel 77 if (prop_category == ucp_M) break;
3737     while (eptr < md->end_subject)
3738     {
3739     int len = 1;
3740     if (!utf8) c = *eptr; else
3741     {
3742     GETCHARLEN(c, eptr, len);
3743     }
3744 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3745 nigel 77 if (prop_category != ucp_M) break;
3746     eptr += len;
3747     }
3748     }
3749    
3750     /* eptr is now past the end of the maximum run */
3751    
3752 nigel 93 if (possessive) continue;
3753 nigel 77 for(;;)
3754     {
3755 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3756 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3757     if (eptr-- == pp) break; /* Stop if tried at original pos */
3758     for (;;) /* Move back over one extended */
3759     {
3760     int len = 1;
3761     BACKCHAR(eptr);
3762     if (!utf8) c = *eptr; else
3763     {
3764     GETCHARLEN(c, eptr, len);
3765     }
3766 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3767 nigel 77 if (prop_category != ucp_M) break;
3768     eptr--;
3769     }
3770     }
3771     }
3772    
3773     else
3774     #endif /* SUPPORT_UCP */
3775    
3776     #ifdef SUPPORT_UTF8
3777     /* UTF-8 mode */
3778    
3779     if (utf8)
3780     {
3781     switch(ctype)
3782     {
3783     case OP_ANY:
3784     if (max < INT_MAX)
3785     {
3786     if ((ims & PCRE_DOTALL) == 0)
3787     {
3788     for (i = min; i < max; i++)
3789     {
3790 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3791 nigel 77 eptr++;
3792     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3793     }
3794     }
3795     else
3796     {
3797     for (i = min; i < max; i++)
3798     {
3799 nigel 91 if (eptr >= md->end_subject) break;
3800 nigel 77 eptr++;
3801     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3802     }
3803     }
3804     }
3805    
3806     /* Handle unlimited UTF-8 repeat */
3807    
3808     else
3809     {
3810     if ((ims & PCRE_DOTALL) == 0)
3811     {
3812     for (i = min; i < max; i++)
3813     {
3814 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3815 nigel 77 eptr++;
3816 ph10 190 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3817 nigel 77 }
3818     }
3819     else
3820     {
3821 ph10 190 eptr = md->end_subject;
3822 nigel 77 }
3823     }
3824     break;
3825    
3826     /* The byte case is the same as non-UTF8 */
3827    
3828     case OP_ANYBYTE:
3829     c = max - min;
3830 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3831     c = md->end_subject - eptr;
3832 nigel 77 eptr += c;
3833     break;
3834    
3835 nigel 93 case OP_ANYNL:
3836     for (i = min; i < max; i++)
3837     {
3838     int len = 1;
3839     if (eptr >= md->end_subject) break;
3840     GETCHARLEN(c, eptr, len);
3841     if (c == 0x000d)
3842     {
3843     if (++eptr >= md->end_subject) break;
3844     if (*eptr == 0x000a) eptr++;
3845     }
3846     else
3847     {
3848     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3849     c != 0x0085 && c != 0x2028 && c != 0x2029)
3850     break;
3851     eptr += len;
3852     }
3853     }
3854     break;
3855    
3856 ph10 178 case OP_NOT_HSPACE:
3857 ph10 182 case OP_HSPACE:
3858 ph10 178 for (i = min; i < max; i++)
3859     {
3860 ph10 182 BOOL gotspace;
3861 ph10 178 int len = 1;
3862     if (eptr >= md->end_subject) break;
3863     GETCHARLEN(c, eptr, len);
3864     switch(c)
3865 ph10 182 {
3866     default: gotspace = FALSE; break;
3867 ph10 178 case 0x09: /* HT */
3868     case 0x20: /* SPACE */
3869     case 0xa0: /* NBSP */
3870     case 0x1680: /* OGHAM SPACE MARK */
3871     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3872     case 0x2000: /* EN QUAD */
3873     case 0x2001: /* EM QUAD */
3874     case 0x2002: /* EN SPACE */
3875     case 0x2003: /* EM SPACE */
3876     case 0x2004: /* THREE-PER-EM SPACE */
3877     case 0x2005: /* FOUR-PER-EM SPACE */
3878     case 0x2006: /* SIX-PER-EM SPACE */
3879     case 0x2007: /* FIGURE SPACE */
3880     case 0x2008: /* PUNCTUATION SPACE */
3881     case 0x2009: /* THIN SPACE */
3882     case 0x200A: /* HAIR SPACE */
3883     case 0x202f: /* NARROW NO-BREAK SPACE */
3884     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3885     case 0x3000: /* IDEOGRAPHIC SPACE */
3886     gotspace = TRUE;
3887 ph10 182 break;
3888 ph10 178 }
3889     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3890     eptr += len;
3891     }
3892     break;
3893    
3894     case OP_NOT_VSPACE:
3895 ph10 182 case OP_VSPACE:
3896 ph10 178 for (i = min; i < max; i++)
3897     {
3898 ph10 182 BOOL gotspace;
3899 ph10 178 int len = 1;
3900     if (eptr >= md->end_subject) break;
3901     GETCHARLEN(c, eptr, len);
3902     switch(c)
3903     {
3904 ph10 182 default: gotspace = FALSE; break;
3905 ph10 178 case 0x0a: /* LF */
3906     case 0x0b: /* VT */
3907     case 0x0c: /* FF */
3908     case 0x0d: /* CR */
3909     case 0x85: /* NEL */
3910     case 0x2028: /* LINE SEPARATOR */
3911     case 0x2029: /* PARAGRAPH SEPARATOR */
3912     gotspace = TRUE;
3913     break;
3914     }
3915 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3916 ph10 178 eptr += len;
3917     }
3918     break;
3919    
3920 nigel 77 case OP_NOT_DIGIT:
3921     for (i = min; i < max; i++)
3922     {
3923     int len = 1;
3924     if (eptr >= md->end_subject) break;
3925     GETCHARLEN(c, eptr, len);
3926     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3927     eptr+= len;
3928     }
3929     break;
3930    
3931     case OP_DIGIT:
3932     for (i = min; i < max; i++)
3933     {
3934     int len = 1;
3935     if (eptr >= md->end_subject) break;
3936     GETCHARLEN(c, eptr, len);
3937     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3938     eptr+= len;
3939     }
3940     break;
3941    
3942     case OP_NOT_WHITESPACE:
3943     for (i = min; i < max; i++)
3944     {
3945     int len = 1;
3946     if (eptr >= md->end_subject) break;
3947     GETCHARLEN(c, eptr, len);
3948     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3949     eptr+= len;
3950     }
3951     break;
3952    
3953     case OP_WHITESPACE:
3954     for (i = min; i < max; i++)
3955     {
3956     int len = 1;
3957     if (eptr >= md->end_subject) break;
3958     GETCHARLEN(c, eptr, len);
3959     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3960     eptr+= len;
3961     }
3962     break;
3963    
3964     case OP_NOT_WORDCHAR:
3965     for (i = min; i < max; i++)
3966     {
3967     int len = 1;
3968     if (eptr >= md->end_subject) break;
3969     GETCHARLEN(c, eptr, len);
3970     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
3971     eptr+= len;
3972     }
3973     break;
3974    
3975     case OP_WORDCHAR:
3976     for (i = min; i < max; i++)
3977     {
3978     int len = 1;
3979     if (eptr >= md->end_subject) break;
3980     GETCHARLEN(c, eptr, len);
3981     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
3982     eptr+= len;
3983     }
3984     break;
3985    
3986     default:
3987     RRETURN(PCRE_ERROR_INTERNAL);
3988     }
3989    
3990     /* eptr is now past the end of the maximum run */
3991    
3992 nigel 93 if (possessive) continue;
3993 nigel 77 for(;;)
3994     {
3995 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
3996 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3997     if (eptr-- == pp) break; /* Stop if tried at original pos */
3998     BACKCHAR(eptr);
3999     }
4000     }
4001     else
4002     #endif
4003    
4004     /* Not UTF-8 mode */
4005     {
4006     switch(ctype)
4007     {
4008     case OP_ANY:
4009     if ((ims & PCRE_DOTALL) == 0)
4010     {
4011     for (i = min; i < max; i++)
4012     {
4013 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4014 nigel 77 eptr++;
4015     }
4016     break;
4017     }
4018     /* For DOTALL case, fall through and treat as \C */
4019    
4020     case OP_ANYBYTE:
4021     c = max - min;
4022 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4023     c = md->end_subject - eptr;
4024 nigel 77 eptr += c;
4025     break;
4026    
4027 nigel 93 case OP_ANYNL:
4028     for (i = min; i < max; i++)
4029     {
4030     if (eptr >= md->end_subject) break;
4031     c = *eptr;
4032     if (c == 0x000d)
4033     {
4034     if (++eptr >= md->end_subject) break;
4035     if (*eptr == 0x000a) eptr++;
4036     }
4037     else
4038     {
4039     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
4040     break;
4041     eptr++;
4042     }
4043     }
4044     break;
4045    
4046 ph10 178 case OP_NOT_HSPACE:
4047     for (i = min; i < max; i++)
4048     {
4049     if (eptr >= md->end_subject) break;
4050     c = *eptr;
4051     if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4052 ph10 182 eptr++;
4053 ph10 178 }
4054     break;
4055    
4056     case OP_HSPACE:
4057     for (i = min; i < max; i++)
4058     {
4059     if (eptr >= md->end_subject) break;
4060     c = *eptr;
4061     if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4062 ph10 182 eptr++;
4063 ph10 178 }
4064     break;
4065    
4066     case OP_NOT_VSPACE:
4067     for (i = min; i < max; i++)
4068     {
4069     if (eptr >= md->end_subject) break;
4070     c = *eptr;
4071     if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4072     break;
4073 ph10 182 eptr++;
4074 ph10 178 }
4075     break;
4076    
4077     case OP_VSPACE:
4078     for (i = min; i < max; i++)
4079     {
4080     if (eptr >= md->end_subject) break;
4081     c = *eptr;
4082     if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4083     break;
4084     eptr++;
4085     }
4086     break;
4087    
4088 nigel 77 case OP_NOT_DIGIT:
4089     for (i = min; i < max; i++)
4090     {
4091     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4092     break;
4093     eptr++;
4094     }
4095     break;
4096    
4097     case OP_DIGIT:
4098     for (i = min; i < max; i++)
4099     {
4100     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4101     break;
4102     eptr++;
4103     }
4104     break;
4105    
4106     case OP_NOT_WHITESPACE:
4107     for (i = min; i < max; i++)
4108     {
4109     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4110     break;
4111     eptr++;
4112     }
4113     break;
4114    
4115     case OP_WHITESPACE:
4116     for (i = min; i < max; i++)
4117     {
4118     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4119     break;
4120     eptr++;
4121     }
4122     break;
4123    
4124     case OP_NOT_WORDCHAR:
4125     for (i = min; i < max; i++)
4126     {
4127     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4128     break;
4129     eptr++;
4130     }
4131     break;
4132    
4133     case OP_WORDCHAR:
4134     for (i = min; i < max; i++)
4135     {
4136     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4137     break;
4138     eptr++;
4139     }
4140     break;
4141    
4142     default:
4143     RRETURN(PCRE_ERROR_INTERNAL);
4144     }
4145    
4146     /* eptr is now past the end of the maximum run */
4147    
4148 nigel 93 if (possessive) continue;
4149 nigel 77 while (eptr >= pp)
4150     {
4151 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4152 nigel 77 eptr--;
4153     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4154     }
4155     }
4156    
4157     /* Get here if we can't make it match with any permitted repetitions */
4158    
4159     RRETURN(MATCH_NOMATCH);
4160     }
4161     /* Control never gets here */
4162    
4163 nigel 93 /* There's been some horrible disaster. Arrival here can only mean there is
4164     something seriously wrong in the code above or the OP_xxx definitions. */
4165 nigel 77
4166