/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 211 - (hide annotations) (download)
Thu Aug 9 09:52:43 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 148898 byte(s)
Update UTF-8 validity check and documentation.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
/* Min and max values for the common repeats, held as two parallel six-entry
tables. For the maxima, 0 => infinity (no upper limit).
NOTE(review): the index is presumably the repeat opcode's offset from the first
of the common repeat opcodes - confirm against the code that reads these. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if the requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161     /* Separate the caselesss case for speed */
162    
163     if ((ims & PCRE_CASELESS) != 0)
164     {
165     while (length-- > 0)
166     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167     }
168     else
169     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170    
171     return TRUE;
172     }
173    
174    
175    
176     /***************************************************************************
177     ****************************************************************************
178     RECURSION IN THE match() FUNCTION
179    
180 nigel 87 The match() function is highly recursive, though not every recursive call
181     increases the recursive depth. Nevertheless, some regular expressions can cause
182     it to recurse to a great depth. I was writing for Unix, so I just let it call
183     itself recursively. This uses the stack for saving everything that has to be
184     saved for a recursive call. On Unix, the stack can be large, and this works
185     fine.
186 nigel 77
187 nigel 87 It turns out that on some non-Unix-like systems there are problems with
188     programs that use a lot of stack. (This despite the fact that every last chip
189     has oodles of memory these days, and techniques for extending the stack have
190     been known for decades.) So....
191 nigel 77
192     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193     calls by keeping local variables that need to be preserved in blocks of memory
194 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
195 nigel 77 achieve this so that the actual code doesn't look very different to what it
196     always used to.
197 ph10 164
198 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
199 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
200     Switzer, the use of longjmp() has been abolished, at the cost of having to
201     provide a unique number for each call to RMATCH. There is no way of generating
202     a sequence of numbers at compile time in C. I have given them names, to make
203     them stand out more clearly.
204    
205     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
208     don't have indeterminate values; this has meant that the frame size can be
209 ph10 164 reduced because the result can be "passed back" by straight setting of the
210     variable instead of being passed in the frame.
211 nigel 77 ****************************************************************************
212     ***************************************************************************/
213    
214    
/* Numbers for RMATCH calls. Every use of RMATCH is given one of these names as
its last argument. The heap version of RMATCH stores the value in the current
frame (Xwhere) and also pastes the name into a label (L_RMnn); the stack version
ignores it. The numbering starts at 1 and runs consecutively. */

enum {
  RM1=1, RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,  RM10,
  RM11,  RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
  RM21,  RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
  RM31,  RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
  RM41,  RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
  RM51,  RM52, RM53
};
223 ph10 164
224 ph10 165
225 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
226 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
227 ph10 164 actually used in this definition. */
228 nigel 77
229     #ifndef NO_RECURSE
/* With true recursion the hot variables may be given the "register" storage
class. */

#define REGISTER register

/* RMATCH makes a real recursive call to match(), leaving the result in rrc.
It picks up mstart and rdepth from the calling scope, passing rdepth+1 as the
new depth; the "rw" argument is not used. RRETURN is a plain return. The DEBUG
versions additionally trace the call and the return on stdout. */

#ifdef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", ra, __LINE__); \
  return ra; \
  }
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif
249    
250 nigel 77 #else
251    
252    
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH obtains a new frame, records in the current frame where to resume
(Xwhere = rw), copies the "arguments" into the new frame, chains it to the
current one, and jumps to HEAP_RECURSE. The label L_<rw> is the point at which
execution continues afterwards. NOTE(review): the HEAP_RETURN code that
dispatches back to L_<rw> is not in this part of the file - presumably it
switches on Xwhere.

If no memory can be obtained for the new frame, the current incarnation gives
up with PCRE_ERROR_NOMEMORY by way of RRETURN, which also releases the current
frame, instead of writing through a NULL pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN frees the current frame and makes its predecessor current. If there
is a predecessor, the result is passed back in rrc and control goes to
HEAP_RETURN; otherwise this was the top-level frame and the function really
returns. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
291    
292    
293     /* Structure for remembering the local variables in a private frame */
294    
295     typedef struct heapframe {
296     struct heapframe *Xprevframe;
297    
298     /* Function arguments that may change */
299    
300     const uschar *Xeptr;
301     const uschar *Xecode;
302 ph10 172 const uschar *Xmstart;
303 nigel 77 int Xoffset_top;
304     long int Xims;
305     eptrblock *Xeptrb;
306     int Xflags;
307 nigel 91 unsigned int Xrdepth;
308 nigel 77
309     /* Function local variables */
310    
311     const uschar *Xcallpat;
312     const uschar *Xcharptr;
313     const uschar *Xdata;
314     const uschar *Xnext;
315     const uschar *Xpp;
316     const uschar *Xprev;
317     const uschar *Xsaved_eptr;
318    
319     recursion_info Xnew_recursive;
320    
321     BOOL Xcur_is_word;
322     BOOL Xcondition;
323     BOOL Xprev_is_word;
324    
325     unsigned long int Xoriginal_ims;
326    
327     #ifdef SUPPORT_UCP
328     int Xprop_type;
329 nigel 87 int Xprop_value;
330 nigel 77 int Xprop_fail_result;
331     int Xprop_category;
332     int Xprop_chartype;
333 nigel 87 int Xprop_script;
334 ph10 123 int Xoclength;
335     uschar Xocchars[8];
336 nigel 77 #endif
337    
338     int Xctype;
339 nigel 93 unsigned int Xfc;
340 nigel 77 int Xfi;
341     int Xlength;
342     int Xmax;
343     int Xmin;
344     int Xnumber;
345     int Xoffset;
346     int Xop;
347     int Xsave_capture_last;
348     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
349     int Xstacksave[REC_STACK_SAVE_MAX];
350    
351     eptrblock Xnewptrb;
352    
353 ph10 164 /* Where to jump back to */
354 nigel 77
355 ph10 164 int Xwhere;
356 ph10 165
357 nigel 77 } heapframe;
358    
359     #endif
360    
361    
362     /***************************************************************************
363     ***************************************************************************/
364    
365    
366    
367     /*************************************************
368     * Match from current position *
369     *************************************************/
370    
371 nigel 93 /* This function is called recursively in many circumstances. Whenever it
372 nigel 77 returns a negative (error) response, the outer incarnation must also return the
373     same response.
374    
375     Performance note: It might be tempting to extract commonly used fields from the
376     md structure (e.g. utf8, end_subject) into individual variables to improve
377     performance. Tests using gcc on a SPARC disproved this; in the first case, it
378     made performance worse.
379    
380     Arguments:
381 nigel 93 eptr pointer to current character in subject
382     ecode pointer to current position in compiled code
383 ph10 168 mstart pointer to the current match start position (can be modified
384 ph10 172 by encountering \K)
385 nigel 77 offset_top current top pointer
386     md pointer to "static" info for the match
387     ims current /i, /m, and /s options
388     eptrb pointer to chain of blocks containing eptr at start of
389     brackets - for testing for empty matches
390     flags can contain
391     match_condassert - this is an assertion condition
392 nigel 93 match_cbegroup - this is the start of an unlimited repeat
393     group that can match an empty string
394 nigel 87 rdepth the recursion depth
395 nigel 77
396     Returns: MATCH_MATCH if matched ) these values are >= 0
397     MATCH_NOMATCH if failed to match )
398     a negative PCRE_ERROR_xxx value if aborted by an error condition
399 nigel 87 (e.g. stopped by repeated call or recursion limit)
400 nigel 77 */
401    
402     static int
403 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
404 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
405 nigel 91 int flags, unsigned int rdepth)
406 nigel 77 {
407     /* These variables do not need to be preserved over recursion in this function,
408 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
409     "register" because they are used a lot in loops. */
410 nigel 77
411 nigel 91 register int rrc; /* Returns from recursive calls */
412     register int i; /* Used for loops not involving calls to RMATCH() */
413 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
414 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
415 nigel 77
416 nigel 93 BOOL minimize, possessive; /* Quantifier options */
417    
418 nigel 77 /* When recursion is not being used, all "local" variables that have to be
419     preserved over calls to RMATCH() are part of a "frame" which is obtained from
420     heap storage. Set up the top-level frame here; others are obtained from the
421     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
422    
423     #ifdef NO_RECURSE
424     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
425     frame->Xprevframe = NULL; /* Marks the top level */
426    
427     /* Copy in the original argument variables */
428    
429     frame->Xeptr = eptr;
430     frame->Xecode = ecode;
431 ph10 168 frame->Xmstart = mstart;
432 nigel 77 frame->Xoffset_top = offset_top;
433     frame->Xims = ims;
434     frame->Xeptrb = eptrb;
435     frame->Xflags = flags;
436 nigel 87 frame->Xrdepth = rdepth;
437 nigel 77
438     /* This is where control jumps back to to effect "recursion" */
439    
440     HEAP_RECURSE:
441    
442     /* Macros make the argument variables come from the current frame */
443    
444     #define eptr frame->Xeptr
445     #define ecode frame->Xecode
446 ph10 168 #define mstart frame->Xmstart
447 nigel 77 #define offset_top frame->Xoffset_top
448     #define ims frame->Xims
449     #define eptrb frame->Xeptrb
450     #define flags frame->Xflags
451 nigel 87 #define rdepth frame->Xrdepth
452 nigel 77
453     /* Ditto for the local variables */
454    
455     #ifdef SUPPORT_UTF8
456     #define charptr frame->Xcharptr
457     #endif
458     #define callpat frame->Xcallpat
459     #define data frame->Xdata
460     #define next frame->Xnext
461     #define pp frame->Xpp
462     #define prev frame->Xprev
463     #define saved_eptr frame->Xsaved_eptr
464    
465     #define new_recursive frame->Xnew_recursive
466    
467     #define cur_is_word frame->Xcur_is_word
468     #define condition frame->Xcondition
469     #define prev_is_word frame->Xprev_is_word
470    
471     #define original_ims frame->Xoriginal_ims
472    
473     #ifdef SUPPORT_UCP
474     #define prop_type frame->Xprop_type
475 nigel 87 #define prop_value frame->Xprop_value
476 nigel 77 #define prop_fail_result frame->Xprop_fail_result
477     #define prop_category frame->Xprop_category
478     #define prop_chartype frame->Xprop_chartype
479 nigel 87 #define prop_script frame->Xprop_script
480 ph10 115 #define oclength frame->Xoclength
481     #define occhars frame->Xocchars
482 nigel 77 #endif
483    
484     #define ctype frame->Xctype
485     #define fc frame->Xfc
486     #define fi frame->Xfi
487     #define length frame->Xlength
488     #define max frame->Xmax
489     #define min frame->Xmin
490     #define number frame->Xnumber
491     #define offset frame->Xoffset
492     #define op frame->Xop
493     #define save_capture_last frame->Xsave_capture_last
494     #define save_offset1 frame->Xsave_offset1
495     #define save_offset2 frame->Xsave_offset2
496     #define save_offset3 frame->Xsave_offset3
497     #define stacksave frame->Xstacksave
498    
499     #define newptrb frame->Xnewptrb
500    
501     /* When recursion is being used, local variables are allocated on the stack and
502     get preserved during recursion in the normal way. In this environment, fi and
503     i, and fc and c, can be the same variables. */
504    
505 nigel 93 #else /* NO_RECURSE not defined */
506 nigel 77 #define fi i
507     #define fc c
508    
509    
510 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
511     const uschar *charptr; /* in small blocks of the code. My normal */
512     #endif /* style of coding would have declared */
513     const uschar *callpat; /* them within each of those blocks. */
514     const uschar *data; /* However, in order to accommodate the */
515     const uschar *next; /* version of this code that uses an */
516     USPTR pp; /* external "stack" implemented on the */
517     const uschar *prev; /* heap, it is easier to declare them all */
518     USPTR saved_eptr; /* here, so the declarations can be cut */
519     /* out in a block. The only declarations */
520     recursion_info new_recursive; /* within blocks below are for variables */
521     /* that do not have to be preserved over */
522     BOOL cur_is_word; /* a recursive call to RMATCH(). */
523     BOOL condition;
524 nigel 77 BOOL prev_is_word;
525    
526     unsigned long int original_ims;
527    
528     #ifdef SUPPORT_UCP
529     int prop_type;
530 nigel 87 int prop_value;
531 nigel 77 int prop_fail_result;
532     int prop_category;
533     int prop_chartype;
534 nigel 87 int prop_script;
535 ph10 115 int oclength;
536     uschar occhars[8];
537 nigel 77 #endif
538    
539     int ctype;
540     int length;
541     int max;
542     int min;
543     int number;
544     int offset;
545     int op;
546     int save_capture_last;
547     int save_offset1, save_offset2, save_offset3;
548     int stacksave[REC_STACK_SAVE_MAX];
549    
550     eptrblock newptrb;
551 nigel 93 #endif /* NO_RECURSE */
552 nigel 77
553     /* These statements are here to stop the compiler complaining about unitialized
554     variables. */
555    
556     #ifdef SUPPORT_UCP
557 nigel 87 prop_value = 0;
558 nigel 77 prop_fail_result = 0;
559     #endif
560    
561 nigel 93
562 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
563     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
564     used. Thanks to Ian Taylor for noticing this possibility and sending the
565     original patch. */
566    
567     TAIL_RECURSE:
568    
569 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
570     are specified by the macro RMATCH and RRETURN is used to return. When
571     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
572     and a "return", respectively (possibly with some debugging if DEBUG is
573     defined). However, RMATCH isn't like a function call because it's quite a
574     complicated macro. It has to be used in one particular way. This shouldn't,
575     however, impact performance when true recursion is being used. */
576 nigel 77
577 ph10 164 #ifdef SUPPORT_UTF8
578     utf8 = md->utf8; /* Local copy of the flag */
579     #else
580     utf8 = FALSE;
581     #endif
582    
583 nigel 87 /* First check that we haven't called match() too many times, or that we
584     haven't exceeded the recursive call limit. */
585    
586 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
587 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
588 nigel 77
589     original_ims = ims; /* Save for resetting on ')' */
590 nigel 91
591 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
592     string, the match_cbegroup flag is set. When this is the case, add the current
593     subject pointer to the chain of such remembered pointers, to be checked when we
594     hit the closing ket, in order to break infinite loops that match no characters.
595 ph10 197 When match() is called in other circumstances, don't add to the chain. The
596     match_cbegroup flag must NOT be used with tail recursion, because the memory
597     block that is used is on the stack, so a new one may be required for each
598     match(). */
599 nigel 77
600 nigel 93 if ((flags & match_cbegroup) != 0)
601 nigel 77 {
602 ph10 197 newptrb.epb_saved_eptr = eptr;
603     newptrb.epb_prev = eptrb;
604     eptrb = &newptrb;
605 nigel 77 }
606    
607 nigel 93 /* Now start processing the opcodes. */
608 nigel 77
609     for (;;)
610     {
611 nigel 93 minimize = possessive = FALSE;
612 nigel 77 op = *ecode;
613    
614     /* For partial matching, remember if we ever hit the end of the subject after
615     matching at least one subject character. */
616    
617     if (md->partial &&
618     eptr >= md->end_subject &&
619 ph10 168 eptr > mstart)
620 nigel 77 md->hitend = TRUE;
621 ph10 208
622 nigel 93 switch(op)
623     {
624 ph10 210 case OP_FAIL:
625 ph10 211 return MATCH_NOMATCH;
626    
627 ph10 210 case OP_PRUNE:
628     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
629     ims, eptrb, flags, RM51);
630     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
631     return MATCH_PRUNE;
632 ph10 211
633 ph10 210 case OP_COMMIT:
634     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
635     ims, eptrb, flags, RM52);
636     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
637     return MATCH_COMMIT;
638 ph10 211
639 ph10 210 case OP_SKIP:
640     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
641     ims, eptrb, flags, RM53);
642     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
643 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
644 ph10 210 return MATCH_SKIP;
645 ph10 211
646 ph10 210 case OP_THEN:
647     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
648     ims, eptrb, flags, RM53);
649     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
650     return MATCH_THEN;
651 ph10 211
652 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
653     the current subject position in the working slot at the top of the vector.
654     We mustn't change the current values of the data slot, because they may be
655     set from a previous iteration of this group, and be referred to by a
656     reference inside the group.
657 nigel 77
658 nigel 93 If the bracket fails to match, we need to restore this value and also the
659     values of the final offsets, in case they were set by a previous iteration
660     of the same bracket.
661 nigel 77
662 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
663     a non-capturing bracket. Don't worry about setting the flag for the error
664     case here; that is handled in the code for KET. */
665 nigel 77
666 nigel 93 case OP_CBRA:
667     case OP_SCBRA:
668     number = GET2(ecode, 1+LINK_SIZE);
669 nigel 77 offset = number << 1;
670    
671     #ifdef DEBUG
672 nigel 93 printf("start bracket %d\n", number);
673     printf("subject=");
674 nigel 77 pchars(eptr, 16, TRUE, md);
675     printf("\n");
676     #endif
677    
678     if (offset < md->offset_max)
679     {
680     save_offset1 = md->offset_vector[offset];
681     save_offset2 = md->offset_vector[offset+1];
682     save_offset3 = md->offset_vector[md->offset_end - number];
683     save_capture_last = md->capture_last;
684    
685     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
686     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
687    
688 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
689 nigel 77 do
690     {
691 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
692     ims, eptrb, flags, RM1);
693 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
694 nigel 77 md->capture_last = save_capture_last;
695     ecode += GET(ecode, 1);
696     }
697     while (*ecode == OP_ALT);
698    
699     DPRINTF(("bracket %d failed\n", number));
700    
701     md->offset_vector[offset] = save_offset1;
702     md->offset_vector[offset+1] = save_offset2;
703     md->offset_vector[md->offset_end - number] = save_offset3;
704    
705     RRETURN(MATCH_NOMATCH);
706     }
707    
708 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
709     as a non-capturing bracket. */
710 nigel 77
711 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
713    
714 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
715 nigel 77
716 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
718    
719 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
720     final alternative within the brackets, we would return the result of a
721     recursive call to match() whatever happened. We can reduce stack usage by
722 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
723     is set.*/
724 nigel 77
725 nigel 93 case OP_BRA:
726     case OP_SBRA:
727     DPRINTF(("start non-capturing bracket\n"));
728     flags = (op >= OP_SBRA)? match_cbegroup : 0;
729 nigel 91 for (;;)
730 nigel 77 {
731 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
732 nigel 93 {
733 ph10 197 if (flags == 0) /* Not a possibly empty group */
734     {
735     ecode += _pcre_OP_lengths[*ecode];
736     DPRINTF(("bracket 0 tail recursion\n"));
737     goto TAIL_RECURSE;
738     }
739    
740     /* Possibly empty group; can't use tail recursion. */
741    
742     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
743     eptrb, flags, RM48);
744     RRETURN(rrc);
745 nigel 93 }
746 nigel 91
747     /* For non-final alternatives, continue the loop for a NOMATCH result;
748     otherwise return. */
749    
750 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
751     eptrb, flags, RM2);
752 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
753 nigel 77 ecode += GET(ecode, 1);
754     }
755 nigel 91 /* Control never reaches here. */
756 nigel 77
757     /* Conditional group: compilation checked that there are no more than
758     two branches. If the condition is false, skipping the first branch takes us
759     past the end if there is only one branch, but that's OK because that is
760 nigel 91 exactly what going to the ket would do. As there is only one branch to be
761     obeyed, we can use tail recursion to avoid using another stack frame. */
762 nigel 77
763     case OP_COND:
764 nigel 93 case OP_SCOND:
765     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
766 nigel 77 {
767 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
768     condition = md->recursive != NULL &&
769     (offset == RREF_ANY || offset == md->recursive->group_num);
770     ecode += condition? 3 : GET(ecode, 1);
771     }
772    
773     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
774     {
775 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
776 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
777     ecode += condition? 3 : GET(ecode, 1);
778 nigel 77 }
779    
780 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
781     {
782     condition = FALSE;
783     ecode += GET(ecode, 1);
784     }
785    
786 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
787 nigel 93 the final argument match_condassert causes it to stop at the end of an
788     assertion. */
789 nigel 77
790     else
791     {
792 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
793     match_condassert, RM3);
794 nigel 77 if (rrc == MATCH_MATCH)
795     {
796 nigel 93 condition = TRUE;
797     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
798 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
799     }
800 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
801 nigel 77 {
802     RRETURN(rrc); /* Need braces because of following else */
803     }
804 nigel 93 else
805     {
806     condition = FALSE;
807     ecode += GET(ecode, 1);
808     }
809     }
810 nigel 91
811 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
812 ph10 197 we can use tail recursion to avoid using another stack frame, except when
813     match_cbegroup is required for an unlimited repeat of a possibly empty
814     group. If the second alternative doesn't exist, we can just plough on. */
815 nigel 91
816 nigel 93 if (condition || *ecode == OP_ALT)
817     {
818 nigel 91 ecode += 1 + LINK_SIZE;
819 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
820     {
821     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
822     RRETURN(rrc);
823     }
824     else /* Group must match something */
825     {
826     flags = 0;
827     goto TAIL_RECURSE;
828     }
829 nigel 77 }
830 ph10 197 else /* Condition false & no 2nd alternative */
831 nigel 93 {
832     ecode += 1 + LINK_SIZE;
833     }
834     break;
835 nigel 77
836    
837 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
838     recursion, we should restore the offsets appropriately and continue from
839     after the call. */
840 nigel 77
841 ph10 210 case OP_ACCEPT:
842 nigel 77 case OP_END:
843     if (md->recursive != NULL && md->recursive->group_num == 0)
844     {
845     recursion_info *rec = md->recursive;
846 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
847 nigel 77 md->recursive = rec->prevrec;
848     memmove(md->offset_vector, rec->offset_save,
849     rec->saved_max * sizeof(int));
850 ph10 168 mstart = rec->save_start;
851 nigel 77 ims = original_ims;
852     ecode = rec->after_call;
853     break;
854     }
855    
856     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
857     string - backtracking will then try other alternatives, if any. */
858    
859 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
860     md->end_match_ptr = eptr; /* Record where we ended */
861     md->end_offset_top = offset_top; /* and how many extracts were taken */
862 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
863 nigel 77 RRETURN(MATCH_MATCH);
864    
865     /* Change option settings */
866    
867     case OP_OPT:
868     ims = ecode[1];
869     ecode += 2;
870     DPRINTF(("ims set to %02lx\n", ims));
871     break;
872    
873     /* Assertion brackets. Check the alternative branches in turn - the
874     matching won't pass the KET for an assertion. If any one branch matches,
875     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
876     start of each branch to move the current point backwards, so the code at
877     this level is identical to the lookahead case. */
878    
879     case OP_ASSERT:
880     case OP_ASSERTBACK:
881     do
882     {
883 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
884     RM4);
885 nigel 77 if (rrc == MATCH_MATCH) break;
886 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
887 nigel 77 ecode += GET(ecode, 1);
888     }
889     while (*ecode == OP_ALT);
890     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
891    
892     /* If checking an assertion for a condition, return MATCH_MATCH. */
893    
894     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
895    
896     /* Continue from after the assertion, updating the offsets high water
897     mark, since extracts may have been taken during the assertion. */
898    
899     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
900     ecode += 1 + LINK_SIZE;
901     offset_top = md->end_offset_top;
902     continue;
903    
904     /* Negative assertion: all branches must fail to match */
905    
906     case OP_ASSERT_NOT:
907     case OP_ASSERTBACK_NOT:
908     do
909     {
910 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
911     RM5);
912 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
913 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
914 nigel 77 ecode += GET(ecode,1);
915     }
916     while (*ecode == OP_ALT);
917    
918     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
919    
920     ecode += 1 + LINK_SIZE;
921     continue;
922    
923     /* Move the subject pointer back. This occurs only at the start of
924     each branch of a lookbehind assertion. If we are too close to the start to
925     move back, this match function fails. When working with UTF-8 we move
926     back a number of characters, not bytes. */
927    
928     case OP_REVERSE:
929     #ifdef SUPPORT_UTF8
930     if (utf8)
931     {
932 nigel 93 i = GET(ecode, 1);
933     while (i-- > 0)
934 nigel 77 {
935     eptr--;
936     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
937 ph10 207 BACKCHAR(eptr);
938 nigel 77 }
939     }
940     else
941     #endif
942    
943     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
944    
945     {
946 nigel 93 eptr -= GET(ecode, 1);
947 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
948     }
949    
950     /* Skip to next op code */
951    
952     ecode += 1 + LINK_SIZE;
953     break;
954    
955     /* The callout item calls an external function, if one is provided, passing
956     details of the match so far. This is mainly for debugging, though the
957     function is able to force a failure. */
958    
959     case OP_CALLOUT:
960     if (pcre_callout != NULL)
961     {
962     pcre_callout_block cb;
963     cb.version = 1; /* Version 1 of the callout block */
964     cb.callout_number = ecode[1];
965     cb.offset_vector = md->offset_vector;
966 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
967 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
968 ph10 168 cb.start_match = mstart - md->start_subject;
969 nigel 77 cb.current_position = eptr - md->start_subject;
970     cb.pattern_position = GET(ecode, 2);
971     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
972     cb.capture_top = offset_top/2;
973     cb.capture_last = md->capture_last;
974     cb.callout_data = md->callout_data;
975     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
976     if (rrc < 0) RRETURN(rrc);
977     }
978     ecode += 2 + 2*LINK_SIZE;
979     break;
980    
981     /* Recursion either matches the current regex, or some subexpression. The
982     offset data is the offset to the starting bracket from the start of the
983     whole pattern. (This is so that it works from duplicated subpatterns.)
984    
985     If there are any capturing brackets started but not finished, we have to
986     save their starting points and reinstate them after the recursion. However,
987     we don't know how many such there are (offset_top records the completed
988     total) so we just have to save all the potential data. There may be up to
989     65535 such values, which is too large to put on the stack, but using malloc
990     for small numbers seems expensive. As a compromise, the stack is used when
991     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
992     is used. If the malloc fails, PCRE_ERROR_NOMEMORY is returned from this
993     level of match(); there is no attempt to fall back to saving only part of
994     the data on the stack.
995    
996     There are also other values that have to be saved. We use a chained
997     sequence of blocks that actually live on the stack. Thanks to Robin Houston
998     for the original version of this logic. */
999    
1000     case OP_RECURSE:
1001     {
1002     callpat = md->start_code + GET(ecode, 1);
1003 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1004     GET2(callpat, 1 + LINK_SIZE);
1005 nigel 77
1006     /* Add to "recursing stack" */
1007    
1008     new_recursive.prevrec = md->recursive;
1009     md->recursive = &new_recursive;
1010    
1011     /* Find where to continue from afterwards */
1012    
1013     ecode += 1 + LINK_SIZE;
1014     new_recursive.after_call = ecode;
1015    
1016     /* Now save the offset data. */
1017    
1018     new_recursive.saved_max = md->offset_end;
1019     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1020     new_recursive.offset_save = stacksave;
1021     else
1022     {
1023     new_recursive.offset_save =
1024     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1025     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1026     }
1027    
1028     memcpy(new_recursive.offset_save, md->offset_vector,
1029     new_recursive.saved_max * sizeof(int));
1030 ph10 168 new_recursive.save_start = mstart;
1031     mstart = eptr;
1032 nigel 77
1033     /* OK, now we can do the recursion. For each top-level alternative we
1034     restore the offset and recursion data. */
1035    
1036     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1037 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1038 nigel 77 do
1039     {
1040 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1041     md, ims, eptrb, flags, RM6);
1042 nigel 77 if (rrc == MATCH_MATCH)
1043     {
1044 nigel 87 DPRINTF(("Recursion matched\n"));
1045 nigel 77 md->recursive = new_recursive.prevrec;
1046     if (new_recursive.offset_save != stacksave)
1047     (pcre_free)(new_recursive.offset_save);
1048     RRETURN(MATCH_MATCH);
1049     }
1050 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1051 nigel 87 {
1052     DPRINTF(("Recursion gave error %d\n", rrc));
1053     RRETURN(rrc);
1054     }
1055 nigel 77
1056     md->recursive = &new_recursive;
1057     memcpy(md->offset_vector, new_recursive.offset_save,
1058     new_recursive.saved_max * sizeof(int));
1059     callpat += GET(callpat, 1);
1060     }
1061     while (*callpat == OP_ALT);
1062    
1063     DPRINTF(("Recursion didn't match\n"));
1064     md->recursive = new_recursive.prevrec;
1065     if (new_recursive.offset_save != stacksave)
1066     (pcre_free)(new_recursive.offset_save);
1067     RRETURN(MATCH_NOMATCH);
1068     }
1069     /* Control never reaches here */
1070    
1071     /* "Once" brackets are like assertion brackets except that after a match,
1072     the point in the subject string is not moved back. Thus there can never be
1073     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1074     Check the alternative branches in turn - the matching won't pass the KET
1075     for this kind of subpattern. If any one branch matches, we carry on as at
1076     the end of a normal bracket, leaving the subject pointer. */
1077    
1078     case OP_ONCE:
1079 nigel 91 prev = ecode;
1080     saved_eptr = eptr;
1081    
1082     do
1083 nigel 77 {
1084 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1085 nigel 91 if (rrc == MATCH_MATCH) break;
1086 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1087 nigel 91 ecode += GET(ecode,1);
1088     }
1089     while (*ecode == OP_ALT);
1090 nigel 77
1091 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1092 nigel 77
1093 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1094 nigel 77
1095 nigel 91 /* Continue from after the atomic group, updating the offsets high water
1096     mark, since extracts may have been taken. */
1097 nigel 77
1098 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1099 nigel 77
1100 nigel 91 offset_top = md->end_offset_top;
1101     eptr = md->end_match_ptr;
1102 nigel 77
1103 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1104     happens for a repeating ket if no characters were matched in the group.
1105     This is the forcible breaking of infinite loops as implemented in Perl
1106     5.005. If there is an options reset, it will get obeyed in the normal
1107     course of events. */
1108 nigel 77
1109 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1110     {
1111     ecode += 1+LINK_SIZE;
1112     break;
1113     }
1114 nigel 77
1115 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1116     preceding bracket, in the appropriate order. The second "call" of match()
1117     uses tail recursion, to avoid using another stack frame. We need to reset
1118     any options that changed within the bracket before re-running it, so
1119     check the next opcode. */
1120 nigel 77
1121 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1122     {
1123     ims = (ims & ~PCRE_IMS) | ecode[4];
1124     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1125     }
1126 nigel 77
1127 nigel 91 if (*ecode == OP_KETRMIN)
1128     {
1129 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1130 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1131     ecode = prev;
1132 ph10 197 flags = 0;
1133 nigel 91 goto TAIL_RECURSE;
1134 nigel 77 }
1135 nigel 91 else /* OP_KETRMAX */
1136     {
1137 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1138 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1139     ecode += 1 + LINK_SIZE;
1140 ph10 197 flags = 0;
1141 nigel 91 goto TAIL_RECURSE;
1142     }
1143     /* Control never gets here */
1144 nigel 77
1145     /* An alternation is the end of a branch; scan along to find the end of the
1146     bracketed group and go to there. */
1147    
1148     case OP_ALT:
1149     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1150     break;
1151    
1152     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1153     that it may occur zero times. It may repeat infinitely, or not at all -
1154     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1155     repeat limits are compiled as a number of copies, with the optional ones
1156     preceded by BRAZERO or BRAMINZERO. */
1157    
1158     case OP_BRAZERO:
1159     {
1160     next = ecode+1;
1161 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1162 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1163     do next += GET(next,1); while (*next == OP_ALT);
1164 nigel 93 ecode = next + 1 + LINK_SIZE;
1165 nigel 77 }
1166     break;
1167    
1168     case OP_BRAMINZERO:
1169     {
1170     next = ecode+1;
1171 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1172 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1173 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1174     ecode++;
1175     }
1176     break;
1177    
1178 nigel 93 /* End of a group, repeated or non-repeating. */
1179 nigel 77
1180     case OP_KET:
1181     case OP_KETRMIN:
1182     case OP_KETRMAX:
1183 nigel 91 prev = ecode - GET(ecode, 1);
1184 nigel 77
1185 nigel 93 /* If this was a group that remembered the subject start, in order to break
1186     infinite repeats of empty string matches, retrieve the subject start from
1187     the chain. Otherwise, set it NULL. */
1188 nigel 77
1189 nigel 93 if (*prev >= OP_SBRA)
1190     {
1191     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1192     eptrb = eptrb->epb_prev; /* Backup to previous group */
1193     }
1194     else saved_eptr = NULL;
1195 nigel 77
1196 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1197     MATCH_MATCH, but record the current high water mark for use by positive
1198     assertions. Do this also for the "once" (atomic) groups. */
1199    
1200 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1201     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1202     *prev == OP_ONCE)
1203     {
1204     md->end_match_ptr = eptr; /* For ONCE */
1205     md->end_offset_top = offset_top;
1206     RRETURN(MATCH_MATCH);
1207     }
1208 nigel 77
1209 nigel 93 /* For capturing groups we have to check the group number back at the start
1210     and if necessary complete handling an extraction by setting the offsets and
1211     bumping the high water mark. Note that whole-pattern recursion is coded as
1212     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1213     when the OP_END is reached. Other recursion is handled here. */
1214 nigel 77
1215 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1216 nigel 91 {
1217 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1218 nigel 91 offset = number << 1;
1219 nigel 77
1220     #ifdef DEBUG
1221 nigel 91 printf("end bracket %d", number);
1222     printf("\n");
1223 nigel 77 #endif
1224    
1225 nigel 93 md->capture_last = number;
1226     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1227 nigel 91 {
1228 nigel 93 md->offset_vector[offset] =
1229     md->offset_vector[md->offset_end - number];
1230     md->offset_vector[offset+1] = eptr - md->start_subject;
1231     if (offset_top <= offset) offset_top = offset + 2;
1232     }
1233 nigel 77
1234 nigel 93 /* Handle a recursively called group. Restore the offsets
1235     appropriately and continue from after the call. */
1236 nigel 77
1237 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1238     {
1239     recursion_info *rec = md->recursive;
1240     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1241     md->recursive = rec->prevrec;
1242 ph10 168 mstart = rec->save_start;
1243 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1244     rec->saved_max * sizeof(int));
1245     ecode = rec->after_call;
1246     ims = original_ims;
1247     break;
1248 nigel 77 }
1249 nigel 91 }
1250 nigel 77
1251 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1252     flags, in case they got changed during the group. */
1253 nigel 77
1254 nigel 91 ims = original_ims;
1255     DPRINTF(("ims reset to %02lx\n", ims));
1256 nigel 77
1257 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1258     happens for a repeating ket if no characters were matched in the group.
1259     This is the forcible breaking of infinite loops as implemented in Perl
1260     5.005. If there is an options reset, it will get obeyed in the normal
1261     course of events. */
1262 nigel 77
1263 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1264     {
1265     ecode += 1 + LINK_SIZE;
1266     break;
1267     }
1268 nigel 77
1269 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1270     preceding bracket, in the appropriate order. In the second case, we can use
1271 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1272     unlimited repeat of a group that can match an empty string. */
1273 nigel 77
1274 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1275    
1276 nigel 91 if (*ecode == OP_KETRMIN)
1277     {
1278 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1279 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1280 ph10 197 if (flags != 0) /* Could match an empty string */
1281     {
1282     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1283     RRETURN(rrc);
1284     }
1285 nigel 91 ecode = prev;
1286     goto TAIL_RECURSE;
1287 nigel 77 }
1288 nigel 91 else /* OP_KETRMAX */
1289     {
1290 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1291 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1292     ecode += 1 + LINK_SIZE;
1293 ph10 197 flags = 0;
1294 nigel 91 goto TAIL_RECURSE;
1295     }
1296     /* Control never gets here */
1297 nigel 77
1298     /* Start of subject unless notbol, or after internal newline if multiline */
1299    
1300     case OP_CIRC:
1301     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1302     if ((ims & PCRE_MULTILINE) != 0)
1303     {
1304 nigel 91 if (eptr != md->start_subject &&
1305 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1306 nigel 77 RRETURN(MATCH_NOMATCH);
1307     ecode++;
1308     break;
1309     }
1310     /* ... else fall through */
1311    
1312     /* Start of subject assertion */
1313    
1314     case OP_SOD:
1315     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1316     ecode++;
1317     break;
1318    
1319     /* Start of match assertion */
1320    
1321     case OP_SOM:
1322     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1323     ecode++;
1324     break;
1325 ph10 172
1326 ph10 168 /* Reset the start of match point */
1327 ph10 172
1328 ph10 168 case OP_SET_SOM:
1329     mstart = eptr;
1330 ph10 172 ecode++;
1331     break;
1332 nigel 77
1333     /* Assert before internal newline if multiline, or before a terminating
1334     newline unless endonly is set, else end of subject unless noteol is set. */
1335    
1336     case OP_DOLL:
1337     if ((ims & PCRE_MULTILINE) != 0)
1338     {
1339     if (eptr < md->end_subject)
1340 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1341 nigel 77 else
1342     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1343     ecode++;
1344     break;
1345     }
1346     else
1347     {
1348     if (md->noteol) RRETURN(MATCH_NOMATCH);
1349     if (!md->endonly)
1350     {
1351 nigel 91 if (eptr != md->end_subject &&
1352 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1353 nigel 77 RRETURN(MATCH_NOMATCH);
1354     ecode++;
1355     break;
1356     }
1357     }
1358 nigel 91 /* ... else fall through for endonly */
1359 nigel 77
1360     /* End of subject assertion (\z) */
1361    
1362     case OP_EOD:
1363     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1364     ecode++;
1365     break;
1366    
1367     /* End of subject or ending \n assertion (\Z) */
1368    
1369     case OP_EODN:
1370 nigel 91 if (eptr != md->end_subject &&
1371 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1372 nigel 91 RRETURN(MATCH_NOMATCH);
1373 nigel 77 ecode++;
1374     break;
1375    
1376     /* Word boundary assertions */
1377    
1378     case OP_NOT_WORD_BOUNDARY:
1379     case OP_WORD_BOUNDARY:
1380     {
1381    
1382     /* Find out if the previous and current characters are "word" characters.
1383     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1384     be "non-word" characters. */
1385    
1386     #ifdef SUPPORT_UTF8
1387     if (utf8)
1388     {
1389     if (eptr == md->start_subject) prev_is_word = FALSE; else
1390     {
1391     const uschar *lastptr = eptr - 1;
1392     while((*lastptr & 0xc0) == 0x80) lastptr--;
1393     GETCHAR(c, lastptr);
1394     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1395     }
1396     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1397     {
1398     GETCHAR(c, eptr);
1399     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1400     }
1401     }
1402     else
1403     #endif
1404    
1405     /* More streamlined when not in UTF-8 mode */
1406    
1407     {
1408     prev_is_word = (eptr != md->start_subject) &&
1409     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1410     cur_is_word = (eptr < md->end_subject) &&
1411     ((md->ctypes[*eptr] & ctype_word) != 0);
1412     }
1413    
1414     /* Now see if the situation is what we want */
1415    
1416     if ((*ecode++ == OP_WORD_BOUNDARY)?
1417     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1418     RRETURN(MATCH_NOMATCH);
1419     }
1420     break;
1421    
1422     /* Match a single character type; inline for speed */
1423    
1424     case OP_ANY:
1425 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1426     {
1427 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1428 nigel 91 }
1429 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1430     if (utf8)
1431     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1432     ecode++;
1433     break;
1434    
1435     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1436     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1437    
1438     case OP_ANYBYTE:
1439     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1440     ecode++;
1441     break;
1442    
1443     case OP_NOT_DIGIT:
1444     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1445     GETCHARINCTEST(c, eptr);
1446     if (
1447     #ifdef SUPPORT_UTF8
1448     c < 256 &&
1449     #endif
1450     (md->ctypes[c] & ctype_digit) != 0
1451     )
1452     RRETURN(MATCH_NOMATCH);
1453     ecode++;
1454     break;
1455    
1456     case OP_DIGIT:
1457     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1458     GETCHARINCTEST(c, eptr);
1459     if (
1460     #ifdef SUPPORT_UTF8
1461     c >= 256 ||
1462     #endif
1463     (md->ctypes[c] & ctype_digit) == 0
1464     )
1465     RRETURN(MATCH_NOMATCH);
1466     ecode++;
1467     break;
1468    
1469     case OP_NOT_WHITESPACE:
1470     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1471     GETCHARINCTEST(c, eptr);
1472     if (
1473     #ifdef SUPPORT_UTF8
1474     c < 256 &&
1475     #endif
1476     (md->ctypes[c] & ctype_space) != 0
1477     )
1478     RRETURN(MATCH_NOMATCH);
1479     ecode++;
1480     break;
1481    
1482     case OP_WHITESPACE:
1483     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1484     GETCHARINCTEST(c, eptr);
1485     if (
1486     #ifdef SUPPORT_UTF8
1487     c >= 256 ||
1488     #endif
1489     (md->ctypes[c] & ctype_space) == 0
1490     )
1491     RRETURN(MATCH_NOMATCH);
1492     ecode++;
1493     break;
1494    
1495     case OP_NOT_WORDCHAR:
1496     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1497     GETCHARINCTEST(c, eptr);
1498     if (
1499     #ifdef SUPPORT_UTF8
1500     c < 256 &&
1501     #endif
1502     (md->ctypes[c] & ctype_word) != 0
1503     )
1504     RRETURN(MATCH_NOMATCH);
1505     ecode++;
1506     break;
1507    
1508     case OP_WORDCHAR:
1509     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1510     GETCHARINCTEST(c, eptr);
1511     if (
1512     #ifdef SUPPORT_UTF8
1513     c >= 256 ||
1514     #endif
1515     (md->ctypes[c] & ctype_word) == 0
1516     )
1517     RRETURN(MATCH_NOMATCH);
1518     ecode++;
1519     break;
1520    
1521 nigel 93 case OP_ANYNL:
1522     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1523     GETCHARINCTEST(c, eptr);
1524     switch(c)
1525     {
1526     default: RRETURN(MATCH_NOMATCH);
1527     case 0x000d:
1528     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1529     break;
1530     case 0x000a:
1531     case 0x000b:
1532     case 0x000c:
1533     case 0x0085:
1534     case 0x2028:
1535     case 0x2029:
1536     break;
1537     }
1538     ecode++;
1539     break;
1540    
1541 ph10 178 case OP_NOT_HSPACE:
1542     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1543     GETCHARINCTEST(c, eptr);
1544     switch(c)
1545     {
1546     default: break;
1547     case 0x09: /* HT */
1548     case 0x20: /* SPACE */
1549     case 0xa0: /* NBSP */
1550     case 0x1680: /* OGHAM SPACE MARK */
1551     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1552     case 0x2000: /* EN QUAD */
1553     case 0x2001: /* EM QUAD */
1554     case 0x2002: /* EN SPACE */
1555     case 0x2003: /* EM SPACE */
1556     case 0x2004: /* THREE-PER-EM SPACE */
1557     case 0x2005: /* FOUR-PER-EM SPACE */
1558     case 0x2006: /* SIX-PER-EM SPACE */
1559     case 0x2007: /* FIGURE SPACE */
1560     case 0x2008: /* PUNCTUATION SPACE */
1561     case 0x2009: /* THIN SPACE */
1562     case 0x200A: /* HAIR SPACE */
1563     case 0x202f: /* NARROW NO-BREAK SPACE */
1564     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1565     case 0x3000: /* IDEOGRAPHIC SPACE */
1566     RRETURN(MATCH_NOMATCH);
1567     }
1568     ecode++;
1569     break;
1570    
1571     case OP_HSPACE:
1572     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1573     GETCHARINCTEST(c, eptr);
1574     switch(c)
1575     {
1576     default: RRETURN(MATCH_NOMATCH);
1577     case 0x09: /* HT */
1578     case 0x20: /* SPACE */
1579     case 0xa0: /* NBSP */
1580     case 0x1680: /* OGHAM SPACE MARK */
1581     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1582     case 0x2000: /* EN QUAD */
1583     case 0x2001: /* EM QUAD */
1584     case 0x2002: /* EN SPACE */
1585     case 0x2003: /* EM SPACE */
1586     case 0x2004: /* THREE-PER-EM SPACE */
1587     case 0x2005: /* FOUR-PER-EM SPACE */
1588     case 0x2006: /* SIX-PER-EM SPACE */
1589     case 0x2007: /* FIGURE SPACE */
1590     case 0x2008: /* PUNCTUATION SPACE */
1591     case 0x2009: /* THIN SPACE */
1592     case 0x200A: /* HAIR SPACE */
1593     case 0x202f: /* NARROW NO-BREAK SPACE */
1594     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1595     case 0x3000: /* IDEOGRAPHIC SPACE */
1596     break;
1597     }
1598     ecode++;
1599     break;
1600    
1601     case OP_NOT_VSPACE:
1602     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1603     GETCHARINCTEST(c, eptr);
1604     switch(c)
1605     {
1606     default: break;
1607     case 0x0a: /* LF */
1608     case 0x0b: /* VT */
1609     case 0x0c: /* FF */
1610     case 0x0d: /* CR */
1611     case 0x85: /* NEL */
1612     case 0x2028: /* LINE SEPARATOR */
1613     case 0x2029: /* PARAGRAPH SEPARATOR */
1614     RRETURN(MATCH_NOMATCH);
1615     }
1616     ecode++;
1617     break;
1618    
1619     case OP_VSPACE:
1620     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1621     GETCHARINCTEST(c, eptr);
1622     switch(c)
1623     {
1624     default: RRETURN(MATCH_NOMATCH);
1625     case 0x0a: /* LF */
1626     case 0x0b: /* VT */
1627     case 0x0c: /* FF */
1628     case 0x0d: /* CR */
1629     case 0x85: /* NEL */
1630     case 0x2028: /* LINE SEPARATOR */
1631     case 0x2029: /* PARAGRAPH SEPARATOR */
1632     break;
1633     }
1634     ecode++;
1635     break;
1636    
1637 nigel 77 #ifdef SUPPORT_UCP
1638     /* Check the next character by Unicode property. We will get here only
1639     if the support is in the binary; otherwise a compile-time error occurs. */
1640    
1641     case OP_PROP:
1642     case OP_NOTPROP:
1643     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1644     GETCHARINCTEST(c, eptr);
1645     {
1646 nigel 87 int chartype, script;
1647     int category = _pcre_ucp_findprop(c, &chartype, &script);
1648 nigel 77
1649 nigel 87 switch(ecode[1])
1650     {
1651     case PT_ANY:
1652     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1653     break;
1654 nigel 77
1655 nigel 87 case PT_LAMP:
1656     if ((chartype == ucp_Lu ||
1657     chartype == ucp_Ll ||
1658     chartype == ucp_Lt) == (op == OP_NOTPROP))
1659 nigel 77 RRETURN(MATCH_NOMATCH);
1660 nigel 87 break;
1661    
1662     case PT_GC:
1663     if ((ecode[2] != category) == (op == OP_PROP))
1664 nigel 77 RRETURN(MATCH_NOMATCH);
1665 nigel 87 break;
1666    
1667     case PT_PC:
1668     if ((ecode[2] != chartype) == (op == OP_PROP))
1669     RRETURN(MATCH_NOMATCH);
1670     break;
1671    
1672     case PT_SC:
1673     if ((ecode[2] != script) == (op == OP_PROP))
1674     RRETURN(MATCH_NOMATCH);
1675     break;
1676    
1677     default:
1678     RRETURN(PCRE_ERROR_INTERNAL);
1679 nigel 77 }
1680 nigel 87
1681     ecode += 3;
1682 nigel 77 }
1683     break;
1684    
1685     /* Match an extended Unicode sequence. We will get here only if the support
1686     is in the binary; otherwise a compile-time error occurs. */
1687    
1688     case OP_EXTUNI:
1689     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1690     GETCHARINCTEST(c, eptr);
1691     {
1692 nigel 87 int chartype, script;
1693     int category = _pcre_ucp_findprop(c, &chartype, &script);
1694 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1695     while (eptr < md->end_subject)
1696     {
1697     int len = 1;
1698     if (!utf8) c = *eptr; else
1699     {
1700     GETCHARLEN(c, eptr, len);
1701     }
1702 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1703 nigel 77 if (category != ucp_M) break;
1704     eptr += len;
1705     }
1706     }
1707     ecode++;
1708     break;
1709     #endif
1710    
1711    
1712     /* Match a back reference, possibly repeatedly. Look past the end of the
1713     item to see if there is repeat information following. The code is similar
1714     to that for character classes, but repeated for efficiency. Then obey
1715     similar code to character type repeats - written out again for speed.
1716     However, if the referenced string is the empty string, always treat
1717     it as matched, any number of times (otherwise there could be infinite
1718     loops). */
1719    
1720     case OP_REF:
1721     {
1722     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1723     ecode += 3; /* Advance past item */
1724    
1725     /* If the reference is unset, set the length to be longer than the amount
1726     of subject left; this ensures that every attempt at a match fails. We
1727     can't just fail here, because of the possibility of quantifiers with zero
1728     minima. */
1729    
1730     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1731     md->end_subject - eptr + 1 :
1732     md->offset_vector[offset+1] - md->offset_vector[offset];
1733    
1734     /* Set up for repetition, or handle the non-repeated case */
1735    
1736     switch (*ecode)
1737     {
1738     case OP_CRSTAR:
1739     case OP_CRMINSTAR:
1740     case OP_CRPLUS:
1741     case OP_CRMINPLUS:
1742     case OP_CRQUERY:
1743     case OP_CRMINQUERY:
1744     c = *ecode++ - OP_CRSTAR;
1745     minimize = (c & 1) != 0;
1746     min = rep_min[c]; /* Pick up values from tables; */
1747     max = rep_max[c]; /* zero for max => infinity */
1748     if (max == 0) max = INT_MAX;
1749     break;
1750    
1751     case OP_CRRANGE:
1752     case OP_CRMINRANGE:
1753     minimize = (*ecode == OP_CRMINRANGE);
1754     min = GET2(ecode, 1);
1755     max = GET2(ecode, 3);
1756     if (max == 0) max = INT_MAX;
1757     ecode += 5;
1758     break;
1759    
1760     default: /* No repeat follows */
1761     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1762     eptr += length;
1763     continue; /* With the main loop */
1764     }
1765    
1766     /* If the length of the reference is zero, just continue with the
1767     main loop. */
1768    
1769     if (length == 0) continue;
1770    
1771     /* First, ensure the minimum number of matches are present. We get back
1772     the length of the reference string explicitly rather than passing the
1773     address of eptr, so that eptr can be a register variable. */
1774    
1775     for (i = 1; i <= min; i++)
1776     {
1777     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1778     eptr += length;
1779     }
1780    
1781     /* If min = max, continue at the same level without recursion.
1782     They are not both allowed to be zero. */
1783    
1784     if (min == max) continue;
1785    
1786     /* If minimizing, keep trying and advancing the pointer */
1787    
1788     if (minimize)
1789     {
1790     for (fi = min;; fi++)
1791     {
1792 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1793 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1794     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1795     RRETURN(MATCH_NOMATCH);
1796     eptr += length;
1797     }
1798     /* Control never gets here */
1799     }
1800    
1801     /* If maximizing, find the longest string and work backwards */
1802    
1803     else
1804     {
1805     pp = eptr;
1806     for (i = min; i < max; i++)
1807     {
1808     if (!match_ref(offset, eptr, length, md, ims)) break;
1809     eptr += length;
1810     }
1811     while (eptr >= pp)
1812     {
1813 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1814 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1815     eptr -= length;
1816     }
1817     RRETURN(MATCH_NOMATCH);
1818     }
1819     }
1820     /* Control never gets here */
1821    
1822    
1823    
1824     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1825     used when all the characters in the class have values in the range 0-255,
1826     and either the matching is caseful, or the characters are in the range
1827     0-127 when UTF-8 processing is enabled. The only difference between
1828     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1829     encountered.
1830    
1831     First, look past the end of the item to see if there is repeat information
1832     following. Then obey similar code to character type repeats - written out
1833     again for speed. */
1834    
1835     case OP_NCLASS:
1836     case OP_CLASS:
1837     {
1838     data = ecode + 1; /* Save for matching */
1839     ecode += 33; /* Advance past the item */
1840    
1841     switch (*ecode)
1842     {
1843     case OP_CRSTAR:
1844     case OP_CRMINSTAR:
1845     case OP_CRPLUS:
1846     case OP_CRMINPLUS:
1847     case OP_CRQUERY:
1848     case OP_CRMINQUERY:
1849     c = *ecode++ - OP_CRSTAR;
1850     minimize = (c & 1) != 0;
1851     min = rep_min[c]; /* Pick up values from tables; */
1852     max = rep_max[c]; /* zero for max => infinity */
1853     if (max == 0) max = INT_MAX;
1854     break;
1855    
1856     case OP_CRRANGE:
1857     case OP_CRMINRANGE:
1858     minimize = (*ecode == OP_CRMINRANGE);
1859     min = GET2(ecode, 1);
1860     max = GET2(ecode, 3);
1861     if (max == 0) max = INT_MAX;
1862     ecode += 5;
1863     break;
1864    
1865     default: /* No repeat follows */
1866     min = max = 1;
1867     break;
1868     }
1869    
1870     /* First, ensure the minimum number of matches are present. */
1871    
1872     #ifdef SUPPORT_UTF8
1873     /* UTF-8 mode */
1874     if (utf8)
1875     {
1876     for (i = 1; i <= min; i++)
1877     {
1878     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1879     GETCHARINC(c, eptr);
1880     if (c > 255)
1881     {
1882     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1883     }
1884     else
1885     {
1886     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1887     }
1888     }
1889     }
1890     else
1891     #endif
1892     /* Not UTF-8 mode */
1893     {
1894     for (i = 1; i <= min; i++)
1895     {
1896     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1897     c = *eptr++;
1898     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1899     }
1900     }
1901    
1902     /* If max == min we can continue with the main loop without the
1903     need to recurse. */
1904    
1905     if (min == max) continue;
1906    
1907     /* If minimizing, keep testing the rest of the expression and advancing
1908     the pointer while it matches the class. */
1909    
1910     if (minimize)
1911     {
1912     #ifdef SUPPORT_UTF8
1913     /* UTF-8 mode */
1914     if (utf8)
1915     {
1916     for (fi = min;; fi++)
1917     {
1918 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1919 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1920     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1921     GETCHARINC(c, eptr);
1922     if (c > 255)
1923     {
1924     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1925     }
1926     else
1927     {
1928     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1929     }
1930     }
1931     }
1932     else
1933     #endif
1934     /* Not UTF-8 mode */
1935     {
1936     for (fi = min;; fi++)
1937     {
1938 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1939 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1940     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1941     c = *eptr++;
1942     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1943     }
1944     }
1945     /* Control never gets here */
1946     }
1947    
1948     /* If maximizing, find the longest possible run, then work backwards. */
1949    
1950     else
1951     {
1952     pp = eptr;
1953    
1954     #ifdef SUPPORT_UTF8
1955     /* UTF-8 mode */
1956     if (utf8)
1957     {
1958     for (i = min; i < max; i++)
1959     {
1960     int len = 1;
1961     if (eptr >= md->end_subject) break;
1962     GETCHARLEN(c, eptr, len);
1963     if (c > 255)
1964     {
1965     if (op == OP_CLASS) break;
1966     }
1967     else
1968     {
1969     if ((data[c/8] & (1 << (c&7))) == 0) break;
1970     }
1971     eptr += len;
1972     }
1973     for (;;)
1974     {
1975 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1976 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1977     if (eptr-- == pp) break; /* Stop if tried at original pos */
1978     BACKCHAR(eptr);
1979     }
1980     }
1981     else
1982     #endif
1983     /* Not UTF-8 mode */
1984     {
1985     for (i = min; i < max; i++)
1986     {
1987     if (eptr >= md->end_subject) break;
1988     c = *eptr;
1989     if ((data[c/8] & (1 << (c&7))) == 0) break;
1990     eptr++;
1991     }
1992     while (eptr >= pp)
1993     {
1994 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1995 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1996 nigel 77 eptr--;
1997     }
1998     }
1999    
2000     RRETURN(MATCH_NOMATCH);
2001     }
2002     }
2003     /* Control never gets here */
2004    
2005    
2006     /* Match an extended character class. This opcode is encountered only
2007     in UTF-8 mode, because that's the only time it is compiled. */
2008    
2009     #ifdef SUPPORT_UTF8
2010     case OP_XCLASS:
2011     {
2012     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2013     ecode += GET(ecode, 1); /* Advance past the item */
2014    
2015     switch (*ecode)
2016     {
2017     case OP_CRSTAR:
2018     case OP_CRMINSTAR:
2019     case OP_CRPLUS:
2020     case OP_CRMINPLUS:
2021     case OP_CRQUERY:
2022     case OP_CRMINQUERY:
2023     c = *ecode++ - OP_CRSTAR;
2024     minimize = (c & 1) != 0;
2025     min = rep_min[c]; /* Pick up values from tables; */
2026     max = rep_max[c]; /* zero for max => infinity */
2027     if (max == 0) max = INT_MAX;
2028     break;
2029    
2030     case OP_CRRANGE:
2031     case OP_CRMINRANGE:
2032     minimize = (*ecode == OP_CRMINRANGE);
2033     min = GET2(ecode, 1);
2034     max = GET2(ecode, 3);
2035     if (max == 0) max = INT_MAX;
2036     ecode += 5;
2037     break;
2038    
2039     default: /* No repeat follows */
2040     min = max = 1;
2041     break;
2042     }
2043    
2044     /* First, ensure the minimum number of matches are present. */
2045    
2046     for (i = 1; i <= min; i++)
2047     {
2048     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2049     GETCHARINC(c, eptr);
2050     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2051     }
2052    
2053     /* If max == min we can continue with the main loop without the
2054     need to recurse. */
2055    
2056     if (min == max) continue;
2057    
2058     /* If minimizing, keep testing the rest of the expression and advancing
2059     the pointer while it matches the class. */
2060    
2061     if (minimize)
2062     {
2063     for (fi = min;; fi++)
2064     {
2065 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2066 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2068     GETCHARINC(c, eptr);
2069     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2070     }
2071     /* Control never gets here */
2072     }
2073    
2074     /* If maximizing, find the longest possible run, then work backwards. */
2075    
2076     else
2077     {
2078     pp = eptr;
2079     for (i = min; i < max; i++)
2080     {
2081     int len = 1;
2082     if (eptr >= md->end_subject) break;
2083     GETCHARLEN(c, eptr, len);
2084     if (!_pcre_xclass(c, data)) break;
2085     eptr += len;
2086     }
2087     for(;;)
2088     {
2089 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2090 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2091     if (eptr-- == pp) break; /* Stop if tried at original pos */
2092 ph10 207 BACKCHAR(eptr);
2093 nigel 77 }
2094     RRETURN(MATCH_NOMATCH);
2095     }
2096    
2097     /* Control never gets here */
2098     }
2099     #endif /* End of XCLASS */
2100    
2101     /* Match a single character, casefully */
2102    
2103     case OP_CHAR:
2104     #ifdef SUPPORT_UTF8
2105     if (utf8)
2106     {
2107     length = 1;
2108     ecode++;
2109     GETCHARLEN(fc, ecode, length);
2110     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2111     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2112     }
2113     else
2114     #endif
2115    
2116     /* Non-UTF-8 mode */
2117     {
2118     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2119     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2120     ecode += 2;
2121     }
2122     break;
2123    
2124     /* Match a single character, caselessly */
2125    
2126     case OP_CHARNC:
2127     #ifdef SUPPORT_UTF8
2128     if (utf8)
2129     {
2130     length = 1;
2131     ecode++;
2132     GETCHARLEN(fc, ecode, length);
2133    
2134     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2135    
2136     /* If the pattern character's value is < 128, we have only one byte, and
2137     can use the fast lookup table. */
2138    
2139     if (fc < 128)
2140     {
2141     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2142     }
2143    
2144     /* Otherwise we must pick up the subject character */
2145    
2146     else
2147     {
2148 nigel 93 unsigned int dc;
2149 nigel 77 GETCHARINC(dc, eptr);
2150     ecode += length;
2151    
2152     /* If we have Unicode property support, we can use it to test the other
2153 nigel 87 case of the character, if there is one. */
2154 nigel 77
2155     if (fc != dc)
2156     {
2157     #ifdef SUPPORT_UCP
2158 nigel 87 if (dc != _pcre_ucp_othercase(fc))
2159 nigel 77 #endif
2160     RRETURN(MATCH_NOMATCH);
2161     }
2162     }
2163     }
2164     else
2165     #endif /* SUPPORT_UTF8 */
2166    
2167     /* Non-UTF-8 mode */
2168     {
2169     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2170     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2171     ecode += 2;
2172     }
2173     break;
2174    
2175 nigel 93 /* Match a single character repeatedly. */
2176 nigel 77
2177     case OP_EXACT:
2178     min = max = GET2(ecode, 1);
2179     ecode += 3;
2180     goto REPEATCHAR;
2181    
2182 nigel 93 case OP_POSUPTO:
2183     possessive = TRUE;
2184     /* Fall through */
2185    
2186 nigel 77 case OP_UPTO:
2187     case OP_MINUPTO:
2188     min = 0;
2189     max = GET2(ecode, 1);
2190     minimize = *ecode == OP_MINUPTO;
2191     ecode += 3;
2192     goto REPEATCHAR;
2193    
2194 nigel 93 case OP_POSSTAR:
2195     possessive = TRUE;
2196     min = 0;
2197     max = INT_MAX;
2198     ecode++;
2199     goto REPEATCHAR;
2200    
2201     case OP_POSPLUS:
2202     possessive = TRUE;
2203     min = 1;
2204     max = INT_MAX;
2205     ecode++;
2206     goto REPEATCHAR;
2207    
2208     case OP_POSQUERY:
2209     possessive = TRUE;
2210     min = 0;
2211     max = 1;
2212     ecode++;
2213     goto REPEATCHAR;
2214    
2215 nigel 77 case OP_STAR:
2216     case OP_MINSTAR:
2217     case OP_PLUS:
2218     case OP_MINPLUS:
2219     case OP_QUERY:
2220     case OP_MINQUERY:
2221     c = *ecode++ - OP_STAR;
2222     minimize = (c & 1) != 0;
2223     min = rep_min[c]; /* Pick up values from tables; */
2224     max = rep_max[c]; /* zero for max => infinity */
2225     if (max == 0) max = INT_MAX;
2226    
2227     /* Common code for all repeated single-character matches. We can give
2228     up quickly if there are fewer than the minimum number of characters left in
2229     the subject. */
2230    
2231     REPEATCHAR:
2232     #ifdef SUPPORT_UTF8
2233     if (utf8)
2234     {
2235     length = 1;
2236     charptr = ecode;
2237     GETCHARLEN(fc, ecode, length);
2238     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2239     ecode += length;
2240    
2241     /* Handle multibyte character matching specially here. There is
2242     support for caseless matching if UCP support is present. */
2243    
2244     if (length > 1)
2245     {
2246     #ifdef SUPPORT_UCP
2247 nigel 93 unsigned int othercase;
2248 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2249 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2250 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2251 ph10 115 else oclength = 0;
2252 nigel 77 #endif /* SUPPORT_UCP */
2253    
2254     for (i = 1; i <= min; i++)
2255     {
2256     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2257 ph10 123 #ifdef SUPPORT_UCP
2258 nigel 77 /* Need braces because of following else */
2259     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2260     else
2261     {
2262     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2263     eptr += oclength;
2264     }
2265 ph10 115 #else /* without SUPPORT_UCP */
2266     else { RRETURN(MATCH_NOMATCH); }
2267 ph10 123 #endif /* SUPPORT_UCP */
2268 nigel 77 }
2269    
2270     if (min == max) continue;
2271    
2272     if (minimize)
2273     {
2274     for (fi = min;; fi++)
2275     {
2276 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2277 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2278     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2279     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2280 ph10 123 #ifdef SUPPORT_UCP
2281 nigel 77 /* Need braces because of following else */
2282     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2283     else
2284     {
2285     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2286     eptr += oclength;
2287     }
2288 ph10 115 #else /* without SUPPORT_UCP */
2289     else { RRETURN (MATCH_NOMATCH); }
2290     #endif /* SUPPORT_UCP */
2291 nigel 77 }
2292     /* Control never gets here */
2293     }
2294 nigel 93
2295     else /* Maximize */
2296 nigel 77 {
2297     pp = eptr;
2298     for (i = min; i < max; i++)
2299     {
2300     if (eptr > md->end_subject - length) break;
2301     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2302 ph10 123 #ifdef SUPPORT_UCP
2303 nigel 77 else if (oclength == 0) break;
2304     else
2305     {
2306     if (memcmp(eptr, occhars, oclength) != 0) break;
2307     eptr += oclength;
2308     }
2309 ph10 115 #else /* without SUPPORT_UCP */
2310     else break;
2311 ph10 123 #endif /* SUPPORT_UCP */
2312 nigel 77 }
2313 nigel 93
2314     if (possessive) continue;
2315 ph10 120 for(;;)
2316 nigel 77 {
2317 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2318 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2319 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2320 ph10 115 #ifdef SUPPORT_UCP
2321     eptr--;
2322     BACKCHAR(eptr);
2323 ph10 123 #else /* without SUPPORT_UCP */
2324 nigel 77 eptr -= length;
2325 ph10 123 #endif /* SUPPORT_UCP */
2326 nigel 77 }
2327     }
2328     /* Control never gets here */
2329     }
2330    
2331     /* If the length of a UTF-8 character is 1, we fall through here, and
2332     obey the code as for non-UTF-8 characters below, though in this case the
2333     value of fc will always be < 128. */
2334     }
2335     else
2336     #endif /* SUPPORT_UTF8 */
2337    
2338     /* When not in UTF-8 mode, load a single-byte character. */
2339     {
2340     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2341     fc = *ecode++;
2342     }
2343    
2344     /* The value of fc at this point is always less than 256, though we may or
2345     may not be in UTF-8 mode. The code is duplicated for the caseless and
2346     caseful cases, for speed, since matching characters is likely to be quite
2347     common. First, ensure the minimum number of matches are present. If min =
2348     max, continue at the same level without recursing. Otherwise, if
2349     minimizing, keep trying the rest of the expression and advancing one
2350     matching character if failing, up to the maximum. Alternatively, if
2351     maximizing, find the maximum number of characters and work backwards. */
2352    
2353     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2354     max, eptr));
2355    
2356     if ((ims & PCRE_CASELESS) != 0)
2357     {
2358     fc = md->lcc[fc];
2359     for (i = 1; i <= min; i++)
2360     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2361     if (min == max) continue;
2362     if (minimize)
2363     {
2364     for (fi = min;; fi++)
2365     {
2366 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2367 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2368     if (fi >= max || eptr >= md->end_subject ||
2369     fc != md->lcc[*eptr++])
2370     RRETURN(MATCH_NOMATCH);
2371     }
2372     /* Control never gets here */
2373     }
2374 nigel 93 else /* Maximize */
2375 nigel 77 {
2376     pp = eptr;
2377     for (i = min; i < max; i++)
2378     {
2379     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2380     eptr++;
2381     }
2382 nigel 93 if (possessive) continue;
2383 nigel 77 while (eptr >= pp)
2384     {
2385 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2386 nigel 77 eptr--;
2387     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2388     }
2389     RRETURN(MATCH_NOMATCH);
2390     }
2391     /* Control never gets here */
2392     }
2393    
2394     /* Caseful comparisons (includes all multi-byte characters) */
2395    
2396     else
2397     {
2398     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2399     if (min == max) continue;
2400     if (minimize)
2401     {
2402     for (fi = min;; fi++)
2403     {
2404 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2405 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2406     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2407     RRETURN(MATCH_NOMATCH);
2408     }
2409     /* Control never gets here */
2410     }
2411 nigel 93 else /* Maximize */
2412 nigel 77 {
2413     pp = eptr;
2414     for (i = min; i < max; i++)
2415     {
2416     if (eptr >= md->end_subject || fc != *eptr) break;
2417     eptr++;
2418     }
2419 nigel 93 if (possessive) continue;
2420 nigel 77 while (eptr >= pp)
2421     {
2422 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2423 nigel 77 eptr--;
2424     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2425     }
2426     RRETURN(MATCH_NOMATCH);
2427     }
2428     }
2429     /* Control never gets here */
2430    
2431     /* Match a negated single one-byte character. The character we are
2432     checking can be multibyte. */
2433    
2434     case OP_NOT:
2435     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2436     ecode++;
2437     GETCHARINCTEST(c, eptr);
2438     if ((ims & PCRE_CASELESS) != 0)
2439     {
2440     #ifdef SUPPORT_UTF8
2441     if (c < 256)
2442     #endif
2443     c = md->lcc[c];
2444     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2445     }
2446     else
2447     {
2448     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2449     }
2450     break;
2451    
2452     /* Match a negated single one-byte character repeatedly. This is almost a
2453     repeat of the code for a repeated single character, but I haven't found a
2454     nice way of commoning these up that doesn't require a test of the
2455     positive/negative option for each character match. Maybe that wouldn't add
2456     very much to the time taken, but character matching *is* what this is all
2457     about... */
2458    
2459     case OP_NOTEXACT:
2460     min = max = GET2(ecode, 1);
2461     ecode += 3;
2462     goto REPEATNOTCHAR;
2463    
2464     case OP_NOTUPTO:
2465     case OP_NOTMINUPTO:
2466     min = 0;
2467     max = GET2(ecode, 1);
2468     minimize = *ecode == OP_NOTMINUPTO;
2469     ecode += 3;
2470     goto REPEATNOTCHAR;
2471    
2472 nigel 93 case OP_NOTPOSSTAR:
2473     possessive = TRUE;
2474     min = 0;
2475     max = INT_MAX;
2476     ecode++;
2477     goto REPEATNOTCHAR;
2478    
2479     case OP_NOTPOSPLUS:
2480     possessive = TRUE;
2481     min = 1;
2482     max = INT_MAX;
2483     ecode++;
2484     goto REPEATNOTCHAR;
2485    
2486     case OP_NOTPOSQUERY:
2487     possessive = TRUE;
2488     min = 0;
2489     max = 1;
2490     ecode++;
2491     goto REPEATNOTCHAR;
2492    
2493     case OP_NOTPOSUPTO:
2494     possessive = TRUE;
2495     min = 0;
2496     max = GET2(ecode, 1);
2497     ecode += 3;
2498     goto REPEATNOTCHAR;
2499    
2500 nigel 77 case OP_NOTSTAR:
2501     case OP_NOTMINSTAR:
2502     case OP_NOTPLUS:
2503     case OP_NOTMINPLUS:
2504     case OP_NOTQUERY:
2505     case OP_NOTMINQUERY:
2506     c = *ecode++ - OP_NOTSTAR;
2507     minimize = (c & 1) != 0;
2508     min = rep_min[c]; /* Pick up values from tables; */
2509     max = rep_max[c]; /* zero for max => infinity */
2510     if (max == 0) max = INT_MAX;
2511    
2512     /* Common code for all repeated single-byte matches. We can give up quickly
2513     if there are fewer than the minimum number of bytes left in the
2514     subject. */
2515    
2516     REPEATNOTCHAR:
2517     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2518     fc = *ecode++;
2519    
2520     /* The code is duplicated for the caseless and caseful cases, for speed,
2521     since matching characters is likely to be quite common. First, ensure the
2522     minimum number of matches are present. If min = max, continue at the same
2523     level without recursing. Otherwise, if minimizing, keep trying the rest of
2524     the expression and advancing one matching character if failing, up to the
2525     maximum. Alternatively, if maximizing, find the maximum number of
2526     characters and work backwards. */
2527    
2528     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2529     max, eptr));
2530    
2531     if ((ims & PCRE_CASELESS) != 0)
2532     {
2533     fc = md->lcc[fc];
2534    
2535     #ifdef SUPPORT_UTF8
2536     /* UTF-8 mode */
2537     if (utf8)
2538     {
2539 nigel 93 register unsigned int d;
2540 nigel 77 for (i = 1; i <= min; i++)
2541     {
2542     GETCHARINC(d, eptr);
2543     if (d < 256) d = md->lcc[d];
2544     if (fc == d) RRETURN(MATCH_NOMATCH);
2545     }
2546     }
2547     else
2548     #endif
2549    
2550     /* Not UTF-8 mode */
2551     {
2552     for (i = 1; i <= min; i++)
2553     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2554     }
2555    
2556     if (min == max) continue;
2557    
2558     if (minimize)
2559     {
2560     #ifdef SUPPORT_UTF8
2561     /* UTF-8 mode */
2562     if (utf8)
2563     {
2564 nigel 93 register unsigned int d;
2565 nigel 77 for (fi = min;; fi++)
2566     {
2567 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2568 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2569     GETCHARINC(d, eptr);
2570     if (d < 256) d = md->lcc[d];
2571     if (fi >= max || eptr >= md->end_subject || fc == d)
2572     RRETURN(MATCH_NOMATCH);
2573     }
2574     }
2575     else
2576     #endif
2577     /* Not UTF-8 mode */
2578     {
2579     for (fi = min;; fi++)
2580     {
2581 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2582 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2583     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2584     RRETURN(MATCH_NOMATCH);
2585     }
2586     }
2587     /* Control never gets here */
2588     }
2589    
2590     /* Maximize case */
2591    
2592     else
2593     {
2594     pp = eptr;
2595    
2596     #ifdef SUPPORT_UTF8
2597     /* UTF-8 mode */
2598     if (utf8)
2599     {
2600 nigel 93 register unsigned int d;
2601 nigel 77 for (i = min; i < max; i++)
2602     {
2603     int len = 1;
2604     if (eptr >= md->end_subject) break;
2605     GETCHARLEN(d, eptr, len);
2606     if (d < 256) d = md->lcc[d];
2607     if (fc == d) break;
2608     eptr += len;
2609     }
2610 nigel 93 if (possessive) continue;
2611     for(;;)
2612 nigel 77 {
2613 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2614 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2615     if (eptr-- == pp) break; /* Stop if tried at original pos */
2616     BACKCHAR(eptr);
2617     }
2618     }
2619     else
2620     #endif
2621     /* Not UTF-8 mode */
2622     {
2623     for (i = min; i < max; i++)
2624     {
2625     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2626     eptr++;
2627     }
2628 nigel 93 if (possessive) continue;
2629 nigel 77 while (eptr >= pp)
2630     {
2631 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2632 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2633     eptr--;
2634     }
2635     }
2636    
2637     RRETURN(MATCH_NOMATCH);
2638     }
2639     /* Control never gets here */
2640     }
2641    
2642     /* Caseful comparisons */
2643    
2644     else
2645     {
2646     #ifdef SUPPORT_UTF8
2647     /* UTF-8 mode */
2648     if (utf8)
2649     {
2650 nigel 93 register unsigned int d;
2651 nigel 77 for (i = 1; i <= min; i++)
2652     {
2653     GETCHARINC(d, eptr);
2654     if (fc == d) RRETURN(MATCH_NOMATCH);
2655     }
2656     }
2657     else
2658     #endif
2659     /* Not UTF-8 mode */
2660     {
2661     for (i = 1; i <= min; i++)
2662     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2663     }
2664    
2665     if (min == max) continue;
2666    
2667     if (minimize)
2668     {
2669     #ifdef SUPPORT_UTF8
2670     /* UTF-8 mode */
2671     if (utf8)
2672     {
2673 nigel 93 register unsigned int d;
2674 nigel 77 for (fi = min;; fi++)
2675     {
2676 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2677 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2678     GETCHARINC(d, eptr);
2679     if (fi >= max || eptr >= md->end_subject || fc == d)
2680     RRETURN(MATCH_NOMATCH);
2681     }
2682     }
2683     else
2684     #endif
2685     /* Not UTF-8 mode */
2686     {
2687     for (fi = min;; fi++)
2688     {
2689 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2690 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2691     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2692     RRETURN(MATCH_NOMATCH);
2693     }
2694     }
2695     /* Control never gets here */
2696     }
2697    
2698     /* Maximize case */
2699    
2700     else
2701     {
2702     pp = eptr;
2703    
2704     #ifdef SUPPORT_UTF8
2705     /* UTF-8 mode */
2706     if (utf8)
2707     {
2708 nigel 93 register unsigned int d;
2709 nigel 77 for (i = min; i < max; i++)
2710     {
2711     int len = 1;
2712     if (eptr >= md->end_subject) break;
2713     GETCHARLEN(d, eptr, len);
2714     if (fc == d) break;
2715     eptr += len;
2716     }
2717 nigel 93 if (possessive) continue;
2718 nigel 77 for(;;)
2719     {
2720 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2721 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2722     if (eptr-- == pp) break; /* Stop if tried at original pos */
2723     BACKCHAR(eptr);
2724     }
2725     }
2726     else
2727     #endif
2728     /* Not UTF-8 mode */
2729     {
2730     for (i = min; i < max; i++)
2731     {
2732     if (eptr >= md->end_subject || fc == *eptr) break;
2733     eptr++;
2734     }
2735 nigel 93 if (possessive) continue;
2736 nigel 77 while (eptr >= pp)
2737     {
2738 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2739 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2740     eptr--;
2741     }
2742     }
2743    
2744     RRETURN(MATCH_NOMATCH);
2745     }
2746     }
2747     /* Control never gets here */
2748    
2749     /* Match a single character type repeatedly; several different opcodes
2750     share code. This is very similar to the code for single characters, but we
2751     repeat it in the interests of efficiency. */
2752    
2753     case OP_TYPEEXACT:
2754     min = max = GET2(ecode, 1);
2755     minimize = TRUE;
2756     ecode += 3;
2757     goto REPEATTYPE;
2758    
2759     case OP_TYPEUPTO:
2760     case OP_TYPEMINUPTO:
2761     min = 0;
2762     max = GET2(ecode, 1);
2763     minimize = *ecode == OP_TYPEMINUPTO;
2764     ecode += 3;
2765     goto REPEATTYPE;
2766    
2767 nigel 93 case OP_TYPEPOSSTAR:
2768     possessive = TRUE;
2769     min = 0;
2770     max = INT_MAX;
2771     ecode++;
2772     goto REPEATTYPE;
2773    
2774     case OP_TYPEPOSPLUS:
2775     possessive = TRUE;
2776     min = 1;
2777     max = INT_MAX;
2778     ecode++;
2779     goto REPEATTYPE;
2780    
2781     case OP_TYPEPOSQUERY:
2782     possessive = TRUE;
2783     min = 0;
2784     max = 1;
2785     ecode++;
2786     goto REPEATTYPE;
2787    
2788     case OP_TYPEPOSUPTO:
2789     possessive = TRUE;
2790     min = 0;
2791     max = GET2(ecode, 1);
2792     ecode += 3;
2793     goto REPEATTYPE;
2794    
2795 nigel 77 case OP_TYPESTAR:
2796     case OP_TYPEMINSTAR:
2797     case OP_TYPEPLUS:
2798     case OP_TYPEMINPLUS:
2799     case OP_TYPEQUERY:
2800     case OP_TYPEMINQUERY:
2801     c = *ecode++ - OP_TYPESTAR;
2802     minimize = (c & 1) != 0;
2803     min = rep_min[c]; /* Pick up values from tables; */
2804     max = rep_max[c]; /* zero for max => infinity */
2805     if (max == 0) max = INT_MAX;
2806    
2807     /* Common code for all repeated single character type matches. Note that
2808     in UTF-8 mode, '.' matches a character of any length, but for the other
2809     character types, the valid characters are all one-byte long. */
2810    
2811     REPEATTYPE:
2812     ctype = *ecode++; /* Code for the character type */
2813    
2814     #ifdef SUPPORT_UCP
2815     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2816     {
2817     prop_fail_result = ctype == OP_NOTPROP;
2818     prop_type = *ecode++;
2819 nigel 87 prop_value = *ecode++;
2820 nigel 77 }
2821     else prop_type = -1;
2822     #endif
2823    
2824     /* First, ensure the minimum number of matches are present. Use inline
2825     code for maximizing the speed, and do the type test once at the start
2826     (i.e. keep it out of the loop). Also we can test that there are at least
2827     the minimum number of bytes before we start. This isn't as effective in
2828     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2829     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2830     and single-bytes. */
2831    
2832     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2833     if (min > 0)
2834     {
2835     #ifdef SUPPORT_UCP
2836 nigel 87 if (prop_type >= 0)
2837 nigel 77 {
2838 nigel 87 switch(prop_type)
2839 nigel 77 {
2840 nigel 87 case PT_ANY:
2841     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2842     for (i = 1; i <= min; i++)
2843     {
2844     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2845 ph10 184 GETCHARINCTEST(c, eptr);
2846 nigel 87 }
2847     break;
2848    
2849     case PT_LAMP:
2850     for (i = 1; i <= min; i++)
2851     {
2852     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2853 ph10 184 GETCHARINCTEST(c, eptr);
2854 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2855     if ((prop_chartype == ucp_Lu ||
2856     prop_chartype == ucp_Ll ||
2857     prop_chartype == ucp_Lt) == prop_fail_result)
2858     RRETURN(MATCH_NOMATCH);
2859     }
2860     break;
2861    
2862     case PT_GC:
2863     for (i = 1; i <= min; i++)
2864     {
2865     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2866 ph10 184 GETCHARINCTEST(c, eptr);
2867 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2868     if ((prop_category == prop_value) == prop_fail_result)
2869     RRETURN(MATCH_NOMATCH);
2870     }
2871     break;
2872    
2873     case PT_PC:
2874     for (i = 1; i <= min; i++)
2875     {
2876     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2877 ph10 184 GETCHARINCTEST(c, eptr);
2878 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2879     if ((prop_chartype == prop_value) == prop_fail_result)
2880     RRETURN(MATCH_NOMATCH);
2881     }
2882     break;
2883    
2884     case PT_SC:
2885     for (i = 1; i <= min; i++)
2886     {
2887     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2888 ph10 184 GETCHARINCTEST(c, eptr);
2889 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2890     if ((prop_script == prop_value) == prop_fail_result)
2891     RRETURN(MATCH_NOMATCH);
2892     }
2893     break;
2894    
2895     default:
2896     RRETURN(PCRE_ERROR_INTERNAL);
2897 nigel 77 }
2898     }
2899    
2900     /* Match extended Unicode sequences. We will get here only if the
2901     support is in the binary; otherwise a compile-time error occurs. */
2902    
2903     else if (ctype == OP_EXTUNI)
2904     {
2905     for (i = 1; i <= min; i++)
2906     {
2907     GETCHARINCTEST(c, eptr);
2908 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2909 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2910     while (eptr < md->end_subject)
2911     {
2912     int len = 1;
2913     if (!utf8) c = *eptr; else
2914     {
2915     GETCHARLEN(c, eptr, len);
2916     }
2917 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2918 nigel 77 if (prop_category != ucp_M) break;
2919     eptr += len;
2920     }
2921     }
2922     }
2923    
2924     else
2925     #endif /* SUPPORT_UCP */
2926    
2927     /* Handle all other cases when the coding is UTF-8 */
2928    
2929     #ifdef SUPPORT_UTF8
2930     if (utf8) switch(ctype)
2931     {
2932     case OP_ANY:
2933     for (i = 1; i <= min; i++)
2934     {
2935     if (eptr >= md->end_subject ||
2936 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2937 nigel 77 RRETURN(MATCH_NOMATCH);
2938 nigel 91 eptr++;
2939 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2940     }
2941     break;
2942    
2943     case OP_ANYBYTE:
2944     eptr += min;
2945     break;
2946    
2947 nigel 93 case OP_ANYNL:
2948     for (i = 1; i <= min; i++)
2949     {
2950     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951     GETCHARINC(c, eptr);
2952     switch(c)
2953     {
2954     default: RRETURN(MATCH_NOMATCH);
2955     case 0x000d:
2956     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2957     break;
2958     case 0x000a:
2959     case 0x000b:
2960     case 0x000c:
2961     case 0x0085:
2962     case 0x2028:
2963     case 0x2029:
2964     break;
2965     }
2966     }
2967     break;
2968    
2969 ph10 178 case OP_NOT_HSPACE:
2970     for (i = 1; i <= min; i++)
2971     {
2972     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2973     GETCHARINC(c, eptr);
2974     switch(c)
2975     {
2976     default: break;
2977     case 0x09: /* HT */
2978     case 0x20: /* SPACE */
2979     case 0xa0: /* NBSP */
2980     case 0x1680: /* OGHAM SPACE MARK */
2981     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2982     case 0x2000: /* EN QUAD */
2983     case 0x2001: /* EM QUAD */
2984     case 0x2002: /* EN SPACE */
2985     case 0x2003: /* EM SPACE */
2986     case 0x2004: /* THREE-PER-EM SPACE */
2987     case 0x2005: /* FOUR-PER-EM SPACE */
2988     case 0x2006: /* SIX-PER-EM SPACE */
2989     case 0x2007: /* FIGURE SPACE */
2990     case 0x2008: /* PUNCTUATION SPACE */
2991     case 0x2009: /* THIN SPACE */
2992     case 0x200A: /* HAIR SPACE */
2993     case 0x202f: /* NARROW NO-BREAK SPACE */
2994     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2995     case 0x3000: /* IDEOGRAPHIC SPACE */
2996     RRETURN(MATCH_NOMATCH);
2997     }
2998     }
2999     break;
3000 ph10 182
3001 ph10 178 case OP_HSPACE:
3002     for (i = 1; i <= min; i++)
3003     {
3004     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3005     GETCHARINC(c, eptr);
3006     switch(c)
3007     {
3008     default: RRETURN(MATCH_NOMATCH);
3009     case 0x09: /* HT */
3010     case 0x20: /* SPACE */
3011     case 0xa0: /* NBSP */
3012     case 0x1680: /* OGHAM SPACE MARK */
3013     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3014     case 0x2000: /* EN QUAD */
3015     case 0x2001: /* EM QUAD */
3016     case 0x2002: /* EN SPACE */
3017     case 0x2003: /* EM SPACE */
3018     case 0x2004: /* THREE-PER-EM SPACE */
3019     case 0x2005: /* FOUR-PER-EM SPACE */
3020     case 0x2006: /* SIX-PER-EM SPACE */
3021     case 0x2007: /* FIGURE SPACE */
3022     case 0x2008: /* PUNCTUATION SPACE */
3023     case 0x2009: /* THIN SPACE */
3024     case 0x200A: /* HAIR SPACE */
3025     case 0x202f: /* NARROW NO-BREAK SPACE */
3026     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3027     case 0x3000: /* IDEOGRAPHIC SPACE */
3028     break;
3029     }
3030     }
3031     break;
3032 ph10 182
3033 ph10 178 case OP_NOT_VSPACE:
3034     for (i = 1; i <= min; i++)
3035     {
3036     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3037     GETCHARINC(c, eptr);
3038     switch(c)
3039     {
3040     default: break;
3041     case 0x0a: /* LF */
3042     case 0x0b: /* VT */
3043     case 0x0c: /* FF */
3044     case 0x0d: /* CR */
3045     case 0x85: /* NEL */
3046     case 0x2028: /* LINE SEPARATOR */
3047     case 0x2029: /* PARAGRAPH SEPARATOR */
3048     RRETURN(MATCH_NOMATCH);
3049     }
3050     }
3051     break;
3052 ph10 182
3053 ph10 178 case OP_VSPACE:
3054     for (i = 1; i <= min; i++)
3055     {
3056     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3057     GETCHARINC(c, eptr);
3058     switch(c)
3059     {
3060     default: RRETURN(MATCH_NOMATCH);
3061     case 0x0a: /* LF */
3062     case 0x0b: /* VT */
3063     case 0x0c: /* FF */
3064     case 0x0d: /* CR */
3065     case 0x85: /* NEL */
3066     case 0x2028: /* LINE SEPARATOR */
3067     case 0x2029: /* PARAGRAPH SEPARATOR */
3068 ph10 182 break;
3069 ph10 178 }
3070     }
3071     break;
3072    
3073 nigel 77 case OP_NOT_DIGIT:
3074     for (i = 1; i <= min; i++)
3075     {
3076     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3077     GETCHARINC(c, eptr);
3078     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3079     RRETURN(MATCH_NOMATCH);
3080     }
3081     break;
3082    
3083     case OP_DIGIT:
3084     for (i = 1; i <= min; i++)
3085     {
3086     if (eptr >= md->end_subject ||
3087     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3088     RRETURN(MATCH_NOMATCH);
3089     /* No need to skip more bytes - we know it's a 1-byte character */
3090     }
3091     break;
3092    
3093     case OP_NOT_WHITESPACE:
3094     for (i = 1; i <= min; i++)
3095     {
3096     if (eptr >= md->end_subject ||
3097     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
3098     RRETURN(MATCH_NOMATCH);
3099     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3100     }
3101     break;
3102    
3103     case OP_WHITESPACE:
3104     for (i = 1; i <= min; i++)
3105     {
3106     if (eptr >= md->end_subject ||
3107     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3108     RRETURN(MATCH_NOMATCH);
3109     /* No need to skip more bytes - we know it's a 1-byte character */
3110     }
3111     break;
3112    
3113     case OP_NOT_WORDCHAR:
3114     for (i = 1; i <= min; i++)
3115     {
3116     if (eptr >= md->end_subject ||
3117     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
3118     RRETURN(MATCH_NOMATCH);
3119     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3120     }
3121     break;
3122    
3123     case OP_WORDCHAR:
3124     for (i = 1; i <= min; i++)
3125     {
3126     if (eptr >= md->end_subject ||
3127     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3128     RRETURN(MATCH_NOMATCH);
3129     /* No need to skip more bytes - we know it's a 1-byte character */
3130     }
3131     break;
3132    
3133     default:
3134     RRETURN(PCRE_ERROR_INTERNAL);
3135     } /* End switch(ctype) */
3136    
3137     else
3138     #endif /* SUPPORT_UTF8 */
3139    
3140     /* Code for the non-UTF-8 case for minimum matching of operators other
3141 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3142     number of bytes present, as this was tested above. */
3143 nigel 77
3144     switch(ctype)
3145     {
3146     case OP_ANY:
3147     if ((ims & PCRE_DOTALL) == 0)
3148     {
3149     for (i = 1; i <= min; i++)
3150 nigel 91 {
3151 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3152 nigel 91 eptr++;
3153     }
3154 nigel 77 }
3155     else eptr += min;
3156     break;
3157    
3158     case OP_ANYBYTE:
3159     eptr += min;
3160     break;
3161    
3162 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3163     bytes are present in this case. */
3164    
3165     case OP_ANYNL:
3166     for (i = 1; i <= min; i++)
3167     {
3168     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3169     switch(*eptr++)
3170     {
3171     default: RRETURN(MATCH_NOMATCH);
3172     case 0x000d:
3173     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3174     break;
3175     case 0x000a:
3176     case 0x000b:
3177     case 0x000c:
3178     case 0x0085:
3179     break;
3180     }
3181     }
3182     break;
3183    
3184 ph10 178 case OP_NOT_HSPACE:
3185     for (i = 1; i <= min; i++)
3186     {
3187     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3188     switch(*eptr++)
3189     {
3190     default: break;
3191     case 0x09: /* HT */
3192     case 0x20: /* SPACE */
3193     case 0xa0: /* NBSP */
3194     RRETURN(MATCH_NOMATCH);
3195     }
3196     }
3197     break;
3198    
3199     case OP_HSPACE:
3200     for (i = 1; i <= min; i++)
3201     {
3202     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3203     switch(*eptr++)
3204     {
3205     default: RRETURN(MATCH_NOMATCH);
3206     case 0x09: /* HT */
3207     case 0x20: /* SPACE */
3208     case 0xa0: /* NBSP */
3209 ph10 182 break;
3210 ph10 178 }
3211     }
3212     break;
3213    
3214     case OP_NOT_VSPACE:
3215     for (i = 1; i <= min; i++)
3216     {
3217     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3218     switch(*eptr++)
3219     {
3220     default: break;
3221     case 0x0a: /* LF */
3222     case 0x0b: /* VT */
3223     case 0x0c: /* FF */
3224     case 0x0d: /* CR */
3225     case 0x85: /* NEL */
3226     RRETURN(MATCH_NOMATCH);
3227     }
3228     }
3229     break;
3230    
3231     case OP_VSPACE:
3232     for (i = 1; i <= min; i++)
3233     {
3234     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3235     switch(*eptr++)
3236     {
3237     default: RRETURN(MATCH_NOMATCH);
3238     case 0x0a: /* LF */
3239     case 0x0b: /* VT */
3240     case 0x0c: /* FF */
3241     case 0x0d: /* CR */
3242     case 0x85: /* NEL */
3243 ph10 182 break;
3244 ph10 178 }
3245     }
3246     break;
3247    
3248 nigel 77 case OP_NOT_DIGIT:
3249     for (i = 1; i <= min; i++)
3250     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3251     break;
3252    
3253     case OP_DIGIT:
3254     for (i = 1; i <= min; i++)
3255     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3256     break;
3257    
3258     case OP_NOT_WHITESPACE:
3259     for (i = 1; i <= min; i++)
3260     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3261     break;
3262    
3263     case OP_WHITESPACE:
3264     for (i = 1; i <= min; i++)
3265     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3266     break;
3267    
3268     case OP_NOT_WORDCHAR:
3269     for (i = 1; i <= min; i++)
3270     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3271     RRETURN(MATCH_NOMATCH);
3272     break;
3273    
3274     case OP_WORDCHAR:
3275     for (i = 1; i <= min; i++)
3276     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3277     RRETURN(MATCH_NOMATCH);
3278     break;
3279    
3280     default:
3281     RRETURN(PCRE_ERROR_INTERNAL);
3282     }
3283     }
3284    
3285     /* If min = max, continue at the same level without recursing */
3286    
3287     if (min == max) continue;
3288    
3289     /* If minimizing, we have to test the rest of the pattern before each
3290     subsequent match. Again, separate the UTF-8 case for speed, and also
3291     separate the UCP cases. */
3292    
3293     if (minimize)
3294     {
3295     #ifdef SUPPORT_UCP
3296 nigel 87 if (prop_type >= 0)
3297 nigel 77 {
3298 nigel 87 switch(prop_type)
3299 nigel 77 {
3300 nigel 87 case PT_ANY:
3301     for (fi = min;; fi++)
3302     {
3303 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3304 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3305     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3306     GETCHARINC(c, eptr);
3307     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3308     }
3309 nigel 93 /* Control never gets here */
3310 nigel 87
3311     case PT_LAMP:
3312     for (fi = min;; fi++)
3313     {
3314 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3315 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3316     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3317     GETCHARINC(c, eptr);
3318     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3319     if ((prop_chartype == ucp_Lu ||
3320     prop_chartype == ucp_Ll ||
3321     prop_chartype == ucp_Lt) == prop_fail_result)
3322     RRETURN(MATCH_NOMATCH);
3323     }
3324 nigel 93 /* Control never gets here */
3325 nigel 87
3326     case PT_GC:
3327     for (fi = min;; fi++)
3328     {
3329 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3330 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3331     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3332     GETCHARINC(c, eptr);
3333     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3334     if ((prop_category == prop_value) == prop_fail_result)
3335     RRETURN(MATCH_NOMATCH);
3336     }
3337 nigel 93 /* Control never gets here */
3338 nigel 87
3339     case PT_PC:
3340     for (fi = min;; fi++)
3341     {
3342 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3343 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3344     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3345     GETCHARINC(c, eptr);
3346     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3347     if ((prop_chartype == prop_value) == prop_fail_result)
3348     RRETURN(MATCH_NOMATCH);
3349     }
3350 nigel 93 /* Control never gets here */
3351 nigel 87
3352     case PT_SC:
3353     for (fi = min;; fi++)
3354     {
3355 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3356 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3357     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3358     GETCHARINC(c, eptr);
3359     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3360     if ((prop_script == prop_value) == prop_fail_result)
3361     RRETURN(MATCH_NOMATCH);
3362     }
3363 nigel 93 /* Control never gets here */
3364 nigel 87
3365     default:
3366     RRETURN(PCRE_ERROR_INTERNAL);
3367 nigel 77 }
3368     }
3369    
3370     /* Match extended Unicode sequences. We will get here only if the
3371     support is in the binary; otherwise a compile-time error occurs. */
3372    
3373     else if (ctype == OP_EXTUNI)
3374     {
3375     for (fi = min;; fi++)
3376     {
3377 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3378 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3379     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3380     GETCHARINCTEST(c, eptr);
3381 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3382 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3383     while (eptr < md->end_subject)
3384     {
3385     int len = 1;
3386     if (!utf8) c = *eptr; else
3387     {
3388     GETCHARLEN(c, eptr, len);
3389     }
3390 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3391 nigel 77 if (prop_category != ucp_M) break;
3392     eptr += len;
3393     }
3394     }
3395     }
3396    
3397     else
3398     #endif /* SUPPORT_UCP */
3399    
3400     #ifdef SUPPORT_UTF8
3401     /* UTF-8 mode */
3402     if (utf8)
3403     {
3404     for (fi = min;; fi++)
3405     {
3406 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3407 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3408 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3409     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3410 nigel 93 IS_NEWLINE(eptr)))
3411 nigel 91 RRETURN(MATCH_NOMATCH);
3412 nigel 77
3413     GETCHARINC(c, eptr);
3414     switch(ctype)
3415     {
3416 nigel 91 case OP_ANY: /* This is the DOTALL case */
3417 nigel 77 break;
3418    
3419     case OP_ANYBYTE:
3420     break;
3421    
3422 nigel 93 case OP_ANYNL:
3423     switch(c)
3424     {
3425     default: RRETURN(MATCH_NOMATCH);
3426     case 0x000d:
3427     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3428     break;
3429     case 0x000a:
3430     case 0x000b:
3431     case 0x000c:
3432     case 0x0085:
3433     case 0x2028:
3434     case 0x2029:
3435     break;
3436     }
3437     break;
3438    
3439 ph10 178 case OP_NOT_HSPACE:
3440     switch(c)
3441     {
3442     default: break;
3443     case 0x09: /* HT */
3444     case 0x20: /* SPACE */
3445     case 0xa0: /* NBSP */
3446     case 0x1680: /* OGHAM SPACE MARK */
3447     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3448     case 0x2000: /* EN QUAD */
3449     case 0x2001: /* EM QUAD */
3450     case 0x2002: /* EN SPACE */
3451     case 0x2003: /* EM SPACE */
3452     case 0x2004: /* THREE-PER-EM SPACE */
3453     case 0x2005: /* FOUR-PER-EM SPACE */
3454     case 0x2006: /* SIX-PER-EM SPACE */
3455     case 0x2007: /* FIGURE SPACE */
3456     case 0x2008: /* PUNCTUATION SPACE */
3457     case 0x2009: /* THIN SPACE */
3458     case 0x200A: /* HAIR SPACE */
3459     case 0x202f: /* NARROW NO-BREAK SPACE */
3460     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3461     case 0x3000: /* IDEOGRAPHIC SPACE */
3462     RRETURN(MATCH_NOMATCH);
3463     }
3464     break;
3465    
3466     case OP_HSPACE:
3467     switch(c)
3468     {
3469     default: RRETURN(MATCH_NOMATCH);
3470     case 0x09: /* HT */
3471     case 0x20: /* SPACE */
3472     case 0xa0: /* NBSP */
3473     case 0x1680: /* OGHAM SPACE MARK */
3474     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3475     case 0x2000: /* EN QUAD */
3476     case 0x2001: /* EM QUAD */
3477     case 0x2002: /* EN SPACE */
3478     case 0x2003: /* EM SPACE */
3479     case 0x2004: /* THREE-PER-EM SPACE */
3480     case 0x2005: /* FOUR-PER-EM SPACE */
3481     case 0x2006: /* SIX-PER-EM SPACE */
3482     case 0x2007: /* FIGURE SPACE */
3483     case 0x2008: /* PUNCTUATION SPACE */
3484     case 0x2009: /* THIN SPACE */
3485     case 0x200A: /* HAIR SPACE */
3486     case 0x202f: /* NARROW NO-BREAK SPACE */
3487     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3488     case 0x3000: /* IDEOGRAPHIC SPACE */
3489     break;
3490     }
3491     break;
3492    
3493     case OP_NOT_VSPACE:
3494     switch(c)
3495     {
3496     default: break;
3497     case 0x0a: /* LF */
3498     case 0x0b: /* VT */
3499     case 0x0c: /* FF */
3500     case 0x0d: /* CR */
3501     case 0x85: /* NEL */
3502     case 0x2028: /* LINE SEPARATOR */
3503     case 0x2029: /* PARAGRAPH SEPARATOR */
3504     RRETURN(MATCH_NOMATCH);
3505     }
3506     break;
3507    
3508     case OP_VSPACE:
3509     switch(c)
3510     {
3511     default: RRETURN(MATCH_NOMATCH);
3512     case 0x0a: /* LF */
3513     case 0x0b: /* VT */
3514     case 0x0c: /* FF */
3515     case 0x0d: /* CR */
3516     case 0x85: /* NEL */
3517     case 0x2028: /* LINE SEPARATOR */
3518     case 0x2029: /* PARAGRAPH SEPARATOR */
3519     break;
3520     }
3521     break;
3522    
3523 nigel 77 case OP_NOT_DIGIT:
3524     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3525     RRETURN(MATCH_NOMATCH);
3526     break;
3527    
3528     case OP_DIGIT:
3529     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3530     RRETURN(MATCH_NOMATCH);
3531     break;
3532    
3533     case OP_NOT_WHITESPACE:
3534     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3535     RRETURN(MATCH_NOMATCH);
3536     break;
3537    
3538     case OP_WHITESPACE:
3539     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3540     RRETURN(MATCH_NOMATCH);
3541     break;
3542    
3543     case OP_NOT_WORDCHAR:
3544     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3545     RRETURN(MATCH_NOMATCH);
3546     break;
3547    
3548     case OP_WORDCHAR:
3549     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3550     RRETURN(MATCH_NOMATCH);
3551     break;
3552    
3553     default:
3554     RRETURN(PCRE_ERROR_INTERNAL);
3555     }
3556     }
3557     }
3558     else
3559     #endif
3560     /* Not UTF-8 mode */
3561     {
3562     for (fi = min;; fi++)
3563     {
3564 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3565 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3566 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3567 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3568 nigel 91 RRETURN(MATCH_NOMATCH);
3569    
3570 nigel 77 c = *eptr++;
3571     switch(ctype)
3572     {
3573 nigel 91 case OP_ANY: /* This is the DOTALL case */
3574 nigel 77 break;
3575    
3576     case OP_ANYBYTE:
3577     break;
3578    
3579 nigel 93 case OP_ANYNL:
3580     switch(c)
3581     {
3582     default: RRETURN(MATCH_NOMATCH);
3583     case 0x000d:
3584     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3585     break;
3586     case 0x000a:
3587     case 0x000b:
3588     case 0x000c:
3589     case 0x0085:
3590     break;
3591     }
3592     break;
3593    
3594 ph10 178 case OP_NOT_HSPACE:
3595     switch(c)
3596     {
3597     default: break;
3598     case 0x09: /* HT */
3599     case 0x20: /* SPACE */
3600     case 0xa0: /* NBSP */
3601     RRETURN(MATCH_NOMATCH);
3602     }
3603     break;
3604    
3605     case OP_HSPACE:
3606     switch(c)
3607     {
3608     default: RRETURN(MATCH_NOMATCH);
3609     case 0x09: /* HT */
3610     case 0x20: /* SPACE */
3611     case 0xa0: /* NBSP */
3612     break;
3613     }
3614     break;
3615    
3616     case OP_NOT_VSPACE:
3617     switch(c)
3618     {
3619     default: break;
3620     case 0x0a: /* LF */
3621     case 0x0b: /* VT */
3622     case 0x0c: /* FF */
3623     case 0x0d: /* CR */
3624     case 0x85: /* NEL */
3625     RRETURN(MATCH_NOMATCH);
3626     }
3627     break;
3628    
3629     case OP_VSPACE:
3630     switch(c)
3631     {
3632     default: RRETURN(MATCH_NOMATCH);
3633     case 0x0a: /* LF */
3634     case 0x0b: /* VT */
3635     case 0x0c: /* FF */
3636     case 0x0d: /* CR */
3637     case 0x85: /* NEL */
3638     break;
3639     }
3640     break;
3641    
3642 nigel 77 case OP_NOT_DIGIT:
3643     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3644     break;
3645    
3646     case OP_DIGIT:
3647     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3648     break;
3649    
3650     case OP_NOT_WHITESPACE:
3651     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3652     break;
3653    
3654     case OP_WHITESPACE:
3655     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3656     break;
3657    
3658     case OP_NOT_WORDCHAR:
3659     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3660     break;
3661    
3662     case OP_WORDCHAR:
3663     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3664     break;
3665    
3666     default:
3667     RRETURN(PCRE_ERROR_INTERNAL);
3668     }
3669     }
3670     }
3671     /* Control never gets here */
3672     }
3673    
3674 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3675 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3676     UTF-8 and UCP stuff separate. */
3677    
3678     else
3679     {
3680     pp = eptr; /* Remember where we started */
3681    
3682     #ifdef SUPPORT_UCP
3683 nigel 87 if (prop_type >= 0)
3684 nigel 77 {
3685 nigel 87 switch(prop_type)
3686 nigel 77 {
3687 nigel 87 case PT_ANY:
3688     for (i = min; i < max; i++)
3689     {
3690     int len = 1;
3691     if (eptr >= md->end_subject) break;
3692     GETCHARLEN(c, eptr, len);
3693     if (prop_fail_result) break;
3694     eptr+= len;
3695     }
3696     break;
3697    
3698     case PT_LAMP:
3699     for (i = min; i < max; i++)
3700     {
3701     int len = 1;
3702     if (eptr >= md->end_subject) break;
3703     GETCHARLEN(c, eptr, len);
3704     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3705     if ((prop_chartype == ucp_Lu ||
3706     prop_chartype == ucp_Ll ||
3707     prop_chartype == ucp_Lt) == prop_fail_result)
3708     break;
3709     eptr+= len;
3710     }
3711     break;
3712    
3713     case PT_GC:
3714     for (i = min; i < max; i++)
3715     {
3716     int len = 1;
3717     if (eptr >= md->end_subject) break;
3718     GETCHARLEN(c, eptr, len);
3719     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3720     if ((prop_category == prop_value) == prop_fail_result)
3721     break;
3722     eptr+= len;
3723     }
3724     break;
3725    
3726     case PT_PC:
3727     for (i = min; i < max; i++)
3728     {
3729     int len = 1;
3730     if (eptr >= md->end_subject) break;
3731     GETCHARLEN(c, eptr, len);
3732     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3733     if ((prop_chartype == prop_value) == prop_fail_result)
3734     break;
3735     eptr+= len;
3736     }
3737     break;
3738    
3739     case PT_SC:
3740     for (i = min; i < max; i++)
3741     {
3742     int len = 1;
3743     if (eptr >= md->end_subject) break;
3744     GETCHARLEN(c, eptr, len);
3745     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3746     if ((prop_script == prop_value) == prop_fail_result)
3747     break;
3748     eptr+= len;
3749     }
3750     break;
3751 nigel 77 }
3752    
3753     /* eptr is now past the end of the maximum run */
3754    
3755 nigel 93 if (possessive) continue;
3756 nigel 77 for(;;)
3757     {
3758 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3759 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3760     if (eptr-- == pp) break; /* Stop if tried at original pos */
3761 ph10 207 if (utf8) BACKCHAR(eptr);
3762 nigel 77 }
3763     }
3764    
3765     /* Match extended Unicode sequences. We will get here only if the
3766     support is in the binary; otherwise a compile-time error occurs. */
3767    
3768     else if (ctype == OP_EXTUNI)
3769     {
3770     for (i = min; i < max; i++)
3771     {
3772     if (eptr >= md->end_subject) break;
3773     GETCHARINCTEST(c, eptr);
3774 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3775 nigel 77 if (prop_category == ucp_M) break;
3776     while (eptr < md->end_subject)
3777     {
3778     int len = 1;
3779     if (!utf8) c = *eptr; else
3780     {
3781     GETCHARLEN(c, eptr, len);
3782     }
3783 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3784 nigel 77 if (prop_category != ucp_M) break;
3785     eptr += len;
3786     }
3787     }
3788    
3789     /* eptr is now past the end of the maximum run */
3790    
3791 nigel 93 if (possessive) continue;
3792 nigel 77 for(;;)
3793     {
3794 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3795 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3796     if (eptr-- == pp) break; /* Stop if tried at original pos */
3797     for (;;) /* Move back over one extended */
3798     {
3799     int len = 1;
3800     if (!utf8) c = *eptr; else
3801     {
3802 ph10 207 BACKCHAR(eptr);
3803 nigel 77 GETCHARLEN(c, eptr, len);
3804     }
3805 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3806 nigel 77 if (prop_category != ucp_M) break;
3807     eptr--;
3808     }
3809     }
3810     }
3811    
3812     else
3813     #endif /* SUPPORT_UCP */
3814    
3815     #ifdef SUPPORT_UTF8
3816     /* UTF-8 mode */
3817    
3818     if (utf8)
3819     {
3820     switch(ctype)
3821     {
3822     case OP_ANY:
3823     if (max < INT_MAX)
3824     {
3825     if ((ims & PCRE_DOTALL) == 0)
3826     {
3827     for (i = min; i < max; i++)
3828     {
3829 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3830 nigel 77 eptr++;
3831     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3832     }
3833     }
3834     else
3835     {
3836     for (i = min; i < max; i++)
3837     {
3838 nigel 91 if (eptr >= md->end_subject) break;
3839 nigel 77 eptr++;
3840     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3841     }
3842     }
3843     }
3844    
3845     /* Handle unlimited UTF-8 repeat */
3846    
3847     else
3848     {
3849     if ((ims & PCRE_DOTALL) == 0)
3850     {
3851     for (i = min; i < max; i++)
3852     {
3853 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3854 nigel 77 eptr++;
3855 ph10 190 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3856 nigel 77 }
3857     }
3858     else
3859     {
3860 ph10 190 eptr = md->end_subject;
3861 nigel 77 }
3862     }
3863     break;
3864    
3865     /* The byte case is the same as non-UTF8 */
3866    
3867     case OP_ANYBYTE:
3868     c = max - min;
3869 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3870     c = md->end_subject - eptr;
3871 nigel 77 eptr += c;
3872     break;
3873    
3874 nigel 93 case OP_ANYNL:
3875     for (i = min; i < max; i++)
3876     {
3877     int len = 1;
3878     if (eptr >= md->end_subject) break;
3879     GETCHARLEN(c, eptr, len);
3880     if (c == 0x000d)
3881     {
3882     if (++eptr >= md->end_subject) break;
3883     if (*eptr == 0x000a) eptr++;
3884     }
3885     else
3886     {
3887     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3888     c != 0x0085 && c != 0x2028 && c != 0x2029)
3889     break;
3890     eptr += len;
3891     }
3892     }
3893     break;
3894    
3895 ph10 178 case OP_NOT_HSPACE:
3896 ph10 182 case OP_HSPACE:
3897 ph10 178 for (i = min; i < max; i++)
3898     {
3899 ph10 182 BOOL gotspace;
3900 ph10 178 int len = 1;
3901     if (eptr >= md->end_subject) break;
3902     GETCHARLEN(c, eptr, len);
3903     switch(c)
3904 ph10 182 {
3905     default: gotspace = FALSE; break;
3906 ph10 178 case 0x09: /* HT */
3907     case 0x20: /* SPACE */
3908     case 0xa0: /* NBSP */
3909     case 0x1680: /* OGHAM SPACE MARK */
3910     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3911     case 0x2000: /* EN QUAD */
3912     case 0x2001: /* EM QUAD */
3913     case 0x2002: /* EN SPACE */
3914     case 0x2003: /* EM SPACE */
3915     case 0x2004: /* THREE-PER-EM SPACE */
3916     case 0x2005: /* FOUR-PER-EM SPACE */
3917     case 0x2006: /* SIX-PER-EM SPACE */
3918     case 0x2007: /* FIGURE SPACE */
3919     case 0x2008: /* PUNCTUATION SPACE */
3920     case 0x2009: /* THIN SPACE */
3921     case 0x200A: /* HAIR SPACE */
3922     case 0x202f: /* NARROW NO-BREAK SPACE */
3923     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3924     case 0x3000: /* IDEOGRAPHIC SPACE */
3925     gotspace = TRUE;
3926 ph10 182 break;
3927 ph10 178 }
3928     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3929     eptr += len;
3930     }
3931     break;
3932    
3933     case OP_NOT_VSPACE:
3934 ph10 182 case OP_VSPACE:
3935 ph10 178 for (i = min; i < max; i++)
3936     {
3937 ph10 182 BOOL gotspace;
3938 ph10 178 int len = 1;
3939     if (eptr >= md->end_subject) break;
3940     GETCHARLEN(c, eptr, len);
3941     switch(c)
3942     {
3943 ph10 182 default: gotspace = FALSE; break;
3944 ph10 178 case 0x0a: /* LF */
3945     case 0x0b: /* VT */
3946     case 0x0c: /* FF */
3947     case 0x0d: /* CR */
3948     case 0x85: /* NEL */
3949     case 0x2028: /* LINE SEPARATOR */
3950     case 0x2029: /* PARAGRAPH SEPARATOR */
3951     gotspace = TRUE;
3952     break;
3953     }
3954 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3955 ph10 178 eptr += len;
3956     }
3957     break;
3958    
3959 nigel 77 case OP_NOT_DIGIT:
3960     for (i = min; i < max; i++)
3961     {
3962     int len = 1;
3963     if (eptr >= md->end_subject) break;
3964     GETCHARLEN(c, eptr, len);
3965     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3966     eptr+= len;
3967     }
3968     break;
3969    
3970     case OP_DIGIT:
3971     for (i = min; i < max; i++)
3972     {
3973     int len = 1;
3974     if (eptr >= md->end_subject) break;
3975     GETCHARLEN(c, eptr, len);
3976     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3977     eptr+= len;
3978     }
3979     break;
3980    
3981     case OP_NOT_WHITESPACE:
3982     for (i = min; i < max; i++)
3983     {
3984     int len = 1;
3985     if (eptr >= md->end_subject) break;
3986     GETCHARLEN(c, eptr, len);
3987     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3988     eptr+= len;
3989     }
3990     break;
3991    
3992     case OP_WHITESPACE:
3993     for (i = min; i < max; i++)
3994     {
3995     int len = 1;
3996     if (eptr >= md->end_subject) break;
3997     GETCHARLEN(c, eptr, len);
3998     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3999     eptr+= len;
4000     }
4001     break;
4002    
4003     case OP_NOT_WORDCHAR:
4004     for (i = min; i < max; i++)
4005     {
4006     int len = 1;
4007     if (eptr >= md->end_subject) break;
4008     GETCHARLEN(c, eptr, len);
4009     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4010     eptr+= len;
4011     }
4012     break;
4013    
4014     case OP_WORDCHAR:
4015     for (i = min; i < max; i++)
4016     {
4017     int len = 1;
4018     if (eptr >= md->end_subject) break;
4019     GETCHARLEN(c, eptr, len);
4020     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4021     eptr+= len;
4022     }
4023     break;
4024    
4025     default:
4026     RRETURN(PCRE_ERROR_INTERNAL);
4027     }
4028    
4029     /* eptr is now past the end of the maximum run */
4030    
4031 nigel 93 if (possessive) continue;
4032 nigel 77 for(;;)
4033     {
4034 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4035 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4036     if (eptr-- == pp) break; /* Stop if tried at original pos */
4037     BACKCHAR(eptr);
4038     }
4039     }
4040     else
4041 ph10 207 #endif /* SUPPORT_UTF8 */
4042 nigel 77
4043     /* Not UTF-8 mode */
4044     {
4045     switch(ctype)
4046     {
4047     case OP_ANY:
4048     if ((ims & PCRE_DOTALL) == 0)
4049     {
4050     for (i = min; i < max; i++)
4051     {
4052 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4053 nigel 77 eptr++;
4054     }
4055     break;
4056     }
4057     /* For DOTALL case, fall through and treat as \C */
4058    
4059     case OP_ANYBYTE:
4060     c = max - min;
4061 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4062     c = md->end_subject - eptr;
4063 nigel 77 eptr += c;
4064     break;
4065    
4066 nigel 93 case OP_ANYNL:
4067     for (i = min; i < max; i++)
4068     {
4069     if (eptr >= md->end_subject) break;
4070     c = *eptr;
4071     if (c == 0x000d)
4072     {
4073     if (++eptr >= md->end_subject) break;
4074     if (*eptr == 0x000a) eptr++;
4075     }
4076     else
4077     {
4078     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
4079     break;
4080     eptr++;
4081     }
4082     }
4083     break;
4084    
4085 ph10 178 case OP_NOT_HSPACE:
4086     for (i = min; i < max; i++)
4087     {
4088     if (eptr >= md->end_subject) break;
4089     c = *eptr;
4090     if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4091 ph10 182 eptr++;
4092 ph10 178 }
4093     break;
4094    
4095     case OP_HSPACE:
4096     for (i = min; i < max; i++)
4097     {
4098     if (eptr >= md->end_subject) break;
4099     c = *eptr;
4100     if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4101 ph10 182 eptr++;
4102 ph10 178 }
4103     break;
4104    
4105     case OP_NOT_VSPACE:
4106     for (i = min; i < max; i++)
4107     {
4108     if (eptr >= md->end_subject) break;
4109     c = *eptr;
4110     if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4111     break;
4112 ph10 182 eptr++;
4113 ph10 178 }
4114     break;
4115    
4116     case OP_VSPACE:
4117     for (i = min; i < max; i++)
4118     {
4119     if (eptr >= md->end_subject) break;
4120     c = *eptr;
4121     if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4122     break;
4123     eptr++;
4124     }
4125     break;
4126    
4127 nigel 77 case OP_NOT_DIGIT:
4128     for (i = min; i < max; i++)
4129     {
4130     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4131     break;
4132     eptr++;
4133     }
4134     break;
4135    
4136     case OP_DIGIT:
4137     for (i = min; i < max; i++)
4138     {
4139     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4140     break;
4141     eptr++;
4142     }
4143     break;
4144    
4145     case OP_NOT_WHITESPACE:
4146     for (i = min; i < max; i++)
4147     {
4148     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4149     break;
4150     eptr++;
4151     }
4152     break;
4153    
4154     case OP_WHITESPACE:
4155     for (i = min; i < max; i++)
4156