/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 214 - (hide annotations) (download)
Wed Aug 15 14:08:10 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 149064 byte(s)
Fixed another looking-too-far-back-in-non-UTF-8-mode bug.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 199 #include <config.h>
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. Each one is returned by the
handler for the correspondingly named opcode (OP_COMMIT, OP_PRUNE, OP_SKIP,
OP_THEN) when the remainder of the pattern yields MATCH_NOMATCH. The bracket
alternation loops in match() treat MATCH_THEN in the same way as MATCH_NOMATCH
and go on to the next alternative. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. This value is the
dimension of the stacksave array in match() and of Xstacksave in the heap
frame. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity.
NOTE(review): the six entries presumably correspond to the simple repeat
opcodes in the order *, *?, +, +?, ?, ?? - confirm against the code that
indexes these tables, which is outside this excerpt. */
86    
87     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if the requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161     /* Separate the caselesss case for speed */
162    
163     if ((ims & PCRE_CASELESS) != 0)
164     {
165     while (length-- > 0)
166     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167     }
168     else
169     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170    
171     return TRUE;
172     }
173    
174    
175    
176     /***************************************************************************
177     ****************************************************************************
178     RECURSION IN THE match() FUNCTION
179    
180 nigel 87 The match() function is highly recursive, though not every recursive call
181     increases the recursive depth. Nevertheless, some regular expressions can cause
182     it to recurse to a great depth. I was writing for Unix, so I just let it call
183     itself recursively. This uses the stack for saving everything that has to be
184     saved for a recursive call. On Unix, the stack can be large, and this works
185     fine.
186 nigel 77
187 nigel 87 It turns out that on some non-Unix-like systems there are problems with
188     programs that use a lot of stack. (This despite the fact that every last chip
189     has oodles of memory these days, and techniques for extending the stack have
190     been known for decades.) So....
191 nigel 77
192     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193     calls by keeping local variables that need to be preserved in blocks of memory
194 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
195 nigel 77 achieve this so that the actual code doesn't look very different to what it
196     always used to.
197 ph10 164
198 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
199 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
200     Switzer, the use of longjmp() has been abolished, at the cost of having to
201     provide a unique number for each call to RMATCH. There is no way of generating
202     a sequence of numbers at compile time in C. I have given them names, to make
203     them stand out more clearly.
204    
205     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
208     don't have indeterminate values; this has meant that the frame size can be
209 ph10 164 reduced because the result can be "passed back" by straight setting of the
210     variable instead of being passed in the frame.
211 nigel 77 ****************************************************************************
212     ***************************************************************************/
213    
214 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
215     below must be updated in sync. Each RMATCH call site passes one of these
names as its final ("rw") argument. In the NO_RECURSE version of RMATCH the
value is stored in the Xwhere field of the calling frame, and the same name is
pasted into the label L_<name> that the macro plants after its jump to
HEAP_RECURSE. The stack-recursion version of RMATCH ignores the argument. */
216 nigel 77
217 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
218     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
219     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
220     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
221 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
222 ph10 212 RM51, RM52, RM53, RM54 };
223 ph10 164
224 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
225 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 ph10 164 actually used in this definition. */
227 nigel 77
228     #ifndef NO_RECURSE
#define REGISTER register

/* RMATCH(ra,rb,rc,rd,re,rf,rg,rw) makes a genuine recursive call of match(),
passing the current mstart and rdepth+1 in addition to the given arguments, and
leaves the result in rrc. RRETURN(ra) returns ra from the current call. The
debugging versions also trace the call and the return on stdout.

The debugging versions are wrapped in do { ... } while (0) so that, like the
production versions, each expands to exactly one statement that needs the
terminating semicolon written at the point of use. A bare { ... } block followed
by a semicolon cannot appear between "if (...)" and "else". */

#ifdef DEBUG
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  do \
    { \
    printf("match() called in line %d\n", __LINE__); \
    rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
    printf("to line %d\n", __LINE__); \
    } \
  while (0)
#define RRETURN(ra) \
  do \
    { \
    printf("match() returned %d from line %d ", ra, __LINE__); \
    return ra; \
    } \
  while (0)
#else
#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra
#endif
248    
249 nigel 77 #else
250    
251    
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes. */

#define REGISTER

/* RMATCH(ra,rb,rc,rd,re,rf,rg,rw) simulates a recursive call of match(). A new
heapframe is obtained from pcre_stack_malloc() and loaded with the argument
values (eptr, ecode, the current mstart, offset_top, ims, eptrb, flags) and a
depth one greater than the current frame's. The current frame records rw in
Xwhere, the new frame is linked to it through Xprevframe and becomes the
current frame, and control jumps to HEAP_RECURSE. The label L_<rw> that follows
the jump marks where the "caller" carries on afterwards.

If pcre_stack_malloc() returns NULL, no frame is written. Instead the current
activation is abandoned with PCRE_ERROR_NOMEMORY by means of RRETURN, so the
failure is handed back like any other negative result from match(). */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
277    
/* RRETURN(ra) for the heap version: unlink and free the current frame. If that
was the top-level frame (its Xprevframe is NULL), return ra from match() for
real. Otherwise put ra in rrc, make the previous frame current again, and jump
to HEAP_RETURN. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
290    
291    
292     /* Structure for remembering the local variables in a private frame */
293    
294     typedef struct heapframe {
295     struct heapframe *Xprevframe;
296    
297     /* Function arguments that may change */
298    
299     const uschar *Xeptr;
300     const uschar *Xecode;
301 ph10 172 const uschar *Xmstart;
302 nigel 77 int Xoffset_top;
303     long int Xims;
304     eptrblock *Xeptrb;
305     int Xflags;
306 nigel 91 unsigned int Xrdepth;
307 nigel 77
308     /* Function local variables */
309    
310     const uschar *Xcallpat;
311     const uschar *Xcharptr;
312     const uschar *Xdata;
313     const uschar *Xnext;
314     const uschar *Xpp;
315     const uschar *Xprev;
316     const uschar *Xsaved_eptr;
317    
318     recursion_info Xnew_recursive;
319    
320     BOOL Xcur_is_word;
321     BOOL Xcondition;
322     BOOL Xprev_is_word;
323    
324     unsigned long int Xoriginal_ims;
325    
326     #ifdef SUPPORT_UCP
327     int Xprop_type;
328 nigel 87 int Xprop_value;
329 nigel 77 int Xprop_fail_result;
330     int Xprop_category;
331     int Xprop_chartype;
332 nigel 87 int Xprop_script;
333 ph10 123 int Xoclength;
334     uschar Xocchars[8];
335 nigel 77 #endif
336    
337     int Xctype;
338 nigel 93 unsigned int Xfc;
339 nigel 77 int Xfi;
340     int Xlength;
341     int Xmax;
342     int Xmin;
343     int Xnumber;
344     int Xoffset;
345     int Xop;
346     int Xsave_capture_last;
347     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348     int Xstacksave[REC_STACK_SAVE_MAX];
349    
350     eptrblock Xnewptrb;
351    
352 ph10 164 /* Where to jump back to */
353 nigel 77
354 ph10 164 int Xwhere;
355 ph10 165
356 nigel 77 } heapframe;
357    
358     #endif
359    
360    
361     /***************************************************************************
362     ***************************************************************************/
363    
364    
365    
366     /*************************************************
367     * Match from current position *
368     *************************************************/
369    
370 nigel 93 /* This function is called recursively in many circumstances. Whenever it
371 nigel 77 returns a negative (error) response, the outer incarnation must also return the
372     same response.
373    
374     Performance note: It might be tempting to extract commonly used fields from the
375     md structure (e.g. utf8, end_subject) into individual variables to improve
376     performance. Tests using gcc on a SPARC disproved this; in the first case, it
377     made performance worse.
378    
379     Arguments:
380 nigel 93 eptr pointer to current character in subject
381     ecode pointer to current position in compiled code
382 ph10 168 mstart pointer to the current match start position (can be modified
383 ph10 172 by encountering \K)
384 nigel 77 offset_top current top pointer
385     md pointer to "static" info for the match
386     ims current /i, /m, and /s options
387     eptrb pointer to chain of blocks containing eptr at start of
388     brackets - for testing for empty matches
389     flags can contain
390     match_condassert - this is an assertion condition
391 nigel 93 match_cbegroup - this is the start of an unlimited repeat
392     group that can match an empty string
393 nigel 87 rdepth the recursion depth
394 nigel 77
395     Returns: MATCH_MATCH if matched ) these values are >= 0
396     MATCH_NOMATCH if failed to match )
397     a negative PCRE_ERROR_xxx value if aborted by an error condition
398 nigel 87 (e.g. stopped by repeated call or recursion limit)
399 nigel 77 */
400    
401     static int
402 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 nigel 91 int flags, unsigned int rdepth)
405 nigel 77 {
406     /* These variables do not need to be preserved over recursion in this function,
407 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
408     "register" because they are used a lot in loops. */
409 nigel 77
410 nigel 91 register int rrc; /* Returns from recursive calls */
411     register int i; /* Used for loops not involving calls to RMATCH() */
412 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414 nigel 77
415 nigel 93 BOOL minimize, possessive; /* Quantifier options */
416    
417 nigel 77 /* When recursion is not being used, all "local" variables that have to be
418     preserved over calls to RMATCH() are part of a "frame" which is obtained from
419     heap storage. Set up the top-level frame here; others are obtained from the
420     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421    
422     #ifdef NO_RECURSE
423     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424     frame->Xprevframe = NULL; /* Marks the top level */
425    
426     /* Copy in the original argument variables */
427    
428     frame->Xeptr = eptr;
429     frame->Xecode = ecode;
430 ph10 168 frame->Xmstart = mstart;
431 nigel 77 frame->Xoffset_top = offset_top;
432     frame->Xims = ims;
433     frame->Xeptrb = eptrb;
434     frame->Xflags = flags;
435 nigel 87 frame->Xrdepth = rdepth;
436 nigel 77
437     /* This is where control jumps back to in order to effect "recursion" */
438    
439     HEAP_RECURSE:
440    
441     /* Macros make the argument variables come from the current frame */
442    
443     #define eptr frame->Xeptr
444     #define ecode frame->Xecode
445 ph10 168 #define mstart frame->Xmstart
446 nigel 77 #define offset_top frame->Xoffset_top
447     #define ims frame->Xims
448     #define eptrb frame->Xeptrb
449     #define flags frame->Xflags
450 nigel 87 #define rdepth frame->Xrdepth
451 nigel 77
452     /* Ditto for the local variables */
453    
454     #ifdef SUPPORT_UTF8
455     #define charptr frame->Xcharptr
456     #endif
457     #define callpat frame->Xcallpat
458     #define data frame->Xdata
459     #define next frame->Xnext
460     #define pp frame->Xpp
461     #define prev frame->Xprev
462     #define saved_eptr frame->Xsaved_eptr
463    
464     #define new_recursive frame->Xnew_recursive
465    
466     #define cur_is_word frame->Xcur_is_word
467     #define condition frame->Xcondition
468     #define prev_is_word frame->Xprev_is_word
469    
470     #define original_ims frame->Xoriginal_ims
471    
472     #ifdef SUPPORT_UCP
473     #define prop_type frame->Xprop_type
474 nigel 87 #define prop_value frame->Xprop_value
475 nigel 77 #define prop_fail_result frame->Xprop_fail_result
476     #define prop_category frame->Xprop_category
477     #define prop_chartype frame->Xprop_chartype
478 nigel 87 #define prop_script frame->Xprop_script
479 ph10 115 #define oclength frame->Xoclength
480     #define occhars frame->Xocchars
481 nigel 77 #endif
482    
483     #define ctype frame->Xctype
484     #define fc frame->Xfc
485     #define fi frame->Xfi
486     #define length frame->Xlength
487     #define max frame->Xmax
488     #define min frame->Xmin
489     #define number frame->Xnumber
490     #define offset frame->Xoffset
491     #define op frame->Xop
492     #define save_capture_last frame->Xsave_capture_last
493     #define save_offset1 frame->Xsave_offset1
494     #define save_offset2 frame->Xsave_offset2
495     #define save_offset3 frame->Xsave_offset3
496     #define stacksave frame->Xstacksave
497    
498     #define newptrb frame->Xnewptrb
499    
500     /* When recursion is being used, local variables are allocated on the stack and
501     get preserved during recursion in the normal way. In this environment, fi and
502     i, and fc and c, can be the same variables. */
503    
504 nigel 93 #else /* NO_RECURSE not defined */
505 nigel 77 #define fi i
506     #define fc c
507    
508    
509 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510     const uschar *charptr; /* in small blocks of the code. My normal */
511     #endif /* style of coding would have declared */
512     const uschar *callpat; /* them within each of those blocks. */
513     const uschar *data; /* However, in order to accommodate the */
514     const uschar *next; /* version of this code that uses an */
515     USPTR pp; /* external "stack" implemented on the */
516     const uschar *prev; /* heap, it is easier to declare them all */
517     USPTR saved_eptr; /* here, so the declarations can be cut */
518     /* out in a block. The only declarations */
519     recursion_info new_recursive; /* within blocks below are for variables */
520     /* that do not have to be preserved over */
521     BOOL cur_is_word; /* a recursive call to RMATCH(). */
522     BOOL condition;
523 nigel 77 BOOL prev_is_word;
524    
525     unsigned long int original_ims;
526    
527     #ifdef SUPPORT_UCP
528     int prop_type;
529 nigel 87 int prop_value;
530 nigel 77 int prop_fail_result;
531     int prop_category;
532     int prop_chartype;
533 nigel 87 int prop_script;
534 ph10 115 int oclength;
535     uschar occhars[8];
536 nigel 77 #endif
537    
538     int ctype;
539     int length;
540     int max;
541     int min;
542     int number;
543     int offset;
544     int op;
545     int save_capture_last;
546     int save_offset1, save_offset2, save_offset3;
547     int stacksave[REC_STACK_SAVE_MAX];
548    
549     eptrblock newptrb;
550 nigel 93 #endif /* NO_RECURSE */
551 nigel 77
552     /* These statements are here to stop the compiler complaining about uninitialized
553     variables. */
554    
555     #ifdef SUPPORT_UCP
556 nigel 87 prop_value = 0;
557 nigel 77 prop_fail_result = 0;
558     #endif
559    
560 nigel 93
561 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
562     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563     used. Thanks to Ian Taylor for noticing this possibility and sending the
564     original patch. */
565    
566     TAIL_RECURSE:
567    
568 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
569     are specified by the macro RMATCH and RRETURN is used to return. When
570     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571     and a "return", respectively (possibly with some debugging if DEBUG is
572     defined). However, RMATCH isn't like a function call because it's quite a
573     complicated macro. It has to be used in one particular way. This shouldn't,
574     however, impact performance when true recursion is being used. */
575 nigel 77
576 ph10 164 #ifdef SUPPORT_UTF8
577     utf8 = md->utf8; /* Local copy of the flag */
578     #else
579     utf8 = FALSE;
580     #endif
581    
582 nigel 87 /* First check that we haven't called match() too many times, or that we
583     haven't exceeded the recursive call limit. */
584    
585 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587 nigel 77
588     original_ims = ims; /* Save for resetting on ')' */
589 nigel 91
590 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
591     string, the match_cbegroup flag is set. When this is the case, add the current
592     subject pointer to the chain of such remembered pointers, to be checked when we
593     hit the closing ket, in order to break infinite loops that match no characters.
594 ph10 197 When match() is called in other circumstances, don't add to the chain. The
595     match_cbegroup flag must NOT be used with tail recursion, because the memory
596     block that is used is on the stack, so a new one may be required for each
597     match(). */
598 nigel 77
599 nigel 93 if ((flags & match_cbegroup) != 0)
600 nigel 77 {
601 ph10 197 newptrb.epb_saved_eptr = eptr;
602     newptrb.epb_prev = eptrb;
603     eptrb = &newptrb;
604 nigel 77 }
605    
606 nigel 93 /* Now start processing the opcodes. */
607 nigel 77
608     for (;;)
609     {
610 nigel 93 minimize = possessive = FALSE;
611 nigel 77 op = *ecode;
612    
613     /* For partial matching, remember if we ever hit the end of the subject after
614     matching at least one subject character. */
615    
616     if (md->partial &&
617     eptr >= md->end_subject &&
618 ph10 168 eptr > mstart)
619 nigel 77 md->hitend = TRUE;
620 ph10 208
621 nigel 93 switch(op)
622     {
623 ph10 210 case OP_FAIL:
624 ph10 212 RRETURN(MATCH_NOMATCH);
625 ph10 211
626 ph10 210 case OP_PRUNE:
627     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628     ims, eptrb, flags, RM51);
629     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 ph10 212 RRETURN(MATCH_PRUNE);
631 ph10 211
632 ph10 210 case OP_COMMIT:
633     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634     ims, eptrb, flags, RM52);
635     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 ph10 212 RRETURN(MATCH_COMMIT);
637 ph10 211
638 ph10 210 case OP_SKIP:
639     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640     ims, eptrb, flags, RM53);
641     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
643 ph10 212 RRETURN(MATCH_SKIP);
644 ph10 211
645 ph10 210 case OP_THEN:
646     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ph10 212 ims, eptrb, flags, RM54);
648 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 ph10 212 RRETURN(MATCH_THEN);
650 ph10 211
651 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
652     the current subject position in the working slot at the top of the vector.
653     We mustn't change the current values of the data slot, because they may be
654     set from a previous iteration of this group, and be referred to by a
655     reference inside the group.
656 nigel 77
657 nigel 93 If the bracket fails to match, we need to restore this value and also the
658     values of the final offsets, in case they were set by a previous iteration
659     of the same bracket.
660 nigel 77
661 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
662     a non-capturing bracket. Don't worry about setting the flag for the error
663     case here; that is handled in the code for KET. */
664 nigel 77
665 nigel 93 case OP_CBRA:
666     case OP_SCBRA:
667     number = GET2(ecode, 1+LINK_SIZE);
668 nigel 77 offset = number << 1;
669    
670     #ifdef DEBUG
671 nigel 93 printf("start bracket %d\n", number);
672     printf("subject=");
673 nigel 77 pchars(eptr, 16, TRUE, md);
674     printf("\n");
675     #endif
676    
677     if (offset < md->offset_max)
678     {
679     save_offset1 = md->offset_vector[offset];
680     save_offset2 = md->offset_vector[offset+1];
681     save_offset3 = md->offset_vector[md->offset_end - number];
682     save_capture_last = md->capture_last;
683    
684     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686    
687 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 nigel 77 do
689     {
690 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691     ims, eptrb, flags, RM1);
692 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 nigel 77 md->capture_last = save_capture_last;
694     ecode += GET(ecode, 1);
695     }
696     while (*ecode == OP_ALT);
697    
698     DPRINTF(("bracket %d failed\n", number));
699    
700     md->offset_vector[offset] = save_offset1;
701     md->offset_vector[offset+1] = save_offset2;
702     md->offset_vector[md->offset_end - number] = save_offset3;
703    
704     RRETURN(MATCH_NOMATCH);
705     }
706    
707 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708     as a non-capturing bracket. */
709 nigel 77
710 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712    
713 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714 nigel 77
715 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717    
718 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719     final alternative within the brackets, we would return the result of a
720     recursive call to match() whatever happened. We can reduce stack usage by
721 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
722     is set.*/
723 nigel 77
724 nigel 93 case OP_BRA:
725     case OP_SBRA:
726     DPRINTF(("start non-capturing bracket\n"));
727     flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 nigel 91 for (;;)
729 nigel 77 {
730 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 nigel 93 {
732 ph10 197 if (flags == 0) /* Not a possibly empty group */
733     {
734     ecode += _pcre_OP_lengths[*ecode];
735     DPRINTF(("bracket 0 tail recursion\n"));
736     goto TAIL_RECURSE;
737     }
738    
739     /* Possibly empty group; can't use tail recursion. */
740    
741     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742     eptrb, flags, RM48);
743     RRETURN(rrc);
744 nigel 93 }
745 nigel 91
746     /* For non-final alternatives, continue the loop for a NOMATCH result;
747     otherwise return. */
748    
749 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750     eptrb, flags, RM2);
751 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 nigel 77 ecode += GET(ecode, 1);
753     }
754 nigel 91 /* Control never reaches here. */
755 nigel 77
756     /* Conditional group: compilation checked that there are no more than
757     two branches. If the condition is false, skipping the first branch takes us
758     past the end if there is only one branch, but that's OK because that is
759 nigel 91 exactly what going to the ket would do. As there is only one branch to be
760     obeyed, we can use tail recursion to avoid using another stack frame. */
761 nigel 77
762     case OP_COND:
763 nigel 93 case OP_SCOND:
764     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 nigel 77 {
766 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767     condition = md->recursive != NULL &&
768     (offset == RREF_ANY || offset == md->recursive->group_num);
769     ecode += condition? 3 : GET(ecode, 1);
770     }
771    
772     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773     {
774 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776     ecode += condition? 3 : GET(ecode, 1);
777 nigel 77 }
778    
779 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780     {
781     condition = FALSE;
782     ecode += GET(ecode, 1);
783     }
784    
785 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
786 nigel 93 the final argument match_condassert causes it to stop at the end of an
787     assertion. */
788 nigel 77
789     else
790     {
791 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792     match_condassert, RM3);
793 nigel 77 if (rrc == MATCH_MATCH)
794     {
795 nigel 93 condition = TRUE;
796     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798     }
799 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 nigel 77 {
801     RRETURN(rrc); /* Need braces because of following else */
802     }
803 nigel 93 else
804     {
805     condition = FALSE;
806     ecode += GET(ecode, 1);
807     }
808     }
809 nigel 91
810 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
811 ph10 197 we can use tail recursion to avoid using another stack frame, except when
812     match_cbegroup is required for an unlimited repeat of a possibly empty
813     group. If the second alternative doesn't exist, we can just plough on. */
814 nigel 91
815 nigel 93 if (condition || *ecode == OP_ALT)
816     {
817 nigel 91 ecode += 1 + LINK_SIZE;
818 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
819     {
820     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821     RRETURN(rrc);
822     }
823     else /* Group must match something */
824     {
825     flags = 0;
826     goto TAIL_RECURSE;
827     }
828 nigel 77 }
829 ph10 197 else /* Condition false & no 2nd alternative */
830 nigel 93 {
831     ecode += 1 + LINK_SIZE;
832     }
833     break;
834 nigel 77
835    
836 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
837     recursion, we should restore the offsets appropriately and continue from
838     after the call. */
839 nigel 77
840 ph10 210 case OP_ACCEPT:
841 nigel 77 case OP_END:
842     if (md->recursive != NULL && md->recursive->group_num == 0)
843     {
844     recursion_info *rec = md->recursive;
845 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 nigel 77 md->recursive = rec->prevrec;
847     memmove(md->offset_vector, rec->offset_save,
848     rec->saved_max * sizeof(int));
849 ph10 168 mstart = rec->save_start;
850 nigel 77 ims = original_ims;
851     ecode = rec->after_call;
852     break;
853     }
854    
855     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856     string - backtracking will then try other alternatives, if any. */
857    
858 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859     md->end_match_ptr = eptr; /* Record where we ended */
860     md->end_offset_top = offset_top; /* and how many extracts were taken */
861 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 nigel 77 RRETURN(MATCH_MATCH);
863    
864     /* Change option settings */
865    
866     case OP_OPT:
867     ims = ecode[1];
868     ecode += 2;
869     DPRINTF(("ims set to %02lx\n", ims));
870     break;
871    
872     /* Assertion brackets. Check the alternative branches in turn - the
873     matching won't pass the KET for an assertion. If any one branch matches,
874     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875     start of each branch to move the current point backwards, so the code at
876     this level is identical to the lookahead case. */
877    
878     case OP_ASSERT:
879     case OP_ASSERTBACK:
880     do
881     {
882 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883     RM4);
884 nigel 77 if (rrc == MATCH_MATCH) break;
885 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 nigel 77 ecode += GET(ecode, 1);
887     }
888     while (*ecode == OP_ALT);
889     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890    
891     /* If checking an assertion for a condition, return MATCH_MATCH. */
892    
893     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894    
895     /* Continue from after the assertion, updating the offsets high water
896     mark, since extracts may have been taken during the assertion. */
897    
898     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899     ecode += 1 + LINK_SIZE;
900     offset_top = md->end_offset_top;
901     continue;
902    
903     /* Negative assertion: all branches must fail to match */
904    
905     case OP_ASSERT_NOT:
906     case OP_ASSERTBACK_NOT:
907     do
908     {
909 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910     RM5);
911 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 nigel 77 ecode += GET(ecode,1);
914     }
915     while (*ecode == OP_ALT);
916    
917     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918    
919     ecode += 1 + LINK_SIZE;
920     continue;
921    
922     /* Move the subject pointer back. This occurs only at the start of
923     each branch of a lookbehind assertion. If we are too close to the start to
924     move back, this match function fails. When working with UTF-8 we move
925     back a number of characters, not bytes. */
926    
927     case OP_REVERSE:
928     #ifdef SUPPORT_UTF8
929     if (utf8)
930     {
931 nigel 93 i = GET(ecode, 1);
932     while (i-- > 0)
933 nigel 77 {
934     eptr--;
935     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 ph10 207 BACKCHAR(eptr);
937 nigel 77 }
938     }
939     else
940     #endif
941    
942     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943    
944     {
945 nigel 93 eptr -= GET(ecode, 1);
946 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947     }
948    
949     /* Skip to next op code */
950    
951     ecode += 1 + LINK_SIZE;
952     break;
953    
954     /* The callout item calls an external function, if one is provided, passing
955     details of the match so far. This is mainly for debugging, though the
956     function is able to force a failure. */
957    
958     case OP_CALLOUT:
959     if (pcre_callout != NULL)
960     {
961     pcre_callout_block cb;
962     cb.version = 1; /* Version 1 of the callout block */
963     cb.callout_number = ecode[1];
964     cb.offset_vector = md->offset_vector;
965 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
966 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
967 ph10 168 cb.start_match = mstart - md->start_subject;
968 nigel 77 cb.current_position = eptr - md->start_subject;
969     cb.pattern_position = GET(ecode, 2);
970     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971     cb.capture_top = offset_top/2;
972     cb.capture_last = md->capture_last;
973     cb.callout_data = md->callout_data;
974     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975     if (rrc < 0) RRETURN(rrc);
976     }
977     ecode += 2 + 2*LINK_SIZE;
978     break;
979    
980     /* Recursion either matches the current regex, or some subexpression. The
981     offset data is the offset to the starting bracket from the start of the
982     whole pattern. (This is so that it works from duplicated subpatterns.)
983    
984     If there are any capturing brackets started but not finished, we have to
985     save their starting points and reinstate them after the recursion. However,
986     we don't know how many such there are (offset_top records the completed
987     total) so we just have to save all the potential data. There may be up to
988     65535 such values, which is too large to put on the stack, but using malloc
989     for small numbers seems expensive. As a compromise, the stack is used when
990     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991     is used. If the malloc fails, no attempt is made to carry on with a
992     partial save of the offsets: this call of match() gives up and returns
993     PCRE_ERROR_NOMEMORY.
994    
995     There are also other values that have to be saved. We use a chained
996     sequence of blocks that actually live on the stack. Thanks to Robin Houston
997     for the original version of this logic. */
998    
999     case OP_RECURSE:
1000     {
1001     callpat = md->start_code + GET(ecode, 1);
1002 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003     GET2(callpat, 1 + LINK_SIZE);
1004 nigel 77
1005     /* Add to "recursing stack" */
1006    
1007     new_recursive.prevrec = md->recursive;
1008     md->recursive = &new_recursive;
1009    
1010     /* Find where to continue from afterwards */
1011    
1012     ecode += 1 + LINK_SIZE;
1013     new_recursive.after_call = ecode;
1014    
1015     /* Now save the offset data. */
1016    
1017     new_recursive.saved_max = md->offset_end;
1018     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019     new_recursive.offset_save = stacksave;
1020     else
1021     {
1022     new_recursive.offset_save =
1023     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025     }
1026    
1027     memcpy(new_recursive.offset_save, md->offset_vector,
1028     new_recursive.saved_max * sizeof(int));
1029 ph10 168 new_recursive.save_start = mstart;
1030     mstart = eptr;
1031 nigel 77
1032     /* OK, now we can do the recursion. For each top-level alternative we
1033     restore the offset and recursion data. */
1034    
1035     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 nigel 77 do
1038     {
1039 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040     md, ims, eptrb, flags, RM6);
1041 nigel 77 if (rrc == MATCH_MATCH)
1042     {
1043 nigel 87 DPRINTF(("Recursion matched\n"));
1044 nigel 77 md->recursive = new_recursive.prevrec;
1045     if (new_recursive.offset_save != stacksave)
1046     (pcre_free)(new_recursive.offset_save);
1047     RRETURN(MATCH_MATCH);
1048     }
1049 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 nigel 87 {
1051     DPRINTF(("Recursion gave error %d\n", rrc));
1052     RRETURN(rrc);
1053     }
1054 nigel 77
1055     md->recursive = &new_recursive;
1056     memcpy(md->offset_vector, new_recursive.offset_save,
1057     new_recursive.saved_max * sizeof(int));
1058     callpat += GET(callpat, 1);
1059     }
1060     while (*callpat == OP_ALT);
1061    
1062     DPRINTF(("Recursion didn't match\n"));
1063     md->recursive = new_recursive.prevrec;
1064     if (new_recursive.offset_save != stacksave)
1065     (pcre_free)(new_recursive.offset_save);
1066     RRETURN(MATCH_NOMATCH);
1067     }
1068     /* Control never reaches here */
1069    
1070     /* "Once" brackets are like assertion brackets except that after a match,
1071     the point in the subject string is not moved back. Thus there can never be
1072     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073     Check the alternative branches in turn - the matching won't pass the KET
1074     for this kind of subpattern. If any one branch matches, we carry on as at
1075     the end of a normal bracket, leaving the subject pointer. */
1076    
1077     case OP_ONCE:
1078 nigel 91 prev = ecode;
1079     saved_eptr = eptr;
1080    
1081     do
1082 nigel 77 {
1083 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 nigel 91 if (rrc == MATCH_MATCH) break;
1085 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 nigel 91 ecode += GET(ecode,1);
1087     }
1088     while (*ecode == OP_ALT);
1089 nigel 77
1090 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1091 nigel 77
1092 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093 nigel 77
1094 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1095     mark, since extracts may have been taken. */
1096 nigel 77
1097 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098 nigel 77
1099 nigel 91 offset_top = md->end_offset_top;
1100     eptr = md->end_match_ptr;
1101 nigel 77
1102 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1103     happens for a repeating ket if no characters were matched in the group.
1104     This is the forcible breaking of infinite loops as implemented in Perl
1105     5.005. If there is an options reset, it will get obeyed in the normal
1106     course of events. */
1107 nigel 77
1108 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1109     {
1110     ecode += 1+LINK_SIZE;
1111     break;
1112     }
1113 nigel 77
1114 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1115     preceding bracket, in the appropriate order. The second "call" of match()
1116     uses tail recursion, to avoid using another stack frame. We need to reset
1117     any options that changed within the bracket before re-running it, so
1118     check the next opcode. */
1119 nigel 77
1120 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1121     {
1122     ims = (ims & ~PCRE_IMS) | ecode[4];
1123     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124     }
1125 nigel 77
1126 nigel 91 if (*ecode == OP_KETRMIN)
1127     {
1128 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130     ecode = prev;
1131 ph10 197 flags = 0;
1132 nigel 91 goto TAIL_RECURSE;
1133 nigel 77 }
1134 nigel 91 else /* OP_KETRMAX */
1135     {
1136 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138     ecode += 1 + LINK_SIZE;
1139 ph10 197 flags = 0;
1140 nigel 91 goto TAIL_RECURSE;
1141     }
1142     /* Control never gets here */
1143 nigel 77
1144     /* An alternation is the end of a branch; scan along to find the end of the
1145     bracketed group and go to there. */
1146    
1147     case OP_ALT:
1148     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149     break;
1150    
1151     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1152     that it may occur zero times. It may repeat infinitely, or not at all -
1153     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1154     repeat limits are compiled as a number of copies, with the optional ones
1155     preceded by BRAZERO or BRAMINZERO. */
1156    
1157     case OP_BRAZERO:
1158     {
1159     next = ecode+1;
1160 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162     do next += GET(next,1); while (*next == OP_ALT);
1163 nigel 93 ecode = next + 1 + LINK_SIZE;
1164 nigel 77 }
1165     break;
1166    
1167     case OP_BRAMINZERO:
1168     {
1169     next = ecode+1;
1170 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1171 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173     ecode++;
1174     }
1175     break;
1176    
1177 nigel 93 /* End of a group, repeated or non-repeating. */
1178 nigel 77
1179     case OP_KET:
1180     case OP_KETRMIN:
1181     case OP_KETRMAX:
1182 nigel 91 prev = ecode - GET(ecode, 1);
1183 nigel 77
1184 nigel 93 /* If this was a group that remembered the subject start, in order to break
1185     infinite repeats of empty string matches, retrieve the subject start from
1186     the chain. Otherwise, set it NULL. */
1187 nigel 77
1188 nigel 93 if (*prev >= OP_SBRA)
1189     {
1190     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1191     eptrb = eptrb->epb_prev; /* Backup to previous group */
1192     }
1193     else saved_eptr = NULL;
1194 nigel 77
1195 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1196     MATCH_MATCH, but record the current high water mark for use by positive
1197     assertions. Do this also for the "once" (atomic) groups. */
1198    
1199 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1200     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1201     *prev == OP_ONCE)
1202     {
1203     md->end_match_ptr = eptr; /* For ONCE */
1204     md->end_offset_top = offset_top;
1205     RRETURN(MATCH_MATCH);
1206     }
1207 nigel 77
1208 nigel 93 /* For capturing groups we have to check the group number back at the start
1209     and if necessary complete handling an extraction by setting the offsets and
1210     bumping the high water mark. Note that whole-pattern recursion is coded as
1211     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1212     when the OP_END is reached. Other recursion is handled here. */
1213 nigel 77
1214 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1215 nigel 91 {
1216 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1217 nigel 91 offset = number << 1;
1218 nigel 77
1219     #ifdef DEBUG
1220 nigel 91 printf("end bracket %d", number);
1221     printf("\n");
1222 nigel 77 #endif
1223    
1224 nigel 93 md->capture_last = number;
1225     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1226 nigel 91 {
1227 nigel 93 md->offset_vector[offset] =
1228     md->offset_vector[md->offset_end - number];
1229     md->offset_vector[offset+1] = eptr - md->start_subject;
1230     if (offset_top <= offset) offset_top = offset + 2;
1231     }
1232 nigel 77
1233 nigel 93 /* Handle a recursively called group. Restore the offsets
1234     appropriately and continue from after the call. */
1235 nigel 77
1236 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1237     {
1238     recursion_info *rec = md->recursive;
1239     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1240     md->recursive = rec->prevrec;
1241 ph10 168 mstart = rec->save_start;
1242 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1243     rec->saved_max * sizeof(int));
1244     ecode = rec->after_call;
1245     ims = original_ims;
1246     break;
1247 nigel 77 }
1248 nigel 91 }
1249 nigel 77
1250 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1251     flags, in case they got changed during the group. */
1252 nigel 77
1253 nigel 91 ims = original_ims;
1254     DPRINTF(("ims reset to %02lx\n", ims));
1255 nigel 77
1256 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1257     happens for a repeating ket if no characters were matched in the group.
1258     This is the forcible breaking of infinite loops as implemented in Perl
1259     5.005. If there is an options reset, it will get obeyed in the normal
1260     course of events. */
1261 nigel 77
1262 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1263     {
1264     ecode += 1 + LINK_SIZE;
1265     break;
1266     }
1267 nigel 77
1268 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1269     preceding bracket, in the appropriate order. In the second case, we can use
1270 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1271     unlimited repeat of a group that can match an empty string. */
1272 nigel 77
1273 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1274    
1275 nigel 91 if (*ecode == OP_KETRMIN)
1276     {
1277 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1278 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1279 ph10 197 if (flags != 0) /* Could match an empty string */
1280     {
1281     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1282     RRETURN(rrc);
1283     }
1284 nigel 91 ecode = prev;
1285     goto TAIL_RECURSE;
1286 nigel 77 }
1287 nigel 91 else /* OP_KETRMAX */
1288     {
1289 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1290 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1291     ecode += 1 + LINK_SIZE;
1292 ph10 197 flags = 0;
1293 nigel 91 goto TAIL_RECURSE;
1294     }
1295     /* Control never gets here */
1296 nigel 77
1297     /* Start of subject unless notbol, or after internal newline if multiline */
1298    
1299     case OP_CIRC:
1300     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1301     if ((ims & PCRE_MULTILINE) != 0)
1302     {
1303 nigel 91 if (eptr != md->start_subject &&
1304 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1305 nigel 77 RRETURN(MATCH_NOMATCH);
1306     ecode++;
1307     break;
1308     }
1309     /* ... else fall through */
1310    
1311     /* Start of subject assertion */
1312    
1313     case OP_SOD:
1314     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1315     ecode++;
1316     break;
1317    
1318     /* Start of match assertion */
1319    
1320     case OP_SOM:
1321     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1322     ecode++;
1323     break;
1324 ph10 172
1325 ph10 168 /* Reset the start of match point */
1326 ph10 172
1327 ph10 168 case OP_SET_SOM:
1328     mstart = eptr;
1329 ph10 172 ecode++;
1330     break;
1331 nigel 77
1332     /* Assert before internal newline if multiline, or before a terminating
1333     newline unless endonly is set, else end of subject unless noteol is set. */
1334    
1335     case OP_DOLL:
1336     if ((ims & PCRE_MULTILINE) != 0)
1337     {
1338     if (eptr < md->end_subject)
1339 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1340 nigel 77 else
1341     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1342     ecode++;
1343     break;
1344     }
1345     else
1346     {
1347     if (md->noteol) RRETURN(MATCH_NOMATCH);
1348     if (!md->endonly)
1349     {
1350 nigel 91 if (eptr != md->end_subject &&
1351 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1352 nigel 77 RRETURN(MATCH_NOMATCH);
1353     ecode++;
1354     break;
1355     }
1356     }
1357 nigel 91 /* ... else fall through for endonly */
1358 nigel 77
1359     /* End of subject assertion (\z) */
1360    
1361     case OP_EOD:
1362     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1363     ecode++;
1364     break;
1365    
1366     /* End of subject or ending \n assertion (\Z) */
1367    
1368     case OP_EODN:
1369 nigel 91 if (eptr != md->end_subject &&
1370 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1371 nigel 91 RRETURN(MATCH_NOMATCH);
1372 nigel 77 ecode++;
1373     break;
1374    
1375     /* Word boundary assertions */
1376    
1377     case OP_NOT_WORD_BOUNDARY:
1378     case OP_WORD_BOUNDARY:
1379     {
1380    
1381     /* Find out if the previous and current characters are "word" characters.
1382     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1383     be "non-word" characters. */
1384    
1385     #ifdef SUPPORT_UTF8
1386     if (utf8)
1387     {
1388     if (eptr == md->start_subject) prev_is_word = FALSE; else
1389     {
1390     const uschar *lastptr = eptr - 1;
1391     while((*lastptr & 0xc0) == 0x80) lastptr--;
1392     GETCHAR(c, lastptr);
1393     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1394     }
1395     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1396     {
1397     GETCHAR(c, eptr);
1398     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1399     }
1400     }
1401     else
1402     #endif
1403    
1404     /* More streamlined when not in UTF-8 mode */
1405    
1406     {
1407     prev_is_word = (eptr != md->start_subject) &&
1408     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1409     cur_is_word = (eptr < md->end_subject) &&
1410     ((md->ctypes[*eptr] & ctype_word) != 0);
1411     }
1412    
1413     /* Now see if the situation is what we want */
1414    
1415     if ((*ecode++ == OP_WORD_BOUNDARY)?
1416     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1417     RRETURN(MATCH_NOMATCH);
1418     }
1419     break;
1420    
1421     /* Match a single character type; inline for speed */
1422    
1423     case OP_ANY:
1424 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1425     {
1426 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1427 nigel 91 }
1428 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1429     if (utf8)
1430     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1431     ecode++;
1432     break;
1433    
1434     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1435     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1436    
1437     case OP_ANYBYTE:
1438     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1439     ecode++;
1440     break;
1441    
1442     case OP_NOT_DIGIT:
1443     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1444     GETCHARINCTEST(c, eptr);
1445     if (
1446     #ifdef SUPPORT_UTF8
1447     c < 256 &&
1448     #endif
1449     (md->ctypes[c] & ctype_digit) != 0
1450     )
1451     RRETURN(MATCH_NOMATCH);
1452     ecode++;
1453     break;
1454    
1455     case OP_DIGIT:
1456     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1457     GETCHARINCTEST(c, eptr);
1458     if (
1459     #ifdef SUPPORT_UTF8
1460     c >= 256 ||
1461     #endif
1462     (md->ctypes[c] & ctype_digit) == 0
1463     )
1464     RRETURN(MATCH_NOMATCH);
1465     ecode++;
1466     break;
1467    
1468     case OP_NOT_WHITESPACE:
1469     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1470     GETCHARINCTEST(c, eptr);
1471     if (
1472     #ifdef SUPPORT_UTF8
1473     c < 256 &&
1474     #endif
1475     (md->ctypes[c] & ctype_space) != 0
1476     )
1477     RRETURN(MATCH_NOMATCH);
1478     ecode++;
1479     break;
1480    
1481     case OP_WHITESPACE:
1482     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1483     GETCHARINCTEST(c, eptr);
1484     if (
1485     #ifdef SUPPORT_UTF8
1486     c >= 256 ||
1487     #endif
1488     (md->ctypes[c] & ctype_space) == 0
1489     )
1490     RRETURN(MATCH_NOMATCH);
1491     ecode++;
1492     break;
1493    
1494     case OP_NOT_WORDCHAR:
1495     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1496     GETCHARINCTEST(c, eptr);
1497     if (
1498     #ifdef SUPPORT_UTF8
1499     c < 256 &&
1500     #endif
1501     (md->ctypes[c] & ctype_word) != 0
1502     )
1503     RRETURN(MATCH_NOMATCH);
1504     ecode++;
1505     break;
1506    
1507     case OP_WORDCHAR:
1508     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1509     GETCHARINCTEST(c, eptr);
1510     if (
1511     #ifdef SUPPORT_UTF8
1512     c >= 256 ||
1513     #endif
1514     (md->ctypes[c] & ctype_word) == 0
1515     )
1516     RRETURN(MATCH_NOMATCH);
1517     ecode++;
1518     break;
1519    
1520 nigel 93 case OP_ANYNL:
1521     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1522     GETCHARINCTEST(c, eptr);
1523     switch(c)
1524     {
1525     default: RRETURN(MATCH_NOMATCH);
1526     case 0x000d:
1527     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1528     break;
1529     case 0x000a:
1530     case 0x000b:
1531     case 0x000c:
1532     case 0x0085:
1533     case 0x2028:
1534     case 0x2029:
1535     break;
1536     }
1537     ecode++;
1538     break;
1539    
1540 ph10 178 case OP_NOT_HSPACE:
1541     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1542     GETCHARINCTEST(c, eptr);
1543     switch(c)
1544     {
1545     default: break;
1546     case 0x09: /* HT */
1547     case 0x20: /* SPACE */
1548     case 0xa0: /* NBSP */
1549     case 0x1680: /* OGHAM SPACE MARK */
1550     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1551     case 0x2000: /* EN QUAD */
1552     case 0x2001: /* EM QUAD */
1553     case 0x2002: /* EN SPACE */
1554     case 0x2003: /* EM SPACE */
1555     case 0x2004: /* THREE-PER-EM SPACE */
1556     case 0x2005: /* FOUR-PER-EM SPACE */
1557     case 0x2006: /* SIX-PER-EM SPACE */
1558     case 0x2007: /* FIGURE SPACE */
1559     case 0x2008: /* PUNCTUATION SPACE */
1560     case 0x2009: /* THIN SPACE */
1561     case 0x200A: /* HAIR SPACE */
1562     case 0x202f: /* NARROW NO-BREAK SPACE */
1563     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1564     case 0x3000: /* IDEOGRAPHIC SPACE */
1565     RRETURN(MATCH_NOMATCH);
1566     }
1567     ecode++;
1568     break;
1569    
1570     case OP_HSPACE:
1571     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1572     GETCHARINCTEST(c, eptr);
1573     switch(c)
1574     {
1575     default: RRETURN(MATCH_NOMATCH);
1576     case 0x09: /* HT */
1577     case 0x20: /* SPACE */
1578     case 0xa0: /* NBSP */
1579     case 0x1680: /* OGHAM SPACE MARK */
1580     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1581     case 0x2000: /* EN QUAD */
1582     case 0x2001: /* EM QUAD */
1583     case 0x2002: /* EN SPACE */
1584     case 0x2003: /* EM SPACE */
1585     case 0x2004: /* THREE-PER-EM SPACE */
1586     case 0x2005: /* FOUR-PER-EM SPACE */
1587     case 0x2006: /* SIX-PER-EM SPACE */
1588     case 0x2007: /* FIGURE SPACE */
1589     case 0x2008: /* PUNCTUATION SPACE */
1590     case 0x2009: /* THIN SPACE */
1591     case 0x200A: /* HAIR SPACE */
1592     case 0x202f: /* NARROW NO-BREAK SPACE */
1593     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1594     case 0x3000: /* IDEOGRAPHIC SPACE */
1595     break;
1596     }
1597     ecode++;
1598     break;
1599    
1600     case OP_NOT_VSPACE:
1601     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1602     GETCHARINCTEST(c, eptr);
1603     switch(c)
1604     {
1605     default: break;
1606     case 0x0a: /* LF */
1607     case 0x0b: /* VT */
1608     case 0x0c: /* FF */
1609     case 0x0d: /* CR */
1610     case 0x85: /* NEL */
1611     case 0x2028: /* LINE SEPARATOR */
1612     case 0x2029: /* PARAGRAPH SEPARATOR */
1613     RRETURN(MATCH_NOMATCH);
1614     }
1615     ecode++;
1616     break;
1617    
1618     case OP_VSPACE:
1619     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1620     GETCHARINCTEST(c, eptr);
1621     switch(c)
1622     {
1623     default: RRETURN(MATCH_NOMATCH);
1624     case 0x0a: /* LF */
1625     case 0x0b: /* VT */
1626     case 0x0c: /* FF */
1627     case 0x0d: /* CR */
1628     case 0x85: /* NEL */
1629     case 0x2028: /* LINE SEPARATOR */
1630     case 0x2029: /* PARAGRAPH SEPARATOR */
1631     break;
1632     }
1633     ecode++;
1634     break;
1635    
1636 nigel 77 #ifdef SUPPORT_UCP
1637     /* Check the next character by Unicode property. We will get here only
1638     if the support is in the binary; otherwise a compile-time error occurs. */
1639    
1640     case OP_PROP:
1641     case OP_NOTPROP:
1642     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1643     GETCHARINCTEST(c, eptr);
1644     {
1645 nigel 87 int chartype, script;
1646     int category = _pcre_ucp_findprop(c, &chartype, &script);
1647 nigel 77
1648 nigel 87 switch(ecode[1])
1649     {
1650     case PT_ANY:
1651     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1652     break;
1653 nigel 77
1654 nigel 87 case PT_LAMP:
1655     if ((chartype == ucp_Lu ||
1656     chartype == ucp_Ll ||
1657     chartype == ucp_Lt) == (op == OP_NOTPROP))
1658 nigel 77 RRETURN(MATCH_NOMATCH);
1659 nigel 87 break;
1660    
1661     case PT_GC:
1662     if ((ecode[2] != category) == (op == OP_PROP))
1663 nigel 77 RRETURN(MATCH_NOMATCH);
1664 nigel 87 break;
1665    
1666     case PT_PC:
1667     if ((ecode[2] != chartype) == (op == OP_PROP))
1668     RRETURN(MATCH_NOMATCH);
1669     break;
1670    
1671     case PT_SC:
1672     if ((ecode[2] != script) == (op == OP_PROP))
1673     RRETURN(MATCH_NOMATCH);
1674     break;
1675    
1676     default:
1677     RRETURN(PCRE_ERROR_INTERNAL);
1678 nigel 77 }
1679 nigel 87
1680     ecode += 3;
1681 nigel 77 }
1682     break;
1683    
1684     /* Match an extended Unicode sequence. We will get here only if the support
1685     is in the binary; otherwise a compile-time error occurs. */
1686    
1687     case OP_EXTUNI:
1688     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1689     GETCHARINCTEST(c, eptr);
1690     {
1691 nigel 87 int chartype, script;
1692     int category = _pcre_ucp_findprop(c, &chartype, &script);
1693 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1694     while (eptr < md->end_subject)
1695     {
1696     int len = 1;
1697     if (!utf8) c = *eptr; else
1698     {
1699     GETCHARLEN(c, eptr, len);
1700     }
1701 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1702 nigel 77 if (category != ucp_M) break;
1703     eptr += len;
1704     }
1705     }
1706     ecode++;
1707     break;
1708     #endif
1709    
1710    
1711     /* Match a back reference, possibly repeatedly. Look past the end of the
1712     item to see if there is repeat information following. The code is similar
1713     to that for character classes, but repeated for efficiency. Then obey
1714     similar code to character type repeats - written out again for speed.
1715     However, if the referenced string is the empty string, always treat
1716     it as matched, any number of times (otherwise there could be infinite
1717     loops). */
1718    
1719     case OP_REF:
1720     {
1721     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1722     ecode += 3; /* Advance past item */
1723    
1724     /* If the reference is unset, set the length to be longer than the amount
1725     of subject left; this ensures that every attempt at a match fails. We
1726     can't just fail here, because of the possibility of quantifiers with zero
1727     minima. */
1728    
1729     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1730     md->end_subject - eptr + 1 :
1731     md->offset_vector[offset+1] - md->offset_vector[offset];
1732    
1733     /* Set up for repetition, or handle the non-repeated case */
1734    
1735     switch (*ecode)
1736     {
1737     case OP_CRSTAR:
1738     case OP_CRMINSTAR:
1739     case OP_CRPLUS:
1740     case OP_CRMINPLUS:
1741     case OP_CRQUERY:
1742     case OP_CRMINQUERY:
1743     c = *ecode++ - OP_CRSTAR;
1744     minimize = (c & 1) != 0;
1745     min = rep_min[c]; /* Pick up values from tables; */
1746     max = rep_max[c]; /* zero for max => infinity */
1747     if (max == 0) max = INT_MAX;
1748     break;
1749    
1750     case OP_CRRANGE:
1751     case OP_CRMINRANGE:
1752     minimize = (*ecode == OP_CRMINRANGE);
1753     min = GET2(ecode, 1);
1754     max = GET2(ecode, 3);
1755     if (max == 0) max = INT_MAX;
1756     ecode += 5;
1757     break;
1758    
1759     default: /* No repeat follows */
1760     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1761     eptr += length;
1762     continue; /* With the main loop */
1763     }
1764    
1765     /* If the length of the reference is zero, just continue with the
1766     main loop. */
1767    
1768     if (length == 0) continue;
1769    
1770     /* First, ensure the minimum number of matches are present. We get back
1771     the length of the reference string explicitly rather than passing the
1772     address of eptr, so that eptr can be a register variable. */
1773    
1774     for (i = 1; i <= min; i++)
1775     {
1776     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1777     eptr += length;
1778     }
1779    
1780     /* If min = max, continue at the same level without recursion.
1781     They are not both allowed to be zero. */
1782    
1783     if (min == max) continue;
1784    
1785     /* If minimizing, keep trying and advancing the pointer */
1786    
1787     if (minimize)
1788     {
1789     for (fi = min;; fi++)
1790     {
1791 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1792 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1793     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1794     RRETURN(MATCH_NOMATCH);
1795     eptr += length;
1796     }
1797     /* Control never gets here */
1798     }
1799    
1800     /* If maximizing, find the longest string and work backwards */
1801    
1802     else
1803     {
1804     pp = eptr;
1805     for (i = min; i < max; i++)
1806     {
1807     if (!match_ref(offset, eptr, length, md, ims)) break;
1808     eptr += length;
1809     }
1810     while (eptr >= pp)
1811     {
1812 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1813 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1814     eptr -= length;
1815     }
1816     RRETURN(MATCH_NOMATCH);
1817     }
1818     }
1819     /* Control never gets here */
1820    
1821    
1822    
1823     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1824     used when all the characters in the class have values in the range 0-255,
1825     and either the matching is caseful, or the characters are in the range
1826     0-127 when UTF-8 processing is enabled. The only difference between
1827     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1828     encountered.
1829    
1830     First, look past the end of the item to see if there is repeat information
1831     following. Then obey similar code to character type repeats - written out
1832     again for speed. */
1833    
1834     case OP_NCLASS:
1835     case OP_CLASS:
1836     {
1837     data = ecode + 1; /* Save for matching */
1838     ecode += 33; /* Advance past the item */
1839    
1840     switch (*ecode)
1841     {
1842     case OP_CRSTAR:
1843     case OP_CRMINSTAR:
1844     case OP_CRPLUS:
1845     case OP_CRMINPLUS:
1846     case OP_CRQUERY:
1847     case OP_CRMINQUERY:
1848     c = *ecode++ - OP_CRSTAR;
1849     minimize = (c & 1) != 0;
1850     min = rep_min[c]; /* Pick up values from tables; */
1851     max = rep_max[c]; /* zero for max => infinity */
1852     if (max == 0) max = INT_MAX;
1853     break;
1854    
1855     case OP_CRRANGE:
1856     case OP_CRMINRANGE:
1857     minimize = (*ecode == OP_CRMINRANGE);
1858     min = GET2(ecode, 1);
1859     max = GET2(ecode, 3);
1860     if (max == 0) max = INT_MAX;
1861     ecode += 5;
1862     break;
1863    
1864     default: /* No repeat follows */
1865     min = max = 1;
1866     break;
1867     }
1868    
1869     /* First, ensure the minimum number of matches are present. */
1870    
1871     #ifdef SUPPORT_UTF8
1872     /* UTF-8 mode */
1873     if (utf8)
1874     {
1875     for (i = 1; i <= min; i++)
1876     {
1877     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1878     GETCHARINC(c, eptr);
1879     if (c > 255)
1880     {
1881     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1882     }
1883     else
1884     {
1885     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1886     }
1887     }
1888     }
1889     else
1890     #endif
1891     /* Not UTF-8 mode */
1892     {
1893     for (i = 1; i <= min; i++)
1894     {
1895     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1896     c = *eptr++;
1897     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1898     }
1899     }
1900    
1901     /* If max == min we can continue with the main loop without the
1902     need to recurse. */
1903    
1904     if (min == max) continue;
1905    
1906     /* If minimizing, keep testing the rest of the expression and advancing
1907     the pointer while it matches the class. */
1908    
1909     if (minimize)
1910     {
1911     #ifdef SUPPORT_UTF8
1912     /* UTF-8 mode */
1913     if (utf8)
1914     {
1915     for (fi = min;; fi++)
1916     {
1917 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1918 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1919     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1920     GETCHARINC(c, eptr);
1921     if (c > 255)
1922     {
1923     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1924     }
1925     else
1926     {
1927     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1928     }
1929     }
1930     }
1931     else
1932     #endif
1933     /* Not UTF-8 mode */
1934     {
1935     for (fi = min;; fi++)
1936     {
1937 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1938 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1939     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1940     c = *eptr++;
1941     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1942     }
1943     }
1944     /* Control never gets here */
1945     }
1946    
1947     /* If maximizing, find the longest possible run, then work backwards. */
1948    
1949     else
1950     {
1951     pp = eptr;
1952    
1953     #ifdef SUPPORT_UTF8
1954     /* UTF-8 mode */
1955     if (utf8)
1956     {
1957     for (i = min; i < max; i++)
1958     {
1959     int len = 1;
1960     if (eptr >= md->end_subject) break;
1961     GETCHARLEN(c, eptr, len);
1962     if (c > 255)
1963     {
1964     if (op == OP_CLASS) break;
1965     }
1966     else
1967     {
1968     if ((data[c/8] & (1 << (c&7))) == 0) break;
1969     }
1970     eptr += len;
1971     }
1972     for (;;)
1973     {
1974 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1975 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1976     if (eptr-- == pp) break; /* Stop if tried at original pos */
1977     BACKCHAR(eptr);
1978     }
1979     }
1980     else
1981     #endif
1982     /* Not UTF-8 mode */
1983     {
1984     for (i = min; i < max; i++)
1985     {
1986     if (eptr >= md->end_subject) break;
1987     c = *eptr;
1988     if ((data[c/8] & (1 << (c&7))) == 0) break;
1989     eptr++;
1990     }
1991     while (eptr >= pp)
1992     {
1993 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1994 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1995 nigel 77 eptr--;
1996     }
1997     }
1998    
1999     RRETURN(MATCH_NOMATCH);
2000     }
2001     }
2002     /* Control never gets here */
2003    
2004    
2005     /* Match an extended character class. This opcode is encountered only
2006     in UTF-8 mode, because that's the only time it is compiled. */
2007    
2008     #ifdef SUPPORT_UTF8
2009     case OP_XCLASS:
2010     {
2011     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2012     ecode += GET(ecode, 1); /* Advance past the item */
2013    
2014     switch (*ecode)
2015     {
2016     case OP_CRSTAR:
2017     case OP_CRMINSTAR:
2018     case OP_CRPLUS:
2019     case OP_CRMINPLUS:
2020     case OP_CRQUERY:
2021     case OP_CRMINQUERY:
2022     c = *ecode++ - OP_CRSTAR;
2023     minimize = (c & 1) != 0;
2024     min = rep_min[c]; /* Pick up values from tables; */
2025     max = rep_max[c]; /* zero for max => infinity */
2026     if (max == 0) max = INT_MAX;
2027     break;
2028    
2029     case OP_CRRANGE:
2030     case OP_CRMINRANGE:
2031     minimize = (*ecode == OP_CRMINRANGE);
2032     min = GET2(ecode, 1);
2033     max = GET2(ecode, 3);
2034     if (max == 0) max = INT_MAX;
2035     ecode += 5;
2036     break;
2037    
2038     default: /* No repeat follows */
2039     min = max = 1;
2040     break;
2041     }
2042    
2043     /* First, ensure the minimum number of matches are present. */
2044    
2045     for (i = 1; i <= min; i++)
2046     {
2047     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2048     GETCHARINC(c, eptr);
2049     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2050     }
2051    
2052     /* If max == min we can continue with the main loop without the
2053     need to recurse. */
2054    
2055     if (min == max) continue;
2056    
2057     /* If minimizing, keep testing the rest of the expression and advancing
2058     the pointer while it matches the class. */
2059    
2060     if (minimize)
2061     {
2062     for (fi = min;; fi++)
2063     {
2064 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2065 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2066     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2067     GETCHARINC(c, eptr);
2068     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2069     }
2070     /* Control never gets here */
2071     }
2072    
2073     /* If maximizing, find the longest possible run, then work backwards. */
2074    
2075     else
2076     {
2077     pp = eptr;
2078     for (i = min; i < max; i++)
2079     {
2080     int len = 1;
2081     if (eptr >= md->end_subject) break;
2082     GETCHARLEN(c, eptr, len);
2083     if (!_pcre_xclass(c, data)) break;
2084     eptr += len;
2085     }
2086     for(;;)
2087     {
2088 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2089 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2090     if (eptr-- == pp) break; /* Stop if tried at original pos */
2091 ph10 214 if (utf8) BACKCHAR(eptr);
2092 nigel 77 }
2093     RRETURN(MATCH_NOMATCH);
2094     }
2095    
2096     /* Control never gets here */
2097     }
2098     #endif /* End of XCLASS */
2099    
2100     /* Match a single character, casefully */
2101    
2102     case OP_CHAR:
2103     #ifdef SUPPORT_UTF8
2104     if (utf8)
2105     {
2106     length = 1;
2107     ecode++;
2108     GETCHARLEN(fc, ecode, length);
2109     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2110     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2111     }
2112     else
2113     #endif
2114    
2115     /* Non-UTF-8 mode */
2116     {
2117     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2118     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2119     ecode += 2;
2120     }
2121     break;
2122    
2123     /* Match a single character, caselessly */
2124    
2125     case OP_CHARNC:
2126     #ifdef SUPPORT_UTF8
2127     if (utf8)
2128     {
2129     length = 1;
2130     ecode++;
2131     GETCHARLEN(fc, ecode, length);
2132    
2133     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2134    
2135     /* If the pattern character's value is < 128, we have only one byte, and
2136     can use the fast lookup table. */
2137    
2138     if (fc < 128)
2139     {
2140     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2141     }
2142    
2143     /* Otherwise we must pick up the subject character */
2144    
2145     else
2146     {
2147 nigel 93 unsigned int dc;
2148 nigel 77 GETCHARINC(dc, eptr);
2149     ecode += length;
2150    
2151     /* If we have Unicode property support, we can use it to test the other
2152 nigel 87 case of the character, if there is one. */
2153 nigel 77
2154     if (fc != dc)
2155     {
2156     #ifdef SUPPORT_UCP
2157 nigel 87 if (dc != _pcre_ucp_othercase(fc))
2158 nigel 77 #endif
2159     RRETURN(MATCH_NOMATCH);
2160     }
2161     }
2162     }
2163     else
2164     #endif /* SUPPORT_UTF8 */
2165    
2166     /* Non-UTF-8 mode */
2167     {
2168     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2169     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2170     ecode += 2;
2171     }
2172     break;
2173    
2174 nigel 93 /* Match a single character repeatedly. */
2175 nigel 77
2176     case OP_EXACT:
2177     min = max = GET2(ecode, 1);
2178     ecode += 3;
2179     goto REPEATCHAR;
2180    
2181 nigel 93 case OP_POSUPTO:
2182     possessive = TRUE;
2183     /* Fall through */
2184    
2185 nigel 77 case OP_UPTO:
2186     case OP_MINUPTO:
2187     min = 0;
2188     max = GET2(ecode, 1);
2189     minimize = *ecode == OP_MINUPTO;
2190     ecode += 3;
2191     goto REPEATCHAR;
2192    
2193 nigel 93 case OP_POSSTAR:
2194     possessive = TRUE;
2195     min = 0;
2196     max = INT_MAX;
2197     ecode++;
2198     goto REPEATCHAR;
2199    
2200     case OP_POSPLUS:
2201     possessive = TRUE;
2202     min = 1;
2203     max = INT_MAX;
2204     ecode++;
2205     goto REPEATCHAR;
2206    
2207     case OP_POSQUERY:
2208     possessive = TRUE;
2209     min = 0;
2210     max = 1;
2211     ecode++;
2212     goto REPEATCHAR;
2213    
2214 nigel 77 case OP_STAR:
2215     case OP_MINSTAR:
2216     case OP_PLUS:
2217     case OP_MINPLUS:
2218     case OP_QUERY:
2219     case OP_MINQUERY:
2220     c = *ecode++ - OP_STAR;
2221     minimize = (c & 1) != 0;
2222     min = rep_min[c]; /* Pick up values from tables; */
2223     max = rep_max[c]; /* zero for max => infinity */
2224     if (max == 0) max = INT_MAX;
2225    
2226     /* Common code for all repeated single-character matches. We can give
2227     up quickly if there are fewer than the minimum number of characters left in
2228     the subject. */
2229    
2230     REPEATCHAR:
2231     #ifdef SUPPORT_UTF8
2232     if (utf8)
2233     {
2234     length = 1;
2235     charptr = ecode;
2236     GETCHARLEN(fc, ecode, length);
2237     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2238     ecode += length;
2239    
2240     /* Handle multibyte character matching specially here. There is
2241     support for caseless matching if UCP support is present. */
2242    
2243     if (length > 1)
2244     {
2245     #ifdef SUPPORT_UCP
2246 nigel 93 unsigned int othercase;
2247 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2248 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2249 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2250 ph10 115 else oclength = 0;
2251 nigel 77 #endif /* SUPPORT_UCP */
2252    
2253     for (i = 1; i <= min; i++)
2254     {
2255     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2256 ph10 123 #ifdef SUPPORT_UCP
2257 nigel 77 /* Need braces because of following else */
2258     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2259     else
2260     {
2261     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2262     eptr += oclength;
2263     }
2264 ph10 115 #else /* without SUPPORT_UCP */
2265     else { RRETURN(MATCH_NOMATCH); }
2266 ph10 123 #endif /* SUPPORT_UCP */
2267 nigel 77 }
2268    
2269     if (min == max) continue;
2270    
2271     if (minimize)
2272     {
2273     for (fi = min;; fi++)
2274     {
2275 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2276 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2277     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2278     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2279 ph10 123 #ifdef SUPPORT_UCP
2280 nigel 77 /* Need braces because of following else */
2281     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2282     else
2283     {
2284     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2285     eptr += oclength;
2286     }
2287 ph10 115 #else /* without SUPPORT_UCP */
2288     else { RRETURN (MATCH_NOMATCH); }
2289     #endif /* SUPPORT_UCP */
2290 nigel 77 }
2291     /* Control never gets here */
2292     }
2293 nigel 93
2294     else /* Maximize */
2295 nigel 77 {
2296     pp = eptr;
2297     for (i = min; i < max; i++)
2298     {
2299     if (eptr > md->end_subject - length) break;
2300     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2301 ph10 123 #ifdef SUPPORT_UCP
2302 nigel 77 else if (oclength == 0) break;
2303     else
2304     {
2305     if (memcmp(eptr, occhars, oclength) != 0) break;
2306     eptr += oclength;
2307     }
2308 ph10 115 #else /* without SUPPORT_UCP */
2309     else break;
2310 ph10 123 #endif /* SUPPORT_UCP */
2311 nigel 77 }
2312 nigel 93
2313     if (possessive) continue;
2314 ph10 120 for(;;)
2315 nigel 77 {
2316 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2317 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2318 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2319 ph10 115 #ifdef SUPPORT_UCP
2320     eptr--;
2321     BACKCHAR(eptr);
2322 ph10 123 #else /* without SUPPORT_UCP */
2323 nigel 77 eptr -= length;
2324 ph10 123 #endif /* SUPPORT_UCP */
2325 nigel 77 }
2326     }
2327     /* Control never gets here */
2328     }
2329    
2330     /* If the length of a UTF-8 character is 1, we fall through here, and
2331     obey the code as for non-UTF-8 characters below, though in this case the
2332     value of fc will always be < 128. */
2333     }
2334     else
2335     #endif /* SUPPORT_UTF8 */
2336    
2337     /* When not in UTF-8 mode, load a single-byte character. */
2338     {
2339     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2340     fc = *ecode++;
2341     }
2342    
2343     /* The value of fc at this point is always less than 256, though we may or
2344     may not be in UTF-8 mode. The code is duplicated for the caseless and
2345     caseful cases, for speed, since matching characters is likely to be quite
2346     common. First, ensure the minimum number of matches are present. If min =
2347     max, continue at the same level without recursing. Otherwise, if
2348     minimizing, keep trying the rest of the expression and advancing one
2349     matching character if failing, up to the maximum. Alternatively, if
2350     maximizing, find the maximum number of characters and work backwards. */
2351    
2352     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2353     max, eptr));
2354    
2355     if ((ims & PCRE_CASELESS) != 0)
2356     {
2357     fc = md->lcc[fc];
2358     for (i = 1; i <= min; i++)
2359     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2360     if (min == max) continue;
2361     if (minimize)
2362     {
2363     for (fi = min;; fi++)
2364     {
2365 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2366 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2367     if (fi >= max || eptr >= md->end_subject ||
2368     fc != md->lcc[*eptr++])
2369     RRETURN(MATCH_NOMATCH);
2370     }
2371     /* Control never gets here */
2372     }
2373 nigel 93 else /* Maximize */
2374 nigel 77 {
2375     pp = eptr;
2376     for (i = min; i < max; i++)
2377     {
2378     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2379     eptr++;
2380     }
2381 nigel 93 if (possessive) continue;
2382 nigel 77 while (eptr >= pp)
2383     {
2384 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2385 nigel 77 eptr--;
2386     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2387     }
2388     RRETURN(MATCH_NOMATCH);
2389     }
2390     /* Control never gets here */
2391     }
2392    
2393     /* Caseful comparisons (includes all multi-byte characters) */
2394    
2395     else
2396     {
2397     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2398     if (min == max) continue;
2399     if (minimize)
2400     {
2401     for (fi = min;; fi++)
2402     {
2403 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2404 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2405     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2406     RRETURN(MATCH_NOMATCH);
2407     }
2408     /* Control never gets here */
2409     }
2410 nigel 93 else /* Maximize */
2411 nigel 77 {
2412     pp = eptr;
2413     for (i = min; i < max; i++)
2414     {
2415     if (eptr >= md->end_subject || fc != *eptr) break;
2416     eptr++;
2417     }
2418 nigel 93 if (possessive) continue;
2419 nigel 77 while (eptr >= pp)
2420     {
2421 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2422 nigel 77 eptr--;
2423     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2424     }
2425     RRETURN(MATCH_NOMATCH);
2426     }
2427     }
2428     /* Control never gets here */
2429    
2430     /* Match a negated single one-byte character. The character we are
2431     checking can be multibyte. */
2432    
2433     case OP_NOT:
2434     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2435     ecode++;
2436     GETCHARINCTEST(c, eptr);
2437     if ((ims & PCRE_CASELESS) != 0)
2438     {
2439     #ifdef SUPPORT_UTF8
2440     if (c < 256)
2441     #endif
2442     c = md->lcc[c];
2443     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2444     }
2445     else
2446     {
2447     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2448     }
2449     break;
2450    
2451     /* Match a negated single one-byte character repeatedly. This is almost a
2452     repeat of the code for a repeated single character, but I haven't found a
2453     nice way of commoning these up that doesn't require a test of the
2454     positive/negative option for each character match. Maybe that wouldn't add
2455     very much to the time taken, but character matching *is* what this is all
2456     about... */
2457    
2458     case OP_NOTEXACT:
2459     min = max = GET2(ecode, 1);
2460     ecode += 3;
2461     goto REPEATNOTCHAR;
2462    
2463     case OP_NOTUPTO:
2464     case OP_NOTMINUPTO:
2465     min = 0;
2466     max = GET2(ecode, 1);
2467     minimize = *ecode == OP_NOTMINUPTO;
2468     ecode += 3;
2469     goto REPEATNOTCHAR;
2470    
2471 nigel 93 case OP_NOTPOSSTAR:
2472     possessive = TRUE;
2473     min = 0;
2474     max = INT_MAX;
2475     ecode++;
2476     goto REPEATNOTCHAR;
2477    
2478     case OP_NOTPOSPLUS:
2479     possessive = TRUE;
2480     min = 1;
2481     max = INT_MAX;
2482     ecode++;
2483     goto REPEATNOTCHAR;
2484    
2485     case OP_NOTPOSQUERY:
2486     possessive = TRUE;
2487     min = 0;
2488     max = 1;
2489     ecode++;
2490     goto REPEATNOTCHAR;
2491    
2492     case OP_NOTPOSUPTO:
2493     possessive = TRUE;
2494     min = 0;
2495     max = GET2(ecode, 1);
2496     ecode += 3;
2497     goto REPEATNOTCHAR;
2498    
2499 nigel 77 case OP_NOTSTAR:
2500     case OP_NOTMINSTAR:
2501     case OP_NOTPLUS:
2502     case OP_NOTMINPLUS:
2503     case OP_NOTQUERY:
2504     case OP_NOTMINQUERY:
2505     c = *ecode++ - OP_NOTSTAR;
2506     minimize = (c & 1) != 0;
2507     min = rep_min[c]; /* Pick up values from tables; */
2508     max = rep_max[c]; /* zero for max => infinity */
2509     if (max == 0) max = INT_MAX;
2510    
2511     /* Common code for all repeated single-byte matches. We can give up quickly
2512     if there are fewer than the minimum number of bytes left in the
2513     subject. */
2514    
2515     REPEATNOTCHAR:
2516     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2517     fc = *ecode++;
2518    
2519     /* The code is duplicated for the caseless and caseful cases, for speed,
2520     since matching characters is likely to be quite common. First, ensure the
2521     minimum number of matches are present. If min = max, continue at the same
2522     level without recursing. Otherwise, if minimizing, keep trying the rest of
2523     the expression and advancing one matching character if failing, up to the
2524     maximum. Alternatively, if maximizing, find the maximum number of
2525     characters and work backwards. */
2526    
2527     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2528     max, eptr));
2529    
2530     if ((ims & PCRE_CASELESS) != 0)
2531     {
2532     fc = md->lcc[fc];
2533    
2534     #ifdef SUPPORT_UTF8
2535     /* UTF-8 mode */
2536     if (utf8)
2537     {
2538 nigel 93 register unsigned int d;
2539 nigel 77 for (i = 1; i <= min; i++)
2540     {
2541     GETCHARINC(d, eptr);
2542     if (d < 256) d = md->lcc[d];
2543     if (fc == d) RRETURN(MATCH_NOMATCH);
2544     }
2545     }
2546     else
2547     #endif
2548    
2549     /* Not UTF-8 mode */
2550     {
2551     for (i = 1; i <= min; i++)
2552     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2553     }
2554    
2555     if (min == max) continue;
2556    
2557     if (minimize)
2558     {
2559     #ifdef SUPPORT_UTF8
2560     /* UTF-8 mode */
2561     if (utf8)
2562     {
2563 nigel 93 register unsigned int d;
2564 nigel 77 for (fi = min;; fi++)
2565     {
2566 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2567 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2568     GETCHARINC(d, eptr);
2569     if (d < 256) d = md->lcc[d];
2570     if (fi >= max || eptr >= md->end_subject || fc == d)
2571     RRETURN(MATCH_NOMATCH);
2572     }
2573     }
2574     else
2575     #endif
2576     /* Not UTF-8 mode */
2577     {
2578     for (fi = min;; fi++)
2579     {
2580 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2581 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2582     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2583     RRETURN(MATCH_NOMATCH);
2584     }
2585     }
2586     /* Control never gets here */
2587     }
2588    
2589     /* Maximize case */
2590    
2591     else
2592     {
2593     pp = eptr;
2594    
2595     #ifdef SUPPORT_UTF8
2596     /* UTF-8 mode */
2597     if (utf8)
2598     {
2599 nigel 93 register unsigned int d;
2600 nigel 77 for (i = min; i < max; i++)
2601     {
2602     int len = 1;
2603     if (eptr >= md->end_subject) break;
2604     GETCHARLEN(d, eptr, len);
2605     if (d < 256) d = md->lcc[d];
2606     if (fc == d) break;
2607     eptr += len;
2608     }
2609 nigel 93 if (possessive) continue;
2610     for(;;)
2611 nigel 77 {
2612 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2613 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2614     if (eptr-- == pp) break; /* Stop if tried at original pos */
2615     BACKCHAR(eptr);
2616     }
2617     }
2618     else
2619     #endif
2620     /* Not UTF-8 mode */
2621     {
2622     for (i = min; i < max; i++)
2623     {
2624     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2625     eptr++;
2626     }
2627 nigel 93 if (possessive) continue;
2628 nigel 77 while (eptr >= pp)
2629     {
2630 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2631 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2632     eptr--;
2633     }
2634     }
2635    
2636     RRETURN(MATCH_NOMATCH);
2637     }
2638     /* Control never gets here */
2639     }
2640    
2641     /* Caseful comparisons */
2642    
2643     else
2644     {
2645     #ifdef SUPPORT_UTF8
2646     /* UTF-8 mode */
2647     if (utf8)
2648     {
2649 nigel 93 register unsigned int d;
2650 nigel 77 for (i = 1; i <= min; i++)
2651     {
2652     GETCHARINC(d, eptr);
2653     if (fc == d) RRETURN(MATCH_NOMATCH);
2654     }
2655     }
2656     else
2657     #endif
2658     /* Not UTF-8 mode */
2659     {
2660     for (i = 1; i <= min; i++)
2661     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2662     }
2663    
2664     if (min == max) continue;
2665    
2666     if (minimize)
2667     {
2668     #ifdef SUPPORT_UTF8
2669     /* UTF-8 mode */
2670     if (utf8)
2671     {
2672 nigel 93 register unsigned int d;
2673 nigel 77 for (fi = min;; fi++)
2674     {
2675 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2676 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2677     GETCHARINC(d, eptr);
2678     if (fi >= max || eptr >= md->end_subject || fc == d)
2679     RRETURN(MATCH_NOMATCH);
2680     }
2681     }
2682     else
2683     #endif
2684     /* Not UTF-8 mode */
2685     {
2686     for (fi = min;; fi++)
2687     {
2688 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2689 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2690     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2691     RRETURN(MATCH_NOMATCH);
2692     }
2693     }
2694     /* Control never gets here */
2695     }
2696    
2697     /* Maximize case */
2698    
2699     else
2700     {
2701     pp = eptr;
2702    
2703     #ifdef SUPPORT_UTF8
2704     /* UTF-8 mode */
2705     if (utf8)
2706     {
2707 nigel 93 register unsigned int d;
2708 nigel 77 for (i = min; i < max; i++)
2709     {
2710     int len = 1;
2711     if (eptr >= md->end_subject) break;
2712     GETCHARLEN(d, eptr, len);
2713     if (fc == d) break;
2714     eptr += len;
2715     }
2716 nigel 93 if (possessive) continue;
2717 nigel 77 for(;;)
2718     {
2719 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2720 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2721     if (eptr-- == pp) break; /* Stop if tried at original pos */
2722     BACKCHAR(eptr);
2723     }
2724     }
2725     else
2726     #endif
2727     /* Not UTF-8 mode */
2728     {
2729     for (i = min; i < max; i++)
2730     {
2731     if (eptr >= md->end_subject || fc == *eptr) break;
2732     eptr++;
2733     }
2734 nigel 93 if (possessive) continue;
2735 nigel 77 while (eptr >= pp)
2736     {
2737 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2738 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2739     eptr--;
2740     }
2741     }
2742    
2743     RRETURN(MATCH_NOMATCH);
2744     }
2745     }
2746     /* Control never gets here */
2747    
2748     /* Match a single character type repeatedly; several different opcodes
2749     share code. This is very similar to the code for single characters, but we
2750     repeat it in the interests of efficiency. */
2751    
2752     case OP_TYPEEXACT:
2753     min = max = GET2(ecode, 1);
2754     minimize = TRUE;
2755     ecode += 3;
2756     goto REPEATTYPE;
2757    
2758     case OP_TYPEUPTO:
2759     case OP_TYPEMINUPTO:
2760     min = 0;
2761     max = GET2(ecode, 1);
2762     minimize = *ecode == OP_TYPEMINUPTO;
2763     ecode += 3;
2764     goto REPEATTYPE;
2765    
2766 nigel 93 case OP_TYPEPOSSTAR:
2767     possessive = TRUE;
2768     min = 0;
2769     max = INT_MAX;
2770     ecode++;
2771     goto REPEATTYPE;
2772    
2773     case OP_TYPEPOSPLUS:
2774     possessive = TRUE;
2775     min = 1;
2776     max = INT_MAX;
2777     ecode++;
2778     goto REPEATTYPE;
2779    
2780     case OP_TYPEPOSQUERY:
2781     possessive = TRUE;
2782     min = 0;
2783     max = 1;
2784     ecode++;
2785     goto REPEATTYPE;
2786    
2787     case OP_TYPEPOSUPTO:
2788     possessive = TRUE;
2789     min = 0;
2790     max = GET2(ecode, 1);
2791     ecode += 3;
2792     goto REPEATTYPE;
2793    
2794 nigel 77 case OP_TYPESTAR:
2795     case OP_TYPEMINSTAR:
2796     case OP_TYPEPLUS:
2797     case OP_TYPEMINPLUS:
2798     case OP_TYPEQUERY:
2799     case OP_TYPEMINQUERY:
2800     c = *ecode++ - OP_TYPESTAR;
2801     minimize = (c & 1) != 0;
2802     min = rep_min[c]; /* Pick up values from tables; */
2803     max = rep_max[c]; /* zero for max => infinity */
2804     if (max == 0) max = INT_MAX;
2805    
2806     /* Common code for all repeated single character type matches. Note that
2807     in UTF-8 mode, '.' matches a character of any length, but for the other
2808     character types, the valid characters are all one-byte long. */
2809    
2810     REPEATTYPE:
2811     ctype = *ecode++; /* Code for the character type */
2812    
2813     #ifdef SUPPORT_UCP
2814     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2815     {
2816     prop_fail_result = ctype == OP_NOTPROP;
2817     prop_type = *ecode++;
2818 nigel 87 prop_value = *ecode++;
2819 nigel 77 }
2820     else prop_type = -1;
2821     #endif
2822    
2823     /* First, ensure the minimum number of matches are present. Use inline
2824     code for maximizing the speed, and do the type test once at the start
2825     (i.e. keep it out of the loop). Also we can test that there are at least
2826     the minimum number of bytes before we start. This isn't as effective in
2827     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2828     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2829     and single-bytes. */
2830    
2831     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2832     if (min > 0)
2833     {
2834     #ifdef SUPPORT_UCP
2835 nigel 87 if (prop_type >= 0)
2836 nigel 77 {
2837 nigel 87 switch(prop_type)
2838 nigel 77 {
2839 nigel 87 case PT_ANY:
2840     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2841     for (i = 1; i <= min; i++)
2842     {
2843     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2844 ph10 184 GETCHARINCTEST(c, eptr);
2845 nigel 87 }
2846     break;
2847    
2848     case PT_LAMP:
2849     for (i = 1; i <= min; i++)
2850     {
2851     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2852 ph10 184 GETCHARINCTEST(c, eptr);
2853 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2854     if ((prop_chartype == ucp_Lu ||
2855     prop_chartype == ucp_Ll ||
2856     prop_chartype == ucp_Lt) == prop_fail_result)
2857     RRETURN(MATCH_NOMATCH);
2858     }
2859     break;
2860    
2861     case PT_GC:
2862     for (i = 1; i <= min; i++)
2863     {
2864     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2865 ph10 184 GETCHARINCTEST(c, eptr);
2866 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2867     if ((prop_category == prop_value) == prop_fail_result)
2868     RRETURN(MATCH_NOMATCH);
2869     }
2870     break;
2871    
2872     case PT_PC:
2873     for (i = 1; i <= min; i++)
2874     {
2875     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2876 ph10 184 GETCHARINCTEST(c, eptr);
2877 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2878     if ((prop_chartype == prop_value) == prop_fail_result)
2879     RRETURN(MATCH_NOMATCH);
2880     }
2881     break;
2882    
2883     case PT_SC:
2884     for (i = 1; i <= min; i++)
2885     {
2886     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2887 ph10 184 GETCHARINCTEST(c, eptr);
2888 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2889     if ((prop_script == prop_value) == prop_fail_result)
2890     RRETURN(MATCH_NOMATCH);
2891     }
2892     break;
2893    
2894     default:
2895     RRETURN(PCRE_ERROR_INTERNAL);
2896 nigel 77 }
2897     }
2898    
2899     /* Match extended Unicode sequences. We will get here only if the
2900     support is in the binary; otherwise a compile-time error occurs. */
2901    
2902     else if (ctype == OP_EXTUNI)
2903     {
2904     for (i = 1; i <= min; i++)
2905     {
2906     GETCHARINCTEST(c, eptr);
2907 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2908 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2909     while (eptr < md->end_subject)
2910     {
2911     int len = 1;
2912     if (!utf8) c = *eptr; else
2913     {
2914     GETCHARLEN(c, eptr, len);
2915     }
2916 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2917 nigel 77 if (prop_category != ucp_M) break;
2918     eptr += len;
2919     }
2920     }
2921     }
2922    
2923     else
2924     #endif /* SUPPORT_UCP */
2925    
2926     /* Handle all other cases when the coding is UTF-8 */
2927    
2928     #ifdef SUPPORT_UTF8
2929     if (utf8) switch(ctype)
2930     {
2931     case OP_ANY:
2932     for (i = 1; i <= min; i++)
2933     {
2934     if (eptr >= md->end_subject ||
2935 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2936 nigel 77 RRETURN(MATCH_NOMATCH);
2937 nigel 91 eptr++;
2938 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2939     }
2940     break;
2941    
2942     case OP_ANYBYTE:
2943     eptr += min;
2944     break;
2945    
2946 nigel 93 case OP_ANYNL:
2947     for (i = 1; i <= min; i++)
2948     {
2949     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2950     GETCHARINC(c, eptr);
2951     switch(c)
2952     {
2953     default: RRETURN(MATCH_NOMATCH);
2954     case 0x000d:
2955     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2956     break;
2957     case 0x000a:
2958     case 0x000b:
2959     case 0x000c:
2960     case 0x0085:
2961     case 0x2028:
2962     case 0x2029:
2963     break;
2964     }
2965     }
2966     break;
2967    
2968 ph10 178 case OP_NOT_HSPACE:
2969     for (i = 1; i <= min; i++)
2970     {
2971     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2972     GETCHARINC(c, eptr);
2973     switch(c)
2974     {
2975     default: break;
2976     case 0x09: /* HT */
2977     case 0x20: /* SPACE */
2978     case 0xa0: /* NBSP */
2979     case 0x1680: /* OGHAM SPACE MARK */
2980     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2981     case 0x2000: /* EN QUAD */
2982     case 0x2001: /* EM QUAD */
2983     case 0x2002: /* EN SPACE */
2984     case 0x2003: /* EM SPACE */
2985     case 0x2004: /* THREE-PER-EM SPACE */
2986     case 0x2005: /* FOUR-PER-EM SPACE */
2987     case 0x2006: /* SIX-PER-EM SPACE */
2988     case 0x2007: /* FIGURE SPACE */
2989     case 0x2008: /* PUNCTUATION SPACE */
2990     case 0x2009: /* THIN SPACE */
2991     case 0x200A: /* HAIR SPACE */
2992     case 0x202f: /* NARROW NO-BREAK SPACE */
2993     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2994     case 0x3000: /* IDEOGRAPHIC SPACE */
2995     RRETURN(MATCH_NOMATCH);
2996     }
2997     }
2998     break;
2999 ph10 182
3000 ph10 178 case OP_HSPACE:
3001     for (i = 1; i <= min; i++)
3002     {
3003     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3004     GETCHARINC(c, eptr);
3005     switch(c)
3006     {
3007     default: RRETURN(MATCH_NOMATCH);
3008     case 0x09: /* HT */
3009     case 0x20: /* SPACE */
3010     case 0xa0: /* NBSP */
3011     case 0x1680: /* OGHAM SPACE MARK */
3012     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3013     case 0x2000: /* EN QUAD */
3014     case 0x2001: /* EM QUAD */
3015     case 0x2002: /* EN SPACE */
3016     case 0x2003: /* EM SPACE */
3017     case 0x2004: /* THREE-PER-EM SPACE */
3018     case 0x2005: /* FOUR-PER-EM SPACE */
3019     case 0x2006: /* SIX-PER-EM SPACE */
3020     case 0x2007: /* FIGURE SPACE */
3021     case 0x2008: /* PUNCTUATION SPACE */
3022     case 0x2009: /* THIN SPACE */
3023     case 0x200A: /* HAIR SPACE */
3024     case 0x202f: /* NARROW NO-BREAK SPACE */
3025     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3026     case 0x3000: /* IDEOGRAPHIC SPACE */
3027     break;
3028     }
3029     }
3030     break;
3031 ph10 182
3032 ph10 178 case OP_NOT_VSPACE:
3033     for (i = 1; i <= min; i++)
3034     {
3035     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3036     GETCHARINC(c, eptr);
3037     switch(c)
3038     {
3039     default: break;
3040     case 0x0a: /* LF */
3041     case 0x0b: /* VT */
3042     case 0x0c: /* FF */
3043     case 0x0d: /* CR */
3044     case 0x85: /* NEL */
3045     case 0x2028: /* LINE SEPARATOR */
3046     case 0x2029: /* PARAGRAPH SEPARATOR */
3047     RRETURN(MATCH_NOMATCH);
3048     }
3049     }
3050     break;
3051 ph10 182
3052 ph10 178 case OP_VSPACE:
3053     for (i = 1; i <= min; i++)
3054     {
3055     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3056     GETCHARINC(c, eptr);
3057     switch(c)
3058     {
3059     default: RRETURN(MATCH_NOMATCH);
3060     case 0x0a: /* LF */
3061     case 0x0b: /* VT */
3062     case 0x0c: /* FF */
3063     case 0x0d: /* CR */
3064     case 0x85: /* NEL */
3065     case 0x2028: /* LINE SEPARATOR */
3066     case 0x2029: /* PARAGRAPH SEPARATOR */
3067 ph10 182 break;
3068 ph10 178 }
3069     }
3070     break;
3071    
3072 nigel 77 case OP_NOT_DIGIT:
3073     for (i = 1; i <= min; i++)
3074     {
3075     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3076     GETCHARINC(c, eptr);
3077     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3078     RRETURN(MATCH_NOMATCH);
3079     }
3080     break;
3081    
3082     case OP_DIGIT:
3083     for (i = 1; i <= min; i++)
3084     {
3085     if (eptr >= md->end_subject ||
3086     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3087     RRETURN(MATCH_NOMATCH);
3088     /* No need to skip more bytes - we know it's a 1-byte character */
3089     }
3090     break;
3091    
3092     case OP_NOT_WHITESPACE:
3093     for (i = 1; i <= min; i++)
3094     {
3095     if (eptr >= md->end_subject ||
3096     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_space) != 0))
3097     RRETURN(MATCH_NOMATCH);
3098     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3099     }
3100     break;
3101    
3102     case OP_WHITESPACE:
3103     for (i = 1; i <= min; i++)
3104     {
3105     if (eptr >= md->end_subject ||
3106     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3107     RRETURN(MATCH_NOMATCH);
3108     /* No need to skip more bytes - we know it's a 1-byte character */
3109     }
3110     break;
3111    
3112     case OP_NOT_WORDCHAR:
3113     for (i = 1; i <= min; i++)
3114     {
3115     if (eptr >= md->end_subject ||
3116     (*eptr < 128 && (md->ctypes[*eptr++] & ctype_word) != 0))
3117     RRETURN(MATCH_NOMATCH);
3118     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3119     }
3120     break;
3121    
3122     case OP_WORDCHAR:
3123     for (i = 1; i <= min; i++)
3124     {
3125     if (eptr >= md->end_subject ||
3126     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3127     RRETURN(MATCH_NOMATCH);
3128     /* No need to skip more bytes - we know it's a 1-byte character */
3129     }
3130     break;
3131    
3132     default:
3133     RRETURN(PCRE_ERROR_INTERNAL);
3134     } /* End switch(ctype) */
3135    
3136     else
3137     #endif /* SUPPORT_UTF8 */
3138    
3139     /* Code for the non-UTF-8 case for minimum matching of operators other
3140 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3141     number of bytes present, as this was tested above. */
3142 nigel 77
3143     switch(ctype)
3144     {
3145     case OP_ANY:
3146     if ((ims & PCRE_DOTALL) == 0)
3147     {
3148     for (i = 1; i <= min; i++)
3149 nigel 91 {
3150 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3151 nigel 91 eptr++;
3152     }
3153 nigel 77 }
3154     else eptr += min;
3155     break;
3156    
3157     case OP_ANYBYTE:
3158     eptr += min;
3159     break;
3160    
3161 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3162     bytes are present in this case. */
3163    
3164     case OP_ANYNL:
3165     for (i = 1; i <= min; i++)
3166     {
3167     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3168     switch(*eptr++)
3169     {
3170     default: RRETURN(MATCH_NOMATCH);
3171     case 0x000d:
3172     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3173     break;
3174     case 0x000a:
3175     case 0x000b:
3176     case 0x000c:
3177     case 0x0085:
3178     break;
3179     }
3180     }
3181     break;
3182    
3183 ph10 178 case OP_NOT_HSPACE:
3184     for (i = 1; i <= min; i++)
3185     {
3186     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3187     switch(*eptr++)
3188     {
3189     default: break;
3190     case 0x09: /* HT */
3191     case 0x20: /* SPACE */
3192     case 0xa0: /* NBSP */
3193     RRETURN(MATCH_NOMATCH);
3194     }
3195     }
3196     break;
3197    
3198     case OP_HSPACE:
3199     for (i = 1; i <= min; i++)
3200     {
3201     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3202     switch(*eptr++)
3203     {
3204     default: RRETURN(MATCH_NOMATCH);
3205     case 0x09: /* HT */
3206     case 0x20: /* SPACE */
3207     case 0xa0: /* NBSP */
3208 ph10 182 break;
3209 ph10 178 }
3210     }
3211     break;
3212    
3213     case OP_NOT_VSPACE:
3214     for (i = 1; i <= min; i++)
3215     {
3216     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3217     switch(*eptr++)
3218     {
3219     default: break;
3220     case 0x0a: /* LF */
3221     case 0x0b: /* VT */
3222     case 0x0c: /* FF */
3223     case 0x0d: /* CR */
3224     case 0x85: /* NEL */
3225     RRETURN(MATCH_NOMATCH);
3226     }
3227     }
3228     break;
3229    
3230     case OP_VSPACE:
3231     for (i = 1; i <= min; i++)
3232     {
3233     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3234     switch(*eptr++)
3235     {
3236     default: RRETURN(MATCH_NOMATCH);
3237     case 0x0a: /* LF */
3238     case 0x0b: /* VT */
3239     case 0x0c: /* FF */
3240     case 0x0d: /* CR */
3241     case 0x85: /* NEL */
3242 ph10 182 break;
3243 ph10 178 }
3244     }
3245     break;
3246    
3247 nigel 77 case OP_NOT_DIGIT:
3248     for (i = 1; i <= min; i++)
3249     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3250     break;
3251    
3252     case OP_DIGIT:
3253     for (i = 1; i <= min; i++)
3254     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3255     break;
3256    
3257     case OP_NOT_WHITESPACE:
3258     for (i = 1; i <= min; i++)
3259     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3260     break;
3261    
3262     case OP_WHITESPACE:
3263     for (i = 1; i <= min; i++)
3264     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3265     break;
3266    
3267     case OP_NOT_WORDCHAR:
3268     for (i = 1; i <= min; i++)
3269     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3270     RRETURN(MATCH_NOMATCH);
3271     break;
3272    
3273     case OP_WORDCHAR:
3274     for (i = 1; i <= min; i++)
3275     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3276     RRETURN(MATCH_NOMATCH);
3277     break;
3278    
3279     default:
3280     RRETURN(PCRE_ERROR_INTERNAL);
3281     }
3282     }
3283    
3284     /* If min = max, continue at the same level without recursing */
3285    
3286     if (min == max) continue;
3287    
3288     /* If minimizing, we have to test the rest of the pattern before each
3289     subsequent match. Again, separate the UTF-8 case for speed, and also
3290     separate the UCP cases. */
3291    
3292     if (minimize)
3293     {
3294     #ifdef SUPPORT_UCP
3295 nigel 87 if (prop_type >= 0)
3296 nigel 77 {
3297 nigel 87 switch(prop_type)
3298 nigel 77 {
3299 nigel 87 case PT_ANY:
3300     for (fi = min;; fi++)
3301     {
3302 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3303 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3304     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3305     GETCHARINC(c, eptr);
3306     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3307     }
3308 nigel 93 /* Control never gets here */
3309 nigel 87
3310     case PT_LAMP:
3311     for (fi = min;; fi++)
3312     {
3313 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3314 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3315     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3316     GETCHARINC(c, eptr);
3317     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3318     if ((prop_chartype == ucp_Lu ||
3319     prop_chartype == ucp_Ll ||
3320     prop_chartype == ucp_Lt) == prop_fail_result)
3321     RRETURN(MATCH_NOMATCH);
3322     }
3323 nigel 93 /* Control never gets here */
3324 nigel 87
3325     case PT_GC:
3326     for (fi = min;; fi++)
3327     {
3328 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3329 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3330     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3331     GETCHARINC(c, eptr);
3332     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3333     if ((prop_category == prop_value) == prop_fail_result)
3334     RRETURN(MATCH_NOMATCH);
3335     }
3336 nigel 93 /* Control never gets here */
3337 nigel 87
3338     case PT_PC:
3339     for (fi = min;; fi++)
3340     {
3341 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3342 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3343     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3344     GETCHARINC(c, eptr);
3345     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3346     if ((prop_chartype == prop_value) == prop_fail_result)
3347     RRETURN(MATCH_NOMATCH);
3348     }
3349 nigel 93 /* Control never gets here */
3350 nigel 87
3351     case PT_SC:
3352     for (fi = min;; fi++)
3353     {
3354 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3355 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3356     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3357     GETCHARINC(c, eptr);
3358     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3359     if ((prop_script == prop_value) == prop_fail_result)
3360     RRETURN(MATCH_NOMATCH);
3361     }
3362 nigel 93 /* Control never gets here */
3363 nigel 87
3364     default:
3365     RRETURN(PCRE_ERROR_INTERNAL);
3366 nigel 77 }
3367     }
3368    
3369     /* Match extended Unicode sequences. We will get here only if the
3370     support is in the binary; otherwise a compile-time error occurs. */
3371    
3372     else if (ctype == OP_EXTUNI)
3373     {
3374     for (fi = min;; fi++)
3375     {
3376 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3377 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3378     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3379     GETCHARINCTEST(c, eptr);
3380 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3381 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3382     while (eptr < md->end_subject)
3383     {
3384     int len = 1;
3385     if (!utf8) c = *eptr; else
3386     {
3387     GETCHARLEN(c, eptr, len);
3388     }
3389 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3390 nigel 77 if (prop_category != ucp_M) break;
3391     eptr += len;
3392     }
3393     }
3394     }
3395    
3396     else
3397     #endif /* SUPPORT_UCP */
3398    
3399     #ifdef SUPPORT_UTF8
3400     /* UTF-8 mode */
3401     if (utf8)
3402     {
3403     for (fi = min;; fi++)
3404     {
3405 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3406 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3407 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3408     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3409 nigel 93 IS_NEWLINE(eptr)))
3410 nigel 91 RRETURN(MATCH_NOMATCH);
3411 nigel 77
3412     GETCHARINC(c, eptr);
3413     switch(ctype)
3414     {
3415 nigel 91 case OP_ANY: /* This is the DOTALL case */
3416 nigel 77 break;
3417    
3418     case OP_ANYBYTE:
3419     break;
3420    
3421 nigel 93 case OP_ANYNL:
3422     switch(c)
3423     {
3424     default: RRETURN(MATCH_NOMATCH);
3425     case 0x000d:
3426     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3427     break;
3428     case 0x000a:
3429     case 0x000b:
3430     case 0x000c:
3431     case 0x0085:
3432     case 0x2028:
3433     case 0x2029:
3434     break;
3435     }
3436     break;
3437    
3438 ph10 178 case OP_NOT_HSPACE:
3439     switch(c)
3440     {
3441     default: break;
3442     case 0x09: /* HT */
3443     case 0x20: /* SPACE */
3444     case 0xa0: /* NBSP */
3445     case 0x1680: /* OGHAM SPACE MARK */
3446     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3447     case 0x2000: /* EN QUAD */
3448     case 0x2001: /* EM QUAD */
3449     case 0x2002: /* EN SPACE */
3450     case 0x2003: /* EM SPACE */
3451     case 0x2004: /* THREE-PER-EM SPACE */
3452     case 0x2005: /* FOUR-PER-EM SPACE */
3453     case 0x2006: /* SIX-PER-EM SPACE */
3454     case 0x2007: /* FIGURE SPACE */
3455     case 0x2008: /* PUNCTUATION SPACE */
3456     case 0x2009: /* THIN SPACE */
3457     case 0x200A: /* HAIR SPACE */
3458     case 0x202f: /* NARROW NO-BREAK SPACE */
3459     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3460     case 0x3000: /* IDEOGRAPHIC SPACE */
3461     RRETURN(MATCH_NOMATCH);
3462     }
3463     break;
3464    
3465     case OP_HSPACE:
3466     switch(c)
3467     {
3468     default: RRETURN(MATCH_NOMATCH);
3469     case 0x09: /* HT */
3470     case 0x20: /* SPACE */
3471     case 0xa0: /* NBSP */
3472     case 0x1680: /* OGHAM SPACE MARK */
3473     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3474     case 0x2000: /* EN QUAD */
3475     case 0x2001: /* EM QUAD */
3476     case 0x2002: /* EN SPACE */
3477     case 0x2003: /* EM SPACE */
3478     case 0x2004: /* THREE-PER-EM SPACE */
3479     case 0x2005: /* FOUR-PER-EM SPACE */
3480     case 0x2006: /* SIX-PER-EM SPACE */
3481     case 0x2007: /* FIGURE SPACE */
3482     case 0x2008: /* PUNCTUATION SPACE */
3483     case 0x2009: /* THIN SPACE */
3484     case 0x200A: /* HAIR SPACE */
3485     case 0x202f: /* NARROW NO-BREAK SPACE */
3486     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3487     case 0x3000: /* IDEOGRAPHIC SPACE */
3488     break;
3489     }
3490     break;
3491    
3492     case OP_NOT_VSPACE:
3493     switch(c)
3494     {
3495     default: break;
3496     case 0x0a: /* LF */
3497     case 0x0b: /* VT */
3498     case 0x0c: /* FF */
3499     case 0x0d: /* CR */
3500     case 0x85: /* NEL */
3501     case 0x2028: /* LINE SEPARATOR */
3502     case 0x2029: /* PARAGRAPH SEPARATOR */
3503     RRETURN(MATCH_NOMATCH);
3504     }
3505     break;
3506    
3507     case OP_VSPACE:
3508     switch(c)
3509     {
3510     default: RRETURN(MATCH_NOMATCH);
3511     case 0x0a: /* LF */
3512     case 0x0b: /* VT */
3513     case 0x0c: /* FF */
3514     case 0x0d: /* CR */
3515     case 0x85: /* NEL */
3516     case 0x2028: /* LINE SEPARATOR */
3517     case 0x2029: /* PARAGRAPH SEPARATOR */
3518     break;
3519     }
3520     break;
3521    
3522 nigel 77 case OP_NOT_DIGIT:
3523     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3524     RRETURN(MATCH_NOMATCH);
3525     break;
3526    
3527     case OP_DIGIT:
3528     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3529     RRETURN(MATCH_NOMATCH);
3530     break;
3531    
3532     case OP_NOT_WHITESPACE:
3533     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3534     RRETURN(MATCH_NOMATCH);
3535     break;
3536    
3537     case OP_WHITESPACE:
3538     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3539     RRETURN(MATCH_NOMATCH);
3540     break;
3541    
3542     case OP_NOT_WORDCHAR:
3543     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3544     RRETURN(MATCH_NOMATCH);
3545     break;
3546    
3547     case OP_WORDCHAR:
3548     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3549     RRETURN(MATCH_NOMATCH);
3550     break;
3551    
3552     default:
3553     RRETURN(PCRE_ERROR_INTERNAL);
3554     }
3555     }
3556     }
3557     else
3558     #endif
3559     /* Not UTF-8 mode */
3560     {
3561     for (fi = min;; fi++)
3562     {
3563 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3564 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3565 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3566 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3567 nigel 91 RRETURN(MATCH_NOMATCH);
3568    
3569 nigel 77 c = *eptr++;
3570     switch(ctype)
3571     {
3572 nigel 91 case OP_ANY: /* This is the DOTALL case */
3573 nigel 77 break;
3574    
3575     case OP_ANYBYTE:
3576     break;
3577    
3578 nigel 93 case OP_ANYNL:
3579     switch(c)
3580     {
3581     default: RRETURN(MATCH_NOMATCH);
3582     case 0x000d:
3583     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3584     break;
3585     case 0x000a:
3586     case 0x000b:
3587     case 0x000c:
3588     case 0x0085:
3589     break;
3590     }
3591     break;
3592    
3593 ph10 178 case OP_NOT_HSPACE:
3594     switch(c)
3595     {
3596     default: break;
3597     case 0x09: /* HT */
3598     case 0x20: /* SPACE */
3599     case 0xa0: /* NBSP */
3600     RRETURN(MATCH_NOMATCH);
3601     }
3602     break;
3603    
3604     case OP_HSPACE:
3605     switch(c)
3606     {
3607     default: RRETURN(MATCH_NOMATCH);
3608     case 0x09: /* HT */
3609     case 0x20: /* SPACE */
3610     case 0xa0: /* NBSP */
3611     break;
3612     }
3613     break;
3614    
3615     case OP_NOT_VSPACE:
3616     switch(c)
3617     {
3618     default: break;
3619     case 0x0a: /* LF */
3620     case 0x0b: /* VT */
3621     case 0x0c: /* FF */
3622     case 0x0d: /* CR */
3623     case 0x85: /* NEL */
3624     RRETURN(MATCH_NOMATCH);
3625     }
3626     break;
3627    
3628     case OP_VSPACE:
3629     switch(c)
3630     {
3631     default: RRETURN(MATCH_NOMATCH);
3632     case 0x0a: /* LF */
3633     case 0x0b: /* VT */
3634     case 0x0c: /* FF */
3635     case 0x0d: /* CR */
3636     case 0x85: /* NEL */
3637     break;
3638     }
3639     break;
3640    
3641 nigel 77 case OP_NOT_DIGIT:
3642     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3643     break;
3644    
3645     case OP_DIGIT:
3646     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3647     break;
3648    
3649     case OP_NOT_WHITESPACE:
3650     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3651     break;
3652    
3653     case OP_WHITESPACE:
3654     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3655     break;
3656    
3657     case OP_NOT_WORDCHAR:
3658     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3659     break;
3660    
3661     case OP_WORDCHAR:
3662     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3663     break;
3664    
3665     default:
3666     RRETURN(PCRE_ERROR_INTERNAL);
3667     }
3668     }
3669     }
3670     /* Control never gets here */
3671     }
3672    
3673 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3674 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3675     UTF-8 and UCP stuff separate. */
3676    
3677     else
3678     {
3679     pp = eptr; /* Remember where we started */
3680    
3681     #ifdef SUPPORT_UCP
3682 nigel 87 if (prop_type >= 0)
3683 nigel 77 {
3684 nigel 87 switch(prop_type)
3685 nigel 77 {
3686 nigel 87 case PT_ANY:
3687     for (i = min; i < max; i++)
3688     {
3689     int len = 1;
3690     if (eptr >= md->end_subject) break;
3691     GETCHARLEN(c, eptr, len);
3692     if (prop_fail_result) break;
3693     eptr+= len;
3694     }
3695     break;
3696    
3697     case PT_LAMP:
3698     for (i = min; i < max; i++)
3699     {
3700     int len = 1;
3701     if (eptr >= md->end_subject) break;
3702     GETCHARLEN(c, eptr, len);
3703     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3704     if ((prop_chartype == ucp_Lu ||
3705     prop_chartype == ucp_Ll ||
3706     prop_chartype == ucp_Lt) == prop_fail_result)
3707     break;
3708     eptr+= len;
3709     }
3710     break;
3711    
3712     case PT_GC:
3713     for (i = min; i < max; i++)
3714     {
3715     int len = 1;
3716     if (eptr >= md->end_subject) break;
3717     GETCHARLEN(c, eptr, len);
3718     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3719     if ((prop_category == prop_value) == prop_fail_result)
3720     break;
3721     eptr+= len;
3722     }
3723     break;
3724    
3725     case PT_PC:
3726     for (i = min; i < max; i++)
3727     {
3728     int len = 1;
3729     if (eptr >= md->end_subject) break;
3730     GETCHARLEN(c, eptr, len);
3731     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3732     if ((prop_chartype == prop_value) == prop_fail_result)
3733     break;
3734     eptr+= len;
3735     }
3736     break;
3737    
3738     case PT_SC:
3739     for (i = min; i < max; i++)
3740     {
3741     int len = 1;
3742     if (eptr >= md->end_subject) break;
3743     GETCHARLEN(c, eptr, len);
3744     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3745     if ((prop_script == prop_value) == prop_fail_result)
3746     break;
3747     eptr+= len;
3748     }
3749     break;
3750 nigel 77 }
3751    
3752     /* eptr is now past the end of the maximum run */
3753    
3754 nigel 93 if (possessive) continue;
3755 nigel 77 for(;;)
3756     {
3757 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3758 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3759     if (eptr-- == pp) break; /* Stop if tried at original pos */
3760 ph10 207 if (utf8) BACKCHAR(eptr);
3761 nigel 77 }
3762     }
3763    
3764     /* Match extended Unicode sequences. We will get here only if the
3765     support is in the binary; otherwise a compile-time error occurs. */
3766    
3767     else if (ctype == OP_EXTUNI)
3768     {
3769     for (i = min; i < max; i++)
3770     {
3771     if (eptr >= md->end_subject) break;
3772     GETCHARINCTEST(c, eptr);
3773 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3774 nigel 77 if (prop_category == ucp_M) break;
3775     while (eptr < md->end_subject)
3776     {
3777     int len = 1;
3778     if (!utf8) c = *eptr; else
3779     {
3780     GETCHARLEN(c, eptr, len);
3781     }
3782 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3783 nigel 77 if (prop_category != ucp_M) break;
3784     eptr += len;
3785     }
3786     }
3787    
3788     /* eptr is now past the end of the maximum run */
3789    
3790 nigel 93 if (possessive) continue;
3791 nigel 77 for(;;)
3792     {
3793 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3794 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3795     if (eptr-- == pp) break; /* Stop if tried at original pos */
3796     for (;;) /* Move back over one extended */
3797     {
3798     int len = 1;
3799     if (!utf8) c = *eptr; else
3800     {
3801 ph10 207 BACKCHAR(eptr);
3802 nigel 77 GETCHARLEN(c, eptr, len);
3803     }
3804 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3805 nigel 77 if (prop_category != ucp_M) break;
3806     eptr--;
3807     }
3808     }
3809     }
3810    
3811     else
3812     #endif /* SUPPORT_UCP */
3813    
3814     #ifdef SUPPORT_UTF8
3815     /* UTF-8 mode */
3816    
3817     if (utf8)
3818     {
3819     switch(ctype)
3820     {
3821     case OP_ANY:
3822     if (max < INT_MAX)
3823     {
3824     if ((ims & PCRE_DOTALL) == 0)
3825     {
3826     for (i = min; i < max; i++)
3827     {
3828 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3829 nigel 77 eptr++;
3830     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3831     }
3832     }
3833     else
3834     {
3835     for (i = min; i < max; i++)
3836     {
3837 nigel 91 if (eptr >= md->end_subject) break;
3838 nigel 77 eptr++;
3839     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3840     }
3841     }
3842     }
3843    
3844     /* Handle unlimited UTF-8 repeat */
3845    
3846     else
3847     {
3848     if ((ims & PCRE_DOTALL) == 0)
3849     {
3850     for (i = min; i < max; i++)
3851     {
3852 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3853 nigel 77 eptr++;
3854 ph10 190 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3855 nigel 77 }
3856     }
3857     else
3858     {
3859 ph10 190 eptr = md->end_subject;
3860 nigel 77 }
3861     }
3862     break;
3863    
3864     /* The byte case is the same as non-UTF8 */
3865    
3866     case OP_ANYBYTE:
3867     c = max - min;
3868 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3869     c = md->end_subject - eptr;
3870 nigel 77 eptr += c;
3871     break;
3872    
3873 nigel 93 case OP_ANYNL:
3874     for (i = min; i < max; i++)
3875     {
3876     int len = 1;
3877     if (eptr >= md->end_subject) break;
3878     GETCHARLEN(c, eptr, len);
3879     if (c == 0x000d)
3880     {
3881     if (++eptr >= md->end_subject) break;
3882     if (*eptr == 0x000a) eptr++;
3883     }
3884     else
3885     {
3886     if (c != 0x000a && c != 0x000b && c != 0x000c &&
3887     c != 0x0085 && c != 0x2028 && c != 0x2029)
3888     break;
3889     eptr += len;
3890     }
3891     }
3892     break;
3893    
3894 ph10 178 case OP_NOT_HSPACE:
3895 ph10 182 case OP_HSPACE:
3896 ph10 178 for (i = min; i < max; i++)
3897     {
3898 ph10 182 BOOL gotspace;
3899 ph10 178 int len = 1;
3900     if (eptr >= md->end_subject) break;
3901     GETCHARLEN(c, eptr, len);
3902     switch(c)
3903 ph10 182 {
3904     default: gotspace = FALSE; break;
3905 ph10 178 case 0x09: /* HT */
3906     case 0x20: /* SPACE */
3907     case 0xa0: /* NBSP */
3908     case 0x1680: /* OGHAM SPACE MARK */
3909     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3910     case 0x2000: /* EN QUAD */
3911     case 0x2001: /* EM QUAD */
3912     case 0x2002: /* EN SPACE */
3913     case 0x2003: /* EM SPACE */
3914     case 0x2004: /* THREE-PER-EM SPACE */
3915     case 0x2005: /* FOUR-PER-EM SPACE */
3916     case 0x2006: /* SIX-PER-EM SPACE */
3917     case 0x2007: /* FIGURE SPACE */
3918     case 0x2008: /* PUNCTUATION SPACE */
3919     case 0x2009: /* THIN SPACE */
3920     case 0x200A: /* HAIR SPACE */
3921     case 0x202f: /* NARROW NO-BREAK SPACE */
3922     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3923     case 0x3000: /* IDEOGRAPHIC SPACE */
3924     gotspace = TRUE;
3925 ph10 182 break;
3926 ph10 178 }
3927     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3928     eptr += len;
3929     }
3930     break;
3931    
3932     case OP_NOT_VSPACE:
3933 ph10 182 case OP_VSPACE:
3934 ph10 178 for (i = min; i < max; i++)
3935     {
3936 ph10 182 BOOL gotspace;
3937 ph10 178 int len = 1;
3938     if (eptr >= md->end_subject) break;
3939     GETCHARLEN(c, eptr, len);
3940     switch(c)
3941     {
3942 ph10 182 default: gotspace = FALSE; break;
3943 ph10 178 case 0x0a: /* LF */
3944     case 0x0b: /* VT */
3945     case 0x0c: /* FF */
3946     case 0x0d: /* CR */
3947     case 0x85: /* NEL */
3948     case 0x2028: /* LINE SEPARATOR */
3949     case 0x2029: /* PARAGRAPH SEPARATOR */
3950     gotspace = TRUE;
3951     break;
3952     }
3953 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3954 ph10 178 eptr += len;
3955     }
3956     break;
3957    
3958 nigel 77 case OP_NOT_DIGIT:
3959     for (i = min; i < max; i++)
3960     {
3961     int len = 1;
3962     if (eptr >= md->end_subject) break;
3963     GETCHARLEN(c, eptr, len);
3964     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3965     eptr+= len;
3966     }
3967     break;
3968    
3969     case OP_DIGIT:
3970     for (i = min; i < max; i++)
3971     {
3972     int len = 1;
3973     if (eptr >= md->end_subject) break;
3974     GETCHARLEN(c, eptr, len);
3975     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3976     eptr+= len;
3977     }
3978     break;
3979    
3980     case OP_NOT_WHITESPACE:
3981     for (i = min; i < max; i++)
3982     {
3983     int len = 1;
3984     if (eptr >= md->end_subject) break;
3985     GETCHARLEN(c, eptr, len);
3986     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
3987     eptr+= len;
3988     }
3989     break;
3990    
3991     case OP_WHITESPACE:
3992     for (i = min; i < max; i++)
3993     {
3994     int len = 1;
3995     if (eptr >= md->end_subject) break;
3996     GETCHARLEN(c, eptr, len);
3997     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
3998     eptr+= len;
3999     }
4000     break;
4001    
4002     case OP_NOT_WORDCHAR:
4003     for (i = min; i < max; i++)
4004     {
4005     int len = 1;
4006     if (eptr >= md->end_subject) break;
4007     GETCHARLEN(c, eptr, len);
4008     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4009     eptr+= len;
4010     }
4011     break;
4012    
4013     case OP_WORDCHAR:
4014     for (i = min; i < max; i++)
4015     {
4016     int len = 1;
4017     if (eptr >= md->end_subject) break;
4018     GETCHARLEN(c, eptr, len);
4019     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4020     eptr+= len;
4021     }
4022     break;
4023    
4024     default:
4025     RRETURN(PCRE_ERROR_INTERNAL);
4026     }
4027    
4028     /* eptr is now past the end of the maximum run */
4029    
4030 nigel 93 if (possessive) continue;
4031 nigel 77 for(;;)
4032     {
4033 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4034 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4035     if (eptr-- == pp) break; /* Stop if tried at original pos */
4036     BACKCHAR(eptr);
4037     }
4038     }
4039     else
4040 ph10 207 #endif /* SUPPORT_UTF8 */
4041 nigel 77
4042     /* Not UTF-8 mode */
4043     {
4044     switch(ctype)
4045     {
4046     case OP_ANY:
4047     if ((ims & PCRE_DOTALL) == 0)
4048     {
4049     for (i = min; i < max; i++)
4050     {
4051 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4052 nigel 77 eptr++;
4053     }
4054     break;
4055     }
4056     /* For DOTALL case, fall through and treat as \C */
4057    
4058     case OP_ANYBYTE:
4059     c = max - min;
4060 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4061     c = md->end_subject - eptr;
4062 nigel 77 eptr += c;
4063     break;
4064    
4065 nigel 93 case OP_ANYNL:
4066     for (i = min; i < max; i++)
4067     {
4068     if (eptr >= md->end_subject) break;
4069     c = *eptr;
4070     if (c == 0x000d)
4071     {
4072     if (++eptr >= md->end_subject) break;
4073     if (*eptr == 0x000a) eptr++;
4074     }
4075     else
4076     {
4077     if (c != 0x000a && c != 0x000b && c != 0x000c && c != 0x0085)
4078     break;
4079     eptr++;
4080     }
4081     }
4082     break;
4083    
4084 ph10 178 case OP_NOT_HSPACE:
4085     for (i = min; i < max; i++)
4086     {
4087     if (eptr >= md->end_subject) break;
4088     c = *eptr;
4089     if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4090 ph10 182 eptr++;
4091 ph10 178 }
4092     break;
4093    
4094     case OP_HSPACE:
4095     for (i = min; i < max; i++)
4096     {
4097     if (eptr >= md->end_subject) break;
4098     c = *eptr;
4099     if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4100 ph10 182 eptr++;
4101 ph10 178 }
4102     break;
4103    
4104     case OP_NOT_VSPACE:
4105     for (i = min; i < max; i++)
4106     {
4107     if (eptr >= md->end_subject) break;
4108     c = *eptr;
4109     if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4110     break;
4111 ph10 182 eptr++;
4112 ph10 178 }
4113     break;
4114    
4115     case OP_VSPACE:
4116     for (i = min; i < max; i++)
4117     {
4118     if (eptr >= md->end_subject) break;
4119     c = *eptr;
4120     if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4121     break;
4122     eptr++;
4123     }
4124     break;
4125    
4126 nigel 77 case OP_NOT_DIGIT:
4127     for (i = min; i < max; i++)
4128     {
4129     if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4130     break;
4131