/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 335 - (hide annotations) (download)
Sat Apr 12 14:36:14 2008 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 150506 byte(s)
Do not discard subpatterns with {0} quantifiers, as they may be called as 
subroutines.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
/* Lower and upper repeat counts for the common single-character repeats. In
rep_max, a value of 0 stands for "no upper limit".
NOTE(review): the six entries presumably follow the order of the repeat opcodes
(star, minstar, plus, minplus, query, minquery) - confirm against the opcode
list in pcre_internal.h. */

static const char rep_min[6] = {
  0, 0,    /* lower bound 0 */
  1, 1,    /* lower bound 1 */
  0, 0 };  /* lower bound 0 */

static const char rep_max[6] = {
  0, 0,    /* unlimited */
  0, 0,    /* unlimited */
  1, 1 };  /* at most one */
89    
90    
91    
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of characters to stdout in a readable form: characters for
which isprint() is true are written as they are, everything else is written as
a \xhh escape. When the characters come from the subject, the run is cut short
at the end of the subject.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
unsigned int ch;
int remaining = length;

/* Never run past the end of the subject when printing subject text */

if (is_subject)
  {
  if (md->end_subject - p < remaining) remaining = md->end_subject - p;
  }

for (; remaining > 0; remaining--)
  {
  ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161     /* Separate the caselesss case for speed */
162    
163     if ((ims & PCRE_CASELESS) != 0)
164     {
165     while (length-- > 0)
166     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167     }
168     else
169     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170    
171     return TRUE;
172     }
173    
174    
175    
176     /***************************************************************************
177     ****************************************************************************
178     RECURSION IN THE match() FUNCTION
179    
180 nigel 87 The match() function is highly recursive, though not every recursive call
181     increases the recursive depth. Nevertheless, some regular expressions can cause
182     it to recurse to a great depth. I was writing for Unix, so I just let it call
183     itself recursively. This uses the stack for saving everything that has to be
184     saved for a recursive call. On Unix, the stack can be large, and this works
185     fine.
186 nigel 77
187 nigel 87 It turns out that on some non-Unix-like systems there are problems with
188     programs that use a lot of stack. (This despite the fact that every last chip
189     has oodles of memory these days, and techniques for extending the stack have
190     been known for decades.) So....
191 nigel 77
192     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193     calls by keeping local variables that need to be preserved in blocks of memory
194 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
195 nigel 77 achieve this so that the actual code doesn't look very different to what it
196     always used to.
197 ph10 164
198 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
199 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
200     Switzer, the use of longjmp() has been abolished, at the cost of having to
201     provide a unique number for each call to RMATCH. There is no way of generating
202     a sequence of numbers at compile time in C. I have given them names, to make
203     them stand out more clearly.
204    
205     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
208     don't have indeterminate values; this has meant that the frame size can be
209 ph10 164 reduced because the result can be "passed back" by straight setting of the
210     variable instead of being passed in the frame.
211 nigel 77 ****************************************************************************
212     ***************************************************************************/
213    
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. Whenever this list is changed, the code at HEAP_RETURN below must be
updated in sync. */

enum {
  RM1 = 1,
  RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
223 ph10 164
224 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
225 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 ph10 164 actuall used in this definition. */
227 nigel 77
228     #ifndef NO_RECURSE
229     #define REGISTER register
230 ph10 164
231 nigel 87 #ifdef DEBUG
232 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
233 nigel 87 { \
234     printf("match() called in line %d\n", __LINE__); \
235 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
236 nigel 87 printf("to line %d\n", __LINE__); \
237     }
238     #define RRETURN(ra) \
239     { \
240     printf("match() returned %d from line %d ", ra, __LINE__); \
241     return ra; \
242     }
243     #else
244 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
245 ph10 168 rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
246 nigel 77 #define RRETURN(ra) return ra
247 nigel 87 #endif
248    
249 nigel 77 #else
250    
251    
252 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
253     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254     argument of match(), which never changes. */
255 nigel 77
256     #define REGISTER
257    
258 ph10 164 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
259 nigel 77 {\
260     heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
261 ph10 164 frame->Xwhere = rw; \
262     newframe->Xeptr = ra;\
263     newframe->Xecode = rb;\
264 ph10 168 newframe->Xmstart = mstart;\
265 ph10 164 newframe->Xoffset_top = rc;\
266     newframe->Xims = re;\
267     newframe->Xeptrb = rf;\
268     newframe->Xflags = rg;\
269     newframe->Xrdepth = frame->Xrdepth + 1;\
270     newframe->Xprevframe = frame;\
271     frame = newframe;\
272     DPRINTF(("restarting from line %d\n", __LINE__));\
273     goto HEAP_RECURSE;\
274     L_##rw:\
275     DPRINTF(("jumped back to line %d\n", __LINE__));\
276 nigel 77 }
277    
278     #define RRETURN(ra)\
279     {\
280     heapframe *newframe = frame;\
281     frame = newframe->Xprevframe;\
282     (pcre_stack_free)(newframe);\
283     if (frame != NULL)\
284     {\
285 ph10 164 rrc = ra;\
286     goto HEAP_RETURN;\
287 nigel 77 }\
288     return ra;\
289     }
290    
291    
292     /* Structure for remembering the local variables in a private frame */
293    
294     typedef struct heapframe {
295     struct heapframe *Xprevframe;
296    
297     /* Function arguments that may change */
298    
299     const uschar *Xeptr;
300     const uschar *Xecode;
301 ph10 172 const uschar *Xmstart;
302 nigel 77 int Xoffset_top;
303     long int Xims;
304     eptrblock *Xeptrb;
305     int Xflags;
306 nigel 91 unsigned int Xrdepth;
307 nigel 77
308     /* Function local variables */
309    
310     const uschar *Xcallpat;
311     const uschar *Xcharptr;
312     const uschar *Xdata;
313     const uschar *Xnext;
314     const uschar *Xpp;
315     const uschar *Xprev;
316     const uschar *Xsaved_eptr;
317    
318     recursion_info Xnew_recursive;
319    
320     BOOL Xcur_is_word;
321     BOOL Xcondition;
322     BOOL Xprev_is_word;
323    
324     unsigned long int Xoriginal_ims;
325    
326     #ifdef SUPPORT_UCP
327     int Xprop_type;
328 nigel 87 int Xprop_value;
329 nigel 77 int Xprop_fail_result;
330     int Xprop_category;
331     int Xprop_chartype;
332 nigel 87 int Xprop_script;
333 ph10 123 int Xoclength;
334     uschar Xocchars[8];
335 nigel 77 #endif
336    
337     int Xctype;
338 nigel 93 unsigned int Xfc;
339 nigel 77 int Xfi;
340     int Xlength;
341     int Xmax;
342     int Xmin;
343     int Xnumber;
344     int Xoffset;
345     int Xop;
346     int Xsave_capture_last;
347     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348     int Xstacksave[REC_STACK_SAVE_MAX];
349    
350     eptrblock Xnewptrb;
351    
352 ph10 164 /* Where to jump back to */
353 nigel 77
354 ph10 164 int Xwhere;
355 ph10 165
356 nigel 77 } heapframe;
357    
358     #endif
359    
360    
361     /***************************************************************************
362     ***************************************************************************/
363    
364    
365    
366     /*************************************************
367     * Match from current position *
368     *************************************************/
369    
370 nigel 93 /* This function is called recursively in many circumstances. Whenever it
371 nigel 77 returns a negative (error) response, the outer incarnation must also return the
372     same response.
373    
374     Performance note: It might be tempting to extract commonly used fields from the
375     md structure (e.g. utf8, end_subject) into individual variables to improve
376     performance. Tests using gcc on a SPARC disproved this; in the first case, it
377     made performance worse.
378    
379     Arguments:
380 nigel 93 eptr pointer to current character in subject
381     ecode pointer to current position in compiled code
382 ph10 168 mstart pointer to the current match start position (can be modified
383 ph10 172 by encountering \K)
384 nigel 77 offset_top current top pointer
385     md pointer to "static" info for the match
386     ims current /i, /m, and /s options
387     eptrb pointer to chain of blocks containing eptr at start of
388     brackets - for testing for empty matches
389     flags can contain
390     match_condassert - this is an assertion condition
391 nigel 93 match_cbegroup - this is the start of an unlimited repeat
392     group that can match an empty string
393 nigel 87 rdepth the recursion depth
394 nigel 77
395     Returns: MATCH_MATCH if matched ) these values are >= 0
396     MATCH_NOMATCH if failed to match )
397     a negative PCRE_ERROR_xxx value if aborted by an error condition
398 nigel 87 (e.g. stopped by repeated call or recursion limit)
399 nigel 77 */
400    
401     static int
402 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 nigel 91 int flags, unsigned int rdepth)
405 nigel 77 {
406     /* These variables do not need to be preserved over recursion in this function,
407 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
408     "register" because they are used a lot in loops. */
409 nigel 77
410 nigel 91 register int rrc; /* Returns from recursive calls */
411     register int i; /* Used for loops not involving calls to RMATCH() */
412 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414 nigel 77
415 nigel 93 BOOL minimize, possessive; /* Quantifier options */
416    
417 nigel 77 /* When recursion is not being used, all "local" variables that have to be
418     preserved over calls to RMATCH() are part of a "frame" which is obtained from
419     heap storage. Set up the top-level frame here; others are obtained from the
420     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421    
422     #ifdef NO_RECURSE
423     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424     frame->Xprevframe = NULL; /* Marks the top level */
425    
426     /* Copy in the original argument variables */
427    
428     frame->Xeptr = eptr;
429     frame->Xecode = ecode;
430 ph10 168 frame->Xmstart = mstart;
431 nigel 77 frame->Xoffset_top = offset_top;
432     frame->Xims = ims;
433     frame->Xeptrb = eptrb;
434     frame->Xflags = flags;
435 nigel 87 frame->Xrdepth = rdepth;
436 nigel 77
437     /* This is where control jumps back to to effect "recursion" */
438    
439     HEAP_RECURSE:
440    
441     /* Macros make the argument variables come from the current frame */
442    
443     #define eptr frame->Xeptr
444     #define ecode frame->Xecode
445 ph10 168 #define mstart frame->Xmstart
446 nigel 77 #define offset_top frame->Xoffset_top
447     #define ims frame->Xims
448     #define eptrb frame->Xeptrb
449     #define flags frame->Xflags
450 nigel 87 #define rdepth frame->Xrdepth
451 nigel 77
452     /* Ditto for the local variables */
453    
454     #ifdef SUPPORT_UTF8
455     #define charptr frame->Xcharptr
456     #endif
457     #define callpat frame->Xcallpat
458     #define data frame->Xdata
459     #define next frame->Xnext
460     #define pp frame->Xpp
461     #define prev frame->Xprev
462     #define saved_eptr frame->Xsaved_eptr
463    
464     #define new_recursive frame->Xnew_recursive
465    
466     #define cur_is_word frame->Xcur_is_word
467     #define condition frame->Xcondition
468     #define prev_is_word frame->Xprev_is_word
469    
470     #define original_ims frame->Xoriginal_ims
471    
472     #ifdef SUPPORT_UCP
473     #define prop_type frame->Xprop_type
474 nigel 87 #define prop_value frame->Xprop_value
475 nigel 77 #define prop_fail_result frame->Xprop_fail_result
476     #define prop_category frame->Xprop_category
477     #define prop_chartype frame->Xprop_chartype
478 nigel 87 #define prop_script frame->Xprop_script
479 ph10 115 #define oclength frame->Xoclength
480     #define occhars frame->Xocchars
481 nigel 77 #endif
482    
483     #define ctype frame->Xctype
484     #define fc frame->Xfc
485     #define fi frame->Xfi
486     #define length frame->Xlength
487     #define max frame->Xmax
488     #define min frame->Xmin
489     #define number frame->Xnumber
490     #define offset frame->Xoffset
491     #define op frame->Xop
492     #define save_capture_last frame->Xsave_capture_last
493     #define save_offset1 frame->Xsave_offset1
494     #define save_offset2 frame->Xsave_offset2
495     #define save_offset3 frame->Xsave_offset3
496     #define stacksave frame->Xstacksave
497    
498     #define newptrb frame->Xnewptrb
499    
500     /* When recursion is being used, local variables are allocated on the stack and
501     get preserved during recursion in the normal way. In this environment, fi and
502     i, and fc and c, can be the same variables. */
503    
504 nigel 93 #else /* NO_RECURSE not defined */
505 nigel 77 #define fi i
506     #define fc c
507    
508    
509 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510     const uschar *charptr; /* in small blocks of the code. My normal */
511     #endif /* style of coding would have declared */
512     const uschar *callpat; /* them within each of those blocks. */
513     const uschar *data; /* However, in order to accommodate the */
514     const uschar *next; /* version of this code that uses an */
515     USPTR pp; /* external "stack" implemented on the */
516     const uschar *prev; /* heap, it is easier to declare them all */
517     USPTR saved_eptr; /* here, so the declarations can be cut */
518     /* out in a block. The only declarations */
519     recursion_info new_recursive; /* within blocks below are for variables */
520     /* that do not have to be preserved over */
521     BOOL cur_is_word; /* a recursive call to RMATCH(). */
522     BOOL condition;
523 nigel 77 BOOL prev_is_word;
524    
525     unsigned long int original_ims;
526    
527     #ifdef SUPPORT_UCP
528     int prop_type;
529 nigel 87 int prop_value;
530 nigel 77 int prop_fail_result;
531     int prop_category;
532     int prop_chartype;
533 nigel 87 int prop_script;
534 ph10 115 int oclength;
535     uschar occhars[8];
536 nigel 77 #endif
537    
538     int ctype;
539     int length;
540     int max;
541     int min;
542     int number;
543     int offset;
544     int op;
545     int save_capture_last;
546     int save_offset1, save_offset2, save_offset3;
547     int stacksave[REC_STACK_SAVE_MAX];
548    
549     eptrblock newptrb;
550 nigel 93 #endif /* NO_RECURSE */
551 nigel 77
552     /* These statements are here to stop the compiler complaining about uninitialized
553     variables. */
554    
555     #ifdef SUPPORT_UCP
556 nigel 87 prop_value = 0;
557 nigel 77 prop_fail_result = 0;
558     #endif
559    
560 nigel 93
561 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
562     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563     used. Thanks to Ian Taylor for noticing this possibility and sending the
564     original patch. */
565    
566     TAIL_RECURSE:
567    
568 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
569     are specified by the macro RMATCH and RRETURN is used to return. When
570     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571     and a "return", respectively (possibly with some debugging if DEBUG is
572     defined). However, RMATCH isn't like a function call because it's quite a
573     complicated macro. It has to be used in one particular way. This shouldn't,
574     however, impact performance when true recursion is being used. */
575 nigel 77
576 ph10 164 #ifdef SUPPORT_UTF8
577     utf8 = md->utf8; /* Local copy of the flag */
578     #else
579     utf8 = FALSE;
580     #endif
581    
582 nigel 87 /* First check that we haven't called match() too many times, or that we
583     haven't exceeded the recursive call limit. */
584    
585 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587 nigel 77
588     original_ims = ims; /* Save for resetting on ')' */
589 nigel 91
590 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
591     string, the match_cbegroup flag is set. When this is the case, add the current
592     subject pointer to the chain of such remembered pointers, to be checked when we
593     hit the closing ket, in order to break infinite loops that match no characters.
594 ph10 197 When match() is called in other circumstances, don't add to the chain. The
595     match_cbegroup flag must NOT be used with tail recursion, because the memory
596     block that is used is on the stack, so a new one may be required for each
597     match(). */
598 nigel 77
599 nigel 93 if ((flags & match_cbegroup) != 0)
600 nigel 77 {
601 ph10 197 newptrb.epb_saved_eptr = eptr;
602     newptrb.epb_prev = eptrb;
603     eptrb = &newptrb;
604 nigel 77 }
605    
606 nigel 93 /* Now start processing the opcodes. */
607 nigel 77
608     for (;;)
609     {
610 nigel 93 minimize = possessive = FALSE;
611 nigel 77 op = *ecode;
612    
613     /* For partial matching, remember if we ever hit the end of the subject after
614     matching at least one subject character. */
615    
616     if (md->partial &&
617     eptr >= md->end_subject &&
618 ph10 168 eptr > mstart)
619 nigel 77 md->hitend = TRUE;
620 ph10 208
621 nigel 93 switch(op)
622     {
623 ph10 210 case OP_FAIL:
624 ph10 212 RRETURN(MATCH_NOMATCH);
625 ph10 211
626 ph10 210 case OP_PRUNE:
627     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628     ims, eptrb, flags, RM51);
629     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 ph10 212 RRETURN(MATCH_PRUNE);
631 ph10 211
632 ph10 210 case OP_COMMIT:
633     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634     ims, eptrb, flags, RM52);
635     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 ph10 212 RRETURN(MATCH_COMMIT);
637 ph10 211
638 ph10 210 case OP_SKIP:
639     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640     ims, eptrb, flags, RM53);
641     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
643 ph10 212 RRETURN(MATCH_SKIP);
644 ph10 211
645 ph10 210 case OP_THEN:
646     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ph10 212 ims, eptrb, flags, RM54);
648 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 ph10 212 RRETURN(MATCH_THEN);
650 ph10 211
651 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
652     the current subject position in the working slot at the top of the vector.
653     We mustn't change the current values of the data slot, because they may be
654     set from a previous iteration of this group, and be referred to by a
655     reference inside the group.
656 nigel 77
657 nigel 93 If the bracket fails to match, we need to restore this value and also the
658     values of the final offsets, in case they were set by a previous iteration
659     of the same bracket.
660 nigel 77
661 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
662     a non-capturing bracket. Don't worry about setting the flag for the error
663     case here; that is handled in the code for KET. */
664 nigel 77
665 nigel 93 case OP_CBRA:
666     case OP_SCBRA:
667     number = GET2(ecode, 1+LINK_SIZE);
668 nigel 77 offset = number << 1;
669    
670     #ifdef DEBUG
671 nigel 93 printf("start bracket %d\n", number);
672     printf("subject=");
673 nigel 77 pchars(eptr, 16, TRUE, md);
674     printf("\n");
675     #endif
676    
677     if (offset < md->offset_max)
678     {
679     save_offset1 = md->offset_vector[offset];
680     save_offset2 = md->offset_vector[offset+1];
681     save_offset3 = md->offset_vector[md->offset_end - number];
682     save_capture_last = md->capture_last;
683    
684     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686    
687 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 nigel 77 do
689     {
690 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691     ims, eptrb, flags, RM1);
692 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 nigel 77 md->capture_last = save_capture_last;
694     ecode += GET(ecode, 1);
695     }
696     while (*ecode == OP_ALT);
697    
698     DPRINTF(("bracket %d failed\n", number));
699    
700     md->offset_vector[offset] = save_offset1;
701     md->offset_vector[offset+1] = save_offset2;
702     md->offset_vector[md->offset_end - number] = save_offset3;
703    
704     RRETURN(MATCH_NOMATCH);
705     }
706    
707 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708     as a non-capturing bracket. */
709 nigel 77
710 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712    
713 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714 nigel 77
715 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717    
718 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719     final alternative within the brackets, we would return the result of a
720     recursive call to match() whatever happened. We can reduce stack usage by
721 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
722     is set.*/
723 nigel 77
724 nigel 93 case OP_BRA:
725     case OP_SBRA:
726     DPRINTF(("start non-capturing bracket\n"));
727     flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 nigel 91 for (;;)
729 nigel 77 {
730 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 nigel 93 {
732 ph10 197 if (flags == 0) /* Not a possibly empty group */
733     {
734     ecode += _pcre_OP_lengths[*ecode];
735     DPRINTF(("bracket 0 tail recursion\n"));
736     goto TAIL_RECURSE;
737     }
738    
739     /* Possibly empty group; can't use tail recursion. */
740    
741     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742     eptrb, flags, RM48);
743     RRETURN(rrc);
744 nigel 93 }
745 nigel 91
746     /* For non-final alternatives, continue the loop for a NOMATCH result;
747     otherwise return. */
748    
749 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750     eptrb, flags, RM2);
751 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 nigel 77 ecode += GET(ecode, 1);
753     }
754 nigel 91 /* Control never reaches here. */
755 nigel 77
756     /* Conditional group: compilation checked that there are no more than
757     two branches. If the condition is false, skipping the first branch takes us
758     past the end if there is only one branch, but that's OK because that is
759 nigel 91 exactly what going to the ket would do. As there is only one branch to be
760     obeyed, we can use tail recursion to avoid using another stack frame. */
761 nigel 77
762     case OP_COND:
763 nigel 93 case OP_SCOND:
764     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 nigel 77 {
766 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767     condition = md->recursive != NULL &&
768     (offset == RREF_ANY || offset == md->recursive->group_num);
769     ecode += condition? 3 : GET(ecode, 1);
770     }
771    
772     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773     {
774 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776     ecode += condition? 3 : GET(ecode, 1);
777 nigel 77 }
778    
779 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780     {
781     condition = FALSE;
782     ecode += GET(ecode, 1);
783     }
784    
785 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
786 nigel 93 the final argument match_condassert causes it to stop at the end of an
787     assertion. */
788 nigel 77
789     else
790     {
791 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792     match_condassert, RM3);
793 nigel 77 if (rrc == MATCH_MATCH)
794     {
795 nigel 93 condition = TRUE;
796     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798     }
799 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 nigel 77 {
801     RRETURN(rrc); /* Need braces because of following else */
802     }
803 nigel 93 else
804     {
805     condition = FALSE;
806     ecode += GET(ecode, 1);
807     }
808     }
809 nigel 91
810 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
811 ph10 197 we can use tail recursion to avoid using another stack frame, except when
812     match_cbegroup is required for an unlimited repeat of a possibly empty
813     group. If the second alternative doesn't exist, we can just plough on. */
814 nigel 91
815 nigel 93 if (condition || *ecode == OP_ALT)
816     {
817 nigel 91 ecode += 1 + LINK_SIZE;
818 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
819     {
820     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821     RRETURN(rrc);
822     }
823     else /* Group must match something */
824     {
825     flags = 0;
826     goto TAIL_RECURSE;
827     }
828 nigel 77 }
829 ph10 197 else /* Condition false & no 2nd alternative */
830 nigel 93 {
831     ecode += 1 + LINK_SIZE;
832     }
833     break;
834 nigel 77
835    
836 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
837     recursion, we should restore the offsets appropriately and continue from
838     after the call. */
839 nigel 77
840 ph10 210 case OP_ACCEPT:
841 nigel 77 case OP_END:
842     if (md->recursive != NULL && md->recursive->group_num == 0)
843     {
844     recursion_info *rec = md->recursive;
845 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 nigel 77 md->recursive = rec->prevrec;
847     memmove(md->offset_vector, rec->offset_save,
848     rec->saved_max * sizeof(int));
849 ph10 168 mstart = rec->save_start;
850 nigel 77 ims = original_ims;
851     ecode = rec->after_call;
852     break;
853     }
854    
855     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856     string - backtracking will then try other alternatives, if any. */
857    
858 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859     md->end_match_ptr = eptr; /* Record where we ended */
860     md->end_offset_top = offset_top; /* and how many extracts were taken */
861 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 nigel 77 RRETURN(MATCH_MATCH);
863    
864     /* Change option settings */
865    
866     case OP_OPT:
867     ims = ecode[1];
868     ecode += 2;
869     DPRINTF(("ims set to %02lx\n", ims));
870     break;
871    
872     /* Assertion brackets. Check the alternative branches in turn - the
873     matching won't pass the KET for an assertion. If any one branch matches,
874     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875     start of each branch to move the current point backwards, so the code at
876     this level is identical to the lookahead case. */
877    
878     case OP_ASSERT:
879     case OP_ASSERTBACK:
880     do
881     {
882 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883     RM4);
884 nigel 77 if (rrc == MATCH_MATCH) break;
885 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 nigel 77 ecode += GET(ecode, 1);
887     }
888     while (*ecode == OP_ALT);
889     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890    
891     /* If checking an assertion for a condition, return MATCH_MATCH. */
892    
893     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894    
895     /* Continue from after the assertion, updating the offsets high water
896     mark, since extracts may have been taken during the assertion. */
897    
898     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899     ecode += 1 + LINK_SIZE;
900     offset_top = md->end_offset_top;
901     continue;
902    
903     /* Negative assertion: all branches must fail to match */
904    
905     case OP_ASSERT_NOT:
906     case OP_ASSERTBACK_NOT:
907     do
908     {
909 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910     RM5);
911 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 nigel 77 ecode += GET(ecode,1);
914     }
915     while (*ecode == OP_ALT);
916    
917     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918    
919     ecode += 1 + LINK_SIZE;
920     continue;
921    
922     /* Move the subject pointer back. This occurs only at the start of
923     each branch of a lookbehind assertion. If we are too close to the start to
924     move back, this match function fails. When working with UTF-8 we move
925     back a number of characters, not bytes. */
926    
927     case OP_REVERSE:
928     #ifdef SUPPORT_UTF8
929     if (utf8)
930     {
931 nigel 93 i = GET(ecode, 1);
932     while (i-- > 0)
933 nigel 77 {
934     eptr--;
935     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 ph10 207 BACKCHAR(eptr);
937 nigel 77 }
938     }
939     else
940     #endif
941    
942     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943    
944     {
945 nigel 93 eptr -= GET(ecode, 1);
946 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947     }
948    
949     /* Skip to next op code */
950    
951     ecode += 1 + LINK_SIZE;
952     break;
953    
954     /* The callout item calls an external function, if one is provided, passing
955     details of the match so far. This is mainly for debugging, though the
956     function is able to force a failure. */
957    
958     case OP_CALLOUT:
959     if (pcre_callout != NULL)
960     {
961     pcre_callout_block cb;
962     cb.version = 1; /* Version 1 of the callout block */
963     cb.callout_number = ecode[1];
964     cb.offset_vector = md->offset_vector;
965 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
966 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
967 ph10 168 cb.start_match = mstart - md->start_subject;
968 nigel 77 cb.current_position = eptr - md->start_subject;
969     cb.pattern_position = GET(ecode, 2);
970     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971     cb.capture_top = offset_top/2;
972     cb.capture_last = md->capture_last;
973     cb.callout_data = md->callout_data;
974     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975     if (rrc < 0) RRETURN(rrc);
976     }
977     ecode += 2 + 2*LINK_SIZE;
978     break;
979    
980     /* Recursion either matches the current regex, or some subexpression. The
981     offset data is the offset to the starting bracket from the start of the
982     whole pattern. (This is so that it works from duplicated subpatterns.)
983    
984     If there are any capturing brackets started but not finished, we have to
985     save their starting points and reinstate them after the recursion. However,
986     we don't know how many such there are (offset_top records the completed
987     total) so we just have to save all the potential data. There may be up to
988     65535 such values, which is too large to put on the stack, but using malloc
989     for small numbers seems expensive. As a compromise, the stack is used when
990     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991     is used. If the malloc fails, PCRE_ERROR_NOMEMORY is returned. A malloc'd
992     save area is freed when the recursion matches or when all its alternatives
993     fail, but not when the recursive call returns some other error code.
994    
995     There are also other values that have to be saved. We use a chained
996     sequence of blocks that actually live on the stack. Thanks to Robin Houston
997     for the original version of this logic. */
998    
999     case OP_RECURSE:
1000     {
1001     callpat = md->start_code + GET(ecode, 1);
1002 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003     GET2(callpat, 1 + LINK_SIZE);
1004 nigel 77
1005     /* Add to "recursing stack" */
1006    
1007     new_recursive.prevrec = md->recursive;
1008     md->recursive = &new_recursive;
1009    
1010     /* Find where to continue from afterwards */
1011    
1012     ecode += 1 + LINK_SIZE;
1013     new_recursive.after_call = ecode;
1014    
1015     /* Now save the offset data. */
1016    
1017     new_recursive.saved_max = md->offset_end;
1018     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019     new_recursive.offset_save = stacksave;
1020     else
1021     {
1022     new_recursive.offset_save =
1023     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025     }
1026    
1027     memcpy(new_recursive.offset_save, md->offset_vector,
1028     new_recursive.saved_max * sizeof(int));
1029 ph10 168 new_recursive.save_start = mstart;
1030     mstart = eptr;
1031 nigel 77
1032     /* OK, now we can do the recursion. For each top-level alternative we
1033     restore the offset and recursion data. */
1034    
1035     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 nigel 77 do
1038     {
1039 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040     md, ims, eptrb, flags, RM6);
1041 nigel 77 if (rrc == MATCH_MATCH)
1042     {
1043 nigel 87 DPRINTF(("Recursion matched\n"));
1044 nigel 77 md->recursive = new_recursive.prevrec;
1045     if (new_recursive.offset_save != stacksave)
1046     (pcre_free)(new_recursive.offset_save);
1047     RRETURN(MATCH_MATCH);
1048     }
1049 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 nigel 87 {
1051     DPRINTF(("Recursion gave error %d\n", rrc));
1052     RRETURN(rrc);
1053     }
1054 nigel 77
1055     md->recursive = &new_recursive;
1056     memcpy(md->offset_vector, new_recursive.offset_save,
1057     new_recursive.saved_max * sizeof(int));
1058     callpat += GET(callpat, 1);
1059     }
1060     while (*callpat == OP_ALT);
1061    
1062     DPRINTF(("Recursion didn't match\n"));
1063     md->recursive = new_recursive.prevrec;
1064     if (new_recursive.offset_save != stacksave)
1065     (pcre_free)(new_recursive.offset_save);
1066     RRETURN(MATCH_NOMATCH);
1067     }
1068     /* Control never reaches here */
1069    
1070     /* "Once" brackets are like assertion brackets except that after a match,
1071     the point in the subject string is not moved back. Thus there can never be
1072     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073     Check the alternative branches in turn - the matching won't pass the KET
1074     for this kind of subpattern. If any one branch matches, we carry on as at
1075     the end of a normal bracket, leaving the subject pointer. */
1076    
1077     case OP_ONCE:
1078 nigel 91 prev = ecode;
1079     saved_eptr = eptr;
1080    
1081     do
1082 nigel 77 {
1083 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 nigel 91 if (rrc == MATCH_MATCH) break;
1085 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 nigel 91 ecode += GET(ecode,1);
1087     }
1088     while (*ecode == OP_ALT);
1089 nigel 77
1090 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1091 nigel 77
1092 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093 nigel 77
1094 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1095     mark, since extracts may have been taken. */
1096 nigel 77
1097 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098 nigel 77
1099 nigel 91 offset_top = md->end_offset_top;
1100     eptr = md->end_match_ptr;
1101 nigel 77
1102 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1103     happens for a repeating ket if no characters were matched in the group.
1104     This is the forcible breaking of infinite loops as implemented in Perl
1105     5.005. If there is an options reset, it will get obeyed in the normal
1106     course of events. */
1107 nigel 77
1108 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1109     {
1110     ecode += 1+LINK_SIZE;
1111     break;
1112     }
1113 nigel 77
1114 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1115     preceding bracket, in the appropriate order. The second "call" of match()
1116     uses tail recursion, to avoid using another stack frame. We need to reset
1117     any options that changed within the bracket before re-running it, so
1118     check the next opcode. */
1119 nigel 77
1120 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1121     {
1122     ims = (ims & ~PCRE_IMS) | ecode[4];
1123     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124     }
1125 nigel 77
1126 nigel 91 if (*ecode == OP_KETRMIN)
1127     {
1128 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130     ecode = prev;
1131 ph10 197 flags = 0;
1132 nigel 91 goto TAIL_RECURSE;
1133 nigel 77 }
1134 nigel 91 else /* OP_KETRMAX */
1135     {
1136 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138     ecode += 1 + LINK_SIZE;
1139 ph10 197 flags = 0;
1140 nigel 91 goto TAIL_RECURSE;
1141     }
1142     /* Control never gets here */
1143 nigel 77
1144     /* An alternation is the end of a branch; scan along to find the end of the
1145     bracketed group and go to there. */
1146    
1147     case OP_ALT:
1148     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149     break;
1150    
1151 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152     indicating that it may occur zero times. It may repeat infinitely, or not
1153     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154     with fixed upper repeat limits are compiled as a number of copies, with the
1155     optional ones preceded by BRAZERO or BRAMINZERO. */
1156 nigel 77
1157     case OP_BRAZERO:
1158     {
1159     next = ecode+1;
1160 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162     do next += GET(next,1); while (*next == OP_ALT);
1163 nigel 93 ecode = next + 1 + LINK_SIZE;
1164 nigel 77 }
1165     break;
1166    
1167     case OP_BRAMINZERO:
1168     {
1169     next = ecode+1;
1170 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1171 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173     ecode++;
1174     }
1175     break;
1176    
1177 ph10 335 case OP_SKIPZERO:
1178     {
1179     next = ecode+1;
1180     do next += GET(next,1); while (*next == OP_ALT);
1181     ecode = next + 1 + LINK_SIZE;
1182     }
1183     break;
1184    
1185 nigel 93 /* End of a group, repeated or non-repeating. */
1186 nigel 77
1187     case OP_KET:
1188     case OP_KETRMIN:
1189     case OP_KETRMAX:
1190 nigel 91 prev = ecode - GET(ecode, 1);
1191 nigel 77
1192 nigel 93 /* If this was a group that remembered the subject start, in order to break
1193     infinite repeats of empty string matches, retrieve the subject start from
1194     the chain. Otherwise, set it NULL. */
1195 nigel 77
1196 nigel 93 if (*prev >= OP_SBRA)
1197     {
1198     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1199     eptrb = eptrb->epb_prev; /* Backup to previous group */
1200     }
1201     else saved_eptr = NULL;
1202 nigel 77
1203 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1204     MATCH_MATCH, but record the current high water mark for use by positive
1205     assertions. Do this also for the "once" (atomic) groups. */
1206    
1207 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1208     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1209     *prev == OP_ONCE)
1210     {
1211     md->end_match_ptr = eptr; /* For ONCE */
1212     md->end_offset_top = offset_top;
1213     RRETURN(MATCH_MATCH);
1214     }
1215 nigel 77
1216 nigel 93 /* For capturing groups we have to check the group number back at the start
1217     and if necessary complete handling an extraction by setting the offsets and
1218     bumping the high water mark. Note that whole-pattern recursion is coded as
1219     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1220     when the OP_END is reached. Other recursion is handled here. */
1221 nigel 77
1222 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1223 nigel 91 {
1224 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1225 nigel 91 offset = number << 1;
1226 nigel 77
1227     #ifdef DEBUG
1228 nigel 91 printf("end bracket %d", number);
1229     printf("\n");
1230 nigel 77 #endif
1231    
1232 nigel 93 md->capture_last = number;
1233     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1234 nigel 91 {
1235 nigel 93 md->offset_vector[offset] =
1236     md->offset_vector[md->offset_end - number];
1237     md->offset_vector[offset+1] = eptr - md->start_subject;
1238     if (offset_top <= offset) offset_top = offset + 2;
1239     }
1240 nigel 77
1241 nigel 93 /* Handle a recursively called group. Restore the offsets
1242     appropriately and continue from after the call. */
1243 nigel 77
1244 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1245     {
1246     recursion_info *rec = md->recursive;
1247     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1248     md->recursive = rec->prevrec;
1249 ph10 168 mstart = rec->save_start;
1250 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1251     rec->saved_max * sizeof(int));
1252     ecode = rec->after_call;
1253     ims = original_ims;
1254     break;
1255 nigel 77 }
1256 nigel 91 }
1257 nigel 77
1258 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1259     flags, in case they got changed during the group. */
1260 nigel 77
1261 nigel 91 ims = original_ims;
1262     DPRINTF(("ims reset to %02lx\n", ims));
1263 nigel 77
1264 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1265     happens for a repeating ket if no characters were matched in the group.
1266     This is the forcible breaking of infinite loops as implemented in Perl
1267     5.005. If there is an options reset, it will get obeyed in the normal
1268     course of events. */
1269 nigel 77
1270 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1271     {
1272     ecode += 1 + LINK_SIZE;
1273     break;
1274     }
1275 nigel 77
1276 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1277     preceding bracket, in the appropriate order. In the second case, we can use
1278 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1279     unlimited repeat of a group that can match an empty string. */
1280 nigel 77
1281 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1282    
1283 nigel 91 if (*ecode == OP_KETRMIN)
1284     {
1285 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1286 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1287 ph10 197 if (flags != 0) /* Could match an empty string */
1288     {
1289     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1290     RRETURN(rrc);
1291     }
1292 nigel 91 ecode = prev;
1293     goto TAIL_RECURSE;
1294 nigel 77 }
1295 nigel 91 else /* OP_KETRMAX */
1296     {
1297 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1298 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1299     ecode += 1 + LINK_SIZE;
1300 ph10 197 flags = 0;
1301 nigel 91 goto TAIL_RECURSE;
1302     }
1303     /* Control never gets here */
1304 nigel 77
1305     /* Start of subject unless notbol, or after internal newline if multiline */
1306    
1307     case OP_CIRC:
1308     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1309     if ((ims & PCRE_MULTILINE) != 0)
1310     {
1311 nigel 91 if (eptr != md->start_subject &&
1312 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1313 nigel 77 RRETURN(MATCH_NOMATCH);
1314     ecode++;
1315     break;
1316     }
1317     /* ... else fall through */
1318    
1319     /* Start of subject assertion */
1320    
1321     case OP_SOD:
1322     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1323     ecode++;
1324     break;
1325    
1326     /* Start of match assertion */
1327    
1328     case OP_SOM:
1329     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1330     ecode++;
1331     break;
1332 ph10 172
1333 ph10 168 /* Reset the start of match point */
1334 ph10 172
1335 ph10 168 case OP_SET_SOM:
1336     mstart = eptr;
1337 ph10 172 ecode++;
1338     break;
1339 nigel 77
1340     /* Assert before internal newline if multiline, or before a terminating
1341     newline unless endonly is set, else end of subject unless noteol is set. */
1342    
1343     case OP_DOLL:
1344     if ((ims & PCRE_MULTILINE) != 0)
1345     {
1346     if (eptr < md->end_subject)
1347 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1348 nigel 77 else
1349     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1350     ecode++;
1351     break;
1352     }
1353     else
1354     {
1355     if (md->noteol) RRETURN(MATCH_NOMATCH);
1356     if (!md->endonly)
1357     {
1358 nigel 91 if (eptr != md->end_subject &&
1359 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1360 nigel 77 RRETURN(MATCH_NOMATCH);
1361     ecode++;
1362     break;
1363     }
1364     }
1365 nigel 91 /* ... else fall through for endonly */
1366 nigel 77
1367     /* End of subject assertion (\z) */
1368    
1369     case OP_EOD:
1370     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1371     ecode++;
1372     break;
1373    
1374     /* End of subject or ending \n assertion (\Z) */
1375    
1376     case OP_EODN:
1377 nigel 91 if (eptr != md->end_subject &&
1378 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1379 nigel 91 RRETURN(MATCH_NOMATCH);
1380 nigel 77 ecode++;
1381     break;
1382    
1383     /* Word boundary assertions */
1384    
1385     case OP_NOT_WORD_BOUNDARY:
1386     case OP_WORD_BOUNDARY:
1387     {
1388    
1389     /* Find out if the previous and current characters are "word" characters.
1390     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1391     be "non-word" characters. */
1392    
1393     #ifdef SUPPORT_UTF8
1394     if (utf8)
1395     {
1396     if (eptr == md->start_subject) prev_is_word = FALSE; else
1397     {
1398     const uschar *lastptr = eptr - 1;
1399     while((*lastptr & 0xc0) == 0x80) lastptr--;
1400     GETCHAR(c, lastptr);
1401     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1402     }
1403     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1404     {
1405     GETCHAR(c, eptr);
1406     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1407     }
1408     }
1409     else
1410     #endif
1411    
1412     /* More streamlined when not in UTF-8 mode */
1413    
1414     {
1415     prev_is_word = (eptr != md->start_subject) &&
1416     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1417     cur_is_word = (eptr < md->end_subject) &&
1418     ((md->ctypes[*eptr] & ctype_word) != 0);
1419     }
1420    
1421     /* Now see if the situation is what we want */
1422    
1423     if ((*ecode++ == OP_WORD_BOUNDARY)?
1424     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1425     RRETURN(MATCH_NOMATCH);
1426     }
1427     break;
1428    
1429     /* Match a single character type; inline for speed */
1430    
1431     case OP_ANY:
1432 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1433     {
1434 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1435 nigel 91 }
1436 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1437     if (utf8)
1438     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1439     ecode++;
1440     break;
1441    
1442     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1443     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1444    
1445     case OP_ANYBYTE:
1446     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1447     ecode++;
1448     break;
1449    
1450     case OP_NOT_DIGIT:
1451     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1452     GETCHARINCTEST(c, eptr);
1453     if (
1454     #ifdef SUPPORT_UTF8
1455     c < 256 &&
1456     #endif
1457     (md->ctypes[c] & ctype_digit) != 0
1458     )
1459     RRETURN(MATCH_NOMATCH);
1460     ecode++;
1461     break;
1462    
1463     case OP_DIGIT:
1464     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1465     GETCHARINCTEST(c, eptr);
1466     if (
1467     #ifdef SUPPORT_UTF8
1468     c >= 256 ||
1469     #endif
1470     (md->ctypes[c] & ctype_digit) == 0
1471     )
1472     RRETURN(MATCH_NOMATCH);
1473     ecode++;
1474     break;
1475    
1476     case OP_NOT_WHITESPACE:
1477     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1478     GETCHARINCTEST(c, eptr);
1479     if (
1480     #ifdef SUPPORT_UTF8
1481     c < 256 &&
1482     #endif
1483     (md->ctypes[c] & ctype_space) != 0
1484     )
1485     RRETURN(MATCH_NOMATCH);
1486     ecode++;
1487     break;
1488    
1489     case OP_WHITESPACE:
1490     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1491     GETCHARINCTEST(c, eptr);
1492     if (
1493     #ifdef SUPPORT_UTF8
1494     c >= 256 ||
1495     #endif
1496     (md->ctypes[c] & ctype_space) == 0
1497     )
1498     RRETURN(MATCH_NOMATCH);
1499     ecode++;
1500     break;
1501    
1502     case OP_NOT_WORDCHAR:
1503     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1504     GETCHARINCTEST(c, eptr);
1505     if (
1506     #ifdef SUPPORT_UTF8
1507     c < 256 &&
1508     #endif
1509     (md->ctypes[c] & ctype_word) != 0
1510     )
1511     RRETURN(MATCH_NOMATCH);
1512     ecode++;
1513     break;
1514    
1515     case OP_WORDCHAR:
1516     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517     GETCHARINCTEST(c, eptr);
1518     if (
1519     #ifdef SUPPORT_UTF8
1520     c >= 256 ||
1521     #endif
1522     (md->ctypes[c] & ctype_word) == 0
1523     )
1524     RRETURN(MATCH_NOMATCH);
1525     ecode++;
1526     break;
1527    
1528 nigel 93 case OP_ANYNL:
1529     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1530     GETCHARINCTEST(c, eptr);
1531     switch(c)
1532     {
1533     default: RRETURN(MATCH_NOMATCH);
1534     case 0x000d:
1535     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1536     break;
1537 ph10 231
1538 nigel 93 case 0x000a:
1539 ph10 231 break;
1540    
1541 nigel 93 case 0x000b:
1542     case 0x000c:
1543     case 0x0085:
1544     case 0x2028:
1545     case 0x2029:
1546 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1547 nigel 93 break;
1548     }
1549     ecode++;
1550     break;
1551    
1552 ph10 178 case OP_NOT_HSPACE:
1553     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1554     GETCHARINCTEST(c, eptr);
1555     switch(c)
1556     {
1557     default: break;
1558     case 0x09: /* HT */
1559     case 0x20: /* SPACE */
1560     case 0xa0: /* NBSP */
1561     case 0x1680: /* OGHAM SPACE MARK */
1562     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1563     case 0x2000: /* EN QUAD */
1564     case 0x2001: /* EM QUAD */
1565     case 0x2002: /* EN SPACE */
1566     case 0x2003: /* EM SPACE */
1567     case 0x2004: /* THREE-PER-EM SPACE */
1568     case 0x2005: /* FOUR-PER-EM SPACE */
1569     case 0x2006: /* SIX-PER-EM SPACE */
1570     case 0x2007: /* FIGURE SPACE */
1571     case 0x2008: /* PUNCTUATION SPACE */
1572     case 0x2009: /* THIN SPACE */
1573     case 0x200A: /* HAIR SPACE */
1574     case 0x202f: /* NARROW NO-BREAK SPACE */
1575     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1576     case 0x3000: /* IDEOGRAPHIC SPACE */
1577     RRETURN(MATCH_NOMATCH);
1578     }
1579     ecode++;
1580     break;
1581    
1582     case OP_HSPACE:
1583     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1584     GETCHARINCTEST(c, eptr);
1585     switch(c)
1586     {
1587     default: RRETURN(MATCH_NOMATCH);
1588     case 0x09: /* HT */
1589     case 0x20: /* SPACE */
1590     case 0xa0: /* NBSP */
1591     case 0x1680: /* OGHAM SPACE MARK */
1592     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1593     case 0x2000: /* EN QUAD */
1594     case 0x2001: /* EM QUAD */
1595     case 0x2002: /* EN SPACE */
1596     case 0x2003: /* EM SPACE */
1597     case 0x2004: /* THREE-PER-EM SPACE */
1598     case 0x2005: /* FOUR-PER-EM SPACE */
1599     case 0x2006: /* SIX-PER-EM SPACE */
1600     case 0x2007: /* FIGURE SPACE */
1601     case 0x2008: /* PUNCTUATION SPACE */
1602     case 0x2009: /* THIN SPACE */
1603     case 0x200A: /* HAIR SPACE */
1604     case 0x202f: /* NARROW NO-BREAK SPACE */
1605     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1606     case 0x3000: /* IDEOGRAPHIC SPACE */
1607     break;
1608     }
1609     ecode++;
1610     break;
1611    
1612     case OP_NOT_VSPACE:
1613     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1614     GETCHARINCTEST(c, eptr);
1615     switch(c)
1616     {
1617     default: break;
1618     case 0x0a: /* LF */
1619     case 0x0b: /* VT */
1620     case 0x0c: /* FF */
1621     case 0x0d: /* CR */
1622     case 0x85: /* NEL */
1623     case 0x2028: /* LINE SEPARATOR */
1624     case 0x2029: /* PARAGRAPH SEPARATOR */
1625     RRETURN(MATCH_NOMATCH);
1626     }
1627     ecode++;
1628     break;
1629    
1630     case OP_VSPACE:
1631     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1632     GETCHARINCTEST(c, eptr);
1633     switch(c)
1634     {
1635     default: RRETURN(MATCH_NOMATCH);
1636     case 0x0a: /* LF */
1637     case 0x0b: /* VT */
1638     case 0x0c: /* FF */
1639     case 0x0d: /* CR */
1640     case 0x85: /* NEL */
1641     case 0x2028: /* LINE SEPARATOR */
1642     case 0x2029: /* PARAGRAPH SEPARATOR */
1643     break;
1644     }
1645     ecode++;
1646     break;
1647    
1648 nigel 77 #ifdef SUPPORT_UCP
1649     /* Check the next character by Unicode property. We will get here only
1650     if the support is in the binary; otherwise a compile-time error occurs. */
1651    
1652     case OP_PROP:
1653     case OP_NOTPROP:
1654     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1655     GETCHARINCTEST(c, eptr);
1656     {
1657 nigel 87 int chartype, script;
1658     int category = _pcre_ucp_findprop(c, &chartype, &script);
1659 nigel 77
1660 nigel 87 switch(ecode[1])
1661     {
1662     case PT_ANY:
1663     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1664     break;
1665 nigel 77
1666 nigel 87 case PT_LAMP:
1667     if ((chartype == ucp_Lu ||
1668     chartype == ucp_Ll ||
1669     chartype == ucp_Lt) == (op == OP_NOTPROP))
1670 nigel 77 RRETURN(MATCH_NOMATCH);
1671 nigel 87 break;
1672    
1673     case PT_GC:
1674     if ((ecode[2] != category) == (op == OP_PROP))
1675 nigel 77 RRETURN(MATCH_NOMATCH);
1676 nigel 87 break;
1677    
1678     case PT_PC:
1679     if ((ecode[2] != chartype) == (op == OP_PROP))
1680     RRETURN(MATCH_NOMATCH);
1681     break;
1682    
1683     case PT_SC:
1684     if ((ecode[2] != script) == (op == OP_PROP))
1685     RRETURN(MATCH_NOMATCH);
1686     break;
1687    
1688     default:
1689     RRETURN(PCRE_ERROR_INTERNAL);
1690 nigel 77 }
1691 nigel 87
1692     ecode += 3;
1693 nigel 77 }
1694     break;
1695    
1696     /* Match an extended Unicode sequence. We will get here only if the support
1697     is in the binary; otherwise a compile-time error occurs. */
1698    
1699     case OP_EXTUNI:
1700     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1701     GETCHARINCTEST(c, eptr);
1702     {
1703 nigel 87 int chartype, script;
1704     int category = _pcre_ucp_findprop(c, &chartype, &script);
1705 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1706     while (eptr < md->end_subject)
1707     {
1708     int len = 1;
1709     if (!utf8) c = *eptr; else
1710     {
1711     GETCHARLEN(c, eptr, len);
1712     }
1713 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1714 nigel 77 if (category != ucp_M) break;
1715     eptr += len;
1716     }
1717     }
1718     ecode++;
1719     break;
1720     #endif
1721    
1722    
1723     /* Match a back reference, possibly repeatedly. Look past the end of the
1724     item to see if there is repeat information following. The code is similar
1725     to that for character classes, but repeated for efficiency. Then obey
1726     similar code to character type repeats - written out again for speed.
1727     However, if the referenced string is the empty string, always treat
1728     it as matched, any number of times (otherwise there could be infinite
1729     loops). */
1730    
1731     case OP_REF:
1732     {
1733     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1734     ecode += 3; /* Advance past item */
1735    
1736     /* If the reference is unset, set the length to be longer than the amount
1737     of subject left; this ensures that every attempt at a match fails. We
1738     can't just fail here, because of the possibility of quantifiers with zero
1739     minima. */
1740    
1741     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1742     md->end_subject - eptr + 1 :
1743     md->offset_vector[offset+1] - md->offset_vector[offset];
1744    
1745     /* Set up for repetition, or handle the non-repeated case */
1746    
1747     switch (*ecode)
1748     {
1749     case OP_CRSTAR:
1750     case OP_CRMINSTAR:
1751     case OP_CRPLUS:
1752     case OP_CRMINPLUS:
1753     case OP_CRQUERY:
1754     case OP_CRMINQUERY:
1755     c = *ecode++ - OP_CRSTAR;
1756     minimize = (c & 1) != 0;
1757     min = rep_min[c]; /* Pick up values from tables; */
1758     max = rep_max[c]; /* zero for max => infinity */
1759     if (max == 0) max = INT_MAX;
1760     break;
1761    
1762     case OP_CRRANGE:
1763     case OP_CRMINRANGE:
1764     minimize = (*ecode == OP_CRMINRANGE);
1765     min = GET2(ecode, 1);
1766     max = GET2(ecode, 3);
1767     if (max == 0) max = INT_MAX;
1768     ecode += 5;
1769     break;
1770    
1771     default: /* No repeat follows */
1772     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1773     eptr += length;
1774     continue; /* With the main loop */
1775     }
1776    
1777     /* If the length of the reference is zero, just continue with the
1778     main loop. */
1779    
1780     if (length == 0) continue;
1781    
1782     /* First, ensure the minimum number of matches are present. We get back
1783     the length of the reference string explicitly rather than passing the
1784     address of eptr, so that eptr can be a register variable. */
1785    
1786     for (i = 1; i <= min; i++)
1787     {
1788     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1789     eptr += length;
1790     }
1791    
1792     /* If min = max, continue at the same level without recursion.
1793     They are not both allowed to be zero. */
1794    
1795     if (min == max) continue;
1796    
1797     /* If minimizing, keep trying and advancing the pointer */
1798    
1799     if (minimize)
1800     {
1801     for (fi = min;; fi++)
1802     {
1803 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1804 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1805     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1806     RRETURN(MATCH_NOMATCH);
1807     eptr += length;
1808     }
1809     /* Control never gets here */
1810     }
1811    
1812     /* If maximizing, find the longest string and work backwards */
1813    
1814     else
1815     {
1816     pp = eptr;
1817     for (i = min; i < max; i++)
1818     {
1819     if (!match_ref(offset, eptr, length, md, ims)) break;
1820     eptr += length;
1821     }
1822     while (eptr >= pp)
1823     {
1824 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1825 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1826     eptr -= length;
1827     }
1828     RRETURN(MATCH_NOMATCH);
1829     }
1830     }
1831     /* Control never gets here */
1832    
1833    
1834    
1835     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1836     used when all the characters in the class have values in the range 0-255,
1837     and either the matching is caseful, or the characters are in the range
1838     0-127 when UTF-8 processing is enabled. The only difference between
1839     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1840     encountered.
1841    
1842     First, look past the end of the item to see if there is repeat information
1843     following. Then obey similar code to character type repeats - written out
1844     again for speed. */
1845    
1846     case OP_NCLASS:
1847     case OP_CLASS:
1848     {
1849     data = ecode + 1; /* Save for matching */
1850     ecode += 33; /* Advance past the item */
1851    
1852     switch (*ecode)
1853     {
1854     case OP_CRSTAR:
1855     case OP_CRMINSTAR:
1856     case OP_CRPLUS:
1857     case OP_CRMINPLUS:
1858     case OP_CRQUERY:
1859     case OP_CRMINQUERY:
1860     c = *ecode++ - OP_CRSTAR;
1861     minimize = (c & 1) != 0;
1862     min = rep_min[c]; /* Pick up values from tables; */
1863     max = rep_max[c]; /* zero for max => infinity */
1864     if (max == 0) max = INT_MAX;
1865     break;
1866    
1867     case OP_CRRANGE:
1868     case OP_CRMINRANGE:
1869     minimize = (*ecode == OP_CRMINRANGE);
1870     min = GET2(ecode, 1);
1871     max = GET2(ecode, 3);
1872     if (max == 0) max = INT_MAX;
1873     ecode += 5;
1874     break;
1875    
1876     default: /* No repeat follows */
1877     min = max = 1;
1878     break;
1879     }
1880    
1881     /* First, ensure the minimum number of matches are present. */
1882    
1883     #ifdef SUPPORT_UTF8
1884     /* UTF-8 mode */
1885     if (utf8)
1886     {
1887     for (i = 1; i <= min; i++)
1888     {
1889     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1890     GETCHARINC(c, eptr);
1891     if (c > 255)
1892     {
1893     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1894     }
1895     else
1896     {
1897     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1898     }
1899     }
1900     }
1901     else
1902     #endif
1903     /* Not UTF-8 mode */
1904     {
1905     for (i = 1; i <= min; i++)
1906     {
1907     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1908     c = *eptr++;
1909     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1910     }
1911     }
1912    
1913     /* If max == min we can continue with the main loop without the
1914     need to recurse. */
1915    
1916     if (min == max) continue;
1917    
1918     /* If minimizing, keep testing the rest of the expression and advancing
1919     the pointer while it matches the class. */
1920    
1921     if (minimize)
1922     {
1923     #ifdef SUPPORT_UTF8
1924     /* UTF-8 mode */
1925     if (utf8)
1926     {
1927     for (fi = min;; fi++)
1928     {
1929 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1930 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1931     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1932     GETCHARINC(c, eptr);
1933     if (c > 255)
1934     {
1935     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1936     }
1937     else
1938     {
1939     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1940     }
1941     }
1942     }
1943     else
1944     #endif
1945     /* Not UTF-8 mode */
1946     {
1947     for (fi = min;; fi++)
1948     {
1949 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1950 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1951     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1952     c = *eptr++;
1953     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1954     }
1955     }
1956     /* Control never gets here */
1957     }
1958    
1959     /* If maximizing, find the longest possible run, then work backwards. */
1960    
1961     else
1962     {
1963     pp = eptr;
1964    
1965     #ifdef SUPPORT_UTF8
1966     /* UTF-8 mode */
1967     if (utf8)
1968     {
1969     for (i = min; i < max; i++)
1970     {
1971     int len = 1;
1972     if (eptr >= md->end_subject) break;
1973     GETCHARLEN(c, eptr, len);
1974     if (c > 255)
1975     {
1976     if (op == OP_CLASS) break;
1977     }
1978     else
1979     {
1980     if ((data[c/8] & (1 << (c&7))) == 0) break;
1981     }
1982     eptr += len;
1983     }
1984     for (;;)
1985     {
1986 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1987 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1988     if (eptr-- == pp) break; /* Stop if tried at original pos */
1989     BACKCHAR(eptr);
1990     }
1991     }
1992     else
1993     #endif
1994     /* Not UTF-8 mode */
1995     {
1996     for (i = min; i < max; i++)
1997     {
1998     if (eptr >= md->end_subject) break;
1999     c = *eptr;
2000     if ((data[c/8] & (1 << (c&7))) == 0) break;
2001     eptr++;
2002     }
2003     while (eptr >= pp)
2004     {
2005 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2006 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2007 nigel 77 eptr--;
2008     }
2009     }
2010    
2011     RRETURN(MATCH_NOMATCH);
2012     }
2013     }
2014     /* Control never gets here */
2015    
2016    
2017     /* Match an extended character class. This opcode is encountered only
2018     in UTF-8 mode, because that's the only time it is compiled. */
2019    
2020     #ifdef SUPPORT_UTF8
2021     case OP_XCLASS:
2022     {
2023     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2024     ecode += GET(ecode, 1); /* Advance past the item */
2025    
2026     switch (*ecode)
2027     {
2028     case OP_CRSTAR:
2029     case OP_CRMINSTAR:
2030     case OP_CRPLUS:
2031     case OP_CRMINPLUS:
2032     case OP_CRQUERY:
2033     case OP_CRMINQUERY:
2034     c = *ecode++ - OP_CRSTAR;
2035     minimize = (c & 1) != 0;
2036     min = rep_min[c]; /* Pick up values from tables; */
2037     max = rep_max[c]; /* zero for max => infinity */
2038     if (max == 0) max = INT_MAX;
2039     break;
2040    
2041     case OP_CRRANGE:
2042     case OP_CRMINRANGE:
2043     minimize = (*ecode == OP_CRMINRANGE);
2044     min = GET2(ecode, 1);
2045     max = GET2(ecode, 3);
2046     if (max == 0) max = INT_MAX;
2047     ecode += 5;
2048     break;
2049    
2050     default: /* No repeat follows */
2051     min = max = 1;
2052     break;
2053     }
2054    
2055     /* First, ensure the minimum number of matches are present. */
2056    
2057     for (i = 1; i <= min; i++)
2058     {
2059     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2060     GETCHARINC(c, eptr);
2061     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2062     }
2063    
2064     /* If max == min we can continue with the main loop without the
2065     need to recurse. */
2066    
2067     if (min == max) continue;
2068    
2069     /* If minimizing, keep testing the rest of the expression and advancing
2070     the pointer while it matches the class. */
2071    
2072     if (minimize)
2073     {
2074     for (fi = min;; fi++)
2075     {
2076 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2077 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2078     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2079     GETCHARINC(c, eptr);
2080     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2081     }
2082     /* Control never gets here */
2083     }
2084    
2085     /* If maximizing, find the longest possible run, then work backwards. */
2086    
2087     else
2088     {
2089     pp = eptr;
2090     for (i = min; i < max; i++)
2091     {
2092     int len = 1;
2093     if (eptr >= md->end_subject) break;
2094     GETCHARLEN(c, eptr, len);
2095     if (!_pcre_xclass(c, data)) break;
2096     eptr += len;
2097     }
2098     for(;;)
2099     {
2100 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2101 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2102     if (eptr-- == pp) break; /* Stop if tried at original pos */
2103 ph10 214 if (utf8) BACKCHAR(eptr);
2104 nigel 77 }
2105     RRETURN(MATCH_NOMATCH);
2106     }
2107    
2108     /* Control never gets here */
2109     }
2110     #endif /* End of XCLASS */
2111    
2112     /* Match a single character, casefully */
2113    
2114     case OP_CHAR:
2115     #ifdef SUPPORT_UTF8
2116     if (utf8)
2117     {
2118     length = 1;
2119     ecode++;
2120     GETCHARLEN(fc, ecode, length);
2121     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2122     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2123     }
2124     else
2125     #endif
2126    
2127     /* Non-UTF-8 mode */
2128     {
2129     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2130     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2131     ecode += 2;
2132     }
2133     break;
2134    
2135     /* Match a single character, caselessly */
2136    
2137     case OP_CHARNC:
2138     #ifdef SUPPORT_UTF8
2139     if (utf8)
2140     {
2141     length = 1;
2142     ecode++;
2143     GETCHARLEN(fc, ecode, length);
2144    
2145     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2146    
2147     /* If the pattern character's value is < 128, we have only one byte, and
2148     can use the fast lookup table. */
2149    
2150     if (fc < 128)
2151     {
2152     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2153     }
2154    
2155     /* Otherwise we must pick up the subject character */
2156    
2157     else
2158     {
2159 nigel 93 unsigned int dc;
2160 nigel 77 GETCHARINC(dc, eptr);
2161     ecode += length;
2162    
2163     /* If we have Unicode property support, we can use it to test the other
2164 nigel 87 case of the character, if there is one. */
2165 nigel 77
2166     if (fc != dc)
2167     {
2168     #ifdef SUPPORT_UCP
2169 nigel 87 if (dc != _pcre_ucp_othercase(fc))
2170 nigel 77 #endif
2171     RRETURN(MATCH_NOMATCH);
2172     }
2173     }
2174     }
2175     else
2176     #endif /* SUPPORT_UTF8 */
2177    
2178     /* Non-UTF-8 mode */
2179     {
2180     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2181     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2182     ecode += 2;
2183     }
2184     break;
2185    
2186 nigel 93 /* Match a single character repeatedly. */
2187 nigel 77
2188     case OP_EXACT:
2189     min = max = GET2(ecode, 1);
2190     ecode += 3;
2191     goto REPEATCHAR;
2192    
2193 nigel 93 case OP_POSUPTO:
2194     possessive = TRUE;
2195     /* Fall through */
2196    
2197 nigel 77 case OP_UPTO:
2198     case OP_MINUPTO:
2199     min = 0;
2200     max = GET2(ecode, 1);
2201     minimize = *ecode == OP_MINUPTO;
2202     ecode += 3;
2203     goto REPEATCHAR;
2204    
2205 nigel 93 case OP_POSSTAR:
2206     possessive = TRUE;
2207     min = 0;
2208     max = INT_MAX;
2209     ecode++;
2210     goto REPEATCHAR;
2211    
2212     case OP_POSPLUS:
2213     possessive = TRUE;
2214     min = 1;
2215     max = INT_MAX;
2216     ecode++;
2217     goto REPEATCHAR;
2218    
2219     case OP_POSQUERY:
2220     possessive = TRUE;
2221     min = 0;
2222     max = 1;
2223     ecode++;
2224     goto REPEATCHAR;
2225    
2226 nigel 77 case OP_STAR:
2227     case OP_MINSTAR:
2228     case OP_PLUS:
2229     case OP_MINPLUS:
2230     case OP_QUERY:
2231     case OP_MINQUERY:
2232     c = *ecode++ - OP_STAR;
2233     minimize = (c & 1) != 0;
2234     min = rep_min[c]; /* Pick up values from tables; */
2235     max = rep_max[c]; /* zero for max => infinity */
2236     if (max == 0) max = INT_MAX;
2237    
2238     /* Common code for all repeated single-character matches. We can give
2239     up quickly if there are fewer than the minimum number of characters left in
2240     the subject. */
2241    
2242     REPEATCHAR:
2243     #ifdef SUPPORT_UTF8
2244     if (utf8)
2245     {
2246     length = 1;
2247     charptr = ecode;
2248     GETCHARLEN(fc, ecode, length);
2249     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2250     ecode += length;
2251    
2252     /* Handle multibyte character matching specially here. There is
2253     support for caseless matching if UCP support is present. */
2254    
2255     if (length > 1)
2256     {
2257     #ifdef SUPPORT_UCP
2258 nigel 93 unsigned int othercase;
2259 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2260 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2261 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2262 ph10 115 else oclength = 0;
2263 nigel 77 #endif /* SUPPORT_UCP */
2264    
2265     for (i = 1; i <= min; i++)
2266     {
2267     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2268 ph10 123 #ifdef SUPPORT_UCP
2269 nigel 77 /* Need braces because of following else */
2270     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2271     else
2272     {
2273     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2274     eptr += oclength;
2275     }
2276 ph10 115 #else /* without SUPPORT_UCP */
2277     else { RRETURN(MATCH_NOMATCH); }
2278 ph10 123 #endif /* SUPPORT_UCP */
2279 nigel 77 }
2280    
2281     if (min == max) continue;
2282    
2283     if (minimize)
2284     {
2285     for (fi = min;; fi++)
2286     {
2287 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2288 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2289     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2290     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2291 ph10 123 #ifdef SUPPORT_UCP
2292 nigel 77 /* Need braces because of following else */
2293     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2294     else
2295     {
2296     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2297     eptr += oclength;
2298     }
2299 ph10 115 #else /* without SUPPORT_UCP */
2300     else { RRETURN (MATCH_NOMATCH); }
2301     #endif /* SUPPORT_UCP */
2302 nigel 77 }
2303     /* Control never gets here */
2304     }
2305 nigel 93
2306     else /* Maximize */
2307 nigel 77 {
2308     pp = eptr;
2309     for (i = min; i < max; i++)
2310     {
2311     if (eptr > md->end_subject - length) break;
2312     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2313 ph10 123 #ifdef SUPPORT_UCP
2314 nigel 77 else if (oclength == 0) break;
2315     else
2316     {
2317     if (memcmp(eptr, occhars, oclength) != 0) break;
2318     eptr += oclength;
2319     }
2320 ph10 115 #else /* without SUPPORT_UCP */
2321     else break;
2322 ph10 123 #endif /* SUPPORT_UCP */
2323 nigel 77 }
2324 nigel 93
2325     if (possessive) continue;
2326 ph10 120 for(;;)
2327 nigel 77 {
2328 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2329 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2330 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2331 ph10 115 #ifdef SUPPORT_UCP
2332     eptr--;
2333     BACKCHAR(eptr);
2334 ph10 123 #else /* without SUPPORT_UCP */
2335 nigel 77 eptr -= length;
2336 ph10 123 #endif /* SUPPORT_UCP */
2337 nigel 77 }
2338     }
2339     /* Control never gets here */
2340     }
2341    
2342     /* If the length of a UTF-8 character is 1, we fall through here, and
2343     obey the code as for non-UTF-8 characters below, though in this case the
2344     value of fc will always be < 128. */
2345     }
2346     else
2347     #endif /* SUPPORT_UTF8 */
2348    
2349     /* When not in UTF-8 mode, load a single-byte character. */
2350     {
2351     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2352     fc = *ecode++;
2353     }
2354    
2355     /* The value of fc at this point is always less than 256, though we may or
2356     may not be in UTF-8 mode. The code is duplicated for the caseless and
2357     caseful cases, for speed, since matching characters is likely to be quite
2358     common. First, ensure the minimum number of matches are present. If min =
2359     max, continue at the same level without recursing. Otherwise, if
2360     minimizing, keep trying the rest of the expression and advancing one
2361     matching character if failing, up to the maximum. Alternatively, if
2362     maximizing, find the maximum number of characters and work backwards. */
2363    
2364     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2365     max, eptr));
2366    
2367     if ((ims & PCRE_CASELESS) != 0)
2368     {
2369     fc = md->lcc[fc];
2370     for (i = 1; i <= min; i++)
2371     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2372     if (min == max) continue;
2373     if (minimize)
2374     {
2375     for (fi = min;; fi++)
2376     {
2377 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2378 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2379     if (fi >= max || eptr >= md->end_subject ||
2380     fc != md->lcc[*eptr++])
2381     RRETURN(MATCH_NOMATCH);
2382     }
2383     /* Control never gets here */
2384     }
2385 nigel 93 else /* Maximize */
2386 nigel 77 {
2387     pp = eptr;
2388     for (i = min; i < max; i++)
2389     {
2390     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2391     eptr++;
2392     }
2393 nigel 93 if (possessive) continue;
2394 nigel 77 while (eptr >= pp)
2395     {
2396 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2397 nigel 77 eptr--;
2398     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2399     }
2400     RRETURN(MATCH_NOMATCH);
2401     }
2402     /* Control never gets here */
2403     }
2404    
2405     /* Caseful comparisons (includes all multi-byte characters) */
2406    
2407     else
2408     {
2409     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2410     if (min == max) continue;
2411     if (minimize)
2412     {
2413     for (fi = min;; fi++)
2414     {
2415 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2416 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2417     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2418     RRETURN(MATCH_NOMATCH);
2419     }
2420     /* Control never gets here */
2421     }
2422 nigel 93 else /* Maximize */
2423 nigel 77 {
2424     pp = eptr;
2425     for (i = min; i < max; i++)
2426     {
2427     if (eptr >= md->end_subject || fc != *eptr) break;
2428     eptr++;
2429     }
2430 nigel 93 if (possessive) continue;
2431 nigel 77 while (eptr >= pp)
2432     {
2433 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2434 nigel 77 eptr--;
2435     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2436     }
2437     RRETURN(MATCH_NOMATCH);
2438     }
2439     }
2440     /* Control never gets here */
2441    
2442     /* Match a negated single one-byte character. The character we are
2443     checking can be multibyte. */
2444    
2445     case OP_NOT:
2446     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2447     ecode++;
2448     GETCHARINCTEST(c, eptr);
2449     if ((ims & PCRE_CASELESS) != 0)
2450     {
2451     #ifdef SUPPORT_UTF8
2452     if (c < 256)
2453     #endif
2454     c = md->lcc[c];
2455     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2456     }
2457     else
2458     {
2459     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2460     }
2461     break;
2462    
2463     /* Match a negated single one-byte character repeatedly. This is almost a
2464     repeat of the code for a repeated single character, but I haven't found a
2465     nice way of commoning these up that doesn't require a test of the
2466     positive/negative option for each character match. Maybe that wouldn't add
2467     very much to the time taken, but character matching *is* what this is all
2468     about... */
2469    
2470     case OP_NOTEXACT:
2471     min = max = GET2(ecode, 1);
2472     ecode += 3;
2473     goto REPEATNOTCHAR;
2474    
2475     case OP_NOTUPTO:
2476     case OP_NOTMINUPTO:
2477     min = 0;
2478     max = GET2(ecode, 1);
2479     minimize = *ecode == OP_NOTMINUPTO;
2480     ecode += 3;
2481     goto REPEATNOTCHAR;
2482    
2483 nigel 93 case OP_NOTPOSSTAR:
2484     possessive = TRUE;
2485     min = 0;
2486     max = INT_MAX;
2487     ecode++;
2488     goto REPEATNOTCHAR;
2489    
2490     case OP_NOTPOSPLUS:
2491     possessive = TRUE;
2492     min = 1;
2493     max = INT_MAX;
2494     ecode++;
2495     goto REPEATNOTCHAR;
2496    
2497     case OP_NOTPOSQUERY:
2498     possessive = TRUE;
2499     min = 0;
2500     max = 1;
2501     ecode++;
2502     goto REPEATNOTCHAR;
2503    
2504     case OP_NOTPOSUPTO:
2505     possessive = TRUE;
2506     min = 0;
2507     max = GET2(ecode, 1);
2508     ecode += 3;
2509     goto REPEATNOTCHAR;
2510    
2511 nigel 77 case OP_NOTSTAR:
2512     case OP_NOTMINSTAR:
2513     case OP_NOTPLUS:
2514     case OP_NOTMINPLUS:
2515     case OP_NOTQUERY:
2516     case OP_NOTMINQUERY:
2517     c = *ecode++ - OP_NOTSTAR;
2518     minimize = (c & 1) != 0;
2519     min = rep_min[c]; /* Pick up values from tables; */
2520     max = rep_max[c]; /* zero for max => infinity */
2521     if (max == 0) max = INT_MAX;
2522    
2523     /* Common code for all repeated single-byte matches. We can give up quickly
2524     if there are fewer than the minimum number of bytes left in the
2525     subject. */
2526    
2527     REPEATNOTCHAR:
2528     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2529     fc = *ecode++;
2530    
2531     /* The code is duplicated for the caseless and caseful cases, for speed,
2532     since matching characters is likely to be quite common. First, ensure the
2533     minimum number of matches are present. If min = max, continue at the same
2534     level without recursing. Otherwise, if minimizing, keep trying the rest of
2535     the expression and advancing one matching character if failing, up to the
2536     maximum. Alternatively, if maximizing, find the maximum number of
2537     characters and work backwards. */
2538    
2539     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2540     max, eptr));
2541    
2542     if ((ims & PCRE_CASELESS) != 0)
2543     {
2544     fc = md->lcc[fc];
2545    
2546     #ifdef SUPPORT_UTF8
2547     /* UTF-8 mode */
2548     if (utf8)
2549     {
2550 nigel 93 register unsigned int d;
2551 nigel 77 for (i = 1; i <= min; i++)
2552     {
2553     GETCHARINC(d, eptr);
2554     if (d < 256) d = md->lcc[d];
2555     if (fc == d) RRETURN(MATCH_NOMATCH);
2556     }
2557     }
2558     else
2559     #endif
2560    
2561     /* Not UTF-8 mode */
2562     {
2563     for (i = 1; i <= min; i++)
2564     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2565     }
2566    
2567     if (min == max) continue;
2568    
2569     if (minimize)
2570     {
2571     #ifdef SUPPORT_UTF8
2572     /* UTF-8 mode */
2573     if (utf8)
2574     {
2575 nigel 93 register unsigned int d;
2576 nigel 77 for (fi = min;; fi++)
2577     {
2578 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2579 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2580     GETCHARINC(d, eptr);
2581     if (d < 256) d = md->lcc[d];
2582     if (fi >= max || eptr >= md->end_subject || fc == d)
2583     RRETURN(MATCH_NOMATCH);
2584     }
2585     }
2586     else
2587     #endif
2588     /* Not UTF-8 mode */
2589     {
2590     for (fi = min;; fi++)
2591     {
2592 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2593 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2594     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2595     RRETURN(MATCH_NOMATCH);
2596     }
2597     }
2598     /* Control never gets here */
2599     }
2600    
2601     /* Maximize case */
2602    
2603     else
2604     {
2605     pp = eptr;
2606    
2607     #ifdef SUPPORT_UTF8
2608     /* UTF-8 mode */
2609     if (utf8)
2610     {
2611 nigel 93 register unsigned int d;
2612 nigel 77 for (i = min; i < max; i++)
2613     {
2614     int len = 1;
2615     if (eptr >= md->end_subject) break;
2616     GETCHARLEN(d, eptr, len);
2617     if (d < 256) d = md->lcc[d];
2618     if (fc == d) break;
2619     eptr += len;
2620     }
2621 nigel 93 if (possessive) continue;
2622     for(;;)
2623 nigel 77 {
2624 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2625 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2626     if (eptr-- == pp) break; /* Stop if tried at original pos */
2627     BACKCHAR(eptr);
2628     }
2629     }
2630     else
2631     #endif
2632     /* Not UTF-8 mode */
2633     {
2634     for (i = min; i < max; i++)
2635     {
2636     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2637     eptr++;
2638     }
2639 nigel 93 if (possessive) continue;
2640 nigel 77 while (eptr >= pp)
2641     {
2642 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2643 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2644     eptr--;
2645     }
2646     }
2647    
2648     RRETURN(MATCH_NOMATCH);
2649     }
2650     /* Control never gets here */
2651     }
2652    
2653     /* Caseful comparisons */
2654    
2655     else
2656     {
2657     #ifdef SUPPORT_UTF8
2658     /* UTF-8 mode */
2659     if (utf8)
2660     {
2661 nigel 93 register unsigned int d;
2662 nigel 77 for (i = 1; i <= min; i++)
2663     {
2664     GETCHARINC(d, eptr);
2665     if (fc == d) RRETURN(MATCH_NOMATCH);
2666     }
2667     }
2668     else
2669     #endif
2670     /* Not UTF-8 mode */
2671     {
2672     for (i = 1; i <= min; i++)
2673     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2674     }
2675    
2676     if (min == max) continue;
2677    
2678     if (minimize)
2679     {
2680     #ifdef SUPPORT_UTF8
2681     /* UTF-8 mode */
2682     if (utf8)
2683     {
2684 nigel 93 register unsigned int d;
2685 nigel 77 for (fi = min;; fi++)
2686     {
2687 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2688 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2689     GETCHARINC(d, eptr);
2690     if (fi >= max || eptr >= md->end_subject || fc == d)
2691     RRETURN(MATCH_NOMATCH);
2692     }
2693     }
2694     else
2695     #endif
2696     /* Not UTF-8 mode */
2697     {
2698     for (fi = min;; fi++)
2699     {
2700 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2701 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2702     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2703     RRETURN(MATCH_NOMATCH);
2704     }
2705     }
2706     /* Control never gets here */
2707     }
2708    
2709     /* Maximize case */
2710    
2711     else
2712     {
2713     pp = eptr;
2714    
2715     #ifdef SUPPORT_UTF8
2716     /* UTF-8 mode */
2717     if (utf8)
2718     {
2719 nigel 93 register unsigned int d;
2720 nigel 77 for (i = min; i < max; i++)
2721     {
2722     int len = 1;
2723     if (eptr >= md->end_subject) break;
2724     GETCHARLEN(d, eptr, len);
2725     if (fc == d) break;
2726     eptr += len;
2727     }
2728 nigel 93 if (possessive) continue;
2729 nigel 77 for(;;)
2730     {
2731 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2732 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2733     if (eptr-- == pp) break; /* Stop if tried at original pos */
2734     BACKCHAR(eptr);
2735     }
2736     }
2737     else
2738     #endif
2739     /* Not UTF-8 mode */
2740     {
2741     for (i = min; i < max; i++)
2742     {
2743     if (eptr >= md->end_subject || fc == *eptr) break;
2744     eptr++;
2745     }
2746 nigel 93 if (possessive) continue;
2747 nigel 77 while (eptr >= pp)
2748     {
2749 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2750 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2751     eptr--;
2752     }
2753     }
2754    
2755     RRETURN(MATCH_NOMATCH);
2756     }
2757     }
2758     /* Control never gets here */
2759    
2760     /* Match a single character type repeatedly; several different opcodes
2761     share code. This is very similar to the code for single characters, but we
2762     repeat it in the interests of efficiency. */
2763    
2764     case OP_TYPEEXACT:
2765     min = max = GET2(ecode, 1);
2766     minimize = TRUE;
2767     ecode += 3;
2768     goto REPEATTYPE;
2769    
2770     case OP_TYPEUPTO:
2771     case OP_TYPEMINUPTO:
2772     min = 0;
2773     max = GET2(ecode, 1);
2774     minimize = *ecode == OP_TYPEMINUPTO;
2775     ecode += 3;
2776     goto REPEATTYPE;
2777    
2778 nigel 93 case OP_TYPEPOSSTAR:
2779     possessive = TRUE;
2780     min = 0;
2781     max = INT_MAX;
2782     ecode++;
2783     goto REPEATTYPE;
2784    
2785     case OP_TYPEPOSPLUS:
2786     possessive = TRUE;
2787     min = 1;
2788     max = INT_MAX;
2789     ecode++;
2790     goto REPEATTYPE;
2791    
2792     case OP_TYPEPOSQUERY:
2793     possessive = TRUE;
2794     min = 0;
2795     max = 1;
2796     ecode++;
2797     goto REPEATTYPE;
2798    
2799     case OP_TYPEPOSUPTO:
2800     possessive = TRUE;
2801     min = 0;
2802     max = GET2(ecode, 1);
2803     ecode += 3;
2804     goto REPEATTYPE;
2805    
2806 nigel 77 case OP_TYPESTAR:
2807     case OP_TYPEMINSTAR:
2808     case OP_TYPEPLUS:
2809     case OP_TYPEMINPLUS:
2810     case OP_TYPEQUERY:
2811     case OP_TYPEMINQUERY:
2812     c = *ecode++ - OP_TYPESTAR;
2813     minimize = (c & 1) != 0;
2814     min = rep_min[c]; /* Pick up values from tables; */
2815     max = rep_max[c]; /* zero for max => infinity */
2816     if (max == 0) max = INT_MAX;
2817    
2818     /* Common code for all repeated single character type matches. Note that
2819     in UTF-8 mode, '.' matches a character of any length, but for the other
2820     character types, the valid characters are all one-byte long. */
2821    
2822     REPEATTYPE:
2823     ctype = *ecode++; /* Code for the character type */
2824    
2825     #ifdef SUPPORT_UCP
2826     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2827     {
2828     prop_fail_result = ctype == OP_NOTPROP;
2829     prop_type = *ecode++;
2830 nigel 87 prop_value = *ecode++;
2831 nigel 77 }
2832     else prop_type = -1;
2833     #endif
2834    
2835     /* First, ensure the minimum number of matches are present. Use inline
2836     code for maximizing the speed, and do the type test once at the start
2837     (i.e. keep it out of the loop). Also we can test that there are at least
2838     the minimum number of bytes before we start. This isn't as effective in
2839     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2840     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2841     and single-bytes. */
2842    
2843     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2844     if (min > 0)
2845     {
2846     #ifdef SUPPORT_UCP
2847 nigel 87 if (prop_type >= 0)
2848 nigel 77 {
2849 nigel 87 switch(prop_type)
2850 nigel 77 {
2851 nigel 87 case PT_ANY:
2852     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2853     for (i = 1; i <= min; i++)
2854     {
2855     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2856 ph10 184 GETCHARINCTEST(c, eptr);
2857 nigel 87 }
2858     break;
2859    
2860     case PT_LAMP:
2861     for (i = 1; i <= min; i++)
2862     {
2863     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2864 ph10 184 GETCHARINCTEST(c, eptr);
2865 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2866     if ((prop_chartype == ucp_Lu ||
2867     prop_chartype == ucp_Ll ||
2868     prop_chartype == ucp_Lt) == prop_fail_result)
2869     RRETURN(MATCH_NOMATCH);
2870     }
2871     break;
2872    
2873     case PT_GC:
2874     for (i = 1; i <= min; i++)
2875     {
2876     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2877 ph10 184 GETCHARINCTEST(c, eptr);
2878 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2879     if ((prop_category == prop_value) == prop_fail_result)
2880     RRETURN(MATCH_NOMATCH);
2881     }
2882     break;
2883    
2884     case PT_PC:
2885     for (i = 1; i <= min; i++)
2886     {
2887     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2888 ph10 184 GETCHARINCTEST(c, eptr);
2889 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2890     if ((prop_chartype == prop_value) == prop_fail_result)
2891     RRETURN(MATCH_NOMATCH);
2892     }
2893     break;
2894    
2895     case PT_SC:
2896     for (i = 1; i <= min; i++)
2897     {
2898     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2899 ph10 184 GETCHARINCTEST(c, eptr);
2900 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2901     if ((prop_script == prop_value) == prop_fail_result)
2902     RRETURN(MATCH_NOMATCH);
2903     }
2904     break;
2905    
2906     default:
2907     RRETURN(PCRE_ERROR_INTERNAL);
2908 nigel 77 }
2909     }
2910    
2911     /* Match extended Unicode sequences. We will get here only if the
2912     support is in the binary; otherwise a compile-time error occurs. */
2913    
2914     else if (ctype == OP_EXTUNI)
2915     {
2916     for (i = 1; i <= min; i++)
2917     {
2918     GETCHARINCTEST(c, eptr);
2919 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2920 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2921     while (eptr < md->end_subject)
2922     {
2923     int len = 1;
2924     if (!utf8) c = *eptr; else
2925     {
2926     GETCHARLEN(c, eptr, len);
2927     }
2928 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2929 nigel 77 if (prop_category != ucp_M) break;
2930     eptr += len;
2931     }
2932     }
2933     }
2934    
2935     else
2936     #endif /* SUPPORT_UCP */
2937    
2938     /* Handle all other cases when the coding is UTF-8 */
2939    
2940     #ifdef SUPPORT_UTF8
2941     if (utf8) switch(ctype)
2942     {
2943     case OP_ANY:
2944     for (i = 1; i <= min; i++)
2945     {
2946     if (eptr >= md->end_subject ||
2947 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2948 nigel 77 RRETURN(MATCH_NOMATCH);
2949 nigel 91 eptr++;
2950 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2951     }
2952     break;
2953    
2954     case OP_ANYBYTE:
2955     eptr += min;
2956     break;
2957    
2958 nigel 93 case OP_ANYNL:
2959     for (i = 1; i <= min; i++)
2960     {
2961     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962     GETCHARINC(c, eptr);
2963     switch(c)
2964     {
2965     default: RRETURN(MATCH_NOMATCH);
2966     case 0x000d:
2967     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2968     break;
2969 ph10 231
2970 nigel 93 case 0x000a:
2971 ph10 231 break;
2972    
2973 nigel 93 case 0x000b:
2974     case 0x000c:
2975     case 0x0085:
2976     case 0x2028:
2977     case 0x2029:
2978 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2979 nigel 93 break;
2980     }
2981     }
2982     break;
2983    
2984 ph10 178 case OP_NOT_HSPACE:
2985     for (i = 1; i <= min; i++)
2986     {
2987     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2988     GETCHARINC(c, eptr);
2989     switch(c)
2990     {
2991     default: break;
2992     case 0x09: /* HT */
2993     case 0x20: /* SPACE */
2994     case 0xa0: /* NBSP */
2995     case 0x1680: /* OGHAM SPACE MARK */
2996     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2997     case 0x2000: /* EN QUAD */
2998     case 0x2001: /* EM QUAD */
2999     case 0x2002: /* EN SPACE */
3000     case 0x2003: /* EM SPACE */
3001     case 0x2004: /* THREE-PER-EM SPACE */
3002     case 0x2005: /* FOUR-PER-EM SPACE */
3003     case 0x2006: /* SIX-PER-EM SPACE */
3004     case 0x2007: /* FIGURE SPACE */
3005     case 0x2008: /* PUNCTUATION SPACE */
3006     case 0x2009: /* THIN SPACE */
3007     case 0x200A: /* HAIR SPACE */
3008     case 0x202f: /* NARROW NO-BREAK SPACE */
3009     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3010     case 0x3000: /* IDEOGRAPHIC SPACE */
3011     RRETURN(MATCH_NOMATCH);
3012     }
3013     }
3014     break;
3015 ph10 182
3016 ph10 178 case OP_HSPACE:
3017     for (i = 1; i <= min; i++)
3018     {
3019     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3020     GETCHARINC(c, eptr);
3021     switch(c)
3022     {
3023     default: RRETURN(MATCH_NOMATCH);
3024     case 0x09: /* HT */
3025     case 0x20: /* SPACE */
3026     case 0xa0: /* NBSP */
3027     case 0x1680: /* OGHAM SPACE MARK */
3028     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3029     case 0x2000: /* EN QUAD */
3030     case 0x2001: /* EM QUAD */
3031     case 0x2002: /* EN SPACE */
3032     case 0x2003: /* EM SPACE */
3033     case 0x2004: /* THREE-PER-EM SPACE */
3034     case 0x2005: /* FOUR-PER-EM SPACE */
3035     case 0x2006: /* SIX-PER-EM SPACE */
3036     case 0x2007: /* FIGURE SPACE */
3037     case 0x2008: /* PUNCTUATION SPACE */
3038     case 0x2009: /* THIN SPACE */
3039     case 0x200A: /* HAIR SPACE */
3040     case 0x202f: /* NARROW NO-BREAK SPACE */
3041     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3042     case 0x3000: /* IDEOGRAPHIC SPACE */
3043     break;
3044     }
3045     }
3046     break;
3047 ph10 182
3048 ph10 178 case OP_NOT_VSPACE:
3049     for (i = 1; i <= min; i++)
3050     {
3051     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3052     GETCHARINC(c, eptr);
3053     switch(c)
3054     {
3055     default: break;
3056     case 0x0a: /* LF */
3057     case 0x0b: /* VT */
3058     case 0x0c: /* FF */
3059     case 0x0d: /* CR */
3060     case 0x85: /* NEL */
3061     case 0x2028: /* LINE SEPARATOR */
3062     case 0x2029: /* PARAGRAPH SEPARATOR */
3063     RRETURN(MATCH_NOMATCH);
3064     }
3065     }
3066     break;
3067 ph10 182
3068 ph10 178 case OP_VSPACE:
3069     for (i = 1; i <= min; i++)
3070     {
3071     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3072     GETCHARINC(c, eptr);
3073     switch(c)
3074     {
3075     default: RRETURN(MATCH_NOMATCH);
3076     case 0x0a: /* LF */
3077     case 0x0b: /* VT */
3078     case 0x0c: /* FF */
3079     case 0x0d: /* CR */
3080     case 0x85: /* NEL */
3081     case 0x2028: /* LINE SEPARATOR */
3082     case 0x2029: /* PARAGRAPH SEPARATOR */
3083 ph10 182 break;
3084 ph10 178 }
3085     }
3086     break;
3087    
3088 nigel 77 case OP_NOT_DIGIT:
3089     for (i = 1; i <= min; i++)
3090     {
3091     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3092     GETCHARINC(c, eptr);
3093     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3094     RRETURN(MATCH_NOMATCH);
3095     }
3096     break;
3097    
3098     case OP_DIGIT:
3099     for (i = 1; i <= min; i++)
3100     {
3101     if (eptr >= md->end_subject ||
3102     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3103     RRETURN(MATCH_NOMATCH);
3104     /* No need to skip more bytes - we know it's a 1-byte character */
3105     }
3106     break;
3107    
3108     case OP_NOT_WHITESPACE:
3109     for (i = 1; i <= min; i++)
3110     {
3111     if (eptr >= md->end_subject ||
3112 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3113 nigel 77 RRETURN(MATCH_NOMATCH);
3114 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3115 nigel 77 }
3116     break;
3117    
3118     case OP_WHITESPACE:
3119     for (i = 1; i <= min; i++)
3120     {
3121     if (eptr >= md->end_subject ||
3122     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3123     RRETURN(MATCH_NOMATCH);
3124     /* No need to skip more bytes - we know it's a 1-byte character */
3125     }
3126     break;
3127    
3128     case OP_NOT_WORDCHAR:
3129     for (i = 1; i <= min; i++)
3130     {
3131     if (eptr >= md->end_subject ||
3132 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3133 nigel 77 RRETURN(MATCH_NOMATCH);
3134 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3135 nigel 77 }
3136     break;
3137    
3138     case OP_WORDCHAR:
3139     for (i = 1; i <= min; i++)
3140     {
3141     if (eptr >= md->end_subject ||
3142     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3143     RRETURN(MATCH_NOMATCH);
3144     /* No need to skip more bytes - we know it's a 1-byte character */
3145     }
3146     break;
3147    
3148     default:
3149     RRETURN(PCRE_ERROR_INTERNAL);
3150     } /* End switch(ctype) */
3151    
3152     else
3153     #endif /* SUPPORT_UTF8 */
3154    
3155     /* Code for the non-UTF-8 case for minimum matching of operators other
3156 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3157     number of bytes present, as this was tested above. */
3158 nigel 77
3159     switch(ctype)
3160     {
3161     case OP_ANY:
3162     if ((ims & PCRE_DOTALL) == 0)
3163     {
3164     for (i = 1; i <= min; i++)
3165 nigel 91 {
3166 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3167 nigel 91 eptr++;
3168     }
3169 nigel 77 }
3170     else eptr += min;
3171     break;
3172    
3173     case OP_ANYBYTE:
3174     eptr += min;
3175     break;
3176    
3177 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3178     bytes are present in this case. */
3179    
3180     case OP_ANYNL:
3181     for (i = 1; i <= min; i++)
3182     {
3183     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3184     switch(*eptr++)
3185     {
3186     default: RRETURN(MATCH_NOMATCH);
3187     case 0x000d:
3188     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3189     break;
3190     case 0x000a:
3191 ph10 231 break;
3192    
3193 nigel 93 case 0x000b:
3194     case 0x000c:
3195     case 0x0085:
3196 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3197 nigel 93 break;
3198     }
3199     }
3200     break;
3201    
3202 ph10 178 case OP_NOT_HSPACE:
3203     for (i = 1; i <= min; i++)
3204     {
3205     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3206     switch(*eptr++)
3207     {
3208     default: break;
3209     case 0x09: /* HT */
3210     case 0x20: /* SPACE */
3211     case 0xa0: /* NBSP */
3212     RRETURN(MATCH_NOMATCH);
3213     }
3214     }
3215     break;
3216    
3217     case OP_HSPACE:
3218     for (i = 1; i <= min; i++)
3219     {
3220     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3221     switch(*eptr++)
3222     {
3223     default: RRETURN(MATCH_NOMATCH);
3224     case 0x09: /* HT */
3225     case 0x20: /* SPACE */
3226     case 0xa0: /* NBSP */
3227 ph10 182 break;
3228 ph10 178 }
3229     }
3230     break;
3231    
3232     case OP_NOT_VSPACE:
3233     for (i = 1; i <= min; i++)
3234     {
3235     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3236     switch(*eptr++)
3237     {
3238     default: break;
3239     case 0x0a: /* LF */
3240     case 0x0b: /* VT */
3241     case 0x0c: /* FF */
3242     case 0x0d: /* CR */
3243     case 0x85: /* NEL */
3244     RRETURN(MATCH_NOMATCH);
3245     }
3246     }
3247     break;
3248    
3249     case OP_VSPACE:
3250     for (i = 1; i <= min; i++)
3251     {
3252     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3253     switch(*eptr++)
3254     {
3255     default: RRETURN(MATCH_NOMATCH);
3256     case 0x0a: /* LF */
3257     case 0x0b: /* VT */
3258     case 0x0c: /* FF */
3259     case 0x0d: /* CR */
3260     case 0x85: /* NEL */
3261 ph10 182 break;
3262 ph10 178 }
3263     }
3264     break;
3265    
3266 nigel 77 case OP_NOT_DIGIT:
3267     for (i = 1; i <= min; i++)
3268     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3269     break;
3270    
3271     case OP_DIGIT:
3272     for (i = 1; i <= min; i++)
3273     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3274     break;
3275    
3276     case OP_NOT_WHITESPACE:
3277     for (i = 1; i <= min; i++)
3278     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3279     break;
3280    
3281     case OP_WHITESPACE:
3282     for (i = 1; i <= min; i++)
3283     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3284     break;
3285    
3286     case OP_NOT_WORDCHAR:
3287     for (i = 1; i <= min; i++)
3288     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3289     RRETURN(MATCH_NOMATCH);
3290     break;
3291    
3292     case OP_WORDCHAR:
3293     for (i = 1; i <= min; i++)
3294     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3295     RRETURN(MATCH_NOMATCH);
3296     break;
3297    
3298     default:
3299     RRETURN(PCRE_ERROR_INTERNAL);
3300     }
3301     }
3302    
3303     /* If min = max, continue at the same level without recursing */
3304    
3305     if (min == max) continue;
3306    
3307     /* If minimizing, we have to test the rest of the pattern before each
3308     subsequent match. Again, separate the UTF-8 case for speed, and also
3309     separate the UCP cases. */
3310    
3311     if (minimize)
3312     {
3313     #ifdef SUPPORT_UCP
3314 nigel 87 if (prop_type >= 0)
3315 nigel 77 {
3316 nigel 87 switch(prop_type)
3317 nigel 77 {
3318 nigel 87 case PT_ANY:
3319     for (fi = min;; fi++)
3320     {
3321 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3322 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3323     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3324     GETCHARINC(c, eptr);
3325     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3326     }
3327 nigel 93 /* Control never gets here */
3328 nigel 87
3329     case PT_LAMP:
3330     for (fi = min;; fi++)
3331     {
3332 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3333 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3334     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3335     GETCHARINC(c, eptr);
3336     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3337     if ((prop_chartype == ucp_Lu ||
3338     prop_chartype == ucp_Ll ||
3339     prop_chartype == ucp_Lt) == prop_fail_result)
3340     RRETURN(MATCH_NOMATCH);
3341     }
3342 nigel 93 /* Control never gets here */
3343 nigel 87
3344     case PT_GC:
3345     for (fi = min;; fi++)
3346     {
3347 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3348 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3349     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3350     GETCHARINC(c, eptr);
3351     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3352     if ((prop_category == prop_value) == prop_fail_result)
3353     RRETURN(MATCH_NOMATCH);
3354     }
3355 nigel 93 /* Control never gets here */
3356 nigel 87
3357     case PT_PC:
3358     for (fi = min;; fi++)
3359     {
3360 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3361 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3362     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3363     GETCHARINC(c, eptr);
3364     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3365     if ((prop_chartype == prop_value) == prop_fail_result)
3366     RRETURN(MATCH_NOMATCH);
3367     }
3368 nigel 93 /* Control never gets here */
3369 nigel 87
3370     case PT_SC:
3371     for (fi = min;; fi++)
3372     {
3373 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3374 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3375     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3376     GETCHARINC(c, eptr);
3377     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3378     if ((prop_script == prop_value) == prop_fail_result)
3379     RRETURN(MATCH_NOMATCH);
3380     }
3381 nigel 93 /* Control never gets here */
3382 nigel 87
3383     default:
3384     RRETURN(PCRE_ERROR_INTERNAL);
3385 nigel 77 }
3386     }
3387    
3388     /* Match extended Unicode sequences. We will get here only if the
3389     support is in the binary; otherwise a compile-time error occurs. */
3390    
3391     else if (ctype == OP_EXTUNI)
3392     {
3393     for (fi = min;; fi++)
3394     {
3395 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3396 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3397     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3398     GETCHARINCTEST(c, eptr);
3399 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3400 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3401     while (eptr < md->end_subject)
3402     {
3403     int len = 1;
3404     if (!utf8) c = *eptr; else
3405     {
3406     GETCHARLEN(c, eptr, len);
3407     }
3408 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3409 nigel 77 if (prop_category != ucp_M) break;
3410     eptr += len;
3411     }
3412     }
3413     }
3414    
3415     else
3416     #endif /* SUPPORT_UCP */
3417    
3418     #ifdef SUPPORT_UTF8
3419     /* UTF-8 mode */
3420     if (utf8)
3421     {
3422     for (fi = min;; fi++)
3423     {
3424 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3425 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3426 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3427     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3428 nigel 93 IS_NEWLINE(eptr)))
3429 nigel 91 RRETURN(MATCH_NOMATCH);
3430 nigel 77
3431     GETCHARINC(c, eptr);
3432     switch(ctype)
3433     {
3434 nigel 91 case OP_ANY: /* This is the DOTALL case */
3435 nigel 77 break;
3436    
3437     case OP_ANYBYTE:
3438     break;
3439    
3440 nigel 93 case OP_ANYNL:
3441     switch(c)
3442     {
3443     default: RRETURN(MATCH_NOMATCH);
3444     case 0x000d:
3445     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3446     break;
3447     case 0x000a:
3448 ph10 231 break;
3449    
3450 nigel 93 case 0x000b:
3451     case 0x000c:
3452     case 0x0085:
3453     case 0x2028:
3454     case 0x2029:
3455 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3456 nigel 93 break;
3457     }
3458     break;
3459    
3460 ph10 178 case OP_NOT_HSPACE:
3461     switch(c)
3462     {
3463     default: break;
3464     case 0x09: /* HT */
3465     case 0x20: /* SPACE */
3466     case 0xa0: /* NBSP */
3467     case 0x1680: /* OGHAM SPACE MARK */
3468     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3469     case 0x2000: /* EN QUAD */
3470     case 0x2001: /* EM QUAD */
3471     case 0x2002: /* EN SPACE */
3472     case 0x2003: /* EM SPACE */
3473     case 0x2004: /* THREE-PER-EM SPACE */
3474     case 0x2005: /* FOUR-PER-EM SPACE */
3475     case 0x2006: /* SIX-PER-EM SPACE */
3476     case 0x2007: /* FIGURE SPACE */
3477     case 0x2008: /* PUNCTUATION SPACE */
3478     case 0x2009: /* THIN SPACE */
3479     case 0x200A: /* HAIR SPACE */
3480     case 0x202f: /* NARROW NO-BREAK SPACE */
3481     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3482     case 0x3000: /* IDEOGRAPHIC SPACE */
3483     RRETURN(MATCH_NOMATCH);
3484     }
3485     break;
3486    
3487     case OP_HSPACE:
3488     switch(c)
3489     {
3490     default: RRETURN(MATCH_NOMATCH);
3491     case 0x09: /* HT */
3492     case 0x20: /* SPACE */
3493     case 0xa0: /* NBSP */
3494     case 0x1680: /* OGHAM SPACE MARK */
3495     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3496     case 0x2000: /* EN QUAD */
3497     case 0x2001: /* EM QUAD */
3498     case 0x2002: /* EN SPACE */
3499     case 0x2003: /* EM SPACE */
3500     case 0x2004: /* THREE-PER-EM SPACE */
3501     case 0x2005: /* FOUR-PER-EM SPACE */
3502     case 0x2006: /* SIX-PER-EM SPACE */
3503     case 0x2007: /* FIGURE SPACE */
3504     case 0x2008: /* PUNCTUATION SPACE */
3505     case 0x2009: /* THIN SPACE */
3506     case 0x200A: /* HAIR SPACE */
3507     case 0x202f: /* NARROW NO-BREAK SPACE */
3508     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3509     case 0x3000: /* IDEOGRAPHIC SPACE */
3510     break;
3511     }
3512     break;
3513    
3514     case OP_NOT_VSPACE:
3515     switch(c)
3516     {
3517     default: break;
3518     case 0x0a: /* LF */
3519     case 0x0b: /* VT */
3520     case 0x0c: /* FF */
3521     case 0x0d: /* CR */
3522     case 0x85: /* NEL */
3523     case 0x2028: /* LINE SEPARATOR */
3524     case 0x2029: /* PARAGRAPH SEPARATOR */
3525     RRETURN(MATCH_NOMATCH);
3526     }
3527     break;
3528    
3529     case OP_VSPACE:
3530     switch(c)
3531     {
3532     default: RRETURN(MATCH_NOMATCH);
3533     case 0x0a: /* LF */
3534     case 0x0b: /* VT */
3535     case 0x0c: /* FF */
3536     case 0x0d: /* CR */
3537     case 0x85: /* NEL */
3538     case 0x2028: /* LINE SEPARATOR */
3539     case 0x2029: /* PARAGRAPH SEPARATOR */
3540     break;
3541     }
3542     break;
3543    
3544 nigel 77 case OP_NOT_DIGIT:
3545     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3546     RRETURN(MATCH_NOMATCH);
3547     break;
3548    
3549     case OP_DIGIT:
3550     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3551     RRETURN(MATCH_NOMATCH);
3552     break;
3553    
3554     case OP_NOT_WHITESPACE:
3555     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3556     RRETURN(MATCH_NOMATCH);
3557     break;
3558    
3559     case OP_WHITESPACE:
3560     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3561     RRETURN(MATCH_NOMATCH);
3562     break;
3563    
3564     case OP_NOT_WORDCHAR:
3565     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3566     RRETURN(MATCH_NOMATCH);
3567     break;
3568    
3569     case OP_WORDCHAR:
3570     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3571     RRETURN(MATCH_NOMATCH);
3572     break;
3573    
3574     default:
3575     RRETURN(PCRE_ERROR_INTERNAL);
3576     }
3577     }
3578     }
3579     else
3580     #endif
3581     /* Not UTF-8 mode */
3582     {
3583     for (fi = min;; fi++)
3584     {
3585 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3586 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3587 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3588 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3589 nigel 91 RRETURN(MATCH_NOMATCH);
3590    
3591 nigel 77 c = *eptr++;
3592     switch(ctype)
3593     {
3594 nigel 91 case OP_ANY: /* This is the DOTALL case */
3595 nigel 77 break;
3596    
3597     case OP_ANYBYTE:
3598     break;
3599    
3600 nigel 93 case OP_ANYNL:
3601     switch(c)
3602     {
3603     default: RRETURN(MATCH_NOMATCH);
3604     case 0x000d:
3605     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3606     break;
3607 ph10 231
3608 nigel 93 case 0x000a:
3609 ph10 231 break;
3610    
3611 nigel 93 case 0x000b:
3612     case 0x000c:
3613     case 0x0085:
3614 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3615 nigel 93 break;
3616     }
3617     break;
3618    
3619 ph10 178 case OP_NOT_HSPACE:
3620     switch(c)
3621     {
3622     default: break;
3623     case 0x09: /* HT */
3624     case 0x20: /* SPACE */
3625     case 0xa0: /* NBSP */
3626     RRETURN(MATCH_NOMATCH);
3627     }
3628     break;
3629    
3630     case OP_HSPACE:
3631     switch(c)
3632     {
3633     default: RRETURN(MATCH_NOMATCH);
3634     case 0x09: /* HT */
3635     case 0x20: /* SPACE */
3636     case 0xa0: /* NBSP */
3637     break;
3638     }
3639     break;
3640    
3641     case OP_NOT_VSPACE:
3642     switch(c)
3643     {
3644     default: break;
3645     case 0x0a: /* LF */
3646     case 0x0b: /* VT */
3647     case 0x0c: /* FF */
3648     case 0x0d: /* CR */
3649     case 0x85: /* NEL */
3650     RRETURN(MATCH_NOMATCH);
3651     }
3652     break;
3653    
3654     case OP_VSPACE:
3655     switch(c)
3656     {
3657     default: RRETURN(MATCH_NOMATCH);
3658     case 0x0a: /* LF */
3659     case 0x0b: /* VT */
3660     case 0x0c: /* FF */
3661     case 0x0d: /* CR */
3662     case 0x85: /* NEL */
3663     break;
3664     }
3665     break;
3666    
3667 nigel 77 case OP_NOT_DIGIT:
3668     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3669     break;
3670    
3671     case OP_DIGIT:
3672     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3673     break;
3674    
3675     case OP_NOT_WHITESPACE:
3676     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3677     break;
3678    
3679     case OP_WHITESPACE:
3680     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3681     break;
3682    
3683     case OP_NOT_WORDCHAR:
3684     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3685     break;
3686    
3687     case OP_WORDCHAR:
3688     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3689     break;
3690    
3691     default:
3692     RRETURN(PCRE_ERROR_INTERNAL);
3693     }
3694     }
3695     }
3696     /* Control never gets here */
3697     }
3698    
3699 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3700 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3701     UTF-8 and UCP stuff separate. */
3702    
3703     else
3704     {
3705     pp = eptr; /* Remember where we started */
3706    
3707     #ifdef SUPPORT_UCP
3708 nigel 87 if (prop_type >= 0)
3709 nigel 77 {
3710 nigel 87 switch(prop_type)
3711 nigel 77 {
3712 nigel 87 case PT_ANY:
3713     for (i = min; i < max; i++)
3714     {
3715     int len = 1;
3716     if (eptr >= md->end_subject) break;
3717     GETCHARLEN(c, eptr, len);
3718     if (prop_fail_result) break;
3719     eptr+= len;
3720     }
3721     break;
3722    
3723     case PT_LAMP:
3724     for (i = min; i < max; i++)
3725     {
3726     int len = 1;
3727     if (eptr >= md->end_subject) break;
3728     GETCHARLEN(c, eptr, len);
3729     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3730     if ((prop_chartype == ucp_Lu ||
3731     prop_chartype == ucp_Ll ||
3732     prop_chartype == ucp_Lt) == prop_fail_result)
3733     break;
3734     eptr+= len;
3735     }
3736     break;
3737    
3738     case PT_GC:
3739     for (i = min; i < max; i++)
3740     {
3741     int len = 1;
3742     if (eptr >= md->end_subject) break;
3743     GETCHARLEN(c, eptr, len);
3744     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3745     if ((prop_category == prop_value) == prop_fail_result)
3746     break;
3747     eptr+= len;
3748     }
3749     break;
3750    
3751     case PT_PC:
3752     for (i = min; i < max; i++)
3753     {
3754     int len = 1;
3755     if (eptr >= md->end_subject) break;
3756     GETCHARLEN(c, eptr, len);
3757     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3758     if ((prop_chartype == prop_value) == prop_fail_result)
3759     break;
3760     eptr+= len;
3761     }
3762     break;
3763    
3764     case PT_SC:
3765     for (i = min; i < max; i++)
3766     {
3767     int len = 1;
3768     if (eptr >= md->end_subject) break;
3769     GETCHARLEN(c, eptr, len);
3770     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3771     if ((prop_script == prop_value) == prop_fail_result)
3772     break;
3773     eptr+= len;
3774     }
3775     break;
3776 nigel 77 }
3777    
3778     /* eptr is now past the end of the maximum run */
3779    
3780 nigel 93 if (possessive) continue;
3781 nigel 77 for(;;)
3782     {
3783 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3784 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3785     if (eptr-- == pp) break; /* Stop if tried at original pos */
3786 ph10 207 if (utf8) BACKCHAR(eptr);
3787 nigel 77 }
3788     }
3789    
3790     /* Match extended Unicode sequences. We will get here only if the
3791     support is in the binary; otherwise a compile-time error occurs. */
3792    
3793     else if (ctype == OP_EXTUNI)
3794     {
3795     for (i = min; i < max; i++)
3796     {
3797     if (eptr >= md->end_subject) break;
3798     GETCHARINCTEST(c, eptr);
3799 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3800 nigel 77 if (prop_category == ucp_M) break;
3801     while (eptr < md->end_subject)
3802     {
3803     int len = 1;
3804     if (!utf8) c = *eptr; else
3805     {
3806     GETCHARLEN(c, eptr, len);
3807     }
3808 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3809 nigel 77 if (prop_category != ucp_M) break;
3810     eptr += len;
3811     }
3812     }
3813    
3814     /* eptr is now past the end of the maximum run */
3815    
3816 nigel 93 if (possessive) continue;
3817 nigel 77 for(;;)
3818     {
3819 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3820 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3821     if (eptr-- == pp) break; /* Stop if tried at original pos */
3822     for (;;) /* Move back over one extended */
3823     {
3824     int len = 1;
3825     if (!utf8) c = *eptr; else
3826     {
3827 ph10 207 BACKCHAR(eptr);
3828 nigel 77 GETCHARLEN(c, eptr, len);
3829     }
3830 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3831 nigel 77 if (prop_category != ucp_M) break;
3832     eptr--;
3833     }
3834     }
3835     }
3836    
3837     else
3838     #endif /* SUPPORT_UCP */
3839    
3840     #ifdef SUPPORT_UTF8
3841     /* UTF-8 mode */
3842    
3843     if (utf8)
3844     {
3845     switch(ctype)
3846     {
3847     case OP_ANY:
3848     if (max < INT_MAX)
3849     {
3850     if ((ims & PCRE_DOTALL) == 0)
3851     {
3852     for (i = min; i < max; i++)
3853     {
3854 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3855 nigel 77 eptr++;
3856     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3857     }
3858     }
3859     else
3860     {
3861     for (i = min; i < max; i++)
3862     {
3863 nigel 91 if (eptr >= md->end_subject) break;
3864 nigel 77 eptr++;
3865     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3866     }
3867     }
3868     }
3869    
3870     /* Handle unlimited UTF-8 repeat */
3871    
3872     else
3873     {
3874     if ((ims & PCRE_DOTALL) == 0)
3875     {
3876     for (i = min; i < max; i++)
3877     {
3878 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3879 nigel 77 eptr++;
3880 ph10 190 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3881 nigel 77 }
3882     }
3883     else
3884     {
3885 ph10 190 eptr = md->end_subject;
3886 nigel 77 }
3887     }
3888     break;
3889    
3890     /* The byte case is the same as non-UTF8 */
3891    
3892     case OP_ANYBYTE:
3893     c = max - min;
3894 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3895     c = md->end_subject - eptr;
3896 nigel 77 eptr += c;
3897     break;
3898    
3899 nigel 93 case OP_ANYNL:
3900     for (i = min; i < max; i++)
3901     {
3902     int len = 1;
3903     if (eptr >= md->end_subject) break;
3904     GETCHARLEN(c, eptr, len);
3905     if (c == 0x000d)
3906     {
3907     if (++eptr >= md->end_subject) break;
3908     if (*eptr == 0x000a) eptr++;
3909     }
3910     else
3911     {
3912 ph10 231 if (c != 0x000a &&
3913     (md->bsr_anycrlf ||
3914     (c != 0x000b && c != 0x000c &&
3915     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3916 nigel 93 break;
3917     eptr += len;
3918     }
3919     }
3920     break;
3921    
3922 ph10 178 case OP_NOT_HSPACE:
3923 ph10 182 case OP_HSPACE:
3924 ph10 178 for (i = min; i < max; i++)
3925     {
3926 ph10 182 BOOL gotspace;
3927 ph10 178 int len = 1;
3928     if (eptr >= md->end_subject) break;
3929     GETCHARLEN(c, eptr, len);
3930     switch(c)
3931 ph10 182 {
3932     default: gotspace = FALSE; break;
3933 ph10 178 case 0x09: /* HT */
3934     case 0x20: /* SPACE */
3935     case 0xa0: /* NBSP */
3936     case 0x1680: /* OGHAM SPACE MARK */
3937     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3938     case 0x2000: /* EN QUAD */
3939     case 0x2001: /* EM QUAD */
3940     case 0x2002: /* EN SPACE */
3941     case 0x2003: /* EM SPACE */
3942     case 0x2004: /* THREE-PER-EM SPACE */
3943     case 0x2005: /* FOUR-PER-EM SPACE */
3944     case 0x2006: /* SIX-PER-EM SPACE */
3945     case 0x2007: /* FIGURE SPACE */
3946     case 0x2008: /* PUNCTUATION SPACE */
3947     case 0x2009: /* THIN SPACE */
3948     case 0x200A: /* HAIR SPACE */
3949     case 0x202f: /* NARROW NO-BREAK SPACE */
3950     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3951     case 0x3000: /* IDEOGRAPHIC SPACE */
3952     gotspace = TRUE;
3953 ph10 182 break;
3954 ph10 178 }
3955     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3956     eptr += len;
3957     }
3958     break;
3959    
3960     case OP_NOT_VSPACE:
3961 ph10 182 case OP_VSPACE:
3962 ph10 178 for (i = min; i < max; i++)
3963     {
3964 ph10 182 BOOL gotspace;
3965 ph10 178 int len = 1;
3966     if (eptr >= md->end_subject) break;
3967     GETCHARLEN(c, eptr, len);
3968     switch(c)
3969     {
3970 ph10 182 default: gotspace = FALSE; break;
3971 ph10 178 case 0x0a: /* LF */
3972     case 0x0b: /* VT */
3973     case 0x0c: /* FF */
3974     case 0x0d: /* CR */
3975     case 0x85: /* NEL */
3976     case 0x2028: /* LINE SEPARATOR */
3977     case 0x2029: /* PARAGRAPH SEPARATOR */
3978     gotspace = TRUE;
3979     break;
3980     }
3981 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3982 ph10 178 eptr += len;
3983     }
3984     break;
3985    
3986 nigel 77 case OP_NOT_DIGIT:
3987     for (i = min; i < max; i++)
3988     {
3989     int len = 1;
3990     if (eptr >= md->end_subject) break;
3991     GETCHARLEN(c, eptr, len);
3992     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3993     eptr+= len;
3994     }
3995     break;
3996    
3997     case OP_DIGIT:
3998     for (i = min; i < max; i++)
3999     {
4000     int len = 1;
4001     if (eptr >= md->end_subject) break;
4002     GETCHARLEN(c, eptr, len);
4003     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4004     eptr+= len;
4005     }
4006     break;
4007    
4008     case OP_NOT_WHITESPACE:
4009     for (i = min; i < max; i++)
4010     {
4011     int len = 1;
4012     if (eptr >= md->end_subject) break;
4013     GETCHARLEN(c, eptr, len);
4014     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4015     eptr+= len;
4016     }
4017     break;
4018    
4019     case OP_WHITESPACE:
4020     for (i = min; i < max; i++)
4021     {
4022     int len = 1;
4023     if (eptr >= md->end_subject) break;
4024     GETCHARLEN(c, eptr, len);
4025     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4026     eptr+= len;
4027     }
4028     break;
4029    
4030     case OP_NOT_WORDCHAR:
4031     for (i = min; i < max; i++)
4032     {
4033     int len = 1;
4034     if (eptr >= md->end_subject) break;
4035     GETCHARLEN(c, eptr, len);
4036     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4037     eptr+= len;
4038     }
4039     break;
4040    
4041     case OP_WORDCHAR:
4042     for (i = min; i < max; i++)
4043     {
4044     int len = 1;
4045     if (eptr >= md->end_subject) break;
4046     GETCHARLEN(c, eptr, len);
4047     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4048     eptr+= len;
4049     }
4050     break;
4051    
4052     default:
4053     RRETURN(PCRE_ERROR_INTERNAL);
4054     }
4055    
4056     /* eptr is now past the end of the maximum run */
4057    
4058 nigel 93 if (possessive) continue;
4059 nigel 77 for(;;)
4060     {
4061 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4062 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4063     if (eptr-- == pp) break; /* Stop if tried at original pos */
4064     BACKCHAR(eptr);
4065     }
4066     }
4067     else
4068 ph10 207 #endif /* SUPPORT_UTF8 */
4069 nigel 77
4070     /* Not UTF-8 mode */
4071     {
4072     switch(ctype)
4073     {
4074     case OP_ANY:
4075     if ((ims & PCRE_DOTALL) == 0)
4076     {
4077     for (i = min; i < max; i++)
4078     {
4079 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4080 nigel 77 eptr++;
4081     }
4082     break;
4083     }
4084     /* For DOTALL case, fall through and treat as \C */
4085    
4086     case OP_ANYBYTE:
4087     c = max - min;
4088 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4089     c = md->end_subject - eptr;
4090 nigel 77 eptr += c;
4091     break;
4092    
4093 nigel 93 case OP_ANYNL:
4094     for (i = min; i < max; i++)
4095     {
4096     if (eptr >= md->end_subject) break;
4097     c = *eptr;
4098     if (c == 0x000d)
4099     {
4100     if (++eptr >= md->end_subject) break;
4101     if (*eptr == 0x000a) eptr++;
4102     }
4103     else
4104     {
4105 ph10 231 if (c != 0x000a &&
4106     (md->bsr_anycrlf ||
4107     (c != 0x000b && c != 0x000c && c != 0x0085)))
4108 nigel 93 break;
4109     eptr++;
4110     }
4111     }
4112     break;
4113    
4114 ph10 178 case OP_NOT_HSPACE:
4115     for (i = min; i < max; i++)
4116     {
4117     if (eptr >= md->end_subject) break;
4118     c = *eptr;
4119     if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4120 ph10 182 eptr++;
4121 ph10 178 }
4122     break;
4123    
4124     case OP_HSPACE:
4125     for (i = min; i < max; i++)
4126     {
4127     if (eptr >= md->end_subject) break;
4128     c = *eptr;
4129     if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4130 ph10 182 eptr++;
4131 ph10 178 }
4132     break;
4133    
4134     case OP_NOT_VSPACE:
4135     for (i = min; i < max; i++)
4136     {
4137     if (eptr >= md->end_subject) break;
4138     c = *eptr;
4139     if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4140     break;
4141 ph10 182 eptr++;
4142 ph10 178 }
4143     break;
4144