/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 243 - (hide annotations) (download)
Thu Sep 13 09:28:14 2007 UTC (7 years ago) by ph10
File MIME type: text/plain
File size: 150163 byte(s)
Detrailed files for 7.4-RC1 test release.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 117 Copyright (c) 1997-2007 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
#define match_condassert   0x01  /* match() is evaluating a condition assertion */
#define match_cbegroup     0x02  /* start of an unlimited repeat group that may be empty */

/* Ordinary (non-error) results of match(). Error results are the externally
defined PCRE_ERROR_xxx codes, all of which are negative. */

#define MATCH_MATCH        1
#define MATCH_NOMATCH      0

/* Internal-only results of match(). They are made sufficiently negative to
keep clear of the external error codes. */

#define MATCH_COMMIT       (-999)
#define MATCH_PRUNE        (-998)
#define MATCH_SKIP         (-997)
#define MATCH_THEN         (-996)

/* Largest number of offset-vector ints that is saved on the stack for a
recursive call; malloc is used when the offset vector is bigger. Keep this a
multiple of 3, because the offset vector is always a multiple of 3 long. */

#define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
/* Parallel tables: entry k of rep_min is the minimum count and entry k of
rep_max the maximum count for the k-th common repeat. A maximum of 0 stands
for "no upper limit". */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161     /* Separate the caselesss case for speed */
162    
163     if ((ims & PCRE_CASELESS) != 0)
164     {
165     while (length-- > 0)
166     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167     }
168     else
169     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170    
171     return TRUE;
172     }
173    
174    
175    
176     /***************************************************************************
177     ****************************************************************************
178     RECURSION IN THE match() FUNCTION
179    
180 nigel 87 The match() function is highly recursive, though not every recursive call
181     increases the recursive depth. Nevertheless, some regular expressions can cause
182     it to recurse to a great depth. I was writing for Unix, so I just let it call
183     itself recursively. This uses the stack for saving everything that has to be
184     saved for a recursive call. On Unix, the stack can be large, and this works
185     fine.
186 nigel 77
187 nigel 87 It turns out that on some non-Unix-like systems there are problems with
188     programs that use a lot of stack. (This despite the fact that every last chip
189     has oodles of memory these days, and techniques for extending the stack have
190     been known for decades.) So....
191 nigel 77
192     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193     calls by keeping local variables that need to be preserved in blocks of memory
194 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
195 nigel 77 achieve this so that the actual code doesn't look very different to what it
196     always used to.
197 ph10 164
198 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
199 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
200     Switzer, the use of longjmp() has been abolished, at the cost of having to
201     provide a unique number for each call to RMATCH. There is no way of generating
202     a sequence of numbers at compile time in C. I have given them names, to make
203     them stand out more clearly.
204    
205     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
208     don't have indeterminate values; this has meant that the frame size can be
209 ph10 164 reduced because the result can be "passed back" by straight setting of the
210     variable instead of being passed in the frame.
211 nigel 77 ****************************************************************************
212     ***************************************************************************/
213    
214 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
215     below must be updated in sync. */
216 nigel 77
/* One identifier per RMATCH call site, numbered consecutively from 1. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
223 ph10 164
224 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
225 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 ph10 164 actually used in this definition. */
227 nigel 77
228     #ifndef NO_RECURSE
#define REGISTER register

/* Stack-recursion forms. RMATCH stores the result of a genuine recursive call
to match() in rrc, passing mstart through unchanged and rdepth increased by
one; its "rw" argument is ignored. RRETURN is a plain return. The DEBUG
variants additionally trace the call and the return on stdout. */

#ifdef DEBUG

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match((ra), (rb), mstart, (rc), (rd), (re), (rf), (rg), rdepth + 1); \
  printf("to line %d\n", __LINE__); \
  }

#define RRETURN(ra) \
  { \
  printf("match() returned %d from line %d ", (ra), __LINE__); \
  return (ra); \
  }

#else  /* DEBUG not defined */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match((ra), (rb), mstart, (rc), (rd), (re), (rf), (rg), rdepth + 1)

#define RRETURN(ra) return (ra)

#endif  /* DEBUG */
248    
249 nigel 77 #else
250    
251    
252 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
253     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254     argument of match(), which never changes. */
255 nigel 77
#define REGISTER

/* Heap-frame form of RMATCH. It obtains a new heapframe from
pcre_stack_malloc, records in the current frame which call site ("rw") to
resume at, fills the new frame with the arguments of the simulated call, links
it to the current frame, makes it current, and jumps to HEAP_RECURSE. The label
L_<rw> that follows the jump is the resume point for this call site.

If the frame cannot be allocated, the current simulated call is abandoned with
PCRE_ERROR_NOMEMORY via RRETURN, which releases the current frame and passes
the error to the previous one (or returns it from match() at the top level).
Without this check the NULL pointer would be dereferenced immediately. The
"rd" argument is not used. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
277    
/* Heap-frame form of RRETURN. The current frame is unlinked and handed to
pcre_stack_free. If that was the outermost frame (no previous frame), match()
itself returns the value; otherwise the value is stored in rrc and control
goes to HEAP_RETURN to resume the previous frame. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return (ra);\
  rrc = (ra);\
  goto HEAP_RETURN;\
  }
290    
291    
292     /* Structure for remembering the local variables in a private frame */
293    
294     typedef struct heapframe {
295     struct heapframe *Xprevframe;
296    
297     /* Function arguments that may change */
298    
299     const uschar *Xeptr;
300     const uschar *Xecode;
301 ph10 172 const uschar *Xmstart;
302 nigel 77 int Xoffset_top;
303     long int Xims;
304     eptrblock *Xeptrb;
305     int Xflags;
306 nigel 91 unsigned int Xrdepth;
307 nigel 77
308     /* Function local variables */
309    
310     const uschar *Xcallpat;
311     const uschar *Xcharptr;
312     const uschar *Xdata;
313     const uschar *Xnext;
314     const uschar *Xpp;
315     const uschar *Xprev;
316     const uschar *Xsaved_eptr;
317    
318     recursion_info Xnew_recursive;
319    
320     BOOL Xcur_is_word;
321     BOOL Xcondition;
322     BOOL Xprev_is_word;
323    
324     unsigned long int Xoriginal_ims;
325    
326     #ifdef SUPPORT_UCP
327     int Xprop_type;
328 nigel 87 int Xprop_value;
329 nigel 77 int Xprop_fail_result;
330     int Xprop_category;
331     int Xprop_chartype;
332 nigel 87 int Xprop_script;
333 ph10 123 int Xoclength;
334     uschar Xocchars[8];
335 nigel 77 #endif
336    
337     int Xctype;
338 nigel 93 unsigned int Xfc;
339 nigel 77 int Xfi;
340     int Xlength;
341     int Xmax;
342     int Xmin;
343     int Xnumber;
344     int Xoffset;
345     int Xop;
346     int Xsave_capture_last;
347     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348     int Xstacksave[REC_STACK_SAVE_MAX];
349    
350     eptrblock Xnewptrb;
351    
352 ph10 164 /* Where to jump back to */
353 nigel 77
354 ph10 164 int Xwhere;
355 ph10 165
356 nigel 77 } heapframe;
357    
358     #endif
359    
360    
361     /***************************************************************************
362     ***************************************************************************/
363    
364    
365    
366     /*************************************************
367     * Match from current position *
368     *************************************************/
369    
370 nigel 93 /* This function is called recursively in many circumstances. Whenever it
371 nigel 77 returns a negative (error) response, the outer incarnation must also return the
372     same response.
373    
374     Performance note: It might be tempting to extract commonly used fields from the
375     md structure (e.g. utf8, end_subject) into individual variables to improve
376     performance. Tests using gcc on a SPARC disproved this; in the first case, it
377     made performance worse.
378    
379     Arguments:
380 nigel 93 eptr pointer to current character in subject
381     ecode pointer to current position in compiled code
382 ph10 168 mstart pointer to the current match start position (can be modified
383 ph10 172 by encountering \K)
384 nigel 77 offset_top current top pointer
385     md pointer to "static" info for the match
386     ims current /i, /m, and /s options
387     eptrb pointer to chain of blocks containing eptr at start of
388     brackets - for testing for empty matches
389     flags can contain
390     match_condassert - this is an assertion condition
391 nigel 93 match_cbegroup - this is the start of an unlimited repeat
392     group that can match an empty string
393 nigel 87 rdepth the recursion depth
394 nigel 77
395     Returns: MATCH_MATCH if matched ) these values are >= 0
396     MATCH_NOMATCH if failed to match )
397     a negative PCRE_ERROR_xxx value if aborted by an error condition
398 nigel 87 (e.g. stopped by repeated call or recursion limit)
399 nigel 77 */
400    
401     static int
402 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 nigel 91 int flags, unsigned int rdepth)
405 nigel 77 {
406     /* These variables do not need to be preserved over recursion in this function,
407 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
408     "register" because they are used a lot in loops. */
409 nigel 77
410 nigel 91 register int rrc; /* Returns from recursive calls */
411     register int i; /* Used for loops not involving calls to RMATCH() */
412 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414 nigel 77
415 nigel 93 BOOL minimize, possessive; /* Quantifier options */
416    
417 nigel 77 /* When recursion is not being used, all "local" variables that have to be
418     preserved over calls to RMATCH() are part of a "frame" which is obtained from
419     heap storage. Set up the top-level frame here; others are obtained from the
420     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421    
422     #ifdef NO_RECURSE
423     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424     frame->Xprevframe = NULL; /* Marks the top level */
425    
426     /* Copy in the original argument variables */
427    
428     frame->Xeptr = eptr;
429     frame->Xecode = ecode;
430 ph10 168 frame->Xmstart = mstart;
431 nigel 77 frame->Xoffset_top = offset_top;
432     frame->Xims = ims;
433     frame->Xeptrb = eptrb;
434     frame->Xflags = flags;
435 nigel 87 frame->Xrdepth = rdepth;
436 nigel 77
437     /* This is where control jumps back to to effect "recursion" */
438    
439     HEAP_RECURSE:
440    
441     /* Macros make the argument variables come from the current frame */
442    
443     #define eptr frame->Xeptr
444     #define ecode frame->Xecode
445 ph10 168 #define mstart frame->Xmstart
446 nigel 77 #define offset_top frame->Xoffset_top
447     #define ims frame->Xims
448     #define eptrb frame->Xeptrb
449     #define flags frame->Xflags
450 nigel 87 #define rdepth frame->Xrdepth
451 nigel 77
452     /* Ditto for the local variables */
453    
454     #ifdef SUPPORT_UTF8
455     #define charptr frame->Xcharptr
456     #endif
457     #define callpat frame->Xcallpat
458     #define data frame->Xdata
459     #define next frame->Xnext
460     #define pp frame->Xpp
461     #define prev frame->Xprev
462     #define saved_eptr frame->Xsaved_eptr
463    
464     #define new_recursive frame->Xnew_recursive
465    
466     #define cur_is_word frame->Xcur_is_word
467     #define condition frame->Xcondition
468     #define prev_is_word frame->Xprev_is_word
469    
470     #define original_ims frame->Xoriginal_ims
471    
472     #ifdef SUPPORT_UCP
473     #define prop_type frame->Xprop_type
474 nigel 87 #define prop_value frame->Xprop_value
475 nigel 77 #define prop_fail_result frame->Xprop_fail_result
476     #define prop_category frame->Xprop_category
477     #define prop_chartype frame->Xprop_chartype
478 nigel 87 #define prop_script frame->Xprop_script
479 ph10 115 #define oclength frame->Xoclength
480     #define occhars frame->Xocchars
481 nigel 77 #endif
482    
483     #define ctype frame->Xctype
484     #define fc frame->Xfc
485     #define fi frame->Xfi
486     #define length frame->Xlength
487     #define max frame->Xmax
488     #define min frame->Xmin
489     #define number frame->Xnumber
490     #define offset frame->Xoffset
491     #define op frame->Xop
492     #define save_capture_last frame->Xsave_capture_last
493     #define save_offset1 frame->Xsave_offset1
494     #define save_offset2 frame->Xsave_offset2
495     #define save_offset3 frame->Xsave_offset3
496     #define stacksave frame->Xstacksave
497    
498     #define newptrb frame->Xnewptrb
499    
500     /* When recursion is being used, local variables are allocated on the stack and
501     get preserved during recursion in the normal way. In this environment, fi and
502     i, and fc and c, can be the same variables. */
503    
504 nigel 93 #else /* NO_RECURSE not defined */
505 nigel 77 #define fi i
506     #define fc c
507    
508    
509 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510     const uschar *charptr; /* in small blocks of the code. My normal */
511     #endif /* style of coding would have declared */
512     const uschar *callpat; /* them within each of those blocks. */
513     const uschar *data; /* However, in order to accommodate the */
514     const uschar *next; /* version of this code that uses an */
515     USPTR pp; /* external "stack" implemented on the */
516     const uschar *prev; /* heap, it is easier to declare them all */
517     USPTR saved_eptr; /* here, so the declarations can be cut */
518     /* out in a block. The only declarations */
519     recursion_info new_recursive; /* within blocks below are for variables */
520     /* that do not have to be preserved over */
521     BOOL cur_is_word; /* a recursive call to RMATCH(). */
522     BOOL condition;
523 nigel 77 BOOL prev_is_word;
524    
525     unsigned long int original_ims;
526    
527     #ifdef SUPPORT_UCP
528     int prop_type;
529 nigel 87 int prop_value;
530 nigel 77 int prop_fail_result;
531     int prop_category;
532     int prop_chartype;
533 nigel 87 int prop_script;
534 ph10 115 int oclength;
535     uschar occhars[8];
536 nigel 77 #endif
537    
538     int ctype;
539     int length;
540     int max;
541     int min;
542     int number;
543     int offset;
544     int op;
545     int save_capture_last;
546     int save_offset1, save_offset2, save_offset3;
547     int stacksave[REC_STACK_SAVE_MAX];
548    
549     eptrblock newptrb;
550 nigel 93 #endif /* NO_RECURSE */
551 nigel 77
552     /* These statements are here to stop the compiler complaining about uninitialized
553     variables. */
554    
555     #ifdef SUPPORT_UCP
556 nigel 87 prop_value = 0;
557 nigel 77 prop_fail_result = 0;
558     #endif
559    
560 nigel 93
561 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
562     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563     used. Thanks to Ian Taylor for noticing this possibility and sending the
564     original patch. */
565    
566     TAIL_RECURSE:
567    
568 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
569     are specified by the macro RMATCH and RRETURN is used to return. When
570     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571     and a "return", respectively (possibly with some debugging if DEBUG is
572     defined). However, RMATCH isn't like a function call because it's quite a
573     complicated macro. It has to be used in one particular way. This shouldn't,
574     however, impact performance when true recursion is being used. */
575 nigel 77
576 ph10 164 #ifdef SUPPORT_UTF8
577     utf8 = md->utf8; /* Local copy of the flag */
578     #else
579     utf8 = FALSE;
580     #endif
581    
582 nigel 87 /* First check that we haven't called match() too many times, or that we
583     haven't exceeded the recursive call limit. */
584    
585 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587 nigel 77
588     original_ims = ims; /* Save for resetting on ')' */
589 nigel 91
590 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
591     string, the match_cbegroup flag is set. When this is the case, add the current
592     subject pointer to the chain of such remembered pointers, to be checked when we
593     hit the closing ket, in order to break infinite loops that match no characters.
594 ph10 197 When match() is called in other circumstances, don't add to the chain. The
595     match_cbegroup flag must NOT be used with tail recursion, because the memory
596     block that is used is on the stack, so a new one may be required for each
597     match(). */
598 nigel 77
599 nigel 93 if ((flags & match_cbegroup) != 0)
600 nigel 77 {
601 ph10 197 newptrb.epb_saved_eptr = eptr;
602     newptrb.epb_prev = eptrb;
603     eptrb = &newptrb;
604 nigel 77 }
605    
606 nigel 93 /* Now start processing the opcodes. */
607 nigel 77
608     for (;;)
609     {
610 nigel 93 minimize = possessive = FALSE;
611 nigel 77 op = *ecode;
612    
613     /* For partial matching, remember if we ever hit the end of the subject after
614     matching at least one subject character. */
615    
616     if (md->partial &&
617     eptr >= md->end_subject &&
618 ph10 168 eptr > mstart)
619 nigel 77 md->hitend = TRUE;
620 ph10 208
621 nigel 93 switch(op)
622     {
623 ph10 210 case OP_FAIL:
624 ph10 212 RRETURN(MATCH_NOMATCH);
625 ph10 211
626 ph10 210 case OP_PRUNE:
627     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628     ims, eptrb, flags, RM51);
629     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 ph10 212 RRETURN(MATCH_PRUNE);
631 ph10 211
632 ph10 210 case OP_COMMIT:
633     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634     ims, eptrb, flags, RM52);
635     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 ph10 212 RRETURN(MATCH_COMMIT);
637 ph10 211
638 ph10 210 case OP_SKIP:
639     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640     ims, eptrb, flags, RM53);
641     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
643 ph10 212 RRETURN(MATCH_SKIP);
644 ph10 211
645 ph10 210 case OP_THEN:
646     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ph10 212 ims, eptrb, flags, RM54);
648 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 ph10 212 RRETURN(MATCH_THEN);
650 ph10 211
651 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
652     the current subject position in the working slot at the top of the vector.
653     We mustn't change the current values of the data slot, because they may be
654     set from a previous iteration of this group, and be referred to by a
655     reference inside the group.
656 nigel 77
657 nigel 93 If the bracket fails to match, we need to restore this value and also the
658     values of the final offsets, in case they were set by a previous iteration
659     of the same bracket.
660 nigel 77
661 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
662     a non-capturing bracket. Don't worry about setting the flag for the error
663     case here; that is handled in the code for KET. */
664 nigel 77
665 nigel 93 case OP_CBRA:
666     case OP_SCBRA:
667     number = GET2(ecode, 1+LINK_SIZE);
668 nigel 77 offset = number << 1;
669    
670     #ifdef DEBUG
671 nigel 93 printf("start bracket %d\n", number);
672     printf("subject=");
673 nigel 77 pchars(eptr, 16, TRUE, md);
674     printf("\n");
675     #endif
676    
677     if (offset < md->offset_max)
678     {
679     save_offset1 = md->offset_vector[offset];
680     save_offset2 = md->offset_vector[offset+1];
681     save_offset3 = md->offset_vector[md->offset_end - number];
682     save_capture_last = md->capture_last;
683    
684     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686    
687 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 nigel 77 do
689     {
690 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691     ims, eptrb, flags, RM1);
692 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 nigel 77 md->capture_last = save_capture_last;
694     ecode += GET(ecode, 1);
695     }
696     while (*ecode == OP_ALT);
697    
698     DPRINTF(("bracket %d failed\n", number));
699    
700     md->offset_vector[offset] = save_offset1;
701     md->offset_vector[offset+1] = save_offset2;
702     md->offset_vector[md->offset_end - number] = save_offset3;
703    
704     RRETURN(MATCH_NOMATCH);
705     }
706    
707 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708     as a non-capturing bracket. */
709 nigel 77
710 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712    
713 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714 nigel 77
715 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717    
718 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719     final alternative within the brackets, we would return the result of a
720     recursive call to match() whatever happened. We can reduce stack usage by
721 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
722     is set.*/
723 nigel 77
724 nigel 93 case OP_BRA:
725     case OP_SBRA:
726     DPRINTF(("start non-capturing bracket\n"));
727     flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 nigel 91 for (;;)
729 nigel 77 {
730 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 nigel 93 {
732 ph10 197 if (flags == 0) /* Not a possibly empty group */
733     {
734     ecode += _pcre_OP_lengths[*ecode];
735     DPRINTF(("bracket 0 tail recursion\n"));
736     goto TAIL_RECURSE;
737     }
738    
739     /* Possibly empty group; can't use tail recursion. */
740    
741     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742     eptrb, flags, RM48);
743     RRETURN(rrc);
744 nigel 93 }
745 nigel 91
746     /* For non-final alternatives, continue the loop for a NOMATCH result;
747     otherwise return. */
748    
749 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750     eptrb, flags, RM2);
751 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 nigel 77 ecode += GET(ecode, 1);
753     }
754 nigel 91 /* Control never reaches here. */
755 nigel 77
756     /* Conditional group: compilation checked that there are no more than
757     two branches. If the condition is false, skipping the first branch takes us
758     past the end if there is only one branch, but that's OK because that is
759 nigel 91 exactly what going to the ket would do. As there is only one branch to be
760     obeyed, we can use tail recursion to avoid using another stack frame. */
761 nigel 77
762     case OP_COND:
763 nigel 93 case OP_SCOND:
764     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 nigel 77 {
766 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767     condition = md->recursive != NULL &&
768     (offset == RREF_ANY || offset == md->recursive->group_num);
769     ecode += condition? 3 : GET(ecode, 1);
770     }
771    
772     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773     {
774 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776     ecode += condition? 3 : GET(ecode, 1);
777 nigel 77 }
778    
779 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780     {
781     condition = FALSE;
782     ecode += GET(ecode, 1);
783     }
784    
785 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
786 nigel 93 the final argument match_condassert causes it to stop at the end of an
787     assertion. */
788 nigel 77
789     else
790     {
791 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792     match_condassert, RM3);
793 nigel 77 if (rrc == MATCH_MATCH)
794     {
795 nigel 93 condition = TRUE;
796     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798     }
799 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 nigel 77 {
801     RRETURN(rrc); /* Need braces because of following else */
802     }
803 nigel 93 else
804     {
805     condition = FALSE;
806     ecode += GET(ecode, 1);
807     }
808     }
809 nigel 91
810 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
811 ph10 197 we can use tail recursion to avoid using another stack frame, except when
812     match_cbegroup is required for an unlimited repeat of a possibly empty
813     group. If the second alternative doesn't exist, we can just plough on. */
814 nigel 91
815 nigel 93 if (condition || *ecode == OP_ALT)
816     {
817 nigel 91 ecode += 1 + LINK_SIZE;
818 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
819     {
820     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821     RRETURN(rrc);
822     }
823     else /* Group must match something */
824     {
825     flags = 0;
826     goto TAIL_RECURSE;
827     }
828 nigel 77 }
829 ph10 197 else /* Condition false & no 2nd alternative */
830 nigel 93 {
831     ecode += 1 + LINK_SIZE;
832     }
833     break;
834 nigel 77
835    
836 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
837     recursion, we should restore the offsets appropriately and continue from
838     after the call. */
839 nigel 77
840 ph10 210 case OP_ACCEPT:
841 nigel 77 case OP_END:
842     if (md->recursive != NULL && md->recursive->group_num == 0)
843     {
844     recursion_info *rec = md->recursive;
845 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 nigel 77 md->recursive = rec->prevrec;
847     memmove(md->offset_vector, rec->offset_save,
848     rec->saved_max * sizeof(int));
849 ph10 168 mstart = rec->save_start;
850 nigel 77 ims = original_ims;
851     ecode = rec->after_call;
852     break;
853     }
854    
855     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856     string - backtracking will then try other alternatives, if any. */
857    
858 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859     md->end_match_ptr = eptr; /* Record where we ended */
860     md->end_offset_top = offset_top; /* and how many extracts were taken */
861 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 nigel 77 RRETURN(MATCH_MATCH);
863    
864     /* Change option settings */
865    
866     case OP_OPT:
867     ims = ecode[1];
868     ecode += 2;
869     DPRINTF(("ims set to %02lx\n", ims));
870     break;
871    
872     /* Assertion brackets. Check the alternative branches in turn - the
873     matching won't pass the KET for an assertion. If any one branch matches,
874     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875     start of each branch to move the current point backwards, so the code at
876     this level is identical to the lookahead case. */
877    
878     case OP_ASSERT:
879     case OP_ASSERTBACK:
880     do
881     {
882 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883     RM4);
884 nigel 77 if (rrc == MATCH_MATCH) break;
885 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 nigel 77 ecode += GET(ecode, 1);
887     }
888     while (*ecode == OP_ALT);
889     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890    
891     /* If checking an assertion for a condition, return MATCH_MATCH. */
892    
893     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894    
895     /* Continue from after the assertion, updating the offsets high water
896     mark, since extracts may have been taken during the assertion. */
897    
898     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899     ecode += 1 + LINK_SIZE;
900     offset_top = md->end_offset_top;
901     continue;
902    
903     /* Negative assertion: all branches must fail to match */
904    
905     case OP_ASSERT_NOT:
906     case OP_ASSERTBACK_NOT:
907     do
908     {
909 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910     RM5);
911 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 nigel 77 ecode += GET(ecode,1);
914     }
915     while (*ecode == OP_ALT);
916    
917     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918    
919     ecode += 1 + LINK_SIZE;
920     continue;
921    
922     /* Move the subject pointer back. This occurs only at the start of
923     each branch of a lookbehind assertion. If we are too close to the start to
924     move back, this match function fails. When working with UTF-8 we move
925     back a number of characters, not bytes. */
926    
927     case OP_REVERSE:
928     #ifdef SUPPORT_UTF8
929     if (utf8)
930     {
931 nigel 93 i = GET(ecode, 1);
932     while (i-- > 0)
933 nigel 77 {
934     eptr--;
935     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 ph10 207 BACKCHAR(eptr);
937 nigel 77 }
938     }
939     else
940     #endif
941    
942     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943    
944     {
945 nigel 93 eptr -= GET(ecode, 1);
946 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947     }
948    
949     /* Skip to next op code */
950    
951     ecode += 1 + LINK_SIZE;
952     break;
953    
954     /* The callout item calls an external function, if one is provided, passing
955     details of the match so far. This is mainly for debugging, though the
956     function is able to force a failure. */
957    
958     case OP_CALLOUT:
959     if (pcre_callout != NULL)
960     {
961     pcre_callout_block cb;
962     cb.version = 1; /* Version 1 of the callout block */
963     cb.callout_number = ecode[1];
964     cb.offset_vector = md->offset_vector;
965 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
966 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
967 ph10 168 cb.start_match = mstart - md->start_subject;
968 nigel 77 cb.current_position = eptr - md->start_subject;
969     cb.pattern_position = GET(ecode, 2);
970     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971     cb.capture_top = offset_top/2;
972     cb.capture_last = md->capture_last;
973     cb.callout_data = md->callout_data;
974     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975     if (rrc < 0) RRETURN(rrc);
976     }
977     ecode += 2 + 2*LINK_SIZE;
978     break;
979    
980     /* Recursion either matches the current regex, or some subexpression. The
981     offset data is the offset to the starting bracket from the start of the
982     whole pattern. (This is so that it works from duplicated subpatterns.)
983    
984     If there are any capturing brackets started but not finished, we have to
985     save their starting points and reinstate them after the recursion. However,
986     we don't know how many such there are (offset_top records the completed
987     total) so we just have to save all the potential data. There may be up to
988     65535 such values, which is too large to put on the stack, but using malloc
989     for small numbers seems expensive. As a compromise, the stack is used when
990     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991     is used. If the malloc fails, the saved values cannot be kept anywhere, so
992     the match is abandoned: PCRE_ERROR_NOMEMORY is returned via RRETURN and no
993     partial save on the stack is attempted.
994    
995     There are also other values that have to be saved. We use a chained
996     sequence of blocks that actually live on the stack. Thanks to Robin Houston
997     for the original version of this logic. */
998    
999     case OP_RECURSE:
1000     {
1001     callpat = md->start_code + GET(ecode, 1);
1002 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003     GET2(callpat, 1 + LINK_SIZE);
1004 nigel 77
1005     /* Add to "recursing stack" */
1006    
1007     new_recursive.prevrec = md->recursive;
1008     md->recursive = &new_recursive;
1009    
1010     /* Find where to continue from afterwards */
1011    
1012     ecode += 1 + LINK_SIZE;
1013     new_recursive.after_call = ecode;
1014    
1015     /* Now save the offset data. */
1016    
1017     new_recursive.saved_max = md->offset_end;
1018     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019     new_recursive.offset_save = stacksave;
1020     else
1021     {
1022     new_recursive.offset_save =
1023     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025     }
1026    
1027     memcpy(new_recursive.offset_save, md->offset_vector,
1028     new_recursive.saved_max * sizeof(int));
1029 ph10 168 new_recursive.save_start = mstart;
1030     mstart = eptr;
1031 nigel 77
1032     /* OK, now we can do the recursion. For each top-level alternative we
1033     restore the offset and recursion data. */
1034    
1035     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 nigel 77 do
1038     {
1039 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040     md, ims, eptrb, flags, RM6);
1041 nigel 77 if (rrc == MATCH_MATCH)
1042     {
1043 nigel 87 DPRINTF(("Recursion matched\n"));
1044 nigel 77 md->recursive = new_recursive.prevrec;
1045     if (new_recursive.offset_save != stacksave)
1046     (pcre_free)(new_recursive.offset_save);
1047     RRETURN(MATCH_MATCH);
1048     }
1049 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 nigel 87 {
1051     DPRINTF(("Recursion gave error %d\n", rrc));
1052     RRETURN(rrc);
1053     }
1054 nigel 77
1055     md->recursive = &new_recursive;
1056     memcpy(md->offset_vector, new_recursive.offset_save,
1057     new_recursive.saved_max * sizeof(int));
1058     callpat += GET(callpat, 1);
1059     }
1060     while (*callpat == OP_ALT);
1061    
1062     DPRINTF(("Recursion didn't match\n"));
1063     md->recursive = new_recursive.prevrec;
1064     if (new_recursive.offset_save != stacksave)
1065     (pcre_free)(new_recursive.offset_save);
1066     RRETURN(MATCH_NOMATCH);
1067     }
1068     /* Control never reaches here */
1069    
1070     /* "Once" brackets are like assertion brackets except that after a match,
1071     the point in the subject string is not moved back. Thus there can never be
1072     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073     Check the alternative branches in turn - the matching won't pass the KET
1074     for this kind of subpattern. If any one branch matches, we carry on as at
1075     the end of a normal bracket, leaving the subject pointer. */
1076    
1077     case OP_ONCE:
1078 nigel 91 prev = ecode;
1079     saved_eptr = eptr;
1080    
1081     do
1082 nigel 77 {
1083 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 nigel 91 if (rrc == MATCH_MATCH) break;
1085 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 nigel 91 ecode += GET(ecode,1);
1087     }
1088     while (*ecode == OP_ALT);
1089 nigel 77
1090 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1091 nigel 77
1092 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093 nigel 77
1094 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1095     mark, since extracts may have been taken. */
1096 nigel 77
1097 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098 nigel 77
1099 nigel 91 offset_top = md->end_offset_top;
1100     eptr = md->end_match_ptr;
1101 nigel 77
1102 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1103     happens for a repeating ket if no characters were matched in the group.
1104     This is the forcible breaking of infinite loops as implemented in Perl
1105     5.005. If there is an options reset, it will get obeyed in the normal
1106     course of events. */
1107 nigel 77
1108 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1109     {
1110     ecode += 1+LINK_SIZE;
1111     break;
1112     }
1113 nigel 77
1114 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1115     preceding bracket, in the appropriate order. The second "call" of match()
1116     uses tail recursion, to avoid using another stack frame. We need to reset
1117     any options that changed within the bracket before re-running it, so
1118     check the next opcode. */
1119 nigel 77
1120 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1121     {
1122     ims = (ims & ~PCRE_IMS) | ecode[4];
1123     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124     }
1125 nigel 77
1126 nigel 91 if (*ecode == OP_KETRMIN)
1127     {
1128 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130     ecode = prev;
1131 ph10 197 flags = 0;
1132 nigel 91 goto TAIL_RECURSE;
1133 nigel 77 }
1134 nigel 91 else /* OP_KETRMAX */
1135     {
1136 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138     ecode += 1 + LINK_SIZE;
1139 ph10 197 flags = 0;
1140 nigel 91 goto TAIL_RECURSE;
1141     }
1142     /* Control never gets here */
1143 nigel 77
1144     /* An alternation is the end of a branch; scan along to find the end of the
1145     bracketed group and go to there. */
1146    
1147     case OP_ALT:
1148     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149     break;
1150    
1151     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
1152     that it may occur zero times. It may repeat infinitely, or not at all -
1153     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
1154     repeat limits are compiled as a number of copies, with the optional ones
1155     preceded by BRAZERO or BRAMINZERO. */
1156    
1157     case OP_BRAZERO:
1158     {
1159     next = ecode+1;
1160 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162     do next += GET(next,1); while (*next == OP_ALT);
1163 nigel 93 ecode = next + 1 + LINK_SIZE;
1164 nigel 77 }
1165     break;
1166    
1167     case OP_BRAMINZERO:
1168     {
1169     next = ecode+1;
1170 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1171 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173     ecode++;
1174     }
1175     break;
1176    
1177 nigel 93 /* End of a group, repeated or non-repeating. */
1178 nigel 77
1179     case OP_KET:
1180     case OP_KETRMIN:
1181     case OP_KETRMAX:
1182 nigel 91 prev = ecode - GET(ecode, 1);
1183 nigel 77
1184 nigel 93 /* If this was a group that remembered the subject start, in order to break
1185     infinite repeats of empty string matches, retrieve the subject start from
1186     the chain. Otherwise, set it NULL. */
1187 nigel 77
1188 nigel 93 if (*prev >= OP_SBRA)
1189     {
1190     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1191     eptrb = eptrb->epb_prev; /* Backup to previous group */
1192     }
1193     else saved_eptr = NULL;
1194 nigel 77
1195 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1196     MATCH_MATCH, but record the current high water mark for use by positive
1197     assertions. Do this also for the "once" (atomic) groups. */
1198    
1199 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1200     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1201     *prev == OP_ONCE)
1202     {
1203     md->end_match_ptr = eptr; /* For ONCE */
1204     md->end_offset_top = offset_top;
1205     RRETURN(MATCH_MATCH);
1206     }
1207 nigel 77
1208 nigel 93 /* For capturing groups we have to check the group number back at the start
1209     and if necessary complete handling an extraction by setting the offsets and
1210     bumping the high water mark. Note that whole-pattern recursion is coded as
1211     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1212     when the OP_END is reached. Other recursion is handled here. */
1213 nigel 77
1214 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1215 nigel 91 {
1216 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1217 nigel 91 offset = number << 1;
1218 nigel 77
1219     #ifdef DEBUG
1220 nigel 91 printf("end bracket %d", number);
1221     printf("\n");
1222 nigel 77 #endif
1223    
1224 nigel 93 md->capture_last = number;
1225     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1226 nigel 91 {
1227 nigel 93 md->offset_vector[offset] =
1228     md->offset_vector[md->offset_end - number];
1229     md->offset_vector[offset+1] = eptr - md->start_subject;
1230     if (offset_top <= offset) offset_top = offset + 2;
1231     }
1232 nigel 77
1233 nigel 93 /* Handle a recursively called group. Restore the offsets
1234     appropriately and continue from after the call. */
1235 nigel 77
1236 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1237     {
1238     recursion_info *rec = md->recursive;
1239     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1240     md->recursive = rec->prevrec;
1241 ph10 168 mstart = rec->save_start;
1242 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1243     rec->saved_max * sizeof(int));
1244     ecode = rec->after_call;
1245     ims = original_ims;
1246     break;
1247 nigel 77 }
1248 nigel 91 }
1249 nigel 77
1250 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1251     flags, in case they got changed during the group. */
1252 nigel 77
1253 nigel 91 ims = original_ims;
1254     DPRINTF(("ims reset to %02lx\n", ims));
1255 nigel 77
1256 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1257     happens for a repeating ket if no characters were matched in the group.
1258     This is the forcible breaking of infinite loops as implemented in Perl
1259     5.005. If there is an options reset, it will get obeyed in the normal
1260     course of events. */
1261 nigel 77
1262 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1263     {
1264     ecode += 1 + LINK_SIZE;
1265     break;
1266     }
1267 nigel 77
1268 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1269     preceding bracket, in the appropriate order. In the second case, we can use
1270 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1271     unlimited repeat of a group that can match an empty string. */
1272 nigel 77
1273 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1274    
1275 nigel 91 if (*ecode == OP_KETRMIN)
1276     {
1277 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1278 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1279 ph10 197 if (flags != 0) /* Could match an empty string */
1280     {
1281     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1282     RRETURN(rrc);
1283     }
1284 nigel 91 ecode = prev;
1285     goto TAIL_RECURSE;
1286 nigel 77 }
1287 nigel 91 else /* OP_KETRMAX */
1288     {
1289 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1290 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1291     ecode += 1 + LINK_SIZE;
1292 ph10 197 flags = 0;
1293 nigel 91 goto TAIL_RECURSE;
1294     }
1295     /* Control never gets here */
1296 nigel 77
1297     /* Start of subject unless notbol, or after internal newline if multiline */
1298    
1299     case OP_CIRC:
1300     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1301     if ((ims & PCRE_MULTILINE) != 0)
1302     {
1303 nigel 91 if (eptr != md->start_subject &&
1304 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1305 nigel 77 RRETURN(MATCH_NOMATCH);
1306     ecode++;
1307     break;
1308     }
1309     /* ... else fall through */
1310    
1311     /* Start of subject assertion */
1312    
1313     case OP_SOD:
1314     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1315     ecode++;
1316     break;
1317    
1318     /* Start of match assertion */
1319    
1320     case OP_SOM:
1321     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1322     ecode++;
1323     break;
1324 ph10 172
1325 ph10 168 /* Reset the start of match point */
1326 ph10 172
1327 ph10 168 case OP_SET_SOM:
1328     mstart = eptr;
1329 ph10 172 ecode++;
1330     break;
1331 nigel 77
1332     /* Assert before internal newline if multiline, or before a terminating
1333     newline unless endonly is set, else end of subject unless noteol is set. */
1334    
1335     case OP_DOLL:
1336     if ((ims & PCRE_MULTILINE) != 0)
1337     {
1338     if (eptr < md->end_subject)
1339 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1340 nigel 77 else
1341     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1342     ecode++;
1343     break;
1344     }
1345     else
1346     {
1347     if (md->noteol) RRETURN(MATCH_NOMATCH);
1348     if (!md->endonly)
1349     {
1350 nigel 91 if (eptr != md->end_subject &&
1351 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1352 nigel 77 RRETURN(MATCH_NOMATCH);
1353     ecode++;
1354     break;
1355     }
1356     }
1357 nigel 91 /* ... else fall through for endonly */
1358 nigel 77
1359     /* End of subject assertion (\z) */
1360    
1361     case OP_EOD:
1362     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1363     ecode++;
1364     break;
1365    
1366     /* End of subject or ending \n assertion (\Z) */
1367    
1368     case OP_EODN:
1369 nigel 91 if (eptr != md->end_subject &&
1370 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1371 nigel 91 RRETURN(MATCH_NOMATCH);
1372 nigel 77 ecode++;
1373     break;
1374    
1375     /* Word boundary assertions */
1376    
1377     case OP_NOT_WORD_BOUNDARY:
1378     case OP_WORD_BOUNDARY:
1379     {
1380    
1381     /* Find out if the previous and current characters are "word" characters.
1382     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1383     be "non-word" characters. */
1384    
1385     #ifdef SUPPORT_UTF8
1386     if (utf8)
1387     {
1388     if (eptr == md->start_subject) prev_is_word = FALSE; else
1389     {
1390     const uschar *lastptr = eptr - 1;
1391     while((*lastptr & 0xc0) == 0x80) lastptr--;
1392     GETCHAR(c, lastptr);
1393     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1394     }
1395     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1396     {
1397     GETCHAR(c, eptr);
1398     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1399     }
1400     }
1401     else
1402     #endif
1403    
1404     /* More streamlined when not in UTF-8 mode */
1405    
1406     {
1407     prev_is_word = (eptr != md->start_subject) &&
1408     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1409     cur_is_word = (eptr < md->end_subject) &&
1410     ((md->ctypes[*eptr] & ctype_word) != 0);
1411     }
1412    
1413     /* Now see if the situation is what we want */
1414    
1415     if ((*ecode++ == OP_WORD_BOUNDARY)?
1416     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1417     RRETURN(MATCH_NOMATCH);
1418     }
1419     break;
1420    
1421     /* Match a single character type; inline for speed */
1422    
1423     case OP_ANY:
1424 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1425     {
1426 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1427 nigel 91 }
1428 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1429     if (utf8)
1430     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1431     ecode++;
1432     break;
1433    
1434     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1435     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1436    
1437     case OP_ANYBYTE:
1438     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1439     ecode++;
1440     break;
1441    
1442     case OP_NOT_DIGIT:
1443     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1444     GETCHARINCTEST(c, eptr);
1445     if (
1446     #ifdef SUPPORT_UTF8
1447     c < 256 &&
1448     #endif
1449     (md->ctypes[c] & ctype_digit) != 0
1450     )
1451     RRETURN(MATCH_NOMATCH);
1452     ecode++;
1453     break;
1454    
1455     case OP_DIGIT:
1456     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1457     GETCHARINCTEST(c, eptr);
1458     if (
1459     #ifdef SUPPORT_UTF8
1460     c >= 256 ||
1461     #endif
1462     (md->ctypes[c] & ctype_digit) == 0
1463     )
1464     RRETURN(MATCH_NOMATCH);
1465     ecode++;
1466     break;
1467    
1468     case OP_NOT_WHITESPACE:
1469     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1470     GETCHARINCTEST(c, eptr);
1471     if (
1472     #ifdef SUPPORT_UTF8
1473     c < 256 &&
1474     #endif
1475     (md->ctypes[c] & ctype_space) != 0
1476     )
1477     RRETURN(MATCH_NOMATCH);
1478     ecode++;
1479     break;
1480    
1481     case OP_WHITESPACE:
1482     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1483     GETCHARINCTEST(c, eptr);
1484     if (
1485     #ifdef SUPPORT_UTF8
1486     c >= 256 ||
1487     #endif
1488     (md->ctypes[c] & ctype_space) == 0
1489     )
1490     RRETURN(MATCH_NOMATCH);
1491     ecode++;
1492     break;
1493    
1494     case OP_NOT_WORDCHAR:
1495     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1496     GETCHARINCTEST(c, eptr);
1497     if (
1498     #ifdef SUPPORT_UTF8
1499     c < 256 &&
1500     #endif
1501     (md->ctypes[c] & ctype_word) != 0
1502     )
1503     RRETURN(MATCH_NOMATCH);
1504     ecode++;
1505     break;
1506    
1507     case OP_WORDCHAR:
1508     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1509     GETCHARINCTEST(c, eptr);
1510     if (
1511     #ifdef SUPPORT_UTF8
1512     c >= 256 ||
1513     #endif
1514     (md->ctypes[c] & ctype_word) == 0
1515     )
1516     RRETURN(MATCH_NOMATCH);
1517     ecode++;
1518     break;
1519    
1520 nigel 93 case OP_ANYNL:
1521     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1522     GETCHARINCTEST(c, eptr);
1523     switch(c)
1524     {
1525     default: RRETURN(MATCH_NOMATCH);
1526     case 0x000d:
1527     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1528     break;
1529 ph10 231
1530 nigel 93 case 0x000a:
1531 ph10 231 break;
1532    
1533 nigel 93 case 0x000b:
1534     case 0x000c:
1535     case 0x0085:
1536     case 0x2028:
1537     case 0x2029:
1538 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1539 nigel 93 break;
1540     }
1541     ecode++;
1542     break;
1543    
1544 ph10 178 case OP_NOT_HSPACE:
1545     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1546     GETCHARINCTEST(c, eptr);
1547     switch(c)
1548     {
1549     default: break;
1550     case 0x09: /* HT */
1551     case 0x20: /* SPACE */
1552     case 0xa0: /* NBSP */
1553     case 0x1680: /* OGHAM SPACE MARK */
1554     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1555     case 0x2000: /* EN QUAD */
1556     case 0x2001: /* EM QUAD */
1557     case 0x2002: /* EN SPACE */
1558     case 0x2003: /* EM SPACE */
1559     case 0x2004: /* THREE-PER-EM SPACE */
1560     case 0x2005: /* FOUR-PER-EM SPACE */
1561     case 0x2006: /* SIX-PER-EM SPACE */
1562     case 0x2007: /* FIGURE SPACE */
1563     case 0x2008: /* PUNCTUATION SPACE */
1564     case 0x2009: /* THIN SPACE */
1565     case 0x200A: /* HAIR SPACE */
1566     case 0x202f: /* NARROW NO-BREAK SPACE */
1567     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1568     case 0x3000: /* IDEOGRAPHIC SPACE */
1569     RRETURN(MATCH_NOMATCH);
1570     }
1571     ecode++;
1572     break;
1573    
1574     case OP_HSPACE:
1575     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1576     GETCHARINCTEST(c, eptr);
1577     switch(c)
1578     {
1579     default: RRETURN(MATCH_NOMATCH);
1580     case 0x09: /* HT */
1581     case 0x20: /* SPACE */
1582     case 0xa0: /* NBSP */
1583     case 0x1680: /* OGHAM SPACE MARK */
1584     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1585     case 0x2000: /* EN QUAD */
1586     case 0x2001: /* EM QUAD */
1587     case 0x2002: /* EN SPACE */
1588     case 0x2003: /* EM SPACE */
1589     case 0x2004: /* THREE-PER-EM SPACE */
1590     case 0x2005: /* FOUR-PER-EM SPACE */
1591     case 0x2006: /* SIX-PER-EM SPACE */
1592     case 0x2007: /* FIGURE SPACE */
1593     case 0x2008: /* PUNCTUATION SPACE */
1594     case 0x2009: /* THIN SPACE */
1595     case 0x200A: /* HAIR SPACE */
1596     case 0x202f: /* NARROW NO-BREAK SPACE */
1597     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1598     case 0x3000: /* IDEOGRAPHIC SPACE */
1599     break;
1600     }
1601     ecode++;
1602     break;
1603    
1604     case OP_NOT_VSPACE:
1605     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1606     GETCHARINCTEST(c, eptr);
1607     switch(c)
1608     {
1609     default: break;
1610     case 0x0a: /* LF */
1611     case 0x0b: /* VT */
1612     case 0x0c: /* FF */
1613     case 0x0d: /* CR */
1614     case 0x85: /* NEL */
1615     case 0x2028: /* LINE SEPARATOR */
1616     case 0x2029: /* PARAGRAPH SEPARATOR */
1617     RRETURN(MATCH_NOMATCH);
1618     }
1619     ecode++;
1620     break;
1621    
1622     case OP_VSPACE:
1623     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1624     GETCHARINCTEST(c, eptr);
1625     switch(c)
1626     {
1627     default: RRETURN(MATCH_NOMATCH);
1628     case 0x0a: /* LF */
1629     case 0x0b: /* VT */
1630     case 0x0c: /* FF */
1631     case 0x0d: /* CR */
1632     case 0x85: /* NEL */
1633     case 0x2028: /* LINE SEPARATOR */
1634     case 0x2029: /* PARAGRAPH SEPARATOR */
1635     break;
1636     }
1637     ecode++;
1638     break;
1639    
1640 nigel 77 #ifdef SUPPORT_UCP
1641     /* Check the next character by Unicode property. We will get here only
1642     if the support is in the binary; otherwise a compile-time error occurs. */
1643    
1644     case OP_PROP:
1645     case OP_NOTPROP:
1646     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1647     GETCHARINCTEST(c, eptr);
1648     {
1649 nigel 87 int chartype, script;
1650     int category = _pcre_ucp_findprop(c, &chartype, &script);
1651 nigel 77
1652 nigel 87 switch(ecode[1])
1653     {
1654     case PT_ANY:
1655     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1656     break;
1657 nigel 77
1658 nigel 87 case PT_LAMP:
1659     if ((chartype == ucp_Lu ||
1660     chartype == ucp_Ll ||
1661     chartype == ucp_Lt) == (op == OP_NOTPROP))
1662 nigel 77 RRETURN(MATCH_NOMATCH);
1663 nigel 87 break;
1664    
1665     case PT_GC:
1666     if ((ecode[2] != category) == (op == OP_PROP))
1667 nigel 77 RRETURN(MATCH_NOMATCH);
1668 nigel 87 break;
1669    
1670     case PT_PC:
1671     if ((ecode[2] != chartype) == (op == OP_PROP))
1672     RRETURN(MATCH_NOMATCH);
1673     break;
1674    
1675     case PT_SC:
1676     if ((ecode[2] != script) == (op == OP_PROP))
1677     RRETURN(MATCH_NOMATCH);
1678     break;
1679    
1680     default:
1681     RRETURN(PCRE_ERROR_INTERNAL);
1682 nigel 77 }
1683 nigel 87
1684     ecode += 3;
1685 nigel 77 }
1686     break;
1687    
1688     /* Match an extended Unicode sequence. We will get here only if the support
1689     is in the binary; otherwise a compile-time error occurs. */
1690    
1691     case OP_EXTUNI:
1692     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1693     GETCHARINCTEST(c, eptr);
1694     {
1695 nigel 87 int chartype, script;
1696     int category = _pcre_ucp_findprop(c, &chartype, &script);
1697 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1698     while (eptr < md->end_subject)
1699     {
1700     int len = 1;
1701     if (!utf8) c = *eptr; else
1702     {
1703     GETCHARLEN(c, eptr, len);
1704     }
1705 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1706 nigel 77 if (category != ucp_M) break;
1707     eptr += len;
1708     }
1709     }
1710     ecode++;
1711     break;
1712     #endif
1713    
1714    
1715     /* Match a back reference, possibly repeatedly. Look past the end of the
1716     item to see if there is repeat information following. The code is similar
1717     to that for character classes, but repeated for efficiency. Then obey
1718     similar code to character type repeats - written out again for speed.
1719     However, if the referenced string is the empty string, always treat
1720     it as matched, any number of times (otherwise there could be infinite
1721     loops). */
1722    
1723     case OP_REF:
1724     {
1725     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1726     ecode += 3; /* Advance past item */
1727    
1728     /* If the reference is unset, set the length to be longer than the amount
1729     of subject left; this ensures that every attempt at a match fails. We
1730     can't just fail here, because of the possibility of quantifiers with zero
1731     minima. */
1732    
1733     length = (offset >= offset_top || md->offset_vector[offset] < 0)?
1734     md->end_subject - eptr + 1 :
1735     md->offset_vector[offset+1] - md->offset_vector[offset];
1736    
1737     /* Set up for repetition, or handle the non-repeated case */
1738    
1739     switch (*ecode)
1740     {
1741     case OP_CRSTAR:
1742     case OP_CRMINSTAR:
1743     case OP_CRPLUS:
1744     case OP_CRMINPLUS:
1745     case OP_CRQUERY:
1746     case OP_CRMINQUERY:
1747     c = *ecode++ - OP_CRSTAR;
1748     minimize = (c & 1) != 0;
1749     min = rep_min[c]; /* Pick up values from tables; */
1750     max = rep_max[c]; /* zero for max => infinity */
1751     if (max == 0) max = INT_MAX;
1752     break;
1753    
1754     case OP_CRRANGE:
1755     case OP_CRMINRANGE:
1756     minimize = (*ecode == OP_CRMINRANGE);
1757     min = GET2(ecode, 1);
1758     max = GET2(ecode, 3);
1759     if (max == 0) max = INT_MAX;
1760     ecode += 5;
1761     break;
1762    
1763     default: /* No repeat follows */
1764     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1765     eptr += length;
1766     continue; /* With the main loop */
1767     }
1768    
1769     /* If the length of the reference is zero, just continue with the
1770     main loop. */
1771    
1772     if (length == 0) continue;
1773    
1774     /* First, ensure the minimum number of matches are present. We get back
1775     the length of the reference string explicitly rather than passing the
1776     address of eptr, so that eptr can be a register variable. */
1777    
1778     for (i = 1; i <= min; i++)
1779     {
1780     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1781     eptr += length;
1782     }
1783    
1784     /* If min = max, continue at the same level without recursion.
1785     They are not both allowed to be zero. */
1786    
1787     if (min == max) continue;
1788    
1789     /* If minimizing, keep trying and advancing the pointer */
1790    
1791     if (minimize)
1792     {
1793     for (fi = min;; fi++)
1794     {
1795 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1796 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1797     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1798     RRETURN(MATCH_NOMATCH);
1799     eptr += length;
1800     }
1801     /* Control never gets here */
1802     }
1803    
1804     /* If maximizing, find the longest string and work backwards */
1805    
1806     else
1807     {
1808     pp = eptr;
1809     for (i = min; i < max; i++)
1810     {
1811     if (!match_ref(offset, eptr, length, md, ims)) break;
1812     eptr += length;
1813     }
1814     while (eptr >= pp)
1815     {
1816 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1817 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1818     eptr -= length;
1819     }
1820     RRETURN(MATCH_NOMATCH);
1821     }
1822     }
1823     /* Control never gets here */
1824    
1825    
1826    
1827     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1828     used when all the characters in the class have values in the range 0-255,
1829     and either the matching is caseful, or the characters are in the range
1830     0-127 when UTF-8 processing is enabled. The only difference between
1831     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1832     encountered.
1833    
1834     First, look past the end of the item to see if there is repeat information
1835     following. Then obey similar code to character type repeats - written out
1836     again for speed. */
1837    
1838     case OP_NCLASS:
1839     case OP_CLASS:
1840     {
1841     data = ecode + 1; /* Save for matching */
1842     ecode += 33; /* Advance past the item */
1843    
1844     switch (*ecode)
1845     {
1846     case OP_CRSTAR:
1847     case OP_CRMINSTAR:
1848     case OP_CRPLUS:
1849     case OP_CRMINPLUS:
1850     case OP_CRQUERY:
1851     case OP_CRMINQUERY:
1852     c = *ecode++ - OP_CRSTAR;
1853     minimize = (c & 1) != 0;
1854     min = rep_min[c]; /* Pick up values from tables; */
1855     max = rep_max[c]; /* zero for max => infinity */
1856     if (max == 0) max = INT_MAX;
1857     break;
1858    
1859     case OP_CRRANGE:
1860     case OP_CRMINRANGE:
1861     minimize = (*ecode == OP_CRMINRANGE);
1862     min = GET2(ecode, 1);
1863     max = GET2(ecode, 3);
1864     if (max == 0) max = INT_MAX;
1865     ecode += 5;
1866     break;
1867    
1868     default: /* No repeat follows */
1869     min = max = 1;
1870     break;
1871     }
1872    
1873     /* First, ensure the minimum number of matches are present. */
1874    
1875     #ifdef SUPPORT_UTF8
1876     /* UTF-8 mode */
1877     if (utf8)
1878     {
1879     for (i = 1; i <= min; i++)
1880     {
1881     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1882     GETCHARINC(c, eptr);
1883     if (c > 255)
1884     {
1885     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1886     }
1887     else
1888     {
1889     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1890     }
1891     }
1892     }
1893     else
1894     #endif
1895     /* Not UTF-8 mode */
1896     {
1897     for (i = 1; i <= min; i++)
1898     {
1899     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1900     c = *eptr++;
1901     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1902     }
1903     }
1904    
1905     /* If max == min we can continue with the main loop without the
1906     need to recurse. */
1907    
1908     if (min == max) continue;
1909    
1910     /* If minimizing, keep testing the rest of the expression and advancing
1911     the pointer while it matches the class. */
1912    
1913     if (minimize)
1914     {
1915     #ifdef SUPPORT_UTF8
1916     /* UTF-8 mode */
1917     if (utf8)
1918     {
1919     for (fi = min;; fi++)
1920     {
1921 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1922 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1923     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1924     GETCHARINC(c, eptr);
1925     if (c > 255)
1926     {
1927     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1928     }
1929     else
1930     {
1931     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1932     }
1933     }
1934     }
1935     else
1936     #endif
1937     /* Not UTF-8 mode */
1938     {
1939     for (fi = min;; fi++)
1940     {
1941 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1942 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1943     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1944     c = *eptr++;
1945     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1946     }
1947     }
1948     /* Control never gets here */
1949     }
1950    
1951     /* If maximizing, find the longest possible run, then work backwards. */
1952    
1953     else
1954     {
1955     pp = eptr;
1956    
1957     #ifdef SUPPORT_UTF8
1958     /* UTF-8 mode */
1959     if (utf8)
1960     {
1961     for (i = min; i < max; i++)
1962     {
1963     int len = 1;
1964     if (eptr >= md->end_subject) break;
1965     GETCHARLEN(c, eptr, len);
1966     if (c > 255)
1967     {
1968     if (op == OP_CLASS) break;
1969     }
1970     else
1971     {
1972     if ((data[c/8] & (1 << (c&7))) == 0) break;
1973     }
1974     eptr += len;
1975     }
1976     for (;;)
1977     {
1978 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1979 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1980     if (eptr-- == pp) break; /* Stop if tried at original pos */
1981     BACKCHAR(eptr);
1982     }
1983     }
1984     else
1985     #endif
1986     /* Not UTF-8 mode */
1987     {
1988     for (i = min; i < max; i++)
1989     {
1990     if (eptr >= md->end_subject) break;
1991     c = *eptr;
1992     if ((data[c/8] & (1 << (c&7))) == 0) break;
1993     eptr++;
1994     }
1995     while (eptr >= pp)
1996     {
1997 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
1998 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1999 nigel 77 eptr--;
2000     }
2001     }
2002    
2003     RRETURN(MATCH_NOMATCH);
2004     }
2005     }
2006     /* Control never gets here */
2007    
2008    
2009     /* Match an extended character class. This opcode is encountered only
2010     in UTF-8 mode, because that's the only time it is compiled. */
2011    
2012     #ifdef SUPPORT_UTF8
2013     case OP_XCLASS:
2014     {
2015     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2016     ecode += GET(ecode, 1); /* Advance past the item */
2017    
2018     switch (*ecode)
2019     {
2020     case OP_CRSTAR:
2021     case OP_CRMINSTAR:
2022     case OP_CRPLUS:
2023     case OP_CRMINPLUS:
2024     case OP_CRQUERY:
2025     case OP_CRMINQUERY:
2026     c = *ecode++ - OP_CRSTAR;
2027     minimize = (c & 1) != 0;
2028     min = rep_min[c]; /* Pick up values from tables; */
2029     max = rep_max[c]; /* zero for max => infinity */
2030     if (max == 0) max = INT_MAX;
2031     break;
2032    
2033     case OP_CRRANGE:
2034     case OP_CRMINRANGE:
2035     minimize = (*ecode == OP_CRMINRANGE);
2036     min = GET2(ecode, 1);
2037     max = GET2(ecode, 3);
2038     if (max == 0) max = INT_MAX;
2039     ecode += 5;
2040     break;
2041    
2042     default: /* No repeat follows */
2043     min = max = 1;
2044     break;
2045     }
2046    
2047     /* First, ensure the minimum number of matches are present. */
2048    
2049     for (i = 1; i <= min; i++)
2050     {
2051     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2052     GETCHARINC(c, eptr);
2053     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2054     }
2055    
2056     /* If max == min we can continue with the main loop without the
2057     need to recurse. */
2058    
2059     if (min == max) continue;
2060    
2061     /* If minimizing, keep testing the rest of the expression and advancing
2062     the pointer while it matches the class. */
2063    
2064     if (minimize)
2065     {
2066     for (fi = min;; fi++)
2067     {
2068 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2069 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2070     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2071     GETCHARINC(c, eptr);
2072     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2073     }
2074     /* Control never gets here */
2075     }
2076    
2077     /* If maximizing, find the longest possible run, then work backwards. */
2078    
2079     else
2080     {
2081     pp = eptr;
2082     for (i = min; i < max; i++)
2083     {
2084     int len = 1;
2085     if (eptr >= md->end_subject) break;
2086     GETCHARLEN(c, eptr, len);
2087     if (!_pcre_xclass(c, data)) break;
2088     eptr += len;
2089     }
2090     for(;;)
2091     {
2092 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2093 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2094     if (eptr-- == pp) break; /* Stop if tried at original pos */
2095 ph10 214 if (utf8) BACKCHAR(eptr);
2096 nigel 77 }
2097     RRETURN(MATCH_NOMATCH);
2098     }
2099    
2100     /* Control never gets here */
2101     }
2102     #endif /* End of XCLASS */
2103    
2104     /* Match a single character, casefully */
2105    
2106     case OP_CHAR:
2107     #ifdef SUPPORT_UTF8
2108     if (utf8)
2109     {
2110     length = 1;
2111     ecode++;
2112     GETCHARLEN(fc, ecode, length);
2113     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2114     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2115     }
2116     else
2117     #endif
2118    
2119     /* Non-UTF-8 mode */
2120     {
2121     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2122     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2123     ecode += 2;
2124     }
2125     break;
2126    
2127     /* Match a single character, caselessly */
2128    
2129     case OP_CHARNC:
2130     #ifdef SUPPORT_UTF8
2131     if (utf8)
2132     {
2133     length = 1;
2134     ecode++;
2135     GETCHARLEN(fc, ecode, length);
2136    
2137     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2138    
2139     /* If the pattern character's value is < 128, we have only one byte, and
2140     can use the fast lookup table. */
2141    
2142     if (fc < 128)
2143     {
2144     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2145     }
2146    
2147     /* Otherwise we must pick up the subject character */
2148    
2149     else
2150     {
2151 nigel 93 unsigned int dc;
2152 nigel 77 GETCHARINC(dc, eptr);
2153     ecode += length;
2154    
2155     /* If we have Unicode property support, we can use it to test the other
2156 nigel 87 case of the character, if there is one. */
2157 nigel 77
2158     if (fc != dc)
2159     {
2160     #ifdef SUPPORT_UCP
2161 nigel 87 if (dc != _pcre_ucp_othercase(fc))
2162 nigel 77 #endif
2163     RRETURN(MATCH_NOMATCH);
2164     }
2165     }
2166     }
2167     else
2168     #endif /* SUPPORT_UTF8 */
2169    
2170     /* Non-UTF-8 mode */
2171     {
2172     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2173     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2174     ecode += 2;
2175     }
2176     break;
2177    
2178 nigel 93 /* Match a single character repeatedly. */
2179 nigel 77
2180     case OP_EXACT:
2181     min = max = GET2(ecode, 1);
2182     ecode += 3;
2183     goto REPEATCHAR;
2184    
2185 nigel 93 case OP_POSUPTO:
2186     possessive = TRUE;
2187     /* Fall through */
2188    
2189 nigel 77 case OP_UPTO:
2190     case OP_MINUPTO:
2191     min = 0;
2192     max = GET2(ecode, 1);
2193     minimize = *ecode == OP_MINUPTO;
2194     ecode += 3;
2195     goto REPEATCHAR;
2196    
2197 nigel 93 case OP_POSSTAR:
2198     possessive = TRUE;
2199     min = 0;
2200     max = INT_MAX;
2201     ecode++;
2202     goto REPEATCHAR;
2203    
2204     case OP_POSPLUS:
2205     possessive = TRUE;
2206     min = 1;
2207     max = INT_MAX;
2208     ecode++;
2209     goto REPEATCHAR;
2210    
2211     case OP_POSQUERY:
2212     possessive = TRUE;
2213     min = 0;
2214     max = 1;
2215     ecode++;
2216     goto REPEATCHAR;
2217    
2218 nigel 77 case OP_STAR:
2219     case OP_MINSTAR:
2220     case OP_PLUS:
2221     case OP_MINPLUS:
2222     case OP_QUERY:
2223     case OP_MINQUERY:
2224     c = *ecode++ - OP_STAR;
2225     minimize = (c & 1) != 0;
2226     min = rep_min[c]; /* Pick up values from tables; */
2227     max = rep_max[c]; /* zero for max => infinity */
2228     if (max == 0) max = INT_MAX;
2229    
2230     /* Common code for all repeated single-character matches. We can give
2231     up quickly if there are fewer than the minimum number of characters left in
2232     the subject. */
2233    
2234     REPEATCHAR:
2235     #ifdef SUPPORT_UTF8
2236     if (utf8)
2237     {
2238     length = 1;
2239     charptr = ecode;
2240     GETCHARLEN(fc, ecode, length);
2241     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2242     ecode += length;
2243    
2244     /* Handle multibyte character matching specially here. There is
2245     support for caseless matching if UCP support is present. */
2246    
2247     if (length > 1)
2248     {
2249     #ifdef SUPPORT_UCP
2250 nigel 93 unsigned int othercase;
2251 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2252 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2253 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2254 ph10 115 else oclength = 0;
2255 nigel 77 #endif /* SUPPORT_UCP */
2256    
2257     for (i = 1; i <= min; i++)
2258     {
2259     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2260 ph10 123 #ifdef SUPPORT_UCP
2261 nigel 77 /* Need braces because of following else */
2262     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2263     else
2264     {
2265     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2266     eptr += oclength;
2267     }
2268 ph10 115 #else /* without SUPPORT_UCP */
2269     else { RRETURN(MATCH_NOMATCH); }
2270 ph10 123 #endif /* SUPPORT_UCP */
2271 nigel 77 }
2272    
2273     if (min == max) continue;
2274    
2275     if (minimize)
2276     {
2277     for (fi = min;; fi++)
2278     {
2279 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2280 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2281     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2282     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2283 ph10 123 #ifdef SUPPORT_UCP
2284 nigel 77 /* Need braces because of following else */
2285     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2286     else
2287     {
2288     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2289     eptr += oclength;
2290     }
2291 ph10 115 #else /* without SUPPORT_UCP */
2292     else { RRETURN (MATCH_NOMATCH); }
2293     #endif /* SUPPORT_UCP */
2294 nigel 77 }
2295     /* Control never gets here */
2296     }
2297 nigel 93
2298     else /* Maximize */
2299 nigel 77 {
2300     pp = eptr;
2301     for (i = min; i < max; i++)
2302     {
2303     if (eptr > md->end_subject - length) break;
2304     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2305 ph10 123 #ifdef SUPPORT_UCP
2306 nigel 77 else if (oclength == 0) break;
2307     else
2308     {
2309     if (memcmp(eptr, occhars, oclength) != 0) break;
2310     eptr += oclength;
2311     }
2312 ph10 115 #else /* without SUPPORT_UCP */
2313     else break;
2314 ph10 123 #endif /* SUPPORT_UCP */
2315 nigel 77 }
2316 nigel 93
2317     if (possessive) continue;
2318 ph10 120 for(;;)
2319 nigel 77 {
2320 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2321 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2322 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2323 ph10 115 #ifdef SUPPORT_UCP
2324     eptr--;
2325     BACKCHAR(eptr);
2326 ph10 123 #else /* without SUPPORT_UCP */
2327 nigel 77 eptr -= length;
2328 ph10 123 #endif /* SUPPORT_UCP */
2329 nigel 77 }
2330     }
2331     /* Control never gets here */
2332     }
2333    
2334     /* If the length of a UTF-8 character is 1, we fall through here, and
2335     obey the code as for non-UTF-8 characters below, though in this case the
2336     value of fc will always be < 128. */
2337     }
2338     else
2339     #endif /* SUPPORT_UTF8 */
2340    
2341     /* When not in UTF-8 mode, load a single-byte character. */
2342     {
2343     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2344     fc = *ecode++;
2345     }
2346    
2347     /* The value of fc at this point is always less than 256, though we may or
2348     may not be in UTF-8 mode. The code is duplicated for the caseless and
2349     caseful cases, for speed, since matching characters is likely to be quite
2350     common. First, ensure the minimum number of matches are present. If min =
2351     max, continue at the same level without recursing. Otherwise, if
2352     minimizing, keep trying the rest of the expression and advancing one
2353     matching character if failing, up to the maximum. Alternatively, if
2354     maximizing, find the maximum number of characters and work backwards. */
2355    
2356     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2357     max, eptr));
2358    
2359     if ((ims & PCRE_CASELESS) != 0)
2360     {
2361     fc = md->lcc[fc];
2362     for (i = 1; i <= min; i++)
2363     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2364     if (min == max) continue;
2365     if (minimize)
2366     {
2367     for (fi = min;; fi++)
2368     {
2369 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2370 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2371     if (fi >= max || eptr >= md->end_subject ||
2372     fc != md->lcc[*eptr++])
2373     RRETURN(MATCH_NOMATCH);
2374     }
2375     /* Control never gets here */
2376     }
2377 nigel 93 else /* Maximize */
2378 nigel 77 {
2379     pp = eptr;
2380     for (i = min; i < max; i++)
2381     {
2382     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2383     eptr++;
2384     }
2385 nigel 93 if (possessive) continue;
2386 nigel 77 while (eptr >= pp)
2387     {
2388 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2389 nigel 77 eptr--;
2390     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2391     }
2392     RRETURN(MATCH_NOMATCH);
2393     }
2394     /* Control never gets here */
2395     }
2396    
2397     /* Caseful comparisons (multi-byte UTF-8 characters never get here; they are handled above) */
2398    
2399     else
2400     {
2401     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2402     if (min == max) continue;
2403     if (minimize)
2404     {
2405     for (fi = min;; fi++)
2406     {
2407 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2408 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2409     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2410     RRETURN(MATCH_NOMATCH);
2411     }
2412     /* Control never gets here */
2413     }
2414 nigel 93 else /* Maximize */
2415 nigel 77 {
2416     pp = eptr;
2417     for (i = min; i < max; i++)
2418     {
2419     if (eptr >= md->end_subject || fc != *eptr) break;
2420     eptr++;
2421     }
2422 nigel 93 if (possessive) continue;
2423 nigel 77 while (eptr >= pp)
2424     {
2425 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2426 nigel 77 eptr--;
2427     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2428     }
2429     RRETURN(MATCH_NOMATCH);
2430     }
2431     }
2432     /* Control never gets here */
2433    
2434     /* Match a negated single one-byte character. The pattern character is
2435     always one byte, but the subject character we are checking can be multibyte. */
2436    
2437     case OP_NOT:
2438     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2439     ecode++;
2440     GETCHARINCTEST(c, eptr);
2441     if ((ims & PCRE_CASELESS) != 0)
2442     {
2443     #ifdef SUPPORT_UTF8
2444     if (c < 256)
2445     #endif
2446     c = md->lcc[c];
2447     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2448     }
2449     else
2450     {
2451     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2452     }
2453     break;
2454    
2455     /* Match a negated single one-byte character repeatedly. This is almost a
2456     repeat of the code for a repeated single character, but I haven't found a
2457     nice way of commoning these up that doesn't require a test of the
2458     positive/negative option for each character match. Maybe that wouldn't add
2459     very much to the time taken, but character matching *is* what this is all
2460     about... */
2461    
2462     case OP_NOTEXACT:
2463     min = max = GET2(ecode, 1);
2464     ecode += 3;
2465     goto REPEATNOTCHAR;
2466    
2467     case OP_NOTUPTO:
2468     case OP_NOTMINUPTO:
2469     min = 0;
2470     max = GET2(ecode, 1);
2471     minimize = *ecode == OP_NOTMINUPTO;
2472     ecode += 3;
2473     goto REPEATNOTCHAR;
2474    
2475 nigel 93 case OP_NOTPOSSTAR:
2476     possessive = TRUE;
2477     min = 0;
2478     max = INT_MAX;
2479     ecode++;
2480     goto REPEATNOTCHAR;
2481    
2482     case OP_NOTPOSPLUS:
2483     possessive = TRUE;
2484     min = 1;
2485     max = INT_MAX;
2486     ecode++;
2487     goto REPEATNOTCHAR;
2488    
2489     case OP_NOTPOSQUERY:
2490     possessive = TRUE;
2491     min = 0;
2492     max = 1;
2493     ecode++;
2494     goto REPEATNOTCHAR;
2495    
2496     case OP_NOTPOSUPTO:
2497     possessive = TRUE;
2498     min = 0;
2499     max = GET2(ecode, 1);
2500     ecode += 3;
2501     goto REPEATNOTCHAR;
2502    
2503 nigel 77 case OP_NOTSTAR:
2504     case OP_NOTMINSTAR:
2505     case OP_NOTPLUS:
2506     case OP_NOTMINPLUS:
2507     case OP_NOTQUERY:
2508     case OP_NOTMINQUERY:
2509     c = *ecode++ - OP_NOTSTAR;
2510     minimize = (c & 1) != 0;
2511     min = rep_min[c]; /* Pick up values from tables; */
2512     max = rep_max[c]; /* zero for max => infinity */
2513     if (max == 0) max = INT_MAX;
2514    
2515     /* Common code for all repeated single-byte matches. We can give up quickly
2516     if there are fewer than the minimum number of bytes left in the
2517     subject. */
2518    
2519     REPEATNOTCHAR:
2520     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2521     fc = *ecode++;
2522    
2523     /* The code is duplicated for the caseless and caseful cases, for speed,
2524     since matching characters is likely to be quite common. First, ensure the
2525     minimum number of matches are present. If min = max, continue at the same
2526     level without recursing. Otherwise, if minimizing, keep trying the rest of
2527     the expression and advancing one matching character if failing, up to the
2528     maximum. Alternatively, if maximizing, find the maximum number of
2529     characters and work backwards. */
2530    
2531     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2532     max, eptr));
2533    
2534     if ((ims & PCRE_CASELESS) != 0)
2535     {
2536     fc = md->lcc[fc];
2537    
2538     #ifdef SUPPORT_UTF8
2539     /* UTF-8 mode */
2540     if (utf8)
2541     {
2542 nigel 93 register unsigned int d;
2543 nigel 77 for (i = 1; i <= min; i++)
2544     {
2545     GETCHARINC(d, eptr);
2546     if (d < 256) d = md->lcc[d];
2547     if (fc == d) RRETURN(MATCH_NOMATCH);
2548     }
2549     }
2550     else
2551     #endif
2552    
2553     /* Not UTF-8 mode */
2554     {
2555     for (i = 1; i <= min; i++)
2556     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2557     }
2558    
2559     if (min == max) continue;
2560    
2561     if (minimize)
2562     {
2563     #ifdef SUPPORT_UTF8
2564     /* UTF-8 mode */
2565     if (utf8)
2566     {
2567 nigel 93 register unsigned int d;
2568 nigel 77 for (fi = min;; fi++)
2569     {
2570 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2571 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2572     GETCHARINC(d, eptr);
2573     if (d < 256) d = md->lcc[d];
2574     if (fi >= max || eptr >= md->end_subject || fc == d)
2575     RRETURN(MATCH_NOMATCH);
2576     }
2577     }
2578     else
2579     #endif
2580     /* Not UTF-8 mode */
2581     {
2582     for (fi = min;; fi++)
2583     {
2584 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2585 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2586     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2587     RRETURN(MATCH_NOMATCH);
2588     }
2589     }
2590     /* Control never gets here */
2591     }
2592    
2593     /* Maximize case */
2594    
2595     else
2596     {
2597     pp = eptr;
2598    
2599     #ifdef SUPPORT_UTF8
2600     /* UTF-8 mode */
2601     if (utf8)
2602     {
2603 nigel 93 register unsigned int d;
2604 nigel 77 for (i = min; i < max; i++)
2605     {
2606     int len = 1;
2607     if (eptr >= md->end_subject) break;
2608     GETCHARLEN(d, eptr, len);
2609     if (d < 256) d = md->lcc[d];
2610     if (fc == d) break;
2611     eptr += len;
2612     }
2613 nigel 93 if (possessive) continue;
2614     for(;;)
2615 nigel 77 {
2616 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2617 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2618     if (eptr-- == pp) break; /* Stop if tried at original pos */
2619     BACKCHAR(eptr);
2620     }
2621     }
2622     else
2623     #endif
2624     /* Not UTF-8 mode */
2625     {
2626     for (i = min; i < max; i++)
2627     {
2628     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2629     eptr++;
2630     }
2631 nigel 93 if (possessive) continue;
2632 nigel 77 while (eptr >= pp)
2633     {
2634 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2635 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2636     eptr--;
2637     }
2638     }
2639    
2640     RRETURN(MATCH_NOMATCH);
2641     }
2642     /* Control never gets here */
2643     }
2644    
2645     /* Caseful comparisons */
2646    
2647     else
2648     {
2649     #ifdef SUPPORT_UTF8
2650     /* UTF-8 mode */
2651     if (utf8)
2652     {
2653 nigel 93 register unsigned int d;
2654 nigel 77 for (i = 1; i <= min; i++)
2655     {
2656     GETCHARINC(d, eptr);
2657     if (fc == d) RRETURN(MATCH_NOMATCH);
2658     }
2659     }
2660     else
2661     #endif
2662     /* Not UTF-8 mode */
2663     {
2664     for (i = 1; i <= min; i++)
2665     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2666     }
2667    
2668     if (min == max) continue;
2669    
2670     if (minimize)
2671     {
2672     #ifdef SUPPORT_UTF8
2673     /* UTF-8 mode */
2674     if (utf8)
2675     {
2676 nigel 93 register unsigned int d;
2677 nigel 77 for (fi = min;; fi++)
2678     {
2679 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2680 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2681     GETCHARINC(d, eptr);
2682     if (fi >= max || eptr >= md->end_subject || fc == d)
2683     RRETURN(MATCH_NOMATCH);
2684     }
2685     }
2686     else
2687     #endif
2688     /* Not UTF-8 mode */
2689     {
2690     for (fi = min;; fi++)
2691     {
2692 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2693 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2694     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2695     RRETURN(MATCH_NOMATCH);
2696     }
2697     }
2698     /* Control never gets here */
2699     }
2700    
2701     /* Maximize case */
2702    
2703     else
2704     {
2705     pp = eptr;
2706    
2707     #ifdef SUPPORT_UTF8
2708     /* UTF-8 mode */
2709     if (utf8)
2710     {
2711 nigel 93 register unsigned int d;
2712 nigel 77 for (i = min; i < max; i++)
2713     {
2714     int len = 1;
2715     if (eptr >= md->end_subject) break;
2716     GETCHARLEN(d, eptr, len);
2717     if (fc == d) break;
2718     eptr += len;
2719     }
2720 nigel 93 if (possessive) continue;
2721 nigel 77 for(;;)
2722     {
2723 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2724 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2725     if (eptr-- == pp) break; /* Stop if tried at original pos */
2726     BACKCHAR(eptr);
2727     }
2728     }
2729     else
2730     #endif
2731     /* Not UTF-8 mode */
2732     {
2733     for (i = min; i < max; i++)
2734     {
2735     if (eptr >= md->end_subject || fc == *eptr) break;
2736     eptr++;
2737     }
2738 nigel 93 if (possessive) continue;
2739 nigel 77 while (eptr >= pp)
2740     {
2741 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2742 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2743     eptr--;
2744     }
2745     }
2746    
2747     RRETURN(MATCH_NOMATCH);
2748     }
2749     }
2750     /* Control never gets here */
2751    
2752     /* Match a single character type repeatedly; several different opcodes
2753     share code. This is very similar to the code for single characters, but we
2754     repeat it in the interests of efficiency. */
2755    
2756     case OP_TYPEEXACT:
2757     min = max = GET2(ecode, 1);
2758     minimize = TRUE;
2759     ecode += 3;
2760     goto REPEATTYPE;
2761    
2762     case OP_TYPEUPTO:
2763     case OP_TYPEMINUPTO:
2764     min = 0;
2765     max = GET2(ecode, 1);
2766     minimize = *ecode == OP_TYPEMINUPTO;
2767     ecode += 3;
2768     goto REPEATTYPE;
2769    
2770 nigel 93 case OP_TYPEPOSSTAR:
2771     possessive = TRUE;
2772     min = 0;
2773     max = INT_MAX;
2774     ecode++;
2775     goto REPEATTYPE;
2776    
2777     case OP_TYPEPOSPLUS:
2778     possessive = TRUE;
2779     min = 1;
2780     max = INT_MAX;
2781     ecode++;
2782     goto REPEATTYPE;
2783    
2784     case OP_TYPEPOSQUERY:
2785     possessive = TRUE;
2786     min = 0;
2787     max = 1;
2788     ecode++;
2789     goto REPEATTYPE;
2790    
2791     case OP_TYPEPOSUPTO:
2792     possessive = TRUE;
2793     min = 0;
2794     max = GET2(ecode, 1);
2795     ecode += 3;
2796     goto REPEATTYPE;
2797    
2798 nigel 77 case OP_TYPESTAR:
2799     case OP_TYPEMINSTAR:
2800     case OP_TYPEPLUS:
2801     case OP_TYPEMINPLUS:
2802     case OP_TYPEQUERY:
2803     case OP_TYPEMINQUERY:
2804     c = *ecode++ - OP_TYPESTAR;
2805     minimize = (c & 1) != 0;
2806     min = rep_min[c]; /* Pick up values from tables; */
2807     max = rep_max[c]; /* zero for max => infinity */
2808     if (max == 0) max = INT_MAX;
2809    
2810     /* Common code for all repeated single character type matches. In UTF-8 mode
2811     '.', OP_ANYNL, the HSPACE/VSPACE types and the negated types can match multi-byte
2812     characters; OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR match one-byte ones only. */
2813    
2814     REPEATTYPE:
2815     ctype = *ecode++; /* Code for the character type */
2816    
2817     #ifdef SUPPORT_UCP
2818     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2819     {
2820     prop_fail_result = ctype == OP_NOTPROP;
2821     prop_type = *ecode++;
2822 nigel 87 prop_value = *ecode++;
2823 nigel 77 }
2824     else prop_type = -1;
2825     #endif
2826    
2827     /* First, ensure the minimum number of matches are present. Use inline
2828     code for maximizing the speed, and do the type test once at the start
2829     (i.e. keep it out of the loop). Also we can test that there are at least
2830     the minimum number of bytes before we start. This isn't as effective in
2831     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2832     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2833     and single-bytes. */
2834    
2835     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2836     if (min > 0)
2837     {
2838     #ifdef SUPPORT_UCP
2839 nigel 87 if (prop_type >= 0)
2840 nigel 77 {
2841 nigel 87 switch(prop_type)
2842 nigel 77 {
2843 nigel 87 case PT_ANY:
2844     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2845     for (i = 1; i <= min; i++)
2846     {
2847     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2848 ph10 184 GETCHARINCTEST(c, eptr);
2849 nigel 87 }
2850     break;
2851    
2852     case PT_LAMP:
2853     for (i = 1; i <= min; i++)
2854     {
2855     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2856 ph10 184 GETCHARINCTEST(c, eptr);
2857 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2858     if ((prop_chartype == ucp_Lu ||
2859     prop_chartype == ucp_Ll ||
2860     prop_chartype == ucp_Lt) == prop_fail_result)
2861     RRETURN(MATCH_NOMATCH);
2862     }
2863     break;
2864    
2865     case PT_GC:
2866     for (i = 1; i <= min; i++)
2867     {
2868     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2869 ph10 184 GETCHARINCTEST(c, eptr);
2870 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2871     if ((prop_category == prop_value) == prop_fail_result)
2872     RRETURN(MATCH_NOMATCH);
2873     }
2874     break;
2875    
2876     case PT_PC:
2877     for (i = 1; i <= min; i++)
2878     {
2879     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2880 ph10 184 GETCHARINCTEST(c, eptr);
2881 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2882     if ((prop_chartype == prop_value) == prop_fail_result)
2883     RRETURN(MATCH_NOMATCH);
2884     }
2885     break;
2886    
2887     case PT_SC:
2888     for (i = 1; i <= min; i++)
2889     {
2890     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2891 ph10 184 GETCHARINCTEST(c, eptr);
2892 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2893     if ((prop_script == prop_value) == prop_fail_result)
2894     RRETURN(MATCH_NOMATCH);
2895     }
2896     break;
2897    
2898     default:
2899     RRETURN(PCRE_ERROR_INTERNAL);
2900 nigel 77 }
2901     }
2902    
2903     /* Match extended Unicode sequences. We will get here only if the
2904     support is in the binary; otherwise a compile-time error occurs. */
2905    
2906     else if (ctype == OP_EXTUNI)
2907     {
2908     for (i = 1; i <= min; i++)
2909     {
2910     GETCHARINCTEST(c, eptr);
2911 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2912 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2913     while (eptr < md->end_subject)
2914     {
2915     int len = 1;
2916     if (!utf8) c = *eptr; else
2917     {
2918     GETCHARLEN(c, eptr, len);
2919     }
2920 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2921 nigel 77 if (prop_category != ucp_M) break;
2922     eptr += len;
2923     }
2924     }
2925     }
2926    
2927     else
2928     #endif /* SUPPORT_UCP */
2929    
2930     /* Handle all other cases when the coding is UTF-8 */
2931    
2932     #ifdef SUPPORT_UTF8
2933     if (utf8) switch(ctype)
2934     {
2935     case OP_ANY:
2936     for (i = 1; i <= min; i++)
2937     {
2938     if (eptr >= md->end_subject ||
2939 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2940 nigel 77 RRETURN(MATCH_NOMATCH);
2941 nigel 91 eptr++;
2942 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2943     }
2944     break;
2945    
2946     case OP_ANYBYTE:
2947     eptr += min;
2948     break;
2949    
2950 nigel 93 case OP_ANYNL:
2951     for (i = 1; i <= min; i++)
2952     {
2953     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2954     GETCHARINC(c, eptr);
2955     switch(c)
2956     {
2957     default: RRETURN(MATCH_NOMATCH);
2958     case 0x000d:
2959     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2960     break;
2961 ph10 231
2962 nigel 93 case 0x000a:
2963 ph10 231 break;
2964    
2965 nigel 93 case 0x000b:
2966     case 0x000c:
2967     case 0x0085:
2968     case 0x2028:
2969     case 0x2029:
2970 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
2971 nigel 93 break;
2972     }
2973     }
2974     break;
2975    
2976 ph10 178 case OP_NOT_HSPACE:
2977     for (i = 1; i <= min; i++)
2978     {
2979     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2980     GETCHARINC(c, eptr);
2981     switch(c)
2982     {
2983     default: break;
2984     case 0x09: /* HT */
2985     case 0x20: /* SPACE */
2986     case 0xa0: /* NBSP */
2987     case 0x1680: /* OGHAM SPACE MARK */
2988     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2989     case 0x2000: /* EN QUAD */
2990     case 0x2001: /* EM QUAD */
2991     case 0x2002: /* EN SPACE */
2992     case 0x2003: /* EM SPACE */
2993     case 0x2004: /* THREE-PER-EM SPACE */
2994     case 0x2005: /* FOUR-PER-EM SPACE */
2995     case 0x2006: /* SIX-PER-EM SPACE */
2996     case 0x2007: /* FIGURE SPACE */
2997     case 0x2008: /* PUNCTUATION SPACE */
2998     case 0x2009: /* THIN SPACE */
2999     case 0x200A: /* HAIR SPACE */
3000     case 0x202f: /* NARROW NO-BREAK SPACE */
3001     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3002     case 0x3000: /* IDEOGRAPHIC SPACE */
3003     RRETURN(MATCH_NOMATCH);
3004     }
3005     }
3006     break;
3007 ph10 182
3008 ph10 178 case OP_HSPACE:
3009     for (i = 1; i <= min; i++)
3010     {
3011     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3012     GETCHARINC(c, eptr);
3013     switch(c)
3014     {
3015     default: RRETURN(MATCH_NOMATCH);
3016     case 0x09: /* HT */
3017     case 0x20: /* SPACE */
3018     case 0xa0: /* NBSP */
3019     case 0x1680: /* OGHAM SPACE MARK */
3020     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3021     case 0x2000: /* EN QUAD */
3022     case 0x2001: /* EM QUAD */
3023     case 0x2002: /* EN SPACE */
3024     case 0x2003: /* EM SPACE */
3025     case 0x2004: /* THREE-PER-EM SPACE */
3026     case 0x2005: /* FOUR-PER-EM SPACE */
3027     case 0x2006: /* SIX-PER-EM SPACE */
3028     case 0x2007: /* FIGURE SPACE */
3029     case 0x2008: /* PUNCTUATION SPACE */
3030     case 0x2009: /* THIN SPACE */
3031     case 0x200A: /* HAIR SPACE */
3032     case 0x202f: /* NARROW NO-BREAK SPACE */
3033     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3034     case 0x3000: /* IDEOGRAPHIC SPACE */
3035     break;
3036     }
3037     }
3038     break;
3039 ph10 182
3040 ph10 178 case OP_NOT_VSPACE:
3041     for (i = 1; i <= min; i++)
3042     {
3043     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3044     GETCHARINC(c, eptr);
3045     switch(c)
3046     {
3047     default: break;
3048     case 0x0a: /* LF */
3049     case 0x0b: /* VT */
3050     case 0x0c: /* FF */
3051     case 0x0d: /* CR */
3052     case 0x85: /* NEL */
3053     case 0x2028: /* LINE SEPARATOR */
3054     case 0x2029: /* PARAGRAPH SEPARATOR */
3055     RRETURN(MATCH_NOMATCH);
3056     }
3057     }
3058     break;
3059 ph10 182
3060 ph10 178 case OP_VSPACE:
3061     for (i = 1; i <= min; i++)
3062     {
3063     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3064     GETCHARINC(c, eptr);
3065     switch(c)
3066     {
3067     default: RRETURN(MATCH_NOMATCH);
3068     case 0x0a: /* LF */
3069     case 0x0b: /* VT */
3070     case 0x0c: /* FF */
3071     case 0x0d: /* CR */
3072     case 0x85: /* NEL */
3073     case 0x2028: /* LINE SEPARATOR */
3074     case 0x2029: /* PARAGRAPH SEPARATOR */
3075 ph10 182 break;
3076 ph10 178 }
3077     }
3078     break;
3079    
3080 nigel 77 case OP_NOT_DIGIT:
3081     for (i = 1; i <= min; i++)
3082     {
3083     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3084     GETCHARINC(c, eptr);
3085     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3086     RRETURN(MATCH_NOMATCH);
3087     }
3088     break;
3089    
3090     case OP_DIGIT:
3091     for (i = 1; i <= min; i++)
3092     {
3093     if (eptr >= md->end_subject ||
3094     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3095     RRETURN(MATCH_NOMATCH);
3096     /* No need to skip more bytes - we know it's a 1-byte character */
3097     }
3098     break;
3099    
3100     case OP_NOT_WHITESPACE:
3101     for (i = 1; i <= min; i++)
3102     {
3103     if (eptr >= md->end_subject ||
3104 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3105 nigel 77 RRETURN(MATCH_NOMATCH);
3106 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3107 nigel 77 }
3108     break;
3109    
3110     case OP_WHITESPACE:
3111     for (i = 1; i <= min; i++)
3112     {
3113     if (eptr >= md->end_subject ||
3114     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3115     RRETURN(MATCH_NOMATCH);
3116     /* No need to skip more bytes - we know it's a 1-byte character */
3117     }
3118     break;
3119    
3120     case OP_NOT_WORDCHAR:
3121     for (i = 1; i <= min; i++)
3122     {
3123     if (eptr >= md->end_subject ||
3124 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3125 nigel 77 RRETURN(MATCH_NOMATCH);
3126 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3127 nigel 77 }
3128     break;
3129    
3130     case OP_WORDCHAR:
3131     for (i = 1; i <= min; i++)
3132     {
3133     if (eptr >= md->end_subject ||
3134     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3135     RRETURN(MATCH_NOMATCH);
3136     /* No need to skip more bytes - we know it's a 1-byte character */
3137     }
3138     break;
3139    
3140     default:
3141     RRETURN(PCRE_ERROR_INTERNAL);
3142     } /* End switch(ctype) */
3143    
3144     else
3145     #endif /* SUPPORT_UTF8 */
3146    
3147     /* Code for the non-UTF-8 case for minimum matching of operators other
3148 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3149     number of bytes present, as this was tested above. */
3150 nigel 77
3151     switch(ctype)
3152     {
3153     case OP_ANY:
3154     if ((ims & PCRE_DOTALL) == 0)
3155     {
3156     for (i = 1; i <= min; i++)
3157 nigel 91 {
3158 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3159 nigel 91 eptr++;
3160     }
3161 nigel 77 }
3162     else eptr += min;
3163     break;
3164    
3165     case OP_ANYBYTE:
3166     eptr += min;
3167     break;
3168    
3169 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3170     bytes are present in this case. */
3171    
3172     case OP_ANYNL:
3173     for (i = 1; i <= min; i++)
3174     {
3175     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3176     switch(*eptr++)
3177     {
3178     default: RRETURN(MATCH_NOMATCH);
3179     case 0x000d:
3180     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3181     break;
3182     case 0x000a:
3183 ph10 231 break;
3184    
3185 nigel 93 case 0x000b:
3186     case 0x000c:
3187     case 0x0085:
3188 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3189 nigel 93 break;
3190     }
3191     }
3192     break;
3193    
3194 ph10 178 case OP_NOT_HSPACE:
3195     for (i = 1; i <= min; i++)
3196     {
3197     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3198     switch(*eptr++)
3199     {
3200     default: break;
3201     case 0x09: /* HT */
3202     case 0x20: /* SPACE */
3203     case 0xa0: /* NBSP */
3204     RRETURN(MATCH_NOMATCH);
3205     }
3206     }
3207     break;
3208    
3209     case OP_HSPACE:
3210     for (i = 1; i <= min; i++)
3211     {
3212     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3213     switch(*eptr++)
3214     {
3215     default: RRETURN(MATCH_NOMATCH);
3216     case 0x09: /* HT */
3217     case 0x20: /* SPACE */
3218     case 0xa0: /* NBSP */
3219 ph10 182 break;
3220 ph10 178 }
3221     }
3222     break;
3223    
3224     case OP_NOT_VSPACE:
3225     for (i = 1; i <= min; i++)
3226     {
3227     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3228     switch(*eptr++)
3229     {
3230     default: break;
3231     case 0x0a: /* LF */
3232     case 0x0b: /* VT */
3233     case 0x0c: /* FF */
3234     case 0x0d: /* CR */
3235     case 0x85: /* NEL */
3236     RRETURN(MATCH_NOMATCH);
3237     }
3238     }
3239     break;
3240    
3241     case OP_VSPACE:
3242     for (i = 1; i <= min; i++)
3243     {
3244     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3245     switch(*eptr++)
3246     {
3247     default: RRETURN(MATCH_NOMATCH);
3248     case 0x0a: /* LF */
3249     case 0x0b: /* VT */
3250     case 0x0c: /* FF */
3251     case 0x0d: /* CR */
3252     case 0x85: /* NEL */
3253 ph10 182 break;
3254 ph10 178 }
3255     }
3256     break;
3257    
3258 nigel 77 case OP_NOT_DIGIT:
3259     for (i = 1; i <= min; i++)
3260     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3261     break;
3262    
3263     case OP_DIGIT:
3264     for (i = 1; i <= min; i++)
3265     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3266     break;
3267    
3268     case OP_NOT_WHITESPACE:
3269     for (i = 1; i <= min; i++)
3270     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3271     break;
3272    
3273     case OP_WHITESPACE:
3274     for (i = 1; i <= min; i++)
3275     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3276     break;
3277    
3278     case OP_NOT_WORDCHAR:
3279     for (i = 1; i <= min; i++)
3280     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3281     RRETURN(MATCH_NOMATCH);
3282     break;
3283    
3284     case OP_WORDCHAR:
3285     for (i = 1; i <= min; i++)
3286     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3287     RRETURN(MATCH_NOMATCH);
3288     break;
3289    
3290     default:
3291     RRETURN(PCRE_ERROR_INTERNAL);
3292     }
3293     }
3294    
3295     /* If min = max, continue at the same level without recursing */
3296    
3297     if (min == max) continue;
3298    
3299     /* If minimizing, we have to test the rest of the pattern before each
3300     subsequent match. Again, separate the UTF-8 case for speed, and also
3301     separate the UCP cases. */
3302    
3303     if (minimize)
3304     {
3305     #ifdef SUPPORT_UCP
3306 nigel 87 if (prop_type >= 0)
3307 nigel 77 {
3308 nigel 87 switch(prop_type)
3309 nigel 77 {
3310 nigel 87 case PT_ANY:
3311     for (fi = min;; fi++)
3312     {
3313 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3314 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3315     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3316     GETCHARINC(c, eptr);
3317     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3318     }
3319 nigel 93 /* Control never gets here */
3320 nigel 87
3321     case PT_LAMP:
3322     for (fi = min;; fi++)
3323     {
3324 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3325 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3326     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3327     GETCHARINC(c, eptr);
3328     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3329     if ((prop_chartype == ucp_Lu ||
3330     prop_chartype == ucp_Ll ||
3331     prop_chartype == ucp_Lt) == prop_fail_result)
3332     RRETURN(MATCH_NOMATCH);
3333     }
3334 nigel 93 /* Control never gets here */
3335 nigel 87
3336     case PT_GC:
3337     for (fi = min;; fi++)
3338     {
3339 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3340 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3341     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3342     GETCHARINC(c, eptr);
3343     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3344     if ((prop_category == prop_value) == prop_fail_result)
3345     RRETURN(MATCH_NOMATCH);
3346     }
3347 nigel 93 /* Control never gets here */
3348 nigel 87
3349     case PT_PC:
3350     for (fi = min;; fi++)
3351     {
3352 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3353 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3354     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3355     GETCHARINC(c, eptr);
3356     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3357     if ((prop_chartype == prop_value) == prop_fail_result)
3358     RRETURN(MATCH_NOMATCH);
3359     }
3360 nigel 93 /* Control never gets here */
3361 nigel 87
3362     case PT_SC:
3363     for (fi = min;; fi++)
3364     {
3365 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3366 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3367     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3368     GETCHARINC(c, eptr);
3369     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3370     if ((prop_script == prop_value) == prop_fail_result)
3371     RRETURN(MATCH_NOMATCH);
3372     }
3373 nigel 93 /* Control never gets here */
3374 nigel 87
3375     default:
3376     RRETURN(PCRE_ERROR_INTERNAL);
3377 nigel 77 }
3378     }
3379    
3380     /* Match extended Unicode sequences. We will get here only if the
3381     support is in the binary; otherwise a compile-time error occurs. */
3382    
3383     else if (ctype == OP_EXTUNI)
3384     {
3385     for (fi = min;; fi++)
3386     {
3387 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3388 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3389     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3390     GETCHARINCTEST(c, eptr);
3391 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3392 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3393     while (eptr < md->end_subject)
3394     {
3395     int len = 1;
3396     if (!utf8) c = *eptr; else
3397     {
3398     GETCHARLEN(c, eptr, len);
3399     }
3400 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3401 nigel 77 if (prop_category != ucp_M) break;
3402     eptr += len;
3403     }
3404     }
3405     }
3406    
3407     else
3408     #endif /* SUPPORT_UCP */
3409    
3410     #ifdef SUPPORT_UTF8
3411     /* UTF-8 mode */
3412     if (utf8)
3413     {
3414     for (fi = min;; fi++)
3415     {
3416 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3417 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3418 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3419     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3420 nigel 93 IS_NEWLINE(eptr)))
3421 nigel 91 RRETURN(MATCH_NOMATCH);
3422 nigel 77
3423     GETCHARINC(c, eptr);
3424     switch(ctype)
3425     {
3426 nigel 91 case OP_ANY: /* This is the DOTALL case */
3427 nigel 77 break;
3428    
3429     case OP_ANYBYTE:
3430     break;
3431    
3432 nigel 93 case OP_ANYNL:
3433     switch(c)
3434     {
3435     default: RRETURN(MATCH_NOMATCH);
3436     case 0x000d:
3437     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3438     break;
3439     case 0x000a:
3440 ph10 231 break;
3441    
3442 nigel 93 case 0x000b:
3443     case 0x000c:
3444     case 0x0085:
3445     case 0x2028:
3446     case 0x2029:
3447 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3448 nigel 93 break;
3449     }
3450     break;
3451    
3452 ph10 178 case OP_NOT_HSPACE:
3453     switch(c)
3454     {
3455     default: break;
3456     case 0x09: /* HT */
3457     case 0x20: /* SPACE */
3458     case 0xa0: /* NBSP */
3459     case 0x1680: /* OGHAM SPACE MARK */
3460     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3461     case 0x2000: /* EN QUAD */
3462     case 0x2001: /* EM QUAD */
3463     case 0x2002: /* EN SPACE */
3464     case 0x2003: /* EM SPACE */
3465     case 0x2004: /* THREE-PER-EM SPACE */
3466     case 0x2005: /* FOUR-PER-EM SPACE */
3467     case 0x2006: /* SIX-PER-EM SPACE */
3468     case 0x2007: /* FIGURE SPACE */
3469     case 0x2008: /* PUNCTUATION SPACE */
3470     case 0x2009: /* THIN SPACE */
3471     case 0x200A: /* HAIR SPACE */
3472     case 0x202f: /* NARROW NO-BREAK SPACE */
3473     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3474     case 0x3000: /* IDEOGRAPHIC SPACE */
3475     RRETURN(MATCH_NOMATCH);
3476     }
3477     break;
3478    
3479     case OP_HSPACE:
3480     switch(c)
3481     {
3482     default: RRETURN(MATCH_NOMATCH);
3483     case 0x09: /* HT */
3484     case 0x20: /* SPACE */
3485     case 0xa0: /* NBSP */
3486     case 0x1680: /* OGHAM SPACE MARK */
3487     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3488     case 0x2000: /* EN QUAD */
3489     case 0x2001: /* EM QUAD */
3490     case 0x2002: /* EN SPACE */
3491     case 0x2003: /* EM SPACE */
3492     case 0x2004: /* THREE-PER-EM SPACE */
3493     case 0x2005: /* FOUR-PER-EM SPACE */
3494     case 0x2006: /* SIX-PER-EM SPACE */
3495     case 0x2007: /* FIGURE SPACE */
3496     case 0x2008: /* PUNCTUATION SPACE */
3497     case 0x2009: /* THIN SPACE */
3498     case 0x200A: /* HAIR SPACE */
3499     case 0x202f: /* NARROW NO-BREAK SPACE */
3500     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3501     case 0x3000: /* IDEOGRAPHIC SPACE */
3502     break;
3503     }
3504     break;
3505    
3506     case OP_NOT_VSPACE:
3507     switch(c)
3508     {
3509     default: break;
3510     case 0x0a: /* LF */
3511     case 0x0b: /* VT */
3512     case 0x0c: /* FF */
3513     case 0x0d: /* CR */
3514     case 0x85: /* NEL */
3515     case 0x2028: /* LINE SEPARATOR */
3516     case 0x2029: /* PARAGRAPH SEPARATOR */
3517     RRETURN(MATCH_NOMATCH);
3518     }
3519     break;
3520    
3521     case OP_VSPACE:
3522     switch(c)
3523     {
3524     default: RRETURN(MATCH_NOMATCH);
3525     case 0x0a: /* LF */
3526     case 0x0b: /* VT */
3527     case 0x0c: /* FF */
3528     case 0x0d: /* CR */
3529     case 0x85: /* NEL */
3530     case 0x2028: /* LINE SEPARATOR */
3531     case 0x2029: /* PARAGRAPH SEPARATOR */
3532     break;
3533     }
3534     break;
3535    
3536 nigel 77 case OP_NOT_DIGIT:
3537     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3538     RRETURN(MATCH_NOMATCH);
3539     break;
3540    
3541     case OP_DIGIT:
3542     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3543     RRETURN(MATCH_NOMATCH);
3544     break;
3545    
3546     case OP_NOT_WHITESPACE:
3547     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3548     RRETURN(MATCH_NOMATCH);
3549     break;
3550    
3551     case OP_WHITESPACE:
3552     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3553     RRETURN(MATCH_NOMATCH);
3554     break;
3555    
3556     case OP_NOT_WORDCHAR:
3557     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3558     RRETURN(MATCH_NOMATCH);
3559     break;
3560    
3561     case OP_WORDCHAR:
3562     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3563     RRETURN(MATCH_NOMATCH);
3564     break;
3565    
3566     default:
3567     RRETURN(PCRE_ERROR_INTERNAL);
3568     }
3569     }
3570     }
3571     else
3572     #endif
3573     /* Not UTF-8 mode */
3574     {
3575     for (fi = min;; fi++)
3576     {
3577 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3578 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3579 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3580 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3581 nigel 91 RRETURN(MATCH_NOMATCH);
3582    
3583 nigel 77 c = *eptr++;
3584     switch(ctype)
3585     {
3586 nigel 91 case OP_ANY: /* This is the DOTALL case */
3587 nigel 77 break;
3588    
3589     case OP_ANYBYTE:
3590     break;
3591    
3592 nigel 93 case OP_ANYNL:
3593     switch(c)
3594     {
3595     default: RRETURN(MATCH_NOMATCH);
3596     case 0x000d:
3597     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3598     break;
3599 ph10 231
3600 nigel 93 case 0x000a:
3601 ph10 231 break;
3602    
3603 nigel 93 case 0x000b:
3604     case 0x000c:
3605     case 0x0085:
3606 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3607 nigel 93 break;
3608     }
3609     break;
3610    
3611 ph10 178 case OP_NOT_HSPACE:
3612     switch(c)
3613     {
3614     default: break;
3615     case 0x09: /* HT */
3616     case 0x20: /* SPACE */
3617     case 0xa0: /* NBSP */
3618     RRETURN(MATCH_NOMATCH);
3619     }
3620     break;
3621    
3622     case OP_HSPACE:
3623     switch(c)
3624     {
3625     default: RRETURN(MATCH_NOMATCH);
3626     case 0x09: /* HT */
3627     case 0x20: /* SPACE */
3628     case 0xa0: /* NBSP */
3629     break;
3630     }
3631     break;
3632    
3633     case OP_NOT_VSPACE:
3634     switch(c)
3635     {
3636     default: break;
3637     case 0x0a: /* LF */
3638     case 0x0b: /* VT */
3639     case 0x0c: /* FF */
3640     case 0x0d: /* CR */
3641     case 0x85: /* NEL */
3642     RRETURN(MATCH_NOMATCH);
3643     }
3644     break;
3645    
3646     case OP_VSPACE:
3647     switch(c)
3648     {
3649     default: RRETURN(MATCH_NOMATCH);
3650     case 0x0a: /* LF */
3651     case 0x0b: /* VT */
3652     case 0x0c: /* FF */
3653     case 0x0d: /* CR */
3654     case 0x85: /* NEL */
3655     break;
3656     }
3657     break;
3658    
3659 nigel 77 case OP_NOT_DIGIT:
3660     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3661     break;
3662    
3663     case OP_DIGIT:
3664     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3665     break;
3666    
3667     case OP_NOT_WHITESPACE:
3668     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3669     break;
3670    
3671     case OP_WHITESPACE:
3672     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3673     break;
3674    
3675     case OP_NOT_WORDCHAR:
3676     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3677     break;
3678    
3679     case OP_WORDCHAR:
3680     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3681     break;
3682    
3683     default:
3684     RRETURN(PCRE_ERROR_INTERNAL);
3685     }
3686     }
3687     }
3688     /* Control never gets here */
3689     }
3690    
3691 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3692 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3693     UTF-8 and UCP stuff separate. */
3694    
3695     else
3696     {
3697     pp = eptr; /* Remember where we started */
3698    
3699     #ifdef SUPPORT_UCP
3700 nigel 87 if (prop_type >= 0)
3701 nigel 77 {
3702 nigel 87 switch(prop_type)
3703 nigel 77 {
3704 nigel 87 case PT_ANY:
3705     for (i = min; i < max; i++)
3706     {
3707     int len = 1;
3708     if (eptr >= md->end_subject) break;
3709     GETCHARLEN(c, eptr, len);
3710     if (prop_fail_result) break;
3711     eptr+= len;
3712     }
3713     break;
3714    
3715     case PT_LAMP:
3716     for (i = min; i < max; i++)
3717     {
3718     int len = 1;
3719     if (eptr >= md->end_subject) break;
3720     GETCHARLEN(c, eptr, len);
3721     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3722     if ((prop_chartype == ucp_Lu ||
3723     prop_chartype == ucp_Ll ||
3724     prop_chartype == ucp_Lt) == prop_fail_result)
3725     break;
3726     eptr+= len;
3727     }
3728     break;
3729    
3730     case PT_GC:
3731     for (i = min; i < max; i++)
3732     {
3733     int len = 1;
3734     if (eptr >= md->end_subject) break;
3735     GETCHARLEN(c, eptr, len);
3736     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3737     if ((prop_category == prop_value) == prop_fail_result)
3738     break;
3739     eptr+= len;
3740     }
3741     break;
3742    
3743     case PT_PC:
3744     for (i = min; i < max; i++)
3745     {
3746     int len = 1;
3747     if (eptr >= md->end_subject) break;
3748     GETCHARLEN(c, eptr, len);
3749     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3750     if ((prop_chartype == prop_value) == prop_fail_result)
3751     break;
3752     eptr+= len;
3753     }
3754     break;
3755    
3756     case PT_SC:
3757     for (i = min; i < max; i++)
3758     {
3759     int len = 1;
3760     if (eptr >= md->end_subject) break;
3761     GETCHARLEN(c, eptr, len);
3762     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3763     if ((prop_script == prop_value) == prop_fail_result)
3764     break;
3765     eptr+= len;
3766     }
3767     break;
3768 nigel 77 }
3769    
3770     /* eptr is now past the end of the maximum run */
3771    
3772 nigel 93 if (possessive) continue;
3773 nigel 77 for(;;)
3774     {
3775 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3776 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3777     if (eptr-- == pp) break; /* Stop if tried at original pos */
3778 ph10 207 if (utf8) BACKCHAR(eptr);
3779 nigel 77 }
3780     }
3781    
3782     /* Match extended Unicode sequences. We will get here only if the
3783     support is in the binary; otherwise a compile-time error occurs. */
3784    
3785     else if (ctype == OP_EXTUNI)
3786     {
3787     for (i = min; i < max; i++)
3788     {
3789     if (eptr >= md->end_subject) break;
3790     GETCHARINCTEST(c, eptr);
3791 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3792 nigel 77 if (prop_category == ucp_M) break;
3793     while (eptr < md->end_subject)
3794     {
3795     int len = 1;
3796     if (!utf8) c = *eptr; else
3797     {
3798     GETCHARLEN(c, eptr, len);
3799     }
3800 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3801 nigel 77 if (prop_category != ucp_M) break;
3802     eptr += len;
3803     }
3804     }
3805    
3806     /* eptr is now past the end of the maximum run */
3807    
3808 nigel 93 if (possessive) continue;
3809 nigel 77 for(;;)
3810     {
3811 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3812 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3813     if (eptr-- == pp) break; /* Stop if tried at original pos */
3814     for (;;) /* Move back over one extended */
3815     {
3816     int len = 1;
3817     if (!utf8) c = *eptr; else
3818     {
3819 ph10 207 BACKCHAR(eptr);
3820 nigel 77 GETCHARLEN(c, eptr, len);
3821     }
3822 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3823 nigel 77 if (prop_category != ucp_M) break;
3824     eptr--;
3825     }
3826     }
3827     }
3828    
3829     else
3830     #endif /* SUPPORT_UCP */
3831    
3832     #ifdef SUPPORT_UTF8
3833     /* UTF-8 mode */
3834    
3835     if (utf8)
3836     {
3837     switch(ctype)
3838     {
3839     case OP_ANY:
3840     if (max < INT_MAX)
3841     {
3842     if ((ims & PCRE_DOTALL) == 0)
3843     {
3844     for (i = min; i < max; i++)
3845     {
3846 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3847 nigel 77 eptr++;
3848     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3849     }
3850     }
3851     else
3852     {
3853     for (i = min; i < max; i++)
3854     {
3855 nigel 91 if (eptr >= md->end_subject) break;
3856 nigel 77 eptr++;
3857     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3858     }
3859     }
3860     }
3861    
3862     /* Handle unlimited UTF-8 repeat */
3863    
3864     else
3865     {
3866     if ((ims & PCRE_DOTALL) == 0)
3867     {
3868     for (i = min; i < max; i++)
3869     {
3870 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3871 nigel 77 eptr++;
3872 ph10 190 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3873 nigel 77 }
3874     }
3875     else
3876     {
3877 ph10 190 eptr = md->end_subject;
3878 nigel 77 }
3879     }
3880     break;
3881    
3882     /* The byte case is the same as non-UTF8 */
3883    
3884     case OP_ANYBYTE:
3885     c = max - min;
3886 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3887     c = md->end_subject - eptr;
3888 nigel 77 eptr += c;
3889     break;
3890    
3891 nigel 93 case OP_ANYNL:
3892     for (i = min; i < max; i++)
3893     {
3894     int len = 1;
3895     if (eptr >= md->end_subject) break;
3896     GETCHARLEN(c, eptr, len);
3897     if (c == 0x000d)
3898     {
3899     if (++eptr >= md->end_subject) break;
3900     if (*eptr == 0x000a) eptr++;
3901     }
3902     else
3903     {
3904 ph10 231 if (c != 0x000a &&
3905     (md->bsr_anycrlf ||
3906     (c != 0x000b && c != 0x000c &&
3907     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3908 nigel 93 break;
3909     eptr += len;
3910     }
3911     }
3912     break;
3913    
3914 ph10 178 case OP_NOT_HSPACE:
3915 ph10 182 case OP_HSPACE:
3916 ph10 178 for (i = min; i < max; i++)
3917     {
3918 ph10 182 BOOL gotspace;
3919 ph10 178 int len = 1;
3920     if (eptr >= md->end_subject) break;
3921     GETCHARLEN(c, eptr, len);
3922     switch(c)
3923 ph10 182 {
3924     default: gotspace = FALSE; break;
3925 ph10 178 case 0x09: /* HT */
3926     case 0x20: /* SPACE */
3927     case 0xa0: /* NBSP */
3928     case 0x1680: /* OGHAM SPACE MARK */
3929     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3930     case 0x2000: /* EN QUAD */
3931     case 0x2001: /* EM QUAD */
3932     case 0x2002: /* EN SPACE */
3933     case 0x2003: /* EM SPACE */
3934     case 0x2004: /* THREE-PER-EM SPACE */
3935     case 0x2005: /* FOUR-PER-EM SPACE */
3936     case 0x2006: /* SIX-PER-EM SPACE */
3937     case 0x2007: /* FIGURE SPACE */
3938     case 0x2008: /* PUNCTUATION SPACE */
3939     case 0x2009: /* THIN SPACE */
3940     case 0x200A: /* HAIR SPACE */
3941     case 0x202f: /* NARROW NO-BREAK SPACE */
3942     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3943     case 0x3000: /* IDEOGRAPHIC SPACE */
3944     gotspace = TRUE;
3945 ph10 182 break;
3946 ph10 178 }
3947     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3948     eptr += len;
3949     }
3950     break;
3951    
3952     case OP_NOT_VSPACE:
3953 ph10 182 case OP_VSPACE:
3954 ph10 178 for (i = min; i < max; i++)
3955     {
3956 ph10 182 BOOL gotspace;
3957 ph10 178 int len = 1;
3958     if (eptr >= md->end_subject) break;
3959     GETCHARLEN(c, eptr, len);
3960     switch(c)
3961     {
3962 ph10 182 default: gotspace = FALSE; break;
3963 ph10 178 case 0x0a: /* LF */
3964     case 0x0b: /* VT */
3965     case 0x0c: /* FF */
3966     case 0x0d: /* CR */
3967     case 0x85: /* NEL */
3968     case 0x2028: /* LINE SEPARATOR */
3969     case 0x2029: /* PARAGRAPH SEPARATOR */
3970     gotspace = TRUE;
3971     break;
3972     }
3973 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
3974 ph10 178 eptr += len;
3975     }
3976     break;
3977    
3978 nigel 77 case OP_NOT_DIGIT:
3979     for (i = min; i < max; i++)
3980     {
3981     int len = 1;
3982     if (eptr >= md->end_subject) break;
3983     GETCHARLEN(c, eptr, len);
3984     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
3985     eptr+= len;
3986     }
3987     break;
3988    
3989     case OP_DIGIT:
3990     for (i = min; i < max; i++)
3991     {
3992     int len = 1;
3993     if (eptr >= md->end_subject) break;
3994     GETCHARLEN(c, eptr, len);
3995     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
3996     eptr+= len;
3997     }
3998     break;
3999    
4000     case OP_NOT_WHITESPACE:
4001     for (i = min; i < max; i++)
4002     {
4003     int len = 1;
4004     if (eptr >= md->end_subject) break;
4005     GETCHARLEN(c, eptr, len);
4006     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4007     eptr+= len;
4008     }
4009     break;
4010    
4011     case OP_WHITESPACE:
4012     for (i = min; i < max; i++)
4013     {
4014     int len = 1;
4015     if (eptr >= md->end_subject) break;
4016     GETCHARLEN(c, eptr, len);
4017     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4018     eptr+= len;
4019     }
4020     break;
4021    
4022     case OP_NOT_WORDCHAR:
4023     for (i = min; i < max; i++)
4024     {
4025     int len = 1;
4026     if (eptr >= md->end_subject) break;
4027     GETCHARLEN(c, eptr, len);
4028     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4029     eptr+= len;
4030     }
4031     break;
4032    
4033     case OP_WORDCHAR:
4034     for (i = min; i < max; i++)
4035     {
4036     int len = 1;
4037     if (eptr >= md->end_subject) break;
4038     GETCHARLEN(c, eptr, len);
4039     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4040     eptr+= len;
4041     }
4042     break;
4043    
4044     default:
4045     RRETURN(PCRE_ERROR_INTERNAL);
4046     }
4047    
4048     /* eptr is now past the end of the maximum run */
4049    
4050 nigel 93 if (possessive) continue;
4051 nigel 77 for(;;)
4052     {
4053 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4054 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4055     if (eptr-- == pp) break; /* Stop if tried at original pos */
4056     BACKCHAR(eptr);
4057     }
4058     }
4059     else
4060 ph10 207 #endif /* SUPPORT_UTF8 */
4061 nigel 77
4062     /* Not UTF-8 mode */
4063     {
4064     switch(ctype)
4065     {
4066     case OP_ANY:
4067     if ((ims & PCRE_DOTALL) == 0)
4068     {
4069     for (i = min; i < max; i++)
4070     {
4071 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4072 nigel 77 eptr++;
4073     }
4074     break;
4075     }
4076     /* For DOTALL case, fall through and treat as \C */
4077    
4078     case OP_ANYBYTE:
4079     c = max - min;
4080 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4081     c = md->end_subject - eptr;
4082 nigel 77 eptr += c;
4083     break;
4084    
4085 nigel 93 case OP_ANYNL:
4086     for (i = min; i < max; i++)
4087     {
4088     if (eptr >= md->end_subject) break;
4089     c = *eptr;
4090     if (c == 0x000d)
4091     {
4092     if (++eptr >= md->end_subject) break;
4093     if (*eptr == 0x000a) eptr++;
4094     }
4095     else
4096     {
4097 ph10 231 if (c != 0x000a &&
4098     (md->bsr_anycrlf ||
4099     (c != 0x000b && c != 0x000c && c != 0x0085)))
4100 nigel 93 break;
4101     eptr++;
4102     }
4103     }
4104     break;
4105    
4106 ph10 178 case OP_NOT_HSPACE:
4107     for (i = min; i < max; i++)
4108     {
4109     if (eptr >= md->end_subject) break;
4110     c = *eptr;
4111     if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4112 ph10 182 eptr++;
4113 ph10 178 }
4114     break;
4115    
4116     case OP_HSPACE:
4117     for (i = min; i < max; i++)
4118     {
4119     if (eptr >= md->end_subject) break;
4120     c = *eptr;
4121     if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4122 ph10 182 eptr++;
4123 ph10 178 }
4124     break;
4125    
4126     case OP_NOT_VSPACE:
4127     for (i = min; i < max; i++)
4128     {
4129     if (eptr >= md->end_subject) break;