/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 341 - (hide annotations) (download)
Sat Apr 19 16:41:04 2008 UTC (6 years, 6 months ago) by ph10
File MIME type: text/plain
File size: 151712 byte(s)
Fix DFA (?!) bug; add support for JavaScript empty classes.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 305 Copyright (c) 1997-2008 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
87     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161     /* Separate the caselesss case for speed */
162    
163     if ((ims & PCRE_CASELESS) != 0)
164     {
165     while (length-- > 0)
166     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
167     }
168     else
169     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
170    
171     return TRUE;
172     }
173    
174    
175    
176     /***************************************************************************
177     ****************************************************************************
178     RECURSION IN THE match() FUNCTION
179    
180 nigel 87 The match() function is highly recursive, though not every recursive call
181     increases the recursive depth. Nevertheless, some regular expressions can cause
182     it to recurse to a great depth. I was writing for Unix, so I just let it call
183     itself recursively. This uses the stack for saving everything that has to be
184     saved for a recursive call. On Unix, the stack can be large, and this works
185     fine.
186 nigel 77
187 nigel 87 It turns out that on some non-Unix-like systems there are problems with
188     programs that use a lot of stack. (This despite the fact that every last chip
189     has oodles of memory these days, and techniques for extending the stack have
190     been known for decades.) So....
191 nigel 77
192     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
193     calls by keeping local variables that need to be preserved in blocks of memory
194 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
195 nigel 77 achieve this so that the actual code doesn't look very different to what it
196     always used to.
197 ph10 164
198 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
199 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
200     Switzer, the use of longjmp() has been abolished, at the cost of having to
201     provide a unique number for each call to RMATCH. There is no way of generating
202     a sequence of numbers at compile time in C. I have given them names, to make
203     them stand out more clearly.
204    
205     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
206     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
207 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
208     don't have indeterminate values; this has meant that the frame size can be
209 ph10 164 reduced because the result can be "passed back" by straight setting of the
210     variable instead of being passed in the frame.
211 nigel 77 ****************************************************************************
212     ***************************************************************************/
213    
214 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
215     below must be updated in sync. */
216 nigel 77
/* One identifier per RMATCH call site. The values run consecutively from
RM1 = 1 up to RM54 = 54. */

enum {
  RM1 = 1,
        RM2,  RM3,  RM4,  RM5,  RM6,  RM7,  RM8,  RM9,
  RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18,
  RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27,
  RM28, RM29, RM30, RM31, RM32, RM33, RM34, RM35, RM36,
  RM37, RM38, RM39, RM40, RM41, RM42, RM43, RM44, RM45,
  RM46, RM47, RM48, RM49, RM50, RM51, RM52, RM53, RM54
};
223 ph10 164
224 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
225 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
226 ph10 164 actually used in this definition. */
227 nigel 77
228     #ifndef NO_RECURSE
229     #define REGISTER register
230 ph10 164
/* Stack-based RMATCH and RRETURN. RMATCH makes a genuine recursive call to
match(), passing on the caller's mstart and rdepth+1, and leaves the result in
rrc; its last argument (the call-site number) is accepted but not used here.
RRETURN is a plain return. The DEBUG variants additionally trace each call and
return on stdout. */

#ifdef DEBUG
#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match((r_eptr),(r_ecode),mstart,(r_top),(r_md),(r_ims),(r_eptrb), \
    (r_flags),rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", (r_value), __LINE__); \
  return (r_value); \
  }
#else
#define RMATCH(r_eptr,r_ecode,r_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  rrc = match((r_eptr),(r_ecode),mstart,(r_top),(r_md),(r_ims),(r_eptrb), \
    (r_flags),rdepth+1)
#define RRETURN(r_value) return (r_value)
#endif
248    
249 nigel 77 #else
250    
251    
252 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
253     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
254     argument of match(), which never changes. */
255 nigel 77
256     #define REGISTER
257    
/* Heap-frame version of RMATCH. A new frame is obtained from
pcre_stack_malloc, the call-site number rw is recorded in the current frame so
that HEAP_RETURN knows where to resume, the "arguments" are copied into the new
frame, and control jumps to HEAP_RECURSE. Execution resumes at the L_<rw> label
when the simulated call returns. The rd argument is not used.

If the frame cannot be allocated, the current frame is unwound with
PCRE_ERROR_NOMEMORY rather than dereferencing a NULL pointer; being a negative
return, it is passed on outwards by each calling incarnation. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw;\
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
277    
/* Heap-frame version of RRETURN. The current frame is released with
pcre_stack_free and the previous one becomes current. If there is no previous
frame this was the top level, so match() really returns; otherwise the value is
placed in rrc and control goes to HEAP_RETURN to resume the simulated caller. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
290    
291    
292     /* Structure for remembering the local variables in a private frame */
293    
294     typedef struct heapframe {
295     struct heapframe *Xprevframe;
296    
297     /* Function arguments that may change */
298    
299     const uschar *Xeptr;
300     const uschar *Xecode;
301 ph10 172 const uschar *Xmstart;
302 nigel 77 int Xoffset_top;
303     long int Xims;
304     eptrblock *Xeptrb;
305     int Xflags;
306 nigel 91 unsigned int Xrdepth;
307 nigel 77
308     /* Function local variables */
309    
310     const uschar *Xcallpat;
311     const uschar *Xcharptr;
312     const uschar *Xdata;
313     const uschar *Xnext;
314     const uschar *Xpp;
315     const uschar *Xprev;
316     const uschar *Xsaved_eptr;
317    
318     recursion_info Xnew_recursive;
319    
320     BOOL Xcur_is_word;
321     BOOL Xcondition;
322     BOOL Xprev_is_word;
323    
324     unsigned long int Xoriginal_ims;
325    
326     #ifdef SUPPORT_UCP
327     int Xprop_type;
328 nigel 87 int Xprop_value;
329 nigel 77 int Xprop_fail_result;
330     int Xprop_category;
331     int Xprop_chartype;
332 nigel 87 int Xprop_script;
333 ph10 123 int Xoclength;
334     uschar Xocchars[8];
335 nigel 77 #endif
336    
337     int Xctype;
338 nigel 93 unsigned int Xfc;
339 nigel 77 int Xfi;
340     int Xlength;
341     int Xmax;
342     int Xmin;
343     int Xnumber;
344     int Xoffset;
345     int Xop;
346     int Xsave_capture_last;
347     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
348     int Xstacksave[REC_STACK_SAVE_MAX];
349    
350     eptrblock Xnewptrb;
351    
352 ph10 164 /* Where to jump back to */
353 nigel 77
354 ph10 164 int Xwhere;
355 ph10 165
356 nigel 77 } heapframe;
357    
358     #endif
359    
360    
361     /***************************************************************************
362     ***************************************************************************/
363    
364    
365    
366     /*************************************************
367     * Match from current position *
368     *************************************************/
369    
370 nigel 93 /* This function is called recursively in many circumstances. Whenever it
371 nigel 77 returns a negative (error) response, the outer incarnation must also return the
372     same response.
373    
374     Performance note: It might be tempting to extract commonly used fields from the
375     md structure (e.g. utf8, end_subject) into individual variables to improve
376     performance. Tests using gcc on a SPARC disproved this; in the first case, it
377     made performance worse.
378    
379     Arguments:
380 nigel 93 eptr pointer to current character in subject
381     ecode pointer to current position in compiled code
382 ph10 168 mstart pointer to the current match start position (can be modified
383 ph10 172 by encountering \K)
384 nigel 77 offset_top current top pointer
385     md pointer to "static" info for the match
386     ims current /i, /m, and /s options
387     eptrb pointer to chain of blocks containing eptr at start of
388     brackets - for testing for empty matches
389     flags can contain
390     match_condassert - this is an assertion condition
391 nigel 93 match_cbegroup - this is the start of an unlimited repeat
392     group that can match an empty string
393 nigel 87 rdepth the recursion depth
394 nigel 77
395     Returns: MATCH_MATCH if matched ) these values are >= 0
396     MATCH_NOMATCH if failed to match )
397     a negative PCRE_ERROR_xxx value if aborted by an error condition
398 nigel 87 (e.g. stopped by repeated call or recursion limit)
399 nigel 77 */
400    
401     static int
402 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
403 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
404 nigel 91 int flags, unsigned int rdepth)
405 nigel 77 {
406     /* These variables do not need to be preserved over recursion in this function,
407 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
408     "register" because they are used a lot in loops. */
409 nigel 77
410 nigel 91 register int rrc; /* Returns from recursive calls */
411     register int i; /* Used for loops not involving calls to RMATCH() */
412 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
413 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
414 nigel 77
415 nigel 93 BOOL minimize, possessive; /* Quantifier options */
416    
417 nigel 77 /* When recursion is not being used, all "local" variables that have to be
418     preserved over calls to RMATCH() are part of a "frame" which is obtained from
419     heap storage. Set up the top-level frame here; others are obtained from the
420     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
421    
422     #ifdef NO_RECURSE
423     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
424     frame->Xprevframe = NULL; /* Marks the top level */
425    
426     /* Copy in the original argument variables */
427    
428     frame->Xeptr = eptr;
429     frame->Xecode = ecode;
430 ph10 168 frame->Xmstart = mstart;
431 nigel 77 frame->Xoffset_top = offset_top;
432     frame->Xims = ims;
433     frame->Xeptrb = eptrb;
434     frame->Xflags = flags;
435 nigel 87 frame->Xrdepth = rdepth;
436 nigel 77
437     /* This is where control jumps back to to effect "recursion" */
438    
439     HEAP_RECURSE:
440    
441     /* Macros make the argument variables come from the current frame */
442    
443     #define eptr frame->Xeptr
444     #define ecode frame->Xecode
445 ph10 168 #define mstart frame->Xmstart
446 nigel 77 #define offset_top frame->Xoffset_top
447     #define ims frame->Xims
448     #define eptrb frame->Xeptrb
449     #define flags frame->Xflags
450 nigel 87 #define rdepth frame->Xrdepth
451 nigel 77
452     /* Ditto for the local variables */
453    
454     #ifdef SUPPORT_UTF8
455     #define charptr frame->Xcharptr
456     #endif
457     #define callpat frame->Xcallpat
458     #define data frame->Xdata
459     #define next frame->Xnext
460     #define pp frame->Xpp
461     #define prev frame->Xprev
462     #define saved_eptr frame->Xsaved_eptr
463    
464     #define new_recursive frame->Xnew_recursive
465    
466     #define cur_is_word frame->Xcur_is_word
467     #define condition frame->Xcondition
468     #define prev_is_word frame->Xprev_is_word
469    
470     #define original_ims frame->Xoriginal_ims
471    
472     #ifdef SUPPORT_UCP
473     #define prop_type frame->Xprop_type
474 nigel 87 #define prop_value frame->Xprop_value
475 nigel 77 #define prop_fail_result frame->Xprop_fail_result
476     #define prop_category frame->Xprop_category
477     #define prop_chartype frame->Xprop_chartype
478 nigel 87 #define prop_script frame->Xprop_script
479 ph10 115 #define oclength frame->Xoclength
480     #define occhars frame->Xocchars
481 nigel 77 #endif
482    
483     #define ctype frame->Xctype
484     #define fc frame->Xfc
485     #define fi frame->Xfi
486     #define length frame->Xlength
487     #define max frame->Xmax
488     #define min frame->Xmin
489     #define number frame->Xnumber
490     #define offset frame->Xoffset
491     #define op frame->Xop
492     #define save_capture_last frame->Xsave_capture_last
493     #define save_offset1 frame->Xsave_offset1
494     #define save_offset2 frame->Xsave_offset2
495     #define save_offset3 frame->Xsave_offset3
496     #define stacksave frame->Xstacksave
497    
498     #define newptrb frame->Xnewptrb
499    
500     /* When recursion is being used, local variables are allocated on the stack and
501     get preserved during recursion in the normal way. In this environment, fi and
502     i, and fc and c, can be the same variables. */
503    
504 nigel 93 #else /* NO_RECURSE not defined */
505 nigel 77 #define fi i
506     #define fc c
507    
508    
509 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
510     const uschar *charptr; /* in small blocks of the code. My normal */
511     #endif /* style of coding would have declared */
512     const uschar *callpat; /* them within each of those blocks. */
513     const uschar *data; /* However, in order to accommodate the */
514     const uschar *next; /* version of this code that uses an */
515     USPTR pp; /* external "stack" implemented on the */
516     const uschar *prev; /* heap, it is easier to declare them all */
517     USPTR saved_eptr; /* here, so the declarations can be cut */
518     /* out in a block. The only declarations */
519     recursion_info new_recursive; /* within blocks below are for variables */
520     /* that do not have to be preserved over */
521     BOOL cur_is_word; /* a recursive call to RMATCH(). */
522     BOOL condition;
523 nigel 77 BOOL prev_is_word;
524    
525     unsigned long int original_ims;
526    
527     #ifdef SUPPORT_UCP
528     int prop_type;
529 nigel 87 int prop_value;
530 nigel 77 int prop_fail_result;
531     int prop_category;
532     int prop_chartype;
533 nigel 87 int prop_script;
534 ph10 115 int oclength;
535     uschar occhars[8];
536 nigel 77 #endif
537    
538     int ctype;
539     int length;
540     int max;
541     int min;
542     int number;
543     int offset;
544     int op;
545     int save_capture_last;
546     int save_offset1, save_offset2, save_offset3;
547     int stacksave[REC_STACK_SAVE_MAX];
548    
549     eptrblock newptrb;
550 nigel 93 #endif /* NO_RECURSE */
551 nigel 77
552     /* These statements are here to stop the compiler complaining about uninitialized
553     variables. */
554    
555     #ifdef SUPPORT_UCP
556 nigel 87 prop_value = 0;
557 nigel 77 prop_fail_result = 0;
558     #endif
559    
560 nigel 93
561 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
562     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
563     used. Thanks to Ian Taylor for noticing this possibility and sending the
564     original patch. */
565    
566     TAIL_RECURSE:
567    
568 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
569     are specified by the macro RMATCH and RRETURN is used to return. When
570     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
571     and a "return", respectively (possibly with some debugging if DEBUG is
572     defined). However, RMATCH isn't like a function call because it's quite a
573     complicated macro. It has to be used in one particular way. This shouldn't,
574     however, impact performance when true recursion is being used. */
575 nigel 77
576 ph10 164 #ifdef SUPPORT_UTF8
577     utf8 = md->utf8; /* Local copy of the flag */
578     #else
579     utf8 = FALSE;
580     #endif
581    
582 nigel 87 /* First check that we haven't called match() too many times, or that we
583     haven't exceeded the recursive call limit. */
584    
585 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
586 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
587 nigel 77
588     original_ims = ims; /* Save for resetting on ')' */
589 nigel 91
590 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
591     string, the match_cbegroup flag is set. When this is the case, add the current
592     subject pointer to the chain of such remembered pointers, to be checked when we
593     hit the closing ket, in order to break infinite loops that match no characters.
594 ph10 197 When match() is called in other circumstances, don't add to the chain. The
595     match_cbegroup flag must NOT be used with tail recursion, because the memory
596     block that is used is on the stack, so a new one may be required for each
597     match(). */
598 nigel 77
599 nigel 93 if ((flags & match_cbegroup) != 0)
600 nigel 77 {
601 ph10 197 newptrb.epb_saved_eptr = eptr;
602     newptrb.epb_prev = eptrb;
603     eptrb = &newptrb;
604 nigel 77 }
605    
606 nigel 93 /* Now start processing the opcodes. */
607 nigel 77
608     for (;;)
609     {
610 nigel 93 minimize = possessive = FALSE;
611 nigel 77 op = *ecode;
612    
613     /* For partial matching, remember if we ever hit the end of the subject after
614     matching at least one subject character. */
615    
616     if (md->partial &&
617     eptr >= md->end_subject &&
618 ph10 168 eptr > mstart)
619 nigel 77 md->hitend = TRUE;
620 ph10 208
621 nigel 93 switch(op)
622     {
623 ph10 210 case OP_FAIL:
624 ph10 212 RRETURN(MATCH_NOMATCH);
625 ph10 211
626 ph10 210 case OP_PRUNE:
627     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
628     ims, eptrb, flags, RM51);
629     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
630 ph10 212 RRETURN(MATCH_PRUNE);
631 ph10 211
632 ph10 210 case OP_COMMIT:
633     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
634     ims, eptrb, flags, RM52);
635     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
636 ph10 212 RRETURN(MATCH_COMMIT);
637 ph10 211
638 ph10 210 case OP_SKIP:
639     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
640     ims, eptrb, flags, RM53);
641     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
642 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
643 ph10 212 RRETURN(MATCH_SKIP);
644 ph10 211
645 ph10 210 case OP_THEN:
646     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
647 ph10 212 ims, eptrb, flags, RM54);
648 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
649 ph10 212 RRETURN(MATCH_THEN);
650 ph10 211
651 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
652     the current subject position in the working slot at the top of the vector.
653     We mustn't change the current values of the data slot, because they may be
654     set from a previous iteration of this group, and be referred to by a
655     reference inside the group.
656 nigel 77
657 nigel 93 If the bracket fails to match, we need to restore this value and also the
658     values of the final offsets, in case they were set by a previous iteration
659     of the same bracket.
660 nigel 77
661 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
662     a non-capturing bracket. Don't worry about setting the flag for the error
663     case here; that is handled in the code for KET. */
664 nigel 77
665 nigel 93 case OP_CBRA:
666     case OP_SCBRA:
667     number = GET2(ecode, 1+LINK_SIZE);
668 nigel 77 offset = number << 1;
669    
670     #ifdef DEBUG
671 nigel 93 printf("start bracket %d\n", number);
672     printf("subject=");
673 nigel 77 pchars(eptr, 16, TRUE, md);
674     printf("\n");
675     #endif
676    
677     if (offset < md->offset_max)
678     {
679     save_offset1 = md->offset_vector[offset];
680     save_offset2 = md->offset_vector[offset+1];
681     save_offset3 = md->offset_vector[md->offset_end - number];
682     save_capture_last = md->capture_last;
683    
684     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
685     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
686    
687 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
688 nigel 77 do
689     {
690 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
691     ims, eptrb, flags, RM1);
692 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
693 nigel 77 md->capture_last = save_capture_last;
694     ecode += GET(ecode, 1);
695     }
696     while (*ecode == OP_ALT);
697    
698     DPRINTF(("bracket %d failed\n", number));
699    
700     md->offset_vector[offset] = save_offset1;
701     md->offset_vector[offset+1] = save_offset2;
702     md->offset_vector[md->offset_end - number] = save_offset3;
703    
704     RRETURN(MATCH_NOMATCH);
705     }
706    
707 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
708     as a non-capturing bracket. */
709 nigel 77
710 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
711     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
712    
713 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
714 nigel 77
715 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
716     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
717    
718 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
719     final alternative within the brackets, we would return the result of a
720     recursive call to match() whatever happened. We can reduce stack usage by
721 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
722     is set.*/
723 nigel 77
724 nigel 93 case OP_BRA:
725     case OP_SBRA:
726     DPRINTF(("start non-capturing bracket\n"));
727     flags = (op >= OP_SBRA)? match_cbegroup : 0;
728 nigel 91 for (;;)
729 nigel 77 {
730 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
731 nigel 93 {
732 ph10 197 if (flags == 0) /* Not a possibly empty group */
733     {
734     ecode += _pcre_OP_lengths[*ecode];
735     DPRINTF(("bracket 0 tail recursion\n"));
736     goto TAIL_RECURSE;
737     }
738    
739     /* Possibly empty group; can't use tail recursion. */
740    
741     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
742     eptrb, flags, RM48);
743     RRETURN(rrc);
744 nigel 93 }
745 nigel 91
746     /* For non-final alternatives, continue the loop for a NOMATCH result;
747     otherwise return. */
748    
749 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
750     eptrb, flags, RM2);
751 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
752 nigel 77 ecode += GET(ecode, 1);
753     }
754 nigel 91 /* Control never reaches here. */
755 nigel 77
756     /* Conditional group: compilation checked that there are no more than
757     two branches. If the condition is false, skipping the first branch takes us
758     past the end if there is only one branch, but that's OK because that is
759 nigel 91 exactly what going to the ket would do. As there is only one branch to be
760     obeyed, we can use tail recursion to avoid using another stack frame. */
761 nigel 77
762     case OP_COND:
763 nigel 93 case OP_SCOND:
764     if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
765 nigel 77 {
766 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
767     condition = md->recursive != NULL &&
768     (offset == RREF_ANY || offset == md->recursive->group_num);
769     ecode += condition? 3 : GET(ecode, 1);
770     }
771    
772     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
773     {
774 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
775 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
776     ecode += condition? 3 : GET(ecode, 1);
777 nigel 77 }
778    
779 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
780     {
781     condition = FALSE;
782     ecode += GET(ecode, 1);
783     }
784    
785 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
786 nigel 93 the final argument match_condassert causes it to stop at the end of an
787     assertion. */
788 nigel 77
789     else
790     {
791 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
792     match_condassert, RM3);
793 nigel 77 if (rrc == MATCH_MATCH)
794     {
795 nigel 93 condition = TRUE;
796     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
797 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
798     }
799 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
800 nigel 77 {
801     RRETURN(rrc); /* Need braces because of following else */
802     }
803 nigel 93 else
804     {
805     condition = FALSE;
806     ecode += GET(ecode, 1);
807     }
808     }
809 nigel 91
810 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
811 ph10 197 we can use tail recursion to avoid using another stack frame, except when
812     match_cbegroup is required for an unlimited repeat of a possibly empty
813     group. If the second alternative doesn't exist, we can just plough on. */
814 nigel 91
815 nigel 93 if (condition || *ecode == OP_ALT)
816     {
817 nigel 91 ecode += 1 + LINK_SIZE;
818 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
819     {
820     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
821     RRETURN(rrc);
822     }
823     else /* Group must match something */
824     {
825     flags = 0;
826     goto TAIL_RECURSE;
827     }
828 nigel 77 }
829 ph10 197 else /* Condition false & no 2nd alternative */
830 nigel 93 {
831     ecode += 1 + LINK_SIZE;
832     }
833     break;
834 nigel 77
835    
836 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
837     recursion, we should restore the offsets appropriately and continue from
838     after the call. */
839 nigel 77
840 ph10 210 case OP_ACCEPT:
841 nigel 77 case OP_END:
842     if (md->recursive != NULL && md->recursive->group_num == 0)
843     {
844     recursion_info *rec = md->recursive;
845 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
846 nigel 77 md->recursive = rec->prevrec;
847     memmove(md->offset_vector, rec->offset_save,
848     rec->saved_max * sizeof(int));
849 ph10 168 mstart = rec->save_start;
850 nigel 77 ims = original_ims;
851     ecode = rec->after_call;
852     break;
853     }
854    
855     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
856     string - backtracking will then try other alternatives, if any. */
857    
858 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
859     md->end_match_ptr = eptr; /* Record where we ended */
860     md->end_offset_top = offset_top; /* and how many extracts were taken */
861 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
862 nigel 77 RRETURN(MATCH_MATCH);
863    
864     /* Change option settings */
865    
866     case OP_OPT:
867     ims = ecode[1];
868     ecode += 2;
869     DPRINTF(("ims set to %02lx\n", ims));
870     break;
871    
872     /* Assertion brackets. Check the alternative branches in turn - the
873     matching won't pass the KET for an assertion. If any one branch matches,
874     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
875     start of each branch to move the current point backwards, so the code at
876     this level is identical to the lookahead case. */
877    
878     case OP_ASSERT:
879     case OP_ASSERTBACK:
880     do
881     {
882 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
883     RM4);
884 nigel 77 if (rrc == MATCH_MATCH) break;
885 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
886 nigel 77 ecode += GET(ecode, 1);
887     }
888     while (*ecode == OP_ALT);
889     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
890    
891     /* If checking an assertion for a condition, return MATCH_MATCH. */
892    
893     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
894    
895     /* Continue from after the assertion, updating the offsets high water
896     mark, since extracts may have been taken during the assertion. */
897    
898     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
899     ecode += 1 + LINK_SIZE;
900     offset_top = md->end_offset_top;
901     continue;
902    
903     /* Negative assertion: all branches must fail to match */
904    
905     case OP_ASSERT_NOT:
906     case OP_ASSERTBACK_NOT:
907     do
908     {
909 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
910     RM5);
911 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
912 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
913 nigel 77 ecode += GET(ecode,1);
914     }
915     while (*ecode == OP_ALT);
916    
917     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
918    
919     ecode += 1 + LINK_SIZE;
920     continue;
921    
922     /* Move the subject pointer back. This occurs only at the start of
923     each branch of a lookbehind assertion. If we are too close to the start to
924     move back, this match function fails. When working with UTF-8 we move
925     back a number of characters, not bytes. */
926    
927     case OP_REVERSE:
928     #ifdef SUPPORT_UTF8
929     if (utf8)
930     {
931 nigel 93 i = GET(ecode, 1);
932     while (i-- > 0)
933 nigel 77 {
934     eptr--;
935     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
936 ph10 207 BACKCHAR(eptr);
937 nigel 77 }
938     }
939     else
940     #endif
941    
942     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
943    
944     {
945 nigel 93 eptr -= GET(ecode, 1);
946 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
947     }
948    
949     /* Skip to next op code */
950    
951     ecode += 1 + LINK_SIZE;
952     break;
953    
954     /* The callout item calls an external function, if one is provided, passing
955     details of the match so far. This is mainly for debugging, though the
956     function is able to force a failure. */
957    
958     case OP_CALLOUT:
959     if (pcre_callout != NULL)
960     {
961     pcre_callout_block cb;
962     cb.version = 1; /* Version 1 of the callout block */
963     cb.callout_number = ecode[1];
964     cb.offset_vector = md->offset_vector;
965 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
966 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
967 ph10 168 cb.start_match = mstart - md->start_subject;
968 nigel 77 cb.current_position = eptr - md->start_subject;
969     cb.pattern_position = GET(ecode, 2);
970     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
971     cb.capture_top = offset_top/2;
972     cb.capture_last = md->capture_last;
973     cb.callout_data = md->callout_data;
974     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
975     if (rrc < 0) RRETURN(rrc);
976     }
977     ecode += 2 + 2*LINK_SIZE;
978     break;
979    
980     /* Recursion either matches the current regex, or some subexpression. The
981     offset data is the offset to the starting bracket from the start of the
982     whole pattern. (This is so that it works from duplicated subpatterns.)
983    
984     If there are any capturing brackets started but not finished, we have to
985     save their starting points and reinstate them after the recursion. However,
986     we don't know how many such there are (offset_top records the completed
987     total) so we just have to save all the potential data. There may be up to
988     65535 such values, which is too large to put on the stack, but using malloc
989     for small numbers seems expensive. As a compromise, the stack is used when
990     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
991     is used. If the malloc fails, no attempt is made to carry on with a
992     partial save: this call of match() gives up at once and returns
993     PCRE_ERROR_NOMEMORY.
994    
995     There are also other values that have to be saved. We use a chained
996     sequence of blocks that actually live on the stack. Thanks to Robin Houston
997     for the original version of this logic. */
998    
999     case OP_RECURSE:
1000     {
1001     callpat = md->start_code + GET(ecode, 1);
1002 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1003     GET2(callpat, 1 + LINK_SIZE);
1004 nigel 77
1005     /* Add to "recursing stack" */
1006    
1007     new_recursive.prevrec = md->recursive;
1008     md->recursive = &new_recursive;
1009    
1010     /* Find where to continue from afterwards */
1011    
1012     ecode += 1 + LINK_SIZE;
1013     new_recursive.after_call = ecode;
1014    
1015     /* Now save the offset data. */
1016    
1017     new_recursive.saved_max = md->offset_end;
1018     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1019     new_recursive.offset_save = stacksave;
1020     else
1021     {
1022     new_recursive.offset_save =
1023     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1024     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1025     }
1026    
1027     memcpy(new_recursive.offset_save, md->offset_vector,
1028     new_recursive.saved_max * sizeof(int));
1029 ph10 168 new_recursive.save_start = mstart;
1030     mstart = eptr;
1031 nigel 77
1032     /* OK, now we can do the recursion. For each top-level alternative we
1033     restore the offset and recursion data. */
1034    
1035     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1036 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1037 nigel 77 do
1038     {
1039 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1040     md, ims, eptrb, flags, RM6);
1041 nigel 77 if (rrc == MATCH_MATCH)
1042     {
1043 nigel 87 DPRINTF(("Recursion matched\n"));
1044 nigel 77 md->recursive = new_recursive.prevrec;
1045     if (new_recursive.offset_save != stacksave)
1046     (pcre_free)(new_recursive.offset_save);
1047     RRETURN(MATCH_MATCH);
1048     }
1049 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1050 nigel 87 {
1051     DPRINTF(("Recursion gave error %d\n", rrc));
1052     RRETURN(rrc);
1053     }
1054 nigel 77
1055     md->recursive = &new_recursive;
1056     memcpy(md->offset_vector, new_recursive.offset_save,
1057     new_recursive.saved_max * sizeof(int));
1058     callpat += GET(callpat, 1);
1059     }
1060     while (*callpat == OP_ALT);
1061    
1062     DPRINTF(("Recursion didn't match\n"));
1063     md->recursive = new_recursive.prevrec;
1064     if (new_recursive.offset_save != stacksave)
1065     (pcre_free)(new_recursive.offset_save);
1066     RRETURN(MATCH_NOMATCH);
1067     }
1068     /* Control never reaches here */
1069    
1070     /* "Once" brackets are like assertion brackets except that after a match,
1071     the point in the subject string is not moved back. Thus there can never be
1072     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1073     Check the alternative branches in turn - the matching won't pass the KET
1074     for this kind of subpattern. If any one branch matches, we carry on as at
1075     the end of a normal bracket, leaving the subject pointer. */
1076    
1077     case OP_ONCE:
1078 nigel 91 prev = ecode;
1079     saved_eptr = eptr;
1080    
1081     do
1082 nigel 77 {
1083 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1084 nigel 91 if (rrc == MATCH_MATCH) break;
1085 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1086 nigel 91 ecode += GET(ecode,1);
1087     }
1088     while (*ecode == OP_ALT);
1089 nigel 77
1090 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1091 nigel 77
1092 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1093 nigel 77
1094 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1095     mark, since extracts may have been taken. */
1096 nigel 77
1097 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1098 nigel 77
1099 nigel 91 offset_top = md->end_offset_top;
1100     eptr = md->end_match_ptr;
1101 nigel 77
1102 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1103     happens for a repeating ket if no characters were matched in the group.
1104     This is the forcible breaking of infinite loops as implemented in Perl
1105     5.005. If there is an options reset, it will get obeyed in the normal
1106     course of events. */
1107 nigel 77
1108 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1109     {
1110     ecode += 1+LINK_SIZE;
1111     break;
1112     }
1113 nigel 77
1114 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1115     preceding bracket, in the appropriate order. The second "call" of match()
1116     uses tail recursion, to avoid using another stack frame. We need to reset
1117     any options that changed within the bracket before re-running it, so
1118     check the next opcode. */
1119 nigel 77
1120 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1121     {
1122     ims = (ims & ~PCRE_IMS) | ecode[4];
1123     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1124     }
1125 nigel 77
1126 nigel 91 if (*ecode == OP_KETRMIN)
1127     {
1128 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1129 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1130     ecode = prev;
1131 ph10 197 flags = 0;
1132 nigel 91 goto TAIL_RECURSE;
1133 nigel 77 }
1134 nigel 91 else /* OP_KETRMAX */
1135     {
1136 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1137 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1138     ecode += 1 + LINK_SIZE;
1139 ph10 197 flags = 0;
1140 nigel 91 goto TAIL_RECURSE;
1141     }
1142     /* Control never gets here */
1143 nigel 77
1144     /* An alternation is the end of a branch; scan along to find the end of the
1145     bracketed group and go to there. */
1146    
1147     case OP_ALT:
1148     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1149     break;
1150    
1151 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1152     indicating that it may occur zero times. It may repeat infinitely, or not
1153     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1154     with fixed upper repeat limits are compiled as a number of copies, with the
1155     optional ones preceded by BRAZERO or BRAMINZERO. */
1156 nigel 77
1157     case OP_BRAZERO:
1158     {
1159     next = ecode+1;
1160 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1161 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1162     do next += GET(next,1); while (*next == OP_ALT);
1163 nigel 93 ecode = next + 1 + LINK_SIZE;
1164 nigel 77 }
1165     break;
1166    
1167     case OP_BRAMINZERO:
1168     {
1169     next = ecode+1;
1170 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1171 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1172 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1173     ecode++;
1174     }
1175     break;
1176    
1177 ph10 335 case OP_SKIPZERO:
1178     {
1179     next = ecode+1;
1180     do next += GET(next,1); while (*next == OP_ALT);
1181     ecode = next + 1 + LINK_SIZE;
1182     }
1183     break;
1184    
1185 nigel 93 /* End of a group, repeated or non-repeating. */
1186 nigel 77
1187     case OP_KET:
1188     case OP_KETRMIN:
1189     case OP_KETRMAX:
1190 nigel 91 prev = ecode - GET(ecode, 1);
1191 nigel 77
1192 nigel 93 /* If this was a group that remembered the subject start, in order to break
1193     infinite repeats of empty string matches, retrieve the subject start from
1194     the chain. Otherwise, set it NULL. */
1195 nigel 77
1196 nigel 93 if (*prev >= OP_SBRA)
1197     {
1198     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1199     eptrb = eptrb->epb_prev; /* Backup to previous group */
1200     }
1201     else saved_eptr = NULL;
1202 nigel 77
1203 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1204     MATCH_MATCH, but record the current high water mark for use by positive
1205     assertions. Do this also for the "once" (atomic) groups. */
1206    
1207 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1208     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1209     *prev == OP_ONCE)
1210     {
1211     md->end_match_ptr = eptr; /* For ONCE */
1212     md->end_offset_top = offset_top;
1213     RRETURN(MATCH_MATCH);
1214     }
1215 nigel 77
1216 nigel 93 /* For capturing groups we have to check the group number back at the start
1217     and if necessary complete handling an extraction by setting the offsets and
1218     bumping the high water mark. Note that whole-pattern recursion is coded as
1219     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1220     when the OP_END is reached. Other recursion is handled here. */
1221 nigel 77
1222 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1223 nigel 91 {
1224 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1225 nigel 91 offset = number << 1;
1226 nigel 77
1227     #ifdef DEBUG
1228 nigel 91 printf("end bracket %d", number);
1229     printf("\n");
1230 nigel 77 #endif
1231    
1232 nigel 93 md->capture_last = number;
1233     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1234 nigel 91 {
1235 nigel 93 md->offset_vector[offset] =
1236     md->offset_vector[md->offset_end - number];
1237     md->offset_vector[offset+1] = eptr - md->start_subject;
1238     if (offset_top <= offset) offset_top = offset + 2;
1239     }
1240 nigel 77
1241 nigel 93 /* Handle a recursively called group. Restore the offsets
1242     appropriately and continue from after the call. */
1243 nigel 77
1244 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1245     {
1246     recursion_info *rec = md->recursive;
1247     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1248     md->recursive = rec->prevrec;
1249 ph10 168 mstart = rec->save_start;
1250 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1251     rec->saved_max * sizeof(int));
1252     ecode = rec->after_call;
1253     ims = original_ims;
1254     break;
1255 nigel 77 }
1256 nigel 91 }
1257 nigel 77
1258 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1259     flags, in case they got changed during the group. */
1260 nigel 77
1261 nigel 91 ims = original_ims;
1262     DPRINTF(("ims reset to %02lx\n", ims));
1263 nigel 77
1264 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1265     happens for a repeating ket if no characters were matched in the group.
1266     This is the forcible breaking of infinite loops as implemented in Perl
1267     5.005. If there is an options reset, it will get obeyed in the normal
1268     course of events. */
1269 nigel 77
1270 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1271     {
1272     ecode += 1 + LINK_SIZE;
1273     break;
1274     }
1275 nigel 77
1276 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1277     preceding bracket, in the appropriate order. In the second case, we can use
1278 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1279     unlimited repeat of a group that can match an empty string. */
1280 nigel 77
1281 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1282    
1283 nigel 91 if (*ecode == OP_KETRMIN)
1284     {
1285 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1286 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1287 ph10 197 if (flags != 0) /* Could match an empty string */
1288     {
1289     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1290     RRETURN(rrc);
1291     }
1292 nigel 91 ecode = prev;
1293     goto TAIL_RECURSE;
1294 nigel 77 }
1295 nigel 91 else /* OP_KETRMAX */
1296     {
1297 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1298 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1299     ecode += 1 + LINK_SIZE;
1300 ph10 197 flags = 0;
1301 nigel 91 goto TAIL_RECURSE;
1302     }
1303     /* Control never gets here */
1304 nigel 77
1305     /* Start of subject unless notbol, or after internal newline if multiline */
1306    
1307     case OP_CIRC:
1308     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1309     if ((ims & PCRE_MULTILINE) != 0)
1310     {
1311 nigel 91 if (eptr != md->start_subject &&
1312 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1313 nigel 77 RRETURN(MATCH_NOMATCH);
1314     ecode++;
1315     break;
1316     }
1317     /* ... else fall through */
1318    
1319     /* Start of subject assertion */
1320    
1321     case OP_SOD:
1322     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1323     ecode++;
1324     break;
1325    
1326     /* Start of match assertion */
1327    
1328     case OP_SOM:
1329     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1330     ecode++;
1331     break;
1332 ph10 172
1333 ph10 168 /* Reset the start of match point */
1334 ph10 172
1335 ph10 168 case OP_SET_SOM:
1336     mstart = eptr;
1337 ph10 172 ecode++;
1338     break;
1339 nigel 77
1340     /* Assert before internal newline if multiline, or before a terminating
1341     newline unless endonly is set, else end of subject unless noteol is set. */
1342    
1343     case OP_DOLL:
1344     if ((ims & PCRE_MULTILINE) != 0)
1345     {
1346     if (eptr < md->end_subject)
1347 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1348 nigel 77 else
1349     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1350     ecode++;
1351     break;
1352     }
1353     else
1354     {
1355     if (md->noteol) RRETURN(MATCH_NOMATCH);
1356     if (!md->endonly)
1357     {
1358 nigel 91 if (eptr != md->end_subject &&
1359 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1360 nigel 77 RRETURN(MATCH_NOMATCH);
1361     ecode++;
1362     break;
1363     }
1364     }
1365 nigel 91 /* ... else fall through for endonly */
1366 nigel 77
1367     /* End of subject assertion (\z) */
1368    
1369     case OP_EOD:
1370     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1371     ecode++;
1372     break;
1373    
1374     /* End of subject or ending \n assertion (\Z) */
1375    
1376     case OP_EODN:
1377 nigel 91 if (eptr != md->end_subject &&
1378 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1379 nigel 91 RRETURN(MATCH_NOMATCH);
1380 nigel 77 ecode++;
1381     break;
1382    
1383     /* Word boundary assertions */
1384    
1385     case OP_NOT_WORD_BOUNDARY:
1386     case OP_WORD_BOUNDARY:
1387     {
1388    
1389     /* Find out if the previous and current characters are "word" characters.
1390     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1391     be "non-word" characters. */
1392    
1393     #ifdef SUPPORT_UTF8
1394     if (utf8)
1395     {
1396     if (eptr == md->start_subject) prev_is_word = FALSE; else
1397     {
1398     const uschar *lastptr = eptr - 1;
1399     while((*lastptr & 0xc0) == 0x80) lastptr--;
1400     GETCHAR(c, lastptr);
1401     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1402     }
1403     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1404     {
1405     GETCHAR(c, eptr);
1406     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1407     }
1408     }
1409     else
1410     #endif
1411    
1412     /* More streamlined when not in UTF-8 mode */
1413    
1414     {
1415     prev_is_word = (eptr != md->start_subject) &&
1416     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1417     cur_is_word = (eptr < md->end_subject) &&
1418     ((md->ctypes[*eptr] & ctype_word) != 0);
1419     }
1420    
1421     /* Now see if the situation is what we want */
1422    
1423     if ((*ecode++ == OP_WORD_BOUNDARY)?
1424     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1425     RRETURN(MATCH_NOMATCH);
1426     }
1427     break;
1428    
1429     /* Match a single character type; inline for speed */
1430    
1431     case OP_ANY:
1432 nigel 91 if ((ims & PCRE_DOTALL) == 0)
1433     {
1434 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1435 nigel 91 }
1436 ph10 341 /* Fall through */
1437    
1438     case OP_ALLANY:
1439 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1440     if (utf8)
1441     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1442     ecode++;
1443     break;
1444    
1445     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1446     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1447    
1448     case OP_ANYBYTE:
1449     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1450     ecode++;
1451     break;
1452    
1453     case OP_NOT_DIGIT:
1454     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1455     GETCHARINCTEST(c, eptr);
1456     if (
1457     #ifdef SUPPORT_UTF8
1458     c < 256 &&
1459     #endif
1460     (md->ctypes[c] & ctype_digit) != 0
1461     )
1462     RRETURN(MATCH_NOMATCH);
1463     ecode++;
1464     break;
1465    
1466     case OP_DIGIT:
1467     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1468     GETCHARINCTEST(c, eptr);
1469     if (
1470     #ifdef SUPPORT_UTF8
1471     c >= 256 ||
1472     #endif
1473     (md->ctypes[c] & ctype_digit) == 0
1474     )
1475     RRETURN(MATCH_NOMATCH);
1476     ecode++;
1477     break;
1478    
1479     case OP_NOT_WHITESPACE:
1480     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1481     GETCHARINCTEST(c, eptr);
1482     if (
1483     #ifdef SUPPORT_UTF8
1484     c < 256 &&
1485     #endif
1486     (md->ctypes[c] & ctype_space) != 0
1487     )
1488     RRETURN(MATCH_NOMATCH);
1489     ecode++;
1490     break;
1491    
1492     case OP_WHITESPACE:
1493     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1494     GETCHARINCTEST(c, eptr);
1495     if (
1496     #ifdef SUPPORT_UTF8
1497     c >= 256 ||
1498     #endif
1499     (md->ctypes[c] & ctype_space) == 0
1500     )
1501     RRETURN(MATCH_NOMATCH);
1502     ecode++;
1503     break;
1504    
1505     case OP_NOT_WORDCHAR:
1506     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1507     GETCHARINCTEST(c, eptr);
1508     if (
1509     #ifdef SUPPORT_UTF8
1510     c < 256 &&
1511     #endif
1512     (md->ctypes[c] & ctype_word) != 0
1513     )
1514     RRETURN(MATCH_NOMATCH);
1515     ecode++;
1516     break;
1517    
1518     case OP_WORDCHAR:
1519     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1520     GETCHARINCTEST(c, eptr);
1521     if (
1522     #ifdef SUPPORT_UTF8
1523     c >= 256 ||
1524     #endif
1525     (md->ctypes[c] & ctype_word) == 0
1526     )
1527     RRETURN(MATCH_NOMATCH);
1528     ecode++;
1529     break;
1530    
1531 nigel 93 case OP_ANYNL:
1532     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1533     GETCHARINCTEST(c, eptr);
1534     switch(c)
1535     {
1536     default: RRETURN(MATCH_NOMATCH);
1537     case 0x000d:
1538     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1539     break;
1540 ph10 231
1541 nigel 93 case 0x000a:
1542 ph10 231 break;
1543    
1544 nigel 93 case 0x000b:
1545     case 0x000c:
1546     case 0x0085:
1547     case 0x2028:
1548     case 0x2029:
1549 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1550 nigel 93 break;
1551     }
1552     ecode++;
1553     break;
1554    
1555 ph10 178 case OP_NOT_HSPACE:
1556     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1557     GETCHARINCTEST(c, eptr);
1558     switch(c)
1559     {
1560     default: break;
1561     case 0x09: /* HT */
1562     case 0x20: /* SPACE */
1563     case 0xa0: /* NBSP */
1564     case 0x1680: /* OGHAM SPACE MARK */
1565     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1566     case 0x2000: /* EN QUAD */
1567     case 0x2001: /* EM QUAD */
1568     case 0x2002: /* EN SPACE */
1569     case 0x2003: /* EM SPACE */
1570     case 0x2004: /* THREE-PER-EM SPACE */
1571     case 0x2005: /* FOUR-PER-EM SPACE */
1572     case 0x2006: /* SIX-PER-EM SPACE */
1573     case 0x2007: /* FIGURE SPACE */
1574     case 0x2008: /* PUNCTUATION SPACE */
1575     case 0x2009: /* THIN SPACE */
1576     case 0x200A: /* HAIR SPACE */
1577     case 0x202f: /* NARROW NO-BREAK SPACE */
1578     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1579     case 0x3000: /* IDEOGRAPHIC SPACE */
1580     RRETURN(MATCH_NOMATCH);
1581     }
1582     ecode++;
1583     break;
1584    
1585     case OP_HSPACE:
1586     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1587     GETCHARINCTEST(c, eptr);
1588     switch(c)
1589     {
1590     default: RRETURN(MATCH_NOMATCH);
1591     case 0x09: /* HT */
1592     case 0x20: /* SPACE */
1593     case 0xa0: /* NBSP */
1594     case 0x1680: /* OGHAM SPACE MARK */
1595     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1596     case 0x2000: /* EN QUAD */
1597     case 0x2001: /* EM QUAD */
1598     case 0x2002: /* EN SPACE */
1599     case 0x2003: /* EM SPACE */
1600     case 0x2004: /* THREE-PER-EM SPACE */
1601     case 0x2005: /* FOUR-PER-EM SPACE */
1602     case 0x2006: /* SIX-PER-EM SPACE */
1603     case 0x2007: /* FIGURE SPACE */
1604     case 0x2008: /* PUNCTUATION SPACE */
1605     case 0x2009: /* THIN SPACE */
1606     case 0x200A: /* HAIR SPACE */
1607     case 0x202f: /* NARROW NO-BREAK SPACE */
1608     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1609     case 0x3000: /* IDEOGRAPHIC SPACE */
1610     break;
1611     }
1612     ecode++;
1613     break;
1614    
1615     case OP_NOT_VSPACE:
1616     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1617     GETCHARINCTEST(c, eptr);
1618     switch(c)
1619     {
1620     default: break;
1621     case 0x0a: /* LF */
1622     case 0x0b: /* VT */
1623     case 0x0c: /* FF */
1624     case 0x0d: /* CR */
1625     case 0x85: /* NEL */
1626     case 0x2028: /* LINE SEPARATOR */
1627     case 0x2029: /* PARAGRAPH SEPARATOR */
1628     RRETURN(MATCH_NOMATCH);
1629     }
1630     ecode++;
1631     break;
1632    
1633     case OP_VSPACE:
1634     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1635     GETCHARINCTEST(c, eptr);
1636     switch(c)
1637     {
1638     default: RRETURN(MATCH_NOMATCH);
1639     case 0x0a: /* LF */
1640     case 0x0b: /* VT */
1641     case 0x0c: /* FF */
1642     case 0x0d: /* CR */
1643     case 0x85: /* NEL */
1644     case 0x2028: /* LINE SEPARATOR */
1645     case 0x2029: /* PARAGRAPH SEPARATOR */
1646     break;
1647     }
1648     ecode++;
1649     break;
1650    
1651 nigel 77 #ifdef SUPPORT_UCP
1652     /* Check the next character by Unicode property. We will get here only
1653     if the support is in the binary; otherwise a compile-time error occurs. */
1654    
1655     case OP_PROP:
1656     case OP_NOTPROP:
1657     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1658     GETCHARINCTEST(c, eptr);
1659     {
1660 nigel 87 int chartype, script;
1661     int category = _pcre_ucp_findprop(c, &chartype, &script);
1662 nigel 77
1663 nigel 87 switch(ecode[1])
1664     {
1665     case PT_ANY:
1666     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1667     break;
1668 nigel 77
1669 nigel 87 case PT_LAMP:
1670     if ((chartype == ucp_Lu ||
1671     chartype == ucp_Ll ||
1672     chartype == ucp_Lt) == (op == OP_NOTPROP))
1673 nigel 77 RRETURN(MATCH_NOMATCH);
1674 nigel 87 break;
1675    
1676     case PT_GC:
1677     if ((ecode[2] != category) == (op == OP_PROP))
1678 nigel 77 RRETURN(MATCH_NOMATCH);
1679 nigel 87 break;
1680    
1681     case PT_PC:
1682     if ((ecode[2] != chartype) == (op == OP_PROP))
1683     RRETURN(MATCH_NOMATCH);
1684     break;
1685    
1686     case PT_SC:
1687     if ((ecode[2] != script) == (op == OP_PROP))
1688     RRETURN(MATCH_NOMATCH);
1689     break;
1690    
1691     default:
1692     RRETURN(PCRE_ERROR_INTERNAL);
1693 nigel 77 }
1694 nigel 87
1695     ecode += 3;
1696 nigel 77 }
1697     break;
1698    
1699     /* Match an extended Unicode sequence. We will get here only if the support
1700     is in the binary; otherwise a compile-time error occurs. */
1701    
1702     case OP_EXTUNI:
1703     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1704     GETCHARINCTEST(c, eptr);
1705     {
1706 nigel 87 int chartype, script;
1707     int category = _pcre_ucp_findprop(c, &chartype, &script);
1708 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1709     while (eptr < md->end_subject)
1710     {
1711     int len = 1;
1712     if (!utf8) c = *eptr; else
1713     {
1714     GETCHARLEN(c, eptr, len);
1715     }
1716 nigel 87 category = _pcre_ucp_findprop(c, &chartype, &script);
1717 nigel 77 if (category != ucp_M) break;
1718     eptr += len;
1719     }
1720     }
1721     ecode++;
1722     break;
1723     #endif
1724    
1725    
1726     /* Match a back reference, possibly repeatedly. Look past the end of the
1727     item to see if there is repeat information following. The code is similar
1728     to that for character classes, but repeated for efficiency. Then obey
1729     similar code to character type repeats - written out again for speed.
1730     However, if the referenced string is the empty string, always treat
1731     it as matched, any number of times (otherwise there could be infinite
1732     loops). */
1733    
1734     case OP_REF:
1735     {
1736     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1737 ph10 336 ecode += 3;
1738    
1739     /* If the reference is unset, there are two possibilities:
1740    
1741     (a) In the default, Perl-compatible state, set the length to be longer
1742     than the amount of subject left; this ensures that every attempt at a
1743     match fails. We can't just fail here, because of the possibility of
1744     quantifiers with zero minima.
1745    
1746     (b) If the JavaScript compatibility flag is set, set the length to zero
1747     so that the back reference matches an empty string.
1748    
1749     Otherwise, set the length to the length of what was matched by the
1750     referenced subpattern. */
1751    
1752     if (offset >= offset_top || md->offset_vector[offset] < 0)
1753     length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1754     else
1755     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1756 nigel 77
1757     /* Set up for repetition, or handle the non-repeated case */
1758    
1759     switch (*ecode)
1760     {
1761     case OP_CRSTAR:
1762     case OP_CRMINSTAR:
1763     case OP_CRPLUS:
1764     case OP_CRMINPLUS:
1765     case OP_CRQUERY:
1766     case OP_CRMINQUERY:
1767     c = *ecode++ - OP_CRSTAR;
1768     minimize = (c & 1) != 0;
1769     min = rep_min[c]; /* Pick up values from tables; */
1770     max = rep_max[c]; /* zero for max => infinity */
1771     if (max == 0) max = INT_MAX;
1772     break;
1773    
1774     case OP_CRRANGE:
1775     case OP_CRMINRANGE:
1776     minimize = (*ecode == OP_CRMINRANGE);
1777     min = GET2(ecode, 1);
1778     max = GET2(ecode, 3);
1779     if (max == 0) max = INT_MAX;
1780     ecode += 5;
1781     break;
1782    
1783     default: /* No repeat follows */
1784     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1785     eptr += length;
1786     continue; /* With the main loop */
1787     }
1788    
1789     /* If the length of the reference is zero, just continue with the
1790     main loop. */
1791    
1792     if (length == 0) continue;
1793    
1794     /* First, ensure the minimum number of matches are present. We get back
1795     the length of the reference string explicitly rather than passing the
1796     address of eptr, so that eptr can be a register variable. */
1797    
1798     for (i = 1; i <= min; i++)
1799     {
1800     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1801     eptr += length;
1802     }
1803    
1804     /* If min = max, continue at the same level without recursion.
1805     They are not both allowed to be zero. */
1806    
1807     if (min == max) continue;
1808    
1809     /* If minimizing, keep trying and advancing the pointer */
1810    
1811     if (minimize)
1812     {
1813     for (fi = min;; fi++)
1814     {
1815 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1816 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1817     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1818     RRETURN(MATCH_NOMATCH);
1819     eptr += length;
1820     }
1821     /* Control never gets here */
1822     }
1823    
1824     /* If maximizing, find the longest string and work backwards */
1825    
1826     else
1827     {
1828     pp = eptr;
1829     for (i = min; i < max; i++)
1830     {
1831     if (!match_ref(offset, eptr, length, md, ims)) break;
1832     eptr += length;
1833     }
1834     while (eptr >= pp)
1835     {
1836 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1837 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1838     eptr -= length;
1839     }
1840     RRETURN(MATCH_NOMATCH);
1841     }
1842     }
1843     /* Control never gets here */
1844    
1845    
1846    
1847     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1848     used when all the characters in the class have values in the range 0-255,
1849     and either the matching is caseful, or the characters are in the range
1850     0-127 when UTF-8 processing is enabled. The only difference between
1851     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1852     encountered.
1853    
1854     First, look past the end of the item to see if there is repeat information
1855     following. Then obey similar code to character type repeats - written out
1856     again for speed. */
1857    
1858     case OP_NCLASS:
1859     case OP_CLASS:
1860     {
1861     data = ecode + 1; /* Save for matching */
1862     ecode += 33; /* Advance past the item */
1863    
1864     switch (*ecode)
1865     {
1866     case OP_CRSTAR:
1867     case OP_CRMINSTAR:
1868     case OP_CRPLUS:
1869     case OP_CRMINPLUS:
1870     case OP_CRQUERY:
1871     case OP_CRMINQUERY:
1872     c = *ecode++ - OP_CRSTAR;
1873     minimize = (c & 1) != 0;
1874     min = rep_min[c]; /* Pick up values from tables; */
1875     max = rep_max[c]; /* zero for max => infinity */
1876     if (max == 0) max = INT_MAX;
1877     break;
1878    
1879     case OP_CRRANGE:
1880     case OP_CRMINRANGE:
1881     minimize = (*ecode == OP_CRMINRANGE);
1882     min = GET2(ecode, 1);
1883     max = GET2(ecode, 3);
1884     if (max == 0) max = INT_MAX;
1885     ecode += 5;
1886     break;
1887    
1888     default: /* No repeat follows */
1889     min = max = 1;
1890     break;
1891     }
1892    
1893     /* First, ensure the minimum number of matches are present. */
1894    
1895     #ifdef SUPPORT_UTF8
1896     /* UTF-8 mode */
1897     if (utf8)
1898     {
1899     for (i = 1; i <= min; i++)
1900     {
1901     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1902     GETCHARINC(c, eptr);
1903     if (c > 255)
1904     {
1905     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1906     }
1907     else
1908     {
1909     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1910     }
1911     }
1912     }
1913     else
1914     #endif
1915     /* Not UTF-8 mode */
1916     {
1917     for (i = 1; i <= min; i++)
1918     {
1919     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1920     c = *eptr++;
1921     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1922     }
1923     }
1924    
1925     /* If max == min we can continue with the main loop without the
1926     need to recurse. */
1927    
1928     if (min == max) continue;
1929    
1930     /* If minimizing, keep testing the rest of the expression and advancing
1931     the pointer while it matches the class. */
1932    
1933     if (minimize)
1934     {
1935     #ifdef SUPPORT_UTF8
1936     /* UTF-8 mode */
1937     if (utf8)
1938     {
1939     for (fi = min;; fi++)
1940     {
1941 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1942 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1943     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1944     GETCHARINC(c, eptr);
1945     if (c > 255)
1946     {
1947     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1948     }
1949     else
1950     {
1951     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1952     }
1953     }
1954     }
1955     else
1956     #endif
1957     /* Not UTF-8 mode */
1958     {
1959     for (fi = min;; fi++)
1960     {
1961 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1962 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1963     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1964     c = *eptr++;
1965     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1966     }
1967     }
1968     /* Control never gets here */
1969     }
1970    
1971     /* If maximizing, find the longest possible run, then work backwards. */
1972    
1973     else
1974     {
1975     pp = eptr;
1976    
1977     #ifdef SUPPORT_UTF8
1978     /* UTF-8 mode */
1979     if (utf8)
1980     {
1981     for (i = min; i < max; i++)
1982     {
1983     int len = 1;
1984     if (eptr >= md->end_subject) break;
1985     GETCHARLEN(c, eptr, len);
1986     if (c > 255)
1987     {
1988     if (op == OP_CLASS) break;
1989     }
1990     else
1991     {
1992     if ((data[c/8] & (1 << (c&7))) == 0) break;
1993     }
1994     eptr += len;
1995     }
1996     for (;;)
1997     {
1998 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
1999 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2000     if (eptr-- == pp) break; /* Stop if tried at original pos */
2001     BACKCHAR(eptr);
2002     }
2003     }
2004     else
2005     #endif
2006     /* Not UTF-8 mode */
2007     {
2008     for (i = min; i < max; i++)
2009     {
2010     if (eptr >= md->end_subject) break;
2011     c = *eptr;
2012     if ((data[c/8] & (1 << (c&7))) == 0) break;
2013     eptr++;
2014     }
2015     while (eptr >= pp)
2016     {
2017 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2018 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2019 nigel 77 eptr--;
2020     }
2021     }
2022    
2023     RRETURN(MATCH_NOMATCH);
2024     }
2025     }
2026     /* Control never gets here */
2027    
2028    
2029     /* Match an extended character class. This opcode is encountered only
2030     in UTF-8 mode, because that's the only time it is compiled. */
2031    
2032     #ifdef SUPPORT_UTF8
2033     case OP_XCLASS:
2034     {
2035     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2036     ecode += GET(ecode, 1); /* Advance past the item */
2037    
2038     switch (*ecode)
2039     {
2040     case OP_CRSTAR:
2041     case OP_CRMINSTAR:
2042     case OP_CRPLUS:
2043     case OP_CRMINPLUS:
2044     case OP_CRQUERY:
2045     case OP_CRMINQUERY:
2046     c = *ecode++ - OP_CRSTAR;
2047     minimize = (c & 1) != 0;
2048     min = rep_min[c]; /* Pick up values from tables; */
2049     max = rep_max[c]; /* zero for max => infinity */
2050     if (max == 0) max = INT_MAX;
2051     break;
2052    
2053     case OP_CRRANGE:
2054     case OP_CRMINRANGE:
2055     minimize = (*ecode == OP_CRMINRANGE);
2056     min = GET2(ecode, 1);
2057     max = GET2(ecode, 3);
2058     if (max == 0) max = INT_MAX;
2059     ecode += 5;
2060     break;
2061    
2062     default: /* No repeat follows */
2063     min = max = 1;
2064     break;
2065     }
2066    
2067     /* First, ensure the minimum number of matches are present. */
2068    
2069     for (i = 1; i <= min; i++)
2070     {
2071     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2072     GETCHARINC(c, eptr);
2073     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2074     }
2075    
2076     /* If max == min we can continue with the main loop without the
2077     need to recurse. */
2078    
2079     if (min == max) continue;
2080    
2081     /* If minimizing, keep testing the rest of the expression and advancing
2082     the pointer while it matches the class. */
2083    
2084     if (minimize)
2085     {
2086     for (fi = min;; fi++)
2087     {
2088 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2089 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2090     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2091     GETCHARINC(c, eptr);
2092     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2093     }
2094     /* Control never gets here */
2095     }
2096    
2097     /* If maximizing, find the longest possible run, then work backwards. */
2098    
2099     else
2100     {
2101     pp = eptr;
2102     for (i = min; i < max; i++)
2103     {
2104     int len = 1;
2105     if (eptr >= md->end_subject) break;
2106     GETCHARLEN(c, eptr, len);
2107     if (!_pcre_xclass(c, data)) break;
2108     eptr += len;
2109     }
2110     for(;;)
2111     {
2112 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2113 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2114     if (eptr-- == pp) break; /* Stop if tried at original pos */
2115 ph10 214 if (utf8) BACKCHAR(eptr);
2116 nigel 77 }
2117     RRETURN(MATCH_NOMATCH);
2118     }
2119    
2120     /* Control never gets here */
2121     }
2122     #endif /* End of XCLASS */
2123    
2124     /* Match a single character, casefully */
2125    
2126     case OP_CHAR:
2127     #ifdef SUPPORT_UTF8
2128     if (utf8)
2129     {
2130     length = 1;
2131     ecode++;
2132     GETCHARLEN(fc, ecode, length);
2133     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2134     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2135     }
2136     else
2137     #endif
2138    
2139     /* Non-UTF-8 mode */
2140     {
2141     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2142     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2143     ecode += 2;
2144     }
2145     break;
2146    
2147     /* Match a single character, caselessly */
2148    
2149     case OP_CHARNC:
2150     #ifdef SUPPORT_UTF8
2151     if (utf8)
2152     {
2153     length = 1;
2154     ecode++;
2155     GETCHARLEN(fc, ecode, length);
2156    
2157     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2158    
2159     /* If the pattern character's value is < 128, we have only one byte, and
2160     can use the fast lookup table. */
2161    
2162     if (fc < 128)
2163     {
2164     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2165     }
2166    
2167     /* Otherwise we must pick up the subject character */
2168    
2169     else
2170     {
2171 nigel 93 unsigned int dc;
2172 nigel 77 GETCHARINC(dc, eptr);
2173     ecode += length;
2174    
2175     /* If we have Unicode property support, we can use it to test the other
2176 nigel 87 case of the character, if there is one. */
2177 nigel 77
2178     if (fc != dc)
2179     {
2180     #ifdef SUPPORT_UCP
2181 nigel 87 if (dc != _pcre_ucp_othercase(fc))
2182 nigel 77 #endif
2183     RRETURN(MATCH_NOMATCH);
2184     }
2185     }
2186     }
2187     else
2188     #endif /* SUPPORT_UTF8 */
2189    
2190     /* Non-UTF-8 mode */
2191     {
2192     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2193     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2194     ecode += 2;
2195     }
2196     break;
2197    
2198 nigel 93 /* Match a single character repeatedly. */
2199 nigel 77
2200     case OP_EXACT:
2201     min = max = GET2(ecode, 1);
2202     ecode += 3;
2203     goto REPEATCHAR;
2204    
2205 nigel 93 case OP_POSUPTO:
2206     possessive = TRUE;
2207     /* Fall through */
2208    
2209 nigel 77 case OP_UPTO:
2210     case OP_MINUPTO:
2211     min = 0;
2212     max = GET2(ecode, 1);
2213     minimize = *ecode == OP_MINUPTO;
2214     ecode += 3;
2215     goto REPEATCHAR;
2216    
2217 nigel 93 case OP_POSSTAR:
2218     possessive = TRUE;
2219     min = 0;
2220     max = INT_MAX;
2221     ecode++;
2222     goto REPEATCHAR;
2223    
2224     case OP_POSPLUS:
2225     possessive = TRUE;
2226     min = 1;
2227     max = INT_MAX;
2228     ecode++;
2229     goto REPEATCHAR;
2230    
2231     case OP_POSQUERY:
2232     possessive = TRUE;
2233     min = 0;
2234     max = 1;
2235     ecode++;
2236     goto REPEATCHAR;
2237    
2238 nigel 77 case OP_STAR:
2239     case OP_MINSTAR:
2240     case OP_PLUS:
2241     case OP_MINPLUS:
2242     case OP_QUERY:
2243     case OP_MINQUERY:
2244     c = *ecode++ - OP_STAR;
2245     minimize = (c & 1) != 0;
2246     min = rep_min[c]; /* Pick up values from tables; */
2247     max = rep_max[c]; /* zero for max => infinity */
2248     if (max == 0) max = INT_MAX;
2249    
2250     /* Common code for all repeated single-character matches. We can give
2251     up quickly if there are fewer than the minimum number of characters left in
2252     the subject. */
2253    
2254     REPEATCHAR:
2255     #ifdef SUPPORT_UTF8
2256     if (utf8)
2257     {
2258     length = 1;
2259     charptr = ecode;
2260     GETCHARLEN(fc, ecode, length);
2261     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2262     ecode += length;
2263    
2264     /* Handle multibyte character matching specially here. There is
2265     support for caseless matching if UCP support is present. */
2266    
2267     if (length > 1)
2268     {
2269     #ifdef SUPPORT_UCP
2270 nigel 93 unsigned int othercase;
2271 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2272 nigel 93 (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
2273 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2274 ph10 115 else oclength = 0;
2275 nigel 77 #endif /* SUPPORT_UCP */
2276    
2277     for (i = 1; i <= min; i++)
2278     {
2279     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2280 ph10 123 #ifdef SUPPORT_UCP
2281 nigel 77 /* Need braces because of following else */
2282     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2283     else
2284     {
2285     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2286     eptr += oclength;
2287     }
2288 ph10 115 #else /* without SUPPORT_UCP */
2289     else { RRETURN(MATCH_NOMATCH); }
2290 ph10 123 #endif /* SUPPORT_UCP */
2291 nigel 77 }
2292    
2293     if (min == max) continue;
2294    
2295     if (minimize)
2296     {
2297     for (fi = min;; fi++)
2298     {
2299 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2300 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2301     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2302     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2303 ph10 123 #ifdef SUPPORT_UCP
2304 nigel 77 /* Need braces because of following else */
2305     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2306     else
2307     {
2308     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2309     eptr += oclength;
2310     }
2311 ph10 115 #else /* without SUPPORT_UCP */
2312     else { RRETURN (MATCH_NOMATCH); }
2313     #endif /* SUPPORT_UCP */
2314 nigel 77 }
2315     /* Control never gets here */
2316     }
2317 nigel 93
2318     else /* Maximize */
2319 nigel 77 {
2320     pp = eptr;
2321     for (i = min; i < max; i++)
2322     {
2323     if (eptr > md->end_subject - length) break;
2324     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2325 ph10 123 #ifdef SUPPORT_UCP
2326 nigel 77 else if (oclength == 0) break;
2327     else
2328     {
2329     if (memcmp(eptr, occhars, oclength) != 0) break;
2330     eptr += oclength;
2331     }
2332 ph10 115 #else /* without SUPPORT_UCP */
2333     else break;
2334 ph10 123 #endif /* SUPPORT_UCP */
2335 nigel 77 }
2336 nigel 93
2337     if (possessive) continue;
2338 ph10 120 for(;;)
2339 nigel 77 {
2340 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2341 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2342 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2343 ph10 115 #ifdef SUPPORT_UCP
2344     eptr--;
2345     BACKCHAR(eptr);
2346 ph10 123 #else /* without SUPPORT_UCP */
2347 nigel 77 eptr -= length;
2348 ph10 123 #endif /* SUPPORT_UCP */
2349 nigel 77 }
2350     }
2351     /* Control never gets here */
2352     }
2353    
2354     /* If the length of a UTF-8 character is 1, we fall through here, and
2355     obey the code as for non-UTF-8 characters below, though in this case the
2356     value of fc will always be < 128. */
2357     }
2358     else
2359     #endif /* SUPPORT_UTF8 */
2360    
2361     /* When not in UTF-8 mode, load a single-byte character. */
2362     {
2363     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2364     fc = *ecode++;
2365     }
2366    
2367     /* The value of fc at this point is always less than 256, though we may or
2368     may not be in UTF-8 mode. The code is duplicated for the caseless and
2369     caseful cases, for speed, since matching characters is likely to be quite
2370     common. First, ensure the minimum number of matches are present. If min =
2371     max, continue at the same level without recursing. Otherwise, if
2372     minimizing, keep trying the rest of the expression and advancing one
2373     matching character if failing, up to the maximum. Alternatively, if
2374     maximizing, find the maximum number of characters and work backwards. */
2375    
2376     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2377     max, eptr));
2378    
2379     if ((ims & PCRE_CASELESS) != 0)
2380     {
2381     fc = md->lcc[fc];
2382     for (i = 1; i <= min; i++)
2383     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2384     if (min == max) continue;
2385     if (minimize)
2386     {
2387     for (fi = min;; fi++)
2388     {
2389 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2390 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2391     if (fi >= max || eptr >= md->end_subject ||
2392     fc != md->lcc[*eptr++])
2393     RRETURN(MATCH_NOMATCH);
2394     }
2395     /* Control never gets here */
2396     }
2397 nigel 93 else /* Maximize */
2398 nigel 77 {
2399     pp = eptr;
2400     for (i = min; i < max; i++)
2401     {
2402     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2403     eptr++;
2404     }
2405 nigel 93 if (possessive) continue;
2406 nigel 77 while (eptr >= pp)
2407     {
2408 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2409 nigel 77 eptr--;
2410     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411     }
2412     RRETURN(MATCH_NOMATCH);
2413     }
2414     /* Control never gets here */
2415     }
2416    
2417     /* Caseful comparisons (single-byte fc only; multi-byte characters are fully handled above) */
2418    
2419     else
2420     {
2421     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2422     if (min == max) continue;
2423     if (minimize)
2424     {
2425     for (fi = min;; fi++)
2426     {
2427 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2428 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2429     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2430     RRETURN(MATCH_NOMATCH);
2431     }
2432     /* Control never gets here */
2433     }
2434 nigel 93 else /* Maximize */
2435 nigel 77 {
2436     pp = eptr;
2437     for (i = min; i < max; i++)
2438     {
2439     if (eptr >= md->end_subject || fc != *eptr) break;
2440     eptr++;
2441     }
2442 nigel 93 if (possessive) continue;
2443 nigel 77 while (eptr >= pp)
2444     {
2445 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2446 nigel 77 eptr--;
2447     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2448     }
2449     RRETURN(MATCH_NOMATCH);
2450     }
2451     }
2452     /* Control never gets here */
2453    
2454     /* Match a negated single one-byte character. The pattern character is always
2455     a single byte, but the subject character checked against it can be multibyte. */
2456    
2457     case OP_NOT:
2458     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2459     ecode++;
2460     GETCHARINCTEST(c, eptr);
2461     if ((ims & PCRE_CASELESS) != 0)
2462     {
2463     #ifdef SUPPORT_UTF8
2464     if (c < 256)
2465     #endif
2466     c = md->lcc[c];
2467     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2468     }
2469     else
2470     {
2471     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2472     }
2473     break;
2474    
2475     /* Match a negated single one-byte character repeatedly. This is almost a
2476     repeat of the code for a repeated single character, but I haven't found a
2477     nice way of commoning these up that doesn't require a test of the
2478     positive/negative option for each character match. Maybe that wouldn't add
2479     very much to the time taken, but character matching *is* what this is all
2480     about... */
2481    
2482     case OP_NOTEXACT:
2483     min = max = GET2(ecode, 1);
2484     ecode += 3;
2485     goto REPEATNOTCHAR;
2486    
2487     case OP_NOTUPTO:
2488     case OP_NOTMINUPTO:
2489     min = 0;
2490     max = GET2(ecode, 1);
2491     minimize = *ecode == OP_NOTMINUPTO;
2492     ecode += 3;
2493     goto REPEATNOTCHAR;
2494    
2495 nigel 93 case OP_NOTPOSSTAR:
2496     possessive = TRUE;
2497     min = 0;
2498     max = INT_MAX;
2499     ecode++;
2500     goto REPEATNOTCHAR;
2501    
2502     case OP_NOTPOSPLUS:
2503     possessive = TRUE;
2504     min = 1;
2505     max = INT_MAX;
2506     ecode++;
2507     goto REPEATNOTCHAR;
2508    
2509     case OP_NOTPOSQUERY:
2510     possessive = TRUE;
2511     min = 0;
2512     max = 1;
2513     ecode++;
2514     goto REPEATNOTCHAR;
2515    
2516     case OP_NOTPOSUPTO:
2517     possessive = TRUE;
2518     min = 0;
2519     max = GET2(ecode, 1);
2520     ecode += 3;
2521     goto REPEATNOTCHAR;
2522    
2523 nigel 77 case OP_NOTSTAR:
2524     case OP_NOTMINSTAR:
2525     case OP_NOTPLUS:
2526     case OP_NOTMINPLUS:
2527     case OP_NOTQUERY:
2528     case OP_NOTMINQUERY:
2529     c = *ecode++ - OP_NOTSTAR;
2530     minimize = (c & 1) != 0;
2531     min = rep_min[c]; /* Pick up values from tables; */
2532     max = rep_max[c]; /* zero for max => infinity */
2533     if (max == 0) max = INT_MAX;
2534    
2535     /* Common code for all repeated single-byte matches. We can give up quickly
2536     if there are fewer than the minimum number of bytes left in the
2537     subject. */
2538    
2539     REPEATNOTCHAR:
2540     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2541     fc = *ecode++;
2542    
2543     /* The code is duplicated for the caseless and caseful cases, for speed,
2544     since matching characters is likely to be quite common. First, ensure the
2545     minimum number of matches are present. If min = max, continue at the same
2546     level without recursing. Otherwise, if minimizing, keep trying the rest of
2547     the expression and advancing one matching character if failing, up to the
2548     maximum. Alternatively, if maximizing, find the maximum number of
2549     characters and work backwards. */
2550    
2551     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2552     max, eptr));
2553    
2554     if ((ims & PCRE_CASELESS) != 0)
2555     {
2556     fc = md->lcc[fc];
2557    
2558     #ifdef SUPPORT_UTF8
2559     /* UTF-8 mode */
2560     if (utf8)
2561     {
2562 nigel 93 register unsigned int d;
2563 nigel 77 for (i = 1; i <= min; i++)
2564     {
2565     GETCHARINC(d, eptr);
2566     if (d < 256) d = md->lcc[d];
2567     if (fc == d) RRETURN(MATCH_NOMATCH);
2568     }
2569     }
2570     else
2571     #endif
2572    
2573     /* Not UTF-8 mode */
2574     {
2575     for (i = 1; i <= min; i++)
2576     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2577     }
2578    
2579     if (min == max) continue;
2580    
2581     if (minimize)
2582     {
2583     #ifdef SUPPORT_UTF8
2584     /* UTF-8 mode */
2585     if (utf8)
2586     {
2587 nigel 93 register unsigned int d;
2588 nigel 77 for (fi = min;; fi++)
2589     {
2590 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2591 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2592     GETCHARINC(d, eptr);
2593     if (d < 256) d = md->lcc[d];
2594     if (fi >= max || eptr >= md->end_subject || fc == d)
2595     RRETURN(MATCH_NOMATCH);
2596     }
2597     }
2598     else
2599     #endif
2600     /* Not UTF-8 mode */
2601     {
2602     for (fi = min;; fi++)
2603     {
2604 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2605 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2606     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2607     RRETURN(MATCH_NOMATCH);
2608     }
2609     }
2610     /* Control never gets here */
2611     }
2612    
2613     /* Maximize case */
2614    
2615     else
2616     {
2617     pp = eptr;
2618    
2619     #ifdef SUPPORT_UTF8
2620     /* UTF-8 mode */
2621     if (utf8)
2622     {
2623 nigel 93 register unsigned int d;
2624 nigel 77 for (i = min; i < max; i++)
2625     {
2626     int len = 1;
2627     if (eptr >= md->end_subject) break;
2628     GETCHARLEN(d, eptr, len);
2629     if (d < 256) d = md->lcc[d];
2630     if (fc == d) break;
2631     eptr += len;
2632     }
2633 nigel 93 if (possessive) continue;
2634     for(;;)
2635 nigel 77 {
2636 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2637 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2638     if (eptr-- == pp) break; /* Stop if tried at original pos */
2639     BACKCHAR(eptr);
2640     }
2641     }
2642     else
2643     #endif
2644     /* Not UTF-8 mode */
2645     {
2646     for (i = min; i < max; i++)
2647     {
2648     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2649     eptr++;
2650     }
2651 nigel 93 if (possessive) continue;
2652 nigel 77 while (eptr >= pp)
2653     {
2654 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2655 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2656     eptr--;
2657     }
2658     }
2659    
2660     RRETURN(MATCH_NOMATCH);
2661     }
2662     /* Control never gets here */
2663     }
2664    
2665     /* Caseful comparisons */
2666    
2667     else
2668     {
2669     #ifdef SUPPORT_UTF8
2670     /* UTF-8 mode */
2671     if (utf8)
2672     {
2673 nigel 93 register unsigned int d;
2674 nigel 77 for (i = 1; i <= min; i++)
2675     {
2676     GETCHARINC(d, eptr);
2677     if (fc == d) RRETURN(MATCH_NOMATCH);
2678     }
2679     }
2680     else
2681     #endif
2682     /* Not UTF-8 mode */
2683     {
2684     for (i = 1; i <= min; i++)
2685     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2686     }
2687    
2688     if (min == max) continue;
2689    
2690     if (minimize)
2691     {
2692     #ifdef SUPPORT_UTF8
2693     /* UTF-8 mode */
2694     if (utf8)
2695     {
2696 nigel 93 register unsigned int d;
2697 nigel 77 for (fi = min;; fi++)
2698     {
2699 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2700 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2701     GETCHARINC(d, eptr);
2702     if (fi >= max || eptr >= md->end_subject || fc == d)
2703     RRETURN(MATCH_NOMATCH);
2704     }
2705     }
2706     else
2707     #endif
2708     /* Not UTF-8 mode */
2709     {
2710     for (fi = min;; fi++)
2711     {
2712 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2713 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2714     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2715     RRETURN(MATCH_NOMATCH);
2716     }
2717     }
2718     /* Control never gets here */
2719     }
2720    
2721     /* Maximize case */
2722    
2723     else
2724     {
2725     pp = eptr;
2726    
2727     #ifdef SUPPORT_UTF8
2728     /* UTF-8 mode */
2729     if (utf8)
2730     {
2731 nigel 93 register unsigned int d;
2732 nigel 77 for (i = min; i < max; i++)
2733     {
2734     int len = 1;
2735     if (eptr >= md->end_subject) break;
2736     GETCHARLEN(d, eptr, len);
2737     if (fc == d) break;
2738     eptr += len;
2739     }
2740 nigel 93 if (possessive) continue;
2741 nigel 77 for(;;)
2742     {
2743 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2744 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2745     if (eptr-- == pp) break; /* Stop if tried at original pos */
2746     BACKCHAR(eptr);
2747     }
2748     }
2749     else
2750     #endif
2751     /* Not UTF-8 mode */
2752     {
2753     for (i = min; i < max; i++)
2754     {
2755     if (eptr >= md->end_subject || fc == *eptr) break;
2756     eptr++;
2757     }
2758 nigel 93 if (possessive) continue;
2759 nigel 77 while (eptr >= pp)
2760     {
2761 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2762 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763     eptr--;
2764     }
2765     }
2766    
2767     RRETURN(MATCH_NOMATCH);
2768     }
2769     }
2770     /* Control never gets here */
2771    
2772     /* Match a single character type repeatedly; several different opcodes
2773     share code. This is very similar to the code for single characters, but we
2774     repeat it in the interests of efficiency. */
2775    
2776     case OP_TYPEEXACT:
2777     min = max = GET2(ecode, 1);
2778     minimize = TRUE;
2779     ecode += 3;
2780     goto REPEATTYPE;
2781    
2782     case OP_TYPEUPTO:
2783     case OP_TYPEMINUPTO:
2784     min = 0;
2785     max = GET2(ecode, 1);
2786     minimize = *ecode == OP_TYPEMINUPTO;
2787     ecode += 3;
2788     goto REPEATTYPE;
2789    
2790 nigel 93 case OP_TYPEPOSSTAR:
2791     possessive = TRUE;
2792     min = 0;
2793     max = INT_MAX;
2794     ecode++;
2795     goto REPEATTYPE;
2796    
2797     case OP_TYPEPOSPLUS:
2798     possessive = TRUE;
2799     min = 1;
2800     max = INT_MAX;
2801     ecode++;
2802     goto REPEATTYPE;
2803    
2804     case OP_TYPEPOSQUERY:
2805     possessive = TRUE;
2806     min = 0;
2807     max = 1;
2808     ecode++;
2809     goto REPEATTYPE;
2810    
2811     case OP_TYPEPOSUPTO:
2812     possessive = TRUE;
2813     min = 0;
2814     max = GET2(ecode, 1);
2815     ecode += 3;
2816     goto REPEATTYPE;
2817    
2818 nigel 77 case OP_TYPESTAR:
2819     case OP_TYPEMINSTAR:
2820     case OP_TYPEPLUS:
2821     case OP_TYPEMINPLUS:
2822     case OP_TYPEQUERY:
2823     case OP_TYPEMINQUERY:
2824     c = *ecode++ - OP_TYPESTAR;
2825     minimize = (c & 1) != 0;
2826     min = rep_min[c]; /* Pick up values from tables; */
2827     max = rep_max[c]; /* zero for max => infinity */
2828     if (max == 0) max = INT_MAX;
2829    
2830     /* Common code for all repeated single character type matches. Note that
2831     in UTF-8 mode, '.' matches a character of any length, but for the other
2832     character types, the valid characters are all one-byte long. */
2833    
2834     REPEATTYPE:
2835     ctype = *ecode++; /* Code for the character type */
2836    
2837     #ifdef SUPPORT_UCP
2838     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2839     {
2840     prop_fail_result = ctype == OP_NOTPROP;
2841     prop_type = *ecode++;
2842 nigel 87 prop_value = *ecode++;
2843 nigel 77 }
2844     else prop_type = -1;
2845     #endif
2846    
2847     /* First, ensure the minimum number of matches are present. Use inline
2848     code for maximizing the speed, and do the type test once at the start
2849     (i.e. keep it out of the loop). Also we can test that there are at least
2850     the minimum number of bytes before we start. This isn't as effective in
2851     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2852     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2853     and single-bytes. */
2854    
2855     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2856     if (min > 0)
2857     {
2858     #ifdef SUPPORT_UCP
2859 nigel 87 if (prop_type >= 0)
2860 nigel 77 {
2861 nigel 87 switch(prop_type)
2862 nigel 77 {
2863 nigel 87 case PT_ANY:
2864     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2865     for (i = 1; i <= min; i++)
2866     {
2867     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2868 ph10 184 GETCHARINCTEST(c, eptr);
2869 nigel 87 }
2870     break;
2871    
2872     case PT_LAMP:
2873     for (i = 1; i <= min; i++)
2874     {
2875     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2876 ph10 184 GETCHARINCTEST(c, eptr);
2877 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2878     if ((prop_chartype == ucp_Lu ||
2879     prop_chartype == ucp_Ll ||
2880     prop_chartype == ucp_Lt) == prop_fail_result)
2881     RRETURN(MATCH_NOMATCH);
2882     }
2883     break;
2884    
2885     case PT_GC:
2886     for (i = 1; i <= min; i++)
2887     {
2888     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2889 ph10 184 GETCHARINCTEST(c, eptr);
2890 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2891     if ((prop_category == prop_value) == prop_fail_result)
2892     RRETURN(MATCH_NOMATCH);
2893     }
2894     break;
2895    
2896     case PT_PC:
2897     for (i = 1; i <= min; i++)
2898     {
2899     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2900 ph10 184 GETCHARINCTEST(c, eptr);
2901 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2902     if ((prop_chartype == prop_value) == prop_fail_result)
2903     RRETURN(MATCH_NOMATCH);
2904     }
2905     break;
2906    
2907     case PT_SC:
2908     for (i = 1; i <= min; i++)
2909     {
2910     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2911 ph10 184 GETCHARINCTEST(c, eptr);
2912 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2913     if ((prop_script == prop_value) == prop_fail_result)
2914     RRETURN(MATCH_NOMATCH);
2915     }
2916     break;
2917    
2918     default:
2919     RRETURN(PCRE_ERROR_INTERNAL);
2920 nigel 77 }
2921     }
2922    
2923     /* Match extended Unicode sequences. We will get here only if the
2924     support is in the binary; otherwise a compile-time error occurs. */
2925    
2926     else if (ctype == OP_EXTUNI)
2927     {
2928     for (i = 1; i <= min; i++)
2929     {
2930     GETCHARINCTEST(c, eptr);
2931 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2932 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2933     while (eptr < md->end_subject)
2934     {
2935     int len = 1;
2936     if (!utf8) c = *eptr; else
2937     {
2938     GETCHARLEN(c, eptr, len);
2939     }
2940 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
2941 nigel 77 if (prop_category != ucp_M) break;
2942     eptr += len;
2943     }
2944     }
2945     }
2946    
2947     else
2948     #endif /* SUPPORT_UCP */
2949    
2950     /* Handle all other cases when the coding is UTF-8 */
2951    
2952     #ifdef SUPPORT_UTF8
2953     if (utf8) switch(ctype)
2954     {
2955     case OP_ANY:
2956     for (i = 1; i <= min; i++)
2957     {
2958     if (eptr >= md->end_subject ||
2959 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
2960 nigel 77 RRETURN(MATCH_NOMATCH);
2961 nigel 91 eptr++;
2962 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2963     }
2964     break;
2965    
2966 ph10 341 case OP_ALLANY:
2967     for (i = 1; i <= min; i++)
2968     {
2969     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2970     eptr++;
2971     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2972     }
2973     break;
2974    
2975 nigel 77 case OP_ANYBYTE:
2976     eptr += min;
2977     break;
2978    
2979 nigel 93 case OP_ANYNL:
2980     for (i = 1; i <= min; i++)
2981     {
2982     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2983     GETCHARINC(c, eptr);
2984     switch(c)
2985     {
2986     default: RRETURN(MATCH_NOMATCH);
2987     case 0x000d:
2988     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2989     break;
2990 ph10 231
2991 nigel 93 case 0x000a:
2992 ph10 231 break;
2993    
2994 nigel 93 case 0x000b:
2995     case 0x000c:
2996     case 0x0085:
2997     case 0x2028:
2998     case 0x2029:
2999 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3000 nigel 93 break;
3001     }
3002     }
3003     break;
3004    
3005 ph10 178 case OP_NOT_HSPACE:
3006     for (i = 1; i <= min; i++)
3007     {
3008     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3009     GETCHARINC(c, eptr);
3010     switch(c)
3011     {
3012     default: break;
3013     case 0x09: /* HT */
3014     case 0x20: /* SPACE */
3015     case 0xa0: /* NBSP */
3016     case 0x1680: /* OGHAM SPACE MARK */
3017     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3018     case 0x2000: /* EN QUAD */
3019     case 0x2001: /* EM QUAD */
3020     case 0x2002: /* EN SPACE */
3021     case 0x2003: /* EM SPACE */
3022     case 0x2004: /* THREE-PER-EM SPACE */
3023     case 0x2005: /* FOUR-PER-EM SPACE */
3024     case 0x2006: /* SIX-PER-EM SPACE */
3025     case 0x2007: /* FIGURE SPACE */
3026     case 0x2008: /* PUNCTUATION SPACE */
3027     case 0x2009: /* THIN SPACE */
3028     case 0x200A: /* HAIR SPACE */
3029     case 0x202f: /* NARROW NO-BREAK SPACE */
3030     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3031     case 0x3000: /* IDEOGRAPHIC SPACE */
3032     RRETURN(MATCH_NOMATCH);
3033     }
3034     }
3035     break;
3036 ph10 182
3037 ph10 178 case OP_HSPACE:
3038     for (i = 1; i <= min; i++)
3039     {
3040     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3041     GETCHARINC(c, eptr);
3042     switch(c)
3043     {
3044     default: RRETURN(MATCH_NOMATCH);
3045     case 0x09: /* HT */
3046     case 0x20: /* SPACE */
3047     case 0xa0: /* NBSP */
3048     case 0x1680: /* OGHAM SPACE MARK */
3049     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3050     case 0x2000: /* EN QUAD */
3051     case 0x2001: /* EM QUAD */
3052     case 0x2002: /* EN SPACE */
3053     case 0x2003: /* EM SPACE */
3054     case 0x2004: /* THREE-PER-EM SPACE */
3055     case 0x2005: /* FOUR-PER-EM SPACE */
3056     case 0x2006: /* SIX-PER-EM SPACE */
3057     case 0x2007: /* FIGURE SPACE */
3058     case 0x2008: /* PUNCTUATION SPACE */
3059     case 0x2009: /* THIN SPACE */
3060     case 0x200A: /* HAIR SPACE */
3061     case 0x202f: /* NARROW NO-BREAK SPACE */
3062     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3063     case 0x3000: /* IDEOGRAPHIC SPACE */
3064     break;
3065     }
3066     }
3067     break;
3068 ph10 182
3069 ph10 178 case OP_NOT_VSPACE:
3070     for (i = 1; i <= min; i++)
3071     {
3072     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3073     GETCHARINC(c, eptr);
3074     switch(c)
3075     {
3076     default: break;
3077     case 0x0a: /* LF */
3078     case 0x0b: /* VT */
3079     case 0x0c: /* FF */
3080     case 0x0d: /* CR */
3081     case 0x85: /* NEL */
3082     case 0x2028: /* LINE SEPARATOR */
3083     case 0x2029: /* PARAGRAPH SEPARATOR */
3084     RRETURN(MATCH_NOMATCH);
3085     }
3086     }
3087     break;
3088 ph10 182
3089 ph10 178 case OP_VSPACE:
3090     for (i = 1; i <= min; i++)
3091     {
3092     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3093     GETCHARINC(c, eptr);
3094     switch(c)
3095     {
3096     default: RRETURN(MATCH_NOMATCH);
3097     case 0x0a: /* LF */
3098     case 0x0b: /* VT */
3099     case 0x0c: /* FF */
3100     case 0x0d: /* CR */
3101     case 0x85: /* NEL */
3102     case 0x2028: /* LINE SEPARATOR */
3103     case 0x2029: /* PARAGRAPH SEPARATOR */
3104 ph10 182 break;
3105 ph10 178 }
3106     }
3107     break;
3108    
3109 nigel 77 case OP_NOT_DIGIT:
3110     for (i = 1; i <= min; i++)
3111     {
3112     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3113     GETCHARINC(c, eptr);
3114     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3115     RRETURN(MATCH_NOMATCH);
3116     }
3117     break;
3118    
3119     case OP_DIGIT:
3120     for (i = 1; i <= min; i++)
3121     {
3122     if (eptr >= md->end_subject ||
3123     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3124     RRETURN(MATCH_NOMATCH);
3125     /* No need to skip more bytes - we know it's a 1-byte character */
3126     }
3127     break;
3128    
3129     case OP_NOT_WHITESPACE:
3130     for (i = 1; i <= min; i++)
3131     {
3132     if (eptr >= md->end_subject ||
3133 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3134 nigel 77 RRETURN(MATCH_NOMATCH);
3135 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3136 nigel 77 }
3137     break;
3138    
3139     case OP_WHITESPACE:
3140     for (i = 1; i <= min; i++)
3141     {
3142     if (eptr >= md->end_subject ||
3143     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3144     RRETURN(MATCH_NOMATCH);
3145     /* No need to skip more bytes - we know it's a 1-byte character */
3146     }
3147     break;
3148    
3149     case OP_NOT_WORDCHAR:
3150     for (i = 1; i <= min; i++)
3151     {
3152     if (eptr >= md->end_subject ||
3153 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3154 nigel 77 RRETURN(MATCH_NOMATCH);
3155 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3156 nigel 77 }
3157     break;
3158    
3159     case OP_WORDCHAR:
3160     for (i = 1; i <= min; i++)
3161     {
3162     if (eptr >= md->end_subject ||
3163     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3164     RRETURN(MATCH_NOMATCH);
3165     /* No need to skip more bytes - we know it's a 1-byte character */
3166     }
3167     break;
3168    
3169     default:
3170     RRETURN(PCRE_ERROR_INTERNAL);
3171     } /* End switch(ctype) */
3172    
3173     else
3174     #endif /* SUPPORT_UTF8 */
3175    
3176     /* Code for the non-UTF-8 case for minimum matching of operators other
3177 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3178     number of bytes present, as this was tested above. */
3179 nigel 77
3180     switch(ctype)
3181     {
3182     case OP_ANY:
3183     if ((ims & PCRE_DOTALL) == 0)
3184     {
3185     for (i = 1; i <= min; i++)
3186 nigel 91 {
3187 nigel 93 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3188 nigel 91 eptr++;
3189     }
3190 nigel 77 }
3191     else eptr += min;
3192     break;
3193    
3194 ph10 341 case OP_ALLANY:
3195     eptr += min;
3196     break;
3197    
3198 nigel 77 case OP_ANYBYTE:
3199     eptr += min;
3200     break;
3201    
3202 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3203     bytes are present in this case. */
3204    
3205     case OP_ANYNL:
3206     for (i = 1; i <= min; i++)
3207     {
3208     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3209     switch(*eptr++)
3210     {
3211     default: RRETURN(MATCH_NOMATCH);
3212     case 0x000d:
3213     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3214     break;
3215     case 0x000a:
3216 ph10 231 break;
3217    
3218 nigel 93 case 0x000b:
3219     case 0x000c:
3220     case 0x0085:
3221 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3222 nigel 93 break;
3223     }
3224     }
3225     break;
3226    
3227 ph10 178 case OP_NOT_HSPACE:
3228     for (i = 1; i <= min; i++)
3229     {
3230     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3231     switch(*eptr++)
3232     {
3233     default: break;
3234     case 0x09: /* HT */
3235     case 0x20: /* SPACE */
3236     case 0xa0: /* NBSP */
3237     RRETURN(MATCH_NOMATCH);
3238     }
3239     }
3240     break;
3241    
3242     case OP_HSPACE:
3243     for (i = 1; i <= min; i++)
3244     {
3245     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3246     switch(*eptr++)
3247     {
3248     default: RRETURN(MATCH_NOMATCH);
3249     case 0x09: /* HT */
3250     case 0x20: /* SPACE */
3251     case 0xa0: /* NBSP */
3252 ph10 182 break;
3253 ph10 178 }
3254     }
3255     break;
3256    
3257     case OP_NOT_VSPACE:
3258     for (i = 1; i <= min; i++)
3259     {
3260     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3261     switch(*eptr++)
3262     {
3263     default: break;
3264     case 0x0a: /* LF */
3265     case 0x0b: /* VT */
3266     case 0x0c: /* FF */
3267     case 0x0d: /* CR */
3268     case 0x85: /* NEL */
3269     RRETURN(MATCH_NOMATCH);
3270     }
3271     }
3272     break;
3273    
3274     case OP_VSPACE:
3275     for (i = 1; i <= min; i++)
3276     {
3277     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3278     switch(*eptr++)
3279     {
3280     default: RRETURN(MATCH_NOMATCH);
3281     case 0x0a: /* LF */
3282     case 0x0b: /* VT */
3283     case 0x0c: /* FF */
3284     case 0x0d: /* CR */
3285     case 0x85: /* NEL */
3286 ph10 182 break;
3287 ph10 178 }
3288     }
3289     break;
3290    
3291 nigel 77 case OP_NOT_DIGIT:
3292     for (i = 1; i <= min; i++)
3293     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3294     break;
3295    
3296     case OP_DIGIT:
3297     for (i = 1; i <= min; i++)
3298     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3299     break;
3300    
3301     case OP_NOT_WHITESPACE:
3302     for (i = 1; i <= min; i++)
3303     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3304     break;
3305    
3306     case OP_WHITESPACE:
3307     for (i = 1; i <= min; i++)
3308     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3309     break;
3310    
3311     case OP_NOT_WORDCHAR:
3312     for (i = 1; i <= min; i++)
3313     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3314     RRETURN(MATCH_NOMATCH);
3315     break;
3316    
3317     case OP_WORDCHAR:
3318     for (i = 1; i <= min; i++)
3319     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3320     RRETURN(MATCH_NOMATCH);
3321     break;
3322    
3323     default:
3324     RRETURN(PCRE_ERROR_INTERNAL);
3325     }
3326     }
3327    
3328     /* If min = max, continue at the same level without recursing */
3329    
3330     if (min == max) continue;
3331    
3332     /* If minimizing, we have to test the rest of the pattern before each
3333     subsequent match. Again, separate the UTF-8 case for speed, and also
3334     separate the UCP cases. */
3335    
3336     if (minimize)
3337     {
3338     #ifdef SUPPORT_UCP
3339 nigel 87 if (prop_type >= 0)
3340 nigel 77 {
3341 nigel 87 switch(prop_type)
3342 nigel 77 {
3343 nigel 87 case PT_ANY:
3344     for (fi = min;; fi++)
3345     {
3346 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3347 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3348     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3349     GETCHARINC(c, eptr);
3350     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3351     }
3352 nigel 93 /* Control never gets here */
3353 nigel 87
3354     case PT_LAMP:
3355     for (fi = min;; fi++)
3356     {
3357 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3358 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3359     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3360     GETCHARINC(c, eptr);
3361     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3362     if ((prop_chartype == ucp_Lu ||
3363     prop_chartype == ucp_Ll ||
3364     prop_chartype == ucp_Lt) == prop_fail_result)
3365     RRETURN(MATCH_NOMATCH);
3366     }
3367 nigel 93 /* Control never gets here */
3368 nigel 87
3369     case PT_GC:
3370     for (fi = min;; fi++)
3371     {
3372 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3373 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3374     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3375     GETCHARINC(c, eptr);
3376     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3377     if ((prop_category == prop_value) == prop_fail_result)
3378     RRETURN(MATCH_NOMATCH);
3379     }
3380 nigel 93 /* Control never gets here */
3381 nigel 87
3382     case PT_PC:
3383     for (fi = min;; fi++)
3384     {
3385 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3386 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3387     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3388     GETCHARINC(c, eptr);
3389     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3390     if ((prop_chartype == prop_value) == prop_fail_result)
3391     RRETURN(MATCH_NOMATCH);
3392     }
3393 nigel 93 /* Control never gets here */
3394 nigel 87
3395     case PT_SC:
3396     for (fi = min;; fi++)
3397     {
3398 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3399 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3400     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3401     GETCHARINC(c, eptr);
3402     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3403     if ((prop_script == prop_value) == prop_fail_result)
3404     RRETURN(MATCH_NOMATCH);
3405     }
3406 nigel 93 /* Control never gets here */
3407 nigel 87
3408     default:
3409     RRETURN(PCRE_ERROR_INTERNAL);
3410 nigel 77 }
3411     }
3412    
3413     /* Match extended Unicode sequences. We will get here only if the
3414     support is in the binary; otherwise a compile-time error occurs. */
3415    
3416     else if (ctype == OP_EXTUNI)
3417     {
3418     for (fi = min;; fi++)
3419     {
3420 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3421 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3422     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3423     GETCHARINCTEST(c, eptr);
3424 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3425 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3426     while (eptr < md->end_subject)
3427     {
3428     int len = 1;
3429     if (!utf8) c = *eptr; else
3430     {
3431     GETCHARLEN(c, eptr, len);
3432     }
3433 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3434 nigel 77 if (prop_category != ucp_M) break;
3435     eptr += len;
3436     }
3437     }
3438     }
3439    
3440     else
3441     #endif /* SUPPORT_UCP */
3442    
3443     #ifdef SUPPORT_UTF8
3444     /* UTF-8 mode */
3445     if (utf8)
3446     {
3447     for (fi = min;; fi++)
3448     {
3449 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3450 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3451 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3452     (ctype == OP_ANY && (ims & PCRE_DOTALL) == 0 &&
3453 nigel 93 IS_NEWLINE(eptr)))
3454 nigel 91 RRETURN(MATCH_NOMATCH);
3455 nigel 77
3456     GETCHARINC(c, eptr);
3457     switch(ctype)
3458     {
3459 nigel 91 case OP_ANY: /* This is the DOTALL case */
3460 ph10 341 case OP_ALLANY:
3461 nigel 77 case OP_ANYBYTE:
3462     break;
3463    
3464 nigel 93 case OP_ANYNL:
3465     switch(c)
3466     {
3467     default: RRETURN(MATCH_NOMATCH);
3468     case 0x000d:
3469     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3470     break;
3471     case 0x000a:
3472 ph10 231 break;
3473    
3474 nigel 93 case 0x000b:
3475     case 0x000c:
3476     case 0x0085:
3477     case 0x2028:
3478     case 0x2029:
3479 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3480 nigel 93 break;
3481     }
3482     break;
3483    
3484 ph10 178 case OP_NOT_HSPACE:
3485     switch(c)
3486     {
3487     default: break;
3488     case 0x09: /* HT */
3489     case 0x20: /* SPACE */
3490     case 0xa0: /* NBSP */
3491     case 0x1680: /* OGHAM SPACE MARK */
3492     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3493     case 0x2000: /* EN QUAD */
3494     case 0x2001: /* EM QUAD */
3495     case 0x2002: /* EN SPACE */
3496     case 0x2003: /* EM SPACE */
3497     case 0x2004: /* THREE-PER-EM SPACE */
3498     case 0x2005: /* FOUR-PER-EM SPACE */
3499     case 0x2006: /* SIX-PER-EM SPACE */
3500     case 0x2007: /* FIGURE SPACE */
3501     case 0x2008: /* PUNCTUATION SPACE */
3502     case 0x2009: /* THIN SPACE */
3503     case 0x200A: /* HAIR SPACE */
3504     case 0x202f: /* NARROW NO-BREAK SPACE */
3505     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3506     case 0x3000: /* IDEOGRAPHIC SPACE */
3507     RRETURN(MATCH_NOMATCH);
3508     }
3509     break;
3510    
3511     case OP_HSPACE:
3512     switch(c)
3513     {
3514     default: RRETURN(MATCH_NOMATCH);
3515     case 0x09: /* HT */
3516     case 0x20: /* SPACE */
3517     case 0xa0: /* NBSP */
3518     case 0x1680: /* OGHAM SPACE MARK */
3519     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3520     case 0x2000: /* EN QUAD */
3521     case 0x2001: /* EM QUAD */
3522     case 0x2002: /* EN SPACE */
3523     case 0x2003: /* EM SPACE */
3524     case 0x2004: /* THREE-PER-EM SPACE */
3525     case 0x2005: /* FOUR-PER-EM SPACE */
3526     case 0x2006: /* SIX-PER-EM SPACE */
3527     case 0x2007: /* FIGURE SPACE */
3528     case 0x2008: /* PUNCTUATION SPACE */
3529     case 0x2009: /* THIN SPACE */
3530     case 0x200A: /* HAIR SPACE */
3531     case 0x202f: /* NARROW NO-BREAK SPACE */
3532     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3533     case 0x3000: /* IDEOGRAPHIC SPACE */
3534     break;
3535     }
3536     break;
3537    
3538     case OP_NOT_VSPACE:
3539     switch(c)
3540     {
3541     default: break;
3542     case 0x0a: /* LF */
3543     case 0x0b: /* VT */
3544     case 0x0c: /* FF */
3545     case 0x0d: /* CR */
3546     case 0x85: /* NEL */
3547     case 0x2028: /* LINE SEPARATOR */
3548     case 0x2029: /* PARAGRAPH SEPARATOR */
3549     RRETURN(MATCH_NOMATCH);
3550     }
3551     break;
3552    
3553     case OP_VSPACE:
3554     switch(c)
3555     {
3556     default: RRETURN(MATCH_NOMATCH);
3557     case 0x0a: /* LF */
3558     case 0x0b: /* VT */
3559     case 0x0c: /* FF */
3560     case 0x0d: /* CR */
3561     case 0x85: /* NEL */
3562     case 0x2028: /* LINE SEPARATOR */
3563     case 0x2029: /* PARAGRAPH SEPARATOR */
3564     break;
3565     }
3566     break;
3567    
3568 nigel 77 case OP_NOT_DIGIT:
3569     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3570     RRETURN(MATCH_NOMATCH);
3571     break;
3572    
3573     case OP_DIGIT:
3574     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3575     RRETURN(MATCH_NOMATCH);
3576     break;
3577    
3578     case OP_NOT_WHITESPACE:
3579     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3580     RRETURN(MATCH_NOMATCH);
3581     break;
3582    
3583     case OP_WHITESPACE:
3584     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3585     RRETURN(MATCH_NOMATCH);
3586     break;
3587    
3588     case OP_NOT_WORDCHAR:
3589     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3590     RRETURN(MATCH_NOMATCH);
3591     break;
3592    
3593     case OP_WORDCHAR:
3594     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3595     RRETURN(MATCH_NOMATCH);
3596     break;
3597    
3598     default:
3599     RRETURN(PCRE_ERROR_INTERNAL);
3600     }
3601     }
3602     }
3603     else
3604     #endif
3605     /* Not UTF-8 mode */
3606     {
3607     for (fi = min;; fi++)
3608     {
3609 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3610 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3611 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3612 nigel 93 ((ims & PCRE_DOTALL) == 0 && IS_NEWLINE(eptr)))
3613 nigel 91 RRETURN(MATCH_NOMATCH);
3614    
3615 nigel 77 c = *eptr++;
3616     switch(ctype)
3617     {
3618 ph10 341 case OP_ANY: /* This is the DOTALL case */
3619     case OP_ALLANY:
3620 nigel 77 case OP_ANYBYTE:
3621     break;
3622    
3623 nigel 93 case OP_ANYNL:
3624     switch(c)
3625     {
3626     default: RRETURN(MATCH_NOMATCH);
3627     case 0x000d:
3628     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3629     break;
3630 ph10 231
3631 nigel 93 case 0x000a:
3632 ph10 231 break;
3633    
3634 nigel 93 case 0x000b:
3635     case 0x000c:
3636     case 0x0085:
3637 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3638 nigel 93 break;
3639     }
3640     break;
3641    
3642 ph10 178 case OP_NOT_HSPACE:
3643     switch(c)
3644     {
3645     default: break;
3646     case 0x09: /* HT */
3647     case 0x20: /* SPACE */
3648     case 0xa0: /* NBSP */
3649     RRETURN(MATCH_NOMATCH);
3650     }
3651     break;
3652    
3653     case OP_HSPACE:
3654     switch(c)
3655     {
3656     default: RRETURN(MATCH_NOMATCH);
3657     case 0x09: /* HT */
3658     case 0x20: /* SPACE */
3659     case 0xa0: /* NBSP */
3660     break;
3661     }
3662     break;
3663    
3664     case OP_NOT_VSPACE:
3665     switch(c)
3666     {
3667     default: break;
3668     case 0x0a: /* LF */
3669     case 0x0b: /* VT */
3670     case 0x0c: /* FF */
3671     case 0x0d: /* CR */
3672     case 0x85: /* NEL */
3673     RRETURN(MATCH_NOMATCH);
3674     }
3675     break;
3676    
3677     case OP_VSPACE:
3678     switch(c)
3679     {
3680     default: RRETURN(MATCH_NOMATCH);
3681     case 0x0a: /* LF */
3682     case 0x0b: /* VT */
3683     case 0x0c: /* FF */
3684     case 0x0d: /* CR */
3685     case 0x85: /* NEL */
3686     break;
3687     }
3688     break;
3689    
3690 nigel 77 case OP_NOT_DIGIT:
3691     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3692     break;
3693    
3694     case OP_DIGIT:
3695     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3696     break;
3697    
3698     case OP_NOT_WHITESPACE:
3699     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3700     break;
3701    
3702     case OP_WHITESPACE:
3703     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3704     break;
3705    
3706     case OP_NOT_WORDCHAR:
3707     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3708     break;
3709    
3710     case OP_WORDCHAR:
3711     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3712     break;
3713    
3714     default:
3715     RRETURN(PCRE_ERROR_INTERNAL);
3716     }
3717     }
3718     }
3719     /* Control never gets here */
3720     }
3721    
3722 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3723 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3724     UTF-8 and UCP stuff separate. */
3725    
3726     else
3727     {
3728     pp = eptr; /* Remember where we started */
3729    
3730     #ifdef SUPPORT_UCP
3731 nigel 87 if (prop_type >= 0)
3732 nigel 77 {
3733 nigel 87 switch(prop_type)
3734 nigel 77 {
3735 nigel 87 case PT_ANY:
3736     for (i = min; i < max; i++)
3737     {
3738     int len = 1;
3739     if (eptr >= md->end_subject) break;
3740     GETCHARLEN(c, eptr, len);
3741     if (prop_fail_result) break;
3742     eptr+= len;
3743     }
3744     break;
3745    
3746     case PT_LAMP:
3747     for (i = min; i < max; i++)
3748     {
3749     int len = 1;
3750     if (eptr >= md->end_subject) break;
3751     GETCHARLEN(c, eptr, len);
3752     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3753     if ((prop_chartype == ucp_Lu ||
3754     prop_chartype == ucp_Ll ||
3755     prop_chartype == ucp_Lt) == prop_fail_result)
3756     break;
3757     eptr+= len;
3758     }
3759     break;
3760    
3761     case PT_GC:
3762     for (i = min; i < max; i++)
3763     {
3764     int len = 1;
3765     if (eptr >= md->end_subject) break;
3766     GETCHARLEN(c, eptr, len);
3767     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3768     if ((prop_category == prop_value) == prop_fail_result)
3769     break;
3770     eptr+= len;
3771     }
3772     break;
3773    
3774     case PT_PC:
3775     for (i = min; i < max; i++)
3776     {
3777     int len = 1;
3778     if (eptr >= md->end_subject) break;
3779     GETCHARLEN(c, eptr, len);
3780     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3781     if ((prop_chartype == prop_value) == prop_fail_result)
3782     break;
3783     eptr+= len;
3784     }
3785     break;
3786    
3787     case PT_SC:
3788     for (i = min; i < max; i++)
3789     {
3790     int len = 1;
3791     if (eptr >= md->end_subject) break;
3792     GETCHARLEN(c, eptr, len);
3793     prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3794     if ((prop_script == prop_value) == prop_fail_result)
3795     break;
3796     eptr+= len;
3797     }
3798     break;
3799 nigel 77 }
3800    
3801     /* eptr is now past the end of the maximum run */
3802    
3803 nigel 93 if (possessive) continue;
3804 nigel 77 for(;;)
3805     {
3806 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3807 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3808     if (eptr-- == pp) break; /* Stop if tried at original pos */
3809 ph10 207 if (utf8) BACKCHAR(eptr);
3810 nigel 77 }
3811     }
3812    
3813     /* Match extended Unicode sequences. We will get here only if the
3814     support is in the binary; otherwise a compile-time error occurs. */
3815    
3816     else if (ctype == OP_EXTUNI)
3817     {
3818     for (i = min; i < max; i++)
3819     {
3820     if (eptr >= md->end_subject) break;
3821     GETCHARINCTEST(c, eptr);
3822 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3823 nigel 77 if (prop_category == ucp_M) break;
3824     while (eptr < md->end_subject)
3825     {
3826     int len = 1;
3827     if (!utf8) c = *eptr; else
3828     {
3829     GETCHARLEN(c, eptr, len);
3830     }
3831 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3832 nigel 77 if (prop_category != ucp_M) break;
3833     eptr += len;
3834     }
3835     }
3836    
3837     /* eptr is now past the end of the maximum run */
3838    
3839 nigel 93 if (possessive) continue;
3840 nigel 77 for(;;)
3841     {
3842 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3843 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3844     if (eptr-- == pp) break; /* Stop if tried at original pos */
3845     for (;;) /* Move back over one extended */
3846     {
3847     int len = 1;
3848     if (!utf8) c = *eptr; else
3849     {
3850 ph10 207 BACKCHAR(eptr);
3851 nigel 77 GETCHARLEN(c, eptr, len);
3852     }
3853 nigel 87 prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
3854 nigel 77 if (prop_category != ucp_M) break;
3855     eptr--;
3856     }
3857     }
3858     }
3859    
3860     else
3861     #endif /* SUPPORT_UCP */
3862    
3863     #ifdef SUPPORT_UTF8
3864     /* UTF-8 mode */
3865    
3866     if (utf8)
3867     {
3868     switch(ctype)
3869     {
3870     case OP_ANY:
3871     if (max < INT_MAX)
3872     {
3873     if ((ims & PCRE_DOTALL) == 0)
3874     {
3875     for (i = min; i < max; i++)
3876     {
3877 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3878 nigel 77 eptr++;
3879     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3880     }
3881     }
3882     else
3883     {
3884     for (i = min; i < max; i++)
3885     {
3886 nigel 91 if (eptr >= md->end_subject) break;
3887 nigel 77 eptr++;
3888     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3889     }
3890     }
3891     }
3892    
3893     /* Handle unlimited UTF-8 repeat */
3894    
3895     else
3896     {
3897     if ((ims & PCRE_DOTALL) == 0)
3898     {
3899     for (i = min; i < max; i++)
3900     {
3901 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3902 nigel 77 eptr++;
3903 ph10 190 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3904 nigel 77 }
3905     }
3906     else
3907     {
3908 ph10 190 eptr = md->end_subject;
3909 nigel 77 }
3910     }
3911     break;
3912    
3913 ph10 341 case OP_ALLANY:
3914     if (max < INT_MAX)
3915     {
3916     for (i = min; i < max; i++)
3917     {
3918     if (eptr >= md->end_subject) break;
3919     eptr++;
3920     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3921     }
3922     }
3923     else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3924     break;
3925    
3926 nigel 77 /* The byte case is the same as non-UTF8 */
3927    
3928     case OP_ANYBYTE:
3929     c = max - min;
3930 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3931     c = md->end_subject - eptr;
3932 nigel 77 eptr += c;
3933     break;
3934    
3935 nigel 93 case OP_ANYNL:
3936     for (i = min; i < max; i++)
3937     {
3938     int len = 1;
3939     if (eptr >= md->end_subject) break;
3940     GETCHARLEN(c, eptr, len);
3941     if (c == 0x000d)
3942     {
3943     if (++eptr >= md->end_subject) break;
3944     if (*eptr == 0x000a) eptr++;
3945     }
3946     else
3947     {
3948 ph10 231 if (c != 0x000a &&
3949     (md->bsr_anycrlf ||
3950     (c != 0x000b && c != 0x000c &&
3951     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3952 nigel 93 break;
3953     eptr += len;
3954     }
3955     }
3956     break;
3957    
3958 ph10 178 case OP_NOT_HSPACE:
3959 ph10 182 case OP_HSPACE:
3960 ph10 178 for (i = min; i < max; i++)
3961     {
3962 ph10 182 BOOL gotspace;
3963 ph10 178 int len = 1;
3964     if (eptr >= md->end_subject) break;
3965     GETCHARLEN(c, eptr, len);
3966     switch(c)
3967 ph10 182 {
3968     default: gotspace = FALSE; break;
3969 ph10 178 case 0x09: /* HT */
3970     case 0x20: /* SPACE */
3971     case 0xa0: /* NBSP */
3972     case 0x1680: /* OGHAM SPACE MARK */
3973     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3974     case 0x2000: /* EN QUAD */
3975     case 0x2001: /* EM QUAD */
3976     case 0x2002: /* EN SPACE */
3977     case 0x2003: /* EM SPACE */
3978     case 0x2004: /* THREE-PER-EM SPACE */
3979     case 0x2005: /* FOUR-PER-EM SPACE */
3980     case 0x2006: /* SIX-PER-EM SPACE */
3981     case 0x2007: /* FIGURE SPACE */
3982     case 0x2008: /* PUNCTUATION SPACE */
3983     case 0x2009: /* THIN SPACE */
3984     case 0x200A: /* HAIR SPACE */
3985     case 0x202f: /* NARROW NO-BREAK SPACE */
3986     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3987     case 0x3000: /* IDEOGRAPHIC SPACE */
3988     gotspace = TRUE;
3989 ph10 182 break;
3990 ph10 178 }
3991     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3992     eptr += len;
3993     }
3994     break;
3995    
3996     case OP_NOT_VSPACE:
3997 ph10 182 case OP_VSPACE:
3998 ph10 178 for (i = min; i < max; i++)
3999     {
4000 ph10 182 BOOL gotspace;
4001 ph10 178 int len = 1;
4002     if (eptr >= md->end_subject) break;
4003     GETCHARLEN(c, eptr, len);
4004     switch(c)
4005     {
4006 ph10 182 default: gotspace = FALSE; break;
4007 ph10 178 case 0x0a: /* LF */
4008     case 0x0b: /* VT */
4009     case 0x0c: /* FF */
4010     case 0x0d: /* CR */
4011     case 0x85: /* NEL */
4012     case 0x2028: /* LINE SEPARATOR */
4013     case 0x2029: /* PARAGRAPH SEPARATOR */
4014     gotspace = TRUE;
4015     break;
4016     }
4017 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4018 ph10 178 eptr += len;
4019     }
4020     break;
4021    
4022 nigel 77 case OP_NOT_DIGIT:
4023     for (i = min; i < max; i++)
4024     {
4025     int len = 1;
4026     if (eptr >= md->end_subject) break;
4027     GETCHARLEN(c, eptr, len);
4028     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4029     eptr+= len;
4030     }
4031     break;
4032    
4033     case OP_DIGIT:
4034     for (i = min; i < max; i++)
4035     {
4036     int len = 1;
4037     if (eptr >= md->end_subject) break;
4038     GETCHARLEN(c, eptr, len);
4039     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4040     eptr+= len;
4041     }
4042     break;
4043    
4044     case OP_NOT_WHITESPACE:
4045     for (i = min; i < max; i++)
4046     {
4047     int len = 1;
4048     if (eptr >= md->end_subject) break;
4049     GETCHARLEN(c, eptr, len);
4050     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4051     eptr+= len;
4052     }
4053     break;
4054    
4055     case OP_WHITESPACE:
4056     for (i = min; i < max; i++)
4057     {
4058     int len = 1;
4059     if (eptr >= md->end_subject) break;
4060     GETCHARLEN(c, eptr, len);
4061     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4062     eptr+= len;
4063     }
4064     break;
4065    
4066     case OP_NOT_WORDCHAR:
4067     for (i = min; i < max; i++)
4068     {
4069     int len = 1;
4070     if (eptr >= md->end_subject) break;
4071     GETCHARLEN(c, eptr, len);
4072     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4073     eptr+= len;
4074     }
4075     break;
4076    
4077     case OP_WORDCHAR:
4078     for (i = min; i < max; i++)
4079     {
4080     int len = 1;
4081     if (eptr >= md->end_subject) break;
4082     GETCHARLEN(c, eptr, len);
4083     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4084     eptr+= len;
4085     }
4086     break;
4087    
4088     default:
4089     RRETURN(PCRE_ERROR_INTERNAL);
4090     }
4091    
4092     /* eptr is now past the end of the maximum run */
4093    
4094 nigel 93 if (possessive) continue;
4095 nigel 77 for(;;)
4096     {
4097 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4098 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4099     if (eptr-- == pp) break; /* Stop if tried at original pos */
4100     BACKCHAR(eptr);
4101     }
4102     }
4103     else
4104 ph10 207 #endif /* SUPPORT_UTF8 */
4105 nigel 77
4106     /* Not UTF-8 mode */
4107     {
4108     switch(ctype)
4109     {
4110     case OP_ANY:
4111     if ((ims & PCRE_DOTALL) == 0)
4112     {
4113     for (i = min; i < max; i++)
4114     {
4115 nigel 93 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4116 nigel 77 eptr++;
4117     }
4118     break;
4119     }
4120 ph10 341 /* For DOTALL case, fall through */
4121 nigel 77
4122 ph10 341 case OP_ALLANY:
4123 nigel 77 case OP_ANYBYTE:
4124     c = max - min;
4125 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4126     c = md->end_subject - eptr;
4127 nigel 77 eptr += c;
4128     break;
4129    
4130 nigel 93 case OP_ANYNL:
4131     for (i = min; i < max; i++)
4132     {
4133     if (eptr >= md->end_subject) break;
4134     c = *eptr;
4135     if (c == 0x000d)
4136     {
4137     if (++eptr >= md->end_subject) break;
4138     if (*eptr == 0x000a) eptr++;
4139     }
4140     else
4141     {
4142 ph10 231 if (c != 0x000a &&
4143     (md->bsr_anycrlf ||
4144     (c != 0x000b && c != 0x000c && c != 0x0085)))
4145 nigel 93 break;
4146     eptr++;
4147     }
4148     }
4149     break;