/[pcre]/code/trunk/pcre_exec.c
ViewVC logotype

Contents of /code/trunk/pcre_exec.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 381 - (hide annotations) (download)
Tue Mar 3 16:08:23 2009 UTC (4 years, 2 months ago) by ph10
File MIME type: text/plain
File size: 152246 byte(s)
Fix bug with (?(?=.*b)b|^) thinking it must match at start of line; also fix 
bug causing a crash when auto-callout is used with a conditional assertion.

1 nigel 77 /*************************************************
2     * Perl-Compatible Regular Expressions *
3     *************************************************/
4    
5     /* PCRE is a library of functions to support regular expressions whose syntax
6     and semantics are as close as possible to those of the Perl 5 language.
7    
8     Written by Philip Hazel
9 ph10 381 Copyright (c) 1997-2009 University of Cambridge
10 nigel 77
11     -----------------------------------------------------------------------------
12     Redistribution and use in source and binary forms, with or without
13     modification, are permitted provided that the following conditions are met:
14    
15     * Redistributions of source code must retain the above copyright notice,
16     this list of conditions and the following disclaimer.
17    
18     * Redistributions in binary form must reproduce the above copyright
19     notice, this list of conditions and the following disclaimer in the
20     documentation and/or other materials provided with the distribution.
21    
22     * Neither the name of the University of Cambridge nor the names of its
23     contributors may be used to endorse or promote products derived from
24     this software without specific prior written permission.
25    
26     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27     AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28     IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29     ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30     LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31     CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32     SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33     INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34     CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35     ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36     POSSIBILITY OF SUCH DAMAGE.
37     -----------------------------------------------------------------------------
38     */
39    
40    
41     /* This module contains pcre_exec(), the externally visible function that does
42     pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43     possible. There are also some static supporting functions. */
44    
45 ph10 200 #ifdef HAVE_CONFIG_H
46 ph10 236 #include "config.h"
47 ph10 200 #endif
48 ph10 199
49 nigel 93 #define NLBLOCK md /* Block containing newline information */
50     #define PSSTART start_subject /* Field containing processed string start */
51     #define PSEND end_subject /* Field containing processed string end */
52    
53 nigel 77 #include "pcre_internal.h"
54    
55 ph10 137 /* Undefine some potentially clashing cpp symbols */
56    
57     #undef min
58     #undef max
59    
60 nigel 77 /* Flag bits for the match() function */
61    
62 nigel 93 #define match_condassert 0x01 /* Called to check a condition assertion */
63     #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 nigel 77
65     /* Non-error returns from the match() function. Error returns are externally
66     defined PCRE_ERROR_xxx codes, which are all negative. */
67    
68     #define MATCH_MATCH 1
69     #define MATCH_NOMATCH 0
70    
71 ph10 211 /* Special internal returns from the match() function. Make them sufficiently
72 ph10 210 negative to avoid the external error codes. */
73    
74     #define MATCH_COMMIT (-999)
75     #define MATCH_PRUNE (-998)
76     #define MATCH_SKIP (-997)
77     #define MATCH_THEN (-996)
78    
79 nigel 77 /* Maximum number of ints of offset to save on the stack for recursive calls.
80     If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81     because the offset vector is always a multiple of 3 long. */
82    
83     #define REC_STACK_SAVE_MAX 30
84    
85     /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86    
87     static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
88     static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
89    
90    
91    
92     #ifdef DEBUG
93     /*************************************************
94     * Debugging function to print chars *
95     *************************************************/
96    
97     /* Print a sequence of chars in printable format, stopping at the end of the
98     subject if requested.
99    
100     Arguments:
101     p points to characters
102     length number to print
103     is_subject TRUE if printing from within md->start_subject
104     md pointer to matching data block, if is_subject is TRUE
105    
106     Returns: nothing
107     */
108    
109     static void
110     pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111     {
112 nigel 93 unsigned int c;
113 nigel 77 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114     while (length-- > 0)
115     if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116     }
117     #endif
118    
119    
120    
121     /*************************************************
122     * Match a back-reference *
123     *************************************************/
124    
125     /* If a back reference hasn't been set, the length that is passed is greater
126     than the number of characters left in the string, so the match fails.
127    
128     Arguments:
129     offset index into the offset vector
130     eptr points into the subject
131     length length to be matched
132     md points to match data block
133     ims the ims flags
134    
135     Returns: TRUE if matched
136     */
137    
138     static BOOL
139 nigel 87 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140 nigel 77 unsigned long int ims)
141     {
142 nigel 87 USPTR p = md->start_subject + md->offset_vector[offset];
143 nigel 77
144     #ifdef DEBUG
145     if (eptr >= md->end_subject)
146     printf("matching subject <null>");
147     else
148     {
149     printf("matching subject ");
150     pchars(eptr, length, TRUE, md);
151     }
152     printf(" against backref ");
153     pchars(p, length, FALSE, md);
154     printf("\n");
155     #endif
156    
157     /* Always fail if not enough characters left */
158    
159     if (length > md->end_subject - eptr) return FALSE;
160    
161 ph10 354 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162     properly if Unicode properties are supported. Otherwise, we can check only
163     ASCII characters. */
164 nigel 77
165     if ((ims & PCRE_CASELESS) != 0)
166     {
167 ph10 354 #ifdef SUPPORT_UTF8
168     #ifdef SUPPORT_UCP
169     if (md->utf8)
170     {
171 ph10 358 USPTR endptr = eptr + length;
172 ph10 354 while (eptr < endptr)
173     {
174 ph10 358 int c, d;
175 ph10 354 GETCHARINC(c, eptr);
176     GETCHARINC(d, p);
177     if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178 ph10 358 }
179     }
180 ph10 354 else
181     #endif
182     #endif
183    
184     /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185     is no UCP support. */
186 ph10 358
187 nigel 77 while (length-- > 0)
188 ph10 354 { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189 nigel 77 }
190 ph10 358
191 ph10 354 /* In the caseful case, we can just compare the bytes, whether or not we
192     are in UTF-8 mode. */
193 ph10 358
194 nigel 77 else
195     { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196    
197     return TRUE;
198     }
199    
200    
201    
202     /***************************************************************************
203     ****************************************************************************
204     RECURSION IN THE match() FUNCTION
205    
206 nigel 87 The match() function is highly recursive, though not every recursive call
207     increases the recursive depth. Nevertheless, some regular expressions can cause
208     it to recurse to a great depth. I was writing for Unix, so I just let it call
209     itself recursively. This uses the stack for saving everything that has to be
210     saved for a recursive call. On Unix, the stack can be large, and this works
211     fine.
212 nigel 77
213 nigel 87 It turns out that on some non-Unix-like systems there are problems with
214     programs that use a lot of stack. (This despite the fact that every last chip
215     has oodles of memory these days, and techniques for extending the stack have
216     been known for decades.) So....
217 nigel 77
218     There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219     calls by keeping local variables that need to be preserved in blocks of memory
220 nigel 87 obtained from malloc() instead of on the stack. Macros are used to
221 nigel 77 achieve this so that the actual code doesn't look very different to what it
222     always used to.
223 ph10 164
224 ph10 165 The original heap-recursive code used longjmp(). However, it seems that this
225 ph10 164 can be very slow on some operating systems. Following a suggestion from Stan
226     Switzer, the use of longjmp() has been abolished, at the cost of having to
227     provide a unique number for each call to RMATCH. There is no way of generating
228     a sequence of numbers at compile time in C. I have given them names, to make
229     them stand out more clearly.
230    
231     Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232     FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 ph10 165 tests. Furthermore, not using longjmp() means that local dynamic variables
234     don't have indeterminate values; this has meant that the frame size can be
235 ph10 164 reduced because the result can be "passed back" by straight setting of the
236     variable instead of being passed in the frame.
237 nigel 77 ****************************************************************************
238     ***************************************************************************/
239    
240 ph10 212 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241     below must be updated in sync. */
242 nigel 77
243 ph10 164 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
244     RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
245     RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
246     RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
247 ph10 210 RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
248 ph10 212 RM51, RM52, RM53, RM54 };
249 ph10 164
250 nigel 87 /* These versions of the macros use the stack, as normal. There are debugging
251 ph10 165 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 ph10 164 actually used in this definition. */
253 nigel 77
254     #ifndef NO_RECURSE
255     #define REGISTER register
256 ph10 164
/* Stack-recursion forms of RMATCH and RRETURN. RMATCH calls match() directly,
passing the caller's current mstart and rdepth+1, and leaves the result in the
caller's rrc variable; RRETURN is a plain return. The final RMATCH argument
(the unique call number) is accepted but not used in these forms. The DEBUG
variants additionally trace each call and return on stdout. */

#ifdef DEBUG
#define RMATCH(r_eptr,r_ecode,r_offset_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  { \
  printf("match() called in line %d\n", __LINE__); \
  rrc = match(r_eptr,r_ecode,mstart,r_offset_top,r_md,r_ims,r_eptrb,r_flags,rdepth+1); \
  printf("to line %d\n", __LINE__); \
  }
#define RRETURN(r_value) \
  { \
  printf("match() returned %d from line %d ", r_value, __LINE__); \
  return r_value; \
  }
#else
#define RMATCH(r_eptr,r_ecode,r_offset_top,r_md,r_ims,r_eptrb,r_flags,r_where) \
  rrc = match(r_eptr,r_ecode,mstart,r_offset_top,r_md,r_ims,r_eptrb,r_flags,rdepth+1)
#define RRETURN(r_value) return r_value
#endif
274    
275 nigel 77 #else
276    
277    
278 ph10 164 /* These versions of the macros manage a private stack on the heap. Note that
279     the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280     argument of match(), which never changes. */
281 nigel 77
282     #define REGISTER
283    
/* Heap-recursion form of RMATCH. A new frame is obtained from
pcre_stack_malloc, loaded with the "arguments" of the simulated call, linked
to the current frame, and made current; control then jumps to HEAP_RECURSE.
The call number rw is stored in the calling frame's Xwhere field and is also
pasted into the label L_<rw> that this macro defines as the resume point.

If the allocation fails, PCRE_ERROR_NOMEMORY is returned through RRETURN, so
that the frames already on the private stack are released in the usual way
instead of a NULL pointer being written through. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* Heap-recursion form of RRETURN. The current frame is freed and its
predecessor becomes current. If there is a predecessor, the result is placed
in rrc and control goes to HEAP_RETURN; otherwise this was the outermost
frame and the value is returned from match() itself. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
316    
317    
318     /* Structure for remembering the local variables in a private frame */
319    
320     typedef struct heapframe {
321     struct heapframe *Xprevframe;
322    
323     /* Function arguments that may change */
324    
325     const uschar *Xeptr;
326     const uschar *Xecode;
327 ph10 172 const uschar *Xmstart;
328 nigel 77 int Xoffset_top;
329     long int Xims;
330     eptrblock *Xeptrb;
331     int Xflags;
332 nigel 91 unsigned int Xrdepth;
333 nigel 77
334     /* Function local variables */
335    
336     const uschar *Xcallpat;
337     const uschar *Xcharptr;
338     const uschar *Xdata;
339     const uschar *Xnext;
340     const uschar *Xpp;
341     const uschar *Xprev;
342     const uschar *Xsaved_eptr;
343    
344     recursion_info Xnew_recursive;
345    
346     BOOL Xcur_is_word;
347     BOOL Xcondition;
348     BOOL Xprev_is_word;
349    
350     unsigned long int Xoriginal_ims;
351    
352     #ifdef SUPPORT_UCP
353     int Xprop_type;
354 nigel 87 int Xprop_value;
355 nigel 77 int Xprop_fail_result;
356     int Xprop_category;
357     int Xprop_chartype;
358 nigel 87 int Xprop_script;
359 ph10 123 int Xoclength;
360     uschar Xocchars[8];
361 nigel 77 #endif
362    
363     int Xctype;
364 nigel 93 unsigned int Xfc;
365 nigel 77 int Xfi;
366     int Xlength;
367     int Xmax;
368     int Xmin;
369     int Xnumber;
370     int Xoffset;
371     int Xop;
372     int Xsave_capture_last;
373     int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374     int Xstacksave[REC_STACK_SAVE_MAX];
375    
376     eptrblock Xnewptrb;
377    
378 ph10 164 /* Where to jump back to */
379 nigel 77
380 ph10 164 int Xwhere;
381 ph10 165
382 nigel 77 } heapframe;
383    
384     #endif
385    
386    
387     /***************************************************************************
388     ***************************************************************************/
389    
390    
391    
392     /*************************************************
393     * Match from current position *
394     *************************************************/
395    
396 nigel 93 /* This function is called recursively in many circumstances. Whenever it
397 nigel 77 returns a negative (error) response, the outer incarnation must also return the
398     same response.
399    
400     Performance note: It might be tempting to extract commonly used fields from the
401     md structure (e.g. utf8, end_subject) into individual variables to improve
402     performance. Tests using gcc on a SPARC disproved this; in the first case, it
403     made performance worse.
404    
405     Arguments:
406 nigel 93 eptr pointer to current character in subject
407     ecode pointer to current position in compiled code
408 ph10 168 mstart pointer to the current match start position (can be modified
409 ph10 172 by encountering \K)
410 nigel 77 offset_top current top pointer
411     md pointer to "static" info for the match
412     ims current /i, /m, and /s options
413     eptrb pointer to chain of blocks containing eptr at start of
414     brackets - for testing for empty matches
415     flags can contain
416     match_condassert - this is an assertion condition
417 nigel 93 match_cbegroup - this is the start of an unlimited repeat
418     group that can match an empty string
419 nigel 87 rdepth the recursion depth
420 nigel 77
421     Returns: MATCH_MATCH if matched ) these values are >= 0
422     MATCH_NOMATCH if failed to match )
423     a negative PCRE_ERROR_xxx value if aborted by an error condition
424 nigel 87 (e.g. stopped by repeated call or recursion limit)
425 nigel 77 */
426    
427     static int
428 ph10 172 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429 nigel 77 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430 nigel 91 int flags, unsigned int rdepth)
431 nigel 77 {
432     /* These variables do not need to be preserved over recursion in this function,
433 nigel 93 so they can be ordinary variables in all cases. Mark some of them with
434     "register" because they are used a lot in loops. */
435 nigel 77
436 nigel 91 register int rrc; /* Returns from recursive calls */
437     register int i; /* Used for loops not involving calls to RMATCH() */
438 nigel 93 register unsigned int c; /* Character values not kept over RMATCH() calls */
439 nigel 91 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440 nigel 77
441 nigel 93 BOOL minimize, possessive; /* Quantifier options */
442    
443 nigel 77 /* When recursion is not being used, all "local" variables that have to be
444     preserved over calls to RMATCH() are part of a "frame" which is obtained from
445     heap storage. Set up the top-level frame here; others are obtained from the
446     heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447    
448     #ifdef NO_RECURSE
449     heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450     frame->Xprevframe = NULL; /* Marks the top level */
451    
452     /* Copy in the original argument variables */
453    
454     frame->Xeptr = eptr;
455     frame->Xecode = ecode;
456 ph10 168 frame->Xmstart = mstart;
457 nigel 77 frame->Xoffset_top = offset_top;
458     frame->Xims = ims;
459     frame->Xeptrb = eptrb;
460     frame->Xflags = flags;
461 nigel 87 frame->Xrdepth = rdepth;
462 nigel 77
463     /* This is where control jumps back to to effect "recursion" */
464    
465     HEAP_RECURSE:
466    
467     /* Macros make the argument variables come from the current frame */
468    
469     #define eptr frame->Xeptr
470     #define ecode frame->Xecode
471 ph10 168 #define mstart frame->Xmstart
472 nigel 77 #define offset_top frame->Xoffset_top
473     #define ims frame->Xims
474     #define eptrb frame->Xeptrb
475     #define flags frame->Xflags
476 nigel 87 #define rdepth frame->Xrdepth
477 nigel 77
478     /* Ditto for the local variables */
479    
480     #ifdef SUPPORT_UTF8
481     #define charptr frame->Xcharptr
482     #endif
483     #define callpat frame->Xcallpat
484     #define data frame->Xdata
485     #define next frame->Xnext
486     #define pp frame->Xpp
487     #define prev frame->Xprev
488     #define saved_eptr frame->Xsaved_eptr
489    
490     #define new_recursive frame->Xnew_recursive
491    
492     #define cur_is_word frame->Xcur_is_word
493     #define condition frame->Xcondition
494     #define prev_is_word frame->Xprev_is_word
495    
496     #define original_ims frame->Xoriginal_ims
497    
498     #ifdef SUPPORT_UCP
499     #define prop_type frame->Xprop_type
500 nigel 87 #define prop_value frame->Xprop_value
501 nigel 77 #define prop_fail_result frame->Xprop_fail_result
502     #define prop_category frame->Xprop_category
503     #define prop_chartype frame->Xprop_chartype
504 nigel 87 #define prop_script frame->Xprop_script
505 ph10 115 #define oclength frame->Xoclength
506     #define occhars frame->Xocchars
507 nigel 77 #endif
508    
509     #define ctype frame->Xctype
510     #define fc frame->Xfc
511     #define fi frame->Xfi
512     #define length frame->Xlength
513     #define max frame->Xmax
514     #define min frame->Xmin
515     #define number frame->Xnumber
516     #define offset frame->Xoffset
517     #define op frame->Xop
518     #define save_capture_last frame->Xsave_capture_last
519     #define save_offset1 frame->Xsave_offset1
520     #define save_offset2 frame->Xsave_offset2
521     #define save_offset3 frame->Xsave_offset3
522     #define stacksave frame->Xstacksave
523    
524     #define newptrb frame->Xnewptrb
525    
526     /* When recursion is being used, local variables are allocated on the stack and
527     get preserved during recursion in the normal way. In this environment, fi and
528     i, and fc and c, can be the same variables. */
529    
530 nigel 93 #else /* NO_RECURSE not defined */
531 nigel 77 #define fi i
532     #define fc c
533    
534    
535 nigel 87 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536     const uschar *charptr; /* in small blocks of the code. My normal */
537     #endif /* style of coding would have declared */
538     const uschar *callpat; /* them within each of those blocks. */
539     const uschar *data; /* However, in order to accommodate the */
540     const uschar *next; /* version of this code that uses an */
541     USPTR pp; /* external "stack" implemented on the */
542     const uschar *prev; /* heap, it is easier to declare them all */
543     USPTR saved_eptr; /* here, so the declarations can be cut */
544     /* out in a block. The only declarations */
545     recursion_info new_recursive; /* within blocks below are for variables */
546     /* that do not have to be preserved over */
547     BOOL cur_is_word; /* a recursive call to RMATCH(). */
548     BOOL condition;
549 nigel 77 BOOL prev_is_word;
550    
551     unsigned long int original_ims;
552    
553     #ifdef SUPPORT_UCP
554     int prop_type;
555 nigel 87 int prop_value;
556 nigel 77 int prop_fail_result;
557     int prop_category;
558     int prop_chartype;
559 nigel 87 int prop_script;
560 ph10 115 int oclength;
561     uschar occhars[8];
562 nigel 77 #endif
563    
564     int ctype;
565     int length;
566     int max;
567     int min;
568     int number;
569     int offset;
570     int op;
571     int save_capture_last;
572     int save_offset1, save_offset2, save_offset3;
573     int stacksave[REC_STACK_SAVE_MAX];
574    
575     eptrblock newptrb;
576 nigel 93 #endif /* NO_RECURSE */
577 nigel 77
578     /* These statements are here to stop the compiler complaining about unitialized
579     variables. */
580    
581     #ifdef SUPPORT_UCP
582 nigel 87 prop_value = 0;
583 nigel 77 prop_fail_result = 0;
584     #endif
585    
586 nigel 93
587 nigel 91 /* This label is used for tail recursion, which is used in a few cases even
588     when NO_RECURSE is not defined, in order to reduce the amount of stack that is
589     used. Thanks to Ian Taylor for noticing this possibility and sending the
590     original patch. */
591    
592     TAIL_RECURSE:
593    
594 nigel 87 /* OK, now we can get on with the real code of the function. Recursive calls
595     are specified by the macro RMATCH and RRETURN is used to return. When
596     NO_RECURSE is *not* defined, these just turn into a recursive call to match()
597     and a "return", respectively (possibly with some debugging if DEBUG is
598     defined). However, RMATCH isn't like a function call because it's quite a
599     complicated macro. It has to be used in one particular way. This shouldn't,
600     however, impact performance when true recursion is being used. */
601 nigel 77
602 ph10 164 #ifdef SUPPORT_UTF8
603     utf8 = md->utf8; /* Local copy of the flag */
604     #else
605     utf8 = FALSE;
606     #endif
607    
608 nigel 87 /* First check that we haven't called match() too many times, or that we
609     haven't exceeded the recursive call limit. */
610    
611 nigel 77 if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
612 nigel 87 if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
613 nigel 77
614     original_ims = ims; /* Save for resetting on ')' */
615 nigel 91
616 nigel 93 /* At the start of a group with an unlimited repeat that may match an empty
617     string, the match_cbegroup flag is set. When this is the case, add the current
618     subject pointer to the chain of such remembered pointers, to be checked when we
619     hit the closing ket, in order to break infinite loops that match no characters.
620 ph10 197 When match() is called in other circumstances, don't add to the chain. The
621     match_cbegroup flag must NOT be used with tail recursion, because the memory
622     block that is used is on the stack, so a new one may be required for each
623     match(). */
624 nigel 77
625 nigel 93 if ((flags & match_cbegroup) != 0)
626 nigel 77 {
627 ph10 197 newptrb.epb_saved_eptr = eptr;
628     newptrb.epb_prev = eptrb;
629     eptrb = &newptrb;
630 nigel 77 }
631    
632 nigel 93 /* Now start processing the opcodes. */
633 nigel 77
634     for (;;)
635     {
636 nigel 93 minimize = possessive = FALSE;
637 nigel 77 op = *ecode;
638    
639     /* For partial matching, remember if we ever hit the end of the subject after
640     matching at least one subject character. */
641    
642     if (md->partial &&
643     eptr >= md->end_subject &&
644 ph10 168 eptr > mstart)
645 nigel 77 md->hitend = TRUE;
646 ph10 208
647 nigel 93 switch(op)
648     {
649 ph10 210 case OP_FAIL:
650 ph10 212 RRETURN(MATCH_NOMATCH);
651 ph10 211
652 ph10 210 case OP_PRUNE:
653     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
654     ims, eptrb, flags, RM51);
655     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
656 ph10 212 RRETURN(MATCH_PRUNE);
657 ph10 211
658 ph10 210 case OP_COMMIT:
659     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660     ims, eptrb, flags, RM52);
661     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662 ph10 212 RRETURN(MATCH_COMMIT);
663 ph10 211
664 ph10 210 case OP_SKIP:
665     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666     ims, eptrb, flags, RM53);
667     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668 ph10 211 md->start_match_ptr = eptr; /* Pass back current position */
669 ph10 212 RRETURN(MATCH_SKIP);
670 ph10 211
671 ph10 210 case OP_THEN:
672     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
673 ph10 212 ims, eptrb, flags, RM54);
674 ph10 210 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
675 ph10 212 RRETURN(MATCH_THEN);
676 ph10 211
677 nigel 93 /* Handle a capturing bracket. If there is space in the offset vector, save
678     the current subject position in the working slot at the top of the vector.
679     We mustn't change the current values of the data slot, because they may be
680     set from a previous iteration of this group, and be referred to by a
681     reference inside the group.
682 nigel 77
683 nigel 93 If the bracket fails to match, we need to restore this value and also the
684     values of the final offsets, in case they were set by a previous iteration
685     of the same bracket.
686 nigel 77
687 nigel 93 If there isn't enough space in the offset vector, treat this as if it were
688     a non-capturing bracket. Don't worry about setting the flag for the error
689     case here; that is handled in the code for KET. */
690 nigel 77
691 nigel 93 case OP_CBRA:
692     case OP_SCBRA:
693     number = GET2(ecode, 1+LINK_SIZE);
694 nigel 77 offset = number << 1;
695    
696     #ifdef DEBUG
697 nigel 93 printf("start bracket %d\n", number);
698     printf("subject=");
699 nigel 77 pchars(eptr, 16, TRUE, md);
700     printf("\n");
701     #endif
702    
703     if (offset < md->offset_max)
704     {
705     save_offset1 = md->offset_vector[offset];
706     save_offset2 = md->offset_vector[offset+1];
707     save_offset3 = md->offset_vector[md->offset_end - number];
708     save_capture_last = md->capture_last;
709    
710     DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
711     md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
712    
713 nigel 93 flags = (op == OP_SCBRA)? match_cbegroup : 0;
714 nigel 77 do
715     {
716 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717     ims, eptrb, flags, RM1);
718 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
719 nigel 77 md->capture_last = save_capture_last;
720     ecode += GET(ecode, 1);
721     }
722     while (*ecode == OP_ALT);
723    
724     DPRINTF(("bracket %d failed\n", number));
725    
726     md->offset_vector[offset] = save_offset1;
727     md->offset_vector[offset+1] = save_offset2;
728     md->offset_vector[md->offset_end - number] = save_offset3;
729    
730     RRETURN(MATCH_NOMATCH);
731     }
732    
733 ph10 197 /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
734     as a non-capturing bracket. */
735 nigel 77
736 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
737     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
738    
739 nigel 93 DPRINTF(("insufficient capture room: treat as non-capturing\n"));
740 nigel 77
741 ph10 197 /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742     /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743    
744 nigel 93 /* Non-capturing bracket. Loop for all the alternatives. When we get to the
745     final alternative within the brackets, we would return the result of a
746     recursive call to match() whatever happened. We can reduce stack usage by
747 ph10 197 turning this into a tail recursion, except in the case when match_cbegroup
748     is set.*/
749 nigel 77
750 nigel 93 case OP_BRA:
751     case OP_SBRA:
752     DPRINTF(("start non-capturing bracket\n"));
753     flags = (op >= OP_SBRA)? match_cbegroup : 0;
754 nigel 91 for (;;)
755 nigel 77 {
756 ph10 197 if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
757 nigel 93 {
758 ph10 197 if (flags == 0) /* Not a possibly empty group */
759     {
760     ecode += _pcre_OP_lengths[*ecode];
761     DPRINTF(("bracket 0 tail recursion\n"));
762     goto TAIL_RECURSE;
763     }
764    
765     /* Possibly empty group; can't use tail recursion. */
766    
767     RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
768     eptrb, flags, RM48);
769     RRETURN(rrc);
770 nigel 93 }
771 nigel 91
772     /* For non-final alternatives, continue the loop for a NOMATCH result;
773     otherwise return. */
774    
775 ph10 164 RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
776     eptrb, flags, RM2);
777 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778 nigel 77 ecode += GET(ecode, 1);
779     }
780 nigel 91 /* Control never reaches here. */
781 nigel 77
782     /* Conditional group: compilation checked that there are no more than
783     two branches. If the condition is false, skipping the first branch takes us
784     past the end if there is only one branch, but that's OK because that is
785 nigel 91 exactly what going to the ket would do. As there is only one branch to be
786     obeyed, we can use tail recursion to avoid using another stack frame. */
787 nigel 77
788     case OP_COND:
789 nigel 93 case OP_SCOND:
790 ph10 381 /* Because of the way auto-callout works during compile, a callout item is
791     inserted between OP_COND and an assertion condition. */
792    
793     if (ecode[LINK_SIZE+1] == OP_CALLOUT)
794     {
795     if (pcre_callout != NULL)
796     {
797     pcre_callout_block cb;
798     cb.version = 1; /* Version 1 of the callout block */
799     cb.callout_number = ecode[LINK_SIZE+2];
800     cb.offset_vector = md->offset_vector;
801     cb.subject = (PCRE_SPTR)md->start_subject;
802     cb.subject_length = md->end_subject - md->start_subject;
803     cb.start_match = mstart - md->start_subject;
804     cb.current_position = eptr - md->start_subject;
805     cb.pattern_position = GET(ecode, LINK_SIZE + 3);
806     cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
807     cb.capture_top = offset_top/2;
808     cb.capture_last = md->capture_last;
809     cb.callout_data = md->callout_data;
810     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
811     if (rrc < 0) RRETURN(rrc);
812     }
813     ecode += _pcre_OP_lengths[OP_CALLOUT];
814     }
815    
816     /* Now see what the actual condition is */
817    
818 nigel 93 if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
819 nigel 77 {
820 nigel 93 offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
821     condition = md->recursive != NULL &&
822     (offset == RREF_ANY || offset == md->recursive->group_num);
823     ecode += condition? 3 : GET(ecode, 1);
824     }
825    
826     else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
827     {
828 nigel 77 offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
829 nigel 93 condition = offset < offset_top && md->offset_vector[offset] >= 0;
830     ecode += condition? 3 : GET(ecode, 1);
831 nigel 77 }
832    
833 nigel 93 else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
834     {
835     condition = FALSE;
836     ecode += GET(ecode, 1);
837     }
838    
839 nigel 77 /* The condition is an assertion. Call match() to evaluate it - setting
840 nigel 93 match_condassert in the flags argument causes it to stop at the end of an
841     assertion. */
842 nigel 77
843     else
844     {
845 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
846     match_condassert, RM3);
847 nigel 77 if (rrc == MATCH_MATCH)
848     {
849 nigel 93 condition = TRUE;
850     ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
851 nigel 77 while (*ecode == OP_ALT) ecode += GET(ecode, 1);
852     }
853 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
854 nigel 77 {
855     RRETURN(rrc); /* Need braces because of following else */
856     }
857 nigel 93 else
858     {
859     condition = FALSE;
860     ecode += GET(ecode, 1);
861     }
862     }
863 nigel 91
864 nigel 93 /* We are now at the branch that is to be obeyed. As there is only one,
865 ph10 197 we can use tail recursion to avoid using another stack frame, except when
866     match_cbegroup is required for an unlimited repeat of a possibly empty
867     group. If the second alternative doesn't exist, we can just plough on. */
868 nigel 91
869 nigel 93 if (condition || *ecode == OP_ALT)
870     {
871 nigel 91 ecode += 1 + LINK_SIZE;
872 ph10 197 if (op == OP_SCOND) /* Possibly empty group */
873     {
874     RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
875     RRETURN(rrc);
876     }
877     else /* Group must match something */
878     {
879     flags = 0;
880     goto TAIL_RECURSE;
881     }
882 nigel 77 }
883 ph10 197 else /* Condition false & no 2nd alternative */
884 nigel 93 {
885     ecode += 1 + LINK_SIZE;
886     }
887     break;
888 nigel 77
889    
890 ph10 210 /* End of the pattern, either real or forced. If we are in a top-level
891     recursion, we should restore the offsets appropriately and continue from
892     after the call. */
893 nigel 77
894 ph10 210 case OP_ACCEPT:
895 nigel 77 case OP_END:
896     if (md->recursive != NULL && md->recursive->group_num == 0)
897     {
898     recursion_info *rec = md->recursive;
899 nigel 87 DPRINTF(("End of pattern in a (?0) recursion\n"));
900 nigel 77 md->recursive = rec->prevrec;
901     memmove(md->offset_vector, rec->offset_save,
902     rec->saved_max * sizeof(int));
903 ph10 168 mstart = rec->save_start;
904 nigel 77 ims = original_ims;
905     ecode = rec->after_call;
906     break;
907     }
908    
909     /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
910     string - backtracking will then try other alternatives, if any. */
911    
912 ph10 168 if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
913     md->end_match_ptr = eptr; /* Record where we ended */
914     md->end_offset_top = offset_top; /* and how many extracts were taken */
915 ph10 210 md->start_match_ptr = mstart; /* and the start (\K can modify) */
916 nigel 77 RRETURN(MATCH_MATCH);
917    
918     /* Change option settings */
919    
920     case OP_OPT:
921     ims = ecode[1];
922     ecode += 2;
923     DPRINTF(("ims set to %02lx\n", ims));
924     break;
925    
926     /* Assertion brackets. Check the alternative branches in turn - the
927     matching won't pass the KET for an assertion. If any one branch matches,
928     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
929     start of each branch to move the current point backwards, so the code at
930     this level is identical to the lookahead case. */
931    
932     case OP_ASSERT:
933     case OP_ASSERTBACK:
934     do
935     {
936 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
937     RM4);
938 nigel 77 if (rrc == MATCH_MATCH) break;
939 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
940 nigel 77 ecode += GET(ecode, 1);
941     }
942     while (*ecode == OP_ALT);
943     if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
944    
945     /* If checking an assertion for a condition, return MATCH_MATCH. */
946    
947     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
948    
949     /* Continue from after the assertion, updating the offsets high water
950     mark, since extracts may have been taken during the assertion. */
951    
952     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
953     ecode += 1 + LINK_SIZE;
954     offset_top = md->end_offset_top;
955     continue;
956    
957     /* Negative assertion: all branches must fail to match */
958    
959     case OP_ASSERT_NOT:
960     case OP_ASSERTBACK_NOT:
961     do
962     {
963 ph10 164 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
964     RM5);
965 nigel 77 if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
966 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
967 nigel 77 ecode += GET(ecode,1);
968     }
969     while (*ecode == OP_ALT);
970    
971     if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
972    
973     ecode += 1 + LINK_SIZE;
974     continue;
975    
976     /* Move the subject pointer back. This occurs only at the start of
977     each branch of a lookbehind assertion. If we are too close to the start to
978     move back, this match function fails. When working with UTF-8 we move
979     back a number of characters, not bytes. */
980    
981     case OP_REVERSE:
982     #ifdef SUPPORT_UTF8
983     if (utf8)
984     {
985 nigel 93 i = GET(ecode, 1);
986     while (i-- > 0)
987 nigel 77 {
988     eptr--;
989     if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
990 ph10 207 BACKCHAR(eptr);
991 nigel 77 }
992     }
993     else
994     #endif
995    
996     /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
997    
998     {
999 nigel 93 eptr -= GET(ecode, 1);
1000 nigel 77 if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1001     }
1002    
1003     /* Skip to next op code */
1004    
1005     ecode += 1 + LINK_SIZE;
1006     break;
1007    
1008     /* The callout item calls an external function, if one is provided, passing
1009     details of the match so far. This is mainly for debugging, though the
1010     function is able to force a failure. */
1011    
1012     case OP_CALLOUT:
1013     if (pcre_callout != NULL)
1014     {
1015     pcre_callout_block cb;
1016     cb.version = 1; /* Version 1 of the callout block */
1017     cb.callout_number = ecode[1];
1018     cb.offset_vector = md->offset_vector;
1019 nigel 87 cb.subject = (PCRE_SPTR)md->start_subject;
1020 nigel 77 cb.subject_length = md->end_subject - md->start_subject;
1021 ph10 168 cb.start_match = mstart - md->start_subject;
1022 nigel 77 cb.current_position = eptr - md->start_subject;
1023     cb.pattern_position = GET(ecode, 2);
1024     cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1025     cb.capture_top = offset_top/2;
1026     cb.capture_last = md->capture_last;
1027     cb.callout_data = md->callout_data;
1028     if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1029     if (rrc < 0) RRETURN(rrc);
1030     }
1031     ecode += 2 + 2*LINK_SIZE;
1032     break;
1033    
1034     /* Recursion either matches the current regex, or some subexpression. The
1035     offset data is the offset to the starting bracket from the start of the
1036     whole pattern. (This is so that it works from duplicated subpatterns.)
1037    
1038     If there are any capturing brackets started but not finished, we have to
1039     save their starting points and reinstate them after the recursion. However,
1040     we don't know how many such there are (offset_top records the completed
1041     total) so we just have to save all the potential data. There may be up to
1042     65535 such values, which is too large to put on the stack, but using malloc
1043     for small numbers seems expensive. As a compromise, the stack is used when
1044     there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1045     is used. If the malloc fails, the match is abandoned at this point and
1046     PCRE_ERROR_NOMEMORY is returned, so a partially saved set of offsets is
1047     never used.
1048    
1049     There are also other values that have to be saved. We use a chained
1050     sequence of blocks that actually live on the stack. Thanks to Robin Houston
1051     for the original version of this logic. */
1052    
1053     case OP_RECURSE:
1054     {
1055     callpat = md->start_code + GET(ecode, 1);
1056 nigel 93 new_recursive.group_num = (callpat == md->start_code)? 0 :
1057     GET2(callpat, 1 + LINK_SIZE);
1058 nigel 77
1059     /* Add to "recursing stack" */
1060    
1061     new_recursive.prevrec = md->recursive;
1062     md->recursive = &new_recursive;
1063    
1064     /* Find where to continue from afterwards */
1065    
1066     ecode += 1 + LINK_SIZE;
1067     new_recursive.after_call = ecode;
1068    
1069     /* Now save the offset data. */
1070    
1071     new_recursive.saved_max = md->offset_end;
1072     if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1073     new_recursive.offset_save = stacksave;
1074     else
1075     {
1076     new_recursive.offset_save =
1077     (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1078     if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1079     }
1080    
1081     memcpy(new_recursive.offset_save, md->offset_vector,
1082     new_recursive.saved_max * sizeof(int));
1083 ph10 168 new_recursive.save_start = mstart;
1084     mstart = eptr;
1085 nigel 77
1086     /* OK, now we can do the recursion. For each top-level alternative we
1087     restore the offset and recursion data. */
1088    
1089     DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1090 nigel 93 flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1091 nigel 77 do
1092     {
1093 ph10 164 RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1094     md, ims, eptrb, flags, RM6);
1095 nigel 77 if (rrc == MATCH_MATCH)
1096     {
1097 nigel 87 DPRINTF(("Recursion matched\n"));
1098 nigel 77 md->recursive = new_recursive.prevrec;
1099     if (new_recursive.offset_save != stacksave)
1100     (pcre_free)(new_recursive.offset_save);
1101     RRETURN(MATCH_MATCH);
1102     }
1103 ph10 210 else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1104 nigel 87 {
1105     DPRINTF(("Recursion gave error %d\n", rrc));
1106     RRETURN(rrc);
1107     }
1108 nigel 77
1109     md->recursive = &new_recursive;
1110     memcpy(md->offset_vector, new_recursive.offset_save,
1111     new_recursive.saved_max * sizeof(int));
1112     callpat += GET(callpat, 1);
1113     }
1114     while (*callpat == OP_ALT);
1115    
1116     DPRINTF(("Recursion didn't match\n"));
1117     md->recursive = new_recursive.prevrec;
1118     if (new_recursive.offset_save != stacksave)
1119     (pcre_free)(new_recursive.offset_save);
1120     RRETURN(MATCH_NOMATCH);
1121     }
1122     /* Control never reaches here */
1123    
1124     /* "Once" brackets are like assertion brackets except that after a match,
1125     the point in the subject string is not moved back. Thus there can never be
1126     a move back into the brackets. Friedl calls these "atomic" subpatterns.
1127     Check the alternative branches in turn - the matching won't pass the KET
1128     for this kind of subpattern. If any one branch matches, we carry on as at
1129     the end of a normal bracket, leaving the subject pointer. */
1130    
1131     case OP_ONCE:
1132 nigel 91 prev = ecode;
1133     saved_eptr = eptr;
1134    
1135     do
1136 nigel 77 {
1137 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1138 nigel 91 if (rrc == MATCH_MATCH) break;
1139 ph10 210 if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1140 nigel 91 ecode += GET(ecode,1);
1141     }
1142     while (*ecode == OP_ALT);
1143 nigel 77
1144 nigel 91 /* If we hit the end of the group (which could be repeated), fail */
1145 nigel 77
1146 nigel 91 if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1147 nigel 77
1148 nigel 91 /* Continue as from after the assertion, updating the offsets high water
1149     mark, since extracts may have been taken. */
1150 nigel 77
1151 nigel 93 do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1152 nigel 77
1153 nigel 91 offset_top = md->end_offset_top;
1154     eptr = md->end_match_ptr;
1155 nigel 77
1156 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1157     happens for a repeating ket if no characters were matched in the group.
1158     This is the forcible breaking of infinite loops as implemented in Perl
1159     5.005. If there is an options reset, it will get obeyed in the normal
1160     course of events. */
1161 nigel 77
1162 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1163     {
1164     ecode += 1+LINK_SIZE;
1165     break;
1166     }
1167 nigel 77
1168 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1169     preceding bracket, in the appropriate order. The second "call" of match()
1170     uses tail recursion, to avoid using another stack frame. We need to reset
1171     any options that changed within the bracket before re-running it, so
1172     check the next opcode. */
1173 nigel 77
1174 nigel 91 if (ecode[1+LINK_SIZE] == OP_OPT)
1175     {
1176     ims = (ims & ~PCRE_IMS) | ecode[4];
1177     DPRINTF(("ims set to %02lx at group repeat\n", ims));
1178     }
1179 nigel 77
1180 nigel 91 if (*ecode == OP_KETRMIN)
1181     {
1182 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1183 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1184     ecode = prev;
1185 ph10 197 flags = 0;
1186 nigel 91 goto TAIL_RECURSE;
1187 nigel 77 }
1188 nigel 91 else /* OP_KETRMAX */
1189     {
1190 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1191 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1192     ecode += 1 + LINK_SIZE;
1193 ph10 197 flags = 0;
1194 nigel 91 goto TAIL_RECURSE;
1195     }
1196     /* Control never gets here */
1197 nigel 77
1198     /* An alternation is the end of a branch; scan along to find the end of the
1199     bracketed group and go to there. */
1200    
1201     case OP_ALT:
1202     do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1203     break;
1204    
1205 ph10 335 /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1206     indicating that it may occur zero times. It may repeat infinitely, or not
1207     at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1208     with fixed upper repeat limits are compiled as a number of copies, with the
1209     optional ones preceded by BRAZERO or BRAMINZERO. */
1210 nigel 77
1211     case OP_BRAZERO:
1212     {
1213     next = ecode+1;
1214 ph10 164 RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1215 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1216     do next += GET(next,1); while (*next == OP_ALT);
1217 nigel 93 ecode = next + 1 + LINK_SIZE;
1218 nigel 77 }
1219     break;
1220    
1221     case OP_BRAMINZERO:
1222     {
1223     next = ecode+1;
1224 nigel 93 do next += GET(next, 1); while (*next == OP_ALT);
1225 ph10 164 RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1226 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1227     ecode++;
1228     }
1229     break;
1230    
1231 ph10 335 case OP_SKIPZERO:
1232     {
1233     next = ecode+1;
1234     do next += GET(next,1); while (*next == OP_ALT);
1235     ecode = next + 1 + LINK_SIZE;
1236     }
1237     break;
1238    
1239 nigel 93 /* End of a group, repeated or non-repeating. */
1240 nigel 77
1241     case OP_KET:
1242     case OP_KETRMIN:
1243     case OP_KETRMAX:
1244 nigel 91 prev = ecode - GET(ecode, 1);
1245 nigel 77
1246 nigel 93 /* If this was a group that remembered the subject start, in order to break
1247     infinite repeats of empty string matches, retrieve the subject start from
1248     the chain. Otherwise, set it NULL. */
1249 nigel 77
1250 nigel 93 if (*prev >= OP_SBRA)
1251     {
1252     saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1253     eptrb = eptrb->epb_prev; /* Backup to previous group */
1254     }
1255     else saved_eptr = NULL;
1256 nigel 77
1257 nigel 93 /* If we are at the end of an assertion group, stop matching and return
1258     MATCH_MATCH, but record the current high water mark for use by positive
1259     assertions. Do this also for the "once" (atomic) groups. */
1260    
1261 nigel 91 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1262     *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1263     *prev == OP_ONCE)
1264     {
1265     md->end_match_ptr = eptr; /* For ONCE */
1266     md->end_offset_top = offset_top;
1267     RRETURN(MATCH_MATCH);
1268     }
1269 nigel 77
1270 nigel 93 /* For capturing groups we have to check the group number back at the start
1271     and if necessary complete handling an extraction by setting the offsets and
1272     bumping the high water mark. Note that whole-pattern recursion is coded as
1273     a recurse into group 0, so it won't be picked up here. Instead, we catch it
1274     when the OP_END is reached. Other recursion is handled here. */
1275 nigel 77
1276 nigel 93 if (*prev == OP_CBRA || *prev == OP_SCBRA)
1277 nigel 91 {
1278 nigel 93 number = GET2(prev, 1+LINK_SIZE);
1279 nigel 91 offset = number << 1;
1280 nigel 77
1281     #ifdef DEBUG
1282 nigel 91 printf("end bracket %d", number);
1283     printf("\n");
1284 nigel 77 #endif
1285    
1286 nigel 93 md->capture_last = number;
1287     if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1288 nigel 91 {
1289 nigel 93 md->offset_vector[offset] =
1290     md->offset_vector[md->offset_end - number];
1291     md->offset_vector[offset+1] = eptr - md->start_subject;
1292     if (offset_top <= offset) offset_top = offset + 2;
1293     }
1294 nigel 77
1295 nigel 93 /* Handle a recursively called group. Restore the offsets
1296     appropriately and continue from after the call. */
1297 nigel 77
1298 nigel 93 if (md->recursive != NULL && md->recursive->group_num == number)
1299     {
1300     recursion_info *rec = md->recursive;
1301     DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1302     md->recursive = rec->prevrec;
1303 ph10 168 mstart = rec->save_start;
1304 nigel 93 memcpy(md->offset_vector, rec->offset_save,
1305     rec->saved_max * sizeof(int));
1306     ecode = rec->after_call;
1307     ims = original_ims;
1308     break;
1309 nigel 77 }
1310 nigel 91 }
1311 nigel 77
1312 nigel 93 /* For both capturing and non-capturing groups, reset the value of the ims
1313     flags, in case they got changed during the group. */
1314 nigel 77
1315 nigel 91 ims = original_ims;
1316     DPRINTF(("ims reset to %02lx\n", ims));
1317 nigel 77
1318 nigel 91 /* For a non-repeating ket, just continue at this level. This also
1319     happens for a repeating ket if no characters were matched in the group.
1320     This is the forcible breaking of infinite loops as implemented in Perl
1321     5.005. If there is an options reset, it will get obeyed in the normal
1322     course of events. */
1323 nigel 77
1324 nigel 91 if (*ecode == OP_KET || eptr == saved_eptr)
1325     {
1326     ecode += 1 + LINK_SIZE;
1327     break;
1328     }
1329 nigel 77
1330 nigel 91 /* The repeating kets try the rest of the pattern or restart from the
1331     preceding bracket, in the appropriate order. In the second case, we can use
1332 ph10 197 tail recursion to avoid using another stack frame, unless we have an
1333     unlimited repeat of a group that can match an empty string. */
1334 nigel 77
1335 nigel 93 flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1336    
1337 nigel 91 if (*ecode == OP_KETRMIN)
1338     {
1339 ph10 197 RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1340 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1341 ph10 197 if (flags != 0) /* Could match an empty string */
1342     {
1343     RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1344     RRETURN(rrc);
1345     }
1346 nigel 91 ecode = prev;
1347     goto TAIL_RECURSE;
1348 nigel 77 }
1349 nigel 91 else /* OP_KETRMAX */
1350     {
1351 ph10 164 RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1352 nigel 91 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1353     ecode += 1 + LINK_SIZE;
1354 ph10 197 flags = 0;
1355 nigel 91 goto TAIL_RECURSE;
1356     }
1357     /* Control never gets here */
1358 nigel 77
1359     /* Start of subject unless notbol, or after internal newline if multiline */
1360    
1361     case OP_CIRC:
1362     if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1363     if ((ims & PCRE_MULTILINE) != 0)
1364     {
1365 nigel 91 if (eptr != md->start_subject &&
1366 nigel 93 (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1367 nigel 77 RRETURN(MATCH_NOMATCH);
1368     ecode++;
1369     break;
1370     }
1371     /* ... else fall through */
1372    
1373     /* Start of subject assertion */
1374    
1375     case OP_SOD:
1376     if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1377     ecode++;
1378     break;
1379    
1380     /* Start of match assertion */
1381    
1382     case OP_SOM:
1383     if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1384     ecode++;
1385     break;
1386 ph10 172
1387 ph10 168 /* Reset the start of match point */
1388 ph10 172
1389 ph10 168 case OP_SET_SOM:
1390     mstart = eptr;
1391 ph10 172 ecode++;
1392     break;
1393 nigel 77
1394     /* Assert before internal newline if multiline, or before a terminating
1395     newline unless endonly is set, else end of subject unless noteol is set. */
1396    
1397     case OP_DOLL:
1398     if ((ims & PCRE_MULTILINE) != 0)
1399     {
1400     if (eptr < md->end_subject)
1401 nigel 91 { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1402 nigel 77 else
1403     { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1404     ecode++;
1405     break;
1406     }
1407     else
1408     {
1409     if (md->noteol) RRETURN(MATCH_NOMATCH);
1410     if (!md->endonly)
1411     {
1412 nigel 91 if (eptr != md->end_subject &&
1413 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1414 nigel 77 RRETURN(MATCH_NOMATCH);
1415     ecode++;
1416     break;
1417     }
1418     }
1419 nigel 91 /* ... else fall through for endonly */
1420 nigel 77
1421     /* End of subject assertion (\z) */
1422    
1423     case OP_EOD:
1424     if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1425     ecode++;
1426     break;
1427    
1428     /* End of subject or ending \n assertion (\Z) */
1429    
1430     case OP_EODN:
1431 nigel 91 if (eptr != md->end_subject &&
1432 nigel 93 (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1433 nigel 91 RRETURN(MATCH_NOMATCH);
1434 nigel 77 ecode++;
1435     break;
1436    
1437     /* Word boundary assertions */
1438    
1439     case OP_NOT_WORD_BOUNDARY:
1440     case OP_WORD_BOUNDARY:
1441     {
1442    
1443     /* Find out if the previous and current characters are "word" characters.
1444     It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1445     be "non-word" characters. */
1446    
1447     #ifdef SUPPORT_UTF8
1448     if (utf8)
1449     {
1450     if (eptr == md->start_subject) prev_is_word = FALSE; else
1451     {
1452     const uschar *lastptr = eptr - 1;
1453     while((*lastptr & 0xc0) == 0x80) lastptr--;
1454     GETCHAR(c, lastptr);
1455     prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1456     }
1457     if (eptr >= md->end_subject) cur_is_word = FALSE; else
1458     {
1459     GETCHAR(c, eptr);
1460     cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1461     }
1462     }
1463     else
1464     #endif
1465    
1466     /* More streamlined when not in UTF-8 mode */
1467    
1468     {
1469     prev_is_word = (eptr != md->start_subject) &&
1470     ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1471     cur_is_word = (eptr < md->end_subject) &&
1472     ((md->ctypes[*eptr] & ctype_word) != 0);
1473     }
1474    
1475     /* Now see if the situation is what we want */
1476    
1477     if ((*ecode++ == OP_WORD_BOUNDARY)?
1478     cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1479     RRETURN(MATCH_NOMATCH);
1480     }
1481     break;
1482    
1483     /* Match a single character type; inline for speed */
1484    
1485     case OP_ANY:
1486 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1487 ph10 345 /* Fall through */
1488    
1489 ph10 341 case OP_ALLANY:
1490 nigel 77 if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1491 ph10 342 if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1492 nigel 77 ecode++;
1493     break;
1494    
1495     /* Match a single byte, even in UTF-8 mode. This opcode really does match
1496     any byte, even newline, independent of the setting of PCRE_DOTALL. */
1497    
1498     case OP_ANYBYTE:
1499     if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1500     ecode++;
1501     break;
1502    
1503     case OP_NOT_DIGIT:
1504     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1505     GETCHARINCTEST(c, eptr);
1506     if (
1507     #ifdef SUPPORT_UTF8
1508     c < 256 &&
1509     #endif
1510     (md->ctypes[c] & ctype_digit) != 0
1511     )
1512     RRETURN(MATCH_NOMATCH);
1513     ecode++;
1514     break;
1515    
1516     case OP_DIGIT:
1517     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1518     GETCHARINCTEST(c, eptr);
1519     if (
1520     #ifdef SUPPORT_UTF8
1521     c >= 256 ||
1522     #endif
1523     (md->ctypes[c] & ctype_digit) == 0
1524     )
1525     RRETURN(MATCH_NOMATCH);
1526     ecode++;
1527     break;
1528    
1529     case OP_NOT_WHITESPACE:
1530     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1531     GETCHARINCTEST(c, eptr);
1532     if (
1533     #ifdef SUPPORT_UTF8
1534     c < 256 &&
1535     #endif
1536     (md->ctypes[c] & ctype_space) != 0
1537     )
1538     RRETURN(MATCH_NOMATCH);
1539     ecode++;
1540     break;
1541    
1542     case OP_WHITESPACE:
1543     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1544     GETCHARINCTEST(c, eptr);
1545     if (
1546     #ifdef SUPPORT_UTF8
1547     c >= 256 ||
1548     #endif
1549     (md->ctypes[c] & ctype_space) == 0
1550     )
1551     RRETURN(MATCH_NOMATCH);
1552     ecode++;
1553     break;
1554    
1555     case OP_NOT_WORDCHAR:
1556     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1557     GETCHARINCTEST(c, eptr);
1558     if (
1559     #ifdef SUPPORT_UTF8
1560     c < 256 &&
1561     #endif
1562     (md->ctypes[c] & ctype_word) != 0
1563     )
1564     RRETURN(MATCH_NOMATCH);
1565     ecode++;
1566     break;
1567    
1568     case OP_WORDCHAR:
1569     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1570     GETCHARINCTEST(c, eptr);
1571     if (
1572     #ifdef SUPPORT_UTF8
1573     c >= 256 ||
1574     #endif
1575     (md->ctypes[c] & ctype_word) == 0
1576     )
1577     RRETURN(MATCH_NOMATCH);
1578     ecode++;
1579     break;
1580    
1581 nigel 93 case OP_ANYNL:
1582     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1583     GETCHARINCTEST(c, eptr);
1584     switch(c)
1585     {
1586     default: RRETURN(MATCH_NOMATCH);
1587     case 0x000d:
1588     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1589     break;
1590 ph10 231
1591 nigel 93 case 0x000a:
1592 ph10 231 break;
1593    
1594 nigel 93 case 0x000b:
1595     case 0x000c:
1596     case 0x0085:
1597     case 0x2028:
1598     case 0x2029:
1599 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1600 nigel 93 break;
1601     }
1602     ecode++;
1603     break;
1604    
1605 ph10 178 case OP_NOT_HSPACE:
1606     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1607     GETCHARINCTEST(c, eptr);
1608     switch(c)
1609     {
1610     default: break;
1611     case 0x09: /* HT */
1612     case 0x20: /* SPACE */
1613     case 0xa0: /* NBSP */
1614     case 0x1680: /* OGHAM SPACE MARK */
1615     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1616     case 0x2000: /* EN QUAD */
1617     case 0x2001: /* EM QUAD */
1618     case 0x2002: /* EN SPACE */
1619     case 0x2003: /* EM SPACE */
1620     case 0x2004: /* THREE-PER-EM SPACE */
1621     case 0x2005: /* FOUR-PER-EM SPACE */
1622     case 0x2006: /* SIX-PER-EM SPACE */
1623     case 0x2007: /* FIGURE SPACE */
1624     case 0x2008: /* PUNCTUATION SPACE */
1625     case 0x2009: /* THIN SPACE */
1626     case 0x200A: /* HAIR SPACE */
1627     case 0x202f: /* NARROW NO-BREAK SPACE */
1628     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1629     case 0x3000: /* IDEOGRAPHIC SPACE */
1630     RRETURN(MATCH_NOMATCH);
1631     }
1632     ecode++;
1633     break;
1634    
1635     case OP_HSPACE:
1636     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1637     GETCHARINCTEST(c, eptr);
1638     switch(c)
1639     {
1640     default: RRETURN(MATCH_NOMATCH);
1641     case 0x09: /* HT */
1642     case 0x20: /* SPACE */
1643     case 0xa0: /* NBSP */
1644     case 0x1680: /* OGHAM SPACE MARK */
1645     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1646     case 0x2000: /* EN QUAD */
1647     case 0x2001: /* EM QUAD */
1648     case 0x2002: /* EN SPACE */
1649     case 0x2003: /* EM SPACE */
1650     case 0x2004: /* THREE-PER-EM SPACE */
1651     case 0x2005: /* FOUR-PER-EM SPACE */
1652     case 0x2006: /* SIX-PER-EM SPACE */
1653     case 0x2007: /* FIGURE SPACE */
1654     case 0x2008: /* PUNCTUATION SPACE */
1655     case 0x2009: /* THIN SPACE */
1656     case 0x200A: /* HAIR SPACE */
1657     case 0x202f: /* NARROW NO-BREAK SPACE */
1658     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1659     case 0x3000: /* IDEOGRAPHIC SPACE */
1660     break;
1661     }
1662     ecode++;
1663     break;
1664    
1665     case OP_NOT_VSPACE:
1666     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1667     GETCHARINCTEST(c, eptr);
1668     switch(c)
1669     {
1670     default: break;
1671     case 0x0a: /* LF */
1672     case 0x0b: /* VT */
1673     case 0x0c: /* FF */
1674     case 0x0d: /* CR */
1675     case 0x85: /* NEL */
1676     case 0x2028: /* LINE SEPARATOR */
1677     case 0x2029: /* PARAGRAPH SEPARATOR */
1678     RRETURN(MATCH_NOMATCH);
1679     }
1680     ecode++;
1681     break;
1682    
1683     case OP_VSPACE:
1684     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1685     GETCHARINCTEST(c, eptr);
1686     switch(c)
1687     {
1688     default: RRETURN(MATCH_NOMATCH);
1689     case 0x0a: /* LF */
1690     case 0x0b: /* VT */
1691     case 0x0c: /* FF */
1692     case 0x0d: /* CR */
1693     case 0x85: /* NEL */
1694     case 0x2028: /* LINE SEPARATOR */
1695     case 0x2029: /* PARAGRAPH SEPARATOR */
1696     break;
1697     }
1698     ecode++;
1699     break;
1700    
1701 nigel 77 #ifdef SUPPORT_UCP
1702     /* Check the next character by Unicode property. We will get here only
1703     if the support is in the binary; otherwise a compile-time error occurs. */
1704    
1705     case OP_PROP:
1706     case OP_NOTPROP:
1707     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1708     GETCHARINCTEST(c, eptr);
1709     {
1710 ph10 349 const ucd_record * prop = GET_UCD(c);
1711 nigel 77
1712 nigel 87 switch(ecode[1])
1713     {
1714     case PT_ANY:
1715     if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1716     break;
1717 nigel 77
1718 nigel 87 case PT_LAMP:
1719 ph10 349 if ((prop->chartype == ucp_Lu ||
1720     prop->chartype == ucp_Ll ||
1721     prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1722 nigel 77 RRETURN(MATCH_NOMATCH);
1723 nigel 87 break;
1724    
1725     case PT_GC:
1726 ph10 351 if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1727 nigel 77 RRETURN(MATCH_NOMATCH);
1728 nigel 87 break;
1729    
1730     case PT_PC:
1731 ph10 349 if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1732 nigel 87 RRETURN(MATCH_NOMATCH);
1733     break;
1734    
1735     case PT_SC:
1736 ph10 349 if ((ecode[2] != prop->script) == (op == OP_PROP))
1737 nigel 87 RRETURN(MATCH_NOMATCH);
1738     break;
1739    
1740     default:
1741     RRETURN(PCRE_ERROR_INTERNAL);
1742 nigel 77 }
1743 nigel 87
1744     ecode += 3;
1745 nigel 77 }
1746     break;
1747    
1748     /* Match an extended Unicode sequence. We will get here only if the support
1749     is in the binary; otherwise a compile-time error occurs. */
1750    
1751     case OP_EXTUNI:
1752     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1753     GETCHARINCTEST(c, eptr);
1754     {
1755 ph10 349 int category = UCD_CATEGORY(c);
1756 nigel 77 if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1757     while (eptr < md->end_subject)
1758     {
1759     int len = 1;
1760     if (!utf8) c = *eptr; else
1761     {
1762     GETCHARLEN(c, eptr, len);
1763     }
1764 ph10 349 category = UCD_CATEGORY(c);
1765 nigel 77 if (category != ucp_M) break;
1766     eptr += len;
1767     }
1768     }
1769     ecode++;
1770     break;
1771     #endif
1772    
1773    
1774     /* Match a back reference, possibly repeatedly. Look past the end of the
1775     item to see if there is repeat information following. The code is similar
1776     to that for character classes, but repeated for efficiency. Then obey
1777     similar code to character type repeats - written out again for speed.
1778     However, if the referenced string is the empty string, always treat
1779     it as matched, any number of times (otherwise there could be infinite
1780     loops). */
1781    
1782     case OP_REF:
1783     {
1784     offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1785 ph10 345 ecode += 3;
1786    
1787 ph10 336 /* If the reference is unset, there are two possibilities:
1788 ph10 345
1789 ph10 336 (a) In the default, Perl-compatible state, set the length to be longer
1790     than the amount of subject left; this ensures that every attempt at a
1791     match fails. We can't just fail here, because of the possibility of
1792     quantifiers with zero minima.
1793 ph10 345
1794     (b) If the JavaScript compatibility flag is set, set the length to zero
1795     so that the back reference matches an empty string.
1796    
1797     Otherwise, set the length to the length of what was matched by the
1798 ph10 336 referenced subpattern. */
1799 ph10 345
1800 ph10 336 if (offset >= offset_top || md->offset_vector[offset] < 0)
1801 ph10 345 length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1802 ph10 336 else
1803     length = md->offset_vector[offset+1] - md->offset_vector[offset];
1804 nigel 77
1805     /* Set up for repetition, or handle the non-repeated case */
1806    
1807     switch (*ecode)
1808     {
1809     case OP_CRSTAR:
1810     case OP_CRMINSTAR:
1811     case OP_CRPLUS:
1812     case OP_CRMINPLUS:
1813     case OP_CRQUERY:
1814     case OP_CRMINQUERY:
1815     c = *ecode++ - OP_CRSTAR;
1816     minimize = (c & 1) != 0;
1817     min = rep_min[c]; /* Pick up values from tables; */
1818     max = rep_max[c]; /* zero for max => infinity */
1819     if (max == 0) max = INT_MAX;
1820     break;
1821    
1822     case OP_CRRANGE:
1823     case OP_CRMINRANGE:
1824     minimize = (*ecode == OP_CRMINRANGE);
1825     min = GET2(ecode, 1);
1826     max = GET2(ecode, 3);
1827     if (max == 0) max = INT_MAX;
1828     ecode += 5;
1829     break;
1830    
1831     default: /* No repeat follows */
1832     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1833     eptr += length;
1834     continue; /* With the main loop */
1835     }
1836    
1837     /* If the length of the reference is zero, just continue with the
1838     main loop. */
1839    
1840     if (length == 0) continue;
1841    
1842     /* First, ensure the minimum number of matches are present. We get back
1843     the length of the reference string explicitly rather than passing the
1844     address of eptr, so that eptr can be a register variable. */
1845    
1846     for (i = 1; i <= min; i++)
1847     {
1848     if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1849     eptr += length;
1850     }
1851    
1852     /* If min = max, continue at the same level without recursion.
1853     They are not both allowed to be zero. */
1854    
1855     if (min == max) continue;
1856    
1857     /* If minimizing, keep trying and advancing the pointer */
1858    
1859     if (minimize)
1860     {
1861     for (fi = min;; fi++)
1862     {
1863 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1864 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1865     if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1866     RRETURN(MATCH_NOMATCH);
1867     eptr += length;
1868     }
1869     /* Control never gets here */
1870     }
1871    
1872     /* If maximizing, find the longest string and work backwards */
1873    
1874     else
1875     {
1876     pp = eptr;
1877     for (i = min; i < max; i++)
1878     {
1879     if (!match_ref(offset, eptr, length, md, ims)) break;
1880     eptr += length;
1881     }
1882     while (eptr >= pp)
1883     {
1884 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1885 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1886     eptr -= length;
1887     }
1888     RRETURN(MATCH_NOMATCH);
1889     }
1890     }
1891     /* Control never gets here */
1892    
1893    
1894    
1895     /* Match a bit-mapped character class, possibly repeatedly. This op code is
1896     used when all the characters in the class have values in the range 0-255,
1897     and either the matching is caseful, or the characters are in the range
1898     0-127 when UTF-8 processing is enabled. The only difference between
1899     OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1900     encountered.
1901    
1902     First, look past the end of the item to see if there is repeat information
1903     following. Then obey similar code to character type repeats - written out
1904     again for speed. */
1905    
1906     case OP_NCLASS:
1907     case OP_CLASS:
1908     {
1909     data = ecode + 1; /* Save for matching */
1910     ecode += 33; /* Advance past the item */
1911    
1912     switch (*ecode)
1913     {
1914     case OP_CRSTAR:
1915     case OP_CRMINSTAR:
1916     case OP_CRPLUS:
1917     case OP_CRMINPLUS:
1918     case OP_CRQUERY:
1919     case OP_CRMINQUERY:
1920     c = *ecode++ - OP_CRSTAR;
1921     minimize = (c & 1) != 0;
1922     min = rep_min[c]; /* Pick up values from tables; */
1923     max = rep_max[c]; /* zero for max => infinity */
1924     if (max == 0) max = INT_MAX;
1925     break;
1926    
1927     case OP_CRRANGE:
1928     case OP_CRMINRANGE:
1929     minimize = (*ecode == OP_CRMINRANGE);
1930     min = GET2(ecode, 1);
1931     max = GET2(ecode, 3);
1932     if (max == 0) max = INT_MAX;
1933     ecode += 5;
1934     break;
1935    
1936     default: /* No repeat follows */
1937     min = max = 1;
1938     break;
1939     }
1940    
1941     /* First, ensure the minimum number of matches are present. */
1942    
1943     #ifdef SUPPORT_UTF8
1944     /* UTF-8 mode */
1945     if (utf8)
1946     {
1947     for (i = 1; i <= min; i++)
1948     {
1949     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1950     GETCHARINC(c, eptr);
1951     if (c > 255)
1952     {
1953     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1954     }
1955     else
1956     {
1957     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1958     }
1959     }
1960     }
1961     else
1962     #endif
1963     /* Not UTF-8 mode */
1964     {
1965     for (i = 1; i <= min; i++)
1966     {
1967     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1968     c = *eptr++;
1969     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1970     }
1971     }
1972    
1973     /* If max == min we can continue with the main loop without the
1974     need to recurse. */
1975    
1976     if (min == max) continue;
1977    
1978     /* If minimizing, keep testing the rest of the expression and advancing
1979     the pointer while it matches the class. */
1980    
1981     if (minimize)
1982     {
1983     #ifdef SUPPORT_UTF8
1984     /* UTF-8 mode */
1985     if (utf8)
1986     {
1987     for (fi = min;; fi++)
1988     {
1989 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1990 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1991     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1992     GETCHARINC(c, eptr);
1993     if (c > 255)
1994     {
1995     if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1996     }
1997     else
1998     {
1999     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2000     }
2001     }
2002     }
2003     else
2004     #endif
2005     /* Not UTF-8 mode */
2006     {
2007     for (fi = min;; fi++)
2008     {
2009 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2010 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2011     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2012     c = *eptr++;
2013     if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2014     }
2015     }
2016     /* Control never gets here */
2017     }
2018    
2019     /* If maximizing, find the longest possible run, then work backwards. */
2020    
2021     else
2022     {
2023     pp = eptr;
2024    
2025     #ifdef SUPPORT_UTF8
2026     /* UTF-8 mode */
2027     if (utf8)
2028     {
2029     for (i = min; i < max; i++)
2030     {
2031     int len = 1;
2032     if (eptr >= md->end_subject) break;
2033     GETCHARLEN(c, eptr, len);
2034     if (c > 255)
2035     {
2036     if (op == OP_CLASS) break;
2037     }
2038     else
2039     {
2040     if ((data[c/8] & (1 << (c&7))) == 0) break;
2041     }
2042     eptr += len;
2043     }
2044     for (;;)
2045     {
2046 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2047 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048     if (eptr-- == pp) break; /* Stop if tried at original pos */
2049     BACKCHAR(eptr);
2050     }
2051     }
2052     else
2053     #endif
2054     /* Not UTF-8 mode */
2055     {
2056     for (i = min; i < max; i++)
2057     {
2058     if (eptr >= md->end_subject) break;
2059     c = *eptr;
2060     if ((data[c/8] & (1 << (c&7))) == 0) break;
2061     eptr++;
2062     }
2063     while (eptr >= pp)
2064     {
2065 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2066 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2067 nigel 77 eptr--;
2068     }
2069     }
2070    
2071     RRETURN(MATCH_NOMATCH);
2072     }
2073     }
2074     /* Control never gets here */
2075    
2076    
2077     /* Match an extended character class. This opcode is encountered only
2078     in UTF-8 mode, because that's the only time it is compiled. */
2079    
2080     #ifdef SUPPORT_UTF8
2081     case OP_XCLASS:
2082     {
2083     data = ecode + 1 + LINK_SIZE; /* Save for matching */
2084     ecode += GET(ecode, 1); /* Advance past the item */
2085    
2086     switch (*ecode)
2087     {
2088     case OP_CRSTAR:
2089     case OP_CRMINSTAR:
2090     case OP_CRPLUS:
2091     case OP_CRMINPLUS:
2092     case OP_CRQUERY:
2093     case OP_CRMINQUERY:
2094     c = *ecode++ - OP_CRSTAR;
2095     minimize = (c & 1) != 0;
2096     min = rep_min[c]; /* Pick up values from tables; */
2097     max = rep_max[c]; /* zero for max => infinity */
2098     if (max == 0) max = INT_MAX;
2099     break;
2100    
2101     case OP_CRRANGE:
2102     case OP_CRMINRANGE:
2103     minimize = (*ecode == OP_CRMINRANGE);
2104     min = GET2(ecode, 1);
2105     max = GET2(ecode, 3);
2106     if (max == 0) max = INT_MAX;
2107     ecode += 5;
2108     break;
2109    
2110     default: /* No repeat follows */
2111     min = max = 1;
2112     break;
2113     }
2114    
2115     /* First, ensure the minimum number of matches are present. */
2116    
2117     for (i = 1; i <= min; i++)
2118     {
2119     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2120     GETCHARINC(c, eptr);
2121     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2122     }
2123    
2124     /* If max == min we can continue with the main loop without the
2125     need to recurse. */
2126    
2127     if (min == max) continue;
2128    
2129     /* If minimizing, keep testing the rest of the expression and advancing
2130     the pointer while it matches the class. */
2131    
2132     if (minimize)
2133     {
2134     for (fi = min;; fi++)
2135     {
2136 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2137 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2138     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2139     GETCHARINC(c, eptr);
2140     if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2141     }
2142     /* Control never gets here */
2143     }
2144    
2145     /* If maximizing, find the longest possible run, then work backwards. */
2146    
2147     else
2148     {
2149     pp = eptr;
2150     for (i = min; i < max; i++)
2151     {
2152     int len = 1;
2153     if (eptr >= md->end_subject) break;
2154     GETCHARLEN(c, eptr, len);
2155     if (!_pcre_xclass(c, data)) break;
2156     eptr += len;
2157     }
2158     for(;;)
2159     {
2160 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2161 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2162     if (eptr-- == pp) break; /* Stop if tried at original pos */
2163 ph10 214 if (utf8) BACKCHAR(eptr);
2164 nigel 77 }
2165     RRETURN(MATCH_NOMATCH);
2166     }
2167    
2168     /* Control never gets here */
2169     }
2170     #endif /* End of XCLASS */
2171    
2172     /* Match a single character, casefully */
2173    
2174     case OP_CHAR:
2175     #ifdef SUPPORT_UTF8
2176     if (utf8)
2177     {
2178     length = 1;
2179     ecode++;
2180     GETCHARLEN(fc, ecode, length);
2181     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2182     while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2183     }
2184     else
2185     #endif
2186    
2187     /* Non-UTF-8 mode */
2188     {
2189     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2190     if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2191     ecode += 2;
2192     }
2193     break;
2194    
2195     /* Match a single character, caselessly */
2196    
2197     case OP_CHARNC:
2198     #ifdef SUPPORT_UTF8
2199     if (utf8)
2200     {
2201     length = 1;
2202     ecode++;
2203     GETCHARLEN(fc, ecode, length);
2204    
2205     if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2206    
2207     /* If the pattern character's value is < 128, we have only one byte, and
2208     can use the fast lookup table. */
2209    
2210     if (fc < 128)
2211     {
2212     if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2213     }
2214    
2215     /* Otherwise we must pick up the subject character */
2216    
2217     else
2218     {
2219 nigel 93 unsigned int dc;
2220 nigel 77 GETCHARINC(dc, eptr);
2221     ecode += length;
2222    
2223     /* If we have Unicode property support, we can use it to test the other
2224 nigel 87 case of the character, if there is one. */
2225 nigel 77
2226     if (fc != dc)
2227     {
2228     #ifdef SUPPORT_UCP
2229 ph10 349 if (dc != UCD_OTHERCASE(fc))
2230 nigel 77 #endif
2231     RRETURN(MATCH_NOMATCH);
2232     }
2233     }
2234     }
2235     else
2236     #endif /* SUPPORT_UTF8 */
2237    
2238     /* Non-UTF-8 mode */
2239     {
2240     if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2241     if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2242     ecode += 2;
2243     }
2244     break;
2245    
2246 nigel 93 /* Match a single character repeatedly. */
2247 nigel 77
2248     case OP_EXACT:
2249     min = max = GET2(ecode, 1);
2250     ecode += 3;
2251     goto REPEATCHAR;
2252    
2253 nigel 93 case OP_POSUPTO:
2254     possessive = TRUE;
2255     /* Fall through */
2256    
2257 nigel 77 case OP_UPTO:
2258     case OP_MINUPTO:
2259     min = 0;
2260     max = GET2(ecode, 1);
2261     minimize = *ecode == OP_MINUPTO;
2262     ecode += 3;
2263     goto REPEATCHAR;
2264    
2265 nigel 93 case OP_POSSTAR:
2266     possessive = TRUE;
2267     min = 0;
2268     max = INT_MAX;
2269     ecode++;
2270     goto REPEATCHAR;
2271    
2272     case OP_POSPLUS:
2273     possessive = TRUE;
2274     min = 1;
2275     max = INT_MAX;
2276     ecode++;
2277     goto REPEATCHAR;
2278    
2279     case OP_POSQUERY:
2280     possessive = TRUE;
2281     min = 0;
2282     max = 1;
2283     ecode++;
2284     goto REPEATCHAR;
2285    
2286 nigel 77 case OP_STAR:
2287     case OP_MINSTAR:
2288     case OP_PLUS:
2289     case OP_MINPLUS:
2290     case OP_QUERY:
2291     case OP_MINQUERY:
2292     c = *ecode++ - OP_STAR;
2293     minimize = (c & 1) != 0;
2294     min = rep_min[c]; /* Pick up values from tables; */
2295     max = rep_max[c]; /* zero for max => infinity */
2296     if (max == 0) max = INT_MAX;
2297    
2298     /* Common code for all repeated single-character matches. We can give
2299     up quickly if there are fewer than the minimum number of characters left in
2300     the subject. */
2301    
2302     REPEATCHAR:
2303     #ifdef SUPPORT_UTF8
2304     if (utf8)
2305     {
2306     length = 1;
2307     charptr = ecode;
2308     GETCHARLEN(fc, ecode, length);
2309     if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2310     ecode += length;
2311    
2312     /* Handle multibyte character matching specially here. There is
2313     support for caseless matching if UCP support is present. */
2314    
2315     if (length > 1)
2316     {
2317     #ifdef SUPPORT_UCP
2318 nigel 93 unsigned int othercase;
2319 nigel 77 if ((ims & PCRE_CASELESS) != 0 &&
2320 ph10 349 (othercase = UCD_OTHERCASE(fc)) != fc)
2321 nigel 77 oclength = _pcre_ord2utf8(othercase, occhars);
2322 ph10 115 else oclength = 0;
2323 nigel 77 #endif /* SUPPORT_UCP */
2324    
2325     for (i = 1; i <= min; i++)
2326     {
2327     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2328 ph10 123 #ifdef SUPPORT_UCP
2329 nigel 77 /* Need braces because of following else */
2330     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2331     else
2332     {
2333     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2334     eptr += oclength;
2335     }
2336 ph10 115 #else /* without SUPPORT_UCP */
2337     else { RRETURN(MATCH_NOMATCH); }
2338 ph10 123 #endif /* SUPPORT_UCP */
2339 nigel 77 }
2340    
2341     if (min == max) continue;
2342    
2343     if (minimize)
2344     {
2345     for (fi = min;; fi++)
2346     {
2347 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2348 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2349     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2350     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2351 ph10 123 #ifdef SUPPORT_UCP
2352 nigel 77 /* Need braces because of following else */
2353     else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2354     else
2355     {
2356     if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2357     eptr += oclength;
2358     }
2359 ph10 115 #else /* without SUPPORT_UCP */
2360     else { RRETURN (MATCH_NOMATCH); }
2361     #endif /* SUPPORT_UCP */
2362 nigel 77 }
2363     /* Control never gets here */
2364     }
2365 nigel 93
2366     else /* Maximize */
2367 nigel 77 {
2368     pp = eptr;
2369     for (i = min; i < max; i++)
2370     {
2371     if (eptr > md->end_subject - length) break;
2372     if (memcmp(eptr, charptr, length) == 0) eptr += length;
2373 ph10 123 #ifdef SUPPORT_UCP
2374 nigel 77 else if (oclength == 0) break;
2375     else
2376     {
2377     if (memcmp(eptr, occhars, oclength) != 0) break;
2378     eptr += oclength;
2379     }
2380 ph10 115 #else /* without SUPPORT_UCP */
2381     else break;
2382 ph10 123 #endif /* SUPPORT_UCP */
2383 nigel 77 }
2384 nigel 93
2385     if (possessive) continue;
2386 ph10 120 for(;;)
2387 nigel 77 {
2388 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2389 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2390 ph10 120 if (eptr == pp) RRETURN(MATCH_NOMATCH);
2391 ph10 115 #ifdef SUPPORT_UCP
2392     eptr--;
2393     BACKCHAR(eptr);
2394 ph10 123 #else /* without SUPPORT_UCP */
2395 nigel 77 eptr -= length;
2396 ph10 123 #endif /* SUPPORT_UCP */
2397 nigel 77 }
2398     }
2399     /* Control never gets here */
2400     }
2401    
2402     /* If the length of a UTF-8 character is 1, we fall through here, and
2403     obey the code as for non-UTF-8 characters below, though in this case the
2404     value of fc will always be < 128. */
2405     }
2406     else
2407     #endif /* SUPPORT_UTF8 */
2408    
2409     /* When not in UTF-8 mode, load a single-byte character. */
2410     {
2411     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2412     fc = *ecode++;
2413     }
2414    
2415     /* The value of fc at this point is always less than 256, though we may or
2416     may not be in UTF-8 mode. The code is duplicated for the caseless and
2417     caseful cases, for speed, since matching characters is likely to be quite
2418     common. First, ensure the minimum number of matches are present. If min =
2419     max, continue at the same level without recursing. Otherwise, if
2420     minimizing, keep trying the rest of the expression and advancing one
2421     matching character if failing, up to the maximum. Alternatively, if
2422     maximizing, find the maximum number of characters and work backwards. */
2423    
2424     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2425     max, eptr));
2426    
2427     if ((ims & PCRE_CASELESS) != 0)
2428     {
2429     fc = md->lcc[fc];
2430     for (i = 1; i <= min; i++)
2431     if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2432     if (min == max) continue;
2433     if (minimize)
2434     {
2435     for (fi = min;; fi++)
2436     {
2437 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2438 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2439     if (fi >= max || eptr >= md->end_subject ||
2440     fc != md->lcc[*eptr++])
2441     RRETURN(MATCH_NOMATCH);
2442     }
2443     /* Control never gets here */
2444     }
2445 nigel 93 else /* Maximize */
2446 nigel 77 {
2447     pp = eptr;
2448     for (i = min; i < max; i++)
2449     {
2450     if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2451     eptr++;
2452     }
2453 nigel 93 if (possessive) continue;
2454 nigel 77 while (eptr >= pp)
2455     {
2456 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2457 nigel 77 eptr--;
2458     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2459     }
2460     RRETURN(MATCH_NOMATCH);
2461     }
2462     /* Control never gets here */
2463     }
2464    
2465     /* Caseful comparisons (multibyte characters were handled above) */
2466    
2467     else
2468     {
2469     for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2470     if (min == max) continue;
2471     if (minimize)
2472     {
2473     for (fi = min;; fi++)
2474     {
2475 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2476 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2477     if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2478     RRETURN(MATCH_NOMATCH);
2479     }
2480     /* Control never gets here */
2481     }
2482 nigel 93 else /* Maximize */
2483 nigel 77 {
2484     pp = eptr;
2485     for (i = min; i < max; i++)
2486     {
2487     if (eptr >= md->end_subject || fc != *eptr) break;
2488     eptr++;
2489     }
2490 nigel 93 if (possessive) continue;
2491 nigel 77 while (eptr >= pp)
2492     {
2493 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2494 nigel 77 eptr--;
2495     if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2496     }
2497     RRETURN(MATCH_NOMATCH);
2498     }
2499     }
2500     /* Control never gets here */
2501    
2502     /* Match a negated single one-byte character. The character we are
2503     checking can be multibyte. */
2504    
2505     case OP_NOT:
2506     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2507     ecode++;
2508     GETCHARINCTEST(c, eptr);
2509     if ((ims & PCRE_CASELESS) != 0)
2510     {
2511     #ifdef SUPPORT_UTF8
2512     if (c < 256)
2513     #endif
2514     c = md->lcc[c];
2515     if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2516     }
2517     else
2518     {
2519     if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2520     }
2521     break;
2522    
2523     /* Match a negated single one-byte character repeatedly. This is almost a
2524     repeat of the code for a repeated single character, but I haven't found a
2525     nice way of commoning these up that doesn't require a test of the
2526     positive/negative option for each character match. Maybe that wouldn't add
2527     very much to the time taken, but character matching *is* what this is all
2528     about... */
2529    
2530     case OP_NOTEXACT:
2531     min = max = GET2(ecode, 1);
2532     ecode += 3;
2533     goto REPEATNOTCHAR;
2534    
2535     case OP_NOTUPTO:
2536     case OP_NOTMINUPTO:
2537     min = 0;
2538     max = GET2(ecode, 1);
2539     minimize = *ecode == OP_NOTMINUPTO;
2540     ecode += 3;
2541     goto REPEATNOTCHAR;
2542    
2543 nigel 93 case OP_NOTPOSSTAR:
2544     possessive = TRUE;
2545     min = 0;
2546     max = INT_MAX;
2547     ecode++;
2548     goto REPEATNOTCHAR;
2549    
2550     case OP_NOTPOSPLUS:
2551     possessive = TRUE;
2552     min = 1;
2553     max = INT_MAX;
2554     ecode++;
2555     goto REPEATNOTCHAR;
2556    
2557     case OP_NOTPOSQUERY:
2558     possessive = TRUE;
2559     min = 0;
2560     max = 1;
2561     ecode++;
2562     goto REPEATNOTCHAR;
2563    
2564     case OP_NOTPOSUPTO:
2565     possessive = TRUE;
2566     min = 0;
2567     max = GET2(ecode, 1);
2568     ecode += 3;
2569     goto REPEATNOTCHAR;
2570    
2571 nigel 77 case OP_NOTSTAR:
2572     case OP_NOTMINSTAR:
2573     case OP_NOTPLUS:
2574     case OP_NOTMINPLUS:
2575     case OP_NOTQUERY:
2576     case OP_NOTMINQUERY:
2577     c = *ecode++ - OP_NOTSTAR;
2578     minimize = (c & 1) != 0;
2579     min = rep_min[c]; /* Pick up values from tables; */
2580     max = rep_max[c]; /* zero for max => infinity */
2581     if (max == 0) max = INT_MAX;
2582    
2583     /* Common code for all repeated single-byte matches. We can give up quickly
2584     if there are fewer than the minimum number of bytes left in the
2585     subject. */
2586    
2587     REPEATNOTCHAR:
2588     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2589     fc = *ecode++;
2590    
2591     /* The code is duplicated for the caseless and caseful cases, for speed,
2592     since matching characters is likely to be quite common. First, ensure the
2593     minimum number of matches are present. If min = max, continue at the same
2594     level without recursing. Otherwise, if minimizing, keep trying the rest of
2595     the expression and advancing one matching character if failing, up to the
2596     maximum. Alternatively, if maximizing, find the maximum number of
2597     characters and work backwards. */
2598    
2599     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2600     max, eptr));
2601    
2602     if ((ims & PCRE_CASELESS) != 0)
2603     {
2604     fc = md->lcc[fc];
2605    
2606     #ifdef SUPPORT_UTF8
2607     /* UTF-8 mode */
2608     if (utf8)
2609     {
2610 nigel 93 register unsigned int d;
2611 nigel 77 for (i = 1; i <= min; i++)
2612     {
2613     GETCHARINC(d, eptr);
2614     if (d < 256) d = md->lcc[d];
2615     if (fc == d) RRETURN(MATCH_NOMATCH);
2616     }
2617     }
2618     else
2619     #endif
2620    
2621     /* Not UTF-8 mode */
2622     {
2623     for (i = 1; i <= min; i++)
2624     if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2625     }
2626    
2627     if (min == max) continue;
2628    
2629     if (minimize)
2630     {
2631     #ifdef SUPPORT_UTF8
2632     /* UTF-8 mode */
2633     if (utf8)
2634     {
2635 nigel 93 register unsigned int d;
2636 nigel 77 for (fi = min;; fi++)
2637     {
2638 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2639 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2640 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2641 nigel 77 GETCHARINC(d, eptr);
2642     if (d < 256) d = md->lcc[d];
2643 ph10 366 if (fc == d) RRETURN(MATCH_NOMATCH);
2644 ph10 371
2645 nigel 77 }
2646     }
2647     else
2648     #endif
2649     /* Not UTF-8 mode */
2650     {
2651     for (fi = min;; fi++)
2652     {
2653 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2654 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2655     if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2656     RRETURN(MATCH_NOMATCH);
2657     }
2658     }
2659     /* Control never gets here */
2660     }
2661    
2662     /* Maximize case */
2663    
2664     else
2665     {
2666     pp = eptr;
2667    
2668     #ifdef SUPPORT_UTF8
2669     /* UTF-8 mode */
2670     if (utf8)
2671     {
2672 nigel 93 register unsigned int d;
2673 nigel 77 for (i = min; i < max; i++)
2674     {
2675     int len = 1;
2676     if (eptr >= md->end_subject) break;
2677     GETCHARLEN(d, eptr, len);
2678     if (d < 256) d = md->lcc[d];
2679     if (fc == d) break;
2680     eptr += len;
2681     }
2682 nigel 93 if (possessive) continue;
2683     for(;;)
2684 nigel 77 {
2685 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2686 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2687     if (eptr-- == pp) break; /* Stop if tried at original pos */
2688     BACKCHAR(eptr);
2689     }
2690     }
2691     else
2692     #endif
2693     /* Not UTF-8 mode */
2694     {
2695     for (i = min; i < max; i++)
2696     {
2697     if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2698     eptr++;
2699     }
2700 nigel 93 if (possessive) continue;
2701 nigel 77 while (eptr >= pp)
2702     {
2703 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2704 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2705     eptr--;
2706     }
2707     }
2708    
2709     RRETURN(MATCH_NOMATCH);
2710     }
2711     /* Control never gets here */
2712     }
2713    
2714     /* Caseful comparisons */
2715    
2716     else
2717     {
2718     #ifdef SUPPORT_UTF8
2719     /* UTF-8 mode */
2720     if (utf8)
2721     {
2722 nigel 93 register unsigned int d;
2723 nigel 77 for (i = 1; i <= min; i++)
2724     {
2725     GETCHARINC(d, eptr);
2726     if (fc == d) RRETURN(MATCH_NOMATCH);
2727     }
2728     }
2729     else
2730     #endif
2731     /* Not UTF-8 mode */
2732     {
2733     for (i = 1; i <= min; i++)
2734     if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2735     }
2736    
2737     if (min == max) continue;
2738    
2739     if (minimize)
2740     {
2741     #ifdef SUPPORT_UTF8
2742     /* UTF-8 mode */
2743     if (utf8)
2744     {
2745 nigel 93 register unsigned int d;
2746 nigel 77 for (fi = min;; fi++)
2747     {
2748 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2749 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2750 ph10 366 if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2751 nigel 77 GETCHARINC(d, eptr);
2752 ph10 371 if (fc == d) RRETURN(MATCH_NOMATCH);
2753 nigel 77 }
2754     }
2755     else
2756     #endif
2757     /* Not UTF-8 mode */
2758     {
2759     for (fi = min;; fi++)
2760     {
2761 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2762 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763     if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2764     RRETURN(MATCH_NOMATCH);
2765     }
2766     }
2767     /* Control never gets here */
2768     }
2769    
2770     /* Maximize case */
2771    
2772     else
2773     {
2774     pp = eptr;
2775    
2776     #ifdef SUPPORT_UTF8
2777     /* UTF-8 mode */
2778     if (utf8)
2779     {
2780 nigel 93 register unsigned int d;
2781 nigel 77 for (i = min; i < max; i++)
2782     {
2783     int len = 1;
2784     if (eptr >= md->end_subject) break;
2785     GETCHARLEN(d, eptr, len);
2786     if (fc == d) break;
2787     eptr += len;
2788     }
2789 nigel 93 if (possessive) continue;
2790 nigel 77 for(;;)
2791     {
2792 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2793 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2794     if (eptr-- == pp) break; /* Stop if tried at original pos */
2795     BACKCHAR(eptr);
2796     }
2797     }
2798     else
2799     #endif
2800     /* Not UTF-8 mode */
2801     {
2802     for (i = min; i < max; i++)
2803     {
2804     if (eptr >= md->end_subject || fc == *eptr) break;
2805     eptr++;
2806     }
2807 nigel 93 if (possessive) continue;
2808 nigel 77 while (eptr >= pp)
2809     {
2810 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2811 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2812     eptr--;
2813     }
2814     }
2815    
2816     RRETURN(MATCH_NOMATCH);
2817     }
2818     }
2819     /* Control never gets here */
2820    
2821     /* Match a single character type repeatedly; several different opcodes
2822     share code. This is very similar to the code for single characters, but we
2823     repeat it in the interests of efficiency. */
2824    
2825     case OP_TYPEEXACT:
2826     min = max = GET2(ecode, 1);
2827     minimize = TRUE;
2828     ecode += 3;
2829     goto REPEATTYPE;
2830    
2831     case OP_TYPEUPTO:
2832     case OP_TYPEMINUPTO:
2833     min = 0;
2834     max = GET2(ecode, 1);
2835     minimize = *ecode == OP_TYPEMINUPTO;
2836     ecode += 3;
2837     goto REPEATTYPE;
2838    
2839 nigel 93 case OP_TYPEPOSSTAR:
2840     possessive = TRUE;
2841     min = 0;
2842     max = INT_MAX;
2843     ecode++;
2844     goto REPEATTYPE;
2845    
2846     case OP_TYPEPOSPLUS:
2847     possessive = TRUE;
2848     min = 1;
2849     max = INT_MAX;
2850     ecode++;
2851     goto REPEATTYPE;
2852    
2853     case OP_TYPEPOSQUERY:
2854     possessive = TRUE;
2855     min = 0;
2856     max = 1;
2857     ecode++;
2858     goto REPEATTYPE;
2859    
2860     case OP_TYPEPOSUPTO:
2861     possessive = TRUE;
2862     min = 0;
2863     max = GET2(ecode, 1);
2864     ecode += 3;
2865     goto REPEATTYPE;
2866    
2867 nigel 77 case OP_TYPESTAR:
2868     case OP_TYPEMINSTAR:
2869     case OP_TYPEPLUS:
2870     case OP_TYPEMINPLUS:
2871     case OP_TYPEQUERY:
2872     case OP_TYPEMINQUERY:
2873     c = *ecode++ - OP_TYPESTAR;
2874     minimize = (c & 1) != 0;
2875     min = rep_min[c]; /* Pick up values from tables; */
2876     max = rep_max[c]; /* zero for max => infinity */
2877     if (max == 0) max = INT_MAX;
2878    
2879     /* Common code for all repeated single character type matches. Note that
2880     in UTF-8 mode a matched character may be longer than one byte, not only for
2881     '.' but also for \R, \h, \H, \v, \V and the negated types \D, \S and \W. */
2882    
2883     REPEATTYPE:
2884     ctype = *ecode++; /* Code for the character type */
2885    
2886     #ifdef SUPPORT_UCP
2887     if (ctype == OP_PROP || ctype == OP_NOTPROP)
2888     {
2889     prop_fail_result = ctype == OP_NOTPROP;
2890     prop_type = *ecode++;
2891 nigel 87 prop_value = *ecode++;
2892 nigel 77 }
2893     else prop_type = -1;
2894     #endif
2895    
2896     /* First, ensure the minimum number of matches are present. Use inline
2897     code for maximizing the speed, and do the type test once at the start
2898     (i.e. keep it out of the loop). Also we can test that there are at least
2899     the minimum number of bytes before we start. This isn't as effective in
2900     UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2901     is tidier. Also separate the UCP code, which can be the same for both UTF-8
2902     and single-bytes. */
2903    
2904     if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2905     if (min > 0)
2906     {
2907     #ifdef SUPPORT_UCP
2908 nigel 87 if (prop_type >= 0)
2909 nigel 77 {
2910 nigel 87 switch(prop_type)
2911 nigel 77 {
2912 nigel 87 case PT_ANY:
2913     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2914     for (i = 1; i <= min; i++)
2915     {
2916     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2917 ph10 184 GETCHARINCTEST(c, eptr);
2918 nigel 87 }
2919     break;
2920    
2921     case PT_LAMP:
2922     for (i = 1; i <= min; i++)
2923     {
2924     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2925 ph10 184 GETCHARINCTEST(c, eptr);
2926 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2927 nigel 87 if ((prop_chartype == ucp_Lu ||
2928     prop_chartype == ucp_Ll ||
2929     prop_chartype == ucp_Lt) == prop_fail_result)
2930     RRETURN(MATCH_NOMATCH);
2931     }
2932     break;
2933    
2934     case PT_GC:
2935     for (i = 1; i <= min; i++)
2936     {
2937     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938 ph10 184 GETCHARINCTEST(c, eptr);
2939 ph10 349 prop_category = UCD_CATEGORY(c);
2940 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
2941     RRETURN(MATCH_NOMATCH);
2942     }
2943     break;
2944    
2945     case PT_PC:
2946     for (i = 1; i <= min; i++)
2947     {
2948     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2949 ph10 184 GETCHARINCTEST(c, eptr);
2950 ph10 349 prop_chartype = UCD_CHARTYPE(c);
2951 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
2952     RRETURN(MATCH_NOMATCH);
2953     }
2954     break;
2955    
2956     case PT_SC:
2957     for (i = 1; i <= min; i++)
2958     {
2959     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2960 ph10 184 GETCHARINCTEST(c, eptr);
2961 ph10 349 prop_script = UCD_SCRIPT(c);
2962 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
2963     RRETURN(MATCH_NOMATCH);
2964     }
2965     break;
2966    
2967     default:
2968     RRETURN(PCRE_ERROR_INTERNAL);
2969 nigel 77 }
2970     }
2971    
2972     /* Match extended Unicode sequences. We will get here only if the
2973     support is in the binary; otherwise a compile-time error occurs. */
2974    
2975     else if (ctype == OP_EXTUNI)
2976     {
2977     for (i = 1; i <= min; i++)
2978     {
2979     GETCHARINCTEST(c, eptr);
2980 ph10 349 prop_category = UCD_CATEGORY(c);
2981 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2982     while (eptr < md->end_subject)
2983     {
2984     int len = 1;
2985     if (!utf8) c = *eptr; else
2986     {
2987     GETCHARLEN(c, eptr, len);
2988     }
2989 ph10 349 prop_category = UCD_CATEGORY(c);
2990 nigel 77 if (prop_category != ucp_M) break;
2991     eptr += len;
2992     }
2993     }
2994     }
2995    
2996     else
2997     #endif /* SUPPORT_UCP */
2998    
2999     /* Handle all other cases when the coding is UTF-8 */
3000    
3001     #ifdef SUPPORT_UTF8
3002     if (utf8) switch(ctype)
3003     {
3004     case OP_ANY:
3005     for (i = 1; i <= min; i++)
3006     {
3007 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3008 nigel 77 RRETURN(MATCH_NOMATCH);
3009 nigel 91 eptr++;
3010 nigel 77 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3011     }
3012     break;
3013    
3014 ph10 341 case OP_ALLANY:
3015     for (i = 1; i <= min; i++)
3016     {
3017     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3018     eptr++;
3019     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3020     }
3021     break;
3022    
3023 nigel 77 case OP_ANYBYTE:
3024     eptr += min;
3025     break;
3026    
3027 nigel 93 case OP_ANYNL:
3028     for (i = 1; i <= min; i++)
3029     {
3030     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3031     GETCHARINC(c, eptr);
3032     switch(c)
3033     {
3034     default: RRETURN(MATCH_NOMATCH);
3035     case 0x000d:
3036     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3037     break;
3038 ph10 231
3039 nigel 93 case 0x000a:
3040 ph10 231 break;
3041    
3042 nigel 93 case 0x000b:
3043     case 0x000c:
3044     case 0x0085:
3045     case 0x2028:
3046     case 0x2029:
3047 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3048 nigel 93 break;
3049     }
3050     }
3051     break;
3052    
3053 ph10 178 case OP_NOT_HSPACE:
3054     for (i = 1; i <= min; i++)
3055     {
3056     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3057     GETCHARINC(c, eptr);
3058     switch(c)
3059     {
3060     default: break;
3061     case 0x09: /* HT */
3062     case 0x20: /* SPACE */
3063     case 0xa0: /* NBSP */
3064     case 0x1680: /* OGHAM SPACE MARK */
3065     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3066     case 0x2000: /* EN QUAD */
3067     case 0x2001: /* EM QUAD */
3068     case 0x2002: /* EN SPACE */
3069     case 0x2003: /* EM SPACE */
3070     case 0x2004: /* THREE-PER-EM SPACE */
3071     case 0x2005: /* FOUR-PER-EM SPACE */
3072     case 0x2006: /* SIX-PER-EM SPACE */
3073     case 0x2007: /* FIGURE SPACE */
3074     case 0x2008: /* PUNCTUATION SPACE */
3075     case 0x2009: /* THIN SPACE */
3076     case 0x200A: /* HAIR SPACE */
3077     case 0x202f: /* NARROW NO-BREAK SPACE */
3078     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3079     case 0x3000: /* IDEOGRAPHIC SPACE */
3080     RRETURN(MATCH_NOMATCH);
3081     }
3082     }
3083     break;
3084 ph10 182
3085 ph10 178 case OP_HSPACE:
3086     for (i = 1; i <= min; i++)
3087     {
3088     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3089     GETCHARINC(c, eptr);
3090     switch(c)
3091     {
3092     default: RRETURN(MATCH_NOMATCH);
3093     case 0x09: /* HT */
3094     case 0x20: /* SPACE */
3095     case 0xa0: /* NBSP */
3096     case 0x1680: /* OGHAM SPACE MARK */
3097     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3098     case 0x2000: /* EN QUAD */
3099     case 0x2001: /* EM QUAD */
3100     case 0x2002: /* EN SPACE */
3101     case 0x2003: /* EM SPACE */
3102     case 0x2004: /* THREE-PER-EM SPACE */
3103     case 0x2005: /* FOUR-PER-EM SPACE */
3104     case 0x2006: /* SIX-PER-EM SPACE */
3105     case 0x2007: /* FIGURE SPACE */
3106     case 0x2008: /* PUNCTUATION SPACE */
3107     case 0x2009: /* THIN SPACE */
3108     case 0x200A: /* HAIR SPACE */
3109     case 0x202f: /* NARROW NO-BREAK SPACE */
3110     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3111     case 0x3000: /* IDEOGRAPHIC SPACE */
3112     break;
3113     }
3114     }
3115     break;
3116 ph10 182
3117 ph10 178 case OP_NOT_VSPACE:
3118     for (i = 1; i <= min; i++)
3119     {
3120     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3121     GETCHARINC(c, eptr);
3122     switch(c)
3123     {
3124     default: break;
3125     case 0x0a: /* LF */
3126     case 0x0b: /* VT */
3127     case 0x0c: /* FF */
3128     case 0x0d: /* CR */
3129     case 0x85: /* NEL */
3130     case 0x2028: /* LINE SEPARATOR */
3131     case 0x2029: /* PARAGRAPH SEPARATOR */
3132     RRETURN(MATCH_NOMATCH);
3133     }
3134     }
3135     break;
3136 ph10 182
3137 ph10 178 case OP_VSPACE:
3138     for (i = 1; i <= min; i++)
3139     {
3140     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3141     GETCHARINC(c, eptr);
3142     switch(c)
3143     {
3144     default: RRETURN(MATCH_NOMATCH);
3145     case 0x0a: /* LF */
3146     case 0x0b: /* VT */
3147     case 0x0c: /* FF */
3148     case 0x0d: /* CR */
3149     case 0x85: /* NEL */
3150     case 0x2028: /* LINE SEPARATOR */
3151     case 0x2029: /* PARAGRAPH SEPARATOR */
3152 ph10 182 break;
3153 ph10 178 }
3154     }
3155     break;
3156    
3157 nigel 77 case OP_NOT_DIGIT:
3158     for (i = 1; i <= min; i++)
3159     {
3160     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3161     GETCHARINC(c, eptr);
3162     if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3163     RRETURN(MATCH_NOMATCH);
3164     }
3165     break;
3166    
3167     case OP_DIGIT:
3168     for (i = 1; i <= min; i++)
3169     {
3170     if (eptr >= md->end_subject ||
3171     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3172     RRETURN(MATCH_NOMATCH);
3173     /* No need to skip more bytes - we know it's a 1-byte character */
3174     }
3175     break;
3176    
3177     case OP_NOT_WHITESPACE:
3178     for (i = 1; i <= min; i++)
3179     {
3180     if (eptr >= md->end_subject ||
3181 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3182 nigel 77 RRETURN(MATCH_NOMATCH);
3183 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3184 nigel 77 }
3185     break;
3186    
3187     case OP_WHITESPACE:
3188     for (i = 1; i <= min; i++)
3189     {
3190     if (eptr >= md->end_subject ||
3191     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3192     RRETURN(MATCH_NOMATCH);
3193     /* No need to skip more bytes - we know it's a 1-byte character */
3194     }
3195     break;
3196    
3197     case OP_NOT_WORDCHAR:
3198     for (i = 1; i <= min; i++)
3199     {
3200     if (eptr >= md->end_subject ||
3201 ph10 219 (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3202 nigel 77 RRETURN(MATCH_NOMATCH);
3203 ph10 219 while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3204 nigel 77 }
3205     break;
3206    
3207     case OP_WORDCHAR:
3208     for (i = 1; i <= min; i++)
3209     {
3210     if (eptr >= md->end_subject ||
3211     *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3212     RRETURN(MATCH_NOMATCH);
3213     /* No need to skip more bytes - we know it's a 1-byte character */
3214     }
3215     break;
3216    
3217     default:
3218     RRETURN(PCRE_ERROR_INTERNAL);
3219     } /* End switch(ctype) */
3220    
3221     else
3222     #endif /* SUPPORT_UTF8 */
3223    
3224     /* Code for the non-UTF-8 case for minimum matching of operators other
3225 nigel 93 than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3226     number of bytes present, as this was tested above. */
3227 nigel 77
3228     switch(ctype)
3229     {
3230     case OP_ANY:
3231 ph10 342 for (i = 1; i <= min; i++)
3232 nigel 77 {
3233 ph10 342 if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3234     eptr++;
3235 nigel 77 }
3236     break;
3237    
3238 ph10 341 case OP_ALLANY:
3239     eptr += min;
3240     break;
3241    
3242 nigel 77 case OP_ANYBYTE:
3243     eptr += min;
3244     break;
3245    
3246 nigel 93 /* Because of the CRLF case, we can't assume the minimum number of
3247     bytes are present in this case. */
3248    
3249     case OP_ANYNL:
3250     for (i = 1; i <= min; i++)
3251     {
3252     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3253     switch(*eptr++)
3254     {
3255     default: RRETURN(MATCH_NOMATCH);
3256     case 0x000d:
3257     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3258     break;
3259     case 0x000a:
3260 ph10 231 break;
3261    
3262 nigel 93 case 0x000b:
3263     case 0x000c:
3264     case 0x0085:
3265 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3266 nigel 93 break;
3267     }
3268     }
3269     break;
3270    
3271 ph10 178 case OP_NOT_HSPACE:
3272     for (i = 1; i <= min; i++)
3273     {
3274     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3275     switch(*eptr++)
3276     {
3277     default: break;
3278     case 0x09: /* HT */
3279     case 0x20: /* SPACE */
3280     case 0xa0: /* NBSP */
3281     RRETURN(MATCH_NOMATCH);
3282     }
3283     }
3284     break;
3285    
3286     case OP_HSPACE:
3287     for (i = 1; i <= min; i++)
3288     {
3289     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3290     switch(*eptr++)
3291     {
3292     default: RRETURN(MATCH_NOMATCH);
3293     case 0x09: /* HT */
3294     case 0x20: /* SPACE */
3295     case 0xa0: /* NBSP */
3296 ph10 182 break;
3297 ph10 178 }
3298     }
3299     break;
3300    
3301     case OP_NOT_VSPACE:
3302     for (i = 1; i <= min; i++)
3303     {
3304     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3305     switch(*eptr++)
3306     {
3307     default: break;
3308     case 0x0a: /* LF */
3309     case 0x0b: /* VT */
3310     case 0x0c: /* FF */
3311     case 0x0d: /* CR */
3312     case 0x85: /* NEL */
3313     RRETURN(MATCH_NOMATCH);
3314     }
3315     }
3316     break;
3317    
3318     case OP_VSPACE:
3319     for (i = 1; i <= min; i++)
3320     {
3321     if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3322     switch(*eptr++)
3323     {
3324     default: RRETURN(MATCH_NOMATCH);
3325     case 0x0a: /* LF */
3326     case 0x0b: /* VT */
3327     case 0x0c: /* FF */
3328     case 0x0d: /* CR */
3329     case 0x85: /* NEL */
3330 ph10 182 break;
3331 ph10 178 }
3332     }
3333     break;
3334    
3335 nigel 77 case OP_NOT_DIGIT:
3336     for (i = 1; i <= min; i++)
3337     if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3338     break;
3339    
3340     case OP_DIGIT:
3341     for (i = 1; i <= min; i++)
3342     if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3343     break;
3344    
3345     case OP_NOT_WHITESPACE:
3346     for (i = 1; i <= min; i++)
3347     if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3348     break;
3349    
3350     case OP_WHITESPACE:
3351     for (i = 1; i <= min; i++)
3352     if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3353     break;
3354    
3355     case OP_NOT_WORDCHAR:
3356     for (i = 1; i <= min; i++)
3357     if ((md->ctypes[*eptr++] & ctype_word) != 0)
3358     RRETURN(MATCH_NOMATCH);
3359     break;
3360    
3361     case OP_WORDCHAR:
3362     for (i = 1; i <= min; i++)
3363     if ((md->ctypes[*eptr++] & ctype_word) == 0)
3364     RRETURN(MATCH_NOMATCH);
3365     break;
3366    
3367     default:
3368     RRETURN(PCRE_ERROR_INTERNAL);
3369     }
3370     }
3371    
3372     /* If min = max, continue at the same level without recursing */
3373    
3374     if (min == max) continue;
3375    
3376     /* If minimizing, we have to test the rest of the pattern before each
3377     subsequent match. Again, separate the UTF-8 case for speed, and also
3378     separate the UCP cases. */
3379    
3380     if (minimize)
3381     {
3382     #ifdef SUPPORT_UCP
3383 nigel 87 if (prop_type >= 0)
3384 nigel 77 {
3385 nigel 87 switch(prop_type)
3386 nigel 77 {
3387 nigel 87 case PT_ANY:
3388     for (fi = min;; fi++)
3389     {
3390 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3391 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3392     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3393     GETCHARINC(c, eptr);
3394     if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3395     }
3396 nigel 93 /* Control never gets here */
3397 nigel 87
3398     case PT_LAMP:
3399     for (fi = min;; fi++)
3400     {
3401 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3402 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3404     GETCHARINC(c, eptr);
3405 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3406 nigel 87 if ((prop_chartype == ucp_Lu ||
3407     prop_chartype == ucp_Ll ||
3408     prop_chartype == ucp_Lt) == prop_fail_result)
3409     RRETURN(MATCH_NOMATCH);
3410     }
3411 nigel 93 /* Control never gets here */
3412 nigel 87
3413     case PT_GC:
3414     for (fi = min;; fi++)
3415     {
3416 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3417 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3418     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3419     GETCHARINC(c, eptr);
3420 ph10 349 prop_category = UCD_CATEGORY(c);
3421 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3422     RRETURN(MATCH_NOMATCH);
3423     }
3424 nigel 93 /* Control never gets here */
3425 nigel 87
3426     case PT_PC:
3427     for (fi = min;; fi++)
3428     {
3429 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3430 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3432     GETCHARINC(c, eptr);
3433 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3434 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3435     RRETURN(MATCH_NOMATCH);
3436     }
3437 nigel 93 /* Control never gets here */
3438 nigel 87
3439     case PT_SC:
3440     for (fi = min;; fi++)
3441     {
3442 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3443 nigel 87 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3445     GETCHARINC(c, eptr);
3446 ph10 349 prop_script = UCD_SCRIPT(c);
3447 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3448     RRETURN(MATCH_NOMATCH);
3449     }
3450 nigel 93 /* Control never gets here */
3451 nigel 87
3452     default:
3453     RRETURN(PCRE_ERROR_INTERNAL);
3454 nigel 77 }
3455     }
3456    
3457     /* Match extended Unicode sequences. We will get here only if the
3458     support is in the binary; otherwise a compile-time error occurs. */
3459    
3460     else if (ctype == OP_EXTUNI)
3461     {
3462     for (fi = min;; fi++)
3463     {
3464 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3465 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3466     if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3467     GETCHARINCTEST(c, eptr);
3468 ph10 349 prop_category = UCD_CATEGORY(c);
3469 nigel 77 if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3470     while (eptr < md->end_subject)
3471     {
3472     int len = 1;
3473     if (!utf8) c = *eptr; else
3474     {
3475     GETCHARLEN(c, eptr, len);
3476     }
3477 ph10 349 prop_category = UCD_CATEGORY(c);
3478 nigel 77 if (prop_category != ucp_M) break;
3479     eptr += len;
3480     }
3481     }
3482     }
3483    
3484     else
3485     #endif /* SUPPORT_UCP */
3486    
3487     #ifdef SUPPORT_UTF8
3488     /* UTF-8 mode */
3489     if (utf8)
3490     {
3491     for (fi = min;; fi++)
3492     {
3493 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3494 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3495 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3496 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3497 nigel 91 RRETURN(MATCH_NOMATCH);
3498 nigel 77
3499     GETCHARINC(c, eptr);
3500     switch(ctype)
3501     {
3502 ph10 342 case OP_ANY: /* This is the non-NL case */
3503 ph10 345 case OP_ALLANY:
3504 nigel 77 case OP_ANYBYTE:
3505     break;
3506    
3507 nigel 93 case OP_ANYNL:
3508     switch(c)
3509     {
3510     default: RRETURN(MATCH_NOMATCH);
3511     case 0x000d:
3512     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3513     break;
3514     case 0x000a:
3515 ph10 231 break;
3516    
3517 nigel 93 case 0x000b:
3518     case 0x000c:
3519     case 0x0085:
3520     case 0x2028:
3521     case 0x2029:
3522 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3523 nigel 93 break;
3524     }
3525     break;
3526    
3527 ph10 178 case OP_NOT_HSPACE:
3528     switch(c)
3529     {
3530     default: break;
3531     case 0x09: /* HT */
3532     case 0x20: /* SPACE */
3533     case 0xa0: /* NBSP */
3534     case 0x1680: /* OGHAM SPACE MARK */
3535     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3536     case 0x2000: /* EN QUAD */
3537     case 0x2001: /* EM QUAD */
3538     case 0x2002: /* EN SPACE */
3539     case 0x2003: /* EM SPACE */
3540     case 0x2004: /* THREE-PER-EM SPACE */
3541     case 0x2005: /* FOUR-PER-EM SPACE */
3542     case 0x2006: /* SIX-PER-EM SPACE */
3543     case 0x2007: /* FIGURE SPACE */
3544     case 0x2008: /* PUNCTUATION SPACE */
3545     case 0x2009: /* THIN SPACE */
3546     case 0x200A: /* HAIR SPACE */
3547     case 0x202f: /* NARROW NO-BREAK SPACE */
3548     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3549     case 0x3000: /* IDEOGRAPHIC SPACE */
3550     RRETURN(MATCH_NOMATCH);
3551     }
3552     break;
3553    
3554     case OP_HSPACE:
3555     switch(c)
3556     {
3557     default: RRETURN(MATCH_NOMATCH);
3558     case 0x09: /* HT */
3559     case 0x20: /* SPACE */
3560     case 0xa0: /* NBSP */
3561     case 0x1680: /* OGHAM SPACE MARK */
3562     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3563     case 0x2000: /* EN QUAD */
3564     case 0x2001: /* EM QUAD */
3565     case 0x2002: /* EN SPACE */
3566     case 0x2003: /* EM SPACE */
3567     case 0x2004: /* THREE-PER-EM SPACE */
3568     case 0x2005: /* FOUR-PER-EM SPACE */
3569     case 0x2006: /* SIX-PER-EM SPACE */
3570     case 0x2007: /* FIGURE SPACE */
3571     case 0x2008: /* PUNCTUATION SPACE */
3572     case 0x2009: /* THIN SPACE */
3573     case 0x200A: /* HAIR SPACE */
3574     case 0x202f: /* NARROW NO-BREAK SPACE */
3575     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3576     case 0x3000: /* IDEOGRAPHIC SPACE */
3577     break;
3578     }
3579     break;
3580    
3581     case OP_NOT_VSPACE:
3582     switch(c)
3583     {
3584     default: break;
3585     case 0x0a: /* LF */
3586     case 0x0b: /* VT */
3587     case 0x0c: /* FF */
3588     case 0x0d: /* CR */
3589     case 0x85: /* NEL */
3590     case 0x2028: /* LINE SEPARATOR */
3591     case 0x2029: /* PARAGRAPH SEPARATOR */
3592     RRETURN(MATCH_NOMATCH);
3593     }
3594     break;
3595    
3596     case OP_VSPACE:
3597     switch(c)
3598     {
3599     default: RRETURN(MATCH_NOMATCH);
3600     case 0x0a: /* LF */
3601     case 0x0b: /* VT */
3602     case 0x0c: /* FF */
3603     case 0x0d: /* CR */
3604     case 0x85: /* NEL */
3605     case 0x2028: /* LINE SEPARATOR */
3606     case 0x2029: /* PARAGRAPH SEPARATOR */
3607     break;
3608     }
3609     break;
3610    
3611 nigel 77 case OP_NOT_DIGIT:
3612     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3613     RRETURN(MATCH_NOMATCH);
3614     break;
3615    
3616     case OP_DIGIT:
3617     if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3618     RRETURN(MATCH_NOMATCH);
3619     break;
3620    
3621     case OP_NOT_WHITESPACE:
3622     if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3623     RRETURN(MATCH_NOMATCH);
3624     break;
3625    
3626     case OP_WHITESPACE:
3627     if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3628     RRETURN(MATCH_NOMATCH);
3629     break;
3630    
3631     case OP_NOT_WORDCHAR:
3632     if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3633     RRETURN(MATCH_NOMATCH);
3634     break;
3635    
3636     case OP_WORDCHAR:
3637     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3638     RRETURN(MATCH_NOMATCH);
3639     break;
3640    
3641     default:
3642     RRETURN(PCRE_ERROR_INTERNAL);
3643     }
3644     }
3645     }
3646     else
3647     #endif
3648     /* Not UTF-8 mode */
3649     {
3650     for (fi = min;; fi++)
3651     {
3652 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3653 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3654 nigel 91 if (fi >= max || eptr >= md->end_subject ||
3655 ph10 342 (ctype == OP_ANY && IS_NEWLINE(eptr)))
3656 nigel 91 RRETURN(MATCH_NOMATCH);
3657    
3658 nigel 77 c = *eptr++;
3659     switch(ctype)
3660     {
3661 ph10 342 case OP_ANY: /* This is the non-NL case */
3662 ph10 345 case OP_ALLANY:
3663 nigel 77 case OP_ANYBYTE:
3664     break;
3665    
3666 nigel 93 case OP_ANYNL:
3667     switch(c)
3668     {
3669     default: RRETURN(MATCH_NOMATCH);
3670     case 0x000d:
3671     if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3672     break;
3673 ph10 231
3674 nigel 93 case 0x000a:
3675 ph10 231 break;
3676    
3677 nigel 93 case 0x000b:
3678     case 0x000c:
3679     case 0x0085:
3680 ph10 231 if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3681 nigel 93 break;
3682     }
3683     break;
3684    
3685 ph10 178 case OP_NOT_HSPACE:
3686     switch(c)
3687     {
3688     default: break;
3689     case 0x09: /* HT */
3690     case 0x20: /* SPACE */
3691     case 0xa0: /* NBSP */
3692     RRETURN(MATCH_NOMATCH);
3693     }
3694     break;
3695    
3696     case OP_HSPACE:
3697     switch(c)
3698     {
3699     default: RRETURN(MATCH_NOMATCH);
3700     case 0x09: /* HT */
3701     case 0x20: /* SPACE */
3702     case 0xa0: /* NBSP */
3703     break;
3704     }
3705     break;
3706    
3707     case OP_NOT_VSPACE:
3708     switch(c)
3709     {
3710     default: break;
3711     case 0x0a: /* LF */
3712     case 0x0b: /* VT */
3713     case 0x0c: /* FF */
3714     case 0x0d: /* CR */
3715     case 0x85: /* NEL */
3716     RRETURN(MATCH_NOMATCH);
3717     }
3718     break;
3719    
3720     case OP_VSPACE:
3721     switch(c)
3722     {
3723     default: RRETURN(MATCH_NOMATCH);
3724     case 0x0a: /* LF */
3725     case 0x0b: /* VT */
3726     case 0x0c: /* FF */
3727     case 0x0d: /* CR */
3728     case 0x85: /* NEL */
3729     break;
3730     }
3731     break;
3732    
3733 nigel 77 case OP_NOT_DIGIT:
3734     if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3735     break;
3736    
3737     case OP_DIGIT:
3738     if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3739     break;
3740    
3741     case OP_NOT_WHITESPACE:
3742     if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3743     break;
3744    
3745     case OP_WHITESPACE:
3746     if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3747     break;
3748    
3749     case OP_NOT_WORDCHAR:
3750     if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3751     break;
3752    
3753     case OP_WORDCHAR:
3754     if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3755     break;
3756    
3757     default:
3758     RRETURN(PCRE_ERROR_INTERNAL);
3759     }
3760     }
3761     }
3762     /* Control never gets here */
3763     }
3764    
3765 nigel 93 /* If maximizing, it is worth using inline code for speed, doing the type
3766 nigel 77 test once at the start (i.e. keep it out of the loop). Again, keep the
3767     UTF-8 and UCP stuff separate. */
3768    
3769     else
3770     {
3771     pp = eptr; /* Remember where we started */
3772    
3773     #ifdef SUPPORT_UCP
3774 nigel 87 if (prop_type >= 0)
3775 nigel 77 {
3776 nigel 87 switch(prop_type)
3777 nigel 77 {
3778 nigel 87 case PT_ANY:
3779     for (i = min; i < max; i++)
3780     {
3781     int len = 1;
3782     if (eptr >= md->end_subject) break;
3783     GETCHARLEN(c, eptr, len);
3784     if (prop_fail_result) break;
3785     eptr+= len;
3786     }
3787     break;
3788    
3789     case PT_LAMP:
3790     for (i = min; i < max; i++)
3791     {
3792     int len = 1;
3793     if (eptr >= md->end_subject) break;
3794     GETCHARLEN(c, eptr, len);
3795 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3796 nigel 87 if ((prop_chartype == ucp_Lu ||
3797     prop_chartype == ucp_Ll ||
3798     prop_chartype == ucp_Lt) == prop_fail_result)
3799     break;
3800     eptr+= len;
3801     }
3802     break;
3803    
3804     case PT_GC:
3805     for (i = min; i < max; i++)
3806     {
3807     int len = 1;
3808     if (eptr >= md->end_subject) break;
3809     GETCHARLEN(c, eptr, len);
3810 ph10 349 prop_category = UCD_CATEGORY(c);
3811 nigel 87 if ((prop_category == prop_value) == prop_fail_result)
3812     break;
3813     eptr+= len;
3814     }
3815     break;
3816    
3817     case PT_PC:
3818     for (i = min; i < max; i++)
3819     {
3820     int len = 1;
3821     if (eptr >= md->end_subject) break;
3822     GETCHARLEN(c, eptr, len);
3823 ph10 349 prop_chartype = UCD_CHARTYPE(c);
3824 nigel 87 if ((prop_chartype == prop_value) == prop_fail_result)
3825     break;
3826     eptr+= len;
3827     }
3828     break;
3829    
3830     case PT_SC:
3831     for (i = min; i < max; i++)
3832     {
3833     int len = 1;
3834     if (eptr >= md->end_subject) break;
3835     GETCHARLEN(c, eptr, len);
3836 ph10 349 prop_script = UCD_SCRIPT(c);
3837 nigel 87 if ((prop_script == prop_value) == prop_fail_result)
3838     break;
3839     eptr+= len;
3840     }
3841     break;
3842 nigel 77 }
3843    
3844     /* eptr is now past the end of the maximum run */
3845    
3846 nigel 93 if (possessive) continue;
3847 nigel 77 for(;;)
3848     {
3849 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3850 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3851     if (eptr-- == pp) break; /* Stop if tried at original pos */
3852 ph10 207 if (utf8) BACKCHAR(eptr);
3853 nigel 77 }
3854     }
3855    
3856     /* Match extended Unicode sequences. We will get here only if the
3857     support is in the binary; otherwise a compile-time error occurs. */
3858    
3859     else if (ctype == OP_EXTUNI)
3860     {
3861     for (i = min; i < max; i++)
3862     {
3863     if (eptr >= md->end_subject) break;
3864     GETCHARINCTEST(c, eptr);
3865 ph10 349 prop_category = UCD_CATEGORY(c);
3866 nigel 77 if (prop_category == ucp_M) break;
3867     while (eptr < md->end_subject)
3868     {
3869     int len = 1;
3870     if (!utf8) c = *eptr; else
3871     {
3872     GETCHARLEN(c, eptr, len);
3873     }
3874 ph10 349 prop_category = UCD_CATEGORY(c);
3875 nigel 77 if (prop_category != ucp_M) break;
3876     eptr += len;
3877     }
3878     }
3879    
3880     /* eptr is now past the end of the maximum run */
3881    
3882 nigel 93 if (possessive) continue;
3883 nigel 77 for(;;)
3884     {
3885 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3886 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3887     if (eptr-- == pp) break; /* Stop if tried at original pos */
3888     for (;;) /* Move back over one extended */
3889     {
3890     int len = 1;
3891     if (!utf8) c = *eptr; else
3892     {
3893 ph10 207 BACKCHAR(eptr);
3894 nigel 77 GETCHARLEN(c, eptr, len);
3895     }
3896 ph10 349 prop_category = UCD_CATEGORY(c);
3897 nigel 77 if (prop_category != ucp_M) break;
3898     eptr--;
3899     }
3900     }
3901     }
3902    
3903     else
3904     #endif /* SUPPORT_UCP */
3905    
3906     #ifdef SUPPORT_UTF8
3907     /* UTF-8 mode */
3908    
3909     if (utf8)
3910     {
3911     switch(ctype)
3912     {
3913     case OP_ANY:
3914     if (max < INT_MAX)
3915     {
3916 ph10 342 for (i = min; i < max; i++)
3917 nigel 77 {
3918 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3919     eptr++;
3920     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3921 nigel 77 }
3922     }
3923    
3924     /* Handle unlimited UTF-8 repeat */
3925    
3926     else
3927     {
3928 ph10 342 for (i = min; i < max; i++)
3929 nigel 77 {
3930 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3931     eptr++;
3932     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3933 nigel 77 }
3934     }
3935     break;
3936    
3937 ph10 341 case OP_ALLANY:
3938     if (max < INT_MAX)
3939     {
3940     for (i = min; i < max; i++)
3941     {
3942     if (eptr >= md->end_subject) break;
3943     eptr++;
3944     while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3945     }
3946     }
3947     else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3948     break;
3949    
3950 nigel 77 /* The byte case is the same as non-UTF8 */
3951    
3952     case OP_ANYBYTE:
3953     c = max - min;
3954 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
3955     c = md->end_subject - eptr;
3956 nigel 77 eptr += c;
3957     break;
3958    
3959 nigel 93 case OP_ANYNL:
3960     for (i = min; i < max; i++)
3961     {
3962     int len = 1;
3963     if (eptr >= md->end_subject) break;
3964     GETCHARLEN(c, eptr, len);
3965     if (c == 0x000d)
3966     {
3967     if (++eptr >= md->end_subject) break;
3968     if (*eptr == 0x000a) eptr++;
3969     }
3970     else
3971     {
3972 ph10 231 if (c != 0x000a &&
3973     (md->bsr_anycrlf ||
3974     (c != 0x000b && c != 0x000c &&
3975     c != 0x0085 && c != 0x2028 && c != 0x2029)))
3976 nigel 93 break;
3977     eptr += len;
3978     }
3979     }
3980     break;
3981    
3982 ph10 178 case OP_NOT_HSPACE:
3983 ph10 182 case OP_HSPACE:
3984 ph10 178 for (i = min; i < max; i++)
3985     {
3986 ph10 182 BOOL gotspace;
3987 ph10 178 int len = 1;
3988     if (eptr >= md->end_subject) break;
3989     GETCHARLEN(c, eptr, len);
3990     switch(c)
3991 ph10 182 {
3992     default: gotspace = FALSE; break;
3993 ph10 178 case 0x09: /* HT */
3994     case 0x20: /* SPACE */
3995     case 0xa0: /* NBSP */
3996     case 0x1680: /* OGHAM SPACE MARK */
3997     case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3998     case 0x2000: /* EN QUAD */
3999     case 0x2001: /* EM QUAD */
4000     case 0x2002: /* EN SPACE */
4001     case 0x2003: /* EM SPACE */
4002     case 0x2004: /* THREE-PER-EM SPACE */
4003     case 0x2005: /* FOUR-PER-EM SPACE */
4004     case 0x2006: /* SIX-PER-EM SPACE */
4005     case 0x2007: /* FIGURE SPACE */
4006     case 0x2008: /* PUNCTUATION SPACE */
4007     case 0x2009: /* THIN SPACE */
4008     case 0x200A: /* HAIR SPACE */
4009     case 0x202f: /* NARROW NO-BREAK SPACE */
4010     case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4011     case 0x3000: /* IDEOGRAPHIC SPACE */
4012     gotspace = TRUE;
4013 ph10 182 break;
4014 ph10 178 }
4015     if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4016     eptr += len;
4017     }
4018     break;
4019    
4020     case OP_NOT_VSPACE:
4021 ph10 182 case OP_VSPACE:
4022 ph10 178 for (i = min; i < max; i++)
4023     {
4024 ph10 182 BOOL gotspace;
4025 ph10 178 int len = 1;
4026     if (eptr >= md->end_subject) break;
4027     GETCHARLEN(c, eptr, len);
4028     switch(c)
4029     {
4030 ph10 182 default: gotspace = FALSE; break;
4031 ph10 178 case 0x0a: /* LF */
4032     case 0x0b: /* VT */
4033     case 0x0c: /* FF */
4034     case 0x0d: /* CR */
4035     case 0x85: /* NEL */
4036     case 0x2028: /* LINE SEPARATOR */
4037     case 0x2029: /* PARAGRAPH SEPARATOR */
4038     gotspace = TRUE;
4039     break;
4040     }
4041 ph10 182 if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4042 ph10 178 eptr += len;
4043     }
4044     break;
4045    
4046 nigel 77 case OP_NOT_DIGIT:
4047     for (i = min; i < max; i++)
4048     {
4049     int len = 1;
4050     if (eptr >= md->end_subject) break;
4051     GETCHARLEN(c, eptr, len);
4052     if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4053     eptr+= len;
4054     }
4055     break;
4056    
4057     case OP_DIGIT:
4058     for (i = min; i < max; i++)
4059     {
4060     int len = 1;
4061     if (eptr >= md->end_subject) break;
4062     GETCHARLEN(c, eptr, len);
4063     if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4064     eptr+= len;
4065     }
4066     break;
4067    
4068     case OP_NOT_WHITESPACE:
4069     for (i = min; i < max; i++)
4070     {
4071     int len = 1;
4072     if (eptr >= md->end_subject) break;
4073     GETCHARLEN(c, eptr, len);
4074     if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4075     eptr+= len;
4076     }
4077     break;
4078    
4079     case OP_WHITESPACE:
4080     for (i = min; i < max; i++)
4081     {
4082     int len = 1;
4083     if (eptr >= md->end_subject) break;
4084     GETCHARLEN(c, eptr, len);
4085     if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4086     eptr+= len;
4087     }
4088     break;
4089    
4090     case OP_NOT_WORDCHAR:
4091     for (i = min; i < max; i++)
4092     {
4093     int len = 1;
4094     if (eptr >= md->end_subject) break;
4095     GETCHARLEN(c, eptr, len);
4096     if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4097     eptr+= len;
4098     }
4099     break;
4100    
4101     case OP_WORDCHAR:
4102     for (i = min; i < max; i++)
4103     {
4104     int len = 1;
4105     if (eptr >= md->end_subject) break;
4106     GETCHARLEN(c, eptr, len);
4107     if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4108     eptr+= len;
4109     }
4110     break;
4111    
4112     default:
4113     RRETURN(PCRE_ERROR_INTERNAL);
4114     }
4115    
4116     /* eptr is now past the end of the maximum run */
4117    
4118 nigel 93 if (possessive) continue;
4119 nigel 77 for(;;)
4120     {
4121 ph10 164 RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4122 nigel 77 if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4123     if (eptr-- == pp) break; /* Stop if tried at original pos */
4124     BACKCHAR(eptr);
4125     }
4126     }
4127     else
4128 ph10 207 #endif /* SUPPORT_UTF8 */
4129 nigel 77
4130     /* Not UTF-8 mode */
4131     {
4132     switch(ctype)
4133     {
4134     case OP_ANY:
4135 ph10 342 for (i = min; i < max; i++)
4136 nigel 77 {
4137 ph10 342 if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4138     eptr++;
4139 nigel 77 }
4140 ph10 342 break;
4141 nigel 77
4142 ph10 341 case OP_ALLANY:
4143 nigel 77 case OP_ANYBYTE:
4144     c = max - min;
4145 nigel 93 if (c > (unsigned int)(md->end_subject - eptr))
4146     c = md->end_subject - eptr;
4147 nigel 77 eptr += c;
4148     break;